コード例 #1
0
ファイル: model2.py プロジェクト: tearf001/ucloud
def model_pred(trainX,trainY,testX,model_type):
    """Train a classifier on (trainX, trainY) and predict labels for testX.

    model_type selects the strategy:
      * "rf"     - one random forest with 500 trees.
      * "gbdt"   - one gradient boosting classifier with 6 stages.
      * "fusion" - majority vote of five random forests of increasing size.

    Prints the fraction of positive predictions and returns the predictions.
    Raises ValueError for an unrecognised model_type (previously this fell
    through every branch and failed with an unbound local variable).
    """
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    elif model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6, learning_rate=0.9, random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    elif model_type == "fusion":
        # Each forest casts one vote per sample; the sum is only meaningful if
        # predictions are numeric 0/1 labels -- TODO confirm against callers.
        votes = np.zeros(len(testX))
        for n_trees in (100, 200, 300, 400, 500):
            clf = RandomForestClassifier(n_estimators=n_trees, n_jobs=20, bootstrap=True)
            clf.fit(trainX, trainY)
            votes += clf.predict(testX)
        # At least 3 of the 5 forests must agree for a positive prediction.
        pred = list(votes >= 3)
    else:
        raise ValueError("unknown model_type: %r (expected 'rf', 'gbdt' or 'fusion')"
                         % (model_type,))
    # Single-argument print form behaves identically on Python 2 and 3.
    print("the pos rate is: %s" % (float(sum(pred)) / len(pred)))
    return pred
コード例 #2
0
def model_pred(trainX, trainY, testX, model_type):
    """Fit the model named by model_type and return its predictions for testX.

    Supported values of model_type are "rf" (500-tree random forest),
    "gbdt" (6-stage gradient boosting) and "fusion" (majority vote over five
    random forests with 100..500 trees). The positive rate of the predictions
    is printed before they are returned.

    Raises ValueError when model_type is not one of the supported values;
    the original independent ``if`` statements left ``pred`` unbound then.
    """
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    elif model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6,
                                         learning_rate=0.9,
                                         random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    elif model_type == "fusion":
        forest_sizes = [100, 200, 300, 400, 500]
        # Accumulates one vote per forest; assumes numeric 0/1 labels -- TODO confirm.
        prob = np.zeros(len(testX))
        for size in forest_sizes:
            clf = RandomForestClassifier(n_estimators=size,
                                         n_jobs=20,
                                         bootstrap=True)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        # Positive when a majority (3 of 5) of the forests vote positive.
        pred = list(prob >= 3)
    else:
        raise ValueError(
            "unknown model_type: %r (expected 'rf', 'gbdt' or 'fusion')"
            % (model_type,))
    # Parenthesised single-argument print works on both Python 2 and 3.
    print("the pos rate is: %s" % (float(sum(pred)) / len(pred)))
    return pred
コード例 #3
0
class LexicaseForestClassifier(BaseEstimator, ClassifierMixin):
    """Random forest whose trees are re-selected with epsilon-lexicase selection.

    An oversized forest (``n_estimators * initial_forrest_factor`` trees) is
    trained first; ``fit`` then draws ``n_estimators`` trees from it with
    ``epsilon_lexicase_selection``, using each tree's squared-error vector on
    the training data.
    """

    def __init__(self, initial_forrest_factor=5, n_estimators=10, **kwargs):
        """Configure the underlying forest.

        initial_forrest_factor: multiplier applied to n_estimators to size the
            initial forest.
        n_estimators: number of trees kept after selection.
        **kwargs: forwarded unchanged to RandomForestClassifier.
        """
        self._initial_forrest_size = n_estimators * initial_forrest_factor
        self._final_forrest_size = n_estimators

        # Build a fresh dict so the caller's kwargs are never mutated.
        rf_fit_args = dict(kwargs, n_estimators=self._initial_forrest_size)
        self._rf = RandomForestClassifier(**rf_fit_args)

    def fit(self, X, y):
        """Train the oversized forest, then keep the selected trees.

        Returns self so the estimator can be chained, as scikit-learn
        estimators conventionally do.
        """
        self._rf.fit(X, y)

        # Attach the per-sample training error to every tree; the selection
        # routine is expected to read ``_error_vector`` from each candidate.
        for tree in self._rf.estimators_:
            tree._error_vector = squared_error_vector(y, tree.predict(X))

        candidates = self._rf.estimators_
        final_estimators = [epsilon_lexicase_selection(candidates)
                            for _ in range(self._final_forrest_size)]

        self._rf.estimators_ = final_estimators
        self._rf.n_estimators = self._final_forrest_size
        # TODO: Set other self._rf parameters to match correct size so that predict works.
        return self

    def predict(self, X, y=None):
        """Predict with the pruned forest; ``y`` is accepted but ignored."""
        return self._rf.predict(X)
コード例 #4
0
def stkFoldCrossValidation():
    """Run 10-fold stratified cross-validation of a default random forest.

    Features and labels are read from the pickle files ``X.p`` and ``Y.p`` in
    the current directory. For each fold the 1-based fold number and a
    classification report on the held-out part are printed.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only use with
    # trusted, locally produced pickles.
    with open('X.p', 'rb') as x_file:
        X = np.array(pickle.load(x_file))
    with open('Y.p', 'rb') as y_file:
        Y = np.array(pickle.load(y_file))

    skf = StratifiedKFold(n_splits=10)

    for k, (train_index, test_index) in enumerate(skf.split(X, Y), 1):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        print(k)

        rf = RandomForestClassifier()
        rf.fit(X_train, Y_train)

        yp = rf.predict(X_test)
        print(classification_report(Y_test, yp, digits=6))
コード例 #5
0
class RandomForestClassifierImpl():
    """Thin wrapper that stores hyperparameters and builds ``SKLModel`` on fit."""

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, oob_score=False,
                 n_jobs=None, random_state=None, verbose=0, warm_start=False,
                 class_weight='balanced'):
        """Record every hyperparameter; the model itself is created in fit()."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
        )

    def fit(self, X, y=None):
        """Build a fresh SKLModel from the stored hyperparameters and fit it.

        ``y`` is forwarded only when it is not None. Returns self.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the fitted model's predict."""
        fitted = self._sklearn_model
        return fitted.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's predict_proba."""
        fitted = self._sklearn_model
        return fitted.predict_proba(X)
コード例 #6
0
def forest(X, y, model_path):
    """Fit a default random forest on (X, y), report, and persist it.

    The classification report and confusion matrix are computed on the same
    data the model was trained on, then the model is dumped to model_path.
    """
    classifier = RandomForestClassifier()
    classifier.fit(X, y)
    fitted_labels = classifier.predict(X)
    report = metrics.classification_report(y, fitted_labels)
    print(report)
    matrix = metrics.confusion_matrix(y, fitted_labels)
    print(matrix)
    joblib.dump(classifier, model_path)
コード例 #7
0
ファイル: sandkasten2.py プロジェクト: jdroenner/fls2
def calcRandomForestClassifier(channels_training, channels_testing,
                               target_training, target_testing):
    """Train a 500-tree forest and predict the test channels.

    max_features is the integer square root of the length of the first
    training row. Returns ``(clf, comp)`` where ``comp`` is the list
    ``[predictions, target_testing, channels_testing]``.
    """
    features_per_split = int(sqrt(len(channels_training[0])))
    forest_model = RandomForestClassifier(n_estimators=500,
                                          max_features=features_per_split)
    forest_model = forest_model.fit(channels_training, target_training)
    predicted = forest_model.predict(channels_testing)
    return forest_model, [predicted, target_testing, channels_testing]
コード例 #8
0
def calc_score(test, train):
    """Return the accuracy of a default random forest trained on ``train``.

    Both inputs are split into features and labels with ``split_data_label``.
    """
    eval_features, eval_labels = split_data_label(test)
    fit_features, fit_labels = split_data_label(train)

    # Train the model, then measure accuracy on the test split.
    model = RandomForestClassifier()
    model.fit(fit_features, fit_labels)
    predicted = model.predict(eval_features)
    return metrics.accuracy_score(eval_labels, predicted)
コード例 #9
0
def _collect_word_features(texts, total):
    """Apply ``words_to_features`` to every text, logging progress every 1000 rows."""
    collected = []
    for line_no, text in enumerate(texts, 1):
        if line_no % 1000 == 0:
            print('Drawfeatures line %d of %d' % (line_no, total))
        collected.append(words_to_features(text))
    return collected


def drawfeature(train_data_path,train_file_name,test_data_path,test_file_name):
    """Build TF-IDF bag-of-words features and write RF / NB predictions to CSV.

    Reads the train and test CSV files (both need a ``text`` column; the
    train file also needs ``label``), vectorises the text, fits a random
    forest and a multinomial naive Bayes model, and writes their test-set
    predictions to ``SENTI_RF.CSV`` and ``SENTI_MNB``.

    The test texts are transformed with the vectorizer and TF-IDF transformer
    fitted on the training data. The original re-fitted the vectorizer on the
    test set and skipped TF-IDF, so test columns did not correspond to the
    features the models were trained on.
    """
    train_file = os.path.join(train_data_path, train_file_name)
    train_data = pd.read_csv(train_file)
    n_data_train = train_data['text'].size
    print('n_data_train is %s' % n_data_train)
    print(type(n_data_train))

    test_file = os.path.join(test_data_path, test_file_name)
    test_data = pd.read_csv(test_file)
    n_data_test = test_data['text'].size
    print('n_data_test is %s' % n_data_test)
    print(type(n_data_test))

    vectorizer = CountVectorizer(analyzer='word', tokenizer=None,
                                 preprocessor=None, stop_words=None,
                                 max_features=5000)
    transformer = TfidfTransformer()

    print('start with words in train data set')
    train_data_words = _collect_word_features(train_data['text'], n_data_train)
    print('start bag of words in train data....')
    train_counts = vectorizer.fit_transform(train_data_words)
    print('start tfidf in train data....')
    train_data_features = transformer.fit_transform(train_counts).toarray()

    # Test-data processing: reuse the fitted vocabulary and IDF weights.
    test_data_words = _collect_word_features(test_data['text'], n_data_test)
    test_counts = vectorizer.transform(test_data_words)
    test_data_features = transformer.transform(test_counts).toarray()

    print('randome forest go...')
    forest = RandomForestClassifier(n_estimators=13)
    forest = forest.fit(train_data_features, train_data['label'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv('SENTI_RF.CSV', index=None, header=None)

    print('naive baby go...')
    mnb = MultinomialNB(alpha=0.01)
    mnb = mnb.fit(train_data_features, train_data['label'])
    pred = mnb.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv('SENTI_MNB', index=None, header=True)
 def RandomForestClassifer(self):
     '''
     Fit a 100-tree random forest on the training frame and predict the test frame.

     Column 0 of the training values is used as the label and the remaining
     columns as features. Predictions are stored on the instance both as an
     array and as a plain list.
     '''
     training_values = self.titanic_train_frame.values
     self.test_Array = self.titanic_test_frame.values
     feature_columns = training_values[:, 1:]
     label_column = training_values[:, 0]
     model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
     model.fit(feature_columns, label_column)
     predictions = model.predict(self.test_Array[:, :])
     self.predicted_probability = predictions
     self.predicted_probability_list = predictions.tolist()
コード例 #11
0
ファイル: randomForest.py プロジェクト: matanaor1/AIProject1
class RFClassifier(super.abstract_classifier):
    """Classifier backed by a random forest with a configurable tree count."""

    def __init__(self, train_features, train_labels, num_of_trees):
        """Keep the training data and create an unfitted forest."""
        self.train_features, self.train_labels = train_features, train_labels
        self.rf_member = RandomForestClassifier(n_estimators=num_of_trees)

    def train(self):
        """Fit the forest on the stored training data."""
        forest = self.rf_member
        forest.fit(self.train_features, self.train_labels)

    def classify(self, newVector):
        """Return the forest's prediction for newVector."""
        forest = self.rf_member
        return forest.predict(newVector)
コード例 #12
0
ファイル: meta.py プロジェクト: asilvaguilherme/catdata
def build_and_test_model(classifier, X, Y, Z, param):
    """Evaluate a classifier with leave-one-out cross-validation.

    classifier: one of "KNN", "RF", "SVM", "NAIVE" or "RANDOM".
    X, Y: features and labels, indexable by the split index arrays.
    Z: reference labels compared with the predictions via the adjusted Rand
       index.
    param: n_neighbors for "KNN", n_estimators for "RF"; unused otherwise.

    Returns (mean accuracy, std accuracy, mean ARI, std ARI) over all folds.
    Raises ValueError for an unknown classifier name; previously this reached
    the metric calls with ``predicted = None`` and failed there.
    """
    supported = ("KNN", "RF", "SVM", "NAIVE", "RANDOM")
    if classifier not in supported:
        raise ValueError("unknown classifier %r; expected one of %s"
                         % (classifier, ", ".join(supported)))

    accuracies = []
    ari = []

    for train, test in LeaveOneOut().split(X):

        X_train, Y_train = X[train], Y[train]
        X_test, Y_test, Z_test = X[test], Y[test], Z[test]

        if classifier == "KNN":
            neigh = KNeighborsClassifier(n_neighbors=param).fit(
                X_train, Y_train)
            predicted = neigh.predict(X_test)

        elif classifier == "RF":
            clf = RandomForestClassifier(n_estimators=param,
                                         random_state=0)
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test)

        elif classifier == "SVM":
            clf = svm.SVC(gamma='scale')
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "NAIVE":
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        else:  # "RANDOM": uniform guess among the labels seen in training
            options = list(set(Y_train))
            predicted = [random.choice(options) for _ in range(len(Y_test))]

        accuracies.append(metrics.accuracy_score(Y_test, predicted))
        ari.append(metrics.adjusted_rand_score(Z_test, predicted))

    return np.mean(accuracies), np.std(accuracies), np.mean(ari), np.std(ari)
コード例 #13
0
def evalOne(enabledColumns):
    """Cross-validate a random forest over the five station groups.

    enabledColumns[i] decides whether all_features[i] is used. Each group is
    held out once. The label of a sample is 0 when its location is in the
    first half (rounded up) of the ordered station list, otherwise 1.
    Returns (f1, accuracy) over the pooled test predictions.
    """
    features = [name for idx, name in enumerate(all_features) if enabledColumns[idx]]
    Y, P = [], []
    n_groups = 5
    for group in range(n_groups):
        testStationList = list(groups[group])
        trainStationList = [station
                            for idx in range(n_groups) if idx != group
                            for station in groups[idx]]

        trainStations = set(float(station) for station in trainStationList)
        # Reorder the train stations to follow the order of all_stations.
        trainStationList = [s for s in all_stations if float(s) in trainStations]
        testStations = set(float(station) for station in testStationList)

        # The labels returned by the split are discarded and rebuilt below.
        trainX, testX, _, _, trainLocation, testLocation = splitDataForXValidationWithLocation(
            trainStations, testStations, "location", data, features, "target")

        # "index < len / 2.0" keeps the first ceil(len / 2) stations.
        train_half = (len(trainStationList) + 1) // 2
        test_half = (len(testStationList) + 1) // 2
        train_lower = [float(s) for s in trainStationList[:train_half]]
        test_lower = [float(s) for s in testStationList[:test_half]]

        trainY = [0 if loc in train_lower else 1 for loc in trainLocation]
        testY = [0 if loc in test_lower else 1 for loc in testLocation]

        model = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=9, n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)

        Y.extend(testY)
        P.extend(predY)

    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
コード例 #14
0
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier

    """
    def __init__(self,
                 categorical_features=None,
                 n_estimators=50,
                 n_jobs=-1,
                 max_depth=10):
        '''
        Create the underlying RandomForestClassifier.

        categorical_features is forwarded to BaseModel; the other arguments
        are forwarded to RandomForestClassifier.

        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        '''
        Hot-encode the training data and fit the underlying model.
        '''
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction

        The confidence is the highest class probability of each sample.
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given dataset using
        the response variable y. It is a measure of the accuracy of the trained model. Usually
        the original dataset should be split into training and testing subsets to cross validate
        the model.
        '''
        # Encode X exactly as predict() does; the model was fitted on
        # hot-encoded features, so scoring raw X would be inconsistent.
        X = self.hot_encode_predict(X)
        return self.model.score(X, y)
コード例 #15
0
def classic_model(image_dir, image_lists, method):
    """Train a 1000-tree forest on training+validation images and test it.

    Prints the confusion table of actual vs. predicted classes for the
    testing images and returns the test accuracy.
    """
    train_X, train_y = get_X_y(image_dir, image_lists,
                               ['training', 'validation'], method)
    model = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    model.fit(train_X, train_y)

    X_test, y_test = get_X_y(image_dir, image_lists, ['testing'], method)
    predicted = model.predict(X_test)
    table = pandas.crosstab(y_test, predicted,
                            rownames=['Actual Class'],
                            colnames=['Predicted Class'])
    print(table)
    return accuracy_score(y_test, predicted)
コード例 #16
0
ファイル: pscvread.py プロジェクト: coreyabshire/color-names
def just_pred(x, y):
    """Fit a default random forest on a random 70/30 split of (x, y).

    x and y are expected to be pandas objects with the same number of rows
    (the original indexed them with ``.ix``). Returns ``(ytest, ypred)``.

    Fixes over the original: the shuffled index was computed but never used,
    so the split was not random; label-inclusive ``.ix[:trainlen]`` put row
    ``trainlen`` in both train and test; ``.ix`` no longer exists in current
    pandas.
    """
    xlen = len(x)
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    # A random permutation of row positions gives disjoint, shuffled subsets.
    order = np.random.permutation(xlen)
    train_rows, test_rows = order[:trainlen], order[trainlen:]
    xtrain, ytrain = x.iloc[train_rows], y.iloc[train_rows]
    xtest, ytest = x.iloc[test_rows], y.iloc[test_rows]
    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
コード例 #17
0
def just_pred(x, y):
    """Train a default random forest on 70% of the rows and predict the rest.

    x and y are expected to be pandas objects of equal length. Rows are
    assigned to the train and test subsets by a random permutation of their
    positions. Returns ``(ytest, ypred)``.

    The original shuffled an index it never used and sliced with the
    label-inclusive (and since removed) ``.ix``, so row ``trainlen`` appeared
    in both subsets; positional ``.iloc`` indexing avoids both problems.
    """
    xlen = len(x)
    trainlen = int(0.7 * xlen)
    shuffled = np.random.permutation(xlen)
    train_idx = shuffled[:trainlen]
    test_idx = shuffled[trainlen:]
    xtrain = x.iloc[train_idx]
    ytrain = y.iloc[train_idx]
    xtest = x.iloc[test_idx]
    ytest = y.iloc[test_idx]
    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
コード例 #18
0
def random_forest(profile, group, n_tree, search_number, avg_acc):
    '''
    Model the abundance table with random forests over random seeds.
    :param profile: abundance table
    :param group: grouping table
    :param n_tree: number of trees in the model
    :param search_number: number of random seeds to try
    :param avg_acc: output file for the per-seed average accuracy
    :return: group with a label column added
    '''
    # Map every distinct value of the first group column to an integer label.
    label_dict = {name: idx for idx, name in enumerate(set(group.iloc[:, 0]))}
    group['label'] = [label_dict[group.loc[sample].values[0]]
                      for sample in group.index]

    rounds_done = 0
    with open(avg_acc, 'w') as out:
        out.write('random_state\tavgAcc\n')
        while rounds_done < search_number:
            print('现在循环次数为{0}'.format(rounds_done+1))
            # Draw a random seed for this round.
            seed = round(random() * 10000)

            rf = RandomForestClassifier(n_estimators=n_tree, max_leaf_nodes=3,
                                        random_state=seed)

            scores = []
            for _ in range(10):
                # 30 random samples train the model; the rest validate it.
                train_ids = list(profile.sample(n=30).index)
                val_ids = list(set(profile.index).difference(train_ids))

                rf.fit(profile.loc[train_ids], group['label'][train_ids])
                predicted = rf.predict(profile.loc[val_ids])

                scores.append(metrics.accuracy_score(y_true=group['label'][val_ids],
                                                     y_pred=predicted))

            out.write('{0}\t{1}\n'.format(seed, sum(scores) / 10))
            rounds_done += 1
    return group
コード例 #19
0
def decision_frist():
    """Compare a small decision tree and a default random forest on iris.

    For each model, prints the test-set predictions followed by the test
    score on a 75/25 split.
    """
    iris = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris["data"], iris["target"], test_size=0.25)

    def fit_and_report(model):
        # Fit on the training split, then print predictions and score.
        model.fit(X_train, y_train)
        print(model.predict(X_test))
        print(model.score(X_test, y_test))

    fit_and_report(DecisionTreeClassifier(max_leaf_nodes=3))
    fit_and_report(RandomForestClassifier())
コード例 #20
0
def crossval(x, y, k=5):
    """Repeat a random 70/30 split k times and print each round's predictions.

    x and y are expected to be pandas objects with the same number of rows.
    Every round fits a fresh default random forest on the training part and
    prints its predictions for the held-out part.

    Fixes over the original: ``len(X)`` referenced an undefined name; the
    loop variable was overwritten by the index list; the shuffle result was
    never used, so all rounds used the same split; ``.ix[:trainlen]`` is
    label-inclusive, which put one row in both subsets.
    """
    xlen = len(x)
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    for _ in range(k):
        # New random permutation each round gives a different split.
        order = np.random.permutation(xlen)
        train_rows, test_rows = order[:trainlen], order[trainlen:]
        xtrain, ytrain = x.iloc[train_rows], y.iloc[train_rows]
        xtest = x.iloc[test_rows]
        rf = RandomForestClassifier()
        rf.fit(xtrain, ytrain)
        ypred = rf.predict(xtest)
        print(ypred)
コード例 #21
0
ファイル: pscvread.py プロジェクト: coreyabshire/color-names
def crossval(x, y, k=5):
    """Print random-forest predictions for k independent random 70/30 splits.

    x and y are expected to be pandas objects of equal length; rows are
    selected by position. A new default random forest is trained per round.

    The original could not run: it called ``len(X)`` on an undefined name,
    reused the loop variable for the index list, ignored the shuffled index
    and sliced with label-inclusive ``.ix`` (train/test overlap of one row).
    """
    n_rows = len(x)
    n_train = int(0.7 * n_rows)
    for _ in range(k):
        shuffled = np.random.permutation(n_rows)
        train_idx = shuffled[:n_train]
        test_idx = shuffled[n_train:]
        xtrain = x.iloc[train_idx]
        ytrain = y.iloc[train_idx]
        xtest = x.iloc[test_idx]
        rf = RandomForestClassifier()
        rf.fit(xtrain, ytrain)
        ypred = rf.predict(xtest)
        print(ypred)
コード例 #22
0
def predict2(text):
    """Predict the complaint category of ``text``.

    Loads ``dataset/complain.json`` under BASE_DIR, trains a bag-of-words
    random forest on the cleaned title and complaint text of every record,
    and returns the forest's prediction for the cleaned ``text``. The model
    is retrained on every call.
    """
    # Read the dataset.
    dataset_file = os.path.join(BASE_DIR, 'dataset', 'complain.json')
    with open(dataset_file) as handle:
        records = json.load(handle)

    # Cleaned training documents and their target categories.
    train_documents = []
    train_categories = []
    for record in records:
        document = '%s %s' % (record['title'], record['complain'])
        train_documents.append(clean_data(document))
        train_categories.append(record['category'])

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=500)
    train_matrix = vectorizer.fit_transform(train_documents).toarray()

    # Random forest with 100 trees, fitted on the bag-of-words features.
    # This may take a few minutes to run.
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_matrix, train_categories)

    # Vectorise the single query with the vocabulary learned above.
    query_documents = [clean_data(text)]
    query_matrix = vectorizer.transform(query_documents).toarray()

    return forest.predict(query_matrix)
コード例 #23
0
ファイル: dataframes.py プロジェクト: dmbrdev/doutorado
def forestPredict(columName, features, trees):
    """Predict match results round by round and save them to CSV.

    For every championship id in 1..90 except 11..20, and for each round
    from 2 up to the inferred number of rounds, a random forest is trained
    on all earlier rounds of that championship and used to predict the
    current round. Predictions are written to the ``pred`` column and the
    frame is saved as ``pred_<columName>.csv`` under path.NOTEBOOKS_DATA.

    columName: suffix used in the output file name.
    features: list of feature column names.
    trees: number of trees per forest.
    """
    pd.options.mode.chained_assignment = None

    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features2.csv')
    df['pred'] = ""
    df = df.set_index([df.m_match_id])

    target = 'm_column_result'

    for champId in range(1, 91):

        champ = df[(df.m_championship_id == champId)]
        print(champId)

        if 11 <= champId <= 20:
            continue

        # Number of rounds is inferred from the number of matches.
        if len(champ) == 380:
            rd = 38
        elif len(champ) == 306:
            rd = 34
        else:
            rd = 30

        for mid in range(2, rd + 1):

            train = champ[champ.m_match_group_num < mid]
            test = champ[champ.m_match_group_num == mid]

            X = train[features]
            y = train[target]
            Z = test[features]

            clf = RandomForestClassifier(n_estimators=trees, max_features=None)
            clf.fit(X, y)

            pred = clf.predict(Z)

            # DataFrame.set_value was removed from pandas; .at is the
            # documented scalar setter.
            for i, p in zip(Z.index, pred):
                df.at[i, 'pred'] = p

    nameFile = 'pred_' + columName + ".csv"

    df.to_csv(path.NOTEBOOKS_DATA + nameFile, index=False)
コード例 #24
0
def predict(text, dataset_file_path):
    """Classify ``text`` with a random forest trained on a ';'-separated file.

    Each row of the dataset file supplies the raw text in column 0 and its
    target label in column 1. The text is cleaned with ``clean_data``,
    vectorised as a bag of words (at most 500 features) and used to train a
    100-tree random forest, which then predicts the label of ``text``.

    Returns the array produced by ``forest.predict`` for the single input.
    """
    clean_train_data = []
    target_data = []

    # The context manager closes the file; the original leaked the handle.
    with open(dataset_file_path, 'r') as data_file:
        reader = csv.reader(data_file, delimiter=';', quoting=csv.QUOTE_NONE)
        for line in reader:
            clean_train_data.append(clean_data(line[0]))
            target_data.append(line[1])

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=500)

    train_data_feature = vectorizer.fit_transform(clean_train_data).toarray()

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    # Fit the forest to the training set, using the bag of words as
    # features and the target labels as the response variable.
    #
    # This may take a few minutes to run
    forest = forest.fit(train_data_feature, target_data)

    # Vectorise the query with the vocabulary learned from the training data.
    clean_test_complain = [clean_data(text)]
    test_data_features = vectorizer.transform(clean_test_complain).toarray()

    return forest.predict(test_data_features)
 def plot_rf(self):
     """Plot the dataset points and the random forest decision regions.

     Rows of self.dataset are read as (A, B, label). A forest with
     self.bestScoreN trees is fitted on self.X / self.Y and evaluated on a
     grid from -0.2 up to (but excluding) 4.4 in steps of 0.2 on both axes.
     """
     label0_rows = [row for row in self.dataset if row[2] == 0]
     label1_rows = [row for row in self.dataset if row[2] == 1]
     A0 = [row[0] for row in label0_rows]
     A1 = [row[0] for row in label1_rows]
     B0 = [row[1] for row in label0_rows]
     B1 = [row[1] for row in label1_rows]

     axis_x = np.arange(-0.2, 4.4, 0.2)
     axis_y = np.arange(-0.2, 4.4, 0.2)
     grid_x, grid_y = np.meshgrid(axis_x, axis_y)

     model = RandomForestClassifier(n_estimators=self.bestScoreN).fit(self.X, self.Y)
     grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
     regions = model.predict(grid_points).reshape(grid_x.shape)

     points0 = plt.scatter(A0, B0, marker='+', color='red')
     points1 = plt.scatter(A1, B1, marker='o', color='green')
     plt.legend((points0, points1), ('label 0', 'label 1'), scatterpoints=1)
     plt.xlabel('A')
     plt.ylabel('B')
     plt.title("RF Classifier")
     plt.contourf(grid_x, grid_y, regions, alpha=0.5)
     plt.show()
コード例 #26
0
ファイル: dataframes.py プロジェクト: dmbrdev/doutorado
def forestPredict4():
    """Predict match results in chronological blocks of ten matches.

    Championships are processed in groups of ten ids (1-10, 11-20, ...,
    71-80). Within a group the matches are sorted by date; every block of
    ten matches is predicted by a 1000-tree forest trained on all earlier
    matches of the group. Predictions go to the ``rf1000_fs4`` column and the
    frame is written back to ``features3.csv`` under path.NOTEBOOKS_DATA.
    """
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features3.csv')

    df.index = df.m_match_id
    df['rf1000_fs4'] = ""
    pd.options.mode.chained_assignment = None

    # Loop-invariant column selections.
    features = ['m_odd_home','m_odd_away','m_odd_underdog',
                'm_odd_favorite','m_odd_draw','m_odd_medium',
                'a_goals_for_mean','h_goals_for_mean']
    target = 'm_column_result'

    for t1 in range(11, 91, 10):

        print(t1)

        in_group = (df.m_championship_id < t1) & (df.m_championship_id >= t1 - 10)
        champ = df[in_group].sort_values(['m_match_date'])

        for t2 in range(10, len(champ), 10):

            train = champ[0:t2]
            test = champ[t2:t2 + 10]

            X = train[features]
            y = train[target]
            Z = test[features]

            clf = RandomForestClassifier(n_estimators=1000)
            clf.fit(X, y)

            pred = clf.predict(Z)

            # DataFrame.set_value was removed from pandas; use .at instead.
            for t3, p in zip(Z.index, pred):
                df.at[t3, 'rf1000_fs4'] = p

    df.to_csv(path.NOTEBOOKS_DATA + 'features3.csv', index=False)
コード例 #27
0
def RandomForestIndependent():
    """Train a 10-tree random forest on the pickled data and save it.

    Reads ``X.p`` and ``Y.p``, shuffles them through train_test_split with
    ``test_size=0.0``, fits the forest, prints the training-set
    classification report and pickles the fitted model to ``rf.p``.
    """
    # NOTE: pickle.load runs arbitrary code from the file; only use with
    # trusted, locally produced pickles. Context managers close the handles
    # the original left open.
    with open('X.p', 'rb') as x_file:
        X = pickle.load(x_file)
    with open('Y.p', 'rb') as y_file:
        Y = pickle.load(y_file)

    print('****  *****')

    rf = RandomForestClassifier(n_estimators=10)
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.0,
                                                        random_state=3)

    rf.fit(X_train, Y_train)
    yp = rf.predict(X_train)
    print('**** Training *****')
    print(classification_report(Y_train, yp))
    # Evaluation on the test split is disabled in the original:
    #   yp = rf.predict(X_test)
    #   print('**** Testing *****')
    #   print(classification_report(Y_test, yp, digits=6))

    with open('rf.p', 'wb') as model_file:
        pickle.dump(rf, model_file)
コード例 #28
0
ファイル: rf.py プロジェクト: makeling/antares
class Model(BaseModel):
    '''
    Random forest classifier wrapper with joblib persistence.
    '''

    def __init__(self, path):
        '''
        Store the path and create a 150-tree forest that uses 8 jobs.
        '''
        self.model_name = 'rf'
        self.path = path
        self.model = RandomForestClassifier(n_estimators=150, n_jobs=8)

    def fit(self, X, y):
        '''
        Fit the underlying forest on X and y.
        '''
        forest = self.model
        forest.fit(X, y)

    def predict(self, X):
        '''
        Return the underlying forest's predictions for X.
        '''
        forest = self.model
        return forest.predict(X)

    def save(self, filepath):
        '''
        Persist the trained model to "<model_name>.pkl" resolved by create_filename.
        '''
        pickle_name = '%s.pkl' % self.model_name
        target = create_filename(filepath, pickle_name)
        joblib.dump(self.model, target)

    def load(self, filepath):
        '''
        Replace the model with the one stored in "<model_name>.pkl".
        '''
        pickle_name = '%s.pkl' % self.model_name
        source = create_filename(filepath, pickle_name)
        self.model = joblib.load(source)

    def score(self, X, y):
        '''
        Return the underlying forest's score for X against y.
        '''
        forest = self.model
        return forest.score(X, y)
コード例 #29
0
ファイル: dataframes.py プロジェクト: dmbrdev/doutorado
def forestPredict7030(columName, features, trees):
    """Predict match results with a per-group train/test split by championship id.

    Championships are processed in groups of ten ids (1-10, 11-20, ...,
    71-80); the group 11-20 is skipped. Within a group, championships with
    id <= upper bound - 4 train a random forest that predicts the remaining
    ones. Predictions are written to the ``pred`` column and the frame is
    saved as ``pred_<columName>.csv`` under path.NOTEBOOKS_DATA.

    columName: suffix used in the output file name.
    features: list of feature column names.
    trees: number of trees in each forest.
    """
    pd.options.mode.chained_assignment = None

    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features2.csv')
    df['pred'] = ""
    df = df.set_index([df.m_match_id])

    target = 'm_column_result'

    for champId in range(11, 91, 10):

        champ = df[(df.m_championship_id < champId) & (df.m_championship_id >= champId - 10)]

        print(champId)

        if champId == 21:
            continue

        # Build the masks from champ itself; the original used masks computed
        # on the full df, which are not aligned with champ's rows.
        train = champ[champ.m_championship_id <= champId - 4]
        test = champ[champ.m_championship_id > champId - 4]

        X = train[features]
        y = train[target]
        Z = test[features]

        clf = RandomForestClassifier(n_estimators=trees, max_features=None)
        clf.fit(X, y)

        pred = clf.predict(Z)

        # DataFrame.set_value was removed from pandas; .at is the
        # documented scalar setter.
        for i, p in zip(Z.index, pred):
            df.at[i, 'pred'] = p

    nameFile = 'pred_' + columName + ".csv"

    df.to_csv(path.NOTEBOOKS_DATA + nameFile, index=False)
コード例 #30
0
def main():
    """Train a random forest on ``cora.content`` and report metrics.

    The tab-separated file is read as: an id column, the feature columns and
    a final class column. Prints macro precision / recall / F-score, the
    accuracy on a 30% hold-out, and the 20 most important feature names.
    """
    # The file is read without a header row so the first record is kept as
    # data; the original let pandas consume it as column names, which were
    # then overwritten below. Assumes cora.content has no header line, as in
    # the standard Cora distribution -- TODO confirm for this copy.
    df = pd.read_csv("cora.content", sep="\t", header=None)

    feats = np.array(["feat_" + str(i) for i in range(df.shape[1] - 2)])
    df.columns = ["id"] + list(feats) + ["class"]

    x_train, x_test, y_train, y_test = train_test_split(
        df[feats], df["class"], test_size=0.3
    )

    clf = RandomForestClassifier(n_estimators=200)
    clf.fit(x_train, y_train)

    # Feature names ordered from most to least important.
    importances = clf.feature_importances_
    x_sorted = sorted(zip(feats, importances), key=lambda pair: -pair[1])

    # Statistics
    y_pred = clf.predict(x_test)
    precision, recall, fscore, _ = score(y_test, y_pred, average="macro")

    print("Precision:", round(precision, 3))
    print("Recall:   ", round(recall, 3))
    print("F-Score:  ", round(fscore, 3))
    print("Accuracy: ", round((y_pred == y_test).sum() / len(y_pred), 3))

    selected_feats = [key for key, val in x_sorted[:20]]
    print(selected_feats)
コード例 #31
0
class StackingFusion(FusionStrategy):
    '''
    The StackingFusion learns a fusion strategy from training data.
    A classifier is trained that uses the posterior probabilities from all
    microphones in the sensor network as input features.
    '''
    def __init__(self, channel_sort=None):
        '''
        Constructor
        @param channel_sort: An object of type ChannelSortStrategy. When
                             omitted (or None) a fresh ChannelSortNone is
                             created for this instance, so instances never
                             share one default object.
        '''
        self.stacked_classifier = None
        if channel_sort is None:
            channel_sort = ChannelSortNone()
        self.channel_sort = channel_sort

    def train(self, log_probs, labels):
        '''
        Train the stacked classifier
        @param log_probs: list of probability matrices (channels, label)
        @param labels: label for each feature-vector
        '''
        # Single-argument print behaves identically on Python 2 and Python 3.
        print('Train stacked classifier with %d windows' % labels.shape[0])

        # Sort the channels of every matrix, then stack all matrices row-wise
        # into one training matrix.
        sorted_probs = [self.channel_sort.sort(f) for f in log_probs]
        training_matrix = np.vstack(sorted_probs)

        # TODO: classifier as Parameter
        self.stacked_classifier = RandomForestClassifier(n_estimators=10)
        self.stacked_classifier.fit(training_matrix, labels)

    def apply(self, log_probs):
        '''
        Apply fusion strategy to classifier probabilities
        @param log_probs: log probabilities for each channel and class in shape (channel, class)
        @return: Class index for the predicted class
        '''
        # train() must have been called first; stacked_classifier is None
        # until then.
        log_probs = self.channel_sort.sort(log_probs)
        # return the classindex as a scalar not as an array
        return self.stacked_classifier.predict(log_probs)[0]
コード例 #32
0
ファイル: dbscan.py プロジェクト: korvin14/twitterFollowBack
def _split_inputs_and_labels(rows, indices):
    """Return ``(inputs, labels)`` for the rows of *rows* picked by *indices*.

    Columns 0-5 of every selected row form the input vector; column 6 is the
    label.
    """
    inputs = [[rows[i][col] for col in range(6)] for i in indices]
    labels = [rows[i][6] for i in indices]
    return inputs, labels


# Fit and score the random forest on every fold of kf, accumulating the
# per-fold precision, recall and F1 into the rfAvg* totals.
rfFoldCount = 0
for train, test in kf:
    x_train, y_train = _split_inputs_and_labels(features, train)
    x_test, y_test = _split_inputs_and_labels(features, test)

    rf.fit(x_train, y_train)
    rfPredTest = rf.predict(x_test)
    rfPrecisionTest = precision_score(y_test, rfPredTest)
    rfRecallTest = recall_score(y_test, rfPredTest)
    rfF1Test = f1_score(y_test, rfPredTest)
    rfAvgPrecision += rfPrecisionTest
    rfAvgRecall += rfRecallTest
    rfAvgF1 += rfF1Test
    rfFoldCount += 1

# Single-argument prints produce the same text on Python 2 and Python 3.
print("RF completed in  {}  s".format(time.time() - start))

# Average over the folds that actually ran instead of assuming exactly five;
# the max() guard avoids a division by zero when kf yields nothing.
rfFolds = float(max(rfFoldCount, 1))
print("rf:\n Precision {}\n Recall {}\n F1 {}\n".format(
    rfAvgPrecision / rfFolds, rfAvgRecall / rfFolds, rfAvgF1 / rfFolds))

 


コード例 #33
0
# Training, testing and evaluation over the folds produced by k_fold.
# NOTE(review): i, iteracao and the aux_* / conf_matrices accumulators are
# read and updated here but initialised outside this fragment -- confirm.
print('Iniciando o k-Fold...')
for train_index, test_index in k_fold.split(tf_idf):
    # Slice the feature matrix and the labels with this fold's index arrays
    x_train, x_test = tf_idf[train_index], tf_idf[test_index]
    y_train, y_test = classes[train_index], classes[test_index]

    # Train the model; the forest is seeded with iteracao
    print(f'Gerando o Modelo {i}...')
    classifier = RandomForestClassifier(n_estimators=10,
                                        criterion='gini',
                                        random_state=iteracao).fit(
                                            x_train, y_train)

    # Classify the test set
    y_pred = classifier.predict(x_test)

    # Performance metrics: sums over folds, averaged after the loop
    aux_accuracy += accuracy_score(y_test, y_pred)
    aux_f1_score += f1_score(y_test, y_pred)
    aux_precision += precision_score(y_test, y_pred)
    aux_recall += recall_score(y_test, y_pred)
    conf_matrices += np.asarray(confusion_matrix(y_test, y_pred))

    print(f'Modelo {i} finalizado e avaliado.')
    i += 1

# Results: each accumulated sum divided by the number of folds
print(f'\nITERATION #{iteracao} -----------------------')
print(f'Accuracy = {aux_accuracy / k_fold.n_splits}')
print(f'F1 Score = {aux_f1_score / k_fold.n_splits}')
コード例 #34
0
# Hold out 40% of the rows for testing; the split is seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(authorship[features], authorship.Author_num.values, test_size=0.4, random_state=123)


# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20, max_depth=10, verbose=1)
etclf.fit(x_train, y_train)

# Confusion matrix. scikit-learn's signature is confusion_matrix(y_true,
# y_pred); passing the predictions first yields the transposed matrix.
# The bare expressions below only display a result in an interactive
# session / notebook cell.
metrics.confusion_matrix(y_test, etclf.predict(x_test))


# Import from the public package path; the private sklearn.ensemble.forest
# module is not part of the supported API.
from sklearn.ensemble import RandomForestClassifier

rdclf = RandomForestClassifier(n_estimators=20, max_depth=10)
rdclf.fit(x_train, y_train)
metrics.confusion_matrix(y_test, rdclf.predict(x_test))


from sklearn.ensemble import AdaBoostClassifier

adaclf = AdaBoostClassifier(n_estimators=20)
adaclf.fit(x_train, y_train)
metrics.confusion_matrix(y_test, adaclf.predict(x_test))




# The three matrices again, side by side, for comparison.
metrics.confusion_matrix(y_test, etclf.predict(x_test))
metrics.confusion_matrix(y_test, rdclf.predict(x_test))
metrics.confusion_matrix(y_test, adaclf.predict(x_test))
コード例 #35
0
ファイル: TJ_2.py プロジェクト: julbright/tj_research
  y_predict = m.predict(X_test)
  fpr, tpr, thresh = roc_curve(y_test, y_predict, pos_label=1)
  auc = roc_auc_score(y_test, y_predict)

  print 'AUC: ', auc
  print 'Percentage of players that will have TJ in 2014: ',np.mean(y_predict)
  return fpr, tpr, auc


# Evaluate both model classes; evaluate_model receives the class itself and
# returns the ROC curve points and the AUC.
rf_fpr, rf_tpr, rf_auc = evaluate_model(RandomForestClassifier)
svc_fpr, svc_tpr, svc_auc = evaluate_model(SVC)


# Fit a 10-tree forest on X / y and store its predictions for the rows of
# predict_players in a new 'predictions' column, then dump the frame to CSV.
RFC2 = RandomForestClassifier(n_estimators = 10)
RFC2.fit(X, y)
predict_players['predictions']=RFC2.predict(predict_players[X_cols])
predict_players.to_csv('testing.csv')

print 'Players that RF thinks will have TJ in 2014', predict_players['m1_name'][predict_players['predictions']==1]

# Names of the players predicted as class 1
the_doomed = predict_players['m1_name'][predict_players['predictions']==1]
# NOTE(review): the path uses Windows separators and is relative to the
# working directory -- it will not resolve on other platforms.
injuries2014 = pd.read_csv('.\\intermediate data\\injuries2014.csv')

# Check each predicted player against the injury file. The membership test
# runs over every cell of injuries2014.values, not over one name column.
for each_doomed_person in the_doomed.values:
  if each_doomed_person in injuries2014.values:
    print each_doomed_person, 'has in fact undergone TJ in 2014!'
  else:
    print each_doomed_person, "did not end up having TJ in 2014..."


for each_injured_person in injuries2014[injuries2014.columns[1]].values:
コード例 #36
0
# Download the file (urllib.urlopen is the Python 2 API)
raw_data=urllib.urlopen(url)

# Load the data and name its nine columns; the last one is the label.
# reset_index() turns the index created by from_csv back into a regular
# column so that all nine names can be assigned.
# NOTE(review): from_csv also treats the first line as a header by default --
# confirm the downloaded file really has one, otherwise a record is lost.
feature_names=["times pregnant", "plasma glucose conc.", "distolic blood pressure (mm Hg)", "triceps skin fold thickness (mm)", "2-hour serum insulin (mu U/ml)", "body mass index (kg/m^2)", "diabetes pedigree function", "age (years)", "target"]
dataset=pd.DataFrame.from_csv(raw_data)
dataset=dataset.reset_index()
dataset.columns=feature_names

# Split into train and test set (70% / 30%)
train, test=train_test_split(dataset, test_size=0.3)

# Normalize data.
# NOTE(review): scale() is applied to every column, including 'target', and
# the test set is standardised with its own statistics rather than the
# training set's. The labels are recovered further down only via astype(int)
# on the scaled target -- confirm this is intended.
df_scaled_train=pd.DataFrame(preprocessing.scale(train), columns=feature_names)
df_scaled_test=pd.DataFrame(preprocessing.scale(test), columns=feature_names)

model=RandomForestClassifier(n_estimators = 100, oob_score = True, random_state =10, max_features = "auto", min_samples_leaf = 20)

# Train the model on the eight feature columns ('times pregnant' through
# 'age (years)', selected by label slice) against the integer-cast target.
# Two errors this call is written to avoid:
# - "DataConversionWarning: A column-vector y was passed when a 1d array was
#   expected": y had shape (n, 1); pass a 1-d array (e.g. via ravel()).
# - "ValueError: Unknown label type": float labels are not a valid label type;
#   convert them to int or str first.
model.fit(df_scaled_train.ix[:,'times pregnant':'age (years)'], np.asarray(df_scaled_train.ix[:,'target'].astype(int)))
print "Accuracy:", model.score(df_scaled_test.ix[:,'times pregnant':'age (years)'], np.asarray(df_scaled_test.ix[:,'target'].astype(int)))

# Predict output for the test features
predicted=model.predict(df_scaled_test.ix[:,'times pregnant':'age (years)'])
print predicted
コード例 #37
0
ファイル: tmp.py プロジェクト: Daiver/HRandomForest
from sklearn.ensemble.forest import RandomForestClassifier

def read(fname):
    """Parse a whitespace-separated sample file into ``(labels, data)``.

    Each non-blank line yields one sample: the last field is the integer
    label and every field except the last two is converted to float as the
    feature vector. Blank lines are skipped.

    Returns a tuple ``(labels, data)`` of two parallel lists; every entry of
    ``data`` is a concrete list of floats.
    """
    labels, data = [], []
    with open(fname) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line carries no sample.
                continue
            labels.append(int(fields[-1]))
            # NOTE(review): [:-2] drops the second-to-last field as well as
            # the label -- confirm that column is meant to be ignored.
            # list() materialises the row: on Python 3 map() is lazy and the
            # rows would otherwise be one-shot iterators.
            data.append(list(map(float, fields[:-2])))
    return labels, data

# read() returns (labels, data): index 0 holds the labels, index 1 the
# feature rows.
trainset = read('./trainset')
testset  = read('./testset')

# Fit a 10-tree forest on the training rows, then print the predictions for
# the test rows followed by the true test labels for visual comparison.
clf = RandomForestClassifier(n_estimators=10)
clf.fit(trainset[1], trainset[0])
print clf.predict(testset[1])
print testset[0]
コード例 #38
0
                       columns=X_train.columns)

#:# model

# Hyper-parameters of the audited model; they also determine str(classifier),
# which is what the md5 below is computed from.
params = {'max_depth': 3, 'n_estimators': 75}

classifier = RandomForestClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# 5475503c9e4b64dc0dcc4960399cf72c
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
# The test features go through transform_pipeline before prediction.
# NOTE(review): X_train is presumably already transformed above this
# fragment -- confirm.
y_pred = classifier.predict(transform_pipeline.transform(X_test))
# Second column of predict_proba, used as the score for the AUC
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

# Unpack the flattened confusion matrix in the order the code assumes for a
# two-class problem.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
print(f'precision: {precision_score(y_test, y_pred)}')
print(f'recall: {recall_score(y_test, y_pred)}')
print(f'specificity: {tn/(tn+fp)}')
print(f'f1: {f1_score(y_test, y_pred)}')

#:# session info

# Add the Python version to the session info
コード例 #39
0
	def runns(resp_var, size_of_test_data,dataset,positive_class,predictor_var, n_estimators,important_features,dealing_with_nulls):
		"""Train and evaluate a random forest on the most important predictors.

		Rows with a null response are dropped, remaining nulls are handled by
		deal_with_nulls(), the predictors are one-hot encoded and the classes
		are balanced by random over-sampling. A first forest ranks the
		predictors; the top ``important_features`` of them are kept and a second
		forest is trained and scored on a seeded train/test split.

		resp_var: name of the response column.
		size_of_test_data: test-set size, passed through float() to train_test_split.
		dataset: input DataFrame (currently overwritten, see the NOTE below).
		positive_class: label used as pos_label by the binary metrics.
		predictor_var: column selection used as predictors.
		n_estimators: number of trees in both forests.
		important_features: how many top-ranked predictors to keep.
		dealing_with_nulls: strategy forwarded to deal_with_nulls().

		Returns a JSON string mapping metric names to their values.
		"""
		# NOTE(review): this debug override makes the function ignore its
		# `dataset` argument. It is kept so current callers see no change, but
		# it should be removed once callers are confirmed to pass a DataFrame.
		dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
		#----DATA PREPROCESSING
		#-------dealing with NULL values in the data
		#----------remove the rows in which the response is null
		dataset=dataset.dropna(subset=[resp_var])
		#----------dealing with nulls
		dataset=deal_with_nulls(dealing_with_nulls,dataset)
		#----FEATURE SELECTION
		#-------get predictors important in predicting the response
		#-----------transform categorical predictors to dummy variables
		predictors=pd.get_dummies(dataset[predictor_var])
		#-----------balance the classes in the response var
		# NOTE(review): fit_sample is the legacy imbalanced-learn name of
		# fit_resample -- confirm the installed version still provides it.
		ros = RandomOverSampler(random_state=0)
		prds, resp = ros.fit_sample(predictors, dataset[resp_var])
		#-----------fit the random forest classifier to give us the important predictors
		rf_clf = RandomForestClassifier(n_estimators=n_estimators)
		rf_clf.fit(prds,resp)
		#-------rank the predictors by importance, most important first
		feature_imp = pd.Series(rf_clf.feature_importances_,
						index=list(predictors.columns)).sort_values(ascending=False)
		#-------names of the important predictors
		important_predictor_names = feature_imp.index[0:important_features]
		#-------subset the data to get only the important predictors and the response
		# The selection is applied here; previously the names were computed but
		# every predictor was kept, so important_features had no effect.
		resp=pd.DataFrame(data=resp,columns=[resp_var])
		predictors=pd.DataFrame(prds,columns=list(predictors))[important_predictor_names]
		dataset=pd.concat([resp,predictors],axis=1)
		#---------------------------------------------------------
		#----MODEL TRAINING
		#--------Remove the response variables from the features variables - axis 1 refers to the columns
		m_data= dataset.drop(resp_var, axis = 1,inplace=False)
		# Response variables are the values we want to predict
		labels = np.array(dataset[resp_var])
		# Convert the features to a numpy array
		features = np.array(pd.get_dummies(m_data))

		# Split the data into training and testing sets
		train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = float(size_of_test_data), random_state = 402)

		# Instantiate model with n_estimators decision trees
		clf = RandomForestClassifier(n_jobs = 1,n_estimators = n_estimators, random_state = 142)

		# Train the model on training data
		clf.fit(train_features, train_labels)
		# evaluation
		predicted = clf.predict(test_features)
		pred_prob = clf.predict_proba(test_features)

		accuracy = accuracy_score(test_labels, predicted)
		#confusion matrix (computed for inspection; not part of the output)
		cnf = (confusion_matrix(test_labels,predicted))
		#precision score
		precision = precision_score(test_labels,predicted,pos_label=positive_class)
		#avg precision; column 1 of predict_proba is used as the positive-class score
		# NOTE(review): this assumes positive_class is clf.classes_[1] -- confirm.
		avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
		#recall score
		rec = recall_score(test_labels,predicted,pos_label=positive_class)
		#f1 score
		fscore = f1_score(test_labels,predicted,pos_label=positive_class)
		#fbeta score, on the same positive class as precision/recall/f1
		fbeta = fbeta_score(test_labels,predicted,beta=0.5,pos_label=positive_class)
		#hamming_loss
		hamming = hamming_loss(test_labels,predicted)
		#jaccard similarity score
		jaccard = jaccard_similarity_score(test_labels,predicted)
		#logloss is defined on predicted probabilities, not on hard labels
		logloss = log_loss(test_labels,pred_prob,labels=clf.classes_)
		#zero-oneloss
		zero_one = zero_one_loss(test_labels,predicted)
		#auc roc
		area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
		#cohen_score
		cohen = cohen_kappa_score(test_labels,predicted)
		#mathews corr
		mathews = matthews_corrcoef(test_labels,predicted)
		output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
		output=json.dumps(output)
		return output
コード例 #40
0
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.metrics.classification import classification_report
import pandas as pd
__author__ = 'semyon'


print("reading")
csv = pd.read_csv("data/train.csv")

print("slicing")
# .loc slices by label and includes both end points, exactly like the .ix
# call it replaces; .values replaces as_matrix(). Both older spellings were
# removed from pandas, while .loc / .values also exist in old releases.
train_features = csv.loc[:, 'x23':'x61'].fillna(0).values
train_true = csv['y'].tolist()

# The first TRAIN_ROWS rows are used for fitting, the remainder for validation.
TRAIN_ROWS = 35000

trtrfe = train_features[:TRAIN_ROWS, :]
trtrtrue = train_true[:TRAIN_ROWS]

trtefe = train_features[TRAIN_ROWS:, :]
trtetrue = train_true[TRAIN_ROWS:]

print("learning")

# Exhaustive grid over tree depth and minimum leaf size; one classification
# report is printed per combination, in loop order.
for depth in [7, 10, 12, 15, 20, 30, 50, 70]:
    for leaf_samples in [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 20, 40, 60, 150]:
        # model = GradientBoostingClassifier(n_estimators=10, max_depth=depth, min_samples_leaf=leaf_samples, verbose=1)
        model = RandomForestClassifier(n_estimators=50, max_depth=depth, min_samples_leaf=leaf_samples, verbose=0,
                                       n_jobs=4)
        model.fit(trtrfe, trtrtrue)
        # mean accuracy on the given test data and labels
        # print depth, '\t', leaf_samples, '\t', model.score(trtefe, trtetrue)
        predicted = model.predict(trtefe)
        print(classification_report(trtetrue, predicted))
コード例 #41
0
# Confusion matrix of the earlier predictions against the genre column
print "Confusion matrix:"
print metrics.confusion_matrix(dat_clean.genre, predicted)

#####################
# Refit clf on a positional selection of eleven columns of dat_clean, with
# the genre column as the target.
data_tree = dat_clean.iloc[:,[3,4,5,6,7,8,9,10,13,14,15]]
clf = clf.fit(data_tree, dat_clean.genre)

# Visualize tree: export to DOT text in memory, then render it to a PDF
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=list(data_tree.columns.values))
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf('dectree.pdf')


# Repeat on test set
# NOTE(review): clf was just refit on the data_tree columns; X_test must
# carry the same columns in the same order -- confirm.
y_test_pred = clf.predict(X_test)
print "Accuracy Test: {0:.3f}".format(metrics.accuracy_score(y_test, y_test_pred))
print
print "Classification report:"
print metrics.classification_report(y_test, y_test_pred)
print 
print "Confusion matrix:"
print metrics.confusion_matrix(y_test, y_test_pred)

# Measure performance: class probabilities on the training set
y_pred = clf.predict_proba(X_train)

# Repeat on test set; this rebinds y_test_pred from labels to probabilities
y_test_pred = clf.predict_proba(X_test)

# Raw array view of g_test (as_matrix is the legacy pandas spelling)
tt = g_test.as_matrix()
コード例 #42
0
ファイル: rf_digits.py プロジェクト: matthagy/sc2_timer
# Keep only the samples whose classification is not -1 (presumably the
# marker for unlabeled images -- confirm) and flatten each image to a vector.
# Single-argument prints behave identically on Python 2 and Python 3.
mask = classifications != -1
print(mask.sum())
labeled_images = images[mask]
X = labeled_images.reshape(mask.sum(), np.prod(images.shape[1::]))
print(X.shape)
Y = classifications[mask]

acc = []              # per-fold accuracy
acc_correct = []      # per-fold top-class probability of correct predictions
acc_incorrect = []    # per-fold top-class probability of wrong predictions
acc_x_incorrect = []  # per-fold [images, true labels, predicted labels] of the misses
k_fold = 8
# NOTE(review): StratifiedKFold(Y, k_fold) and compute_importances=True are
# legacy scikit-learn spellings -- confirm the pinned version accepts them.
for train_inx, valid_inx in StratifiedKFold(Y, k_fold):
    rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True, compute_importances=True)
    rf.fit(X[train_inx], Y[train_inx])
    Yp = rf.predict(X[valid_inx])
    correct = Yp == Y[valid_inx]
    # Confidence of each prediction = highest class probability. The extra
    # predict_proba call whose result was thrown away has been dropped.
    p_correct = rf.predict_proba(X[valid_inx]).max(axis=1)
    acc_correct.append(p_correct[correct])
    acc_incorrect.append(p_correct[~correct])

    score = correct.mean()
    print(score)
    acc.append(score)

    # Remember the misclassified samples of this fold for later inspection;
    # the masked image array is computed once above, not on every fold.
    wrong_inx = valid_inx[~correct]
    acc_x_incorrect.append([labeled_images[wrong_inx],
                            Y[wrong_inx],
                            Yp[~correct]])

print('score %s' % np.mean(acc))
コード例 #43
0
ファイル: RFP.py プロジェクト: DavidChaMun/TAE_FINAL
class TAERandomForestClassifier(object):
    """Random forest over mixed categorical and numeric columns.

    Categorical columns are label-encoded and then one-hot encoded, numeric
    columns are passed through unchanged, and the combined frame is fed to a
    ``RandomForestClassifier``.
    """
    # Class-level defaults. lab_encoders is replaced by a per-instance dict in
    # __init__, so fitting one instance never leaks encoders into another.
    lab_encoders = {}
    dummy_encoder = None
    rfc_model = None
    n_estimators = 100
    max_features = 7
    max_depth = 16

    def __init__(self):
        # column name -> fitted LabelEncoder, owned by this instance
        self.lab_encoders = {}

    def encode_fit(self, cat_data):
        """Fit the label and one-hot encoders on *cat_data* and encode it.

        Returns a new DataFrame of one-hot columns; *cat_data* is not modified.
        """
        #Encodes string to numeric labels
        tdc_set_encoded = cat_data.copy(deep=True)
        for cn in cat_data.columns:
            self.lab_encoders[cn] = preprocessing.LabelEncoder()
            self.lab_encoders[cn].fit(cat_data[str(cn)])
            tdc_set_encoded[str(cn)] = self.lab_encoders[cn].transform(
                cat_data[str(cn)])

        #Encodes to dummy dataset
        self.dummy_encoder = preprocessing.OneHotEncoder(categories="auto")
        self.dummy_encoder.fit(tdc_set_encoded[cat_data.columns])

        encoded_cat_data = pd.DataFrame(
            data=self.dummy_encoder.transform(tdc_set_encoded).todense(),
            columns=self.dummy_encoder.get_feature_names())
        return encoded_cat_data

    def encode(self, cat_data):
        """Encode *cat_data* with the encoders fitted by encode_fit().

        Returns a new DataFrame of one-hot columns with a fresh 0..n-1 index.
        Works on a copy, so the caller's frame is left untouched (consistent
        with encode_fit).
        """
        labelled = cat_data.copy(deep=True)
        for cn in cat_data.columns:
            labelled[str(cn)] = self.lab_encoders[cn].transform(
                cat_data[str(cn)])

        #Encodes to dummy dataset
        encoded_cat_data = pd.DataFrame(
            data=self.dummy_encoder.transform(labelled).todense(),
            columns=self.dummy_encoder.get_feature_names())
        return encoded_cat_data

    def fit(self, x_train, y_train, cat_cols, num_cols):
        """Fit the encoders and the forest.

        x_train: frame holding both the categorical and the numeric columns.
        y_train: training labels.
        cat_cols / num_cols: names of the categorical / numeric columns.
        """
        #Separates dataset in categorical and numbers
        x_train_num = x_train[num_cols].copy(deep=True)
        x_train_cat = x_train[cat_cols].copy(deep=True)

        x_train_cat = self.encode_fit(x_train_cat)

        # Both halves need the same 0..n-1 index so that concat pairs rows by
        # position instead of by the caller's index labels.
        x_train_num.reset_index(drop=True, inplace=True)
        x_train_cat.reset_index(drop=True, inplace=True)

        f_x_train = pd.concat([x_train_num, x_train_cat], axis=1)

        self.rfc_model = RandomForestClassifier(n_estimators=self.n_estimators,
                                                criterion="entropy",
                                                max_features=self.max_features,
                                                max_depth=self.max_depth)
        self.rfc_model = self.rfc_model.fit(f_x_train, y_train)

    def predict(self, x_predict, cat_cols, num_cols):
        """Return the forest's predictions for *x_predict*.

        The columns are split and encoded exactly as in fit(); fit() must
        have been called first.
        """
        #Separates dataset in categorical and numbers
        x_predict_num = x_predict[num_cols].copy(deep=True)
        x_predict_cat = x_predict[cat_cols].copy(deep=True)

        x_predict_cat = self.encode(x_predict_cat)

        # Align the indices as fit() does. Without this, a frame with a
        # non-default index (e.g. one half of a train/test split) is
        # concatenated by label and yields misaligned rows full of NaN.
        x_predict_num.reset_index(drop=True, inplace=True)
        x_predict_cat.reset_index(drop=True, inplace=True)

        f_x_predict = pd.concat([x_predict_num, x_predict_cat], axis=1)
        y_pred = self.rfc_model.predict(f_x_predict)
        return y_pred

    def cal_conf_matrix(self, x_test, y_test, catego_columns, numeric_cols):
        """Print the confusion matrix and the accuracy on a test set."""
        y_pred = self.predict(x_test, catego_columns, numeric_cols)
        # NOTE(review): the original note gave the layout as
        # [[TP, FP], [FN, TN]]; scikit-learn documents rows as true labels and
        # columns as predicted labels, in sorted label order -- confirm which
        # class is the positive one before reading the cells.
        print("Matriz de confusión:")
        print(metrics.confusion_matrix(y_test, y_pred))

        # Run several times and watch how the value varies. (Original note:
        # "based on the Jaccard index"; the call below is accuracy_score.)
        print("Precisión:", metrics.accuracy_score(y_test, y_pred))
コード例 #44
0
import autopath
from datasets import training_set, test_set
from util import convert_gray_scale, flatten


# Unpack the (inputs, labels) pairs of the training and test sets
Xr,Yr = training_set
Xe,Ye = test_set

# Convert the inputs to gray scale and flatten them
Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

# NOTE(review): compute_importances is a legacy scikit-learn argument --
# confirm the pinned version still accepts it.
rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True, compute_importances=True)
rf.fit(Xr, Yr)

# Test accuracy: fraction of predictions equal to the true label
Yp = rf.predict(Xe)
print np.mean(Yp == Ye)

# Confidence of each prediction = highest class probability
Ypp = rf.predict_proba(Xe).max(axis=1)

# Overlay the confidence histograms of the correctly classified (blue) and
# misclassified (red) test samples.
plt.figure(1)
plt.clf()
plt.hist(Ypp[Yp == Ye], 50, color='b', normed=True, alpha=0.4,
         label='classified')
plt.hist(Ypp[Yp != Ye], 50, color='r', normed=True, alpha=0.4,
         label='misclassified')
plt.legend(loc='upper left')
plt.draw()
plt.show()

plt.figure(3)