Example #1
def model_pred(trainX, trainY, testX, model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6,
                                         learning_rate=0.9,
                                         random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100, 200, 300, 400, 500]
        for param in params:
            clf = RandomForestClassifier(n_estimators=param,
                                         n_jobs=20,
                                         bootstrap=True)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        '''
        params = [1,2,3,4,5,6,7,8,9,10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param,learning_rate=0.9,random_state=0)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        pred = list(prob >= 3)
    print "the pos rate is:", float(sum(pred)) / len(pred)
    return pred
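A minimal calling sketch for model_pred, assuming the imports the snippet relies on (numpy and scikit-learn's ensemble classifiers); the data is synthetic, and the "fusion" branch majority-votes five random forests (3 of 5 votes marks a positive).

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Synthetic binary-classification data, only to exercise the function.
X, y = make_classification(n_samples=300, n_features=10, random_state=0)
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.25, random_state=0)

pred = model_pred(trainX, trainY, testX, "fusion")

Example #3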
class LexicaseForestClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, initial_forrest_factor=5, n_estimators=10, **kwargs):
        self._initial_forrest_size = n_estimators * initial_forrest_factor
        self._final_forrest_size = n_estimators

        rf_fit_args = copy(kwargs)
        rf_fit_args.update({'n_estimators': self._initial_forrest_size})
        self._rf = RandomForestClassifier(**rf_fit_args)

    def fit(self, X, y):
        self._rf.fit(X, y)

        for t in self._rf.estimators_:
            tree_y_pred = t.predict(X)
            t._error_vector = squared_error_vector(y, tree_y_pred)

        final_estimators = []
        for i in range(self._final_forrest_size):
            final_estimators.append(epsilon_lexicase_selection(self._rf.estimators_))

        self._rf.estimators_ = final_estimators
        self._rf.n_estimators = self._final_forrest_size
        # TODO: Set other self._rf parameters to match correct size so that predict works.

    def predict(self, X, y=None):
        return self._rf.predict(X)
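squared_error_vector and epsilon_lexicase_selection are project helpers not shown here. A hedged usage sketch with placeholder stand-ins (the random pick below is not real lexicase selection, which filters trees case-by-case on their error vectors):

import numpy as np
from copy import copy
from random import choice
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Hypothetical stand-ins, only so the class above can run:
def squared_error_vector(y_true, y_pred):
    return (np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)) ** 2

def epsilon_lexicase_selection(estimators):
    return choice(estimators)  # placeholder: random pick instead of lexicase filtering

X, y = load_iris(return_X_y=True)
clf = LexicaseForestClassifier(n_estimators=10)
clf.fit(X, y)
print(clf.predict(X)[:5])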
Example #4
def stkFoldCrossValidation():

    X = pickle.load(open('X.p', 'rb'))

    X = np.array(X)

    Y = pickle.load(open('Y.p', 'rb'))

    Y = np.array(Y)

    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(X, Y)

    k = 1
    for train_index, test_index in skf.split(X, Y):

        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        print(k)
        k += 1

        rf = RandomForestClassifier()

        rf.fit(X_train, Y_train)

        yp = rf.predict(X_test)
        print(classification_report(Y_test, yp, digits=6))
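The function expects pickled feature and label lists named X.p and Y.p in the working directory. A sketch that writes compatible files first, with the imports the snippet assumes:

import pickle
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

# Synthetic data, pickled in the format the function loads.
X, y = make_classification(n_samples=500, random_state=0)
pickle.dump(X.tolist(), open('X.p', 'wb'))
pickle.dump(y.tolist(), open('Y.p', 'wb'))

stkFoldCrossValidation()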
Example #5
class RandomForestClassifierImpl():

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, oob_score=False,
                 n_jobs=None, random_state=None, verbose=0, warm_start=False,
                 class_weight='balanced'):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start,
            'class_weight': class_weight}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
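SKLModel is presumably an alias for scikit-learn's RandomForestClassifier; note that min_impurity_split and max_features='auto' only exist in older scikit-learn releases (pre-1.0 and pre-1.3 respectively), so this wrapper targets an older API. A hedged usage sketch under that assumption:

# Assumed alias used by the wrapper above (matches older scikit-learn, ~0.2x):
from sklearn.ensemble import RandomForestClassifier as SKLModel
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
impl = RandomForestClassifierImpl(n_estimators=50, random_state=0)
impl.fit(X, y)
print(impl.predict_proba(X[:3]))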
Example #6
def forest(X, y, model_path):
    model = RandomForestClassifier()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
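A hedged round-trip sketch for forest, with the imports it assumes (standalone joblib here, though older code imported it from sklearn.externals). Note the function evaluates on its own training data, so the printed report is optimistic.

import joblib
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
forest(X, y, 'rf_model.joblib')          # train, report, persist
loaded = joblib.load('rf_model.joblib')  # reload and reuse the model
print(loaded.predict(X[:5]))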
Example #7
def calcRandomForestClassifier(channels_training, channels_testing,
                               target_training, target_testing):
    clf = RandomForestClassifier(n_estimators=500,
                                 max_features=int(
                                     sqrt(len(channels_training[0]))))
    clf = clf.fit(channels_training, target_training)
    predictions = clf.predict(channels_testing)
    comp = [predictions, target_testing, channels_testing]
    return clf, comp
Example #8
def calc_score(test, train):
    test_f, test_l = split_data_label(test)
    train_f, train_l = split_data_label(train)
    # Train the model and compute the accuracy

    clf = RandomForestClassifier()
    clf.fit(train_f, train_l)
    pre = clf.predict(test_f)
    return metrics.accuracy_score(test_l, pre)
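split_data_label is project code not shown in this example. A hedged stand-in (assuming the last column is the label) so calc_score can run:

import numpy as np
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Hypothetical stand-in: last column is the label, the rest are features.
def split_data_label(rows):
    rows = np.asarray(rows)
    return rows[:, :-1], rows[:, -1]

X, y = make_classification(n_samples=200, random_state=0)
data = np.column_stack([X, y])
print(calc_score(data[:50], data[50:]))  # note the (test, train) argument order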
Example #9
def drawfeature(train_data_path,train_file_name,test_data_path,test_file_name):
    train_file = os.path.join(train_data_path,train_file_name)
    train_data = pd.read_csv(train_file)
    n_data_train = train_data['text'].size
    print('n_data_train is %s' % n_data_train)
    print(type(n_data_train))
    
    test_file = os.path.join(test_data_path,test_file_name)
    test_data = pd.read_csv(test_file)
    n_data_test = test_data['text'].size
    print('n_data_test is %s' % n_data_test)
    print(type(n_data_test))
    
    vectorizer = CountVectorizer(analyzer='word',tokenizer = None,
        preprocessor = None, stop_words=None, max_features = 5000)
    transformer = TfidfTransformer()
    
    train_data_words = []
    
    print('start with words in train data set')
    for i in range(n_data_train):  # xrange in the original Python 2 code
        if (i + 1) % 1000 == 0:
            print('Drawfeatures line %d of %d' % (i + 1, n_data_train))
        train_data_words.append(words_to_features(train_data['text'][i]))
    print('start bag of words in train data....')
    train_data_features = vectorizer.fit_transform(train_data_words)
    train_data_features = train_data_features.toarray()
    print('start tfidf in train data....')
    train_data_features = transformer.fit_transform(train_data_features)
    train_data_features = train_data_features.toarray()
    #test-data processing
    test_data_words = []
    for i in range(n_data_test):
        if (i + 1) % 1000 == 0:
            print('Drawfeatures line %d of %d' % (i + 1, n_data_test))
        test_data_words.append(words_to_features(test_data['text'][i]))
    
    # Use transform (not fit_transform) so the test set reuses the training
    # vocabulary, then apply the same tf-idf weighting used for training.
    test_data_features = vectorizer.transform(test_data_words)
    test_data_features = test_data_features.toarray()
    test_data_features = transformer.transform(test_data_features)
    test_data_features = test_data_features.toarray()

    print('random forest go...')
    forest = RandomForestClassifier(n_estimators = 13)
    forest = forest.fit(train_data_features,train_data['label'])
    pred = forest.predict(test_data_features)
    pred = pd.Series(pred,name='Target')
    pred.to_csv('SENTI_RF.CSV',index=None, header = None)

    
    print('naive bayes go...')
    mnb = MultinomialNB(alpha=0.01)
    mnb = mnb.fit(train_data_features, train_data['label'])
    pred = mnb.predict(test_data_features)
    pred = pd.Series(pred, name='Target')
    pred.to_csv('SENTI_MNB', index=None, header=True)
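Example #10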
 def RandomForestClassifer(self):
     '''
     Function to run the RandomForest classifier.
     '''
     train_Array = self.titanic_train_frame.values
     self.test_Array = self.titanic_test_frame.values
     randomForest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
     randomForest.fit(train_Array[0::, 1::], train_Array[0::, 0])
     # Note: predict() returns class labels; predict_proba() would return probabilities.
     self.predicted_probability = randomForest.predict(self.test_Array[0::, 0::])
     self.predicted_probability_list = self.predicted_probability.tolist()
Example #11
class RFClassifier(super.abstract_classifier):

    def __init__(self, train_features, train_labels, num_of_trees):
        self.train_features = train_features
        self.train_labels = train_labels
        self.rf_member = RandomForestClassifier(num_of_trees)

    def train(self):
        self.rf_member.fit(self.train_features, self.train_labels)

    def classify(self, newVector):
        return self.rf_member.predict(newVector)
Example #12
def build_and_test_model(classifier, X, Y, Z, param):

    accuracies = []
    ari = []

    for train, test in LeaveOneOut().split(X):

        X_train, Y_train = X[train], Y[train]
        X_test, Y_test, Z_test = X[test], Y[test], Z[test]
        predicted = None

        if classifier == "KNN":
            neigh = KNeighborsClassifier(n_neighbors=param).fit(
                X_train, Y_train)
            predicted = neigh.predict(X_test)

        elif classifier == "RF":
            clf = RandomForestClassifier(n_estimators=param,
                                         random_state=0)  # ,max_depth=2,
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test)

        elif classifier == "SVM":
            clf = svm.SVC(gamma='scale')
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "NAIVE":
            clf = GaussianNB()
            clf.fit(X_train, Y_train)
            predicted = clf.predict(X_test).astype(int)

        elif classifier == "RANDOM":
            options = list(set(Y_train))
            predicted = [random.choice(options) for _ in range(len(Y_test))]

        accuracies.append(metrics.accuracy_score(Y_test, predicted))
        ari.append(metrics.adjusted_rand_score(Z_test, predicted))

    return np.mean(accuracies), np.std(accuracies), np.mean(ari), np.std(ari)
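A hedged calling sketch for build_and_test_model, assuming the imports the snippet relies on. Z is a second labeling scored with the adjusted Rand index; here it is fabricated as a copy of the ground truth.

import random
import numpy as np
from sklearn import svm, metrics
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=40, random_state=0)
Z = y.copy()  # second labeling; here simply the ground truth again

acc_mean, acc_std, ari_mean, ari_std = build_and_test_model("RF", X, y, Z, param=50)
print(acc_mean, ari_mean)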
Example #13
def evalOne(enabledColumns):
    features = [all_features[i] for i in range(0, len(all_features)) if enabledColumns[i]]
    Y = []
    P = []
    for group in range(0,5):
    #     print("Test group " + str(group + 1))
        trainStationList = []
        testStationList = []
        for i in range(0,5):
            if i == group:
                testStationList.extend(groups[i])
            else:
                trainStationList.extend(groups[i])
        trainStations = set(float(station) for station in trainStationList)
        # reorder train stations
    #     print("\ttrainStationList:" + str(trainStationList))
        trainStationList = [s for s in all_stations if float(s) in trainStations]
    #     print("\ttrainStationList:" + str(trainStationList))
        testStations = set(float(station) for station in testStationList)
    #     print("\ttestStationList:" + str(testStationList))
        trainX, testX, trainY, testY, trainLocation, testLocation = splitDataForXValidationWithLocation(trainStations, testStations, "location", data, features, "target")
     
        train_lower = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i < (len(trainStationList) / 2.0)]
#         train_upper = [float(trainStationList[i]) for i in range(0, len(trainStationList)) if i >= (len(trainStationList) / 2.0)]
         
        test_lower = [float(testStationList[i]) for i in range(0, len(testStationList)) if i < (len(testStationList) / 2.0)]
#         test_upper = [float(testStationList[i]) for i in range(0, len(testStationList)) if i >= (len(testStationList) / 2.0)]
         
        trainY = []
        for l in trainLocation:
            if l in train_lower:
                trainY.append(0)
            else:
                trainY.append(1)
         
        testY = []
        for l in testLocation:
            if l in test_lower:
                testY.append(0)
            else:
                testY.append(1)
         
        model = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=9, n_jobs=-1)
        model.fit(trainX, trainY)
        predY = model.predict(testX)
         
        Y.extend(testY)
        P.extend(predY)
     
    f1 = f1_score(Y, P)
    accuracy = accuracy_score(Y, P)
    return f1, accuracy
Example #14
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier

    """
    def __init__(self,
                 categorical_features=None,
                 n_estimators=50,
                 n_jobs=-1,
                 max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given dataset using
        the response variable y. It is a measure of the accuracy of the trained model. Usually
        the original dataset should be split into training and testing subsets to cross validate
        the model.
        '''
        return self.model.score(X, y)
Example #15
def classic_model(image_dir, image_lists, method):

    X, y = get_X_y(image_dir, image_lists, ['training', 'validation'], method)
    classifier = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    classifier.fit(X, y)

    X_test, y_test = get_X_y(image_dir, image_lists, ['testing'], method)
    predictions = classifier.predict(X_test)
    confusion = pandas.crosstab(y_test,
                                predictions,
                                rownames=['Actual Class'],
                                colnames=['Predicted Class'])
    print(confusion)
    return accuracy_score(y_test, predictions)
Example #16
def just_pred(x, y):
    xlen = len(x)
    idx = np.random.permutation(xlen)  # shuffle row order before splitting
    trainpct = 0.7
    trainlen = int(trainpct * xlen)
    xtrain = x.iloc[idx[:trainlen], :]
    ytrain = y.iloc[idx[:trainlen]]
    xtest = x.iloc[idx[trainlen:], :]
    ytest = y.iloc[idx[trainlen:]]
    rf = RandomForestClassifier()
    rf.fit(xtrain, ytrain)
    ypred = rf.predict(xtest)
    return ytest, ypred
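Example #18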
def random_forest(profile, group, n_tree, search_number, avg_acc):
    '''
    Build a random-forest model from the abundance table.
    :param profile: abundance table
    :param group: grouping table
    :param n_tree: number of trees in the model
    :param search_number: number of random seeds to try
    :param avg_acc: output file for the per-seed mean accuracy
    :return: group with a label column added
    '''
    real_label = set(group.iloc[:, 0])
    label_dict = {}
    for i, j in enumerate(real_label):
        label_dict[j] = i
    label = []
    for sample in group.index:
        label.append(label_dict[group.loc[sample].values[0]])

    group['label'] = label

    n = 0
    with open(avg_acc, 'w') as f:
        f.write('random_state\tavgAcc\n')
        while n < search_number:
            print('Iteration {0} of the seed search'.format(n + 1))
            # random random_state
            random_state = round(random() * 10000)

            rf = RandomForestClassifier(n_estimators=n_tree, max_leaf_nodes=3,
                                        random_state=random_state)

            acc = []
            for i in range(10):
                sample_train = list(profile.sample(n=30).index)
                sample_val = list(set(profile.index).difference(sample_train))
                train = profile.loc[sample_train]
                val = profile.loc[sample_val]
                label_train = group['label'].loc[sample_train]

                rf.fit(train, label_train)
                pre = rf.predict(val)

                acc.append(metrics.accuracy_score(y_true=group['label'][sample_val], y_pred=pre))

            # print('{0}\t{1}\n'.format(random_state, sum(acc) / 10))
            f.write('{0}\t{1}\n'.format(random_state, sum(acc) / 10))
            n += 1
    return group
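A hedged toy run of random_forest, assuming random comes from Python's random module and the usual pandas/numpy/sklearn imports; the abundance table and grouping below are fabricated (the function samples 30 rows per round, so the table needs more than 30 samples).

import numpy as np
import pandas as pd
from random import random
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

np.random.seed(0)
samples = ['s%d' % i for i in range(40)]
profile = pd.DataFrame(np.random.rand(40, 5), index=samples,
                       columns=['taxon_%d' % i for i in range(5)])
group = pd.DataFrame({'group': ['A'] * 20 + ['B'] * 20}, index=samples)

random_forest(profile, group, n_tree=100, search_number=3, avg_acc='avg_acc.tsv')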
Example #19
def decision_frist():

    data = datasets.load_iris()
    x = data["data"]
    y = data["target"]

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    des = DecisionTreeClassifier(max_leaf_nodes=3)
    des.fit(X_train, y_train)
    print(des.predict(X_test))
    print(des.score(X_test, y_test))

    rom = RandomForestClassifier()
    rom.fit(X_train, y_train)
    print(rom.predict(X_test))
    print(rom.score(X_test, y_test))
Example #20
def crossval(x, y, k=5):
    for _ in range(k):
        idx = np.random.permutation(len(x))  # reshuffle the rows on every fold
        trainpct = 0.7
        trainlen = int(trainpct * len(x))
        xtrain = x.iloc[idx[:trainlen], :]
        ytrain = y.iloc[idx[:trainlen]]
        xtest = x.iloc[idx[trainlen:], :]
        ytest = y.iloc[idx[trainlen:]]
        rf = RandomForestClassifier()
        rf.fit(xtrain, ytrain)
        ypred = rf.predict(xtest)
        print(ypred)
Example #22
def predict2(text):

    # read the dataset
    dataset_file = os.path.join(BASE_DIR, 'dataset', 'complain.json')

    with open(dataset_file) as data:
        data = json.load(data)

    # Get the number of reviews based on the dataframe column size
    num_complain = len(data)

    # Initialize an empty list to hold the clean complain
    clean_train_complain = []
    target_problem_type = []
    for complain in data:
        clean_train_complain.append(
            clean_data('%s %s' % (complain['title'], complain['complain'])))
        target_problem_type.append(complain['category'])

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=500)

    train_data_feature = vectorizer.fit_transform(
        clean_train_complain).toarray()

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    # Fit the forest to the training set, using the bag of words as
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit(train_data_feature, target_problem_type)

    clean_test_complain = []

    clean_test_complain.append(clean_data(text))

    test_data_features = vectorizer.transform(clean_test_complain)
    test_data_features = test_data_features.toarray()

    result = forest.predict(test_data_features)

    return result
Example #23
def forestPredict(columName, features, trees):
    
    pd.options.mode.chained_assignment = None
    
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features2.csv')
    df['pred'] = ""
    #df = df.set_index([df.m_championship_id,df.m_match_group_num])
    df = df.set_index([df.m_match_id])
    
    for champId in range(1,91):
        
        champ = df[(df.m_championship_id == champId)]       
        print(champId)
        
        if (champId < 11 or champId > 20):
            
            if (len(champ) == 380):
                rd = 38
            elif (len(champ) == 306):
                rd = 34
            else:
                rd = 30
                 
            for mid in range(2,rd+1):
                
                train = champ[champ.m_match_group_num < mid]
                test = champ[champ.m_match_group_num == mid]
                  
                target = 'm_column_result'
                  
                X = train[features]
                y = train[target]                  
                Z = test[features]
                  
                clf = RandomForestClassifier(n_estimators=trees, max_features=None)
                clf.fit(X, y)
                          
                pred = clf.predict(Z)
                  
                for i, p in zip(Z.index, pred):
                    # DataFrame.set_value was removed from pandas; .at replaces it
                    df.at[i, 'pred'] = p
             
    nameFile = 'pred_' + columName + ".csv"
    df.to_csv(path.NOTEBOOKS_DATA + nameFile, index=False)
Example #24
def predict(text, dataset_file_path):

    dataset_file = dataset_file_path

    data_file = open(dataset_file, 'r')

    reader = csv.reader(data_file, delimiter=';', quoting=csv.QUOTE_NONE)

    clean_train_data = []
    target_data = []

    for line in reader:
        clean_train_data.append(clean_data(line[0]))
        target_data.append(line[1])

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=500)

    train_data_feature = vectorizer.fit_transform(clean_train_data).toarray()

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=100)

    # Fit the forest to the training set, using the bag of words as
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit(train_data_feature, target_data)

    clean_test_complain = []

    clean_test_complain.append(clean_data(text))

    test_data_features = vectorizer.transform(clean_test_complain)
    test_data_features = test_data_features.toarray()

    result = forest.predict(test_data_features)

    return result
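Example #25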
 def plot_rf(self):
     n = self.bestScoreN
     A0 = [row[0] for row in self.dataset if row[2] == 0]
     A1 = [row[0] for row in self.dataset if row[2] == 1]
     B0 = [row[1] for row in self.dataset if row[2] == 0]
     B1 = [row[1] for row in self.dataset if row[2] == 1]
     Xplot, Yplot = np.meshgrid(np.arange(-0.2, 4.4, 0.2),
                                np.arange(-0.2, 4.4, 0.2))
     clf = RandomForestClassifier(n_estimators = n).fit(self.X,self.Y)
     predicted = clf.predict(np.c_[Xplot.ravel(), Yplot.ravel()])
     predicted = predicted.reshape(Xplot.shape)             
     plot0 = plt.scatter(A0,B0, marker='+', color = 'red')
     plot1 = plt.scatter(A1,B1, marker = 'o', color = 'green')
     plt.legend((plot0, plot1), ('label 0', 'label 1'), scatterpoints = 1)
     plt.xlabel('A')
     plt.ylabel('B')
     plt.title("RF Classifier")
     plt.contourf(Xplot, Yplot, predicted, alpha=0.5)
     plt.show()
Example #26
def forestPredict4():
    
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features3.csv')
    
    df.index = df.m_match_id
    df['rf1000_fs4'] = ""
    pd.options.mode.chained_assignment = None

    
    for t1 in range(11,91,10):
        
        print(t1)
        
        champ = df[(df.m_championship_id < t1) & (df.m_championship_id >= t1-10)].sort_values(['m_match_date'])        
        
        for t2 in range(10,len(champ),10):
              
            train = champ[0:t2]
            test = champ[t2:t2+10]
              
            features = ['m_odd_home','m_odd_away','m_odd_underdog',
                        'm_odd_favorite','m_odd_draw','m_odd_medium',
                        'a_goals_for_mean','h_goals_for_mean']
              
            target = 'm_column_result'
              
            X = train[features]
            y = train[target]             
            Z = test[features]
              
            clf = RandomForestClassifier(n_estimators=1000)
            clf.fit(X,y)
                      
            pred = clf.predict(Z)
              
            for t3, p in zip(Z.index, pred):
                df.at[t3, 'rf1000_fs4'] = p  # set_value was removed from pandas

    df.to_csv(path.NOTEBOOKS_DATA + 'features3.csv', index=False)
Example #27
def RandomForestIndependent():

    X = pickle.load(open('X.p', 'rb'))
    Y = pickle.load(open('Y.p', 'rb'))

    print('****  *****')

    rf = RandomForestClassifier(n_estimators=10)
    # test_size=0.0 keeps everything in the training split (newer scikit-learn
    # versions reject 0.0; use a small positive value there).
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.0,
                                                        random_state=3)

    rf.fit(X_train, Y_train)
    yp = rf.predict(X_train)
    print('**** Training *****')
    print(classification_report(Y_train, yp))
    '''yp = rf.predict(X_test)
    print('**** Testing *****')
    print(classification_report(Y_test, yp,digits=6))'''

    pickle.dump(rf, open('rf.p', 'wb'))
Example #28
class Model(BaseModel):
    '''
    classdocs
    '''

    def __init__(self, path):
        '''
        Constructor
        '''
        self.path = path
        self.model = RandomForestClassifier(n_estimators=150,n_jobs=8)
        self.model_name = 'rf'

    def fit(self, X, y):
        self.model.fit(X,y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        return self.model.predict(X)

    def save(self, filepath):
        '''
        Persists the trained model to a file.
        '''
        joblib.dump(self.model, create_filename(filepath,'%s.pkl' % self.model_name)) 

    def load(self, filepath):
        '''
        Loads an already train model from a file to perform predictions.
        '''
        self.model = joblib.load(create_filename(filepath,'%s.pkl' % self.model_name))

    def score(self, X, y):
        '''
        Scores the model on the given dataset and target vector.
        '''
        return self.model.score(X,y)
Example #29
def forestPredict7030(columName, features, trees):
    
    pd.options.mode.chained_assignment = None
    
    df = pd.read_csv(path.NOTEBOOKS_DATA + 'features2.csv')
    df['pred'] = ""
    #df = df.set_index([df.m_championship_id,df.m_match_group_num])
    df = df.set_index([df.m_match_id])
    
    for champId in range(11,91,10):
        
        champ = df[(df.m_championship_id < champId) & (df.m_championship_id >= champId-10)]       
        
        print(champId)
        
        if champId != 21:
            train = champ[df.m_championship_id <= champId - 4]
            test = champ[df.m_championship_id > champId - 4]

            target = 'm_column_result'

            X = train[features]
            y = train[target]
            Z = test[features]

            clf = RandomForestClassifier(n_estimators=trees, max_features=None)
            clf.fit(X, y)

            pred = clf.predict(Z)

            for i, p in zip(Z.index, pred):
                # set_value was removed from pandas; .at replaces it
                df.at[i, 'pred'] = p

    nameFile = 'pred_' + columName + ".csv"
    df.to_csv(path.NOTEBOOKS_DATA + nameFile, index=False)
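Example #30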
def main():
    header = ["id"]
    feats = []
    df = pd.read_csv("cora.content", sep="\t")
    for i in range(df.shape[1] - 2):
        feat = "feat_" + str(i)
        header.append(feat)
        feats.append(feat)
    header.append("class")

    feats = np.array(feats)

    df.columns = header

    x_train, x_test, y_train, y_test = train_test_split(
        df[feats], df["class"], test_size=0.3
    )

    clf = RandomForestClassifier(n_estimators=200)
    clf.fit(x_train, y_train)

    importances = clf.feature_importances_
    sorted_idx = np.argsort(importances)

    x = list(zip(feats[sorted_idx], importances[sorted_idx]))
    x_sorted = sorted(x, key=lambda x: -x[1])

    # Statistics ("score" is assumed to be sklearn's precision_recall_fscore_support)
    y_pred = clf.predict(x_test)
    precision, recall, fscore, _ = score(y_test, y_pred, average="macro")

    print("Precision:", round(precision, 3))
    print("Recall:   ", round(recall, 3))
    print("F-Score:  ", round(fscore, 3))
    print("Accuracy: ", round((y_pred == y_test).sum() / len(y_pred), 3))

    selected_feats = [key for key, val in x_sorted[:20]]
    print(selected_feats)
Example #31
class StackingFusion(FusionStrategy):
    '''
    The StackingFusion learns a fusion strategy from training data.
    A classifier is trained that uses the posterior probabilities from all
    microphones in the sensor network as input features.
    '''
    def __init__(self, channel_sort=ChannelSortNone()):
        '''
        Constructor
        @param channel_sort: An object of type ChannelSortStrategy. 
        '''
        self.stacked_classifier = None
        self.channel_sort = channel_sort

    def train(self, log_probs, labels):
        '''
        Train the stacked classifier
        @param log_probs: list of probability matrices (channels, label)
        @param labels: label for each feature-vector
        '''
        print('Train stacked classifier with %d windows' % labels.shape[0])
        log_probs = [self.channel_sort.sort(f) for f in log_probs]
        log_probs = np.vstack(log_probs)

        # TODO: classifier as Parameter
        self.stacked_classifier = RandomForestClassifier(n_estimators=10)
        self.stacked_classifier.fit(log_probs, labels)

    def apply(self, log_probs):
        '''
        Apply fusion strategy to classifier probabilities
        @param log_probs: log probabilities for each channel and class in shape (channel, class)
        @return: Class index for the predicted class
        '''
        log_probs = self.channel_sort.sort(log_probs)
        # return the classindex as a scalar not as an array
        return self.stacked_classifier.predict(log_probs)[0]
Example #32
for train, test in kf:
    y_train = []
    x_train = []
    for i in train:
        y_train.append(features[i][6])
        x_train.append(list(features[i][:6]))  # the first six fields are the features
        
    y_test = []
    x_test = []  
    for i in test:
        y_test.append(features[i][6])
        x_test.append(list(features[i][:6]))
       
    rf.fit(x_train, y_train)
    rfPredTest = rf.predict(x_test)
    rfPrecisionTest = precision_score(y_test, rfPredTest)
    rfRecallTest = recall_score(y_test, rfPredTest)
    rfF1Test = f1_score(y_test, rfPredTest)
    rfAvgPrecision += rfPrecisionTest
    rfAvgRecall += rfRecallTest
    rfAvgF1 += rfF1Test

print "RF completed in ", time.time() - start, " s"
print "rf:\n Precision {}\n Recall {}\n F1 {}\n".format(rfAvgPrecision / 5, rfAvgRecall / 5, rfAvgF1 / 5)

 


Example #33
# training, testing and evaluation
print('Starting the k-Fold...')
for train_index, test_index in k_fold.split(tf_idf):
    x_train, x_test = tf_idf[train_index], tf_idf[test_index]
    y_train, y_test = classes[train_index], classes[test_index]

    # train the model
    print(f'Generating Model {i}...')
    classifier = RandomForestClassifier(n_estimators=10,
                                        criterion='gini',
                                        random_state=iteracao).fit(
                                            x_train, y_train)

    # classify the test set
    y_pred = classifier.predict(x_test)

    # performance metrics
    aux_accuracy += accuracy_score(y_test, y_pred)
    aux_f1_score += f1_score(y_test, y_pred)
    aux_precision += precision_score(y_test, y_pred)
    aux_recall += recall_score(y_test, y_pred)
    conf_matrices += np.asarray(confusion_matrix(y_test, y_pred))

    print(f'Model {i} finished and evaluated.')
    i += 1

# results
print(f'\nITERATION #{iteracao} -----------------------')
print(f'Accuracy = {aux_accuracy / k_fold.n_splits}')
print(f'F1 Score = {aux_f1_score / k_fold.n_splits}')
Example #34
x_train, x_test, y_train, y_test = train_test_split(authorship[features], authorship.Author_num.values, test_size=0.4, random_state=123)


# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20, max_depth=10, verbose=1)
etclf.fit(x_train, y_train)

# Print Confusion Matrix
metrics.confusion_matrix(etclf.predict(x_test), y_test)


from sklearn.ensemble import RandomForestClassifier  # sklearn.ensemble.forest is a removed private module path

rdclf = RandomForestClassifier(n_estimators=20, max_depth=10)
rdclf.fit(x_train, y_train)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)


from sklearn.ensemble import AdaBoostClassifier  # sklearn.ensemble.weight_boosting is a removed private module path

adaclf = AdaBoostClassifier(n_estimators=20)
adaclf.fit(x_train, y_train)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)




metrics.confusion_matrix(etclf.predict(x_test), y_test)
metrics.confusion_matrix(rdclf.predict(x_test), y_test)
metrics.confusion_matrix(adaclf.predict(x_test), y_test)
Example #35
  y_predict = m.predict(X_test)
  fpr, tpr, thresh = roc_curve(y_test, y_predict, pos_label=1)
  auc = roc_auc_score(y_test, y_predict)

  print('AUC: ', auc)
  print('Percentage of players that will have TJ in 2014: ', np.mean(y_predict))
  return fpr, tpr, auc


rf_fpr, rf_tpr, rf_auc = evaluate_model(RandomForestClassifier)
svc_fpr, svc_tpr, svc_auc = evaluate_model(SVC)


RFC2 = RandomForestClassifier(n_estimators = 10)
RFC2.fit(X, y)
predict_players['predictions']=RFC2.predict(predict_players[X_cols])
predict_players.to_csv('testing.csv')

print('Players that RF thinks will have TJ in 2014', predict_players['m1_name'][predict_players['predictions'] == 1])

the_doomed = predict_players['m1_name'][predict_players['predictions']==1]
injuries2014 = pd.read_csv('.\\intermediate data\\injuries2014.csv')

for each_doomed_person in the_doomed.values:
  if each_doomed_person in injuries2014.values:
    print(each_doomed_person, 'has in fact undergone TJ in 2014!')
  else:
    print(each_doomed_person, "did not end up having TJ in 2014...")


for each_injured_person in injuries2014[injuries2014.columns[1]].values:
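Example #36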
#download the file
raw_data = urllib.request.urlopen(url)  # Python 3: urllib.request replaces the old urllib.urlopen

#get data, add column names and index
feature_names=["times pregnant", "plasma glucose conc.", "distolic blood pressure (mm Hg)", "triceps skin fold thickness (mm)", "2-hour serum insulin (mu U/ml)", "body mass index (kg/m^2)", "diabetes pedigree function", "age (years)", "target"]
dataset = pd.read_csv(raw_data, index_col=0)  # DataFrame.from_csv was removed from pandas
dataset=dataset.reset_index()
dataset.columns=feature_names

#split into train and test set
train, test=train_test_split(dataset, test_size=0.3)

#normalize data
df_scaled_train=pd.DataFrame(preprocessing.scale(train), columns=feature_names)
df_scaled_test=pd.DataFrame(preprocessing.scale(test), columns=feature_names)

model = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=10,
                               max_features="sqrt",  # "auto" (removed in sklearn 1.3) meant "sqrt" for classifiers
                               min_samples_leaf=20)

#train model
#if getting this error, it is because a matrix with 1 column
#is being passed in when a 1d array is expected. ravel() will work.
#DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). if name == 'main':
#To resolve this error, convert label values to int or str as float is not a valid label-type
#raise ValueError("Unknown label type: %r" % y) ValueError: Unknown label type: array
model.fit(df_scaled_train.loc[:, 'times pregnant':'age (years)'], np.asarray(df_scaled_train.loc[:, 'target'].astype(int)))
print("Accuracy:", model.score(df_scaled_test.loc[:, 'times pregnant':'age (years)'], np.asarray(df_scaled_test.loc[:, 'target'].astype(int))))

#predict output
predicted = model.predict(df_scaled_test.loc[:, 'times pregnant':'age (years)'])
print(predicted)
Example #37
from sklearn.ensemble import RandomForestClassifier  # the old sklearn.ensemble.forest path was removed

def read(fname):
    labels, data = [], []
    with open(fname) as f:
        for s in f:
            ss = s.split()
            labels.append(int(ss[-1]))
            data.append(list(map(float, ss[:-2])))  # map() must be materialized in Python 3
    return labels, data

trainset = read('./trainset')
testset = read('./testset')

clf = RandomForestClassifier(n_estimators=10)
clf.fit(trainset[1], trainset[0])
print(clf.predict(testset[1]))
print(testset[0])
Example #38
                       columns=X_train.columns)

#:# model

params = {'max_depth': 3, 'n_estimators': 75}

classifier = RandomForestClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# 5475503c9e4b64dc0dcc4960399cf72c
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
print(f'precision: {precision_score(y_test, y_pred)}')
print(f'recall: {recall_score(y_test, y_pred)}')
print(f'specificity: {tn/(tn+fp)}')
print(f'f1: {f1_score(y_test, y_pred)}')

#:# session info

# Add the Python version to the session info
Example #39
	def runns(resp_var, size_of_test_data,dataset,positive_class,predictor_var, n_estimators,important_features,dealing_with_nulls):
		dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
		#----DATA PREPROCESSING
		#-------dealing with NULL values in the data
		#----------remove the rows in which the response is null

		dataset=dataset.dropna(subset=[resp_var])
		#----------dealing with nulls
		dataset=deal_with_nulls(dealing_with_nulls,dataset)
		#----FEATURE SELECTION
		#-------get predictors important in predicting the response
		#-----------transform categorical predictors to dummy variables
		predictors=dataset[predictor_var]
		predictors=pd.get_dummies(predictors)
		#-----------balance the classes in the response var
		ros = RandomOverSampler(random_state=0)
		resp=dataset[resp_var]
		prds, resp = ros.fit_resample(predictors, resp)  # fit_sample was renamed fit_resample in imbalanced-learn
		#-----------fit the random forest classifier to give us the important predictors
		rf_clf = RandomForestClassifier(n_estimators=n_estimators)
		rf_clf.fit(prds,resp)
		#-------get the important predictors
		feature_imp = pd.Series(rf_clf.feature_importances_,
						index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
		#-------names of the important predictors
		important_predictor_names = feature_imp.index[0:important_features]
		#-------subset the data to get only the important predictors and the response
		resp=pd.DataFrame(data=resp,columns=[resp_var])
		predictors=pd.DataFrame(prds,columns=list(predictors))
		dataset=pd.concat([resp,predictors],axis=1)
		#---------------------------------------------------------
		#----MODEL TRAINING
		#--------Remove the response variables from the features variables - axis 1 refers to the columns
		m_data= dataset.drop(resp_var, axis = 1,inplace=False) 
		# Response variables are the values we want to predict
		resp_var = np.array(dataset[resp_var])

		dataset = pd.get_dummies(m_data)
		
		# Saving feature names for later use
		feature_list = list(m_data.columns)
		# Convert to numpy array
		dataset = np.array(dataset)

		# Split the data into training and testing sets
		train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = float(size_of_test_data), random_state = 402)

		# Instantiate model with n_estimators decision trees
		clf = RandomForestClassifier(n_jobs = 1,n_estimators = n_estimators, random_state = 142)

		# Train the model on training data
		clf.fit(train_features, train_labels)
		# evaluation
		predicted = clf.predict(test_features)
		pred_prob = clf.predict_proba(test_features)
		
		accuracy = accuracy_score(test_labels, predicted)
		#confusion matrix
		cnf = (confusion_matrix(test_labels,predicted))
		#precision score
		precision = precision_score(test_labels,predicted,pos_label=positive_class)
		#avg pres
		avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
		#recall score
		rec = recall_score(test_labels,predicted,pos_label=positive_class)
		#f1 score
		fscore = f1_score(test_labels,predicted,pos_label=positive_class)
		#fbeta score
		fbeta = fbeta_score(test_labels,predicted,beta=0.5)
		#hamming_loss
		hamming = hamming_loss(test_labels,predicted)
		#jaccard similarity score (jaccard_similarity_score was renamed jaccard_score in scikit-learn)
		jaccard = jaccard_score(test_labels,predicted,pos_label=positive_class)
		#logloss
		logloss = log_loss(test_labels,predicted)
		#zero-oneloss
		zero_one = zero_one_loss(test_labels,predicted)
		#auc roc 
		area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
		#cohen_score
		cohen = cohen_kappa_score(test_labels,predicted)
		#matthews corr
		mathews = matthews_corrcoef(test_labels,predicted)
		# Variable importances from the important features selection stage
		variable_importance_list = list(zip(prds, feature_imp))
		output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
		output=json.dumps(output)
		return output
Example #40
from sklearn.ensemble import RandomForestClassifier  # the old sklearn.ensemble.forest path was removed
from sklearn.metrics import classification_report  # the old sklearn.metrics.classification path was removed
import pandas as pd
__author__ = 'semyon'


print("reading")
csv = pd.read_csv("data/train.csv")

print("slicing")
train_features = csv.loc[:, 'x23':'x61'].fillna(0).to_numpy()  # .ix and .as_matrix() were removed from pandas
train_true = csv['y'].tolist()

trtrfe = train_features[:35000, :]
trtrtrue = train_true[:35000]

trtefe = train_features[35000:, :]
trtetrue = train_true[35000:]

print("learning")

for depth in [7, 10, 12, 15, 20, 30, 50, 70]:
    for leaf_samples in [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 20, 40, 60, 150]:
        # model = GradientBoostingClassifier(n_estimators=10, max_depth=depth, min_samples_leaf=leaf_samples, verbose=1)
        model = RandomForestClassifier(n_estimators=50, max_depth=depth, min_samples_leaf=leaf_samples, verbose=0,
                                       n_jobs=4)
        model.fit(trtrfe, trtrtrue)
        # mean accuracy on the given test data and labels
        # print depth, '\t', leaf_samples, '\t', model.score(trtefe, trtetrue)
        predicted = model.predict(trtefe)
        print(classification_report(trtetrue, predicted))
Example #41
print "Confusion matrix:"
print metrics.confusion_matrix(dat_clean.genre, predicted)

#####################
data_tree = dat_clean.iloc[:,[3,4,5,6,7,8,9,10,13,14,15]]
clf = clf.fit(data_tree, dat_clean.genre)

# Visualize tree
from io import StringIO  # Python 3 home of StringIO
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=list(data_tree.columns.values))
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]  # newer pydot returns a list of graphs
graph.write_pdf('dectree.pdf')


# Repeat on test set
y_test_pred = clf.predict(X_test)
print "Accuracy Test: {0:.3f}".format(metrics.accuracy_score(y_test, y_test_pred))
print
print "Classification report:"
print metrics.classification_report(y_test, y_test_pred)
print 
print "Confusion matrix:"
print metrics.confusion_matrix(y_test, y_test_pred)

# Measure performance
y_pred = clf.predict_proba(X_train)

# Repeat on test set
y_test_pred = clf.predict_proba(X_test)

tt = g_test.to_numpy()  # as_matrix() was removed from pandas
Example #42
mask = classifications != -1
print(mask.sum())
X = images[mask, ...].reshape(mask.sum(), np.prod(images.shape[1::]))
print(X.shape)
Y = classifications[mask]

acc = []
acc_correct = []
acc_incorrect = []
acc_x_incorrect = []
k_fold = 8
for train_inx, valid_inx in StratifiedKFold(n_splits=k_fold).split(X, Y):  # the old StratifiedKFold(Y, k) API was removed
    # compute_importances was removed from scikit-learn; feature_importances_ is always available
    rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True)
    rf.fit(X[train_inx], Y[train_inx])
    Yp = rf.predict(X[valid_inx])
    correct = Yp == Y[valid_inx]
    p_correct = rf.predict_proba(X[valid_inx]).max(axis=1)
    acc_correct.append(p_correct[correct])
    acc_incorrect.append(p_correct[~correct])

    score = correct.mean()
    print(score)
    acc.append(score)

    acc_x_incorrect.append([images[mask][valid_inx[~correct]],
                            Y[valid_inx[~correct]],
                            Yp[~correct]])

print('score', np.mean(acc))
Example #43
class TAERandomForestClassifier(object):
    lab_encoders = {}
    dummy_encoder = None
    rfc_model = None
    n_estimators = 100
    max_features = 7
    max_depth = 16

    def encode_fit(self, cat_data):
        #Encodes string to numeric labels
        tdc_set_encoded = cat_data.copy(deep=True)
        for cn in cat_data.columns:
            self.lab_encoders[cn] = preprocessing.LabelEncoder()
            self.lab_encoders[cn].fit(cat_data[str(cn)])
            tdc_set_encoded[str(cn)] = self.lab_encoders[cn].transform(
                cat_data[str(cn)])

        #Encodes to dummy dataset
        self.dummy_encoder = preprocessing.OneHotEncoder(categories="auto")
        self.dummy_encoder.fit(tdc_set_encoded[cat_data.columns])

        #print(len(self.dummy_encoder.get_feature_names()))

        encoded_cat_data = pd.DataFrame(
            data=self.dummy_encoder.transform(tdc_set_encoded).todense(),
            columns=self.dummy_encoder.get_feature_names())
        return encoded_cat_data

    def encode(self, cat_data):
        for cn in cat_data.columns:
            cat_data[str(cn)] = self.lab_encoders[cn].transform(
                cat_data[str(cn)])

        #Encodes to dummy dataset
        encoded_cat_data = pd.DataFrame(
            data=self.dummy_encoder.transform(cat_data).todense(),
            columns=self.dummy_encoder.get_feature_names())
        return encoded_cat_data

    def fit(self, x_train, y_train, cat_cols, num_cols):
        #Separates dataset in categorical and numbers
        x_train_num = x_train[num_cols].copy(deep=True)
        x_train_cat = x_train[cat_cols].copy(deep=True)

        x_train_cat = self.encode_fit(x_train_cat)

        x_train_num.reset_index(drop=True, inplace=True)
        x_train_cat.reset_index(drop=True, inplace=True)

        f_x_train = pd.concat([x_train_num, x_train_cat], axis=1)

        self.rfc_model = RandomForestClassifier(n_estimators=self.n_estimators,
                                                criterion="entropy",
                                                max_features=self.max_features,
                                                max_depth=self.max_depth)
        self.rfc_model = self.rfc_model.fit(f_x_train, y_train)

    def predict(self, x_predict, cat_cols, num_cols):
        #Separates dataset in categorical and numbers
        x_predict_num = x_predict[num_cols].copy(deep=True)
        x_predict_cat = x_predict[cat_cols].copy(deep=True)

        x_predict_cat = self.encode(x_predict_cat)
        f_x_predict = pd.concat([x_predict_num, x_predict_cat], axis=1)
        y_pred = self.rfc_model.predict(f_x_predict)
        return y_pred

    def cal_conf_matrix(self, x_test, y_test, catego_columns, numeric_cols):
        y_pred = self.predict(x_test, catego_columns, numeric_cols)
        # sklearn's confusion_matrix layout: [[TN, FP], [FN, TP]]
        print("Confusion matrix:")
        print(metrics.confusion_matrix(y_test, y_pred))

        #Run it several times and see how it varies. Based on the Jaccard index
        print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Example #44
import autopath
from datasets import training_set, test_set
from util import convert_gray_scale, flatten


Xr,Yr = training_set
Xe,Ye = test_set

Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True)  # compute_importances was removed from scikit-learn
rf.fit(Xr, Yr)

Yp = rf.predict(Xe)
print(np.mean(Yp == Ye))

Ypp = rf.predict_proba(Xe).max(axis=1)

plt.figure(1)
plt.clf()
plt.hist(Ypp[Yp == Ye], 50, color='b', density=True, alpha=0.4,  # normed= was removed from matplotlib
         label='classified')
plt.hist(Ypp[Yp != Ye], 50, color='r', density=True, alpha=0.4,
         label='misclassified')
plt.legend(loc='upper left')
plt.draw()
plt.show()

plt.figure(3)