def test_RandomForest(self):
        X = [[0, 1], [1, 1]]
        Y = [0, 1]

        regression = RandomForestClassifier(n_estimators=10)
        regression = regression.fit(X, Y)
        regression.predict_proba(X)
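For reference, predict_proba returns one row per sample and one column per class, with each row summing to 1. A minimal self-contained sketch (not part of the original test) checking that invariant:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X, Y = [[0, 1], [1, 1]], [0, 1]
clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, Y)
proba = clf.predict_proba(X)
assert proba.shape == (2, 2)                # (n_samples, n_classes)
assert np.allclose(proba.sum(axis=1), 1.0)  # each row is a probability distribution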
Example No. 2
class RandomForestClassifierImpl():

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start,
            'class_weight': class_weight}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
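A hypothetical usage sketch for the wrapper above. It assumes SKLModel aliases sklearn.ensemble.RandomForestClassifier (the snippet does not show the import) and a scikit-learn version contemporary with the code, since the min_impurity_split and max_features='auto' defaults it forwards were later removed:

from sklearn.ensemble import RandomForestClassifier as SKLModel  # assumed alias

impl = RandomForestClassifierImpl(n_estimators=20, random_state=0)
impl.fit([[0, 1], [1, 1], [1, 0]], [0, 1, 0])
print(impl.predict([[1, 1]]))
print(impl.predict_proba([[1, 1]]))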
Example No. 3
class MyRfClassifier(BaseClassifier):
    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(
            **{
                'verbose': 1,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'n_jobs': 40
            })
        self.name = "rf_n{n}_md{md}_ms{ms}".format(**{
            "n": n_estimators,
            "md": max_depth,
            "ms": min_samples_leaf
        })

    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
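A hypothetical usage sketch for MyRfClassifier; BaseClassifier and the class above are assumed importable, and the dataset is fabricated:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=6, random_state=0)
model = MyRfClassifier(n_estimators=50, max_depth=4, min_samples_leaf=2)
model.fit(X, y, None, None)  # X_t, y_t are accepted but unused by this wrapper
imps = model.get_feature_importances([f"f{i}" for i in range(6)])
print(model.get_name(), sorted(imps, key=imps.get, reverse=True)[:3])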
Example No. 4
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier

    """
    def __init__(self,
                 categorical_features=None,
                 n_estimators=50,
                 n_jobs=-1,
                 max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given dataset using
        the response variable y. It is a measure of the accuracy of the trained model. Usually
        the orginal dataset should be splitted in training and testing subsets to cross validate
        the model.
        '''
        return self.model.score(X, y)
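The confidence returned by predict_confidence is simply the largest class probability per row. A self-contained sketch of the same idea with a bare classifier (the Model wrapper and its one-hot helpers are not reproduced):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, random_state=0)
clf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0).fit(X, y)
confidence = clf.predict_proba(X).max(axis=1)  # per-sample maximum class probability
print(confidence[:5])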
Example No. 5
def train_model(X_train, y_train):
    print("training the model ...")
    rf = RandomForestClassifier(n_estimators=1000,
                                max_depth=8,
                                n_jobs=-1,
                                verbose=1)
    #    rf = svm.SVC(kernel='rbf', gamma=0.7, C=1.0,probability=True)

    rf.fit(X_train, y_train)
    y_pred_train = rf.predict_proba(X_train)

    fpr, tpr, thresholds = roc_curve(y_train, y_pred_train[:, 1], pos_label=1)  # column 1 holds P(class 1), matching pos_label=1
    print("AUC on train : {:.02f} %".format(auc(fpr, tpr) * 100))

    return rf
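For the record, roc_auc_score computes the same area directly from the positive-class scores, without building the curve by hand. A self-contained sketch on fabricated data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_auc_score, roc_curve

X, y = make_classification(n_samples=300, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
scores = rf.predict_proba(X)[:, 1]  # probability of the positive class
fpr, tpr, _ = roc_curve(y, scores, pos_label=1)
assert np.isclose(auc(fpr, tpr), roc_auc_score(y, scores))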
Example No. 6
class MyRfClassifier(BaseClassifier):
    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        self.classifier = RandomForestClassifier(**{'verbose':1, 'n_estimators': n_estimators,
                                                    'max_depth':max_depth,'min_samples_leaf':min_samples_leaf,
                                                    'n_jobs':40})
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )
    def get_name(self):
        return self.name

    def fit(self, X, y, X_t, y_t):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
Example No. 7
class MyRandomForestClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators = 2000, max_depth=8, min_samples_leaf=10000,
                 n_jobs=25):
        self.classifier = RandomForestClassifier( **{'verbose': verbose,
                                                     'n_estimators': n_estimators,
                                                     'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf,
                                                      'n_jobs': n_jobs})
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        return self.classifier.feature_importances_
Example No. 8
def tune_model(X, y, K=5):
    print("tuning the model ...")
    """logging"""

    # the winner is
    #    {'max_features' : [sqrt'],
    #              'n_estimators' : [2000],
    #              'min_samples_leaf' : [1]
    #              }
    #
    """ """

    params = {
        'max_features': ['auto', 'sqrt', 0.2, 0.4],
        'n_estimators': [10, 50, 100, 500, 1000, 2000],
        'min_samples_leaf': [0.01, 0.02, 0.05, 0.1, 0.15, 0.2],
        'max_depth': [None, 3, 5, 7, 8, 9, 10]
    }

    nb_scenarios = np.prod([len(params[x]) for x in params])  # np.product was removed in NumPy 2.0
    results = []
    for max_f in params['max_features']:
        for n_est in params['n_estimators']:
            for min_leaf in params['min_samples_leaf']:
                for max_dep in params['max_depth']:
                    kf = StratifiedKFold(n_splits=K)
                    errors_fold = []
                    for train_index, test_index in kf.split(X, y):
                        X_train_bis, X_test = X[train_index], X[test_index]
                        y_train_bis, y_test = y[train_index], y[test_index]

                        rf = RandomForestClassifier(max_features=max_f,
                                                    n_estimators=n_est,
                                                    min_samples_leaf=min_leaf,
                                                    max_depth=max_dep,
                                                    n_jobs=-1,
                                                    class_weight='balanced')
                        rf.fit(X_train_bis, y_train_bis)
                        y_pred_test = rf.predict_proba(X_test)
                        logloss = log_loss(y_test, y_pred_test)
                        errors_fold.append(logloss)

                    result = {
                        'max_features': max_f,
                        'n_estimators': n_est,
                        'min_samples_leaf': min_leaf,
                        'max_depth': max_dep,
                        'cv_logloss': np.mean(errors_fold)
                    }

                    results.append(result)
                    print("=" * 10 +
                          " {}/{} ".format(len(results), nb_scenarios) +
                          "=" * 10)
                    for key, value in result.items():
                        print("{} : {}".format(key, value))

    results = sorted(results, key=lambda x: x['cv_logloss'])
    best_result = results[0]

    with open('data/s2_meta/best_tuning_rf.json', 'w') as fp:
        json.dump(best_result, fp, indent=4)

    return results
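The nested loops above amount to an exhaustive grid search with stratified cross-validation. A self-contained sketch of the same search via GridSearchCV, on fabricated data and a reduced grid (scoring='neg_log_loss' mirrors the hand-rolled log-loss CV):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=500, random_state=0)
grid = {'max_features': ['sqrt', 0.2, 0.4],   # subset of the grid above
        'n_estimators': [50, 200],
        'min_samples_leaf': [0.01, 0.05]}
search = GridSearchCV(RandomForestClassifier(n_jobs=-1, class_weight='balanced'),
                      param_grid=grid,
                      scoring='neg_log_loss',
                      cv=StratifiedKFold(n_splits=5))
search.fit(X, y)
print(search.best_params_, -search.best_score_)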
Example No. 9
	def runns(resp_var, size_of_test_data,dataset,positive_class,predictor_var, n_estimators,important_features,dealing_with_nulls):
		dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
		#----DATA PREPROCESSING
		#-------dealing with NULL values in the data
		#----------remove the rows in which the response is null

		dataset=dataset.dropna(subset=[resp_var])
		#----------dealing with nulls
		dataset=deal_with_nulls(dealing_with_nulls,dataset)
		#----FEATURE SELECTION
		#-------get predictors important in predicting the response
		#-----------transform categorical predictors to dummy variables
		predictors=dataset[predictor_var]
		predictors=pd.get_dummies(predictors)
		#-----------balance the classes in the response var
		ros = RandomOverSampler(random_state=0)
		resp=dataset[resp_var]
		prds, resp = ros.fit_sample(predictors, resp)
		#-----------fit the random forest classifier to give us the important predictors
		rf_clf = RandomForestClassifier(n_estimators=n_estimators)
		rf_clf.fit(prds,resp)
		#-------get the important predictors
		feature_imp = pd.Series(rf_clf.feature_importances_,
						index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
		#-------names of the important predictors
		important_predictor_names = feature_imp.index[0:important_features]
		#-------subset the data to get only the important predictors and the response
		resp=pd.DataFrame(data=resp,columns=[resp_var])
		predictors=pd.DataFrame(prds,columns=list(predictors))
		dataset=pd.concat([resp,predictors],axis=1)
		#---------------------------------------------------------
		#----MODEL TRAINING
		#--------Remove the response variables from the features variables - axis 1 refers to the columns
		m_data= dataset.drop(resp_var, axis = 1,inplace=False) 
		# Response variables are the values we want to predict
		resp_var = np.array(dataset[resp_var])

		dataset = pd.get_dummies(m_data)
		
		# Saving feature names for later use
		feature_list = list(m_data.columns)
		# Convert to numpy array
		dataset = np.array(dataset)

		# Split the data into training and testing sets
		train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = float(size_of_test_data), random_state = 402)

		# Instantiate model with n_estimators decision trees
		clf = RandomForestClassifier(n_jobs = 1,n_estimators = n_estimators, random_state = 142)

		# Train the model on training data
		clf.fit(train_features, train_labels)
		# evaluation
		predicted = clf.predict(test_features)
		pred_prob = clf.predict_proba(test_features)
		
		accuracy = accuracy_score(test_labels, predicted)
		#confusion matrix
		cnf = (confusion_matrix(test_labels,predicted))
		#precision score
		precision = precision_score(test_labels,predicted,pos_label=positive_class)
		#avg pres
		avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
		#recall score
		rec = recall_score(test_labels,predicted,pos_label=positive_class)
		#f1 score
		fscore = f1_score(test_labels,predicted,pos_label=positive_class)
		#fbeta score
		fbeta = fbeta_score(test_labels,predicted,beta=0.5)
		#hamming_loss
		hamming = hamming_loss(test_labels,predicted)
		#jaccard similarity score
		jaccard = jaccard_similarity_score(test_labels,predicted)
		#logloss
		logloss = log_loss(test_labels,predicted)
		#zero-oneloss
		zero_one = zero_one_loss(test_labels,predicted)
		#auc roc 
		area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
		#cohen_score
		cohen = cohen_kappa_score(test_labels,predicted)
		#matthews corr
		mathews = matthews_corrcoef(test_labels,predicted)
		# Variable importances from the important features selection stage
		variable_importance_list = list(zip(prds, feature_imp))
		output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
		output=json.dumps(output)
		return output
Example No. 10
# Visualize tree
dot_data = io.StringIO()  # Python 3's io.StringIO (the original used Python 2's StringIO module)
tree.export_graphviz(clf, out_file=dot_data, feature_names=list(data_tree.columns.values))
(graph,) = pydot.graph_from_dot_data(dot_data.getvalue())  # pydot >= 1.2 returns a list of graphs
graph.write_pdf('dectree.pdf')


# Repeat on test set
y_test_pred = clf.predict(X_test)
print("Accuracy Test: {0:.3f}".format(metrics.accuracy_score(y_test, y_test_pred)))
print()
print("Classification report:")
print(metrics.classification_report(y_test, y_test_pred))
print()
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_test_pred))

# Measure performance
y_pred = clf.predict_proba(X_train)

# Repeat on test set
y_test_pred = clf.predict_proba(X_test)

tt = g_test.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
pred = tt * y_test_pred

ss = np.sum(pred, axis=1)

sss = ss.mean()

print(sss)
Example No. 11
#:# model

params = {'max_depth': 3, 'n_estimators': 75}

classifier = RandomForestClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# 5475503c9e4b64dc0dcc4960399cf72c
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
print(f'precision: {precision_score(y_test, y_pred)}')
print(f'recall: {recall_score(y_test, y_pred)}')
print(f'specificity: {tn/(tn+fp)}')
print(f'f1: {f1_score(y_test, y_pred)}')

#:# session info

# Add the Python version to session info

sessionInfo = {
Example No. 12
    def RF_trainandtest(self, unionscores, cutscore, testsize, cv, feature_sel, varthreshold, ntrees, nodes, rfmethod, nclusters, cmethod, resmethod):
        
        # Split the dataset into a training set and a test set
        if unionscores == True:
            data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'rsk_score'], axis = 1)
        else:
            data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'cst_score',
                                           'cnp_score', 'cnt_score', 'chv_score', 'dsi_score','rsk_score'], axis = 1)
        data_target = (self.data['rsk_score'] < cutscore).astype('int')
        X_train, X_test, y_train, y_test = train_test_split(data_feature, data_target, test_size=testsize, random_state=0)
        if testsize == 0:
            X_test, y_test = X_train.head(5), y_train.head(5)
            
        # Coarse-bin the training set variables and apply the WoE transformation, then bin and WoE-transform the test set accordingly
        X_train, X_test = self.binandwoe_traintest_pkl(X_train, y_train, X_test, nclusters, cmethod, self.label)
       
        # Feature selection on the training set, using methods from sklearn.feature_selection
        if feature_sel == "VarianceThreshold":
            selector = VarianceThreshold(threshold = varthreshold)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "RFECV":
            estimator = LogisticRegression()
            selector = RFECV(estimator, step=1, cv=cv)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "SelectFromModel":
            estimator = LogisticRegression()
            selector = SelectFromModel(estimator)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "SelectKBest":
            selector = SelectKBest()
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        else:
            X_train1, X_test1 = X_train, X_test        

        testcolumns = X_test1.columns 
        
        # Resampling to address class imbalance
        X_train1, y_train = self.imbalanceddata (X_train1, y_train, resmethod) 
            
        # Train the random forest model and predict
        if rfmethod == 'RandomForest':
            classifier = RandomForestClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)
        elif rfmethod == 'ExtraTrees':
            classifier = ExtraTreesClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)
        elif rfmethod == 'GradientBoosting':
            classifier = GradientBoostingClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)

        classifier.fit(X_train1, y_train)  
        probability = classifier.predict_proba(X_test1)
        
        predresult = pd.DataFrame({'target' : y_test, 'probability' : probability[:,1]})
        predresult = pd.concat([predresult, X_test], axis = 1)

        if self.label is not None:  # label is None for model training; not None when saving a production model
            joblib.dump(classifier, "allinpay projects\\creditscore_TLSW_fyz\\pkl\\classifier_" + self.label + '.pkl')
            joblib.dump(testcolumns, "allinpay projects\\creditscore_TLSW_fyz\\pkl\\testcolumns_" + self.label + '.pkl')
        
        
        return predresult
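For reference, the joblib persistence at the end of the method is a plain dump/load round trip. A self-contained sketch with an illustrative path (not the project's real layout):

import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(random_state=0)
clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
joblib.dump(clf, 'classifier_demo.pkl')   # illustrative path
clf2 = joblib.load('classifier_demo.pkl')
assert (clf.predict(X) == clf2.predict(X)).all()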
Example No. 13
                    feature_labels = dataset_features[ds_label][fe_label][
                        'feature_labels']
                    X = np.copy(
                        dataset_features[ds_label][fe_label]['features'])
                    X = np.nan_to_num(X)
                    feat_train = X[train_index, :]
                    feat_test = X[test_index, :]

                # feature normalization & model training
                feat_train = scaler.fit_transform(feat_train)
                classifier.fit(feat_train, target_train)

                feature_importance[(d, f, i)] = classifier.feature_importances_
                # feature scaling & prediction
                feat_test = scaler.transform(feat_test)
                y_pred_all[(d, f, i)] = classifier.predict_proba(feat_test)
                y_true_all[(d, f, i)] = y[test_index]
                all_file_id = np.array([
                    dataset.file_label_to_id[_]
                    for _ in dataset.metadata['fn_wav'].tolist()
                ])
                file_id_all[(d, f, i)] = all_file_id[test_index]

    with open(os.path.join(dir_results, 'feature_importance.pckl'),
              'wb+') as f:
        pickle.dump(feature_importance, f)

    f1_scores = np.zeros((num_datasets, num_extractors, num_folds))
    f1_scores_file = np.zeros((num_datasets, num_extractors, num_folds))

    # iterate over datasets
Example No. 14
def main():
    operMode = args.operMode
    logging.info('Random forest working in operMode: {}'.format(operMode))

    input_in1_file = 'iris.csv'
    df = pd.read_csv(input_in1_file)
    if operMode == 'TRAINING':
        label_name = args.label_name
        n_estimators = args.n_estimators
        shuffle = args.shuffle
        split_ratio = args.split_ratio
        criterion = args.criterion
        max_features = args.max_features
        max_depth = args.max_depth
        min_samples_split = args.min_samples_split
        min_samples_leaf = args.min_samples_leaf
        min_weight_fraction_leaf = args.min_weight_fraction_leaf
        min_impurity_decrease = args.min_impurity_decrease
        bootstrap = args.bootstrap
        n_jobs = args.n_jobs

        logging.info('model parameter as follow:\n'
                     'label_name: {}\n'
                     'n_estimators: {}\n'
                     'split_ratio: {}\n'
                     'shuffle: {}\n'
                     'criterion: {}\n'
                     'max_features: {}\n'
                     'max_depth: {}\n'
                     'min_samples_split: {}\n'
                     'min_samples_leaf: {}\n'
                     'min_weight_fraction_leaf: {}\n'
                     'min_impurity_decrease: {}\n'
                     'bootstrap: {}\n'
                     'n_jobs: {}'.format(label_name, n_estimators, split_ratio, shuffle, criterion,
                                         max_features, max_depth, min_samples_split,
                                         min_samples_leaf, min_weight_fraction_leaf,
                                         min_impurity_decrease, bootstrap, n_jobs))

        tra_df, val_df = train_val_split(df, ratio=split_ratio, shuffle=shuffle)
        columns = df.columns.tolist()
        tra_y = tra_df[label_name].values
        val_y = val_df[label_name].values
        columns.remove(label_name)
        tra_x = tra_df[columns].values
        val_x = val_df[columns].values

        logging.info("Random Fortest Training Start...")
        try:
            clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                         min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                         min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features,
                                         min_impurity_decrease=min_impurity_decrease,
                                         bootstrap=bootstrap, n_jobs=n_jobs).fit(tra_x, tra_y)
        except Exception as e:
            logging.error("Unexpected Error {}".format(e))
            exit(0)

        logging.info("Random Fortest Training End and Stroe Model...")
        with open("rf.pkl", "wb") as f:
            pickle.dump(clf, f)

        val_y_pred_prob = clf.predict_proba(val_x)
        val_y_pred_label = clf.predict(val_x)

        cfmt = confusion_matrix(val_y, val_y_pred_label).tolist()

        top1_acc = top_k(val_y, val_y_pred_prob, clf.classes_, k=1)
        top5_acc = top_k(val_y, val_y_pred_prob, clf.classes_, k=5)
        fprs = []
        tprs = []
        aucs = []
        recalls = []
        precisions = []
        aps = []

        for c in range(len(clf.classes_)):
            val_y_true_binary = val_y == clf.classes_[c]
            val_y_pred_binary = val_y_pred_prob[:, c]
            fpr, tpr, thres_roc = roc_curve(val_y_true_binary, val_y_pred_binary, pos_label=1)
            auc = roc_auc_score(val_y_true_binary, val_y_pred_binary)
            precision, recall, thres_pr = precision_recall_curve(val_y_true_binary, val_y_pred_binary)
            ap = average_precision_score(val_y_true_binary, val_y_pred_binary)
            fprs.append(fpr.tolist())
            tprs.append(tpr.tolist())
            aucs.append(auc)
            recalls.append(recall.tolist())
            precisions.append(precision.tolist())
            aps.append(ap)

        pfmn_dict = {}
        pfmn_dict['graphs'] = []
        # ROC curve
        graph_roc = {}
        graph_roc['name'] = 'ROC curve'
        graph_roc['x_title'] = 'fpr'
        graph_roc['y_title'] = 'tpr'
        graph_roc['lines'] = []
        for i in range(len(fprs)):
            line = {}
            line['name'] = 'ROC curve for label {}'.format(i)
            line['relative'] = []
            relative = {}
            relative['name'] = 'auc'
            relative['value'] = aucs[i]
            line['relative'].append(relative)
            line['x_axis'] = fprs[i]
            line['y_axis'] = tprs[i]
            graph_roc['lines'].append(line)
        pfmn_dict['graphs'].append(graph_roc)
        # PR curve
        graph_pr = {}
        graph_pr['name'] = 'PR curve'
        graph_pr['x_title'] = 'recall'
        graph_pr['y_title'] = 'precision'
        graph_pr['lines'] = []
        for i in range(len(recalls)):
            line = {}
            line['name'] = 'PR curve for label {}'.format(i)
            line['relative'] = []
            relative = {}
            relative['name'] = 'ap'
            relative['value'] = aps[i]
            line['relative'].append(relative)
            line['x_axis'] = recalls[i]
            line['y_axis'] = precisions[i]
            graph_pr['lines'].append(line)
        pfmn_dict['graphs'].append(graph_pr)

        # Confusion matrix
        pfmn_dict['matrixs'] = []
        matrix = {}
        matrix['name'] = 'Confusion matrix'
        matrix['col_name'] = clf.classes_.tolist()
        matrix['row_name'] = clf.classes_.tolist()
        matrix['elements'] = cfmt
        pfmn_dict['matrixs'].append(matrix)
        # Numeric metrics
        pfmn_dict['evaluation'] = []
        evals_top1 = {}
        evals_top1['name'] = "top1"
        evals_top1['value'] = top1_acc
        pfmn_dict['evaluation'].append(evals_top1)
        if top5_acc:
            evals_top5 = {}
            evals_top5['name'] = 'top5'
            evals_top5['value'] = top5_acc
            pfmn_dict['evaluation'].append(evals_top5)

        pfmn_str = json.dumps(pfmn_dict)
        with open('pfmn.json', 'w') as f:
            f.write(pfmn_str)
        logging.info('Random Forest Model Evaluation finished!')
    elif operMode == 'PREDICTION':
        has_label = args.has_label
        label_name = args.label_name
        load_model = args.load_model

        logging.info('model parameter configure as follow:\n'
                     'has_label: {}\n'
                     'label_name: {}\n'
                     'load_model: {}\n'.format(has_label, label_name, load_model))
        if has_label:
            if label_name is None:
                try:
                    raise Exception('if parameter has_label is true, label_name must not be none')
                except Exception as e:
                    logging.error(e)
                    exit(0)
        if has_label:
            columns = df.columns.tolist()
            test_y = df[label_name].values
            columns.remove(label_name)
            test_x = df[columns].values
        else:
            test_x = df.values

        logging.info("Random Fortest Load Model ")
        model_path = load_model
        if not os.path.exists(model_path):
            try:
                raise Exception('model file {} will be loaded not exists!'.format(model_path))
            except Exception as e:
                logging.error('Unexpected Error {}'.format(e))
                exit(0)
        with open(model_path, 'rb') as f:
            clf = pickle.load(f)
        test_y_pred_prob = clf.predict_proba(test_x)
        if has_label:
            fprs = []
            tprs = []
            aucs = []
            recalls = []
            precisions = []
            aps = []
            for c in range(len(clf.classes_)):
                test_y_true_binary = test_y == clf.classes_[c]
                test_y_pred_binary = test_y_pred_prob[:, c]
                fpr, tpr, thres_roc = roc_curve(test_y_true_binary, test_y_pred_binary, pos_label=1)
                auc = roc_auc_score(test_y_true_binary, test_y_pred_binary)
                precision, recall, thres_pr = precision_recall_curve(test_y_true_binary, test_y_pred_binary)
                ap = average_precision_score(test_y_true_binary, test_y_pred_binary)
                fprs.append(fpr.tolist())
                tprs.append(tpr.tolist())
                aucs.append(auc)
                recalls.append(recall.tolist())
                precisions.append(precision.tolist())
                aps.append(ap)
            test_y_pred_label = clf.predict(test_x)
            cfmt = confusion_matrix(test_y, test_y_pred_label).tolist()
            top1_acc = top_k(test_y, test_y_pred_prob, clf.classes_, k=1)
            top5_acc = top_k(test_y, test_y_pred_prob, clf.classes_, k=5)
            pfmn_dict = {}
            pfmn_dict['graphs'] = []
            # ROC curve
            graph_roc = {}
            graph_roc['name'] = 'ROC curve'
            graph_roc['x_title'] = 'fpr'
            graph_roc['y_title'] = 'tpr'
            graph_roc['lines'] = []
            for i in range(len(fprs)):
                line = {}
                line['name'] = 'ROC curve for label {}'.format(i)
                line['relative'] = []
                relative = {}
                relative['name'] = 'auc'
                relative['value'] = aucs[i]
                line['relative'].append(relative)
                line['x_axis'] = fprs[i]
                line['y_axis'] = tprs[i]
                graph_roc['lines'].append(line)
            pfmn_dict['graphs'].append(graph_roc)
            # PR curve
            graph_pr = {}
            graph_pr['name'] = 'PR curve'
            graph_pr['x_title'] = 'recall'
            graph_pr['y_title'] = 'precision'
            graph_pr['lines'] = []
            for i in range(len(recalls)):
                line = {}
                line['name'] = 'PR curve for label {}'.format(i)
                line['relative'] = []
                relative = {}
                relative['name'] = 'ap'
                relative['value'] = aps[i]
                line['relative'].append(relative)
                line['x_axis'] = recalls[i]
                line['y_axis'] = precisions[i]
                graph_pr['lines'].append(line)
            pfmn_dict['graphs'].append(graph_pr)

            # Confusion matrix
            pfmn_dict['matrixs'] = []
            matrix = {}
            matrix['name'] = 'Confusion matrix'
            matrix['col_name'] = clf.classes_.tolist()
            matrix['row_name'] = clf.classes_.tolist()
            matrix['elements'] = cfmt
            pfmn_dict['matrixs'].append(matrix)
            # Numeric metrics
            pfmn_dict['evaluation'] = []
            evals_top1 = {}
            evals_top1['name'] = "top1"
            evals_top1['value'] = top1_acc
            pfmn_dict['evaluation'].append(evals_top1)
            if top5_acc:
                evals_top5 = {}
                evals_top5['name'] = 'top5'
                evals_top5['value'] = top5_acc
                pfmn_dict['evaluation'].append(evals_top5)

            pfmn_str = json.dumps(pfmn_dict)
            with open('pfmn.json', 'w') as f:
                f.write(pfmn_str)
    else:
        logging.fatal('Random forest does not support {}'.format(operMode))
        raise Exception('Random forest does not support {}'.format(operMode))
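The TRAINING and PREDICTION branches communicate through a pickled model file. A minimal self-contained sketch of that round trip, using the bundled iris data instead of the script's CSV:

import pickle
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
with open('rf.pkl', 'wb') as f:   # mirrors the TRAINING branch
    pickle.dump(clf, f)
with open('rf.pkl', 'rb') as f:   # mirrors the PREDICTION branch
    clf2 = pickle.load(f)
print(clf2.predict_proba(X[:3]))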
Example No. 15
cv_model.cv_results_

### ASSESS BEST PARAMS TREE AND SCORE
tree_model = RandomForestClassifier(random_state=297,
                                    **cv_model.best_params_)  ####ONLY IF THE PREVIOUS MODEL IS A SearchCV
tree_model = tree_model.fit(trainX, trainY.values.ravel())
tree_model.score(trainX, trainY)
tree_model.score(testX, testY)

### CHECK IMPORTANCE OF FEATURES
feature_importance = pd.DataFrame(tree_model.feature_importances_, index=trainX.columns, columns=['Imp']).reset_index()
feature_importance['pk'] = 1
plot_scatter(feature_importance, 'index', 'Imp', 'index')
plot_bar(feature_importance, 'index', 'Imp', 'index')

### PREDICT
prediction = tree_model.predict(features_all)
tree_model.predict_proba(features_all)

#### VISUALIZE TREE
### ONLY FOR SIMPLE DECISION TREE
# tree.export_graphviz(tree_model,
#                      feature_names=list(trainX.columns),
#                      out_file='/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.dot')
# (graph,) = pydot.graph_from_dot_file('/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.dot')
# graph.write_png('/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.png')

skplt.metrics.plot_confusion_matrix(target, prediction, normalize=True)
pd.crosstab(target['Survived'], prediction)
sum(target['Survived'])
Example No. 16
    '''

    oversampled_path = "resources/oversampled_normalized_data_ratio_2.bin"
    homesite = Data()
    homesite.load_sliptted_data(oversampled_path)
    del homesite.test_x  # Deleted to save memory.
    print(homesite.train_x.shape)

    # Creating classifier.
    # clf = DecisionTreeClassifier()
    clf = RandomForestClassifier(max_features=100)
    # clf = AdaBoostClassifier(n_estimators = 10)
    # clf = svm.SVC(gamma = 0.00005)
    # clf = RandomForestClassifier()
    # clf = MultiplePLS(n_classifiers = 10, n_samples = 5000, n_positive_samples = 2500, threshold = 0.9, acc = 0.999)
    # clf = svm.LinearSVC()

    # Train classifier.
    print "Training classifier."
    clf.fit(homesite.train_x, homesite.train_y)

    # Test classifier.
    print('Testing classifier.')
    predicted_labels = clf.predict_proba(homesite.validation_x)[:, 1]

    # Show final results.
    results = confusion_matrix(homesite.validation_y,
                               np.round(predicted_labels))
    accuracy, precision, recall = compute_performance_metrics(results)
    auc = compute_auc(homesite.validation_y, predicted_labels)
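compute_performance_metrics and compute_auc are project-local helpers not shown here. For reference, a self-contained sketch of the same metrics with plain scikit-learn on toy arrays:

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

y_true = np.array([0, 1, 1, 0, 1, 0])
y_score = np.array([0.2, 0.8, 0.6, 0.4, 0.3, 0.1])  # predicted P(class 1)
y_hat = np.round(y_score).astype(int)
print('accuracy :', accuracy_score(y_true, y_hat))
print('precision:', precision_score(y_true, y_hat))
print('recall   :', recall_score(y_true, y_hat))
print('auc      :', roc_auc_score(y_true, y_score))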
Example No. 17
from util import convert_gray_scale, flatten


Xr,Yr = training_set
Xe,Ye = test_set

Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True)  # compute_importances was removed from scikit-learn; feature_importances_ is always available
rf.fit(Xr, Yr)

Yp = rf.predict(Xe)
print(np.mean(Yp == Ye))

Ypp = rf.predict_proba(Xe).max(axis=1)

plt.figure(1)
plt.clf()
plt.hist(Ypp[Yp == Ye], 50, color='b', density=True, alpha=0.4,
         label='classified')
plt.hist(Ypp[Yp != Ye], 50, color='r', density=True, alpha=0.4,
         label='misclassified')
plt.legend(loc='upper left')
plt.draw()
plt.show()

plt.figure(3)
plt.clf()

n = 0.01 * float(len(Yp))
Example No. 18
#                                 normalize=True)

#clf = xgbwrapper.XgbWrapper({'objective': 'binary:logistic',
#                  'eval_metric': 'auc',
#                  'eta': 0.1,
#                  'silent': 1,
#                  'max_delta_step': 1})

# 'Normal' 70 / 30 cross-validation
if do_cross_val == 1:
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, train.WnvPresent, test_size=0.3, random_state=0)  # sklearn.cross_validation is sklearn.model_selection in 0.18+

    clf.fit(X_train, y_train)

    y_pred = clf.predict_proba(X_test)
    print(metrics.roc_auc_score(y_test, y_pred))

elif do_cross_val == 2:

    # Leave-one-year-out cross-validation
    scores = []
    total_pred = np.array([])
    total_test = np.array([])

    for year in [2007, 2009, 2011, 2013]:

        X_train, X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split(
            train_for_loo, 'WnvPresent_DateTrapSpecies', year)

        X_train.to_csv("data_per_year/" + str(year) + "X_train.csv",
Example No. 19
    def RF_trainandtest_kfold(self, unionscores, nsplit, cutscore, cv, feature_sel, varthreshold, ntrees, nodes, rfmethod, nclusters, cmethod, resmethod):
        
        if unionscores == True:
            data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'rsk_score'], axis = 1)
        else:
            data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'cst_score',
                                           'cnp_score', 'cnt_score', 'chv_score', 'dsi_score','rsk_score'], axis = 1)
        data_target = (self.data['rsk_score'] < cutscore).astype('int')

        # Split the dataset into k folds; each fold in turn serves as the test set, with the remaining data as the training set
        kf = KFold(n_splits=nsplit, shuffle=True)
        predresult = pd.DataFrame()
        for train_index, test_index in kf.split(data_feature):
            X_train, X_test = data_feature.iloc[train_index, ], data_feature.iloc[test_index, ]
            y_train, y_test = data_target.iloc[train_index, ], data_target.iloc[test_index, ]
            
            # If random sampling leaves only one class in train or test, skip this iteration
            if (len(y_train.unique()) == 1) or (len(y_test.unique()) == 1):
                continue
            
            # Coarse-bin and WoE-transform the training set, then bin and WoE-transform the test set accordingly
            X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test, nclusters, cmethod)
                    
            # Feature selection on the training set, using methods from sklearn.feature_selection
            if feature_sel == "VarianceThreshold":
                selector = VarianceThreshold(threshold = varthreshold)
                X_train1 = pd.DataFrame(selector.fit_transform(X_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "RFECV":
                estimator = LogisticRegression()
                selector = RFECV(estimator, step=1, cv=cv)
                X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "SelectFromModel":
                estimator = LogisticRegression()
                selector = SelectFromModel(estimator)
                X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "SelectKBest":
                selector = SelectKBest()
                X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            else:
                X_train1, X_test1 = X_train, X_test      

            # Resampling to address class imbalance
            X_train1, y_train = self.imbalanceddata (X_train1, y_train, resmethod)
            
            # Train the random forest model and predict
            if rfmethod == 'RandomForest':
                classifier = RandomForestClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)
            elif rfmethod == 'ExtraTrees':
                classifier = ExtraTreesClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)
            elif rfmethod == 'GradientBoosting':
                classifier = GradientBoostingClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)

            classifier.fit(X_train1, y_train)  
            probability = classifier.predict_proba(X_test1)
            
            temp = pd.DataFrame({'target' : y_test, 'probability' : probability[:,1]})
            predresult = pd.concat([predresult, temp], ignore_index = True)        

            
        return predresult
Example No. 20
Xr, Yr = training_set
Xe, Ye = test_set

Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

rf = RandomForestClassifier(n_estimators=100,
                            verbose=3,
                            oob_score=True)  # compute_importances was removed from scikit-learn
rf.fit(Xr, Yr)

Yp = rf.predict(Xe)
print(np.mean(Yp == Ye))

Ypp = rf.predict_proba(Xe).max(axis=1)

plt.figure(1)
plt.clf()
plt.hist(Ypp[Yp == Ye],
         50,
         color='b',
         density=True,
         alpha=0.4,
         label='classified')
plt.hist(Ypp[Yp != Ye],
         50,
         color='r',
         density=True,
         alpha=0.4,
         label='misclassified')
Example No. 21
from data.numpy_file import save_np_array, load_np_array
from data.plot import plot
import numpy as np
import pandas as pd
from statistics.confusion_matrix import confusion_matrix
from statistics.performance import compute_performance_metrics, compute_auc

if __name__ == '__main__':
    '''
    Classify data changing balancing ratio.
    '''

    # Train and test random forests.
    path = "../homesite_data/resources/parsed_data.bin"
    homesite = Data()
    homesite.load_parsed_data(path)
    homesite.z_norm_train_test_by_feature()
    sm = OverSampler(verbose=False, ratio=2.5)
    homesite.train_x, homesite.train_y = sm.fit_transform(
        homesite.train_x, homesite.train_y)

    clf = RandomForestClassifier(n_estimators=300, max_features=100, n_jobs=4)

    # Train classifier.
    print "Training classifier."
    clf.fit(homesite.train_x, homesite.train_y)
    predicted_labels = clf.predict_proba(homesite.test_x)[:, 1]
    sample = pd.read_csv('../input/sample_submission.csv')
    sample.QuoteConversion_Flag = predicted_labels
    sample.to_csv('rfc_300.csv', index=False)
Example No. 22
print(mask.sum())
X = images[mask, ...].reshape(mask.sum(), np.prod(images.shape[1::]))
print(X.shape)
Y = classifications[mask]

acc = []
acc_correct = []
acc_incorrect = []
acc_x_incorrect = []
k_fold = 8
for train_inx, valid_inx in StratifiedKFold(n_splits=k_fold).split(X, Y):  # modern sklearn split API
    rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True)
    rf.fit(X[train_inx], Y[train_inx])
    Yp = rf.predict(X[valid_inx])
    correct = Yp== Y[valid_inx]
    rf.predict_proba(X[valid_inx])
    p_correct = rf.predict_proba(X[valid_inx]).max(axis=1)
    acc_correct.append(p_correct[correct])
    acc_incorrect.append(p_correct[~correct])

    score = correct.mean()
    print(score)
    acc.append(score)

    acc_x_incorrect.append([images[mask][valid_inx[~correct]],
                            Y[valid_inx[~correct]],
                            Yp[~correct]])

print('score', np.mean(acc))

rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True)  # compute_importances was removed from scikit-learn
Example No. 23
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

test_loss = np.zeros((params['n_estimators'],), dtype=np.float64)
train_loss = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    y_sig = (1.0 / (1.0 + np.exp(0.0 - y_pred)))
    test_loss[i] = log_loss(y_test, y_sig)#clf.loss_(y_test, y_sig)

for i, y_pred in enumerate(clf.staged_decision_function(X_train)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    y_sig = (1.0 / (1.0 + np.exp(0.0 - y_pred)))
    train_loss[i] = log_loss(y_train, y_sig)#clf.loss_(y_train, y_sig)

plt.figure()
plt.plot(test_loss, 'r', linewidth=2)
plt.plot(train_loss, 'g', linewidth=2)
plt.legend(['test', 'train'])

i = np.argmin(test_loss)
    
print('min log-loss: ', np.round(test_loss[i],2), ' iteration#: ', i)

rfc = RandomForestClassifier(random_state=241, n_estimators=i)
rfc.fit(X_train, y_train)
y_pred = rfc.predict_proba(X_test)

print('RandomForest log-loss: ', np.round(log_loss(y_test, y_pred),2))
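The staged_decision_function-plus-sigmoid construction above can also be written with staged_predict_proba, which yields class probabilities at each boosting stage. A self-contained sketch of the same early-stopping search on fabricated data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss

X, y = make_classification(n_samples=300, random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, random_state=0).fit(X, y)
losses = [log_loss(y, proba) for proba in clf.staged_predict_proba(X)]
print('min log-loss:', round(min(losses), 2), ' iteration#:', int(np.argmin(losses)))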
Example No. 24
 
kf = KFold(n_splits=10)
predictions = []

print('PCA with RandomForest model training...')

for train_index, val_index in kf.split(df_features):

    Train_X = df_features.iloc[train_index]
    Train_Y = df_label.iloc[train_index]
    Val_X = df_features.iloc[val_index]

    clf = RandomForestClassifier(n_estimators=50, min_samples_split=2, min_samples_leaf=1, oob_score=True)
    clf.fit(Train_X, Train_Y)

    predict_Val_Y = clf.predict_proba(Val_X)[:, 1]

    # Apply a custom 0.44 decision threshold to the positive-class probability
    predict_Val_Y[predict_Val_Y <= 0.44] = 0
    predict_Val_Y[predict_Val_Y > 0.44] = 1

    predictions.append(predict_Val_Y)

predictions = np.concatenate(predictions, axis=0)

# Despite the name, this is the overall accuracy of the thresholded predictions
precision = np.count_nonzero(predictions == df_label) / len(predictions)

print(precision)
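The two in-place assignments above implement a custom 0.44 decision threshold on the positive-class probability. A self-contained sketch of the same thresholding on fabricated data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, random_state=0)
clf = RandomForestClassifier(n_estimators=50, oob_score=True, random_state=0).fit(X, y)
labels = (clf.predict_proba(X)[:, 1] > 0.44).astype(int)  # 0.44 mirrors the cutoff above
print((labels == y).mean())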
 
 
 
 
Example No. 25
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=list(data_tree.columns.values))
(graph,) = pydot.graph_from_dot_data(dot_data.getvalue())  # pydot >= 1.2 returns a list of graphs
graph.write_pdf('dectree.pdf')

# Repeat on test set
y_test_pred = clf.predict(X_test)
print("Accuracy Test: {0:.3f}".format(
    metrics.accuracy_score(y_test, y_test_pred)))
print()
print("Classification report:")
print(metrics.classification_report(y_test, y_test_pred))
print()
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_test_pred))

# Measure performance
y_pred = clf.predict_proba(X_train)

# Repeat on test set
y_test_pred = clf.predict_proba(X_test)

tt = g_test.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
pred = tt * y_test_pred

ss = np.sum(pred, axis=1)

sss = ss.mean()

print(sss)
Example No. 26
df['group'] = 0
df.loc[df.logerror < q20, 'group'] = -1
df.loc[df.logerror > q80, 'group'] = 1

# create train and test set
X_train, X_val, y_train, y_val, scaler = create_inputs_model(df.drop(
    'logerror', axis=1),
                                                             test_size=0.25)

# Predict the class
rfc = RandomForestClassifier(n_estimators=500,
                             verbose=2,
                             n_jobs=-1,
                             max_depth=8)
rfc.fit(X_train, y_train)
y_pred_val = rfc.predict_proba(X_val)
fpr, tpr, thresholds = roc_curve(y_val, y_pred_val[:, 1], pos_label=1)
print("AUC on test : {:.02f} %".format(auc(fpr, tpr) * 100))

#most important features
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

feat_names = df.drop('logerror', axis=1).drop('group', axis=1).columns.values
for f in range(X_val.shape[1]):
    print("{}. feature {} - {} :({:.06f})".format(f + 1, indices[f],
                                                  feat_names[indices[f]],
                                                  importances[indices[f]]))

catalog = describe_features(df)
Example No. 27
    def runns(resp_var, size_of_test_data, dataset, positive_class,
              predictor_var, n_estimators, important_features,
              dealing_with_nulls):
        dataset = pd.read_csv('raw_data.csv',
                              low_memory=False)  # For testing purposes
        #----DATA PREPROCESSING
        #-------dealing with NULL values in the data
        #----------remove the rows in which the response is null

        dataset = dataset.dropna(subset=[resp_var])
        #----------dealing with nulls
        dataset = deal_with_nulls(dealing_with_nulls, dataset)
        #----FEATURE SELECTION
        #-------get predictors important in predicting the response
        #-----------transform categorical predictors to dummy variables
        predictors = dataset[predictor_var]
        predictors = pd.get_dummies(predictors)
        #-----------balance the classes in the response var
        ros = RandomOverSampler(random_state=0)
        resp = dataset[resp_var]
        prds, resp = ros.fit_sample(predictors, resp)
        #-----------fit the random forest classifier to give us the important predictors
        rf_clf = RandomForestClassifier(n_estimators=n_estimators)
        rf_clf.fit(prds, resp)
        #-------get the important predictors
        feature_imp = pd.Series(
            rf_clf.feature_importances_,
            index=list(predictors.iloc[:, 0:])).sort_values(ascending=False)
        #-------names of the important predictors
        important_predictor_names = feature_imp.index[0:important_features]
        #-------subset the data to get only the important predictors and the response
        resp = pd.DataFrame(data=resp, columns=[resp_var])
        predictors = pd.DataFrame(prds, columns=list(predictors))
        dataset = pd.concat([resp, predictors], axis=1)
        #---------------------------------------------------------
        #----MODEL TRAINING
        #--------Remove the response variables from the features variables - axis 1 refers to the columns
        m_data = dataset.drop(resp_var, axis=1, inplace=False)
        # Response variables are the values we want to predict
        resp_var = np.array(dataset[resp_var])

        dataset = pd.get_dummies(m_data)

        # Saving feature names for later use
        feature_list = list(m_data.columns)
        # Convert to numpy array
        dataset = np.array(dataset)

        # Split the data into training and testing sets
        train_features, test_features, train_labels, test_labels = train_test_split(
            dataset,
            resp_var,
            test_size=float(size_of_test_data),
            random_state=402)

        # Instantiate model with n_estimators decision trees
        clf = RandomForestClassifier(n_jobs=1,
                                     n_estimators=n_estimators,
                                     random_state=142)

        # Train the model on training data
        clf.fit(train_features, train_labels)
        # evaluation
        predicted = clf.predict(test_features)
        pred_prob = clf.predict_proba(test_features)

        accuracy = accuracy_score(test_labels, predicted)
        #confusion matrix
        cnf = (confusion_matrix(test_labels, predicted))
        #precision score
        precision = precision_score(test_labels,
                                    predicted,
                                    pos_label=positive_class)
        #avg pres
        avg_precision = average_precision_score(test_labels, pred_prob[:, [1]])
        #recall score
        rec = recall_score(test_labels, predicted, pos_label=positive_class)
        #f1 score
        fscore = f1_score(test_labels, predicted, pos_label=positive_class)
        #fbeta score
        fbeta = fbeta_score(test_labels, predicted, beta=0.5)
        #hamming_loss
        hamming = hamming_loss(test_labels, predicted)
        #jaccard similarity score
        jaccard = jaccard_similarity_score(test_labels, predicted)
        #logloss
        logloss = log_loss(test_labels, predicted)
        #zero-oneloss
        zero_one = zero_one_loss(test_labels, predicted)
        #auc roc
        area_under_roc = roc_auc_score(test_labels, pred_prob[:, [1]])
        #cohen_score
        cohen = cohen_kappa_score(test_labels, predicted)
        #matthews corr
        mathews = matthews_corrcoef(test_labels, predicted)
        # Variable importances from the important features selection stage
        variable_importance_list = list(zip(prds, feature_imp))
        output = {
            "accuracy": accuracy,
            "precision": precision,
            "average precision": avg_precision,
            "recall": rec,
            "fscore": fscore,
            "fbeta": fbeta,
            "hamming": hamming,
            "jaccard": jaccard,
            "logloss": logloss,
            "zero_one": zero_one,
            "area_under_roc": area_under_roc,
            "cohen": cohen,
            "mathews": mathews
        }
        output = json.dumps(output)
        return output
Example No. 28
        X_train, X_test, y_train, y_test = year_train_test_split(
            train_for_loo,
            'WnvPresent_DateTrapSpecies',
            year)      

        X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False)
        X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
        y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False)
        y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False)

        
        clf.fit(X_train, y_train)

        # y_pred = clf.predict_proba(X_test) [:, 1] # Random Forest
        y_pred = clf.predict_proba(X_test) # For XGB
        
        score = metrics.roc_auc_score(y_test, y_pred)
        scores.append(score)
        
        #import operator
        #feat_importances = dict(zip(X_train.columns, clf.feature_importances_))
        #sorted_feat_importances = sorted(feat_importances.items(), key=operator.itemgetter(1))
        #print(sorted_feat_importances)
        
        total_pred = np.concatenate((total_pred, y_pred))
        total_test = np.concatenate((total_test, y_test))
        
    print("Global ROC score", metrics.roc_auc_score(total_test, total_pred))
        
    print(scores)
Example No. 29
#                  'eval_metric': 'auc',
#                  'eta': 0.1,
#                  'silent': 1,
#                  'max_delta_step': 1})

# 'Normal' 70 / 30 cross-validation
if do_cross_val == 1:
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X,
        train.WnvPresent,
        test_size=0.3,
        random_state=0)  # sklearn.cross_validation is sklearn.model_selection in 0.18+

    clf.fit(X_train, y_train)

    y_pred = clf.predict_proba(X_test)
    print(metrics.roc_auc_score(y_test, y_pred))

elif do_cross_val == 2:

    # Leave-one-year-out cross-validation
    scores = []
    total_pred = np.array([])
    total_test = np.array([])
    
    for year in [2007, 2009, 2011, 2013]:

        X_train,X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split(
            train_for_loo,
            'WnvPresent_DateTrapSpecies',
            year)      
Example No. 30
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(iris.data, iris.target)
knn.predict(iris.data)

len(iris.target)
sum(iris.target == knn.predict(iris.data))
knn.score(iris.data, iris.target)
help(cross_val_predict)
cross_val_predict(knn, iris.data, iris.target, cv=20)
cross_val_score(knn, iris.data, iris.target, cv=20).mean()


rf = RandomForestClassifier(n_estimators=3)
rf.fit(iris.data, iris.target)
rf.predict_proba(iris.data)
rf.score(iris.data, iris.target)
sum(iris.target == rf.predict(iris.data))
cross_val_score(rf, iris.data, iris.target, cv=20).mean()


from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
'''
https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/data/multilabel.py
'''
mcr = OneVsRestClassifier(LogisticRegression())
mcr.fit(iris.data, iris.target)
mcr.predict(iris.data)
mcr.predict_proba(iris.data)
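cross_val_predict can also return out-of-fold probabilities rather than labels via method='predict_proba'. A self-contained sketch on the same iris data:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

iris = load_iris()
rf = RandomForestClassifier(n_estimators=3, random_state=0)
oof_proba = cross_val_predict(rf, iris.data, iris.target, cv=20, method='predict_proba')
print(oof_proba.shape)  # (150, 3): out-of-fold probabilities, one row per sample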
Example No. 31
    def RF_trainandtest_kfold(self,
                              nsplit,
                              cv,
                              feature_sel,
                              varthreshold,
                              ntrees,
                              nodes,
                              rfmethod,
                              nclusters=10,
                              cmethod=None):

        data_feature = self.data.loc[:, self.data.columns != 'default']  # .ix was removed from pandas; .loc is the replacement
        data_target = self.data['default']

        # Split the dataset into k folds; each fold in turn serves as the test set, with the remaining data as the training set
        kf = KFold(n_splits=nsplit, shuffle=True)
        predresult = pd.DataFrame()
        for train_index, test_index in kf.split(data_feature):
            X_train, X_test = data_feature.iloc[
                train_index, ], data_feature.iloc[test_index, ]
            y_train, y_test = data_target.iloc[
                train_index, ], data_target.iloc[test_index, ]

            # If random sampling leaves only one class in train or test, skip this iteration
            if (len(y_train.unique()) == 1) or (len(y_test.unique()) == 1):
                continue

            # Coarse-bin and WoE-transform the training set, then bin and WoE-transform the test set accordingly
            X_train, X_test = self.binandwoe_traintest(X_train, y_train,
                                                       X_test, nclusters,
                                                       cmethod)

            # Feature selection on the training set, using methods from sklearn.feature_selection
            if feature_sel == "VarianceThreshold":
                selector = VarianceThreshold(threshold=varthreshold)
                X_train1 = pd.DataFrame(selector.fit_transform(X_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "RFECV":
                estimator = LogisticRegression()
                selector = RFECV(estimator, step=1, cv=cv)
                X_train1 = pd.DataFrame(
                    selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "SelectFromModel":
                estimator = LogisticRegression()
                selector = SelectFromModel(estimator)
                X_train1 = pd.DataFrame(
                    selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            elif feature_sel == "SelectKBest":
                selector = SelectKBest()
                X_train1 = pd.DataFrame(
                    selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]
            else:
                X_train1, X_test1 = X_train, X_test

            # Train the chosen ensemble model and predict test-fold probabilities.
            if rfmethod == 'RandomForest':
                classifier = RandomForestClassifier(n_estimators=ntrees,
                                                    min_samples_split=nodes *
                                                    2,
                                                    min_samples_leaf=nodes)
            elif rfmethod == 'ExtraTrees':
                classifier = ExtraTreesClassifier(n_estimators=ntrees,
                                                  min_samples_split=nodes * 2,
                                                  min_samples_leaf=nodes)
            elif rfmethod == 'GradientBoosting':
                classifier = GradientBoostingClassifier(
                    n_estimators=ntrees,
                    min_samples_split=nodes * 2,
                    min_samples_leaf=nodes)

            classifier.fit(X_train1, y_train)
            probability = classifier.predict_proba(X_test1)[:, 1]

            temp = pd.DataFrame({'target': y_test, 'probability': probability})
            predresult = pd.concat([predresult, temp], ignore_index=True)

        return predresult
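
Since predresult pools the held-out probabilities from every fold, it can be scored in one pass. A minimal sketch, assuming the 'target' and 'probability' columns built in the loop above:

# Sketch: pooled AUC over all k-fold held-out predictions.
from sklearn.metrics import roc_auc_score

def score_kfold_result(predresult):
    # Probabilities are for the positive class (predict_proba[:, 1] above).
    return roc_auc_score(predresult['target'], predresult['probability'])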
Ejemplo n.º 32
0
def main():
    st.title('Você sobreviveria ao Titanic?')
    st.write(
        'Modelo de classificação com RandomForest para prever sobrevivência ou morte de passageiros no Titanic'
    )
    st.subheader('Autor')
    st.write('https://www.linkedin.com/in/lucaszonin/')
    st.write('')
    st.subheader('Agradecimentos')
    st.write('Felipe Maia Polo que me deu algumas dicas:')
    st.write('https://www.linkedin.com/in/felipemaiapolo/')
    st.write('')

    titanic_v1 = pd.read_csv('datasets/train.csv')
    del titanic_v1['Cabin']
    del titanic_v1['PassengerId']
    del titanic_v1['Ticket']
    del titanic_v1['SibSp']
    del titanic_v1['Parch']
    titanic_v1['Age'] = titanic_v1['Age'].fillna(np.mean(titanic_v1['Age']))
    titanic_v1['Age'] = titanic_v1['Age'].astype('int64')
    titanic_v1 = titanic_v1.dropna()

    titanic_v1.loc[titanic_v1['Sex'] == 'male', 'Sex'] = 0
    titanic_v1.loc[titanic_v1['Sex'] == 'female', 'Sex'] = 1
    titanic_v1['Sex'] = titanic_v1['Sex'].astype(int)

    titanic_v1.loc[titanic_v1['Embarked'] == 'C', 'Embarked'] = 0
    titanic_v1.loc[titanic_v1['Embarked'] == 'Q', 'Embarked'] = 1
    titanic_v1.loc[titanic_v1['Embarked'] == 'S', 'Embarked'] = 2
    titanic_v1['Embarked'] = titanic_v1['Embarked'].astype(int)

    # Passenger sex input
    sexo = st.radio(label='Sexo do passageiro',
                    options=('Feminino', 'Masculino'))

    # Passenger age input
    idade_passenger = st.slider(label='Idade do passageiro',
                                min_value=1,
                                max_value=max(titanic_v1['Age']))

    # Port of embarkation input
    embarked = st.radio(label='Cidade onde embarcou',
                        options=('Cherbourg', 'Queenstown', 'Southampton'))

    # Ticket fare input
    valor_pago = st.slider(label='Valor pago pela passagem',
                           min_value=1,
                           max_value=600)

    # Passenger class input
    classe = st.radio(label='Classe do passageiro',
                      options=('Primeira', 'Segunda', 'Terceira'))

    if sexo == 'Feminino':
        sexo_modelo = 1
    else:
        sexo_modelo = 0

    if embarked == 'Cherbourg':
        embarked_modelo = 0
    elif embarked == 'Queenstown':
        embarked_modelo = 1
    elif embarked == 'Southampton':
        embarked_modelo = 2

    if classe == 'Primeira':
        classe_modelo = 1
    elif classe == 'Segunda':
        classe_modelo = 2
    elif classe == 'Terceira':
        classe_modelo = 3

    titanic_modelo = titanic_v1

    y = titanic_modelo['Survived']
    x = titanic_modelo[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=30)
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    # accuracy_score expects class labels, so score with predict(), not predict_proba():
    #st.write(accuracy_score(y_test, model.predict(X_test)))

    if st.button(label="Prever"):

        st.title('Dados do passageiro:')
        st.write('Sexo :', sexo)
        st.write('Idade :', idade_passenger)
        st.write('Cidade onde embarcou :', embarked)
        st.write('Valor da passagem : US$', valor_pago)
        st.write('Classe da passagem :', classe)

        x_input = pd.DataFrame(
            {
                'Pclass': classe_modelo,
                'Sex': sexo_modelo,
                'Age': idade_passenger,
                'Fare': valor_pago,
                'Embarked': embarked_modelo
            },
            index=[0])
        # Reuse the model already fitted above rather than training an identical one.
        pred = model.predict_proba(x_input)

        st.title('Previsão:')
        st.write('')
        # Streamlit "magic" renders bare expressions; st.write makes the intent explicit.
        st.write('Probabilidade de morrer:', pred[0, 0] * 100)
        st.write('Probabilidade de sobreviver:', pred[0, 1] * 100)
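
One design note on the app above: Streamlit reruns the whole script on every widget interaction, so the forest is refitted on each click. A hedged sketch of one way around that, assuming a Streamlit version that provides st.cache_resource (1.18 or later):

# Sketch: cache the fitted model across Streamlit reruns.
import streamlit as st
from sklearn.ensemble import RandomForestClassifier

@st.cache_resource
def get_model(X_train, y_train):
    # Fitted once per distinct training set, then served from the cache.
    model = RandomForestClassifier(random_state=30)
    return model.fit(X_train, y_train)

# model = get_model(X_train, y_train)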
Ejemplo n.º 33
0

    # Train and test random forests.
    # load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin"
    load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.bin"
    homesite = Data()
    homesite.load_sliptted_data(load_path)
    del homesite.test_x  # Deleted to save memory.

    clf_ann = NeuralNetwork(path="../homesite_data/ann_weights.bin",
                            lr=0.00005, lamb=0)
    train_output_ann = clf_ann.get_hidden_output(homesite.train_x)
    validation_output_ann = clf_ann.get_hidden_output(homesite.validation_x)
    train_output_ann = np.hstack((train_output_ann, homesite.train_x))
    validation_output_ann = np.hstack((validation_output_ann, homesite.validation_x))

    for c in range(2, 10):
        # Train classifier.
        print("Training classifier.")
        clf = RandomForestClassifier(n_estimators=1 + 100 * c, n_jobs=4)
        clf.fit(train_output_ann, homesite.train_y)

        # Test classifier.
        print("Testing classifier.")
        predicted_labels = clf.predict_proba(validation_output_ann)[:, 1]

        # Show final results.
        results = confusion_matrix(homesite.validation_y, np.round(predicted_labels))
        accuracy, precision, recall = compute_performance_metrics(results)
        auc = compute_auc(homesite.validation_y, predicted_labels)
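
compute_performance_metrics and compute_auc are project helpers that are not shown in this excerpt. A hedged sketch of what they plausibly compute, assuming a 2x2 confusion matrix in sklearn's [[TN, FP], [FN, TP]] layout; the names and signatures are guesses, not the project's actual code:

# Hypothetical reconstructions of the unshown helper functions.
from sklearn.metrics import roc_auc_score

def compute_performance_metrics(cm):
    tn, fp, fn, tp = cm.ravel()        # sklearn confusion_matrix order
    accuracy = (tp + tn) / cm.sum()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return accuracy, precision, recall

def compute_auc(y_true, y_score):
    # Expects positive-class scores, matching predict_proba(...)[:, 1] above.
    return roc_auc_score(y_true, y_score)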
Ejemplo n.º 34
0
    def RF_trainandtest(self,
                        testsize,
                        cv,
                        feature_sel,
                        varthreshold,
                        ntrees,
                        nodes,
                        rfmethod,
                        nclusters=10,
                        cmethod=None):

        # Split the dataset into a training set and a test set.
        data_feature = self.data.loc[:, self.data.columns != 'default']
        data_target = self.data['default']
        X_train, X_test, y_train, y_test = train_test_split(data_feature,
                                                            data_target,
                                                            test_size=testsize,
                                                            random_state=0)

        # Coarse-bin the training features and apply the WOE transformation,
        # then apply the same binning and WOE mapping to the test set.
        X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test,
                                                   nclusters, cmethod)

        # Feature selection on the training set, using methods from
        # sklearn.feature_selection.
        if feature_sel == "VarianceThreshold":
            selector = VarianceThreshold(threshold=varthreshold)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "RFECV":
            estimator = LogisticRegression()
            selector = RFECV(estimator, step=1, cv=cv)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "SelectFromModel":
            estimator = LogisticRegression()
            selector = SelectFromModel(estimator)
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        elif feature_sel == "SelectKBest":
            selector = SelectKBest()
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]
        else:
            X_train1, X_test1 = X_train, X_test

        # Train the chosen ensemble model and predict test-set probabilities.
        if rfmethod == 'RandomForest':
            classifier = RandomForestClassifier(n_estimators=ntrees,
                                                min_samples_split=nodes * 2,
                                                min_samples_leaf=nodes)
        elif rfmethod == 'ExtraTrees':
            classifier = ExtraTreesClassifier(n_estimators=ntrees,
                                              min_samples_split=nodes * 2,
                                              min_samples_leaf=nodes)
        elif rfmethod == 'GradientBoosting':
            classifier = GradientBoostingClassifier(n_estimators=ntrees,
                                                    min_samples_split=nodes *
                                                    2,
                                                    min_samples_leaf=nodes)

        classifier.fit(X_train1, y_train)
        probability = classifier.predict_proba(X_test1)[:, 1]

        predresult = pd.DataFrame({
            'target': y_test,
            'probability': probability
        })

        return predresult
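
With a 'default' target this reads like a credit-scoring pipeline, where the Kolmogorov-Smirnov statistic is a common companion to AUC. A minimal sketch computing it from the returned predresult, using the standard two-sample formulation rather than anything from the class above:

# Sketch: KS statistic = maximum gap between the cumulative probability
# distributions of the two target classes.
from scipy.stats import ks_2samp

def ks_statistic(predresult):
    pos = predresult.loc[predresult['target'] == 1, 'probability']
    neg = predresult.loc[predresult['target'] == 0, 'probability']
    return ks_2samp(pos, neg).statistic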
Ejemplo n.º 35
0
#                                 normalize=True)

#clf = XgbWrapper({'objective': 'binary:logistic',
#                  'eval_metric': 'auc',
#                  'eta': 0.1,
#                  'silent': 0,
#                  'max_delta_step': 1})

# 'Normal' 70 / 30 cross-validation
if do_cross_val == 1:
    # Note: sklearn.cross_validation was removed in scikit-learn 0.20;
    # current versions provide train_test_split in sklearn.model_selection.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, train.WnvPresent, test_size=0.3, random_state=0)

    clf.fit(X_train, y_train)

    y_pred = clf.predict_proba(X_test)[:, 1]
    print(metrics.roc_auc_score(y_test, y_pred))

elif do_cross_val == 2:

    # Leave-one-year-out cross-validation
    scores = []
    for year in [2007, 2009, 2011, 2013]:

        X_train, X_test, y_train, y_test = year_train_test_split(
            train_for_loo, 'WnvPresent', year)

        X_train.to_csv("data_per_year/" + str(year) + "X_train.csv",
                       index=False)
        X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
        y_train.to_csv("data_per_year/" + str(year) + "y_train.csv",