コード例 #1
0
    def test_RandomForest(self):
        """Smoke test: fit a 10-tree forest on two samples, then request class probabilities."""
        features = [[0, 1], [1, 1]]
        labels = [0, 1]

        # fit() hands back the trained estimator, so construction and
        # training are chained into one expression.
        classifier = RandomForestClassifier(n_estimators=10).fit(features, labels)
        classifier.predict_proba(features)
コード例 #2
0
class RandomForestClassifierImpl():
    """Thin wrapper that stores hyperparameters and builds the wrapped model lazily.

    The constructor only records its arguments; the underlying ``SKLModel``
    is instantiated from them each time :meth:`fit` is called.
    """

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
        """Record every hyperparameter under its own name in ``self._hyperparams``."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
        )

    def fit(self, X, y=None):
        """Create a fresh wrapped model, train it, and return ``self``.

        ``y`` is forwarded to the wrapped model only when it is not None.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        fit_args = (X,) if y is None else (X, y)
        model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._sklearn_model
        return model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class-probability estimates for ``X``."""
        model = self._sklearn_model
        return model.predict_proba(X)
コード例 #3
0
class MyRfClassifier(BaseClassifier):
    """Random-forest classifier wrapper named after its hyperparameters."""

    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        """Build the forest (verbose, 40 parallel jobs) and derive its name."""
        forest_kwargs = dict(
            verbose=1,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=40,
        )
        self.classifier = RandomForestClassifier(**forest_kwargs)
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the name built in the constructor."""
        return self.name

    def fit(self, X, y, X_t, y_t):
        """Fit the forest on ``X``/``y``; ``X_t`` and ``y_t`` are accepted but not used."""
        forest = self.classifier
        return forest.fit(X, y)

    def predict_proba(self, X):
        """Return the forest's class-probability estimates for ``X``."""
        forest = self.classifier
        return forest.predict_proba(X)

    def get_feature_importances(self, feat_names):
        """Map each name in ``feat_names`` to the importance at the same position."""
        importances = self.classifier.feature_importances_
        return {name: weight for name, weight in zip(feat_names, importances)}
コード例 #4
0
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier

    Every method that hands feature data to the underlying model first runs
    it through the one-hot encoding helpers inherited from ``BaseModel``.
    """
    def __init__(self,
                 categorical_features=None,
                 n_estimators=50,
                 n_jobs=-1,
                 max_depth=10):
        '''
        Args:
            categorical_features: Forwarded unchanged to ``BaseModel.__init__``.
            n_estimators: Number of trees in the forest.
            n_jobs: Number of parallel jobs given to the forest.
            max_depth: Maximum depth of each tree.

        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        '''
        Encode the training features and fit the underlying forest on them.
        '''
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction

        The confidence is the largest class probability of each row.
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the given dataset using
        the response variable y. It is a measure of the accuracy of the trained model. Usually
        the original dataset should be split into training and testing subsets to cross validate
        the model.
        '''
        # The forest was trained on encoded features (see fit), so the data
        # being scored must go through the same encoding as in predict;
        # scoring the raw X would present a different feature layout.
        X = self.hot_encode_predict(X)
        return self.model.score(X, y)
コード例 #5
0
def train_model(X_train, y_train):
    """Fit a random forest on the training data and print its training AUC.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels; label ``1`` is treated as the positive class.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("training the model ...")
    rf = RandomForestClassifier(n_estimators=1000,
                                max_depth=8,
                                n_jobs=-1,
                                verbose=1)

    rf.fit(X_train, y_train)
    y_pred_train = rf.predict_proba(X_train)

    # predict_proba columns follow rf.classes_. The ROC curve is computed
    # with pos_label=1, so the scores must be the probabilities of class 1;
    # using column 0 for 0/1 labels would yield 1 - AUC. Fall back to the
    # first column when label 1 is absent, as the original always did.
    classes = list(rf.classes_)
    pos_col = classes.index(1) if 1 in classes else 0

    fpr, tpr, _ = roc_curve(y_train, y_pred_train[:, pos_col], pos_label=1)
    print("AUC on train : {:.02f} %".format(auc(fpr, tpr) * 100))

    return rf
コード例 #6
0
ファイル: tree.py プロジェクト: hongbin0908/pytrade
class MyRfClassifier(BaseClassifier):
    """Wraps a random forest and labels it with its main hyperparameters."""

    def __init__(self, n_estimators, max_depth, min_samples_leaf):
        """Create the verbose, 40-job forest and compose the descriptive name."""
        self.classifier = RandomForestClassifier(
            verbose=1,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=40,
        )
        name_parts = {"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        self.name = "rf_n{n}_md{md}_ms{ms}".format(**name_parts)

    def get_name(self):
        """Return the descriptive name composed at construction time."""
        return self.name

    def fit(self, X, y, X_t, y_t):
        """Train on ``X``/``y`` only; the ``X_t``/``y_t`` arguments are ignored."""
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        """Delegate probability estimation to the wrapped forest."""
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        """Pair each feature name with the forest's importance at the same index."""
        pairs = zip(feat_names, self.classifier.feature_importances_)
        return dict(pairs)
コード例 #7
0
class MyRandomForestClassifier(BaseClassifier):
    """Random-forest wrapper with configurable defaults and a descriptive name."""

    def __init__(self, verbose=1, n_estimators = 2000, max_depth=8, min_samples_leaf=10000,
                 n_jobs=25):
        """Instantiate the forest from the arguments and compose its name."""
        self.classifier = RandomForestClassifier(
            verbose=verbose,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
        )
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the name composed in the constructor."""
        return self.name

    def fit(self, X, y):
        """Fit the wrapped forest and return whatever its ``fit`` returns."""
        forest = self.classifier
        return forest.fit(X, y)

    def predict_proba(self, X):
        """Return class-probability estimates from the wrapped forest."""
        forest = self.classifier
        return forest.predict_proba(X)

    def get_feature_importances(self, feat_names):
        """Return the forest's raw importances; ``feat_names`` is not consulted."""
        importances = self.classifier.feature_importances_
        return importances
コード例 #8
0
ファイル: s200_predict_bin.py プロジェクト: enima2684/zillow
def tune_model(X, y, K=5):
    """Grid-search random-forest hyperparameters with stratified K-fold CV.

    Every combination of the grid below is scored by its mean log-loss over
    ``K`` stratified folds. Progress is printed after each combination and
    the best combination is written to ``data/s2_meta/best_tuning_rf.json``.

    Args:
        X: Feature matrix; must support indexing with an index array.
        y: Label vector; must support indexing with an index array.
        K: Number of stratified folds.

    Returns:
        A list of result dicts (one per combination), sorted by ascending
        ``cv_logloss``.
    """
    import itertools  # local import: only this function needs it

    print("tuning the model ...")

    # Winner of a previous run:
    #   max_features='sqrt', n_estimators=2000, min_samples_leaf=1
    params = {
        'max_features': ['auto', 'sqrt', 0.2, 0.4],
        'n_estimators': [10, 50, 100, 500, 1000, 2000],
        'min_samples_leaf': [0.01, 0.02, 0.05, 0.1, 0.15, 0.2],
        'max_depth': [None, 3, 5, 7, 8, 9, 10]
    }

    # np.product was removed in NumPy 2.0; np.prod is the supported spelling.
    nb_scenarios = np.prod([len(values) for values in params.values()])

    # The splitter does not shuffle, so one instance yields the same folds
    # for every combination and can be created once.
    kf = StratifiedKFold(n_splits=K)

    # Same iteration order as four nested loops: max_depth varies fastest.
    grid = itertools.product(params['max_features'],
                             params['n_estimators'],
                             params['min_samples_leaf'],
                             params['max_depth'])

    results = []
    for max_f, n_est, min_leaf, max_dep in grid:
        errors_fold = []
        for train_index, test_index in kf.split(X, y):
            X_train_bis, X_test = X[train_index], X[test_index]
            y_train_bis, y_test = y[train_index], y[test_index]

            rf = RandomForestClassifier(max_features=max_f,
                                        n_estimators=n_est,
                                        min_samples_leaf=min_leaf,
                                        max_depth=max_dep,
                                        n_jobs=-1,
                                        class_weight='balanced')
            rf.fit(X_train_bis, y_train_bis)
            y_pred_test = rf.predict_proba(X_test)
            errors_fold.append(log_loss(y_test, y_pred_test))

        result = {
            'max_features': max_f,
            'n_estimators': n_est,
            'min_samples_leaf': min_leaf,
            'max_depth': max_dep,
            'cv_logloss': np.mean(errors_fold)
        }

        results.append(result)
        print("=" * 10 +
              " {}/{} ".format(len(results), nb_scenarios) +
              "=" * 10)
        for key, value in result.items():
            print("{} : {}".format(key, value))

    results = sorted(results, key=lambda x: x['cv_logloss'])
    best_result = results[0]

    with open('data/s2_meta/best_tuning_rf.json', 'w') as fp:
        json.dump(best_result, fp, indent=4)

    return results
コード例 #9
0
	def runns(resp_var, size_of_test_data,dataset,positive_class,predictor_var, n_estimators,important_features,dealing_with_nulls):
		"""Select the most important predictors, train a random forest and score it.

		Steps: drop rows with a null response, handle remaining nulls, one-hot
		encode the predictors, oversample to balance the classes, rank the
		predictors with a first forest, keep the top ``important_features``,
		then train/test a second forest on those and compute the metrics.

		Args:
			resp_var: Name of the response column.
			size_of_test_data: Test fraction (converted with ``float``).
			dataset: Input DataFrame. When None, 'raw_data.csv' is loaded
				instead (kept as a testing convenience).
			positive_class: Label treated as positive by the binary metrics.
			predictor_var: Predictor column name(s) used to index ``dataset``.
			n_estimators: Number of trees for both forests.
			important_features: How many top-ranked predictors to keep.
			dealing_with_nulls: Strategy forwarded to ``deal_with_nulls``.

		Returns:
			A JSON string mapping metric names to values.
		"""
		# Only fall back to the bundled CSV when no data was supplied; the
		# original unconditionally overwrote the caller's dataset.
		if dataset is None:
			dataset = pd.read_csv('raw_data.csv', low_memory=False)

		#----DATA PREPROCESSING
		# Rows without a response cannot be used for training or scoring.
		dataset = dataset.dropna(subset=[resp_var])
		dataset = deal_with_nulls(dealing_with_nulls, dataset)

		#----FEATURE SELECTION
		# Categorical predictors become dummy variables.
		predictors = pd.get_dummies(dataset[predictor_var])
		feature_names = list(predictors.columns)

		# Balance the classes in the response. Newer imbalanced-learn releases
		# name the method fit_resample; older ones only have fit_sample.
		ros = RandomOverSampler(random_state=0)
		resample = getattr(ros, 'fit_resample', None) or ros.fit_sample
		prds, resp = resample(predictors, dataset[resp_var])

		# A first forest ranks the predictors by importance.
		rf_clf = RandomForestClassifier(n_estimators=n_estimators)
		rf_clf.fit(prds, resp)
		feature_imp = pd.Series(rf_clf.feature_importances_,
						index=feature_names).sort_values(ascending=False)
		important_predictor_names = feature_imp.index[0:important_features]

		# Keep only the top-ranked predictors; previously the ranking was
		# computed but never applied, so important_features had no effect.
		predictors = pd.DataFrame(prds, columns=feature_names)[important_predictor_names]

		#----MODEL TRAINING
		features = np.array(pd.get_dummies(predictors))
		labels = np.array(resp)

		train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = float(size_of_test_data), random_state = 402)

		clf = RandomForestClassifier(n_jobs = 1,n_estimators = n_estimators, random_state = 142)
		clf.fit(train_features, train_labels)

		#----EVALUATION
		predicted = clf.predict(test_features)
		pred_prob = clf.predict_proba(test_features)
		# predict_proba columns follow clf.classes_; find the positive one.
		pos_col = list(clf.classes_).index(positive_class)

		accuracy = accuracy_score(test_labels, predicted)
		precision = precision_score(test_labels,predicted,pos_label=positive_class)
		# Score with the positive-class probability, consistent with the
		# pos_label used by precision/recall/f1.
		avg_precision = average_precision_score(test_labels,pred_prob[:,pos_col],pos_label=positive_class)
		rec = recall_score(test_labels,predicted,pos_label=positive_class)
		fscore = f1_score(test_labels,predicted,pos_label=positive_class)
		fbeta = fbeta_score(test_labels,predicted,beta=0.5,pos_label=positive_class)
		hamming = hamming_loss(test_labels,predicted)
		# NOTE(review): jaccard_similarity_score is not available in recent
		# scikit-learn releases - confirm the pinned version before upgrading.
		jaccard = jaccard_similarity_score(test_labels,predicted)
		# Log-loss is defined on probabilities, not on hard label predictions.
		logloss = log_loss(test_labels,pred_prob,labels=clf.classes_)
		zero_one = zero_one_loss(test_labels,predicted)
		# roc_auc_score treats the greater label as positive, i.e. column 1.
		area_under_roc = roc_auc_score(test_labels,pred_prob[:,1])
		cohen = cohen_kappa_score(test_labels,predicted)
		mathews = matthews_corrcoef(test_labels,predicted)

		output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
		output=json.dumps(output)
		return output
コード例 #10
0
# Visualize tree
# Python 2 script section (print statements, StringIO module).
# Export the fitted estimator `clf` as Graphviz DOT text into an in-memory
# buffer, parse that text with pydot and write the result to 'dectree.pdf'.
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=list(data_tree.columns.values))
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf('dectree.pdf')


# Repeat on test set
# Hard label predictions for the test set, followed by accuracy, the
# classification report and the confusion matrix printed to stdout.
y_test_pred = clf.predict(X_test)
print "Accuracy Test: {0:.3f}".format(metrics.accuracy_score(y_test, y_test_pred))
print
print "Classification report:"
print metrics.classification_report(y_test, y_test_pred)
print 
print "Confusion matrix:"
print metrics.confusion_matrix(y_test, y_test_pred)

# Measure performance
# Class probabilities on the training set; `y_pred` is not read again in
# the visible part of this script.
y_pred = clf.predict_proba(X_train)

# Repeat on test set
# From here on `y_test_pred` holds class probabilities, replacing the hard
# labels assigned above.
y_test_pred = clf.predict_proba(X_test)

# Element-wise product of g_test's values with the predicted probabilities.
# presumably g_test has one column per class, aligned with predict_proba's
# column order - TODO confirm against where g_test is built.
tt = g_test.as_matrix()
pred = tt* y_test_pred

# Sum the weighted probabilities across columns for each row ...
ss = np.sum(pred, axis=1)

# ... and average those per-row sums over all rows.
sss = ss.mean()

print sss
コード例 #11
0
#:# model

# Hyperparameters of the audited random forest.
params = {'max_depth': 3, 'n_estimators': 75}

# NOTE(review): the classifier is fitted on X_train as-is, while X_test is
# passed through transform_pipeline below - presumably X_train was already
# transformed earlier in the script; confirm.
classifier = RandomForestClassifier(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# 5475503c9e4b64dc0dcc4960399cf72c
# MD5 of the classifier's string representation, printed for comparison
# with the hash recorded in the marker line above.
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
# Hard predictions and the second probability column for the transformed
# test set (the transform is applied separately for each call).
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

# Unpacking four values from the flattened confusion matrix only works
# when the matrix is 2x2, i.e. for a binary target.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
print(f'precision: {precision_score(y_test, y_pred)}')
print(f'recall: {recall_score(y_test, y_pred)}')
# Specificity is derived by hand from the confusion-matrix counts.
print(f'specificity: {tn/(tn+fp)}')
print(f'f1: {f1_score(y_test, y_pred)}')

#:# session info

# Add the Python version to the session info

sessionInfo = {
コード例 #12
0
    def RF_trainandtest(self, unionscores, cutscore, testsize, cv, feature_sel, varthreshold, ntrees, nodes, rfmethod, nclusters, cmethod, resmethod):
        """Train a tree-ensemble classifier on ``self.data`` and predict on a hold-out set.

        Args:
            unionscores: When equal to True, the component score columns are
                kept as features; otherwise they are dropped as well.
            cutscore: Rows with ``rsk_score`` below this value form the positive class.
            testsize: ``test_size`` passed to ``train_test_split``.
            cv: Cross-validation setting for the ``RFECV`` selector.
            feature_sel: One of "VarianceThreshold", "RFECV", "SelectFromModel",
                "SelectKBest"; any other value disables feature selection.
            varthreshold: Threshold for ``VarianceThreshold``.
            ntrees: Number of estimators.
            nodes: ``min_samples_leaf``; ``min_samples_split`` is twice this.
            rfmethod: 'RandomForest', 'ExtraTrees' or 'GradientBoosting'.
            nclusters, cmethod: Forwarded to ``self.binandwoe_traintest_pkl``.
            resmethod: Forwarded to ``self.imbalanceddata``.

        Returns:
            A DataFrame with the test targets, the predicted probability of
            the second class, and the transformed test features.

        Raises:
            ValueError: If ``rfmethod`` is not one of the supported names.
        """
        # Fail fast: an unknown method previously surfaced only after all the
        # preprocessing, as an unbound-local error on `classifier`.
        if rfmethod not in ('RandomForest', 'ExtraTrees', 'GradientBoosting'):
            raise ValueError("unsupported rfmethod: {!r}".format(rfmethod))

        # Split the data set into training and test sets.
        if unionscores == True:
            data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'rsk_score'], axis = 1)
        else:
            data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'cst_score',
                                           'cnp_score', 'cnt_score', 'chv_score', 'dsi_score','rsk_score'], axis = 1)
        data_target = (self.data['rsk_score'] < cutscore).astype('int')
        X_train, X_test, y_train, y_test = train_test_split(data_feature, data_target, test_size=testsize, random_state=0)
        if testsize == 0:
            # With no hold-out data, reuse the first training rows so the
            # prediction step below still has something to work on.
            X_test, y_test = X_train.head(5), y_train.head(5)

        # Coarse-bin and WOE-transform the training set, then apply the same
        # binning and transformation to the test set.
        X_train, X_test = self.binandwoe_traintest_pkl(X_train, y_train, X_test, nclusters, cmethod, self.label)

        # Feature selection on the training set (sklearn.feature_selection).
        if feature_sel == "VarianceThreshold":
            selector = VarianceThreshold(threshold = varthreshold)
        elif feature_sel == "RFECV":
            selector = RFECV(LogisticRegression(), step=1, cv=cv)
        elif feature_sel == "SelectFromModel":
            selector = SelectFromModel(LogisticRegression())
        elif feature_sel == "SelectKBest":
            selector = SelectKBest()
        else:
            selector = None

        if selector is None:
            X_train1, X_test1 = X_train, X_test
        else:
            # Passing y_train is harmless for VarianceThreshold, which
            # ignores the target, so all selectors share one code path.
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]

        testcolumns = X_test1.columns

        # Resample the training data to address class imbalance.
        X_train1, y_train = self.imbalanceddata (X_train1, y_train, resmethod)

        # Train the chosen ensemble and predict on the test set.
        if rfmethod == 'RandomForest':
            classifier = RandomForestClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)
        elif rfmethod == 'ExtraTrees':
            classifier = ExtraTreesClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)
        else:  # 'GradientBoosting' (validated at the top)
            classifier = GradientBoostingClassifier(n_estimators=ntrees,min_samples_split=nodes*2, min_samples_leaf=nodes)

        classifier.fit(X_train1, y_train)
        probability = classifier.predict_proba(X_test1)

        predresult = pd.DataFrame({'target' : y_test, 'probability' : probability[:,1]})
        predresult = pd.concat([predresult, X_test], axis = 1)

        # label is None during model development; a label means the model
        # and its selected columns are persisted for production use.
        if self.label is not None:
            joblib.dump(classifier, "allinpay projects\\creditscore_TLSW_fyz\\pkl\\classifier_" + self.label + '.pkl')
            joblib.dump(testcolumns, "allinpay projects\\creditscore_TLSW_fyz\\pkl\\testcolumns_" + self.label + '.pkl')

        return predresult
コード例 #13
0
                    feature_labels = dataset_features[ds_label][fe_label][
                        'feature_labels']
                    X = np.copy(
                        dataset_features[ds_label][fe_label]['features'])
                    X = np.nan_to_num(X)
                    feat_train = X[train_index, :]
                    feat_test = X[test_index, :]

                # feature normalization & model training
                feat_train = scaler.fit_transform(feat_train)
                classifier.fit(feat_train, target_train)

                feature_importance[(d, f, i)] = classifier.feature_importances_
                # feature scaling & prediction
                feat_test = scaler.transform(feat_test)
                y_pred_all[(d, f, i)] = classifier.predict_proba(feat_test)
                y_true_all[(d, f, i)] = y[test_index]
                all_file_id = np.array([
                    dataset.file_label_to_id[_]
                    for _ in dataset.metadata['fn_wav'].tolist()
                ])
                file_id_all[(d, f, i)] = all_file_id[test_index]

    with open(os.path.join(dir_results, 'feature_importance.pckl'),
              'wb+') as f:
        pickle.dump(feature_importance, f)

    f1_scores = np.zeros((num_datasets, num_extractors, num_folds))
    f1_scores_file = np.zeros((num_datasets, num_extractors, num_folds))

    # iterate over datasets
コード例 #14
0
def _build_curve_graph(graph_name, x_title, y_title, line_name_format,
                       metric_name, x_series, y_series, metric_values):
    """Assemble one report graph: a titled set of per-class curves."""
    lines = []
    for i, (x_axis, y_axis, value) in enumerate(zip(x_series, y_series, metric_values)):
        lines.append({
            'name': line_name_format.format(i),
            'relative': [{'name': metric_name, 'value': value}],
            'x_axis': x_axis,
            'y_axis': y_axis,
        })
    return {
        'name': graph_name,
        'x_title': x_title,
        'y_title': y_title,
        'lines': lines,
    }


def _build_performance_report(clf, x, y_true, y_pred_prob):
    """Compute ROC/PR curves, confusion matrix and top-k accuracy as a JSON-ready dict."""
    y_pred_label = clf.predict(x)
    cfmt = confusion_matrix(y_true, y_pred_label).tolist()
    top1_acc = top_k(y_true, y_pred_prob, clf.classes_, k=1)
    top5_acc = top_k(y_true, y_pred_prob, clf.classes_, k=5)

    fprs, tprs, aucs = [], [], []
    recalls, precisions, aps = [], [], []
    # One-vs-rest curves: each class in turn is the positive class, scored
    # with its own probability column.
    for c, class_label in enumerate(clf.classes_):
        y_true_binary = y_true == class_label
        y_score = y_pred_prob[:, c]
        fpr, tpr, _ = roc_curve(y_true_binary, y_score, pos_label=1)
        precision, recall, _ = precision_recall_curve(y_true_binary, y_score)
        fprs.append(fpr.tolist())
        tprs.append(tpr.tolist())
        aucs.append(roc_auc_score(y_true_binary, y_score))
        recalls.append(recall.tolist())
        precisions.append(precision.tolist())
        aps.append(average_precision_score(y_true_binary, y_score))

    class_names = clf.classes_.tolist()
    report = {
        'graphs': [
            # ROC curves
            _build_curve_graph('ROC曲线', 'fpr', 'tpr', 'label为{}的ROC曲线',
                               'auc', fprs, tprs, aucs),
            # PR curves ('recall' must be a plain string; a stray trailing
            # comma used to turn it into a one-element tuple)
            _build_curve_graph('PR曲线', 'recall', 'precision', 'label为{}的PR曲线',
                               'ap', recalls, precisions, aps),
        ],
        # Confusion matrix
        'matrixs': [{
            'name': '混淆矩阵',
            'col_name': class_names,
            'row_name': list(class_names),
            'elements': cfmt,
        }],
        # Scalar metrics
        'evaluation': [{'name': "top1", 'value': top1_acc}],
    }
    if top5_acc:
        report['evaluation'].append({'name': 'top5', 'value': top5_acc})
    return report


def _write_performance_report(report):
    """Serialise the performance report to 'pfmn.json'."""
    with open('pfmn.json', 'w') as f:
        f.write(json.dumps(report))


def main():
    """Train or apply a random forest on 'iris.csv', driven by the global ``args``.

    ``args.operMode`` selects the mode:

    * 'TRAINING': split the data, fit the forest, pickle it to 'rf.pkl' and
      write the validation report to 'pfmn.json'.
    * 'PREDICTION': load the pickled model from ``args.load_model`` and
      predict; when ``args.has_label`` is set, also write the report.

    Raises:
        Exception: If ``args.operMode`` is neither of the two modes.
    """
    operMode = args.operMode
    logging.info('Random fortest work on operMode: {}'.format(operMode))

    input_in1_file = 'iris.csv'
    df = pd.read_csv(input_in1_file)
    if operMode == 'TRAINING':
        label_name = args.label_name
        n_estimators = args.n_estimators
        shuffle = args.shuffle
        split_ratio = args.split_ratio
        criterion = args.criterion
        max_features = args.max_features
        max_depth = args.max_depth
        min_samples_split = args.min_samples_split
        min_samples_leaf = args.min_samples_leaf
        min_weight_fraction_leaf = args.min_weight_fraction_leaf
        min_impurity_decrease = args.min_impurity_decrease
        bootstrap = args.bootstrap
        n_jobs = args.n_jobs

        logging.info('model parameter as follow:\n'
                     'label_name: {}\n'
                     'n_estimators: {}\n'
                     'split_ratio: {}\n'
                     'shuffle: {}\n'
                     'criterion: {}\n'
                     'max_featrues: {}\n'
                     'max_depth: {}\n'
                     'min_samples_split: {}\n'
                     'min_samples_leaf: {}\n'
                     'min_weight_fraction_leaf: {}\n'
                     'min_impurity_decrease: {}\n'
                     'bootstrap: {}\n'
                     'n_jobs: {}'.format(label_name, n_estimators, split_ratio, shuffle, criterion,
                                         max_features, max_depth, min_samples_split,
                                         min_samples_leaf, min_weight_fraction_leaf,
                                         min_impurity_decrease, bootstrap, n_jobs))

        tra_df, val_df = train_val_split(df, ratio=split_ratio, shuffle=shuffle)
        columns = df.columns.tolist()
        tra_y = tra_df[label_name].values
        val_y = val_df[label_name].values
        columns.remove(label_name)
        tra_x = tra_df[columns].values
        val_x = val_df[columns].values

        logging.info("Random Fortest Training Start...")
        try:
            clf = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                         min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                         min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features,
                                         min_impurity_decrease=min_impurity_decrease,
                                         bootstrap=bootstrap, n_jobs=n_jobs).fit(tra_x, tra_y)
        except Exception as e:
            logging.error("Unexpected Error {}".format(e))
            exit(0)

        logging.info("Random Fortest Training End and Stroe Model...")
        with open("rf.pkl", "wb") as f:
            pickle.dump(clf, f)

        val_y_pred_prob = clf.predict_proba(val_x)
        pfmn_dict = _build_performance_report(clf, val_x, val_y, val_y_pred_prob)
        _write_performance_report(pfmn_dict)
        logging.info('Random Fortest Model Evaluation finished!')
    elif operMode == 'PREDICTION':
        has_label = args.has_label
        label_name = args.label_name
        load_model = args.load_model

        logging.info('model parameter configure as follow:\n'
                     'has_label: {}\n'
                     'label_name: {}\n'
                     'load_model: {}\n'.format(has_label, label_name, load_model))
        if has_label and label_name is None:
            logging.error('if parameter has_label is true, label_name must not be none')
            exit(0)
        if has_label:
            columns = df.columns.tolist()
            test_y = df[label_name].values
            columns.remove(label_name)
            test_x = df[columns].values
        else:
            test_x = df.values

        logging.info("Random Fortest Load Model ")
        model_path = load_model
        if not os.path.exists(model_path):
            message = 'model file {} will be loaded not exists!'.format(model_path)
            logging.error('Unexpected Error {}'.format(message))
            exit(0)
        # SECURITY: unpickling executes arbitrary code embedded in the file;
        # only load model files from a trusted location.
        with open(model_path, 'rb') as f:
            clf = pickle.load(f)
        test_y_pred_prob = clf.predict_proba(test_x)
        if has_label:
            pfmn_dict = _build_performance_report(clf, test_x, test_y, test_y_pred_prob)
            _write_performance_report(pfmn_dict)
    else:
        logging.fatal('Random fortest not support {}'.format(operMode))
        raise Exception('Random fortest not support {}'.format(operMode))
コード例 #15
0
# Notebook-style section: bare expressions such as the one below only show
# their value in an interactive session; in a plain script they are
# evaluated and discarded.
cv_model.cv_results_

### ASSESS BEST PARAMS TREE AND SCORE
# Refit a forest with the best parameters found by the search, then score
# it on the training and test sets.
tree_model = RandomForestClassifier(random_state=297,
                                    **cv_model.best_params_)  ####ONLY IF THE PREVIOUS MODEL IS A SearchCV
tree_model = tree_model.fit(trainX, trainY.values.ravel())
tree_model.score(trainX, trainY)
tree_model.score(testX, testY)

### CHECK IMPORTANCE OF FEATURES
# One row per training column, with its importance in the 'Imp' column;
# reset_index() moves the column names into a regular 'index' column.
feature_importance = pd.DataFrame(tree_model.feature_importances_, index=trainX.columns, columns=['Imp']).reset_index()
# Constant helper column; not read again in the visible part of the script.
feature_importance['pk'] = 1
plot_scatter(feature_importance, 'index', 'Imp', 'index')
plot_bar(feature_importance, 'index', 'Imp', 'index')

### PREDICT
# Hard predictions are kept; the probability call's result is not stored.
prediction = tree_model.predict(features_all)
tree_model.predict_proba(features_all)

#### VISUALIZE TREE
### ONLY FOR SIMPLE DECISION TREE
# tree.export_graphviz(tree_model,
#                      feature_names=list(trainX.columns),
#                      out_file='/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.dot')
# (graph,) = pydot.graph_from_dot_file('/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.dot')
# graph.write_png('/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.png')

# Compare the predictions with `target`: normalized confusion-matrix plot,
# a cross-tabulation against the 'Survived' column, and the sum of that column.
skplt.metrics.plot_confusion_matrix(target, prediction, normalize=True)
pd.crosstab(target['Survived'], prediction)
sum(target['Survived'])
コード例 #16
0
    '''

    oversampled_path = "resources/oversampled_normalized_data_ratio_2.bin"
    homesite = Data()
    homesite.load_sliptted_data(oversampled_path)
    del homesite.test_x  # Deleted to save memory.
    print homesite.train_x.shape

    # Creating classifier.
    # clf = DecisionTreeClassifier()
    clf = RandomForestClassifier(max_features=100)
    # clf = AdaBoostClassifier(n_estimators = 10)
    # clf = svm.SVC(gamma = 0.00005)
    # clf = RandomForestClassifier()
    # clf = MultiplePLS(n_classifiers = 10, n_samples = 5000, n_positive_samples = 2500, threshold = 0.9, acc = 0.999)
    # clf = svm.LinearSVC()

    # Train classifier.
    print "Training classifier."
    clf.fit(homesite.train_x, homesite.train_y)

    # Test classifier.
    print 'Testing classifier.'
    predicted_labels = clf.predict_proba(homesite.validation_x)[:, 1]

    # Show final results.
    results = confusion_matrix(homesite.validation_y,
                               np.round(predicted_labels))
    accuracy, precision, recall = compute_performance_metrics(results)
    auc = compute_auc(homesite.validation_y, predicted_labels)
コード例 #17
0
from util import convert_gray_scale, flatten


# Unpack the (features, labels) pairs prepared outside this fragment.
Xr, Yr = training_set
Xe, Ye = test_set

# Convert to gray scale and flatten using the helpers imported from util.
Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

# NOTE(review): `compute_importances` here and `normed` in the hist() calls
# below are rejected by recent scikit-learn / Matplotlib releases -- confirm
# the library versions this script is expected to run against.
rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True, compute_importances=True)
rf.fit(Xr, Yr)

Yp = rf.predict(Xe)
# Fraction of test samples predicted correctly. Single-argument print(...)
# behaves the same under Python 2 and Python 3.
print(np.mean(Yp == Ye))

# Highest class probability per sample, i.e. the model's confidence.
Ypp = rf.predict_proba(Xe).max(axis=1)

# Overlay the confidence distributions of correct and incorrect predictions.
plt.figure(1)
plt.clf()
plt.hist(Ypp[Yp == Ye], 50, color='b', normed=True, alpha=0.4,
         label='classified')
plt.hist(Ypp[Yp != Ye], 50, color='r', normed=True, alpha=0.4,
         label='misclassified')
plt.legend(loc='upper left')
plt.draw()
plt.show()

plt.figure(3)
plt.clf()

# One percent of the number of test predictions; its use is outside this fragment.
n = 0.01 * float(len(Yp))
コード例 #18
0
ファイル: crossvalidate.py プロジェクト: Pold87/pikki-virus
#                                 normalize=True)

#clf = xgbwrapper.XgbWrapper({'objective': 'binary:logistic',
#                  'eval_metric': 'auc',
#                  'eta': 0.1,
#                  'silent': 1,
#                  'max_delta_step': 1})

# 'Normal' 70 / 30 cross-validation
# `do_cross_val`, `clf`, `X`, `train` and `train_for_loo` are defined outside
# this fragment.
if do_cross_val == 1:
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, train.WnvPresent, test_size=0.3, random_state=0)

    clf.fit(X_train, y_train)

    # NOTE(review): the whole predict_proba output is passed to roc_auc_score;
    # if `clf` returns one column per class, a single column may need to be
    # selected first -- confirm against the classifier in use.
    y_pred = clf.predict_proba(X_test)
    print(metrics.roc_auc_score(y_test, y_pred))

elif do_cross_val == 2:

    # Leave-one-year-out cross-validation
    scores = []
    # Accumulators for the predictions and targets of all held-out years.
    total_pred = np.array([])
    total_test = np.array([])

    for year in [2007, 2009, 2011, 2013]:

        # year_train_test_split is defined elsewhere; it is unpacked here into
        # feature frames, targets and the matching mosquito-count targets.
        X_train, X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split(
            train_for_loo, 'WnvPresent_DateTrapSpecies', year)

        # Persist the per-year split (the statement is cut off in this fragment).
        X_train.to_csv("data_per_year/" + str(year) + "X_train.csv",
コード例 #19
0
    def RF_trainandtest_kfold(self, unionscores, nsplit, cutscore, cv, feature_sel, varthreshold, ntrees, nodes, rfmethod, nclusters, cmethod, resmethod):
        """Cross-validate a tree-ensemble classifier and collect out-of-fold scores.

        The target is ``1`` where ``rsk_score < cutscore`` and ``0`` otherwise.
        For every fold the training part is binned / WOE-transformed, optionally
        reduced by a feature selector, resampled, and used to fit the chosen
        ensemble; the fold's test part is then scored.

        Parameters:
            unionscores: when equal to ``True`` the component score columns
                (``cst_score`` ... ``dsi_score``) stay in the feature set;
                otherwise they are dropped as well.
            nsplit: number of K-fold splits (folds are shuffled).
            cutscore: threshold applied to ``rsk_score`` to build the target.
            cv: cross-validation setting forwarded to ``RFECV``.
            feature_sel: ``"VarianceThreshold"``, ``"RFECV"``,
                ``"SelectFromModel"`` or ``"SelectKBest"``; any other value
                disables feature selection.
            varthreshold: threshold for ``VarianceThreshold``.
            ntrees: ``n_estimators`` of the ensemble.
            nodes: ``min_samples_leaf``; ``min_samples_split`` is ``2 * nodes``.
            rfmethod: ``'RandomForest'``, ``'ExtraTrees'`` or
                ``'GradientBoosting'``.
            nclusters, cmethod: forwarded to ``self.binandwoe_traintest``.
            resmethod: forwarded to ``self.imbalanceddata``.

        Returns:
            DataFrame with columns ``target`` and ``probability`` (second
            ``predict_proba`` column) for all evaluated folds; empty when every
            fold was skipped.

        Raises:
            ValueError: if ``rfmethod`` is not one of the supported names.
        """
        # Identifier columns and the raw score the target is derived from are
        # never used as predictors.
        excluded = ['name', 'idCard', 'mobileNum', 'cardNum', 'rsk_score']
        # Equality (not truthiness) is kept so callers see the same branch as before.
        if not (unionscores == True):  # noqa: E712
            excluded += ['cst_score', 'cnp_score', 'cnt_score', 'chv_score', 'dsi_score']
        data_feature = self.data.drop(excluded, axis=1)
        data_target = (self.data['rsk_score'] < cutscore).astype('int')

        # Split the data into k folds; each fold serves once as the test set
        # while the remaining data is the training set.
        kf = KFold(n_splits=nsplit, shuffle=True)
        fold_results = []
        for train_index, test_index in kf.split(data_feature):
            X_train, X_test = data_feature.iloc[train_index], data_feature.iloc[test_index]
            y_train, y_test = data_target.iloc[train_index], data_target.iloc[test_index]

            # Skip the fold if random sampling left only one class in train or test.
            if (len(y_train.unique()) == 1) or (len(y_test.unique()) == 1):
                continue

            # Coarse-bin and WOE-transform the training set, then apply the
            # same transformation to the test set.
            X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test, nclusters, cmethod)

            # Feature selection fitted on the training part only
            # (sklearn.feature_selection).
            if feature_sel == "VarianceThreshold":
                selector = VarianceThreshold(threshold=varthreshold)
            elif feature_sel == "RFECV":
                selector = RFECV(LogisticRegression(), step=1, cv=cv)
            elif feature_sel == "SelectFromModel":
                selector = SelectFromModel(LogisticRegression())
            elif feature_sel == "SelectKBest":
                selector = SelectKBest()
            else:
                selector = None

            if selector is None:
                X_train1, X_test1 = X_train, X_test
            else:
                # VarianceThreshold ignores y, so one call shape serves all selectors.
                X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]

            # Resample the training data to counter class imbalance.
            X_train1, y_train = self.imbalanceddata(X_train1, y_train, resmethod)

            # Build, train and score the requested ensemble.
            if rfmethod == 'RandomForest':
                classifier = RandomForestClassifier(n_estimators=ntrees, min_samples_split=nodes * 2, min_samples_leaf=nodes)
            elif rfmethod == 'ExtraTrees':
                classifier = ExtraTreesClassifier(n_estimators=ntrees, min_samples_split=nodes * 2, min_samples_leaf=nodes)
            elif rfmethod == 'GradientBoosting':
                classifier = GradientBoostingClassifier(n_estimators=ntrees, min_samples_split=nodes * 2, min_samples_leaf=nodes)
            else:
                # Previously this fell through and failed later with an
                # UnboundLocalError on `classifier`.
                raise ValueError("Unsupported rfmethod: %r" % (rfmethod,))

            classifier.fit(X_train1, y_train)
            probability = classifier.predict_proba(X_test1)

            fold_results.append(pd.DataFrame({'target': y_test, 'probability': probability[:, 1]}))

        # Concatenate once instead of growing a DataFrame inside the loop.
        if not fold_results:
            return pd.DataFrame()
        return pd.concat(fold_results, ignore_index=True)
コード例 #20
0
# Unpack the (features, labels) pairs prepared outside this fragment.
Xr, Yr = training_set
Xe, Ye = test_set

# Convert to gray scale and flatten; both helpers are defined elsewhere.
Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))

# NOTE(review): `compute_importances` here and `normed` in the hist() calls
# below are rejected by recent scikit-learn / Matplotlib releases -- confirm
# the library versions this script is expected to run against.
rf = RandomForestClassifier(n_estimators=100,
                            verbose=3,
                            oob_score=True,
                            compute_importances=True)
rf.fit(Xr, Yr)

Yp = rf.predict(Xe)
# Fraction of test samples predicted correctly. Single-argument print(...)
# behaves the same under Python 2 and Python 3.
print(np.mean(Yp == Ye))

# Highest class probability per sample, i.e. the model's confidence.
Ypp = rf.predict_proba(Xe).max(axis=1)

# Overlay the confidence distributions of correct and incorrect predictions.
plt.figure(1)
plt.clf()
plt.hist(Ypp[Yp == Ye],
         50,
         color='b',
         normed=True,
         alpha=0.4,
         label='classified')
plt.hist(Ypp[Yp != Ye],
         50,
         color='r',
         normed=True,
         alpha=0.4,
         label='misclassified')
コード例 #21
0
from data.numpy_file import save_np_array, load_np_array
from data.plot import plot
import numpy as np
import pandas as pd
from statistics.confusion_matrix import confusion_matrix
from statistics.performance import compute_performance_metrics, compute_auc

if __name__ == '__main__':
    # Classify data changing balancing ratio: normalise, oversample the
    # training split, fit a random forest and write a submission file.

    # Train and test random forests.
    path = "../homesite_data/resources/parsed_data.bin"
    homesite = Data()
    homesite.load_parsed_data(path)
    homesite.z_norm_train_test_by_feature()
    sm = OverSampler(verbose=False, ratio=2.5)
    homesite.train_x, homesite.train_y = sm.fit_transform(
        homesite.train_x, homesite.train_y)

    clf = RandomForestClassifier(n_estimators=300, max_features=100, n_jobs=4)

    # Train classifier.
    # Single-argument print(...) behaves the same under Python 2 and Python 3.
    print("Training classifier.")
    clf.fit(homesite.train_x, homesite.train_y)
    # Despite the name, this holds the second predict_proba column (scores).
    predicted_labels = clf.predict_proba(homesite.test_x)[:, 1]
    sample = pd.read_csv('../input/sample_submission.csv')
    # Item assignment always writes a column; attribute assignment would only
    # set a plain Python attribute if the column were missing from the CSV.
    sample['QuoteConversion_Flag'] = predicted_labels
    sample.to_csv('rfc_300.csv', index=False)
コード例 #22
0
ファイル: rf_digits.py プロジェクト: matthagy/sc2_timer
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(mask.sum())
# Keep the masked images and flatten every image into one feature row.
X = images[mask, ...].reshape(mask.sum(), np.prod(images.shape[1::]))
print(X.shape)
Y = classifications[mask]

acc = []              # accuracy per fold
acc_correct = []      # confidences of correctly classified samples, per fold
acc_incorrect = []    # confidences of misclassified samples, per fold
acc_x_incorrect = []  # [images, true labels, predicted labels] of the mistakes
k_fold = 8
# NOTE(review): StratifiedKFold(Y, k_fold) and `compute_importances` follow an
# old scikit-learn API -- confirm the library version this script targets.
for train_inx, valid_inx in StratifiedKFold(Y, k_fold):
    rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True, compute_importances=True)
    rf.fit(X[train_inx], Y[train_inx])
    Yp = rf.predict(X[valid_inx])
    correct = Yp == Y[valid_inx]
    # predict_proba is evaluated once; the earlier duplicate call discarded its result.
    p_correct = rf.predict_proba(X[valid_inx]).max(axis=1)
    acc_correct.append(p_correct[correct])
    acc_incorrect.append(p_correct[~correct])

    score = correct.mean()
    print(score)
    acc.append(score)

    acc_x_incorrect.append([images[mask][valid_inx[~correct]],
                            Y[valid_inx[~correct]],
                            Yp[~correct]])

# Formatted explicitly so Python 2 does not print a tuple; %s matches the
# str() rendering that `print 'score', value` produced.
print('score %s' % np.mean(acc))

rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True, compute_importances=True)
コード例 #23
0
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

# One log-loss value per boosting stage, for the test and the training set.
test_loss = np.zeros((params['n_estimators'],), dtype=np.float64)
train_loss = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    # Map the raw decision value to a probability with the logistic sigmoid.
    y_sig = (1.0 / (1.0 + np.exp(0.0 - y_pred)))
    test_loss[i] = log_loss(y_test, y_sig)#clf.loss_(y_test, y_sig)

for i, y_pred in enumerate(clf.staged_decision_function(X_train)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    y_sig = (1.0 / (1.0 + np.exp(0.0 - y_pred)))
    train_loss[i] = log_loss(y_train, y_sig)#clf.loss_(y_train, y_sig)

plt.figure()
plt.plot(test_loss, 'r', linewidth=2)
plt.plot(train_loss, 'g', linewidth=2)
plt.legend(['test', 'train'])

# 0-based index of the stage with the lowest test log-loss.
i = np.argmin(test_loss)

print('min log-loss: ', np.round(test_loss[i],2), ' iteration#: ', i)

# Stage index i is reached after i + 1 boosting iterations, so the forest gets
# i + 1 trees; using i directly is off by one and requests 0 trees when the
# first stage is the best one.
rfc = RandomForestClassifier(random_state=241, n_estimators=i + 1)
rfc.fit(X_train, y_train)
y_pred = rfc.predict_proba(X_test)

print('RandomForest log-loss: ', np.round(log_loss(y_test, y_pred),2))
コード例 #24
0
 
 # 10-fold split; no shuffle argument is passed.
 kf = KFold(n_splits=10)
 # Thresholded validation predictions, one array per fold.
 predictions = []
 
 print('PCA with RandomForest model training...')
 
 for train_index, val_index in kf.split(df_features):
     
     Train_X = df_features.iloc[train_index]
     Train_Y = df_label.iloc[train_index]
     Val_X = df_features.iloc[val_index]
     
     clf = RandomForestClassifier(n_estimators=50, min_samples_split=2, min_samples_leaf=1, oob_score=True)
     clf.fit(Train_X, Train_Y)
     
     # Second predict_proba column for the validation rows.
     predict_Val_Y = clf.predict_proba(Val_X)[:, 1]
     
     # Binarise in place with a 0.44 cut-off: <= 0.44 becomes 0, anything above becomes 1.
     predict_Val_Y[predict_Val_Y <= 0.44] = 0
     predict_Val_Y[predict_Val_Y > 0.44] = 1
     
     predictions.append(predict_Val_Y)
 
 # Stitch the per-fold predictions together in fold order.
 predictions = np.concatenate(predictions, axis=0)
 
 # NOTE(review): despite its name this is the share of predictions equal to
 # df_label (an accuracy-style ratio); it presumes df_label lines up
 # element-wise with the concatenated folds -- confirm its type and shape.
 precision = np.count_nonzero(predictions == df_label) / len(predictions)
 
 print(precision)
 
 
 
 
コード例 #25
0
# Export the fitted tree to Graphviz source and render it as a PDF.
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=list(data_tree.columns.values))
# NOTE(review): some pydot versions return a list of graphs here -- confirm
# against the installed pydot version.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf('dectree.pdf')

# Repeat on test set
# Every print below takes a single argument so the output is identical under
# Python 2 and Python 3; print("") replaces the bare `print` blank line.
y_test_pred = clf.predict(X_test)
print("Accuracy Test: {0:.3f}".format(
    metrics.accuracy_score(y_test, y_test_pred)))
print("")
print("Classification report:")
print(metrics.classification_report(y_test, y_test_pred))
print("")
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_test_pred))

# Measure performance
y_pred = clf.predict_proba(X_train)

# Repeat on test set
y_test_pred = clf.predict_proba(X_test)

# `.values` replaces `as_matrix()`, which was removed in pandas 1.0; both
# return the underlying NumPy array.
tt = g_test.values
# Element-wise product of g_test with the class probabilities, summed per row
# and then averaged over all rows.
pred = tt * y_test_pred

ss = np.sum(pred, axis=1)

sss = ss.mean()

print(sss)
コード例 #26
0
# Three-way label from logerror: -1 below q20, 1 above q80, 0 otherwise.
df['group'] = 0
df.loc[df.logerror < q20, 'group'] = -1
df.loc[df.logerror > q80, 'group'] = 1

# create train and test set
X_train, X_val, y_train, y_val, scaler = create_inputs_model(df.drop(
    'logerror', axis=1),
                                                             test_size=0.25)

# Predict the class
rfc = RandomForestClassifier(n_estimators=500,
                             verbose=2,
                             n_jobs=-1,
                             max_depth=8)
rfc.fit(X_train, y_train)
y_pred_val = rfc.predict_proba(X_val)
# predict_proba columns follow rfc.classes_. With labels -1/0/1 the column at
# index 1 belongs to class 0, so look up the column of the positive class (1)
# rather than hard-coding the index.
positive_col = list(rfc.classes_).index(1)
fpr, tpr, thresholds = roc_curve(y_val,
                                 y_pred_val[:, positive_col],
                                 pos_label=1)
print("AUC on test : {:.02f} %".format(auc(fpr, tpr) * 100))

#most important features
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

feat_names = df.drop(['logerror', 'group'], axis=1).columns.values
for f in range(X_val.shape[1]):
    print("{}. feature {} - {} :({:.06f})".format(f + 1, indices[f],
                                                  feat_names[indices[f]],
                                                  importances[indices[f]]))

catalog = describe_features(df)
コード例 #27
0
    def runns(resp_var, size_of_test_data, dataset, positive_class,
              predictor_var, n_estimators, important_features,
              dealing_with_nulls):
        """Train a random forest on ``raw_data.csv`` and return its metrics as JSON.

        Parameters:
            resp_var: name of the response column.
            size_of_test_data: test fraction; converted with ``float()`` and
                passed to ``train_test_split``.
            dataset: currently ignored, see the review note below.
            positive_class: label used as ``pos_label`` for precision, recall
                and F1.
            predictor_var: column selection used as predictors.
            n_estimators: number of trees for both forests.
            important_features: number of top-ranked predictor names to keep
                (see the review note in the feature-selection stage).
            dealing_with_nulls: strategy forwarded to ``deal_with_nulls``.

        Returns:
            A JSON string mapping metric names to values.
        """
        # NOTE(review): the `dataset` argument is overwritten with a
        # hard-coded CSV (marked "for testing purposes") -- confirm whether
        # callers expect their own data to be used.
        dataset = pd.read_csv('raw_data.csv',
                              low_memory=False)  # For testing purposes

        # ---- DATA PREPROCESSING
        # Rows without a response cannot be used for training or scoring.
        dataset = dataset.dropna(subset=[resp_var])
        dataset = deal_with_nulls(dealing_with_nulls, dataset)

        # ---- FEATURE SELECTION
        # Categorical predictors become dummy variables.
        predictors = pd.get_dummies(dataset[predictor_var])
        # Balance the response classes by random oversampling.
        ros = RandomOverSampler(random_state=0)
        prds, resp = ros.fit_sample(predictors, dataset[resp_var])
        # Rank the predictors with a preliminary forest.
        rf_clf = RandomForestClassifier(n_estimators=n_estimators)
        rf_clf.fit(prds, resp)
        feature_imp = pd.Series(
            rf_clf.feature_importances_,
            index=list(predictors.columns)).sort_values(ascending=False)
        # NOTE(review): the top names are computed but the data below is NOT
        # restricted to them; all predictors are used -- confirm the intent.
        important_predictor_names = feature_imp.index[0:important_features]

        # Rebuild one frame from the oversampled response and predictors.
        resp = pd.DataFrame(data=resp, columns=[resp_var])
        predictors = pd.DataFrame(prds, columns=list(predictors))
        dataset = pd.concat([resp, predictors], axis=1)

        # ---- MODEL TRAINING
        # Separate features from labels; the `resp_var` parameter is no longer
        # rebound to the label array.
        m_data = dataset.drop(resp_var, axis=1, inplace=False)
        labels = np.array(dataset[resp_var])
        features = np.array(pd.get_dummies(m_data))

        # Split the data into training and testing sets
        train_features, test_features, train_labels, test_labels = train_test_split(
            features,
            labels,
            test_size=float(size_of_test_data),
            random_state=402)

        # Instantiate model with n_estimators decision trees
        clf = RandomForestClassifier(n_jobs=1,
                                     n_estimators=n_estimators,
                                     random_state=142)

        # Train the model on training data
        clf.fit(train_features, train_labels)

        # ---- EVALUATION
        predicted = clf.predict(test_features)
        pred_prob = clf.predict_proba(test_features)
        # NOTE(review): column 1 is used as the positive-class score, as
        # before -- confirm it matches `positive_class` in clf.classes_.
        positive_scores = pred_prob[:, 1]

        output = {
            "accuracy": accuracy_score(test_labels, predicted),
            "precision": precision_score(test_labels,
                                         predicted,
                                         pos_label=positive_class),
            "average precision": average_precision_score(test_labels,
                                                         positive_scores),
            "recall": recall_score(test_labels, predicted,
                                   pos_label=positive_class),
            "fscore": f1_score(test_labels, predicted,
                               pos_label=positive_class),
            "fbeta": fbeta_score(test_labels, predicted, beta=0.5),
            "hamming": hamming_loss(test_labels, predicted),
            "jaccard": jaccard_similarity_score(test_labels, predicted),
            # Log-loss is defined on predicted probabilities; the previous
            # code passed hard class predictions.
            "logloss": log_loss(test_labels, pred_prob, labels=clf.classes_),
            "zero_one": zero_one_loss(test_labels, predicted),
            "area_under_roc": roc_auc_score(test_labels, positive_scores),
            "cohen": cohen_kappa_score(test_labels, predicted),
            "mathews": matthews_corrcoef(test_labels, predicted)
        }
        return json.dumps(output)
コード例 #28
0
        # Build the split for the current year; year_train_test_split is
        # defined outside this fragment.
        X_train, X_test, y_train, y_test = year_train_test_split(
            train_for_loo,
            'WnvPresent_DateTrapSpecies',
            year)      

        # Persist the per-year split under data_per_year/.
        X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False)
        X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
        y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False)
        y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False)

        
        clf.fit(X_train, y_train)

        # The two variants differ only in whether column 1 of the
        # predict_proba output is selected.
        # y_pred = clf.predict_proba(X_test) [:, 1] # Random Forest
        y_pred = clf.predict_proba(X_test) # For XGB
        
        # ROC AUC for this year.
        score = metrics.roc_auc_score(y_test, y_pred)
        scores.append(score)
        
        #import operator
        #feat_importances = dict(zip(X_train.columns, clf.feature_importances_))
        #sorted_feat_importances = sorted(feat_importances.items(), key=operator.itemgetter(1))
        #print(sorted_feat_importances)
        
        # Accumulate predictions and targets across years for the global score.
        total_pred = np.concatenate((total_pred, y_pred))
        total_test = np.concatenate((total_test, y_test))
        
    # ROC AUC over the pooled predictions, then the per-year scores.
    print("Global ROC score", metrics.roc_auc_score(total_test, total_pred))
        
    print(scores)
コード例 #29
0
ファイル: crossvalidate.py プロジェクト: Pold87/pikki-virus
#                  'eval_metric': 'auc',
#                  'eta': 0.1,
#                  'silent': 1,
#                  'max_delta_step': 1})

# 'Normal' 70 / 30 cross-validation
# `do_cross_val`, `clf`, `X`, `train` and `train_for_loo` are defined outside
# this fragment.
if do_cross_val == 1:
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X,
        train.WnvPresent,
        test_size=0.3,
        random_state=0)

    clf.fit(X_train, y_train)

    # NOTE(review): the whole predict_proba output is passed to roc_auc_score;
    # if `clf` returns one column per class, a single column may need to be
    # selected first -- confirm against the classifier in use.
    y_pred = clf.predict_proba(X_test)
    print(metrics.roc_auc_score(y_test, y_pred))

elif do_cross_val == 2:

    # Leave-one-year-out cross-validation
    scores = []
    # Accumulators for the predictions and targets of all held-out years.
    total_pred = np.array([])
    total_test = np.array([])
    
    for year in [2007, 2009, 2011, 2013]:

        # year_train_test_split is defined elsewhere; it is unpacked here into
        # feature frames, targets and the matching mosquito-count targets.
        X_train,X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split(
            train_for_loo,
            'WnvPresent_DateTrapSpecies',
            year)      
コード例 #30
0
# Exploratory, notebook-style walkthrough: most expressions below are not
# assigned, so their values are only shown in an interactive session.

# 3-nearest-neighbours classifier fitted and evaluated on the same iris data.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(iris.data, iris.target)
knn.predict(iris.data)

len(iris.target)
# Number of training samples predicted correctly.
sum(iris.target == knn.predict(iris.data))
knn.score(iris.data, iris.target)
help(cross_val_predict)
# 20-fold cross-validated predictions and mean cross-validated score.
cross_val_predict(knn, iris.data, iris.target, cv=20)
cross_val_score(knn, iris.data, iris.target, cv=20).mean()


# Same checks with a 3-tree random forest.
rf = RandomForestClassifier(n_estimators=3)
rf.fit(iris.data, iris.target)
rf.predict_proba(iris.data)
rf.score(iris.data, iris.target)
sum(iris.target == rf.predict(iris.data))
cross_val_score(rf, iris.data, iris.target, cv=20).mean()


from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
# The bare string below is a reference link kept as a string statement.
'''
https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/data/multilabel.py
'''
# One-vs-rest wrapper around logistic regression, fitted on the same data.
mcr = OneVsRestClassifier(LogisticRegression())
mcr.fit(iris.data, iris.target)
mcr.predict(iris.data)
mcr.predict_proba(iris.data)
コード例 #31
0
    def RF_trainandtest_kfold(self,
                              nsplit,
                              cv,
                              feature_sel,
                              varthreshold,
                              ntrees,
                              nodes,
                              rfmethod,
                              nclusters=10,
                              cmethod=None):
        """Cross-validate a tree-ensemble classifier and collect out-of-fold scores.

        The ``default`` column of ``self.data`` is the target and all other
        columns are features. For every fold the training part is binned /
        WOE-transformed, optionally reduced by a feature selector, and used to
        fit the chosen ensemble; the fold's test part is then scored.

        Parameters:
            nsplit: number of K-fold splits (folds are shuffled).
            cv: cross-validation setting forwarded to ``RFECV``.
            feature_sel: ``"VarianceThreshold"``, ``"RFECV"``,
                ``"SelectFromModel"`` or ``"SelectKBest"``; any other value
                disables feature selection.
            varthreshold: threshold for ``VarianceThreshold``.
            ntrees: ``n_estimators`` of the ensemble.
            nodes: ``min_samples_leaf``; ``min_samples_split`` is ``2 * nodes``.
            rfmethod: ``'RandomForest'``, ``'ExtraTrees'`` or
                ``'GradientBoosting'``.
            nclusters, cmethod: forwarded to ``self.binandwoe_traintest``.

        Returns:
            DataFrame with columns ``target`` and ``probability`` (second
            ``predict_proba`` column) for all evaluated folds; empty when every
            fold was skipped.

        Raises:
            ValueError: if ``rfmethod`` is not one of the supported names.
        """
        # `.loc` with a boolean column mask replaces `DataFrame.ix`, which was
        # removed in pandas 1.0.
        data_feature = self.data.loc[:, self.data.columns != 'default']
        data_target = self.data['default']

        # Split the data into k folds; each fold serves once as the test set
        # while the remaining data is the training set.
        kf = KFold(n_splits=nsplit, shuffle=True)
        fold_results = []
        for train_index, test_index in kf.split(data_feature):
            X_train = data_feature.iloc[train_index]
            X_test = data_feature.iloc[test_index]
            y_train = data_target.iloc[train_index]
            y_test = data_target.iloc[test_index]

            # Skip the fold if random sampling left only one class in train or test.
            if (len(y_train.unique()) == 1) or (len(y_test.unique()) == 1):
                continue

            # Coarse-bin and WOE-transform the training set, then apply the
            # same transformation to the test set.
            X_train, X_test = self.binandwoe_traintest(X_train, y_train,
                                                       X_test, nclusters,
                                                       cmethod)

            # Feature selection fitted on the training part only
            # (sklearn.feature_selection).
            if feature_sel == "VarianceThreshold":
                selector = VarianceThreshold(threshold=varthreshold)
            elif feature_sel == "RFECV":
                selector = RFECV(LogisticRegression(), step=1, cv=cv)
            elif feature_sel == "SelectFromModel":
                selector = SelectFromModel(LogisticRegression())
            elif feature_sel == "SelectKBest":
                selector = SelectKBest()
            else:
                selector = None

            if selector is None:
                X_train1, X_test1 = X_train, X_test
            else:
                # VarianceThreshold ignores y, so one call shape serves all selectors.
                X_train1 = pd.DataFrame(
                    selector.fit_transform(X_train, y_train))
                X_train1.columns = X_train.columns[selector.get_support(True)]
                X_test1 = X_test[X_train1.columns]

            # Build, train and score the requested ensemble.
            if rfmethod == 'RandomForest':
                classifier = RandomForestClassifier(n_estimators=ntrees,
                                                    min_samples_split=nodes * 2,
                                                    min_samples_leaf=nodes)
            elif rfmethod == 'ExtraTrees':
                classifier = ExtraTreesClassifier(n_estimators=ntrees,
                                                  min_samples_split=nodes * 2,
                                                  min_samples_leaf=nodes)
            elif rfmethod == 'GradientBoosting':
                classifier = GradientBoostingClassifier(
                    n_estimators=ntrees,
                    min_samples_split=nodes * 2,
                    min_samples_leaf=nodes)
            else:
                # Previously this fell through and failed later with an
                # UnboundLocalError on `classifier`.
                raise ValueError("Unsupported rfmethod: %r" % (rfmethod,))

            classifier.fit(X_train1, y_train)
            probability = classifier.predict_proba(X_test1)[:, 1]

            fold_results.append(
                pd.DataFrame({'target': y_test, 'probability': probability}))

        # Concatenate once instead of growing a DataFrame inside the loop.
        if not fold_results:
            return pd.DataFrame()
        return pd.concat(fold_results, ignore_index=True)
コード例 #32
0
def main():
    """Render the Streamlit "would you survive the Titanic?" app.

    Loads ``datasets/train.csv``, encodes the categorical columns, collects
    the passenger attributes through widgets and, when the "Prever" button is
    pressed, trains a random forest and shows the predicted probabilities of
    dying and of surviving. All user-facing text is intentionally Portuguese.
    """
    st.title('Você sobreviveria ao Titanic?')
    st.write(
        'Modelo de classificação com RandomForest para prever sobrevivência ou morte de passageiros no Titanic'
    )
    st.subheader('Autor')
    st.write('https://www.linkedin.com/in/lucaszonin/')
    st.write('')
    st.subheader('Agradecimentos')
    st.write('Felipe Maia Polo que me deu algumas dicas:')
    st.write('https://www.linkedin.com/in/felipemaiapolo/')
    st.write('')

    # --- Training data preparation ---------------------------------------
    titanic_v1 = pd.read_csv('datasets/train.csv')
    titanic_v1 = titanic_v1.drop(
        columns=['Cabin', 'PassengerId', 'Ticket', 'SibSp', 'Parch'])
    # Missing ages are replaced by the mean age, then truncated to integers.
    titanic_v1['Age'] = titanic_v1['Age'].fillna(
        titanic_v1['Age'].mean()).astype('int64')
    titanic_v1 = titanic_v1.dropna()

    # Integer encodings; they must match the widget encodings further down.
    titanic_v1['Sex'] = titanic_v1['Sex'].map({
        'male': 0,
        'female': 1
    }).astype(int)
    titanic_v1['Embarked'] = titanic_v1['Embarked'].map({
        'C': 0,
        'Q': 1,
        'S': 2
    }).astype(int)

    # --- Input widgets (creation order defines the page layout) ----------
    # Passenger sex
    sexo = st.radio(label='Sexo do passageiro',
                    options=('Feminino', 'Masculino'))

    # Passenger age
    idade_passenger = st.slider(label='Idade do passageiro',
                                min_value=1,
                                max_value=int(titanic_v1['Age'].max()))

    # Port of embarkation
    embarked = st.radio(label='Cidade onde embarcou',
                        options=('Cherbourg', 'Queenstown', 'Southampton'))

    # Ticket fare
    valor_pago = st.slider(label='Valor pago pela passagem',
                           min_value=1,
                           max_value=600)

    # Passenger class
    classe = st.radio(label='Classe do passageiro',
                      options=('Primeira', 'Segunda', 'Terceira'))

    # Lookup tables replace the if/elif chains, which could leave the encoded
    # variables unbound if no branch matched.
    sexo_modelo = 1 if sexo == 'Feminino' else 0
    embarked_modelo = {
        'Cherbourg': 0,
        'Queenstown': 1,
        'Southampton': 2
    }[embarked]
    classe_modelo = {'Primeira': 1, 'Segunda': 2, 'Terceira': 3}[classe]

    # --- Train / hold-out split -------------------------------------------
    y = titanic_v1['Survived']
    x = titanic_v1[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
    # Only the training part is used; the hold-out part keeps the split
    # identical to the previous behaviour.
    X_train, _X_test, y_train, _y_test = train_test_split(x,
                                                          y,
                                                          test_size=0.2,
                                                          random_state=30)

    if st.button(label="Prever"):

        st.title('Dados do passageiro:')
        st.write('Sexo :', sexo)
        st.write('Idade :', idade_passenger)
        st.write('Cidade onde embarcou :', embarked)
        st.write('Valor da passagem : US$', valor_pago)
        st.write('Classe da passagem :', classe)

        x_input = pd.DataFrame(
            {
                'Pclass': classe_modelo,
                'Sex': sexo_modelo,
                'Age': idade_passenger,
                'Fare': valor_pago,
                'Embarked': embarked_modelo
            },
            index=[0])
        # Streamlit reruns the script on every interaction, so the model is
        # trained only when a prediction is requested. The earlier code also
        # fitted a second, unused model on every rerun.
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        pred = model.predict_proba(x_input)

        st.title('Previsão:')
        st.write('')
        # Explicit st.write calls instead of bare tuple expressions, which
        # were rendered only through Streamlit's "magic" feature.
        st.write('Probabilidade de morrer:', pred[0, 0] * 100)
        st.write('Probabilidade de sobreviver:', pred[0, 1] * 100)
コード例 #33
0
    '''

    # Train and test random forests.
    # load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin"
    load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.bin"
    homesite = Data()
    homesite.load_sliptted_data(load_path)
    del homesite.test_x  # Deleted to save memory.

    # Load the neural network from the weights file; its hidden-layer output
    # is used as additional features.
    clf_ann = NeuralNetwork(path="../homesite_data/ann_weights.bin",
                            lr=0.00005,
                            lamb=0)
    train_output_ann = clf_ann.get_hidden_output(homesite.train_x)
    validation_output_ann = clf_ann.get_hidden_output(homesite.validation_x)
    # Place the hidden-layer output next to the original features, column-wise.
    train_output_ann = np.hstack((train_output_ann, homesite.train_x))
    validation_output_ann = np.hstack((validation_output_ann, homesite.validation_x))

    # Sweep the forest size: 201, 301, ..., 901 trees.
    for c in range(2, 10):
        # Train classifier.
        # Single-argument print(...) behaves the same under Python 2 and Python 3.
        print("Training classifier.")
        clf = RandomForestClassifier(n_estimators=1 + 100 * c, n_jobs=4)
        clf.fit(train_output_ann, homesite.train_y)

        # Test classifier.
        print('Testing classifier.')
        # Despite the name, this holds the second predict_proba column
        # (scores); np.round() below turns the scores into 0/1 labels.
        predicted_labels = clf.predict_proba(validation_output_ann)[:, 1]

        # Show final results.
        results = confusion_matrix(homesite.validation_y, np.round(predicted_labels))
        accuracy, precision, recall = compute_performance_metrics(results)
        auc = compute_auc(homesite.validation_y, predicted_labels)
コード例 #34
0
    def RF_trainandtest(self,
                        testsize,
                        cv,
                        feature_sel,
                        varthreshold,
                        ntrees,
                        nodes,
                        rfmethod,
                        nclusters=10,
                        cmethod=None):
        """Fit a tree-ensemble classifier on a single train/test split.

        The ``default`` column of ``self.data`` is the target and all other
        columns are features. The training part is binned / WOE-transformed,
        optionally reduced by a feature selector, and used to fit the chosen
        ensemble; the test part is then scored.

        Parameters:
            testsize: ``test_size`` passed to ``train_test_split``.
            cv: cross-validation setting forwarded to ``RFECV``.
            feature_sel: ``"VarianceThreshold"``, ``"RFECV"``,
                ``"SelectFromModel"`` or ``"SelectKBest"``; any other value
                disables feature selection.
            varthreshold: threshold for ``VarianceThreshold``.
            ntrees: ``n_estimators`` of the ensemble.
            nodes: ``min_samples_leaf``; ``min_samples_split`` is ``2 * nodes``.
            rfmethod: ``'RandomForest'``, ``'ExtraTrees'`` or
                ``'GradientBoosting'``.
            nclusters, cmethod: forwarded to ``self.binandwoe_traintest``.

        Returns:
            DataFrame with columns ``target`` and ``probability`` (second
            ``predict_proba`` column) for the test rows.

        Raises:
            ValueError: if ``rfmethod`` is not one of the supported names.
        """
        # Split the data set into a training set and a test set.
        # `.loc` with a boolean column mask replaces `DataFrame.ix`, which was
        # removed in pandas 1.0.
        data_feature = self.data.loc[:, self.data.columns != 'default']
        data_target = self.data['default']
        X_train, X_test, y_train, y_test = train_test_split(data_feature,
                                                            data_target,
                                                            test_size=testsize,
                                                            random_state=0)

        # Coarse-bin and WOE-transform the training set, then apply the same
        # transformation to the test set.
        X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test,
                                                   nclusters, cmethod)

        # Feature selection fitted on the training part only
        # (sklearn.feature_selection).
        if feature_sel == "VarianceThreshold":
            selector = VarianceThreshold(threshold=varthreshold)
        elif feature_sel == "RFECV":
            selector = RFECV(LogisticRegression(), step=1, cv=cv)
        elif feature_sel == "SelectFromModel":
            selector = SelectFromModel(LogisticRegression())
        elif feature_sel == "SelectKBest":
            selector = SelectKBest()
        else:
            selector = None

        if selector is None:
            X_train1, X_test1 = X_train, X_test
        else:
            # VarianceThreshold ignores y, so one call shape serves all selectors.
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]

        # Build, train and score the requested ensemble.
        if rfmethod == 'RandomForest':
            classifier = RandomForestClassifier(n_estimators=ntrees,
                                                min_samples_split=nodes * 2,
                                                min_samples_leaf=nodes)
        elif rfmethod == 'ExtraTrees':
            classifier = ExtraTreesClassifier(n_estimators=ntrees,
                                              min_samples_split=nodes * 2,
                                              min_samples_leaf=nodes)
        elif rfmethod == 'GradientBoosting':
            classifier = GradientBoostingClassifier(n_estimators=ntrees,
                                                    min_samples_split=nodes * 2,
                                                    min_samples_leaf=nodes)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on `classifier`.
            raise ValueError("Unsupported rfmethod: %r" % (rfmethod,))

        classifier.fit(X_train1, y_train)
        probability = classifier.predict_proba(X_test1)[:, 1]

        return pd.DataFrame({
            'target': y_test,
            'probability': probability
        })
コード例 #35
0
#                                 normalize=True)

#clf = XgbWrapper({'objective': 'binary:logistic',
#                  'eval_metric': 'auc',
#                  'eta': 0.1,
#                  'silent': 0,
#                  'max_delta_step': 1})

# 'Normal' 70 / 30 cross-validation
if do_cross_val == 1:
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, train.WnvPresent, test_size=0.3, random_state=0)

    clf.fit(X_train, y_train)

    y_pred = clf.predict_proba(X_test)[:, 1]
    print(metrics.roc_auc_score(y_test, y_pred))

elif do_cross_val == 2:

    # Leave-one-year-out cross-validation
    scores = []
    for year in [2007, 2009, 2011, 2013]:

        X_train, X_test, y_train, y_test = year_train_test_split(
            train_for_loo, 'WnvPresent', year)

        X_train.to_csv("data_per_year/" + str(year) + "X_train.csv",
                       index=False)
        X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
        y_train.to_csv("data_per_year/" + str(year) + "y_train.csv",