def test_RandomForest(self):
    """Smoke test: fit a 10-tree random forest and ask for probabilities."""
    features = [[0, 1], [1, 1]]
    labels = [0, 1]
    # The object returned by fit() is the one queried for probabilities.
    fitted = RandomForestClassifier(n_estimators=10).fit(features, labels)
    fitted.predict_proba(features)
class RandomForestClassifierImpl():
    """Hold random-forest hyperparameters and build an ``SKLModel`` on fit.

    The constructor only records its keyword arguments; the wrapped model
    is created from them every time :meth:`fit` is called.
    """

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, oob_score=False,
                 n_jobs=None, random_state=None, verbose=0, warm_start=False,
                 class_weight='balanced'):
        """Record every hyperparameter in ``self._hyperparams``."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
        )

    def fit(self, X, y=None):
        """Build a fresh ``SKLModel`` and fit it; return ``self``.

        ``y`` is forwarded to the wrapped model only when it is not None.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict(X)``."""
        wrapped = self._sklearn_model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)``."""
        wrapped = self._sklearn_model
        return wrapped.predict_proba(X)
class MyRfClassifier(BaseClassifier):
    """Random-forest classifier wrapper named after its main hyperparameters.

    ``n_jobs`` and ``verbose`` used to be hard-coded (40 and 1); they are
    now keyword parameters with those same defaults, so existing callers
    behave as before while others can size the worker pool for their host.
    """

    def __init__(self, n_estimators, max_depth, min_samples_leaf,
                 n_jobs=40, verbose=1):
        """Create the wrapped ``RandomForestClassifier`` and its name.

        Args:
            n_estimators: forwarded to ``RandomForestClassifier``.
            max_depth: forwarded to ``RandomForestClassifier``.
            min_samples_leaf: forwarded to ``RandomForestClassifier``.
            n_jobs: forwarded to ``RandomForestClassifier`` (default 40).
            verbose: forwarded to ``RandomForestClassifier`` (default 1).
        """
        self.classifier = RandomForestClassifier(
            verbose=verbose,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
        )
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the name built in ``__init__``."""
        return self.name

    def fit(self, X, y, X_t, y_t):
        """Fit the wrapped classifier on ``(X, y)`` and return the result.

        ``X_t`` and ``y_t`` are accepted but not used by this method.
        """
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba(X)``."""
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        """Return a dict pairing ``feat_names`` with ``feature_importances_``.

        Pairing is positional (``zip``), so extra entries on either side
        are dropped.
        """
        return dict(zip(feat_names, self.classifier.feature_importances_))
class Model(BaseModel):
    """Antares implementation of scikit learn random forest classifier
    """

    def __init__(self, categorical_features=None, n_estimators=50, n_jobs=-1, max_depth=10):
        '''
        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        '''
        Hot-encode the training features and fit the underlying model.
        '''
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get confidence of every prediction

        The confidence is the largest value in each row of the underlying
        model's ``predict_proba`` output.
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        This method applies the model that this object represents to the
        given dataset using the response variable y and returns the value
        of the underlying model's ``score``. Usually the original dataset
        should be split into training and testing subsets to cross
        validate the model.
        '''
        # The model is fitted on hot-encoded features, so X must go through
        # the same encoding used by predict() before it is scored.
        X = self.hot_encode_predict(X)
        return self.model.score(X, y)
def train_model(X_train, y_train):
    """Fit a random forest on the training data and print its training AUC.

    Args:
        X_train: training features passed to ``RandomForestClassifier.fit``.
        y_train: training labels; label ``1`` is treated as the positive
            class for the ROC curve.

    Returns:
        The fitted ``RandomForestClassifier``.

    Raises:
        ValueError: if label ``1`` is not among the fitted classes.
    """
    print("training the model ...")
    rf = RandomForestClassifier(n_estimators=1000, max_depth=8, n_jobs=-1, verbose=1)
    rf.fit(X_train, y_train)

    y_pred_train = rf.predict_proba(X_train)
    # predict_proba columns follow rf.classes_; the ROC curve is drawn for
    # pos_label=1, so score with the column belonging to class 1 (using
    # column 0 would score the other class and invert the AUC).
    positive_column = list(rf.classes_).index(1)
    fpr, tpr, _ = roc_curve(y_train, y_pred_train[:, positive_column], pos_label=1)
    print("AUC on train : {:.02f} %".format(auc(fpr, tpr) * 100))
    return rf
class MyRfClassifier(BaseClassifier):
    """Wrap a ``RandomForestClassifier`` and label it by its hyperparameters.

    The worker count and verbosity were previously fixed at 40 and 1 inside
    the constructor; they are now optional keyword arguments carrying the
    same defaults, in line with ``MyRandomForestClassifier``.
    """

    def __init__(self, n_estimators, max_depth, min_samples_leaf,
                 n_jobs=40, verbose=1):
        """Build the wrapped classifier and the ``rf_n.._md.._ms..`` name.

        All arguments are forwarded to ``RandomForestClassifier``;
        ``n_jobs`` defaults to 40 and ``verbose`` to 1.
        """
        forest_kwargs = {
            'verbose': verbose,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
            'n_jobs': n_jobs,
        }
        self.classifier = RandomForestClassifier(**forest_kwargs)
        self.name = "rf_n{n}_md{md}_ms{ms}".format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the name assigned at construction time."""
        return self.name

    def fit(self, X, y, X_t, y_t):
        """Fit the wrapped classifier on ``(X, y)``.

        ``X_t`` and ``y_t`` are part of the signature but are not used here.
        """
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        """Delegate to the wrapped classifier's ``predict_proba``."""
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        """Map each name in ``feat_names`` to its feature importance.

        Names and importances are paired positionally with ``zip``.
        """
        return dict(zip(feat_names, self.classifier.feature_importances_))
class MyRandomForestClassifier(BaseClassifier):
    """Configurable random-forest wrapper with a hyperparameter-based name."""

    def __init__(self, verbose=1, n_estimators=2000, max_depth=8,
                 min_samples_leaf=10000, n_jobs=25):
        """Create the wrapped classifier and derive ``self.name``."""
        self.classifier = RandomForestClassifier(
            verbose=verbose,
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            n_jobs=n_jobs,
        )
        name_template = "rf_n{n}_md{md}_ms{ms}"
        self.name = name_template.format(
            n=n_estimators, md=max_depth, ms=min_samples_leaf)

    def get_name(self):
        """Return the name derived from the hyperparameters."""
        return self.name

    def fit(self, X, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        forest = self.classifier
        return forest.fit(X, y)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        forest = self.classifier
        return forest.predict_proba(X)

    def get_feature_importances(self, feat_names):
        """Return the raw ``feature_importances_`` of the wrapped classifier.

        ``feat_names`` is accepted but not used; the result is not keyed by
        feature name.
        """
        forest = self.classifier
        return forest.feature_importances_
def tune_model(X, y, K=5):
    """Grid-search random-forest hyperparameters with stratified K-fold CV.

    Every combination of the hard-coded grid is scored by the mean log-loss
    over ``K`` stratified folds. Progress is printed per combination and the
    best combination is written to ``data/s2_meta/best_tuning_rf.json``.

    Args:
        X: feature array; it is indexed with integer index arrays.
        y: label array, indexed the same way.
        K: number of stratified folds.

    Returns:
        List of result dicts (hyperparameters plus ``cv_logloss``), sorted
        by ascending ``cv_logloss``.
    """
    import itertools

    print("tuning the model ...")
    # Winner recorded from an earlier run:
    # max_features='sqrt', n_estimators=2000, min_samples_leaf=1
    params = {
        'max_features': ['auto', 'sqrt', 0.2, 0.4],
        'n_estimators': [10, 50, 100, 500, 1000, 2000],
        'min_samples_leaf': [0.01, 0.02, 0.05, 0.1, 0.15, 0.2],
        'max_depth': [None, 3, 5, 7, 8, 9, 10]
    }
    # np.product was removed in NumPy 2.0; np.prod is the supported spelling.
    nb_scenarios = np.prod([len(values) for values in params.values()])

    # The splitter has no per-scenario state, so build it once.
    kf = StratifiedKFold(n_splits=K)

    grid = itertools.product(params['max_features'],
                             params['n_estimators'],
                             params['min_samples_leaf'],
                             params['max_depth'])
    results = []
    for max_f, n_est, min_leaf, max_dep in grid:
        errors_fold = []
        for train_index, test_index in kf.split(X, y):
            X_train_bis, X_test = X[train_index], X[test_index]
            y_train_bis, y_test = y[train_index], y[test_index]

            rf = RandomForestClassifier(max_features=max_f,
                                        n_estimators=n_est,
                                        min_samples_leaf=min_leaf,
                                        max_depth=max_dep,
                                        n_jobs=-1,
                                        class_weight='balanced')
            rf.fit(X_train_bis, y_train_bis)
            y_pred_test = rf.predict_proba(X_test)
            errors_fold.append(log_loss(y_test, y_pred_test))

        result = {
            'max_features': max_f,
            'n_estimators': n_est,
            'min_samples_leaf': min_leaf,
            'max_depth': max_dep,
            'cv_logloss': np.mean(errors_fold)
        }
        results.append(result)

        print("=" * 10 + " {}/{} ".format(len(results), nb_scenarios) + "=" * 10)
        for key, value in result.items():
            print("{} : {}".format(key, value))

    results = sorted(results, key=lambda x: x['cv_logloss'])
    best_result = results[0]
    with open('data/s2_meta/best_tuning_rf.json', 'w') as fp:
        json.dump(best_result, fp, indent=4)
    return results
def runns(resp_var, size_of_test_data,dataset,positive_class,predictor_var, n_estimators,important_features,dealing_with_nulls):
    """Train a random forest on ``raw_data.csv`` and return metrics as JSON.

    Steps, in order: drop rows with a null response, apply
    ``deal_with_nulls``, dummy-encode the predictors, oversample with
    ``RandomOverSampler``, fit a forest to rank feature importances, then
    split, train a second forest and compute evaluation metrics.

    Args:
        resp_var: name of the response column.
        size_of_test_data: test fraction; converted with ``float``.
        dataset: overwritten on the first line by ``pd.read_csv``.
        positive_class: label passed as ``pos_label`` to the precision,
            recall and f1 scores.
        predictor_var: column selection used as predictors.
        n_estimators: number of trees for both forests.
        important_features: how many top-ranked feature names to slice.
        dealing_with_nulls: first argument passed to ``deal_with_nulls``.

    Returns:
        JSON string (``json.dumps``) of a dict of metric name -> value.
    """
    # NOTE(review): this discards the `dataset` argument; the trailing
    # comment says it is for testing purposes - confirm before relying on
    # the parameter.
    dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
    #----DATA PREPROCESSING
    #-------dealing with NULL values in the data
    #----------remove the rows in which the response is null
    dataset=dataset.dropna(subset=[resp_var])
    #----------dealing with nulls
    dataset=deal_with_nulls(dealing_with_nulls,dataset)
    #----FEATURE SELECTION
    #-------get predictors important in predicting the response
    #-----------transform categorical predictors to dummy variables
    predictors=dataset[predictor_var]
    predictors=pd.get_dummies(predictors)
    #-----------balance the classes in the response var
    ros = RandomOverSampler(random_state=0)
    resp=dataset[resp_var]
    # NOTE(review): oversampling happens before the train/test split below,
    # so duplicated rows can presumably land on both sides - confirm this is
    # intended.
    prds, resp = ros.fit_sample(predictors, resp)
    #-----------fit the random forest classifier to give us the important predictors
    rf_clf = RandomForestClassifier(n_estimators=n_estimators)
    rf_clf.fit(prds,resp)
    #-------get the important predictors
    feature_imp = pd.Series(rf_clf.feature_importances_, index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
    #-------names of the important predictors
    # NOTE(review): assigned here but not read again in this function, so
    # no subsetting to the important predictors actually takes place.
    important_predictor_names = feature_imp.index[0:important_features]
    #-------rebuild a single frame from the resampled response and predictors
    resp=pd.DataFrame(data=resp,columns=[resp_var])
    predictors=pd.DataFrame(prds,columns=list(predictors))
    dataset=pd.concat([resp,predictors],axis=1)
    #---------------------------------------------------------
    #----MODEL TRAINING
    #--------Remove the response variables from the features variables - axis 1 refers to the columns
    m_data= dataset.drop(resp_var, axis = 1,inplace=False)
    # Response variables are the values we want to predict
    # (from here on `resp_var` holds the label array, not the column name)
    resp_var = np.array(dataset[resp_var])
    dataset = pd.get_dummies(m_data)
    # Saving feature names for later use (not read again in this function)
    feature_list = list(m_data.columns)
    # Convert to numpy array
    dataset = np.array(dataset)
    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = float(size_of_test_data), random_state = 402)
    # Instantiate model with n_estimators decision trees
    clf = RandomForestClassifier(n_jobs = 1,n_estimators = n_estimators, random_state = 142)
    # Train the model on training data
    clf.fit(train_features, train_labels)
    # evaluation
    predicted = clf.predict(test_features)
    pred_prob = clf.predict_proba(test_features)
    accuracy = accuracy_score(test_labels, predicted)
    # confusion matrix (computed but not included in the output)
    cnf = (confusion_matrix(test_labels,predicted))
    # precision score
    precision = precision_score(test_labels,predicted,pos_label=positive_class)
    # average precision
    # NOTE(review): uses probability column 1 regardless of positive_class -
    # confirm that column corresponds to the positive label.
    avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
    # recall score
    rec = recall_score(test_labels,predicted,pos_label=positive_class)
    # f1 score
    fscore = f1_score(test_labels,predicted,pos_label=positive_class)
    # fbeta score
    # NOTE(review): no pos_label is passed here, unlike precision/recall/f1.
    fbeta = fbeta_score(test_labels,predicted,beta=0.5)
    # hamming loss
    hamming = hamming_loss(test_labels,predicted)
    # jaccard similarity score
    jaccard = jaccard_similarity_score(test_labels,predicted)
    # log loss
    # NOTE(review): fed the predicted labels, not pred_prob - verify.
    logloss = log_loss(test_labels,predicted)
    # zero-one loss
    zero_one = zero_one_loss(test_labels,predicted)
    # area under the ROC curve (again from probability column 1)
    area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
    # Cohen's kappa
    cohen = cohen_kappa_score(test_labels,predicted)
    # Matthews correlation coefficient
    mathews = matthews_corrcoef(test_labels,predicted)
    # Variable importances from the important features selection stage
    # (built but not included in the output)
    variable_importance_list = list(zip(prds, feature_imp))
    output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
    output=json.dumps(output)
    return output
# Visualize tree dot_data = StringIO.StringIO() tree.export_graphviz(clf, out_file=dot_data, feature_names=list(data_tree.columns.values)) graph = pydot.graph_from_dot_data(dot_data.getvalue()) graph.write_pdf('dectree.pdf') # Repeat on test set y_test_pred = clf.predict(X_test) print "Accuracy Test: {0:.3f}".format(metrics.accuracy_score(y_test, y_test_pred)) print print "Classification report:" print metrics.classification_report(y_test, y_test_pred) print print "Confusion matrix:" print metrics.confusion_matrix(y_test, y_test_pred) # Measure performance y_pred = clf.predict_proba(X_train) # Repeat on test set y_test_pred = clf.predict_proba(X_test) tt = g_test.as_matrix() pred = tt* y_test_pred ss = np.sum(pred, axis=1) sss = ss.mean() print sss
#:# model params = {'max_depth': 3, 'n_estimators': 75} classifier = RandomForestClassifier(**params) classifier.fit(X_train, y_train) #:# hash #:# 5475503c9e4b64dc0dcc4960399cf72c md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest() print(f'md5: {md5}') #:# audit y_pred = classifier.predict(transform_pipeline.transform(X_test)) y_pred_proba = classifier.predict_proba( transform_pipeline.transform(X_test))[:, 1] tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel() print(f'acc: {accuracy_score(y_test, y_pred)}') print(f'auc: {roc_auc_score(y_test, y_pred_proba)}') print(f'precision: {precision_score(y_test, y_pred)}') print(f'recall: {recall_score(y_test, y_pred)}') print(f'specificity: {tn/(tn+fp)}') print(f'f1: {f1_score(y_test, y_pred)}') #:# session info # Dodaj wersję pythona w session info sessionInfo = {
def RF_trainandtest(self, unionscores, cutscore, testsize, cv, feature_sel, varthreshold, ntrees, nodes, rfmethod, nclusters, cmethod, resmethod):
    """Train a tree ensemble on a train/test split and score the test set.

    Args:
        unionscores: when equal to ``True`` the ``*_score`` sub-score
            columns are kept as features; otherwise they are dropped too.
        cutscore: rows with ``rsk_score < cutscore`` get target 1.
        testsize: ``test_size`` for ``train_test_split``; when 0 the first
            five training rows stand in as the test set.
        cv: ``cv`` argument for ``RFECV``.
        feature_sel: "VarianceThreshold", "RFECV", "SelectFromModel" or
            "SelectKBest"; any other value disables feature selection.
        varthreshold: threshold for ``VarianceThreshold``.
        ntrees: ``n_estimators`` of the classifier.
        nodes: ``min_samples_leaf``; ``min_samples_split`` is twice this.
        rfmethod: 'RandomForest', 'ExtraTrees' or 'GradientBoosting'.
        nclusters, cmethod: forwarded to ``binandwoe_traintest_pkl``.
        resmethod: forwarded to ``imbalanceddata``.

    Returns:
        DataFrame with ``target`` and ``probability`` columns joined with
        the transformed test features.

    Raises:
        ValueError: if ``rfmethod`` is not a supported name. Previously
            this surfaced later as an unbound ``classifier`` variable.
    """
    estimator_types = {
        'RandomForest': RandomForestClassifier,
        'ExtraTrees': ExtraTreesClassifier,
        'GradientBoosting': GradientBoostingClassifier,
    }
    # Fail before any expensive work instead of hitting an unbound local.
    if rfmethod not in estimator_types:
        raise ValueError("unsupported rfmethod: {!r}".format(rfmethod))

    # Split the data set into a training set and a test set.
    # The equality test is kept as-is so non-bool arguments behave as before.
    if unionscores == True:
        data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'rsk_score'], axis = 1)
    else:
        data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'cst_score', 'cnp_score', 'cnt_score', 'chv_score', 'dsi_score','rsk_score'], axis = 1)
    data_target = (self.data['rsk_score'] < cutscore).astype('int')
    X_train, X_test, y_train, y_test = train_test_split(data_feature, data_target, test_size=testsize, random_state=0)
    if testsize == 0:
        X_test, y_test = X_train.head(5), y_train.head(5)

    # Coarse-classify and WOE-transform the training set, then apply the
    # same binning and transformation to the test set.
    X_train, X_test = self.binandwoe_traintest_pkl(X_train, y_train, X_test, nclusters, cmethod, self.label)

    # Feature selection on the training set (sklearn.feature_selection).
    selector = None
    if feature_sel == "VarianceThreshold":
        selector = VarianceThreshold(threshold = varthreshold)
        selected = selector.fit_transform(X_train)
    elif feature_sel == "RFECV":
        selector = RFECV(LogisticRegression(), step=1, cv=cv)
        selected = selector.fit_transform(X_train, y_train)
    elif feature_sel == "SelectFromModel":
        selector = SelectFromModel(LogisticRegression())
        selected = selector.fit_transform(X_train, y_train)
    elif feature_sel == "SelectKBest":
        selector = SelectKBest()
        selected = selector.fit_transform(X_train, y_train)

    if selector is None:
        X_train1, X_test1 = X_train, X_test
    else:
        # Restore the names of the surviving columns and keep the same
        # columns in the test set.
        X_train1 = pd.DataFrame(selected)
        X_train1.columns = X_train.columns[selector.get_support(True)]
        X_test1 = X_test[X_train1.columns]
    testcolumns = X_test1.columns

    # Resample the training data to address class imbalance.
    X_train1, y_train = self.imbalanceddata (X_train1, y_train, resmethod)

    # Train the chosen ensemble and predict on the test set.
    classifier = estimator_types[rfmethod](n_estimators=ntrees, min_samples_split=nodes*2, min_samples_leaf=nodes)
    classifier.fit(X_train1, y_train)
    probability = classifier.predict_proba(X_test1)

    predresult = pd.DataFrame({'target' : y_test, 'probability' : probability[:,1]})
    predresult = pd.concat([predresult, X_test], axis = 1)

    # label is None while experimenting; a non-None label means the model
    # and its column list are saved for production use.
    if self.label is not None:
        joblib.dump(classifier, "allinpay projects\\creditscore_TLSW_fyz\\pkl\\classifier_" + self.label + '.pkl')
        joblib.dump(testcolumns, "allinpay projects\\creditscore_TLSW_fyz\\pkl\\testcolumns_" + self.label + '.pkl')

    return predresult
feature_labels = dataset_features[ds_label][fe_label][ 'feature_labels'] X = np.copy( dataset_features[ds_label][fe_label]['features']) X = np.nan_to_num(X) feat_train = X[train_index, :] feat_test = X[test_index, :] # feature normalization & model training feat_train = scaler.fit_transform(feat_train) classifier.fit(feat_train, target_train) feature_importance[(d, f, i)] = classifier.feature_importances_ # feature scaling & prediction feat_test = scaler.transform(feat_test) y_pred_all[(d, f, i)] = classifier.predict_proba(feat_test) y_true_all[(d, f, i)] = y[test_index] all_file_id = np.array([ dataset.file_label_to_id[_] for _ in dataset.metadata['fn_wav'].tolist() ]) file_id_all[(d, f, i)] = all_file_id[test_index] with open(os.path.join(dir_results, 'feature_importance.pckl'), 'wb+') as f: pickle.dump(feature_importance, f) f1_scores = np.zeros((num_datasets, num_extractors, num_folds)) f1_scores_file = np.zeros((num_datasets, num_extractors, num_folds)) # iterate over datasets
def _collect_class_curves(clf, y_true, y_prob):
    """Return one-vs-rest ROC and PR curve data for every class of ``clf``.

    The result is ``(fprs, tprs, aucs, recalls, precisions, aps)``, each a
    list with one entry per class in ``clf.classes_`` order.
    """
    fprs, tprs, aucs = [], [], []
    recalls, precisions, aps = [], [], []
    for c, class_label in enumerate(clf.classes_):
        y_true_binary = y_true == class_label
        y_pred_binary = y_prob[:, c]
        fpr, tpr, _ = roc_curve(y_true_binary, y_pred_binary, pos_label=1)
        auc = roc_auc_score(y_true_binary, y_pred_binary)
        precision, recall, _ = precision_recall_curve(y_true_binary, y_pred_binary)
        ap = average_precision_score(y_true_binary, y_pred_binary)
        fprs.append(fpr.tolist())
        tprs.append(tpr.tolist())
        aucs.append(auc)
        recalls.append(recall.tolist())
        precisions.append(precision.tolist())
        aps.append(ap)
    return fprs, tprs, aucs, recalls, precisions, aps


def _build_performance_report(clf, x, y_true, y_prob):
    """Assemble the performance dictionary that is written to pfmn.json."""
    fprs, tprs, aucs, recalls, precisions, aps = _collect_class_curves(
        clf, y_true, y_prob)

    y_pred_label = clf.predict(x)
    cfmt = confusion_matrix(y_true, y_pred_label).tolist()
    top1_acc = top_k(y_true, y_prob, clf.classes_, k=1)
    top5_acc = top_k(y_true, y_prob, clf.classes_, k=5)

    pfmn_dict = {}
    pfmn_dict['graphs'] = []

    # ROC curves
    graph_roc = {}
    graph_roc['name'] = 'ROC曲线'
    graph_roc['x_title'] = 'fpr'
    graph_roc['y_title'] = 'tpr'
    graph_roc['lines'] = []
    for i, (fpr, tpr, auc) in enumerate(zip(fprs, tprs, aucs)):
        line = {}
        line['name'] = 'label为{}的ROC曲线'.format(i)
        line['relative'] = [{'name': 'auc', 'value': auc}]
        line['x_axis'] = fpr
        line['y_axis'] = tpr
        graph_roc['lines'].append(line)
    pfmn_dict['graphs'].append(graph_roc)

    # Precision-recall curves
    graph_pr = {}
    graph_pr['name'] = 'PR曲线'
    # A stray trailing comma used to turn this title into a one-element
    # tuple; it is a plain string like the other axis titles.
    graph_pr['x_title'] = 'recall'
    graph_pr['y_title'] = 'precision'
    graph_pr['lines'] = []
    for i, (recall, precision, ap) in enumerate(zip(recalls, precisions, aps)):
        line = {}
        line['name'] = 'label为{}的PR曲线'.format(i)
        line['relative'] = [{'name': 'ap', 'value': ap}]
        line['x_axis'] = recall
        line['y_axis'] = precision
        graph_pr['lines'].append(line)
    pfmn_dict['graphs'].append(graph_pr)

    # Confusion matrix
    matrix = {}
    matrix['name'] = '混淆矩阵'
    matrix['col_name'] = clf.classes_.tolist()
    matrix['row_name'] = clf.classes_.tolist()
    matrix['elements'] = cfmt
    pfmn_dict['matrixs'] = [matrix]

    # Scalar metrics; top5 is reported only when top_k returns a truthy value.
    pfmn_dict['evaluation'] = [{'name': "top1", 'value': top1_acc}]
    if top5_acc:
        pfmn_dict['evaluation'].append({'name': 'top5', 'value': top5_acc})
    return pfmn_dict


def _write_performance_report(pfmn_dict):
    """Serialize the performance dictionary to ``pfmn.json``."""
    pfmn_str = json.dumps(pfmn_dict)
    with open('pfmn.json', 'w') as f:
        f.write(pfmn_str)


def main():
    """Train or apply a random forest on ``iris.csv`` according to ``args``.

    ``args.operMode`` selects the mode:

    * ``'TRAINING'``: split the data, fit a ``RandomForestClassifier``,
      pickle it to ``rf.pkl`` and write validation metrics to ``pfmn.json``.
    * ``'PREDICTION'``: load the pickled model from ``args.load_model``,
      predict, and write metrics to ``pfmn.json`` when ``args.has_label``
      is set.

    Fixes over the previous version: the TRAINING branch appended the top5
    entry to an undefined name ``pfmn`` (NameError whenever top5 was
    truthy), and the PR graph's ``x_title`` was a tuple because of a stray
    trailing comma. The duplicated evaluation code is shared via helpers.

    Raises:
        Exception: if ``args.operMode`` is neither mode.
    """
    operMode = args.operMode
    logging.info('Random fortest work on operMode: {}'.format(operMode))
    input_in1_file = 'iris.csv'
    df = pd.read_csv(input_in1_file)

    if operMode == 'TRAINING':
        label_name = args.label_name
        n_estimators = args.n_estimators
        shuffle = args.shuffle
        split_ratio = args.split_ratio
        criterion = args.criterion
        max_features = args.max_features
        max_depth = args.max_depth
        min_samples_split = args.min_samples_split
        min_samples_leaf = args.min_samples_leaf
        min_weight_fraction_leaf = args.min_weight_fraction_leaf
        min_impurity_decrease = args.min_impurity_decrease
        bootstrap = args.bootstrap
        n_jobs = args.n_jobs
        logging.info('model parameter as follow:\n'
                     'label_name: {}\n'
                     'n_estimators: {}\n'
                     'split_ratio: {}\n'
                     'shuffle: {}\n'
                     'criterion: {}\n'
                     'max_featrues: {}\n'
                     'max_depth: {}\n'
                     'min_samples_split: {}\n'
                     'min_samples_leaf: {}\n'
                     'min_weight_fraction_leaf: {}\n'
                     'min_impurity_decrease: {}\n'
                     'bootstrap: {}\n'
                     'n_jobs: {}'.format(label_name, n_estimators, split_ratio, shuffle, criterion,
                                         max_features, max_depth, min_samples_split, min_samples_leaf,
                                         min_weight_fraction_leaf, min_impurity_decrease, bootstrap, n_jobs))

        tra_df, val_df = train_val_split(df, ratio=split_ratio, shuffle=shuffle)
        columns = df.columns.tolist()
        tra_y = tra_df[label_name].values
        val_y = val_df[label_name].values
        columns.remove(label_name)
        tra_x = tra_df[columns].values
        val_x = val_df[columns].values

        logging.info("Random Fortest Training Start...")
        try:
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         criterion=criterion,
                                         max_depth=max_depth,
                                         min_samples_split=min_samples_split,
                                         min_samples_leaf=min_samples_leaf,
                                         min_weight_fraction_leaf=min_weight_fraction_leaf,
                                         max_features=max_features,
                                         min_impurity_decrease=min_impurity_decrease,
                                         bootstrap=bootstrap,
                                         n_jobs=n_jobs).fit(tra_x, tra_y)
        except Exception as e:
            # NOTE(review): the process exits with status 0 after logging
            # the failure; kept as-is in case callers rely on it.
            logging.error("Unexpected Error {}".format(e))
            exit(0)

        logging.info("Random Fortest Training End and Stroe Model...")
        with open("rf.pkl", "wb") as f:
            pickle.dump(clf, f)

        val_y_pred_prob = clf.predict_proba(val_x)
        pfmn_dict = _build_performance_report(clf, val_x, val_y, val_y_pred_prob)
        _write_performance_report(pfmn_dict)
        logging.info('Random Fortest Model Evaluation finished!')

    elif operMode == 'PREDICTION':
        has_label = args.has_label
        label_name = args.label_name
        load_model = args.load_model
        logging.info('model parameter configure as follow:\n'
                     'has_label: {}\n'
                     'label_name: {}\n'
                     'load_model: {}\n'.format(has_label, label_name, load_model))

        if has_label:
            if label_name is None:
                logging.error('if parameter has_label is true, label_name must not be none')
                exit(0)
            columns = df.columns.tolist()
            test_y = df[label_name].values
            columns.remove(label_name)
            test_x = df[columns].values
        else:
            test_x = df.values

        logging.info("Random Fortest Load Model ")
        model_path = load_model
        if not os.path.exists(model_path):
            logging.error('Unexpected Error {}'.format(
                'model file {} will be loaded not exists!'.format(model_path)))
            exit(0)
        # SECURITY NOTE: unpickling executes code embedded in the file;
        # only load model files from a trusted location.
        with open(model_path, 'rb') as f:
            clf = pickle.load(f)

        test_y_pred_prob = clf.predict_proba(test_x)
        if has_label:
            pfmn_dict = _build_performance_report(clf, test_x, test_y, test_y_pred_prob)
            _write_performance_report(pfmn_dict)
    else:
        logging.fatal('Random fortest not support {}'.format(operMode))
        raise Exception('Random fortest not support {}'.format(operMode))
cv_model.cv_results_ ### ASSESS BEST PARAMS TREE AND SCORE tree_model = RandomForestClassifier(random_state=297, **cv_model.best_params_) ####ONLY IF THE PREVIOUS MODEL IS A SearchCV tree_model = tree_model.fit(trainX, trainY.values.ravel()) tree_model.score(trainX, trainY) tree_model.score(testX, testY) ### CHECK IMPORTANCE OF FEATURES feature_importance = pd.DataFrame(tree_model.feature_importances_, index=trainX.columns, columns=['Imp']).reset_index() feature_importance['pk'] = 1 plot_scatter(feature_importance, 'index', 'Imp', 'index') plot_bar(feature_importance, 'index', 'Imp', 'index') ### PREDICT prediction = tree_model.predict(features_all) tree_model.predict_proba(features_all) #### VISUALIZE TREE ### ONLY FOR SIMPLE DECISION TREE # tree.export_graphviz(tree_model, # feature_names=list(trainX.columns), # out_file='/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.dot') # (graph,) = pydot.graph_from_dot_file('/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.dot') # graph.write_png('/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/tree.png') skplt.metrics.plot_confusion_matrix(target, prediction, normalize=True) pd.crosstab(target['Survived'], prediction) sum(target['Survived'])
''' oversampled_path = "resources/oversampled_normalized_data_ratio_2.bin" homesite = Data() homesite.load_sliptted_data(oversampled_path) del homesite.test_x # Deleted to save memory. print homesite.train_x.shape # Creating classifier. # clf = DecisionTreeClassifier() clf = RandomForestClassifier(max_features=100) # clf = AdaBoostClassifier(n_estimators = 10) # clf = svm.SVC(gamma = 0.00005) # clf = RandomForestClassifier() # clf = MultiplePLS(n_classifiers = 10, n_samples = 5000, n_positive_samples = 2500, threshold = 0.9, acc = 0.999) # clf = svm.LinearSVC() # Train classifier. print "Training classifier." clf.fit(homesite.train_x, homesite.train_y) # Test classifier. print 'Testing classifier.' predicted_labels = clf.predict_proba(homesite.validation_x)[:, 1] # Show final results. results = confusion_matrix(homesite.validation_y, np.round(predicted_labels)) accuracy, precision, recall = compute_performance_metrics(results) auc = compute_auc(homesite.validation_y, predicted_labels)
from util import convert_gray_scale, flatten
# Unpack the (features, labels) pairs of the training and test sets.
Xr,Yr = training_set
Xe,Ye = test_set
# Convert to gray scale, then flatten, using the project's util helpers.
Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))
# NOTE(review): compute_importances is presumably only accepted by old
# scikit-learn releases - confirm against the installed version.
rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True, compute_importances=True)
rf.fit(Xr, Yr)
Yp = rf.predict(Xe)
# Fraction of test samples whose prediction equals the true label.
print np.mean(Yp == Ye)
# Highest class probability per test sample.
Ypp = rf.predict_proba(Xe).max(axis=1)
plt.figure(1)
plt.clf()
# Overlay the probability histograms of correct (blue) and incorrect (red)
# predictions.
# NOTE(review): `normed` is presumably rejected by newer matplotlib
# releases - confirm against the installed version.
plt.hist(Ypp[Yp == Ye], 50, color='b', normed=True, alpha=0.4, label='classified')
plt.hist(Ypp[Yp != Ye], 50, color='r', normed=True, alpha=0.4, label='misclassified')
plt.legend(loc='upper left')
plt.draw()
plt.show()
plt.figure(3)
plt.clf()
# One percent of the number of test predictions, as a float.
n = 0.01 * float(len(Yp))
# normalize=True) #clf = xgbwrapper.XgbWrapper({'objective': 'binary:logistic', # 'eval_metric': 'auc', # 'eta': 0.1, # 'silent': 1, # 'max_delta_step': 1}) # 'Normal' 70 / 30 cross-validation if do_cross_val == 1: X_train, X_test, y_train, y_test = cross_validation.train_test_split( X, train.WnvPresent, test_size=0.3, random_state=0) clf.fit(X_train, y_train) y_pred = clf.predict_proba(X_test) print(metrics.roc_auc_score(y_test, y_pred)) elif do_cross_val == 2: # Leave-one-year-out cross-validation scores = [] total_pred = np.array([]) total_test = np.array([]) for year in [2007, 2009, 2011, 2013]: X_train, X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split( train_for_loo, 'WnvPresent_DateTrapSpecies', year) X_train.to_csv("data_per_year/" + str(year) + "X_train.csv",
def RF_trainandtest_kfold(self, unionscores, nsplit, cutscore, cv, feature_sel, varthreshold, ntrees, nodes, rfmethod, nclusters, cmethod, resmethod):
    """Train and score a tree ensemble with shuffled K-fold cross-validation.

    Each fold is used once as the test set while the remaining rows form
    the training set; test predictions from all folds are concatenated.

    Args:
        unionscores: when equal to ``True`` the ``*_score`` sub-score
            columns are kept as features; otherwise they are dropped too.
        nsplit: number of folds.
        cutscore: rows with ``rsk_score < cutscore`` get target 1.
        cv: ``cv`` argument for ``RFECV``.
        feature_sel: "VarianceThreshold", "RFECV", "SelectFromModel" or
            "SelectKBest"; any other value disables feature selection.
        varthreshold: threshold for ``VarianceThreshold``.
        ntrees: ``n_estimators`` of the classifier.
        nodes: ``min_samples_leaf``; ``min_samples_split`` is twice this.
        rfmethod: 'RandomForest', 'ExtraTrees' or 'GradientBoosting'.
        nclusters, cmethod: forwarded to ``binandwoe_traintest``.
        resmethod: forwarded to ``imbalanceddata``.

    Returns:
        DataFrame with ``target`` and ``probability`` columns covering every
        fold that was not skipped.

    Raises:
        ValueError: if ``rfmethod`` is not a supported name. Previously
            this surfaced inside the loop as an unbound ``classifier``.
    """
    estimator_types = {
        'RandomForest': RandomForestClassifier,
        'ExtraTrees': ExtraTreesClassifier,
        'GradientBoosting': GradientBoostingClassifier,
    }
    # Fail before any expensive work instead of hitting an unbound local.
    if rfmethod not in estimator_types:
        raise ValueError("unsupported rfmethod: {!r}".format(rfmethod))

    # The equality test is kept as-is so non-bool arguments behave as before.
    if unionscores == True:
        data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'rsk_score'], axis = 1)
    else:
        data_feature = self.data.drop(['name', 'idCard', 'mobileNum', 'cardNum', 'cst_score', 'cnp_score', 'cnt_score', 'chv_score', 'dsi_score','rsk_score'], axis = 1)
    data_target = (self.data['rsk_score'] < cutscore).astype('int')

    # Split the data into k segments; each segment serves once as the test
    # set while the rest of the data is the training set.
    kf = KFold(n_splits=nsplit, shuffle=True)
    predresult = pd.DataFrame()
    for train_index, test_index in kf.split(data_feature):
        X_train, X_test = data_feature.iloc[train_index, ], data_feature.iloc[test_index, ]
        y_train, y_test = data_target.iloc[train_index, ], data_target.iloc[test_index, ]

        # Skip this fold if random sampling left only one class in either
        # the training or the test labels.
        if (len(y_train.unique()) == 1) or (len(y_test.unique()) == 1):
            continue

        # Coarse-classify and WOE-transform the training set, then apply
        # the same binning and transformation to the test set.
        X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test, nclusters, cmethod)

        # Feature selection on the training set (sklearn.feature_selection).
        selector = None
        if feature_sel == "VarianceThreshold":
            selector = VarianceThreshold(threshold = varthreshold)
            selected = selector.fit_transform(X_train)
        elif feature_sel == "RFECV":
            selector = RFECV(LogisticRegression(), step=1, cv=cv)
            selected = selector.fit_transform(X_train, y_train)
        elif feature_sel == "SelectFromModel":
            selector = SelectFromModel(LogisticRegression())
            selected = selector.fit_transform(X_train, y_train)
        elif feature_sel == "SelectKBest":
            selector = SelectKBest()
            selected = selector.fit_transform(X_train, y_train)

        if selector is None:
            X_train1, X_test1 = X_train, X_test
        else:
            # Restore the names of the surviving columns and keep the same
            # columns in the test set.
            X_train1 = pd.DataFrame(selected)
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]

        # Resample the training data to address class imbalance.
        X_train1, y_train = self.imbalanceddata (X_train1, y_train, resmethod)

        # Train the chosen ensemble and predict on this fold's test set.
        classifier = estimator_types[rfmethod](n_estimators=ntrees, min_samples_split=nodes*2, min_samples_leaf=nodes)
        classifier.fit(X_train1, y_train)
        probability = classifier.predict_proba(X_test1)

        temp = pd.DataFrame({'target' : y_test, 'probability' : probability[:,1]})
        predresult = pd.concat([predresult, temp], ignore_index = True)

    return predresult
# Unpack the (features, labels) pairs of the training and test sets.
Xr, Yr = training_set
Xe, Ye = test_set
# Convert to gray scale, then flatten, before feeding the forest.
Xr = flatten(convert_gray_scale(Xr))
Xe = flatten(convert_gray_scale(Xe))
# NOTE(review): compute_importances is presumably only accepted by old
# scikit-learn releases - confirm against the installed version.
rf = RandomForestClassifier(n_estimators=100, verbose=3, oob_score=True, compute_importances=True)
rf.fit(Xr, Yr)
Yp = rf.predict(Xe)
# Fraction of test samples whose prediction equals the true label.
print np.mean(Yp == Ye)
# Highest class probability per test sample.
Ypp = rf.predict_proba(Xe).max(axis=1)
plt.figure(1)
plt.clf()
# Probability histograms of correct (blue) versus incorrect (red)
# predictions.
# NOTE(review): `normed` is presumably rejected by newer matplotlib
# releases - confirm against the installed version.
plt.hist(Ypp[Yp == Ye], 50, color='b', normed=True, alpha=0.4, label='classified')
plt.hist(Ypp[Yp != Ye], 50, color='r', normed=True, alpha=0.4, label='misclassified')
from data.numpy_file import save_np_array, load_np_array
from data.plot import plot
import numpy as np
import pandas as pd
# NOTE(review): `statistics` here must be a project package; it shares its
# name with the standard-library module, so import order/path matters.
from statistics.confusion_matrix import confusion_matrix
from statistics.performance import compute_performance_metrics, compute_auc

# Script entry point: load the parsed data, normalise it, oversample the
# training split, fit a random forest and write the class-1 probabilities of
# the test split into a submission CSV.
if __name__ == '__main__':
    ''' Classify data changing balancing ratio. '''
    # Train and test random forests.
    path = "../homesite_data/resources/parsed_data.bin"
    homesite = Data()
    homesite.load_parsed_data(path)
    homesite.z_norm_train_test_by_feature()

    # Only the training split is resampled; the test split is left untouched.
    sm = OverSampler(verbose=False, ratio=2.5)
    homesite.train_x, homesite.train_y = sm.fit_transform(
        homesite.train_x, homesite.train_y)

    clf = RandomForestClassifier(n_estimators=300, max_features=100, n_jobs=4)

    # Train classifier.
    print "Training classifier."
    clf.fit(homesite.train_x, homesite.train_y)

    # Column 1 of predict_proba is used as the submitted score.
    predicted_labels = clf.predict_proba(homesite.test_x)[:, 1]

    # The sample submission supplies the row layout; only the flag column is
    # overwritten before saving.
    sample = pd.read_csv('../input/sample_submission.csv')
    sample.QuoteConversion_Flag = predicted_labels
    sample.to_csv('rfc_300.csv', index=False)
print mask.sum() X = images[mask, ...].reshape(mask.sum(), np.prod(images.shape[1::])) print X.shape Y = classifications[mask] acc = [] acc_correct = [] acc_incorrect = [] acc_x_incorrect = [] k_fold = 8 for train_inx, valid_inx in StratifiedKFold(Y, k_fold): rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True, compute_importances=True) rf.fit(X[train_inx], Y[train_inx]) Yp = rf.predict(X[valid_inx]) correct = Yp== Y[valid_inx] rf.predict_proba(X[valid_inx]) p_correct = rf.predict_proba(X[valid_inx]).max(axis=1) acc_correct.append(p_correct[correct]) acc_incorrect.append(p_correct[~correct]) score = correct.mean() print score acc.append(score) acc_x_incorrect.append([images[mask][valid_inx[~correct]], Y[valid_inx[~correct]], Yp[~correct]]) print 'score', np.mean(acc) rf = RandomForestClassifier(n_estimators=100, verbose=0, oob_score=True, compute_importances=True)
# Fit gradient boosting, trace the log-loss after every boosting stage on the
# test and training data, then fit a random forest sized by the best stage.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

test_loss = np.zeros((params['n_estimators'],), dtype=np.float64)
train_loss = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(clf.staged_decision_function(X_test)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    # Sigmoid turns the raw decision value into a probability for log_loss.
    y_sig = (1.0 / (1.0 + np.exp(0.0 - y_pred)))
    test_loss[i] = log_loss(y_test, y_sig)

for i, y_pred in enumerate(clf.staged_decision_function(X_train)):
    # clf.loss_ assumes that y_test[i] in {0, 1}
    y_sig = (1.0 / (1.0 + np.exp(0.0 - y_pred)))
    train_loss[i] = log_loss(y_train, y_sig)

plt.figure()
plt.plot(test_loss, 'r', linewidth=2)
plt.plot(train_loss, 'g', linewidth=2)
plt.legend(['test', 'train'])

# i is the zero-based index of the stage with the lowest test log-loss.
i = np.argmin(test_loss)
print('min log-loss: ', np.round(test_loss[i],2), ' iteration#: ', i)

# Stage index i corresponds to i + 1 fitted trees. Passing i directly was off
# by one and produced n_estimators=0 (rejected by scikit-learn) whenever the
# first stage was the best one.
rfc = RandomForestClassifier(random_state=241, n_estimators=int(i) + 1)
rfc.fit(X_train, y_train)
y_pred = rfc.predict_proba(X_test)
print('RandomForest log-loss: ', np.round(log_loss(y_test, y_pred),2))
# 10-fold evaluation: every fold trains a fresh forest on the training rows
# and stores hard 0/1 predictions for the validation rows.
kf = KFold(n_splits=10)
predictions = []
print('PCA with RandomForest model training...')
for train_index, val_index in kf.split(df_features):
    Train_X, Train_Y = df_features.iloc[train_index], df_label.iloc[train_index]
    Val_X = df_features.iloc[val_index]

    clf = RandomForestClassifier(n_estimators=50,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 oob_score=True)
    clf.fit(Train_X, Train_Y)

    # Column 1 of predict_proba, binarised in place at the 0.44 cut-off.
    predict_Val_Y = clf.predict_proba(Val_X)[:, 1]
    at_or_below = predict_Val_Y <= 0.44
    above = predict_Val_Y > 0.44
    predict_Val_Y[at_or_below] = 0
    predict_Val_Y[above] = 1
    predictions.append(predict_Val_Y)

# Stitch the per-fold predictions back together in fold order.
predictions = np.concatenate(predictions, axis=0)

# Share of predictions that equal df_label (stored under the name `precision`).
matches = predictions == df_label
precision = np.count_nonzero(matches) / len(predictions)
print(precision)
tree.export_graphviz(clf, out_file=dot_data, feature_names=list(data_tree.columns.values)) graph = pydot.graph_from_dot_data(dot_data.getvalue()) graph.write_pdf('dectree.pdf') # Repeat on test set y_test_pred = clf.predict(X_test) print "Accuracy Test: {0:.3f}".format( metrics.accuracy_score(y_test, y_test_pred)) print print "Classification report:" print metrics.classification_report(y_test, y_test_pred) print print "Confusion matrix:" print metrics.confusion_matrix(y_test, y_test_pred) # Measure performance y_pred = clf.predict_proba(X_train) # Repeat on test set y_test_pred = clf.predict_proba(X_test) tt = g_test.as_matrix() pred = tt * y_test_pred ss = np.sum(pred, axis=1) sss = ss.mean() print sss
# Three-way target: -1 below q20, +1 above q80, 0 in between.
df['group'] = 0
df.loc[df.logerror < q20, 'group'] = -1
df.loc[df.logerror > q80, 'group'] = 1

# create train and test set
X_train, X_val, y_train, y_val, scaler = create_inputs_model(
    df.drop('logerror', axis=1), test_size=0.25)

# Predict the class
rfc = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=-1, max_depth=8)
rfc.fit(X_train, y_train)
y_pred_val = rfc.predict_proba(X_val)

# predict_proba columns follow rfc.classes_. With the groups {-1, 0, 1} the
# hard-coded column 1 is class 0, not the pos_label=1 class that is scored
# below, so the column is looked up instead. (When the classes are just
# {0, 1} this resolves to column 1, as before.)
pos_col = list(rfc.classes_).index(1)
fpr, tpr, thresholds = roc_curve(y_val, y_pred_val[:, pos_col], pos_label=1)
print("AUC on test : {:.02f} %".format(auc(fpr, tpr) * 100))

#most important features
importances = rfc.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]
feat_names = df.drop(['logerror', 'group'], axis=1).columns.values
for f in range(X_val.shape[1]):
    print("{}. feature {} - {} :({:.06f})".format(f + 1, indices[f], feat_names[indices[f]], importances[indices[f]]))

catalog = describe_features(df)
def runns(resp_var, size_of_test_data, dataset, positive_class, predictor_var,
          n_estimators, important_features, dealing_with_nulls):
    """Train a random forest on an oversampled data set and report its metrics.

    The rows with a missing response are dropped, remaining nulls are handled
    by ``deal_with_nulls``, the predictors are one-hot encoded, the classes are
    balanced with ``RandomOverSampler`` and a ``RandomForestClassifier`` is
    evaluated on a hold-out split.

    Args:
        resp_var: Name of the response column.
        size_of_test_data: Hold-out fraction; converted with ``float()``.
        dataset: Currently ignored, see the NOTE(review) below.
        positive_class: Label passed as ``pos_label`` to precision, recall,
            F1 and F-beta.
        predictor_var: Column name(s) used as predictors.
        n_estimators: Number of trees for both forests.
        important_features: Number of top-ranked predictor names to compute
            (the ranking is not applied to the data, see NOTE(review)).
        dealing_with_nulls: Strategy forwarded to ``deal_with_nulls``.

    Returns:
        JSON string with the keys ``accuracy``, ``precision``,
        ``average precision``, ``recall``, ``fscore``, ``fbeta``, ``hamming``,
        ``jaccard``, ``logloss``, ``zero_one``, ``area_under_roc``, ``cohen``
        and ``mathews``.
    """
    # NOTE(review): this overwrites the `dataset` argument with a fixed CSV
    # ("For testing purposes"). Left as is because callers may rely on it;
    # confirm and remove so the parameter takes effect.
    dataset = pd.read_csv('raw_data.csv', low_memory=False)  # For testing purposes

    # ---- DATA PREPROCESSING
    # Remove the rows in which the response is null, then handle other nulls.
    dataset = dataset.dropna(subset=[resp_var])
    dataset = deal_with_nulls(dealing_with_nulls, dataset)

    # ---- FEATURE SELECTION
    # Transform categorical predictors to dummy variables.
    predictors = pd.get_dummies(dataset[predictor_var])

    # Balance the classes in the response variable. imbalanced-learn renamed
    # fit_sample to fit_resample and later removed the old name; use whichever
    # the installed version offers.
    ros = RandomOverSampler(random_state=0)
    resample = getattr(ros, 'fit_resample', None) or ros.fit_sample
    prds, resp = resample(predictors, dataset[resp_var])

    # Rank the predictors with a first random forest.
    rf_clf = RandomForestClassifier(n_estimators=n_estimators)
    rf_clf.fit(prds, resp)
    feature_imp = pd.Series(
        rf_clf.feature_importances_,
        index=list(predictors.iloc[:, 0:])).sort_values(ascending=False)
    # NOTE(review): the top names are computed but never used to subset the
    # data; the model below is trained on all predictors.
    important_predictor_names = feature_imp.index[0:important_features]

    # Rebuild one frame holding the resampled response and predictors.
    resp = pd.DataFrame(data=resp, columns=[resp_var])
    predictors = pd.DataFrame(prds, columns=list(predictors))
    dataset = pd.concat([resp, predictors], axis=1)

    # ---- MODEL TRAINING
    # Separate the features from the response (axis 1 refers to the columns).
    m_data = dataset.drop(resp_var, axis=1, inplace=False)
    # The response values go into their own name; the original rebound the
    # `resp_var` parameter to this array.
    labels = np.array(dataset[resp_var])
    features = np.array(pd.get_dummies(m_data))

    # Split the data into training and testing sets.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=float(size_of_test_data), random_state=402)

    # Instantiate and train the model with n_estimators decision trees.
    clf = RandomForestClassifier(n_jobs=1, n_estimators=n_estimators, random_state=142)
    clf.fit(train_features, train_labels)

    # ---- EVALUATION
    predicted = clf.predict(test_features)
    pred_prob = clf.predict_proba(test_features)

    accuracy = accuracy_score(test_labels, predicted)
    precision = precision_score(test_labels, predicted, pos_label=positive_class)
    avg_precision = average_precision_score(test_labels, pred_prob[:, [1]])
    rec = recall_score(test_labels, predicted, pos_label=positive_class)
    fscore = f1_score(test_labels, predicted, pos_label=positive_class)
    # pos_label added so F-beta agrees with precision/recall/F1 above.
    fbeta = fbeta_score(test_labels, predicted, beta=0.5, pos_label=positive_class)
    hamming = hamming_loss(test_labels, predicted)
    # NOTE(review): jaccard_similarity_score is gone from recent scikit-learn
    # (jaccard_score replaces it); needs an import change outside this block.
    jaccard = jaccard_similarity_score(test_labels, predicted)
    # Log-loss is defined on probabilities; the original fed it hard labels.
    logloss = log_loss(test_labels, pred_prob, labels=clf.classes_)
    zero_one = zero_one_loss(test_labels, predicted)
    area_under_roc = roc_auc_score(test_labels, pred_prob[:, [1]])
    cohen = cohen_kappa_score(test_labels, predicted)
    mathews = matthews_corrcoef(test_labels, predicted)

    output = {
        "accuracy": accuracy,
        "precision": precision,
        "average precision": avg_precision,
        "recall": rec,
        "fscore": fscore,
        "fbeta": fbeta,
        "hamming": hamming,
        "jaccard": jaccard,
        "logloss": logloss,
        "zero_one": zero_one,
        "area_under_roc": area_under_roc,
        "cohen": cohen,
        "mathews": mathews
    }
    return json.dumps(output)
# NOTE(review): these statements use `year`, `scores`, `total_pred` and
# `total_test`, none of which are defined here; they presumably form the body
# of a per-year loop whose header is outside this snippet (the two trailing
# print calls would then run after it) -- confirm the indentation in the file.

# Split on the held-out year and dump all four parts to CSV for inspection.
X_train, X_test, y_train, y_test = year_train_test_split(
    train_for_loo, 'WnvPresent_DateTrapSpecies', year)
X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False)
X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False)
y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False)

clf.fit(X_train, y_train)
# The active line passes predict_proba's output to roc_auc_score unsliced;
# the commented alternative takes column 1 for a random forest.
# y_pred = clf.predict_proba(X_test) [:, 1] # Random Forest
y_pred = clf.predict_proba(X_test) # For XGB
score = metrics.roc_auc_score(y_test, y_pred)
scores.append(score)
#import operator
#feat_importances = dict(zip(X_train.columns, clf.feature_importances_))
#sorted_feat_importances = sorted(feat_importances.items(), key=operator.itemgetter(1))
#print(sorted_feat_importances)

# Accumulate predictions and targets so one ROC score can be computed over
# everything that was appended.
total_pred = np.concatenate((total_pred, y_pred))
total_test = np.concatenate((total_test, y_test))
print("Global ROC score", metrics.roc_auc_score(total_test, total_pred))
print(scores)
# 'eval_metric': 'auc', # 'eta': 0.1, # 'silent': 1, # 'max_delta_step': 1}) # 'Normal' 70 / 30 cross-validation if do_cross_val == 1: X_train, X_test, y_train, y_test = cross_validation.train_test_split( X, train.WnvPresent, test_size=0.3, random_state=0) clf.fit(X_train, y_train) y_pred = clf.predict_proba(X_test) print(metrics.roc_auc_score(y_test, y_pred)) elif do_cross_val == 2: # Leave-one-year-out cross-validation scores = [] total_pred = np.array([]) total_test = np.array([]) for year in [2007, 2009, 2011, 2013]: X_train,X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split( train_for_loo, 'WnvPresent_DateTrapSpecies', year)
# Interactive-style exploration on `iris`: most statements are bare
# expressions whose values are only visible in a REPL/notebook.

# k-nearest neighbours, scored on the data it was fitted on.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(iris.data, iris.target)
knn.predict(iris.data)
len(iris.target)
# Number of samples whose prediction equals the target.
sum(iris.target == knn.predict(iris.data))
knn.score(iris.data, iris.target)
help(cross_val_predict)
# 20-fold cross-validated predictions and mean score.
cross_val_predict(knn, iris.data, iris.target, cv=20)
cross_val_score(knn, iris.data, iris.target, cv=20).mean()

# Same checks with a three-tree random forest.
rf = RandomForestClassifier(n_estimators=3)
rf.fit(iris.data, iris.target)
rf.predict_proba(iris.data)
rf.score(iris.data, iris.target)
sum(iris.target == rf.predict(iris.data))
cross_val_score(rf, iris.data, iris.target, cv=20).mean()

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
''' https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/data/multilabel.py '''
# One-vs-rest wrapper around logistic regression.
mcr = OneVsRestClassifier(LogisticRegression())
mcr.fit(iris.data, iris.target)
mcr.predict(iris.data)
mcr.predict_proba(iris.data)
def RF_trainandtest_kfold(self, nsplit, cv, feature_sel, varthreshold, ntrees,
                          nodes, rfmethod, nclusters=10, cmethod=None):
    """Cross-validate a tree-ensemble classifier on ``self.data`` with k folds.

    The column ``default`` is the target and every other column is a feature.
    For every fold the training part is binned/WOE-transformed together with
    the test part (``self.binandwoe_traintest``), optionally reduced by a
    feature selector and used to fit the requested classifier. Column 1 of
    ``predict_proba`` on the test part is collected with the true target.

    Args:
        nsplit: Number of folds handed to ``KFold``.
        cv: ``cv`` argument of ``RFECV`` (only used for ``"RFECV"``).
        feature_sel: ``"VarianceThreshold"``, ``"RFECV"``,
            ``"SelectFromModel"`` or ``"SelectKBest"``; any other value
            disables feature selection.
        varthreshold: Threshold for ``VarianceThreshold``.
        ntrees: ``n_estimators`` of the classifier.
        nodes: ``min_samples_leaf``; ``min_samples_split`` is ``nodes * 2``.
        rfmethod: ``"RandomForest"``, ``"ExtraTrees"`` or
            ``"GradientBoosting"``.
        nclusters: Forwarded to ``self.binandwoe_traintest``.
        cmethod: Forwarded to ``self.binandwoe_traintest``.

    Returns:
        DataFrame with columns ``target`` and ``probability`` covering every
        evaluated fold; an empty DataFrame when every fold was skipped.

    Raises:
        ValueError: If ``rfmethod`` is not one of the supported names.
    """
    # Fail fast: previously an unknown name only surfaced inside the loop as
    # an UnboundLocalError on `classifier`.
    known_methods = ('RandomForest', 'ExtraTrees', 'GradientBoosting')
    if rfmethod not in known_methods:
        raise ValueError("unknown rfmethod %r; expected one of: %s"
                         % (rfmethod, ", ".join(known_methods)))
    classifier_types = {
        'RandomForest': RandomForestClassifier,
        'ExtraTrees': ExtraTreesClassifier,
        'GradientBoosting': GradientBoostingClassifier,
    }

    # .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0;
    # for a boolean column mask both select the same columns.
    data_feature = self.data.loc[:, self.data.columns != 'default']
    data_target = self.data['default']

    # Split the data into k folds; each fold in turn is the test set and the
    # remaining rows form the training set.
    kf = KFold(n_splits=nsplit, shuffle=True)
    fold_results = []
    for train_index, test_index in kf.split(data_feature):
        X_train, X_test = data_feature.iloc[train_index], data_feature.iloc[test_index]
        y_train, y_test = data_target.iloc[train_index], data_target.iloc[test_index]

        # Skip the fold when the random split left a single class in either part.
        if len(y_train.unique()) == 1 or len(y_test.unique()) == 1:
            continue

        # Coarse-bin and WOE-transform the training variables, then apply the
        # same binning/WOE mapping to the test variables.
        X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test,
                                                   nclusters, cmethod)

        # Feature selection fitted on the training part only
        # (sklearn.feature_selection).
        if feature_sel == "VarianceThreshold":
            selector = VarianceThreshold(threshold=varthreshold)
        elif feature_sel == "RFECV":
            selector = RFECV(LogisticRegression(), step=1, cv=cv)
        elif feature_sel == "SelectFromModel":
            selector = SelectFromModel(LogisticRegression())
        elif feature_sel == "SelectKBest":
            selector = SelectKBest()
        else:
            selector = None

        if selector is None:
            X_train1, X_test1 = X_train, X_test
        else:
            # y is passed to every selector; VarianceThreshold ignores it.
            X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
            X_train1.columns = X_train.columns[selector.get_support(True)]
            X_test1 = X_test[X_train1.columns]

        # Train the ensemble and score the held-out fold.
        classifier = classifier_types[rfmethod](
            n_estimators=ntrees, min_samples_split=nodes * 2, min_samples_leaf=nodes)
        classifier.fit(X_train1, y_train)
        probability = classifier.predict_proba(X_test1)[:, 1]
        fold_results.append(
            pd.DataFrame({'target': y_test, 'probability': probability}))

    # Concatenate once instead of growing a frame (and concatenating with an
    # empty frame) on every fold.
    if not fold_results:
        return pd.DataFrame()
    return pd.concat(fold_results, ignore_index=True)
def main():
    """Render the Streamlit page that predicts Titanic survival.

    Loads ``datasets/train.csv``, encodes ``Sex`` and ``Embarked`` as
    integers, collects the passenger attributes from the widgets and, when the
    "Prever" button is pressed, fits a ``RandomForestClassifier`` on an 80 %
    training split and shows the predicted probabilities for the entered
    passenger.
    """
    st.title('Você sobreviveria ao Titanic?')
    st.write(
        'Modelo de classificação com RandomForest para prever sobrevivência ou morte de passageiros no Titanic'
    )
    st.subheader('Autor')
    st.write('https://www.linkedin.com/in/lucaszonin/')
    st.write('')
    st.subheader('Agradecimentos')
    st.write('Felipe Maia Polo que me deu algumas dicas:')
    st.write('https://www.linkedin.com/in/felipemaiapolo/')
    st.write('')

    # Load the data and drop the columns the model does not use.
    titanic_v1 = pd.read_csv('datasets/train.csv')
    titanic_v1 = titanic_v1.drop(
        ['Cabin', 'PassengerId', 'Ticket', 'SibSp', 'Parch'], axis=1)

    # Missing ages become the mean age; rows with any other gap are dropped.
    titanic_v1['Age'] = titanic_v1['Age'].fillna(titanic_v1['Age'].mean())
    titanic_v1['Age'] = titanic_v1['Age'].astype('int64')
    titanic_v1 = titanic_v1.dropna()

    # Integer encodings; the widget encodings further down must match these.
    titanic_v1['Sex'] = titanic_v1['Sex'].map({'male': 0, 'female': 1}).astype(int)
    titanic_v1['Embarked'] = titanic_v1['Embarked'].map(
        {'C': 0, 'Q': 1, 'S': 2}).astype(int)

    # Passenger sex
    sexo = st.radio(label='Sexo do passageiro',
                    options=('Feminino', 'Masculino'))
    # Passenger age. The bound is cast to a plain int so min_value and
    # max_value share one type (the column maximum is a numpy integer).
    idade_passenger = st.slider(label='Idade do passageiro',
                                min_value=1,
                                max_value=int(titanic_v1['Age'].max()))
    # Port of embarkation
    embarked = st.radio(label='Cidade onde embarcou',
                        options=('Cherbourg', 'Queenstown', 'Southampton'))
    # Ticket fare
    valor_pago = st.slider(label='Valor pago pela passagem',
                           min_value=1,
                           max_value=600)
    # Passenger class
    classe = st.radio(label='Classe do passageiro',
                      options=('Primeira', 'Segunda', 'Terceira'))

    # Translate the widget labels into the model's encodings. Lookup tables
    # cannot leave a name unbound the way the if/elif chains could.
    sexo_modelo = 1 if sexo == 'Feminino' else 0
    embarked_modelo = {'Cherbourg': 0, 'Queenstown': 1, 'Southampton': 2}[embarked]
    classe_modelo = {'Primeira': 1, 'Segunda': 2, 'Terceira': 3}[classe]

    y = titanic_v1['Survived']
    x = titanic_v1[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
    # The model is trained on the same 80 % split as before (random_state=30).
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=30)

    # The original also fitted a first forest on every rerun and discarded its
    # predict_proba output; only the model needed for the prediction is
    # trained now.
    if st.button(label="Prever"):
        st.title('Dados do passageiro:')
        st.write('Sexo :', sexo)
        st.write('Idade :', idade_passenger)
        st.write('Cidade onde embarcou :', embarked)
        st.write('Valor da passagem : US$', valor_pago)
        st.write('Classe da passagem :', classe)

        # One-row frame with the same column order the model is trained on.
        x_input = pd.DataFrame(
            {
                'Pclass': classe_modelo,
                'Sex': sexo_modelo,
                'Age': idade_passenger,
                'Fare': valor_pago,
                'Embarked': embarked_modelo
            },
            index=[0])

        new_model = RandomForestClassifier()
        new_model.fit(X_train, y_train)
        pred = new_model.predict_proba(x_input)

        st.title('Previsão:')
        st.write('')
        # Explicit st.write instead of bare tuple expressions, so the output
        # does not depend on Streamlit's "magic" feature being enabled.
        st.write('Probabilidade de morrer:', pred[0, 0] * 100)
        st.write('Probabilidade de sobreviver:', pred[0, 1] * 100)
'''
# NOTE(review): the triple quote above presumably closes a string literal
# opened before this snippet -- confirm in the full file.
# Sweep the forest size on inputs made of the ANN hidden-layer output stacked
# with the raw features, evaluating each forest on the validation split.
# Train and test random forests.
# load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin"
load_path = "../homesite_data/resources/oversampled_normalized_data_ratio_2.bin"
homesite = Data()
homesite.load_sliptted_data(load_path)
del homesite.test_x # Deleted to save memory.

clf_ann = NeuralNetwork(path = "../homesite_data/ann_weights.bin", lr = 0.00005, \
                        lamb = 0)
train_output_ann = clf_ann.get_hidden_output(homesite.train_x)
validation_output_ann = clf_ann.get_hidden_output(homesite.validation_x)

# Stack column-wise: hidden output first, then the original features.
train_output_ann = np.hstack((train_output_ann, homesite.train_x))
validation_output_ann = np.hstack((validation_output_ann, homesite.validation_x))

# c = 2..9 gives forests of 201, 301, ..., 901 trees.
for c in range(2, 10):
    # Train classifier.
    print "Training classifier."
    clf = RandomForestClassifier(n_estimators = 1 + 100 * c, n_jobs = 4)
    clf.fit(train_output_ann, homesite.train_y)

    # Test classifier.
    print 'Testing classifier.'
    # Column 1 of predict_proba is used as the score.
    predicted_labels = clf.predict_proba(validation_output_ann)[:, 1]

    # Show final results.
    # The confusion matrix gets rounded scores; the AUC gets the raw scores.
    results = confusion_matrix(homesite.validation_y, np.round(predicted_labels))
    accuracy, precision, recall = compute_performance_metrics(results)
    auc = compute_auc(homesite.validation_y, predicted_labels)
def RF_trainandtest(self, testsize, cv, feature_sel, varthreshold, ntrees,
                    nodes, rfmethod, nclusters=10, cmethod=None):
    """Fit a tree-ensemble classifier on one train/test split of ``self.data``.

    The column ``default`` is the target and every other column is a feature.
    The training part is binned/WOE-transformed together with the test part
    (``self.binandwoe_traintest``), optionally reduced by a feature selector
    and used to fit the requested classifier.

    Args:
        testsize: ``test_size`` handed to ``train_test_split``
            (``random_state=0``).
        cv: ``cv`` argument of ``RFECV`` (only used for ``"RFECV"``).
        feature_sel: ``"VarianceThreshold"``, ``"RFECV"``,
            ``"SelectFromModel"`` or ``"SelectKBest"``; any other value
            disables feature selection.
        varthreshold: Threshold for ``VarianceThreshold``.
        ntrees: ``n_estimators`` of the classifier.
        nodes: ``min_samples_leaf``; ``min_samples_split`` is ``nodes * 2``.
        rfmethod: ``"RandomForest"``, ``"ExtraTrees"`` or
            ``"GradientBoosting"``.
        nclusters: Forwarded to ``self.binandwoe_traintest``.
        cmethod: Forwarded to ``self.binandwoe_traintest``.

    Returns:
        DataFrame with columns ``target`` (true test labels) and
        ``probability`` (column 1 of ``predict_proba`` on the test part).

    Raises:
        ValueError: If ``rfmethod`` is not one of the supported names.
    """
    # Fail fast: previously an unknown name only surfaced at classifier.fit
    # as an UnboundLocalError, after all preprocessing had run.
    known_methods = ('RandomForest', 'ExtraTrees', 'GradientBoosting')
    if rfmethod not in known_methods:
        raise ValueError("unknown rfmethod %r; expected one of: %s"
                         % (rfmethod, ", ".join(known_methods)))
    classifier_types = {
        'RandomForest': RandomForestClassifier,
        'ExtraTrees': ExtraTreesClassifier,
        'GradientBoosting': GradientBoostingClassifier,
    }

    # Split the data set into a training and a test set.
    # .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0;
    # for a boolean column mask both select the same columns.
    data_feature = self.data.loc[:, self.data.columns != 'default']
    data_target = self.data['default']
    X_train, X_test, y_train, y_test = train_test_split(
        data_feature, data_target, test_size=testsize, random_state=0)

    # Coarse-bin and WOE-transform the training variables, then apply the
    # same binning/WOE mapping to the test variables.
    X_train, X_test = self.binandwoe_traintest(X_train, y_train, X_test,
                                               nclusters, cmethod)

    # Feature selection fitted on the training part only
    # (sklearn.feature_selection).
    if feature_sel == "VarianceThreshold":
        selector = VarianceThreshold(threshold=varthreshold)
    elif feature_sel == "RFECV":
        selector = RFECV(LogisticRegression(), step=1, cv=cv)
    elif feature_sel == "SelectFromModel":
        selector = SelectFromModel(LogisticRegression())
    elif feature_sel == "SelectKBest":
        selector = SelectKBest()
    else:
        selector = None

    if selector is None:
        X_train1, X_test1 = X_train, X_test
    else:
        # y is passed to every selector; VarianceThreshold ignores it.
        X_train1 = pd.DataFrame(selector.fit_transform(X_train, y_train))
        X_train1.columns = X_train.columns[selector.get_support(True)]
        X_test1 = X_test[X_train1.columns]

    # Train the ensemble and score the test set.
    classifier = classifier_types[rfmethod](
        n_estimators=ntrees, min_samples_split=nodes * 2, min_samples_leaf=nodes)
    classifier.fit(X_train1, y_train)
    probability = classifier.predict_proba(X_test1)[:, 1]
    predresult = pd.DataFrame({
        'target': y_test,
        'probability': probability
    })
    return predresult
# normalize=True) #clf = XgbWrapper({'objective': 'binary:logistic', # 'eval_metric': 'auc', # 'eta': 0.1, # 'silent': 0, # 'max_delta_step': 1}) # 'Normal' 70 / 30 cross-validation if do_cross_val == 1: X_train, X_test, y_train, y_test = cross_validation.train_test_split( X, train.WnvPresent, test_size=0.3, random_state=0) clf.fit(X_train, y_train) y_pred = clf.predict_proba(X_test)[:, 1] print(metrics.roc_auc_score(y_test, y_pred)) elif do_cross_val == 2: # Leave-one-year-out cross-validation scores = [] for year in [2007, 2009, 2011, 2013]: X_train, X_test, y_train, y_test = year_train_test_split( train_for_loo, 'WnvPresent', year) X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False) X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False) y_train.to_csv("data_per_year/" + str(year) + "y_train.csv",