def Classify(features, labels, namesClasses):
    print "Training"
    # n_estimators is the number of decision trees
    # max_features (also known as m_try) is left at the default value,
    # the square root of the number of features
    clf = RF(n_estimators=100, n_jobs=3)
    scores = cross_validation.cross_val_score(clf, features, labels, cv=5, n_jobs=1)
    print "Accuracy of all classes"
    print np.mean(scores)
    # KFold in the old sklearn.cross_validation API expects the number of samples
    kf = KFold(len(labels), n_folds=5)
    y_pred = np.zeros((len(labels), len(set(labels))))
    for train, test in kf:
        features_train, features_test = features[train, :], features[test, :]
        labels_train, labels_test = labels[train], labels[test]
        clf = RF(n_estimators=100, n_jobs=3)
        clf.fit(features_train, labels_train)
        y_pred[test] = clf.predict_proba(features_test)
    # classification_report expects hard class predictions, not probabilities;
    # argmax assumes integer labels 0..K-1 aligned with the predict_proba columns
    print classification_report(labels, np.argmax(y_pred, axis=1), target_names=namesClasses)
    return y_pred

def check_nlp_improvement(fast=False):
    if fast:
        clf = RF(n_estimators=100, n_jobs=-1, criterion="entropy", max_features='auto', min_samples_split=5)
        folds = 5
    else:
        clf = RF(n_estimators=1000, n_jobs=-1, criterion="entropy", max_features=100, min_samples_split=5)
        folds = 10
    paramlist = [str(i) for i in clf.get_params().values()]
    parlist = str(np.sort(paramlist)) + str(folds)
    h = hashlib.sha1()
    h.update(parlist.encode('utf-8'))
    sig = h.hexdigest()
    try:
        baseline = np.load("nlp_baseline_" + str(sig) + ".npy")
    except Exception:
        print("Establishing baseline, this will run once")
        X_train, y_train, X_test, test_ids = read_json(do_descriptions=False)
        baseline = cv(X_train, y_train, None, MinMaxScaler(), clf, folds=folds,
                      metric=metrics.log_loss, verbose=True)
        np.save("nlp_baseline_" + str(sig), baseline)
    print("Baseline:", baseline)
    X_train, y_train, X_test, test_ids = read_json(do_descriptions=True)
    print("Checking performance, this may take several minutes")
    res = cv(X_train, y_train, None, MinMaxScaler(), clf, folds=folds,
             metric=metrics.log_loss, verbose=True)
    print("Result:", res)
    if res < baseline:
        print("Improvement over baseline", str(baseline - res))
    else:
        print("Performance worse than baseline by", str(res - baseline))

def rrf(series, n_folds, clfparams, featureparams, aggregateparams, refineparams,
        include, exclude, save_test_predictions, save_oob_predictions,
        skip_cross_validation, _run):
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    best_pruning = refineparams['n_prunings']
    if skip_cross_validation:
        loss = 999.
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)
            rrf = RRF(clf, **refineparams)
            rrf.fit(Xtr, ytr)
            loss2tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
            loss2te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
            _run.info['loss'].append(loss2te)
            _run.info['trainloss'].append(loss2tr)
            print("Fold {} mlogloss train: {:.4f}, test: {:.4f}".format(i, loss2tr, loss2te))
            pred.iloc[itest, :] = rrf.predict_proba(Xte)
            i += 1
        loss = multiclass_log_loss(y.values, pred.values)
        _run.info['features'] = list(Xtr.columns)
        # Optionally save oob predictions
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, time)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        # weights = np.concatenate((np.ones(ytr.shape[0]), 0.3 * np.ones(semilabels.shape[0])))
        # Xtr = pd.concat((Xtr, Xtest), axis=0)
        # ytr = pd.concat((ytr, semilabels))
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)  # , weights)
        rrf = RRF(clf, **refineparams)
        rrf.fit(Xtr, ytr)
        predtest = pd.DataFrame(rrf.predict_proba(Xte), index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss

def rf(series, n_folds, clfparams, featureparams, aggregateparams, include, exclude,
       save_test_predictions, save_oob_predictions, skip_cross_validation, _run):
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ['predict_{}'.format(i) for i in range(3)]
    if skip_cross_validation:
        loss = 999.
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0., index=y.index, columns=pred_cols)
        i = 1
        _run.info['loss'] = []
        _run.info['trainloss'] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
            clf = RF(**clfparams)
            clf.fit(Xtr, ytr)  # , weights)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            trainloss = multiclass_log_loss(ytr, clf.predict_proba(Xtr))
            _run.info['trainloss'].append(trainloss)
            loss = multiclass_log_loss(yte, pred.iloc[itest].values)
            _run.info['loss'].append(loss)
            if i == 1:
                feature_importances_ = clf.feature_importances_ / n_folds
            else:
                feature_importances_ += clf.feature_importances_ / n_folds
            i += 1
        loss = multiclass_log_loss(y, pred.values)
        _run.info['features'] = list(Xtr.columns)
        _run.info['feature_importances'] = list(feature_importances_)
        # Optionally save oob predictions
        if save_oob_predictions:
            filename = '{}_{}.csv'.format(series, time)
            pred.to_csv(filename, index_label='id')
    # Optionally generate test predictions
    if save_test_predictions:
        filename = '{}_test_{}.csv'.format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = RF(**clfparams)
        clf.fit(Xtr, ytr)  # , weights)
        predtest = pd.DataFrame(clf.predict_proba(Xte), index=yte.index, columns=pred_cols)
        predtest.to_csv(filename, index_label='id')
    return loss

def _calculate_cv_error(base_clf, best_rate, X, y, is_y_noise, clean_type,
                        max_nb_feats, major_oob_label):
    errors = []
    skf = StratifiedKFold(n_splits=NoiseDetectionEnsemble.k_folds, shuffle=True)
    for train_idxs, val_idxs in skf.split(X=range(len(y)), y=y):
        train_X = DataHelper.select_rows(X, train_idxs, copy=False)
        train_y = DataHelper.select_rows(y, train_idxs, copy=False)
        train_is_y_noise = DataHelper.select_rows(is_y_noise, train_idxs, copy=False)
        clean_train = NoiseDetectionEnsemble._clean_noisy_data(train_X, train_y,
                                                               train_is_y_noise,
                                                               clean_type,
                                                               major_oob_label)
        train_X, train_y, adapted_rate = DataHelper.adapt_rate(clean_train[0],
                                                               clean_train[1],
                                                               best_rate)
        ensemble = RF(501, n_jobs=-1, max_features="sqrt")
        ensemble.fit(train_X, train_y)
        val_X = DataHelper.select_rows(X, val_idxs, copy=False)
        val_y = DataHelper.select_rows(y, val_idxs, copy=False)
        predictions = ensemble.predict(val_X)
        error = MetricsHelper.calculate_error_score(val_y, predictions)
        errors.append(error)
    return mean(errors)

def main(x, y, task):
    # ys = [yr, ym, y25]
    # y_names = ['readm', 'mort_h', 'pheno25']
    # xs = [x48, onehot, w2v, w48, sentences]
    # x_names = ['48h', 'sparse_dx', 'w2v', 'w2v_48h', 'sentences']
    lr = LR(C=1e-4, penalty='l2', verbose=1)  # sag if multiclass/multilabel
    svm = SVM(C=1e5, verbose=True)
    rf = RF(n_estimators=60, verbose=1)
    gbc = GBC(n_estimators=200, learning_rate=1e-3, verbose=1)
    models = [lr, svm, rf, gbc]
    names = ['LR', 'SVM', 'RF', 'GBC']
    data = {}
    for idx in range(len(models)):
        if task != 'binary':
            data[names[idx]] = {}
            for ix in range(25):
                dat = run_experiment(x, y[:, ix], models[idx], task)
                data[names[idx]][ix] = dat
        else:
            dat = run_experiment(x, y, models[idx], task)
            data[names[idx]] = dat
    return data

def __init__(self, train_X, test_X, train_Y, test_Y, agent, classifier, save_conf_mat):
    self.train_X = train_X
    self.test_X = test_X
    self.train_Y = train_Y
    self.test_Y = test_Y
    self.classifier = classifier

    if self.classifier.lower() == 'knn':
        self.clf = KNN()
    elif self.classifier.lower() == 'rf':
        self.clf = RF()
    elif self.classifier.lower() == 'svm':
        self.clf = SVM()
    else:
        self.clf = None
        print('\n[Error!] We don\'t currently support {} classifier...\n'.format(classifier))
        exit(1)

    if agent is None:
        # default agent: use every feature
        self.agent = np.ones(train_X.shape[1])
    else:
        # keep the provided agent (feature mask)
        self.agent = agent

    self.predictions = self.classify()
    self.accuracy = self.compute_accuracy()
    self.precision = self.compute_precision()
    self.recall = self.compute_recall()
    self.f1_score = self.compute_f1()
    self.confusion_matrix = self.compute_confusion_matrix()
    self.plot_confusion_matrix(save_conf_mat)

def try_params(n_iterations, params):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    clf = RF(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]
    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))
    print "\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc)

    #
    p = clf.predict_proba(x_test)[:, 1]
    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))
    print "# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc)

    return {'loss': ll, 'log_loss': ll, 'auc': auc}

def analyzing_models(images):
    """
    Program: Main program to analyze models and featuresets
    Input: Images
    Output: Accuracy dataframe of each model and featureset
    """
    # model list
    models = [RF(n_estimators=100, n_jobs=3),
              MLPClassifier(hidden_layer_sizes=(20, )),
              BernoulliNB(),
              ExtraTreesClassifier(n_estimators=100, n_jobs=3)
              ]
    #model_names = ['RandomForest', 'Neural network', 'ExtraTrees']
    model_names = ['RandomForest', 'Neural network', 'Bernoulli Naive Bayes', 'ExtraTrees']
    # feature list
    available_features = ['haralick',
                          'zernike',
                          'binary_pattern',
                          #'binary_pattern_small',
                          'ratio',
                          'image_size',
                          #'normalized_sift',
                          'sift'
                          ]
    combi_features = combi_lists(available_features)
    # accuracy dataframe of accuracies
    accuracy_df = compare_accuracy(images, combi_features, models, model_names)
    return accuracy_df

def main():
    # Get the clean datasets
    x, y, xt, feats, sample = readData()

    # Try out different models
    xg_class_params = {"objective": "binary:logistic", "eval_metric": "auc",
                       "booster": "gbtree", "eta": 0.01, "max_depth": 14,
                       "min_child_weight": 10, "subsample": 0.66,
                       #"colsample_bytree": 0.7,
                       "colsample_bylevel": 0.3,
                       "thread": 1, "silent": 1, "seed": 221}
    xg_class_params2 = {"objective": "binary:logistic", "eval_metric": "auc",
                        "booster": "gbtree", "eta": 0.02, "max_depth": 5,
                        "min_child_weight": 10, "subsample": 0.66,
                        #"colsample_bytree": 0.7,
                        "colsample_bylevel": 0.3,
                        "thread": 1, "silent": 1, "seed": 221}
    rf1 = RF(n_estimators=1000, max_features=50, criterion='entropy', min_samples_split=40,
             max_depth=30, min_samples_leaf=2, n_jobs=10, verbose=0, random_state=42)
    etc1 = ETC(n_estimators=500, max_features=90, criterion='entropy', min_samples_split=20,
               max_depth=25, min_samples_leaf=10, n_jobs=10, verbose=0, random_state=42)
    xgb1 = XGC(xg_class_params, num_rounds=550)
    xgb2 = XGC(xg_class_params2, num_rounds=600)
    xgb_bag = bagger(xgb2, num_bags=3, bag_fraction=0.75)

    # EVALUATE a model
    score = crossValidate(etc1, x, y, folds=5, runs=1)

def learn_total():
    clf = RF(n_estimators=200, max_features="auto", max_depth=8, min_samples_split=10,
             min_samples_leaf=2, n_jobs=3, oob_score=True, random_state=728)
    # max_depth = 8 works best
    #clf = GBDT(n_estimators=100, max_features="auto", max_depth=8, min_samples_split=10, min_samples_leaf=2, verbose=3)
    rd = 500 * 1000
    train = load_data_total("train", rd)
    train_label = load_label("train")
    train_label = train_label[:len(train)]
    train_label = np.array(train_label)
    print "train_label", len(train_label), "train", len(train)
    print "number of train features", len(train[0])
    print "learn"
    clf.fit(train, train_label)
    return clf

def main():
    data_dir = '../data/taobao/'
    #data_dir = '../data/amazon/'
    item_path = data_dir + 'dim_items.txt'
    sub_item_path = data_dir + 'match_item.txt'
    train_pair_path = data_dir + 'train_set_1to1.txt'
    pg = PairGenerator(item_path, sub_item_path, train_pair_path)

    print('Preparing data...', time.ctime())
    label = '1'
    if data_dir == '../data/amazon/':
        label = '0'
    (train_data, test_data) = pg.fetch_all_topic(label, 0.2)
    train_x = np.array(train_data['pair_in'])
    train_y = np.array(train_data['pair_out'])
    test_num = len(test_data['pair_in'])
    test_upper = int(test_num * 0.5)
    test_x = np.array(test_data['pair_in'][test_upper:])
    test_y = np.array(test_data['pair_out'][test_upper:])

    print('Start training...', time.ctime())
    #C = NB()
    C = RF(verbose=1, n_jobs=2)
    C.fit(train_x, train_y)
    test_y_pred = C.predict_proba(test_x)
    test_y_pred = [result[1] for result in test_y_pred]
    # pickle needs a binary file handle
    with open(data_dir.split('/')[-2] + '_lda_pred.dat', 'wb') as f:
        pickle.dump((test_y, test_y_pred), f)
    line = 'AUC: %s' % (metrics.roc_auc_score(test_y, test_y_pred) * 100)
    print(line)
    line = 'Ground Pos: %s, Predict Pos: %s' % (int(np.sum(test_y)), int(np.sum(test_y_pred)))
    print(line)

def features_imp(df, target):
    from sklearn.ensemble import RandomForestRegressor as RF
    # add random "probe" features; any real feature that matters less than these is noise
    df['RAND_bin'] = np.random.randint(2, size=len(df[target]))
    df['RAND_uniform'] = np.random.uniform(0, 1, len(df[target]))
    df['RAND_int'] = np.random.randint(100, size=len(df[target]))
    columns = df.drop(target, axis=1).columns.tolist()
    estimator = RF(n_estimators=50)
    estimator.fit(df[columns], df[target])
    y_pred = estimator.predict(df[columns])
    baseline = MAE(y_pred, df[target])
    imp = []
    for col in columns:
        col_imp = []
        for n in range(3):
            # permutation importance: shuffle one column and measure the drop in accuracy
            save = df[col].copy()
            df[col] = np.random.permutation(df[col])
            y_pred = estimator.predict(df[columns])
            m = MAE(y_pred, df[target])
            df[col] = save
            col_imp.append(baseline - m)
        imp.append(np.mean(col_imp))
    FI = DataFrame([])
    FI['feature'] = columns
    FI['value'] = -np.array(imp)
    FI = FI.sort_values('value', ascending=False).reset_index(drop=True)
    M = FI[FI['feature'].isin(['RAND_bin', 'RAND_int', 'RAND_uniform'])]['value'].max()
    S = FI[FI['feature'].isin(['RAND_bin', 'RAND_int', 'RAND_uniform'])]['value'].std()
    threshold = M + S
    FI['important'] = np.where(FI['value'] > threshold, True, False)
    return FI

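# --- Hedged usage sketch for features_imp (illustrative only, not from the source).
# Assumes features_imp is importable and that its module defines np, MAE
# (mean_absolute_error) and DataFrame as used inside the function; the demo data is made up.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo = pd.DataFrame({'x1': rng.rand(200), 'x2': rng.rand(200)})
demo['y'] = 3 * demo['x1'] + rng.normal(0, 0.1, 200)   # only x1 carries signal

fi = features_imp(demo, 'y')
print(fi)  # x1 should rank above the RAND_* probe features and be flagged important
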
def RF_First(self, data, n_estimators=400, max_features='auto'):
    # Train on the training data, then return predictions for the validation
    # and prediction datasets
    model = RF(n_estimators=n_estimators, max_features=max_features)
    model.fit(data['train'][:, :-1], data['train'][:, -1])
    # Note: validation-set and prediction-set results are stored separately
    # Predictions on the training set
    xul = model.predict(data['train'][:, :-1])
    # Predictions on the validation set
    yanre = model.predict(data['test'][:, :-1])
    # Predictions on the prediction set
    prer = model.predict(data['predict'][:, :-1])
    # Store them
    self.yanzhneg_pr.append(yanre)
    self.predi.append(prer)
    # After each fold, compute the errors on the training, validation and prediction data
    xx = self.RMSE(xul, data['train'][:, -1])
    yy = self.RMSE(yanre, data['test'][:, -1])
    pp = self.RMSE(prer, data['predict'][:, -1])
    # Store the errors ('随机森林' = random forest)
    self.error_dict['随机森林'] = [xx, yy, pp]
    # True outputs of the validation set
    self.yanzhneg_real = data['test'][:, -1]
    # True outputs of the prediction set
    self.preal = data['predict'][:, -1]
    print('Random forest in layer 1 finished')
    return

def opt_model_RF(X, y):
    parameters = opt_RF(X, y)
    # map() returns an iterator in Python 3, so materialize the clipped parameters as a list
    parameters = [int(i) if i > 2 else 2 for i in parameters]
    rf = RF(max_depth=parameters[0], min_samples_split=parameters[1],
            min_samples_leaf=parameters[2], n_estimators=100,
            class_weight='balanced', n_jobs=3, max_features="auto", oob_score=True)
    rf.fit(X, y)
    return rf

def RF_First(self, data, n_estimators=800, max_features='sqrt'):
    # Train on the training data, then return predictions for the validation
    # and prediction datasets
    model = RF(n_estimators=n_estimators, max_features=max_features)
    model.fit(data['train'][:, :-1], data['train'][:, -1])
    # Store validation-set and prediction-set results
    # Predictions on the training set
    xul = model.predict(data['train'][:, :-1])
    # Predictions on the validation set
    yanre = model.predict(data['test'][:, :-1])
    # Predictions on the prediction set
    prer = model.predict(data['predict'][:, :-1])
    # After each fold, compute the errors on the training, validation and prediction data
    xx = self.F1(xul, data['train'][:, -1])
    yy = self.F1(yanre, data['test'][:, -1])
    pp = self.F1(prer, data['predict'][:, -1])
    # Collect results for the combination (stacking) step
    self.yanzhneg_pr.append(yanre)
    self.yanzhneg_real = data['test'][:, -1]
    self.predi.append(prer)
    self.preal = data['predict'][:, -1]
    # Store the errors ('随机森林' = random forest)
    self.error_dict['随机森林'] = [xx, yy, pp]
    print('Random forest in layer 1 finished')
    return

def main():
    train = pd.read_csv('criminal_train.csv')
    test = pd.read_csv('criminal_test.csv')
    print(train.dtypes)
    train, test = ObjectVariableRectification(train, test)
    y = np.array(train['Criminal'], dtype=float)
    X = np.array(train.drop(['Criminal', 'PERID'], axis=1), dtype=float)
    assert X.shape[0] == y.shape[0]

    print('-----------------Training------------------\n')
    clf = RF(n_estimators=80, max_depth=80)
    clf.fit(X, y)
    print(clf.score(X, y))
    print('\n')

    X_train = np.array(test.drop(['PERID'], axis=1), dtype=float)
    # an assert wrapped in brackets is always truthy, so compare the shapes directly
    assert X.shape[1] == X_train.shape[1]

    print('----------------Predicting-----------------\n')
    predictions = np.array(clf.predict(X_train), dtype=int)

    print('---------------WRITING THE FILE------------\n')
    filePtr = open('MySubmissions.csv', 'a+')
    filePtr.write('PERID,Criminal\n')
    for i in range(X_train.shape[0]):
        filePtr.write(str(test['PERID'][i]))
        filePtr.write(',')
        filePtr.write(str(predictions[i]))
        filePtr.write('\n')
    filePtr.close()
    print('----------FILE SUCCESSFULLY WRITTEN---------\n')

def sup_predict(train, test, delay=delay, known_nodes=known_nodes):
    days, series = train.shape
    valdays = test.shape[0]
    series_sup_accs = []
    results = np.zeros((series - known_nodes, valdays - delay))
    for s in range(known_nodes, series):
        sys.stdout.write("\r Supervised prediction for series %s of %s" % (str(s), str(series)))
        timeseries_train = train[:, s]
        timeseries_test = test[:, s]
        delay_trainset = delay_maker(timeseries_train, delay)
        delay_testset = delay_maker(timeseries_test, delay)
        model = RF(n_estimators=100, random_state=rs)
        model = model.fit(delay_trainset[:, :-1], delay_trainset[:, -1])
        y_pred = model.predict(delay_testset[:, :-1])
        results[s - known_nodes, :] = y_pred
        series_sup_accs.append(accuracy_score(delay_testset[:, -1], y_pred))
    print()
    return np.array(series_sup_accs), results.T

def profit_curve_main(filepath, cost_benefit):
    """Main function to test profit curve code.

    Parameters
    ----------
    filepath     : str - path to find churn.csv
    cost_benefit : ndarray - 2D, with profit values corresponding to:
                                       -----------
                                       | TP | FP |
                                       -----------
                                       | FN | TN |
                                       -----------
    """
    X_train, X_test, y_train, y_test = get_train_test(filepath)
    models = [RF(), LR(), GBC(), SVC(probability=True)]
    model_profits = []
    for model in models:
        profits, thresholds = get_model_profits(model, cost_benefit,
                                                X_train, X_test,
                                                y_train, y_test)
        model_profits.append((model, profits, thresholds))
    plot_model_profits(model_profits)
    max_model, max_thresh, max_profit = find_best_threshold(model_profits)
    max_labeled_positives = max_model.predict_proba(X_test) >= max_thresh
    proportion_positives = max_labeled_positives.mean()
    reporting_string = ('Best model:\t\t{}\n'
                        'Best threshold:\t\t{:.2f}\n'
                        'Resulting profit:\t{}\n'
                        'Proportion positives:\t{:.2f}')
    print reporting_string.format(max_model.__class__.__name__, max_thresh,
                                  max_profit, proportion_positives)

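# --- Hedged usage sketch for profit_curve_main (the numbers and path below are
# assumptions for illustration, not from the source). The cost_benefit matrix follows
# the layout in the docstring above: row 0 = [profit(TP), profit(FP)], row 1 = [profit(FN), profit(TN)].
import numpy as np

cost_benefit = np.array([[6, -3],    # e.g. a retained churner is worth 6, a wasted offer costs 3
                         [0,  0]])   # missed churners and true negatives assumed to be worth nothing here
# profit_curve_main('data/churn.csv', cost_benefit)  # file path is an assumption
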
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5):
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}_{}.log'.format(n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = RF(n_estimators=n_est, max_depth=depth, random_state=2015)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')

def train(train_x, train_y, test_x, test_y, algo, hyperparams, cv=3):
    if algo == 'SVM':
        model = GridSearchCV(SVC(), hyperparams, cv=cv)
        #model = SVC(C=C, kernel=kernel, degree=degree, tol=tol)
    elif algo == 'RF':
        model = GridSearchCV(RF(), hyperparams, cv=cv)
        #model = RF(n_estimators=n_estimators, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

    print('Fitting Model and Tuning Hyperparameters with GridSearch using {}-fold cross-validation...'.format(cv))
    model.fit(train_x, train_y)
    best_params = model.best_params_
    print('Best Parameters Found: {}'.format(best_params))
    best_score = model.best_score_
    print('Mean cross-validated score of the best_estimator: {}'.format(best_score))

    print('Getting Predictions...')
    train_predictions = model.predict(train_x)
    test_predictions = model.predict(test_x)

    train_accuracy = accuracy_score(train_y, train_predictions)
    print('Train Set Accuracy: {}'.format(train_accuracy))
    test_accuracy = accuracy_score(test_y, test_predictions)
    print('Test Set Accuracy: {}'.format(test_accuracy))

    print('Test Set Classification Report')
    test_report = classification_report(test_y, test_predictions)
    print(test_report)

    return test_accuracy, test_predictions

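# --- Illustrative parameter grids for train() above (the candidate values are assumptions,
# not from the source). GridSearchCV expects a dict mapping estimator parameter names to
# lists of candidate values, so the grids mirror the commented-out constructor calls.
svm_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
rf_grid = {'n_estimators': [100, 300], 'min_samples_split': [2, 10], 'min_samples_leaf': [1, 5]}
# acc, preds = train(train_x, train_y, test_x, test_y, algo='RF', hyperparams=rf_grid, cv=3)
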
def __init__(self, feat=SURF(), cls=RF(n_estimators=40), verbose=True):
    self._ft = Features(feat)
    self._da = Data(self._ft)
    self._fm = FitBoVW(cls)
    self._verbose = verbose
    self._cl = cls
    self._vq = None

def classify_using_random_sampling(self, X_train, X_test, y_train, y_test,
                                   portion_of_sampled_dataset_vector,
                                   classifiers_for_experiments):
    psa = PSA()
    # ---- settings:
    number_of_runs_for_random_sampling = 20
    # ---- Experimenting:
    recognition_rate_LIST = np.zeros((len(classifiers_for_experiments),
                                      len(portion_of_sampled_dataset_vector)))
    classifier_index = 0
    for classifier in classifiers_for_experiments:
        print('############### Classifier: ' + classifier)
        portion_index = 0
        for portion_of_sampled_dataset in portion_of_sampled_dataset_vector:
            print('###### Portion of sampled dataset: ' + str(portion_of_sampled_dataset * 100) + '%')
            # ---- data reduction with random sampling:
            recognition_rate_with_random_sampling = [None] * number_of_runs_for_random_sampling
            for run_index in range(number_of_runs_for_random_sampling):
                shuffled_samples = self.shuffle_samples_randomly(X=X_train, y=y_train)  # shuffle samples of classes randomly
                # ---- data reduction:
                number_of_classes = len(shuffled_samples)
                n_samples = []
                for class_index in range(number_of_classes):
                    number_of_samples_of_class = shuffled_samples[class_index].shape[0]
                    n_samples.append(int(number_of_samples_of_class * portion_of_sampled_dataset))
                X, y = psa.reduce_data(sorted_samples=shuffled_samples, n_samples=n_samples)
                # ---- report number of sampled data after PSA:
                if run_index == 0:  # only report once in the multiple runs
                    print('number of sampled data in classes, after random sampling: ' + str(n_samples))
                # ---- classify with random sampling:
                if classifier == 'SVM':
                    # --------- train:
                    clf = SVC(kernel='linear')
                    clf.fit(X=X, y=y)
                elif classifier == 'LDA':
                    # --------- train:
                    clf = LDA()
                    clf.fit(X=X, y=y)
                elif classifier == 'QDA':
                    # --------- train:
                    clf = QDA()
                    clf.fit(X=X, y=y)
                elif classifier == 'Random Forest':
                    # --------- train:
                    clf = RF(max_depth=2, random_state=0)
                    clf.fit(X=X, y=y)
                elif classifier == 'Logistic Regression':
                    # --------- train:
                    clf = LR()
                    clf.fit(X=X, y=y)
                elif classifier == 'Gaussian Naive Bayes':
                    # --------- train:
                    clf = GaussianNB()
                    clf.fit(X=X, y=y)
                # --------- test:
                labels_predicted = clf.predict(X_test)
                recognition_rate_with_random_sampling[run_index] = (sum(labels_predicted == y_test) / len(labels_predicted)) * 100
            recognition_rate_with_random_sampling_average = np.mean(recognition_rate_with_random_sampling)
            print('The recognition rate using ' + classifier +
                  ' with data number reduction (random sampling): ' +
                  str(recognition_rate_with_random_sampling_average))
            recognition_rate_LIST[classifier_index, portion_index] = recognition_rate_with_random_sampling_average
            portion_index += 1
        classifier_index += 1
    return recognition_rate_LIST

def get_classifier(self, params):
    if self.learner_name == 'L1':
        self.ind_params = {
            'class_weight': 'balanced',
            'solver': 'liblinear',
            'penalty': 'l1'
        }
        joint_params = self.ind_params.copy()
        joint_params.update(params)
        print(joint_params)
        clf = LogisticRegression(**joint_params)
        self.space4classifier = {'C': hp.loguniform('C', -10, 10)}
    if self.learner_name == 'L2':
        self.ind_params = {
            'class_weight': 'balanced',
            'solver': 'liblinear',
            'penalty': 'l2'
        }
        joint_params = self.ind_params.copy()
        joint_params.update(params)
        print(joint_params)
        clf = LogisticRegression(**joint_params)
        self.space4classifier = {'C': hp.loguniform('C', -5, 5)}
    if self.learner_name == 'SVM':
        # use self.y, as in the XGB branch below, to compute the class weights
        n_obs = len(self.y)
        n_pos, n_neg = np.sum(self.y), n_obs - np.sum(self.y)
        pos_weight = n_obs / 2.0 / n_pos
        neg_weight = n_obs / 2.0 / n_neg
        self.ind_params = {'class_weight': {0: neg_weight, 1: pos_weight}}
        joint_params = self.ind_params.copy()
        joint_params.update(params)
        joint_params.update({'probability': True})
        clf = svm.SVC(**joint_params)
    if self.learner_name == 'RF':
        self.ind_params = {'class_weight': 'balanced', 'n_jobs': 20}
        joint_params = self.ind_params.copy()
        joint_params.update(params)
        clf = RF(**joint_params)
    if self.learner_name == 'XGB':
        n_obs = len(self.y)
        n_pos, n_neg = np.sum(self.y), n_obs - np.sum(self.y)  # calculate weights for the pos/neg classes
        self.ind_params = {
            'objective': 'reg:logistic',
            'scale_pos_weight': n_pos / n_neg * 1.0
        }
        joint_params = self.ind_params.copy()
        joint_params.update(params)
        clf = xgb.XGBClassifier(**joint_params)
    return clf

def new_rf():
    args = {"max_depth": 200,
            "random_state": 0,
            "n_estimators": 49,
            "class_weight": "balanced_subsample",
            # "max_features": None,
            }
    return RF(**args)

def objective_rf(self, trial):
    model = RF(n_estimators=int(trial.suggest_int('rf_n_estimators', 1, 100 + 1)),
               max_depth=int(trial.suggest_int('rf_max_depth', 2, 32 + 1)),
               max_leaf_nodes=trial.suggest_int('rf_max_leaf', 2, 40 + 1),
               min_samples_split=trial.suggest_int('rf_min_samples_split', 2, 10 + 1))
    return model

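# --- Hedged sketch of wiring objective_rf into an Optuna study. objective_rf returns an
# *unfitted* model rather than a score, so an outer objective still has to fit and evaluate it.
# The instance name `tuner` and the data arrays X, y are assumptions, not from the source.
import optuna
from sklearn.model_selection import cross_val_score

def outer_objective(trial):
    model = tuner.objective_rf(trial)            # build an RF with the trial's suggested parameters
    return cross_val_score(model, X, y, cv=3).mean()

study = optuna.create_study(direction='maximize')
study.optimize(outer_objective, n_trials=50)
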
def calculateCrossVad(self, labels, subtrain):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        subtrain, labels, test_size=0.1)
    # print X_test, y_test
    srf = RF(n_estimators=500, n_jobs=-1)
    srf.fit(X_train, y_train)
    score = srf.score(X_test, y_test)
    return score

def _model(self):
    '''
    First iteration will be a random forest.
    '''
    # Init model
    self.model = RF(n_estimators=10)
    X, y = self._splitter()
    self.model.fit(X, y)

def try_params(n_iterations, params, data):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)
    clf = RF(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)
    return train_and_eval_sklearn_regressor(clf, data)

def RF_Classifier(data_train, labels_train, num_estimators, max_features, oob_score, n_jobs):
    random_forest = RF(n_estimators=num_estimators, max_features=max_features,
                       oob_score=oob_score, n_jobs=n_jobs)
    random_forest.fit(data_train, labels_train)
    return random_forest

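# --- Hedged usage sketch for RF_Classifier (illustrative only; assumes the RF alias used
# by the snippet above is sklearn.ensemble.RandomForestClassifier and uses toy data).
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
forest = RF_Classifier(X, y, num_estimators=100, max_features='sqrt', oob_score=True, n_jobs=-1)
print(forest.oob_score_)  # out-of-bag accuracy estimate, available because oob_score=True
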
try:
    for ne in range(nb_exp):
        print 'exp num:', ne
        X, y = sh(X, y)
        X_train = X[:n_samples_train, :]
        X_test = X[n_samples_train:(n_samples_train + n_samples_test), :]
        y_train = y[:n_samples_train]
        y_test = y[n_samples_train:(n_samples_train + n_samples_test)]

        # training only on normal data:
        X_train = X_train[y_train == 0]
        y_train = y_train[y_train == 0]

        print('RF processing...')
        model = RF()
        tstart = time()
        # the lower, the more normal:
        scoring = model.fit_predict(X_train, y_train, X_test, y_test)
        fit_predict_time += time() - tstart
        fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)
        f = interp1d(fpr_, tpr_)
        tpr += f(x_axis)
        tpr[0] = 0.
        precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

        # cluster: old version of scipy -> interpol1d needs sorted x_input