def main():
    split = 0.3
    p = optparse.OptionParser()
    # take training data set
    p.add_option('--train_dataset', '-i', default='/afs/cern.ch/user/s/sganju/private/2014_target.csv')
    # specify target column
    p.add_option('--target', '-y', default="target")
    # parse inputs
    options, arguments = p.parse_args()
    # split different numerical values
    # load from files
    train = pd.read_csv(options.train_dataset)
    data = train[["id", "cpu", "creator", "dbs", "dtype", "era", "nblk", "nevt", "nfiles", "nlumis",
                  "nrel", "nsites", "nusers", "parent", "primds", "proc_evts", "procds", "rnaccess",
                  "rnusers", "rtotcpu", "size", "tier", "totcpu", "wct"]]
    # load target values
    target = train['target']
    # TRAINING DATA SET
    features_train, features_test, target_train, target_test = train_test_split(data, target, test_size=split, random_state=0)
    # differentiate on the basis of type of problem
    # RANDOM FOREST CLASSIFIER
    rf = RandomForestClassifier(n_estimators=100)
    rf = rf.fit(features_train, target_train)
    cal_score_accuracy("RANDOM FOREST CLASSIFIER", rf, features_test, target_test)
    # test data set then make predictions
    test = pd.read_csv('dataframe-20130101-20130107-TARGET.csv')
    test = test[["id", "cpu", "creator", "dbs", "dtype", "era", "nblk", "nevt", "nfiles", "nlumis",
                 "nrel", "nsites", "nusers", "parent", "primds", "proc_evts", "procds", "rnaccess",
                 "rnusers", "rtotcpu", "size", "tier", "totcpu", "wct"]]
    predictions = rf.predict_proba(test)
def init_turns_module(values, trees, data, labels):
    # Fit regression model
    global turns_regr
    turns_regr = RandomForestClassifier(n_estimators=trees)
    turns_regr.fit(data[:, [0, 1]], labels)
    print "init_turns, importances: ", turns_regr.feature_importances_
    return
def prediction_confusion_matrix():
    df = get_data()
    X = df.ix[:, (df.columns != 'class') & (df.columns != 'code')].as_matrix()  # this gives a numpy array
    print X
    y1 = df.ix[:, df.columns == 'class'].as_matrix()
    y = y1.reshape(683, 1)
    Y = binary(y)
    # split into test and training sets
    features_train, features_test, outcome_train, outcome_test = cv.train_test_split(X, Y, test_size=0.4, random_state=1)
    # Random Forest Classifier for the classification.
    forest = RandomForestClassifier(n_estimators=10, min_samples_leaf=10, criterion='gini',
                                    max_features='auto', max_depth=None)
    forest = forest.fit(features_train, outcome_train)
    predicted = forest.predict(features_test)
    # Confusion matrix
    confusion_matrix = metrics.confusion_matrix(outcome_test, predicted)
    output = {
        'Random Forest Classifier': {
            'fp': confusion_matrix[0][1],
            'tp': confusion_matrix[1][1],
            'fn': confusion_matrix[1][0],
            'tn': confusion_matrix[0][0]
        }
    }
    return jsonify(output)
def rand_forest(train_bow, train_labels, test_bow, test_labels, bow_indexes):
    print("Training rndForest")
    rf_classifier = RandomForestClassifier()
    rf_classifier.fit(train_bow, train_labels)
    print("Testing rndForest")
    test(rf_classifier, "rf", test_bow, test_labels, bow_indexes)
def __init__(self, data, classes, tree_features, n_trees=100): self.n_features = np.shape(data)[1] n_rows = np.shape(data)[0] n_nans = np.sum(np.isnan(data), 0) data = data[:, n_nans < n_rows] self.n_features = np.shape(data)[1] n_nans = np.sum(np.isnan(data), 1) data = data[n_nans < self.n_features, :] self.n_rows = np.shape(data)[0] if (tree_features > self.n_features): tree_features = self.n_features self.col_list = np.zeros((n_trees, tree_features), dtype='int') self.n_trees = n_trees self.bags = [] for i in range(n_trees): cols = sample(range(self.n_features), tree_features) cols.sort() self.col_list[i, :] = cols data_temp = data[:, cols] n_nans = np.sum(np.isnan(data_temp), 1) data_temp = data_temp[n_nans == 0, :] classes_temp = classes[n_nans == 0] #bag = BaggingClassifier(n_estimators=1, max_features=tree_features) bag = RandomForestClassifier(n_estimators=1, max_features=tree_features) bag.fit(data_temp, classes_temp) self.bags.append(bag) print(np.shape(data_temp))
def train_and_predict(): print('Converting data...') config.X = np.array(config.X) config.Y = np.array(config.Y) config.X_test = np.array(config.X_test) #print(config.X.shape) #print(config.Y.shape) #print(config.X_test.shape) print('Training...') print('Time Elapsed: ' + str((time.time() - config.start_time)/60)) num_classes = len(config.Y[1, :]) for i in range(num_classes): print('Creating Classifier: ', i) rf = RandomForestClassifier(n_estimators=500, max_depth=5, n_jobs=-1, oob_score=True, verbose=2, criterion="entropy") gbm = xgb.XGBClassifier(n_estimators=500, objective='binary:logistic') print('Fitting Random Forest Classifier: ', i) rf.fit(config.X, config.Y[:, i]) print('Fitting With XGBoost Classifier: ', i) gbm.fit(config.X, config.Y[:, i]) print('Getting Random Forest Predictions for attribute: ', i) y_pred_rf = rf.predict(config.X_test) config.Y_pred_rf.append(y_pred_rf) print(y_pred_rf) print('Getting XGBoost Predictions for attribute: ', i) y_pred_xgb = gbm.predict(config.X_test) config.Y_pred_xgb.append(y_pred_xgb) print(y_pred_xgb)
def fit_rf(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
           inverse=False, lc_filter=None):
    """
    path: path of the dataset to use for training
    index_filter: pandas index used to filter the dataset rows to be used
    class_filter: list of classes to be used
    feature_filter: list of features to be used
    """
    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        if inverse:
            train_index, test_index = test_index, train_index

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                     max_depth=14, min_samples_split=5)
        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(results)
def crossval_roc(X, y):
    cv = StratifiedKFold(y, n_folds=10)
    clf = RandomForestClassifier()
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i, (train, test) in enumerate(cv):
        fitted = clf.fit(X[train], y[train])
        probas_ = fitted.predict_proba(X[test])
        scored_ = fitted.predict(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        # roc_auc = auc(fpr, tpr)
        roc_auc = roc_auc_score(y[test], scored_, average="micro")
        # plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    return plt.plot(mean_fpr, mean_tpr, label='Mean ROC (area = %0.2f)' % mean_auc, lw=1)
def get_randomforest_classifier(X_train, y_train, params=None):
    param_grid = {"max_depth": [4, 5, 6, 7],
                  "max_features": [3, 5],
                  "criterion": ["gini", "entropy"]}
    if params is None:
        log = RandomForestClassifier()
        t = start("training random forest ")
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.2, random_state=123)
        clf = grid_search.GridSearchCV(log, param_grid, cv=cv, n_jobs=4, scoring='roc_auc')
        clf = clf.fit(X_train, y_train)
        report(t, nitems=10 * len(param_grid))
        print("Best score: {} with scorer {}".format(clf.best_score_, clf.scorer_))
        print("With parameters:")
        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(param_grid.keys()):
            print('\t%s: %r' % (param_name, best_parameters[param_name]))
    else:
        clf = RandomForestClassifier(**params)
        clf = clf.fit(X_train, y_train)
    return clf
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard', refit=False)
    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft', refit=False)
    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
def random_forest_classify(train_data, train_label, test_data):
    rf = RandomForestClassifier(n_estimators=100)
    rf.fit(train_data, ravel(train_label))
    test_label = rf.predict(test_data)

    save_result(test_label, 'sklearn_random_forest_classify_Result.csv')
    return test_label
def predict_rf(train_features, test_features, train_labels, test_labels):
    model = RandomForestClassifier(n_estimators=1000)
    model.fit(train_features, train_labels)
    predictions = model.predict(train_features)
    print get_accuracy(predictions, train_labels)
    predictions = model.predict(test_features)
    print get_accuracy(predictions, test_labels)
def train_model_on_gestures(wav_list): gestures = {'vattene':0, 'vieniqui':1, 'perfetto':2, 'furbo':3, 'cheduepalle':4, 'chevuoi':5, 'daccordo':6, 'seipazzo':7, 'combinato':8, 'freganiente':9, 'ok':10, 'cosatifarei':11, 'basta':12, 'prendere':13, 'noncenepiu':14, 'fame':15, 'tantotempo':16, 'buonissimo':17, 'messidaccordo':18, 'sonostufo':19} dataX = [] i = 0 for wav in wav_list: path = re.sub('\_audio.wav$', '', wav) print '\n', '##############' print path[-25:] sample = VideoMat(path, True) sk = Skelet(sample) rate, data = get_data(wav) data_frame = np.asarray(create_features(data, sample.labels, sample.numFrames, sk)) #print 'data_frame !', data_frame.shape #data_frame2 = np.asarray(Head_inter(path, sample.labels).data_frame) #data_frame = np.hstack((data_frame, data_frame2)) dataX += copy.copy(data_frame) # 1 target / 19 * 6 joints infos / 8 Head/Hand distances / 5 Head box = 128 features #Train model: Don't use the Head box features, don't really improve the model data_frame = np.asarray(dataX) Y = data_frame[:, 0] Y = np.asarray([gestures[i] for i in Y]) X = data_frame[:, 1:] X = X.astype(np.float32, copy=False) X = X[:, :122] clf = RandomForestClassifier(n_estimators=300, criterion='entropy', min_samples_split=10, min_samples_leaf=1, verbose=2, random_state=1) #n_jobs=2 clf = clf.fit(X, Y) pickle.dump(clf, open('gradient_boosting_model_gestures.pkl','wb'))
def run():
    mean_acc = 0.0
    mean_logloss = 0.0
    skf, X_all, labels = gen_cv()
    for fold, (test_index, train_index) in enumerate(skf, start=1):
        logger.info('at fold: {0}'.format(fold))
        logger.info('train samples: {0}, test samples: {1}'.format(len(train_index), len(test_index)))
        X_train, X_test = X_all[train_index], X_all[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        rfc = RandomForestClassifier(n_jobs=10, random_state=919)
        rfc.fit(X_train, y_train)

        y_test_predicted = rfc.predict(X_test)
        y_test_proba = rfc.predict_proba(X_test)

        # equals = y_test == y_test_predicted
        # acc = np.sum(equals) / float(len(equals))
        acc = accuracy_score(y_test, y_test_predicted)
        logger.info('test data predicted accuracy: {0}'.format(acc))

        # log loss: -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
        logloss = log_loss(y_test, y_test_proba)
        logger.info('log loss at test data: {0}'.format(logloss))
        # logger.info('log loss at test data using label: {0}'.format(log_loss(y_test, y_test_predicted)))

        mean_acc += acc
        mean_logloss += logloss

    n_folds = skf.n_folds
    logger.info('mean acc: {0}'.format(mean_acc / n_folds))
    logger.info('mean log loss: {0}'.format(mean_logloss / n_folds))
def cls_create(xs, ys): if algo == "SVM": classifier = svm.SVC(C = self.parm, probability=True) elif algo == "RF": classifier = RandomForestClassifier(n_estimators = int(self.parm), criterion='entropy', n_jobs = 1) # #classifier = LDA() new_xs = xs """ positive_count = len([y for y in ys if y > 0]) if positive_count >= 20: #self.selector = svm.LinearSVC(C = 1, dual = False, penalty="l1") self.selector = LDA() new_xs = self.selector.fit_transform(xs, ys) else: self.selector = None """ classifier.fit(new_xs, ys) probs = classifier.predict_proba(new_xs) #self.pclassifier = svm.SVC(parm_val = 1.0) #self.pclassifier.fit(probs, ys) self.threshold, self.positive, self.negative = best_threshold_for_f1(probs, 20, ys) return classifier
def brute_force_acc_rd(features_train, labels_train, features_test, labels_test, ids): #0.818181818182 clf = RandomForestClassifier(bootstrap=True, criterion='entropy', max_depth=None, max_features=2, max_leaf_nodes=16, min_samples_split=10, n_estimators=1000, n_jobs=-1, oob_score=False) clf = clf.fit(features_train, labels_train) # print(clf.best_estimator_) pred = clf.predict(features_test) acc = accuracy_score(labels_test, pred) #print pred # if(acc > 0.80): # print acc t0 = time.time() print acc feature_importance = clf.feature_importances_ # feature_importance = 100.0 * (feature_importance / feature_importance.max()) # print feature_importance if(acc > 0.815): data_train.to_csv("data_train{}.tst".format(round(acc,5)), "\t") feature_importance = 100.0 * (feature_importance / feature_importance.max()) print feature_importance if(acc > 0.819): predictions_file = open("data/canivel_random_forest_819.csv", "wb") predictions_file_object = csv.writer(predictions_file) predictions_file_object.writerow(["PassengerId", "Survived"]) predictions_file_object.writerows(zip(ids, pred)) predictions_file.close() print ("NEW FILE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! YEA!!!!") return acc
def model_and_predict(self, X_train, y_train, X_test): district_idx = self.columns.index('PdDistrict') districts = set(X_train[:,district_idx]) district_ys = {} # Grow forest and predict separately for each district's records for d in districts: district_X_train = X_train[X_train[:, district_idx] == d] district_X_train = np.delete(district_X_train, district_idx, 1) district_y_train = y_train[X_train[:, district_idx] == d] district_X_test = X_test[X_test[:, district_idx] == d] district_X_test = np.delete(district_X_test, district_idx, 1) print "Growing forest for", d # Not saving output in Git so make this deterministic # with random_state rf = RandomForestClassifier(n_estimators=self.n_trees, n_jobs=-1, random_state=782629) rf.fit(district_X_train, district_y_train) district_ys[d] = list(rf.predict(district_X_test)) print "Finished", d print "All predictions made" y_hat = [] for row in X_test: d_ys = district_ys[row[district_idx]] y_hat.append(d_ys.pop(0)) return y_hat
def main():
    X, y = datasets.make_moons(n_samples=200, shuffle=True, noise=0.1, random_state=None)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    for i in range(8):
        clf = RandomForestClassifier(n_estimators=2**i)
        clf.fit(X, y)
        plot_surface(clf, X, y)
def buildModel(df):
    train_y = df['arr_del15'][:train_len]
    train_x = df[cols][:train_len]
    # transform categorical features
    train_x['unique_carrier'] = pd.factorize(train_x['unique_carrier'])[0]
    train_x['dep_conditions'] = pd.factorize(train_x['dep_conditions'])[0]
    train_x['arr_conditions'] = pd.factorize(train_x['arr_conditions'])[0]
    pd.set_option('display.max_rows', 500)
    print(train_x)
    # train_x['origin'] = pd.factorize(train_x['origin'])[0]
    # train_x['dest'] = pd.factorize(train_x['dest'])[0]
    # print(train_x)
    train_x = enc.fit_transform(train_x)
    print(train_x.shape)
    # Create Random Forest classifier with 50 trees
    clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    clf_rf.fit(train_x.toarray(), train_y)
    del train_x, train_y
    print("Model built")
    return clf_rf
def test_save_prediction(self):
    model = RandomForestClassifier()
    model.id = get_model_id(model)
    model.fit(self.iris.data, self.iris.target)
    indexes = np.fromfunction(lambda x: x, (self.iris.data.shape[0], ), dtype=np.int32)
    saving_predict_proba(model, self.iris.data, indexes)
    os.remove('RandomForestClassifier_r0_N__m5_0p0__m4_2__m1_auto__m0_N__m3_1__m2_N__n0_10__b0_1__c1_gini__c0_N_0_149.csv')
def __init__(self): super(ClassifyDriver, self).__init__() if CLASSIFIER == "SVM": self.driver = svm.SVC() elif CLASSIFIER == "GBC": self.driver = GradientBoostingClassifier(n_estimators=300, max_depth=5, learning_rate=0.05) elif CLASSIFIER == "RFC": self.driver = RandomForestClassifier(n_estimators=N_ESTIMATORS, n_jobs=N_JOBS) else: raise Exception("Classifier %s not supported" % CLASSIFIER) genuineX = [] forgeryX = [] genuineY = [] forgeryY = [] # Training process for sigs in self.train_set: personTrain = PersonTraining(sigs) genuine, forgery = personTrain.calc_train_set() genuineX.extend(genuine) forgeryX.extend(forgery) genuineY = [1] * len(genuineX) forgeryY = [0] * len(forgeryX) trainX = genuineX + forgeryX trainY = genuineY + forgeryY self.driver.fit(trainX, trainY)
def rforests(trainx, trainy, test, n_estimators=100, k=5):
    trainy = np.ravel(trainy)

    forest = RandomForestClassifier(n_estimators)
    forest.fit(trainx, trainy)

    prob_train = forest.predict_proba(trainx)
    prob_test = forest.predict_proba(test)

    # Since the column index is the number of the country that's been chosen,
    # we can use argsort to get the top 5; we have to do this
    # for the entire matrix though.
    sort_train = np.argsort(prob_train)[:, -k:]
    sort_test = np.argsort(prob_test)[:, -k:]

    # Now we need to transform these back to countries, but to map I need to
    # have a dataframe.
    col_names = []
    for i in range(k):
        name = "country_destination_" + str(i + 1)
        col_names.append(name)

    pred_train = pd.DataFrame(sort_train, columns=col_names)
    pred_test = pd.DataFrame(sort_test, columns=col_names)

    for name in col_names:
        pred_train[name] = pred_train[name].map(dicts.country)
        pred_test[name] = pred_test[name].map(dicts.country)

    pred_train = np.fliplr(pred_train)
    pred_test = np.fliplr(pred_test)

    return forest, pred_train, pred_test
def cross_validate():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Cross-Validating")
    rf = RandomForestClassifier(n_estimators=10, verbose=1, compute_importances=True, n_jobs=2)
    cv = cross_validation.KFold(len(data), k=10, indices=False)
    results = []
    for traincv, testcv in cv:
        print "\t-- cv [%d]" % len(results)
        print "\t", "extracting features"
        # ...
        feacv = features.extract_features(feature_names, traincv)
        print "\t", "learning"
        rf.fit(feacv, data["OpenStatus"])
        print "\t", "predicting"
        probs = rf.predict_proba(testcv)
        print "\t", "evaluating"
        results.append(llfun(target[testcv], [x["OpenStatus"] for x in probs]))
    print "LogLoss: " + str(np.array(results).mean())
def get_preds(features, trees=3000, depth=19): # features is the number of latents features that I want the nmf to run on # Create dataframes df = get_nmf(k=features) df_full = add_yahoo_to_df(df) df_train = add_dummies(df_full) # Why aren't you using df_full? df_test = get_nmf('data_wednesday', k=features) # put in folder name where the json data is df_test_full = add_yahoo_to_df(df_test) df_test_full = add_dummies(df_test_full) # Create models X_model_class, y_model_class = get_classifier_data(df_full) rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth) rf_class.fit(X_model_class, y_model_class) # X_model_regress, y_model_regress = get_regressor_data(df_full) rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth) rf_regress.fit(X_model_regress, y_model_regress) # Get X and y values X_classify, y_classify = get_classifier_data(pd.DataFrame(df_test_full.ix['2016-04-11'])) X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.ix['2016-04-11'])) # Run models classifier_preds = rf_class.predict(X_classify) classifier_accuracy = accuracy_score(classifier_preds, y_classify) regressor_preds = rf_regress.predict(X_regress) regressor_mse = mean_squared_error(regressor_preds, y_regress) # I want to return the number of features, k, along with the accuracy of the classifier # and the MSE of the regressor. This will give me an idea of how well things are doing # based on the number of features. return [features, classifier_accuracy, regressor_mse]
def myforest(train, test, trees=250):
    # Training data prep ----------------------------------------------------------------
    csv_file_object = csv.reader(open(train, 'rb'))  # Load in the training csv file
    header = csv_file_object.next()                  # Skip the first line as it is a header
    output_header = header[0:2]
    train_data = []
    for row in csv_file_object:                      # Skip through each row in the csv file
        train_data.append(row[1:])                   # adding each row to the data variable
    train_data = np.array(train_data)                # Then convert from a list to an array

    # Test data prep --------------------------------------------------------------------
    test_file_object = csv.reader(open(test, 'rb'))  # Load in the test csv file
    header = test_file_object.next()                 # Skip the first line as it is a header
    test_data = []                                   # Create a variable called 'test_data'
    ids = []
    for row in test_file_object:                     # Skip through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:])                    # adding each row to the data variable
    test_data = np.array(test_data)                  # Then convert from a list to an array

    # Train the forest
    print 'Training'
    forest = RandomForestClassifier(n_estimators=trees)
    forest = forest.fit(train_data[0::, 1::], train_data[0::, 0])

    print 'Predicting'
    output = forest.predict(test_data)

    open_file_object = csv.writer(open("result.csv", "wb"))
    open_file_object.writerow([output_header[0], output_header[1]])
    open_file_object.writerows(zip(ids, output))
def main(): S, col_names_S = load_data(config.paths.training_data, config.paths.cache_folder) Xs, Ys, col_names_S = extract_xy(S, col_names_S) a = RandomForestClassifier(n_estimators=1) a.fit(Xs.toarray(), Ys.toarray().ravel()) best_features = a.feature_importances_ max_ind, max_val = max(enumerate(best_features), key=operator.itemgetter(1)) print best_features print max_ind, max_val print Xs.shape print Ys.shape param_range = [1, 3, 5, 7, 10, 15, 20, 30, 60, 80] train_scores, test_scores = validation_curve(RandomForestClassifier(criterion='entropy'), Xs, Ys.toarray().ravel(), 'n_estimators', param_range) print train_scores print test_scores train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) plt.title("Validation Curve for Random Forest") plt.xlabel("Number of Trees") plt.ylabel("Score") plt.plot(param_range, train_mean, label="Training Score", color='r') plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.2, color='r') plt.plot(param_range, test_mean, label="Test Score", color='b') plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, alpha=0.2, color='b') plt.legend(loc="best") plt.show()
def Random_Forest_classifier(train_input_data,train_output_data,test_input_data,test_output_data): tree_list = [] accuracy_percent = [] for trees in range(10,200,10): clf = RandomForestClassifier(trees) clf.fit(train_input_data,train_output_data) predicted_output = clf.predict(test_input_data) error_list = [] if isinstance(predicted_output,list) ==False: predicted_output = predicted_output.tolist() if isinstance(test_output_data,list) ==False: test_output_data = test_output_data.tolist() for i in range(len(test_output_data)): cur_univ_similarities = similar_univs[similar_univs['univName'] == predicted_output[i]] cur_univ_similarity_list = cur_univ_similarities.values.tolist() cur_univ_similarity_list = [item for sublist in cur_univ_similarity_list for item in sublist] if test_output_data[i] in cur_univ_similarity_list[1:]: error_list.append(0) else: error_list.append(1) tree_list.append(trees) accuracy_percent.append(100 -((sum(error_list)/float(len(error_list))) * 100)) tree_list = np.array(tree_list) accuracy_percent = np.array(accuracy_percent) plt.plot(tree_list,accuracy_percent) plt.xlabel('Number of trees') plt.ylabel('Percent of accuracy') plt.title('Varation of accuracy with trees') plt.grid(True) plt.savefig("rf1.png") plt.show() return predicted_output
def crossValIteration(dat,classes,cutoff,prop=0.9,reshuffle=False): if reshuffle: dat.samples = sampleReshuffle(dat) saved_samples = [i for i in dat.samples] dat.samples = ["{0}_$$_{1}".format(i,v) for i,v in enumerate(dat.samples)] train,test=dat.splitTraining(prop, classes) print test.samples selectedSampleIndicies = [int(i.split("_$$_")[0]) for i in test.samples] dat.samples = saved_samples print test.samples test.samples = [i.split("_$$_")[1] for i in test.samples] train.samples = [i.split("_$$_")[1] for i in train.samples] print "Training set has {0} samples from classes: {1}".format(len(train.samples),",".join(set(train.samples))) print "Test set has {0} samples from classes: {1}".format(len(test.samples),",".join(set(test.samples))) print "Selecting data..." # select features for each disease print "Number of selections made for each class:" print "Setting up SVM..." Xtrain = train.values.transpose() Ytrain = train.samples clf=RandomForestClassifier(n_estimators=1000) clf.fit(Xtrain,Ytrain) Xtest = test.values.transpose() Ytest = test.samples print "Predicting R-forest..." #classification results versus actual acc = zip(Ytest,clf.predict(Xtest)) # (actual,predicted)... for each sample print acc # this is the elemental form of the "result" lists processed below print sum([i[0] == i[1] for i in acc])*1.0/len(acc) return acc
def randomForest_eval_func(self, chromosome): n_estimators, max_features, window_size = self.decode_chromosome(chromosome) if self.check_log(n_estimators, max_features, window_size): return self.get_means_from_log(n_estimators, max_features, window_size)[0] folded_dataset = self.create_folded_dataset(window_size) indim = 21 * (2 * window_size + 1) mean_AUC = 0 mean_decision_value = 0 mean_mcc = 0 sample_size_over_thousand_flag = False for test_fold in xrange(self.fold): test_labels, test_dataset, train_labels, train_dataset = folded_dataset.get_test_and_training_dataset(test_fold) if len(test_labels) + len(train_labels) > 1000: sample_size_over_thousand_flag = True clf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features) clf.fit(train_dataset, train_labels) probas = clf.predict_proba(test_dataset) decision_values = map(lambda x: x[1], probas) # Probability of being binding residue AUC, decision_value_and_max_mcc = validate_performance.calculate_AUC(decision_values, test_labels) mean_AUC += AUC mean_decision_value += decision_value_and_max_mcc[0] mean_mcc += decision_value_and_max_mcc[1] if sample_size_over_thousand_flag: break if not sample_size_over_thousand_flag: mean_AUC /= self.fold mean_decision_value /= self.fold mean_mcc /= self.fold self.write_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc) self.add_log(n_estimators, max_features, window_size, mean_AUC, mean_decision_value, mean_mcc) return mean_AUC
def algo(a): global data global week target = data['target'] data = data[["id", "cpu", "creator", "dbs" , "dtype" , "era" , "nblk" , "nevt" , "nfiles" , "nlumis" , "nrel" , "nsites" , "nusers" , "parent" , "primds" , "proc_evts" , "procds" , "rnaccess" , "rnusers" , "rtotcpu" , "size" , "tier" , "totcpu" , "wct", "naccess"]] week['target'] = 0 week['target'] = week.apply(convert, axis=1) week['target'] = week['target'].astype(int) test1 = week week = week[["id", "cpu", "creator", "dbs" , "dtype" , "era" , "nblk" , "nevt" , "nfiles" , "nlumis" , "nrel" , "nsites" , "nusers" , "parent" , "primds" , "proc_evts" , "procds" , "rnaccess" , "rnusers" , "rtotcpu" , "size" , "tier" , "totcpu" , "wct", "naccess"]] if a == 'rf': #RANDOM FOREST CLASSIFIER rf = RandomForestClassifier(n_estimators=100) rf = rf.fit(data, target) predictions = rf.predict(week) cal_score("RANDOM FOREST", rf, predictions, test1['target']) if a == "sgd": #SGD CLASSIFIER clf = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False) clf.fit(data, target) predictions = clf.predict(week) cal_score("SGD Regression",clf, predictions, test1['target']) if a == "nb": clf = GaussianNB() clf.fit(data, target) predictions = clf.predict(week) cal_score("NAIVE BAYES", clf, predictions, test1['target'])
# get numeric columns if use_numeric: x_train, x_train_num = numeric_filter(x_train) x_test, x_test_num = numeric_filter(x_test) fitted_dfs,fitted_encoders,encoder_names = categ_encoder( x_train, y_train[target_col],cols=x_train.columns, encoders=('target','count') ) if use_numeric: fitted_dfs.append(x_train_num) train_feat = pd.concat(fitted_dfs,axis=1) train_label = y_train[target_col] #modeling: logger.writelines('{}: start modeling...\n'.format(time.ctime())) rf = RandomForestClassifier(200,max_depth=5) rf.fit(train_feat,train_label) #validation: logger.writelines('{}: start evaluation...\n'.format(time.ctime())) test_feats = [en.transform(x_test) for en in fitted_encoders] if use_numeric: test_feats.append(x_test_num) test_feat = pd.concat(test_feats,axis=1) test_label = y_test[target_col] train_res = eval_formater(evalution(rf,train_feat,train_label)) test_res = eval_formater(evalution(rf,test_feat,test_label)) logger.writelines('{}: evaluation done.\n'.format(time.ctime())) write_model_info(logger,rf,train_feat) logger.writelines('train:\n----\n') logger.writelines(train_res) logger.writelines('validation:\n----\n')
pred_comb[(self.pred[1] == 1) & (self.pred[2] != 1) & (self.pred[3] != 1) & (self.pred[4] != 1)] = 1 pred_comb[(self.pred[1] != 1) & (self.pred[2] == 1) & (self.pred[3] != 1) & (self.pred[4] != 1)] = 2 pred_comb[(self.pred[1] != 1) & (self.pred[2] != 1) & (self.pred[3] == 1) & (self.pred[4] != 1)] = 3 pred_comb[(self.pred[1] != 1) & (self.pred[2] != 1) & (self.pred[3] != 1) & (self.pred[4] == 1)] = 4 #pred_comb[(pred[1] == 1) & (pred[2] == 1) & (pred[3] != 1) & (pred[4] != 1)] = 1 pred_comb[(self.pred[1] == 1) & (self.pred[2] != 1) & (self.pred[3] == 1) & (self.pred[4] != 1)] = 1 #pred_comb[(pred[1] == 1) & (pred[2] != 1) & (pred[3] != 1) & (pred[4] == 1)] = 1 #pred_comb[(pred[1] != 1) & (pred[2] == 1) & (pred[3] == 1) & (pred[4] != 1)] = 1 pred_comb[(self.pred[1] != 1) & (self.pred[2] == 1) & (self.pred[3] != 1) & (self.pred[4] == 1)] = 2 #pred_comb[(pred[1] != 1) & (pred[2] != 1) & (pred[3] == 1) & (pred[4] == 1)] = 1 return pred_comb #Define final classifier clf_list = [RandomForestClassifier(n_estimators=27, max_features=11), RandomForestClassifier(n_estimators=24, max_features=10), RandomForestClassifier(n_estimators=25, max_features=9), RandomForestClassifier(n_estimators=22, max_features=10)] clf = voting(clf_list,.5) #Split training and testing sets X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=.2) clf.fit(X_train,y_train) #Predict labels pred_comb = clf.predict(X_test) #Compute score of full classifier score = cross_validation.cross_val_score(clf,X,y,cv = cv) #Compute score of each expert ind_scores = [cross_validation.cross_val_score(clf_list[i],X,y_all[i],cv = cv) for i in range(0,len(clf_list))]
return res xgb_params = {} xgb_params['objective'] = 'binary:logistic' xgb_params['learning_rate'] = 0.04 xgb_params['n_estimators'] = 490 xgb_params['max_depth'] = 4 xgb_params['subsample'] = 0.9 xgb_params['colsample_bytree'] = 0.9 xgb_params['min_child_weight'] = 10 # RandomForest params rf_params = {} rf_params['n_estimators'] = 200 rf_params['max_depth'] = 6 rf_params['min_samples_split'] = 70 rf_params['min_samples_leaf'] = 30 xgb_model = XGBClassifier(**xgb_params) rf_model = RandomForestClassifier(**rf_params) log_model = LogisticRegression() stack = Ensemble(n_splits=3, stacker = log_model, base_models = (xgb_model, rf_model)) y_pred = stack.fit_predict(train, target_train, test)
scaler.fit(X_train)
x_train_scalednew = scaler.transform(X_train)
# (labels are not scaled; the feature scaler only applies to X)
print("transformed shape: %s" % (x_train_scalednew.shape, ))
print("per feature min before scaling: %s" % X_train.min(axis=0))
print("per feature max before scaling: %s" % X_train.max(axis=0))
print("per feature min after scaling: %s" % x_train_scalednew.min(axis=0))
print("per feature max after scaling: %s" % x_train_scalednew.max(axis=0))

x_test_scalednew = scaler.transform(X_test)
print("per-feature min after scaling: %s" % x_test_scalednew.min(axis=0))
print("per-feature max after scaling: %s" % x_test_scalednew.max(axis=0))

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=50)
model.fit(X_train, y_train)
acc_train = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
cl = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
categories = [ 'rec.sport.hockey', 'sci.med', 'soc.religion.christian', 'talk.religion.misc' ] newsgroups_train = load_files( 'C:\\Users\\gaura\\Desktop\\Course Material\\Artificial Intelligence - 537\\Assignments\\HW3\\Selected 20NewsGroup\\Training', encoding='latin-1') newsgroups_test = load_files( 'C:\\Users\\gaura\\Desktop\\Course Material\\Artificial Intelligence - 537\\Assignments\\HW3\\Selected 20NewsGroup\\Test', encoding='latin-1') clf_nb = MultinomialNB(alpha=.01) clf_lr = LogisticRegression() clf_svc = LinearSVC() clf_rf = RandomForestClassifier() i, NB_results = split_test_classifier(clf_nb, newsgroups_train.data, newsgroups_test.data, newsgroups_train.target, newsgroups_test.target) i, LR_results = split_test_classifier(clf_lr, newsgroups_train.data, newsgroups_test.data, newsgroups_train.target, newsgroups_test.target) i, SVM_results = split_test_classifier(clf_svc, newsgroups_train.data, newsgroups_test.data, newsgroups_train.target, newsgroups_test.target)
def randomForest(trainVec, trainScore):
    model = RandomForestClassifier(max_depth=None)  # remove the max-depth limit, to prevent overfitting
    model.fit(trainVec, trainScore)
    return model
def validate_fold(X_train, X_test, Y_train, Y_test, Q_vec, weights, evaluator,
                  retrieval_method, **kwargs):
    """Perform validation on one fold of the data

    This function evaluates a retrieval method on one split of a dataset.

    Parameters
    ----------
    X_train : pd.DataFrame, shape = [n_train_samples, codebook_size]
        Training data.
    X_test : pd.DataFrame, shape = [n_test_samples, codebook_size]
        Test data.
    Y_train : pd.DataFrame, shape = [n_train_samples, n_classes]
        Training tags.
    Y_test : pd.DataFrame, shape = [n_test_samples, n_classes]
        Test tags.
    Q_vec : array-like, shape = [n_queries, n_classes]
        The queries to evaluate.
    weights : array-like, shape = [n_queries]
        Query weights. Multi-word queries can be weighted to reflect
        importance to users.
    evaluator : object
        An instance of :class:`cbar.evaluation.Evaluator`.
    retrieval_method : str, 'loreta', 'pamir', or 'random-forest'
        The retrieval method to be evaluated.
    kwargs : key-value pairs
        Additional keyword arguments are passed to the retrieval methods.

    Returns
    -------
    params : dict
        The ``retrieval_method``'s parameters used for the evaluation
    """
    if retrieval_method in LETOR:
        method = dict(pamir=PAMIR, loreta=LoretaWARP).get(retrieval_method)
        letor = method(**kwargs)
        letor.fit(X_train, Y_train, Q_vec, X_test, Y_test)
        Y_score = letor.predict(Q_vec, X_test)
        params = letor.get_params()
    elif retrieval_method == 'random-forest':
        rf = RandomForestClassifier(class_weight='balanced', **kwargs)
        clf = OneVsRestClassifier(rf, n_jobs=-1)
        clf.fit(X_train, Y_train)
        model_score = standardize(clf.predict_proba(X_test))
        Y_score = Q_vec.dot(model_score.T)
        params = clf.estimator.get_params()
    else:
        raise ValueError('Unknown retrieval method.')

    n_relevant = make_relevance_matrix(Q_vec, Y_train).sum(axis=1)
    evaluator.eval(Q_vec, weights, Y_score, Y_test, n_relevant)
    return params
y = dataset.iloc[:, 4].values # Splitting the dataset into the Training set and Test set from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0) # Feature Scaling # Not needed; though, included due to the scaled plotting later from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # Fitting DTC to the Training set from sklearn.ensemble import RandomForestClassifier classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0) classifier.fit(X_train, y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred) # Type in 'cm' in console to print confusion matrix # ([63, 5] 63+29 = 92 (Correct Predictions) # [3, 29]) 3+5 = 8 (Incorrect Predictions) # Visualising the Training set results from matplotlib.colors import ListedColormap
'alpha':[0.0001,0.001,0.01] } best_acc_params_mlp=HyperParamsResultsPlot(BestParams_GridSearchCV(mlp,X_train,y_train,hyper_params_mlp,folds),"Multilayer Perceptron") ''' """#### **Best Models :** ##### **Best Random Forest Model :** """ print('Running Best Candidate Models after Hyperparameter Tunning') # best rf model best_rf = RandomForestClassifier(random_state=random_state, max_depth=12, max_features=15, min_samples_leaf=1, min_samples_split=5, n_estimators=200) #rf_test_pred=main_results(best_rf,X_train,X_val,X_test,y_train,y_val) #submission(rf_test_pred) """##### **Best XGBoost Model :**""" # best xgboost model best_xg = XGBClassifier(class_weight='balanced', max_depth=8, max_features=10, min_child_weight=1, min_samples_split=5, n_estimators=300, random_state=random_state) #xg_test_pred=main_results(best_xg,X_train,X_val,X_test,y_train,y_val)
class RandomForest: def __init__(self, criterion, max_features, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, bootstrap, max_leaf_nodes, min_impurity_decrease, random_state=None, n_jobs=1, class_weight=None, **kwargs): self.n_estimators = self.get_max_iter() self.criterion = criterion self.max_features = max_features self.max_depth = max_depth self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.bootstrap = bootstrap self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.random_state = random_state self.n_jobs = n_jobs self.class_weight = class_weight self.estimator = None @staticmethod def get_max_iter(): return 100 def get_current_iter(self): return self.estimator.n_estimators def fit(self, X, y, sample_weight=None): from sklearn.ensemble import RandomForestClassifier if self.estimator is None: self.n_estimators = int(self.n_estimators) if check_none(self.max_depth): self.max_depth = None else: self.max_depth = int(self.max_depth) self.min_samples_split = int(self.min_samples_split) self.min_samples_leaf = int(self.min_samples_leaf) self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf) if self.max_features not in ("sqrt", "log2", "auto"): max_features = int(X.shape[1] ** float(self.max_features)) else: max_features = self.max_features self.bootstrap = check_for_bool(self.bootstrap) if check_none(self.max_leaf_nodes): self.max_leaf_nodes = None else: self.max_leaf_nodes = int(self.max_leaf_nodes) self.min_impurity_decrease = float(self.min_impurity_decrease) # initial fit of only increment trees self.estimator = RandomForestClassifier( n_estimators=self.get_max_iter(), criterion=self.criterion, max_features=max_features, max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, bootstrap=self.bootstrap, max_leaf_nodes=self.max_leaf_nodes, min_impurity_decrease=self.min_impurity_decrease, random_state=self.random_state, n_jobs=self.n_jobs, class_weight=self.class_weight, warm_start=True) self.estimator.fit(X, y, sample_weight=sample_weight) return self def predict(self, X): if self.estimator is None: raise NotImplementedError() return self.estimator.predict(X) @staticmethod def get_cs(): cs = ConfigurationSpace() criterion = CategoricalHyperparameter( "criterion", ["gini", "entropy"], default_value="gini") # The maximum number of features used in the forest is calculated as m^max_features, where # m is the total number of features, and max_features is the hyperparameter specified below. # The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This # corresponds with Geurts' heuristic. max_features = UniformFloatHyperparameter( "max_features", 0., 1., default_value=0.5) max_depth = UnParametrizedHyperparameter("max_depth", "None") min_samples_split = UniformIntegerHyperparameter( "min_samples_split", 2, 20, default_value=2) min_samples_leaf = UniformIntegerHyperparameter( "min_samples_leaf", 1, 20, default_value=1) min_weight_fraction_leaf = UnParametrizedHyperparameter("min_weight_fraction_leaf", 0.) 
max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None") min_impurity_decrease = UnParametrizedHyperparameter('min_impurity_decrease', 0.0) bootstrap = CategoricalHyperparameter( "bootstrap", ["True", "False"], default_value="True") cs.add_hyperparameters([criterion, max_features, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, bootstrap, min_impurity_decrease]) return cs
'min_samples_leaf': [10, 15, 20, 30], 'criterion': ['gini', 'entropy'], 'max_depth': [10, 15, 20], 'min_samples_split': [10, 20], 'n_jobs': [-1], 'verbose': [1]} start_time = time.time() scores = ['accuracy', 'f1_macro', 'roc_auc_ovo'] for score in scores: print("Tuning for %s" % score) print("----------------------------------") rf_new = ms.HalvingRandomSearchCV(RandomForestClassifier(), param_grid, scoring='%s' % score) rf_new.fit(X_train, Y_train) print("Best parameters set found is:") print(rf_new.best_params_) print("Grid scores on training set:") means = rf_new.cv_results_['mean_test_score'] stds = rf_new.cv_results_['std_test_score'] print("Average scores are ", means) print("SD for the scores are ", stds) print("Detailed classification report:") y_true, y_pred = Y_test, rf_new.predict(X_test)
def __init__(self):
    self.clf = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('rf', RandomForestClassifier(max_depth=5, n_estimators=10))
    ])
with open(malicious_ips_path17, 'r') as fp:
    malicious_ips_ls17 = json.load(fp)['malicious_ips']
with open(malicious_ips_path18, 'r') as fp:
    malicious_ips_ls18 = json.load(fp)['malicious_ips']

featurename_csv_path = './data/ls17_rfe_features_sorted_by_importance.txt'
host_split_path = './data/host_train_test_IP_splits_2905.pickle'
with open(host_split_path, 'rb') as fp:
    host_splits = pickle.load(fp)

##################
### ESTIMATORS ###
##################
rf = RandomForestClassifier(n_jobs=6, random_state=0)
rf_tuned = RandomForestClassifier(n_jobs=6, random_state=0, n_estimators=128, max_depth=10)
svm = svm.LinearSVC(loss='hinge', random_state=0)
knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', n_jobs=4)

all_classifiers = {
    'RandomForestClassifier(n_jobs=6, random_state=0)': RandomForestClassifier(n_jobs=6, random_state=0),
    'GaussianNB()': GaussianNB(),
    'LogisticRegression(max_iter=1000, penalty=\'l2\', random_state=0, solver=\'sag\')':
predicted: List of predicted result from model """ cm=confusion_matrix(target,predicted) print ("Confusion Matrix : \n",cm) accuracy=accuracy_score(target,predicted) print ('Accuracy: {:.2f}'.format(accuracy)) sensitivity=cm[0,0]/float(cm[0,0]+cm[0,1]) print ('Sensitivity: {:.2f}'.format(sensitivity)) specificity=cm[1,1]/float(cm[1,0]+cm[1,1]) print ('Specificity: {:.2f}'.format(specificity)) fpr,tpr,thresholds=roc_curve(target,predicted) Auc_value=auc(fpr,tpr) print ('Area Under Curve: {:.2f}'.format(Auc_value)) #[Model1: Only RandomForest] rfr=RandomForestClassifier(n_estimators=500,random_state=1) rfr.fit(X_train,y_train) #fit the model with training data threshold_rf=FindOptimalCutOff(y_train,X_train,rfr) #get the threshold print ("Threshold: ",threshold_rf) y_predict=rfr.predict(X_test) #predict the testing data rf_o=pd.DataFrame() #dataFrame to record the result rf_o['y_predict_prob']=rfr.predict_proba(X_test)[:,1] #get the predicted prob of testing data rf_o['y_predict']=rf_o['y_predict_prob'].map(lambda x:1 if x>threshold_rf else 0) #get the classification getConfusionMatrix(y_test,rf_o['y_predict']) #get the confusion matrix result #[Model2: PCA+RandomForest] #Principal Components [PCA] pca=PCA() pca.fit(X_train) #find the principal components ex_var_ratio=pca.explained_variance_ratio_ # the amount of variance that each PC explains ex_var_ratio_cum=np.cumsum(np.round(ex_var_ratio,decimals=4)*100)#Cumulative Variance explains
from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.naive_bayes import GaussianNB from sklearn.svm import SVC from sklearn.externals import joblib # create all the machine learning models models = [] models.append(('LR', LogisticRegression(random_state=9))) models.append(('LDA', LinearDiscriminantAnalysis())) models.append(('KNN', KNeighborsClassifier())) models.append(('CART', DecisionTreeClassifier(random_state=9))) models.append( ('RF', RandomForestClassifier(n_estimators=num_trees, random_state=9))) models.append(('NB', GaussianNB())) models.append(('SVM', SVC(random_state=9))) # variables to hold the results and names results = [] names = [] scoring = "accuracy" # import the feature vector and trained labels h5f_data = h5py.File('output/data.h5', 'r') h5f_label = h5py.File('output/labels.h5', 'r') global_features_string = h5f_data['dataset_1'] global_labels_string = h5f_label['dataset_1']
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)

print("------ Stacking...")
estimators = [('lgbm', lgb.LGBMClassifier(objective='regression_l1', n_jobs=-1, n_estimators=1000,
                                           num_leaves=80, scale_pos_weight=0.05, verbose=2)),
              ('rf', RandomForestClassifier(random_state=123456, n_jobs=-1, max_depth=30,
                                            n_estimators=400, verbose=2)),
              ('xgboost', xgb.XGBClassifier(predictor='cpu_predictor', n_gpus=0, n_jobs=-1,
                                            n_estimators=700, eta=0.1, max_depth=10, verbose=2))]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5, verbose=2)
heart = pandas.read_csv("pc.csv")
print(heart.describe())
heart.loc[heart["heartpred"] == 2, "heartpred"] = 1
heart.loc[heart["heartpred"] == 3, "heartpred"] = 1
heart.loc[heart["heartpred"] == 4, "heartpred"] = 1
heart["slope"] = heart["slope"].fillna(heart["slope"].median())
heart["thal"] = heart["thal"].fillna(heart["thal"].median())
heart["ca"] = heart["ca"].fillna(heart["ca"].median())
print(heart.describe())

predictors = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
    "exang", "oldpeak", "slope", "ca", "thal"
]
alg = RandomForestClassifier(n_estimators=75, min_samples_split=20, min_samples_leaf=1)
kf = KFold(heart.shape[0], n_folds=10, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
    train_predictors = (heart[predictors].iloc[train, :])
    # print(train_predictors)
    # The target we're using to train the algorithm.
    train_target = heart["heartpred"].iloc[train]
    # print(train_target)
    # Training the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(heart[predictors].iloc[test, :])
    predictions.append(test_predictions)
from sklearn.model_selection import cross_val_score

train_y = train_y.ravel()
train_Y = train_Y.ravel()
test_y = test_y.ravel()

def evaluate_model(model):
    model.fit(train_x, train_y)
    print(model.score(train_x, train_y))
    print(model.score(test_x, test_y))
    cvs = cross_val_score(model, train_X, train_Y, cv=5)
    print(cvs)
    print(np.mean(cvs), np.std(cvs))

rfc = RandomForestClassifier(n_estimators=50)
evaluate_model(rfc)
'''
1.0
0.8134328358208955
[0.77653631 0.81564246 0.84269663 0.79775281 0.84180791]
0.817115441698256
'''

lr = LogisticRegression(C=2, penalty='l2', tol=1e-8)
evaluate_model(lr)
'''
0.8491171749598716
print("For Decision Trees With One Variable")
classification_model(model, traindf, predictor_var, outcome_var)
print("")
print("")

"""
The accuracy of the prediction is much much better now. Using a single predictor gives a 97%
prediction accuracy for this model but the cross-validation score is not that great.
"""

# Random Forest
predictor_var = features_mean
model = RandomForestClassifier(n_estimators=100, min_samples_split=25, max_depth=7, max_features=2)
print("For Random Forest")
classification_model(model, traindf, predictor_var, outcome_var)
print("")
print("")

"""
Using all the features improves the prediction accuracy and the cross-validation score is great.
An advantage with Random Forest is that it returns a feature importance matrix which can be
used to select features.
"""

# Selecting Top features
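# The note above says the forest's feature importances can be used to select features, and the
# snippet stops right at "Selecting Top features". A minimal sketch of that step follows; it
# assumes `model` has already been fitted on traindf[predictor_var] by classification_model(),
# and keeping the five highest-ranked features is an illustrative choice, not from the source.
import pandas as pd

importances = pd.Series(model.feature_importances_, index=predictor_var).sort_values(ascending=False)
print(importances)

# Keep the top-ranked features for a leaner follow-up model.
top_predictor_var = importances.index[:5].tolist()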
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Fitting Naive Bayes to the Training set
"""
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
"""

# Fitting Decision Tree to the Training set
"""
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
classifier.fit(X_train, y_train)
"""

# Fitting Random Forest classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

#### Accuracy = (TP + TN) / (TP + TN + FP + FN)
#### Precision = TP / (TP + FP)
#### Recall = TP / (TP + FN)
#### F1 Score = 2 * Precision * Recall / (Precision + Recall)
precision_train, recall_train, f1_score_train = precision_recall_fscore( train_truth, train_predicted) val_predicted = clf.predict(val_data) precision_val, recall_val, f1_score_val = precision_recall_fscore( val_truth, val_predicted) # print('Training set - precision:{:.3f}, recall:{:.3f}, f1_score:{:.3f}'.format(precision_train, recall_train, f1_score_train)) # print('Validation set - precision:{:.3f}, recall:{:.3f}, f1_score:{:.3f}'.format(precision_val, recall_val, f1_score_val)) print('{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}'.format( precision_train, recall_train, f1_score_train, precision_val, recall_val, f1_score_val)) print('############## RANDOM FOREST ##############') clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=10) clf.fit(train_data, train_truth) train_predicted = clf.predict(train_data) precision_train, recall_train, f1_score_train = precision_recall_fscore( train_truth, train_predicted) val_predicted = clf.predict(val_data) precision_val, recall_val, f1_score_val = precision_recall_fscore( val_truth, val_predicted) # print('Training set - precision:{:.3f}, recall:{:.3f}, f1_score:{:.3f}'.format(precision_train, recall_train, f1_score_train)) # print('Validation set - precision:{:.3f}, recall:{:.3f}, f1_score:{:.3f}'.format(precision_val, recall_val, f1_score_val)) print('{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}'.format( precision_train, recall_train, f1_score_train, precision_val, recall_val,
data = np.asarray(data_df)
label = np.asarray(label_df).flatten('F')  # change to 1D vector

scaler = joblib.load('scaler.joblib')
scaler.fit(data)
data = scaler.transform(data)

x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=4)

mlp = MLPClassifier(random_state=4)
rfc = RandomForestClassifier(random_state=4)
svc = LinearSVC()
ovr = OneVsRestClassifier(svc)

models = []
models.append(ovr)
models.append(mlp)
models.append(rfc)

kf = StratifiedKFold(n_splits=5, random_state=4)

y_pred = []
for model in models:
    model.fit(x_train, y_train)  # fit on the training split only, so the held-out test set stays unseen
    y = model.predict(x_test)
    y_pred.append(y)
    print(accuracy_score(y_test, y))
from sklearn.model_selection import train_test_split

xtra, xtes, ytra, ytes = train_test_split(
    wajah['data'],
    wajah['target'],
    test_size=.1
)
# print(len(xtra))
# print(len(xtes))
# print(xtra[0])
# print(ytra[0])

# ===============================
# random forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=40)

# train
model.fit(xtra, ytra)

# accuracy
print(model.score(xtes, ytes))

# predict
print(xtes[0])
print(model.predict([xtes[0]]))
print(ytes[0])

# ===============================
# plot
def train_classifier(p, loop_num): """ parameters: """ logging.info('Preparing metrics to classifier') # Load label images: classes_ims = [] relevance_ims = [] for t in p.training_files: classes_ims.append(load_and_resize_image(os.path.join(p.labels_folder,t), p.bigger_dim_output_size[loop_num], False, True)[0]) #classes_ims.append(load_and_resize_image(os.path.join(p.labels_folder,t), p.bigger_dim_output_size[loop_num], False, True)[0]==True) if p.is_relevance_mask: #relevance_ims.append(load_and_resize_image(os.path.join(p.relevance_masks_folder,t), p.bigger_dim_output_size[loop_num], False, True)[0]==True) relevance_ims.append(load_and_resize_image(os.path.join(p.relevance_masks_folder,t), p.bigger_dim_output_size[loop_num], False, True)[0]) if not p.is_relevance_mask: relevance_ims = [np.ones(c.shape)*255 for c in classes_ims] # Original images filters array: X = np.array([]) # Classes array: c = np.array([]) # Relevance array: r = np.array([]) # Stack filters file and label images: for i,t in enumerate(p.training_files): temp_i = (generate_all_filters(p.ims_folder, p.bigger_dim_output_size[l], p.z_size, t, p.training_files_timestamp[i], p.channel_num))[0] # If requested output size bigger than image: if not temp_i: if loop_num!=0: logging.warning('Requested output size is bigger than original image. ' 'saving/using classifier from previous iteration although min_f1_score is not met.') return 0, 0, 0 else: raise Exception('Requested output size is bigger than original image. Please decrese bigger_dim_output_size in user_params.') temp_i = np.vstack([f.flatten() for f in temp_i]).T X = np.vstack((X,temp_i)) if X.size else temp_i c = np.hstack((c, classes_ims[i].flatten())) if c.size else classes_ims[i].flatten() r = np.hstack((r, relevance_ims[i].flatten())) if r.size else relevance_ims[i].flatten() if c.shape[0]!=X.shape[0] or r.shape[0]!=X.shape[0]: raise Exception('label images size must be equal to original images size') X = np.vstack([x.flatten() for x in X]) y = np.array([c for c in c.flatten()]) r = np.array([r for r in r.flatten()]) X = X[r==255] y = y[r==255] skf = StratifiedKFold(n_splits=3) classification_reports = list() f1_scores = list() # In case the user asked for regression and not classification: if not p.is_regression: for train_ix, test_ix in skf.split(X, y): # for each of K folds # define training and test sets X_train, X_test = X[train_ix,:], X[test_ix,:] y_train, y_test = y[train_ix], y[test_ix] # Train classifier clf = RandomForestClassifier() #(n_jobs=2) not sure if works on cluster.. clf.fit(X_train, y_train) # Predict test set labels y_hat = clf.predict(X_test) classification_reports.append(classification_report(y_test, y_hat)) f1_scores.append(f1_score(y_test, y_hat, average=None)) print(*classification_reports, sep='/n', flush=True) # Train classifier clf = RandomForestClassifier() #(n_jobs=2) not sure if works on cluster.. else: clf = RandomForestRegressor() clf.fit(X, y) stop_loop_bool = True if (np.mean(f1_scores)>p.min_f1_score) else False return clf, stop_loop_bool
# Collect training features and labels
training_features, training_labels = f_extractor.extract_training(training_path)

t1 = time.time()
print('Done in %.3fs\n' % (t1 - t0))

# Train a random forest classifier
# ********************************
#
print('Training Random Forest')
t0 = time.time()

# Create and train a random forest with scikit-learn
clf = RandomForestClassifier()
clf.fit(training_features, training_labels)

t1 = time.time()
print('Done in %.3fs\n' % (t1 - t0))

# Test
# ****
#
print('Compute testing features')
t0 = time.time()

# Collect test features
test_features = f_extractor.extract_test(test_path)
clf=lambda: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), normalized=True ), ClfConf(id="knn", clf=lambda: KNeighborsClassifier(n_neighbors=3), normalized=False ), ClfConf(id="nm_g", clf=lambda: GaussianNB(), normalized=False ), ClfConf(id="rf", clf=lambda: RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0), normalized=False ), ClfConf(id="xgb", clf=lambda: xgb.XGBClassifier(), normalized=False ), ClfConf(id="gb", clf=lambda: GradientBoostingClassifier( loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None), normalized=False ), ]
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from helpers import load_data, nn_layers, nn_reg, nn_iter, cluster_acc, myGMM, clusters, dims, dims_big, run_clustering, pairwiseDistCorr, reconstructionError, ImportanceSelect
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

out = './results/random_forest/'
perm_x, perm_y, housing_x, housing_y = load_data()  # perm, housing
raise Exception('Remove this line to run code')

# 2
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5, n_jobs=7)
fs_perm = rfc.fit(perm_x, perm_y).feature_importances_
fs_housing = rfc.fit(housing_x, housing_y).feature_importances_

tmp = pd.Series(np.sort(fs_perm)[::-1])
tmp.to_csv(out + 'perm scree.csv')
tmp = pd.Series(np.sort(fs_housing)[::-1])
tmp.to_csv(out + 'housing scree.csv')

# 4
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims,
    'NN__alpha': nn_reg,
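# The snippet above is cut off mid-dict. The grid keys 'filter__n' and 'NN__alpha'
# suggest a two-step Pipeline named 'filter' (ImportanceSelect) and 'NN' (MLPClassifier)
# searched with GridSearchCV; the hedged sketch below shows one plausible continuation,
# assuming the grid dict is completed elsewhere.
pipe = Pipeline([('filter', filtr),
                 ('NN', MLPClassifier(random_state=5))])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(perm_x, perm_y)
pd.DataFrame(gs.cv_results_).to_csv(out + 'perm dim red.csv')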
def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
    # random forest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' + str(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_rf = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': rf_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 10 Features from RF Classifier:')
    print(str(features_top_n_rf[:10]))

    # AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' + str(ada_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_ada = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': ada_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:10]))

    # ExtraTrees
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=25, cv=10, verbose=1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' + str(et_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_et = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': et_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:10]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param_grid, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' + str(gb_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_gb = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': gb_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature']
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:10]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param_grid, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' + str(dt_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_dt = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': dt_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature']
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:10]))

    # merge the five models
    features_top_n = pd.concat([
        features_top_n_rf, features_top_n_ada, features_top_n_et, features_top_n_gb, features_top_n_dt
    ], ignore_index=True).drop_duplicates()
    features_importance = pd.concat([
        feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et, feature_imp_sorted_gb, feature_imp_sorted_dt
    ], ignore_index=True)
    return features_top_n, features_importance
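# Hypothetical usage sketch (variable names and the value 30 are assumptions, not from
# the original function): select the voted top features and keep only those columns.
feature_top_n, feature_importance = get_top_n_features(titanic_train_data_X,
                                                       titanic_train_data_Y,
                                                       top_n_features=30)
titanic_train_data_X = pd.DataFrame(titanic_train_data_X[feature_top_n])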
classifier_ADA = AdaBoostClassifier()
classifier_ADA.fit(X_train1, Y_train1)
pred_ADA_train = classifier_ADA.predict(X_train1)
np.mean(pred_ADA_train == Y_train1)
pred_ADA_test = classifier_ADA.predict(X_test1)
np.mean(pred_ADA_test == Y_test1)
# Train Accuracy ADA=91.18
# Test Accuracy ADA=90.27

classifier_RF = RandomForestClassifier()
classifier_RF.fit(X_train1, Y_train1)
pred_RF_train = classifier_RF.predict(X_train1)
np.mean(pred_RF_train == Y_train1)
pred_RF_test = classifier_RF.predict(X_test1)
np.mean(pred_RF_test == Y_test1)
# Train Accuracy RF=100
# Test Accuracy RF=86.60

positive = Reviews[Reviews["Sentiment"] == "positive"]
negative = Reviews[Reviews["Sentiment"] == "negative"]
print(positive.shape, negative.shape)
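# Optional sketch: the bare np.mean expressions above only display a value in a notebook.
# When run as a plain script, the equivalent accuracies can be reported explicitly with
# accuracy_score (same variables as above assumed).
from sklearn.metrics import accuracy_score

print('ADA train/test accuracy: %.2f / %.2f' % (
    accuracy_score(Y_train1, pred_ADA_train) * 100,
    accuracy_score(Y_test1, pred_ADA_test) * 100))
print('RF train/test accuracy: %.2f / %.2f' % (
    accuracy_score(Y_train1, pred_RF_train) * 100,
    accuracy_score(Y_test1, pred_RF_test) * 100))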
def randomForest(self, predictors=predictors, target=target):
    alg = RandomForestClassifier(random_state=1, n_estimators=20,
                                 min_samples_split=2, min_samples_leaf=1)
    cleanData = self.dataClean()
    score = cV.kFold().analyze(cleanData, predictors, target, alg)
    return score
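# Equivalent check using scikit-learn's own cross-validation, for readers without the
# project-specific cV.kFold helper. Assumptions: cleanData is a DataFrame, predictors is
# a list of column names, and target is the name of the label column.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def random_forest_cv_score(cleanData, predictors, target):
    alg = RandomForestClassifier(random_state=1, n_estimators=20,
                                 min_samples_split=2, min_samples_leaf=1)
    scores = cross_val_score(alg, cleanData[predictors], cleanData[target], cv=3)
    return scores.mean()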