        classification_binary(
            light_clf.SGDClassifier(random_state=RANDOM_SEED)),

        # Decision trees
        regression(tree.DecisionTreeRegressor(**TREE_PARAMS)),
        regression(tree.ExtraTreeRegressor(**TREE_PARAMS)),
        classification(tree.DecisionTreeClassifier(**TREE_PARAMS)),
        classification(tree.ExtraTreeClassifier(**TREE_PARAMS)),
        classification_binary(tree.DecisionTreeClassifier(**TREE_PARAMS)),
        classification_binary(tree.ExtraTreeClassifier(**TREE_PARAMS)),

        # Random forest
        regression(ensemble.ExtraTreesRegressor(**FOREST_PARAMS)),
        regression(ensemble.RandomForestRegressor(**FOREST_PARAMS)),
        classification(ensemble.ExtraTreesClassifier(**FOREST_PARAMS)),
        classification(ensemble.RandomForestClassifier(**FOREST_PARAMS)),
        classification_binary(ensemble.ExtraTreesClassifier(**FOREST_PARAMS)),
        classification_binary(
            ensemble.RandomForestClassifier(**FOREST_PARAMS)),
    ],

    # Following is the list of extra tests for languages/models which are
    # not fully supported yet.
    # <empty>
)
def test_e2e(estimator, executor_cls, model_trainer, is_fast, global_tmp_dir):
    sys.setrecursionlimit(RECURSION_LIMIT)

    X_test, y_pred_true, fitted_estimator = model_trainer(estimator)
    executor = executor_cls(fitted_estimator)
        color='r', label='Seven Clusters')
plt.xlabel('Unsupervised method')
plt.ylabel('Silhouette score')
plt.title('Silhouette Score on Various Clustering Methods and Cluster Sizes')
plt.xticks(index + bar_width, ('GMM', 'Kmeans', 'HAC'))
plt.legend()
plt.savefig(sys.argv[2] + '/silhouette_barplot.png')
###END PLOT SNIPPET###

###SUPERVISED PORTION
from sklearn import ensemble
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# newer versions provide the same functions in sklearn.model_selection.
from sklearn import cross_validation

## For each of random forest, naive bayes, logistic regression, run 10-fold CV
## and get the average score (mean accuracy)
randfor = ensemble.RandomForestClassifier(100)
randforscores = cross_validation.cross_val_score(randfor, PCA_data,
                                                 class_labels, cv=10)
avgrandforscore = numpy.mean(randforscores)

from sklearn import naive_bayes
nbayes = naive_bayes.GaussianNB()
nbayesscores = cross_validation.cross_val_score(nbayes, PCA_data,
                                                class_labels, cv=10)
avgnbayesscores = numpy.mean(nbayesscores)
    :param classifier: Model to train
    :param feature_vector_train: Training input data
    :param label: Training output labels
    :param feature_vector_valid: Testing input data
    :return: Accuracy score
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on the validation dataset
    predictions = classifier.predict(feature_vector_valid)

    # NOTE: relies on the module-level y_test holding the validation labels
    return metrics.accuracy_score(y_test, predictions)


accuracy = train_model(svm.SVC(kernel='linear'), xtrain_tfidf, y_train,
                       xvalid_tfidf)
print "SVM, WordLevel TF-IDF: Accuracy:", accuracy * 100
print("\n")

accuracy = train_model(svm.SVC(kernel='linear'), xtrain_count, y_train,
                       xvalid_count)
print "SVM, CountVector: Accuracy:", accuracy * 100
print("\n")

accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100),
                       xtrain_tfidf, y_train, xvalid_tfidf)
print "RF, WordLevel TF-IDF: Accuracy:", accuracy * 100
print("\n")

accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100),
                       xtrain_count, y_train, xvalid_count)
print "RF, CountVector: Accuracy:", accuracy * 100
print("\n")
# -*- coding: utf-8 -*-
from sklearn import ensemble

MODELS = {
    "randomforest": ensemble.RandomForestClassifier(n_estimators=200,
                                                    n_jobs=-1,
                                                    verbose=2),
    "extratrees": ensemble.ExtraTreesClassifier(n_estimators=200,
                                                n_jobs=-1,
                                                verbose=2)
}
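# A minimal sketch of how a registry like MODELS is typically consumed,
# selecting the estimator by name from the command line. The file name
# ("train.csv") and the "target" column are hypothetical, not part of the
# snippet above.
import argparse

import pandas as pd


def run(model_name):
    df = pd.read_csv("train.csv")
    X, y = df.drop(columns=["target"]), df["target"]
    clf = MODELS[model_name]
    clf.fit(X, y)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", choices=sorted(MODELS), default="randomforest")
    run(parser.parse_args().model)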
if __name__ == '__main__':
    now = datetime.now()

    unigrams = loadVocabulary()
    unigrams_flag = 'unigrams'

    print 'load train set'
    is_train_set = 1
    labels = loadLabels(is_train_set)
    features = loadFeatures(labels, unigrams, is_train_set)
    train_set = {'labels': labels, 'features': features}

    print 'load test set'
    is_train_set = 0
    labels = loadLabels(is_train_set)
    features = loadFeatures(labels, unigrams, is_train_set)
    test_set = {'labels': labels, 'features': features}

    model = ensemble.RandomForestClassifier(n_estimators=20, random_state=512)
    model_name = 'random forest'
    trainModel(unigrams_flag, train_set, test_set, model, model_name)

    model = ensemble.RandomForestClassifier(n_estimators=50, random_state=512)
    trainModel(unigrams_flag, train_set, test_set, model, model_name)

    topK = 100
    getTopFeatures(model, topK, unigrams)

    print 'running time is', datetime.now() - now
def getModel(dataset, model):
    if model == Models.RandomForest:
        if dataset == "IMDB":
            return ensemble.RandomForestClassifier(criterion='gini',
                                                   max_depth=600,
                                                   max_features=0.8,
                                                   max_leaf_nodes=100,
                                                   min_impurity_decrease=0.0001,
                                                   n_estimators=50)
        else:
            return ensemble.RandomForestClassifier(min_impurity_decrease=0.0001,
                                                   random_state=30,
                                                   criterion='gini',
                                                   ccp_alpha=0.0002,
                                                   max_depth=200,
                                                   max_features=0.4,
                                                   n_estimators=90)
    elif model == Models.DecisionTree:
        if dataset == "IMDB":
            return tree.DecisionTreeClassifier(max_depth=600,
                                               min_impurity_decrease=0.0001,
                                               max_leaf_nodes=100,
                                               max_features=0.8,
                                               splitter="random",
                                               ccp_alpha=0.00025,
                                               criterion='gini')
        else:
            return tree.DecisionTreeClassifier(max_depth=450,
                                               min_impurity_decrease=0.0001,
                                               max_leaf_nodes=600,
                                               random_state=30,
                                               max_features=0.4,
                                               criterion='gini',
                                               splitter="best",
                                               ccp_alpha=0.00055)
    elif model == Models.AdaBoost:
        if dataset == "IMDB":
            return ensemble.AdaBoostClassifier(n_estimators=300,
                                               learning_rate=0.7,
                                               random_state=0)
        else:
            return ensemble.AdaBoostClassifier(n_estimators=125,
                                               learning_rate=0.5,
                                               random_state=0)
    elif model == Models.KNN:
        if dataset == "IMDB":
            return neighbors.KNeighborsClassifier(n_neighbors=525,
                                                  weights='uniform',
                                                  p=2)
        else:
            return neighbors.KNeighborsClassifier(n_neighbors=600,
                                                  weights='uniform',
                                                  p=2)
    elif model == Models.LogisticRegression:
        if dataset == "IMDB":
            return linear_model.LogisticRegression(C=1.0,
                                                   dual=False,
                                                   max_iter=1000,
                                                   penalty='l1',
                                                   solver='liblinear',
                                                   tol=0.1)
        else:
            return linear_model.LogisticRegression(C=1.0,
                                                   dual=False,
                                                   max_iter=100,
                                                   penalty='l2',
                                                   solver='saga',
                                                   tol=0.01)
    elif model == Models.SVM:
        if dataset == "IMDB":
            return svm.LinearSVC(C=0.1,
                                 dual=False,
                                 loss='squared_hinge',
                                 max_iter=1000,
                                 penalty='l2',
                                 tol=0.1)
        else:
            return svm.LinearSVC(C=1.0,
                                 dual=True,
                                 fit_intercept=True,
                                 loss='squared_hinge',
                                 max_iter=5000,
                                 penalty='l2',
                                 tol=0.01)
import pandas as pd
from sklearn import ensemble

if __name__ == "__main__":
    loc_train = "fullTrainM4.csv"
    #loc_test = "fullTestM4.csv"
    loc_test = "fullTrainM4.csv"
    #loc_submission = "kaggle.forest.submission.csv"
    loc_submission = "trainScore.csv"

    df_train = pd.read_csv(loc_train)
    df_test = pd.read_csv(loc_test)

    feature_cols = [col for col in df_train.columns
                    if col not in ['repeater', 'id']]

    X_train = df_train[feature_cols]
    X_test = df_test[feature_cols]
    y = df_train['repeater']
    test_ids = df_test['id']

    print "running RF ..."
    clf = ensemble.RandomForestClassifier(n_estimators=500, n_jobs=-1)
    clf.fit(X_train, y)

    print "scoring ..."
    with open(loc_submission, "wb") as outfile:
        outfile.write("id,repeatProbability\n")
        # predict_proba returns one probability per class;
        # column 1 is the probability of the positive class
        for e, val in enumerate(list(clf.predict_proba(X_test))):
            outfile.write("%s,%s\n" % (test_ids[e], val[1]))
from sklearn import ensemble
from sklearn.utils import shuffle
import numpy as np
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# newer versions provide train_test_split in sklearn.model_selection.
from sklearn import cross_validation

# load data from file
data = np.loadtxt('acceldata.txt', delimiter=',')
DT = data.transpose()

# split data into feature and label arrays
X = np.array(DT[0:-1]).transpose()
Y = np.array(DT[-1]).transpose()

# create training and test sets, 70% in training and 30% in testing set
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, Y, test_size=0.30, random_state=0)

# construct random forest with 10 trees, creating bootstrap samples
rf = ensemble.RandomForestClassifier(n_estimators=10,
                                     random_state=0,
                                     bootstrap=True)

### fit and score the model
rf.fit(X_train, y_train)
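# A minimal sketch of the same split-and-fit flow on scikit-learn >= 0.20,
# where train_test_split moved to sklearn.model_selection; the data file
# name is carried over from the snippet above.
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import train_test_split

data = np.loadtxt('acceldata.txt', delimiter=',')
X, Y = data[:, :-1], data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=0)
rf = ensemble.RandomForestClassifier(n_estimators=10, random_state=0)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))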
                                                   data_y,
                                                   test_size=0.3,
                                                   random_state=4)

print('----------- DTREE WITH GINI IMPURITY CRITERION ------------------')
dtree_gini_mod = tree.DecisionTreeClassifier(criterion='gini')
dtree_gini_mod.fit(x_train, y_train)
preds_gini = dtree_gini_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, preds_gini)

n_est = [100]
depth = [None]
for n in n_est:
    for dp in depth:
        # Create model and fit.
        mod = ensemble.RandomForestClassifier(n_estimators=n, max_depth=dp)
        mod.fit(x_train, y_train)

        # Make predictions - class labels.
        preds = mod.predict(x_test)
        print('---------- EVALUATING MODEL: n_estimators = ' + str(n) +
              ', depth = ' + str(dp) + ' -------------------')
        # Look at results.
        print_multiclass_classif_error_report(y_test, preds)
#END-------------- Fatality or Injury ----------------------

#START------------- SPEED LIMIT -----------------------------
features2 = list(data)
features2.remove('SPEED_LIMIT')
data_x = data[features2]
bag = {
    'bid': bid_cluster_classifier_bagging,
    'ask': ask_cluster_classifier_bagging
}
with open('../run_models/clusterAndClassify_Bagging.model', 'wb') as output:
    pickle.dump(bag, output, -1)

bid_cluster_classifier_rfc = multiclass.OneVsOneClassifier(
    estimator=ensemble.RandomForestClassifier(n_estimators=30,
                                              criterion='gini',
                                              max_depth=None,
                                              min_samples_split=2,
                                              min_samples_leaf=1,
                                              min_weight_fraction_leaf=0.0,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_split=1e-07,
                                              bootstrap=True,
                                              oob_score=False,
                                              n_jobs=1,
                                              random_state=None,
                                              verbose=0,
                                              warm_start=False,
                                              class_weight=None),
    n_jobs=-1)
bid_cluster_classifier_rfc.fit(trainFeatures, all_bid_labels)
print "Bid accuracy with Random Forest: ", bid_cluster_classifier_rfc.score(
    trainFeatures, all_bid_labels)

ask_cluster_classifier_rfc = multiclass.OneVsOneClassifier(
    estimator=ensemble.RandomForestClassifier(n_estimators=30,
                                              criterion='gini',
X_test = np.array(df.drop(['application_key'], 1))
X_test = preprocessing.scale(X_test)

# if PCA is applied, project the test set with the fitted PCA first
X_test = np.array(pd.DataFrame(data=pca.transform(X_test)))
X_test = preprocessing.scale(X_test)

# different classifiers tested for the price prediction dataset
clf1 = svm.SVC()
clf2 = ske.RandomForestClassifier(n_estimators=100)
clf3 = neighbors.KNeighborsClassifier(n_neighbors=5)
clf4 = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(150, 100, 50, 10, 3))
clf5 = DecisionTreeClassifier(criterion="entropy", random_state=10,
                              max_depth=100, min_samples_leaf=5)
clf6 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=100),
                          n_estimators=600, learning_rate=1)
clfs = [clf2]

# train model
i = 1
for clf in clfs:
    epochs = 10
    for epoch in range(epochs):
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.20, random_state=42)
pca_df.plot(x=0, y=1, kind='scatter')

variance_df = pandas.DataFrame({
    'variance': pca.explained_variance_,
    'principal component': pca_df.columns.tolist()
})
# adding one to principal components (since there is no 0th component)
variance_df['principal component'] = variance_df['principal component'] + 1
variance_df.plot(x='principal component', y='variance')
# looks like variance stops getting explained after the first two components
pca_df_small = pca_df.ix[:, 0:1]

# getting a cross-val score of transformed data
rf = ensemble.RandomForestClassifier(n_estimators=500)
roc_scores_rf_pca = cross_val_score(rf,
                                    pca_df_small,
                                    response_series,
                                    cv=10,
                                    scoring='roc_auc')
print roc_scores_rf_pca.mean()
# 74% accuracy

roc_scores_rf = cross_val_score(rf,
                                explanatory_df,
                                response_series,
                                cv=10,
                                scoring='roc_auc')
print roc_scores_rf.mean()
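# DataFrame.ix (used above) was removed in pandas 1.0. Label-based .ix
# slicing on integer labels was end-inclusive, so ix[:, 0:1] selected two
# columns; a minimal modern equivalent of that selection:
pca_df_small = pca_df.iloc[:, 0:2]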
# In[ ]:

from sklearn import linear_model
from sklearn import tree
from sklearn import neighbors
from sklearn import ensemble
from sklearn import svm
from sklearn import gaussian_process
from sklearn import naive_bayes
from sklearn import neural_network
from sklearn.model_selection import cross_val_score

clfs = {}
clfs['lr'] = {'clf': linear_model.LogisticRegression(),
              'name': 'LogisticRegression'}
clfs['rf'] = {'clf': ensemble.RandomForestClassifier(n_estimators=750, n_jobs=-1),
              'name': 'RandomForest'}
clfs['tr'] = {'clf': tree.DecisionTreeClassifier(), 'name': 'DecisionTree'}
clfs['knn'] = {'clf': neighbors.KNeighborsClassifier(n_neighbors=4),
               'name': 'kNearestNeighbors'}
clfs['svc'] = {'clf': svm.SVC(kernel="linear"),
               'name': 'SupportVectorClassifier'}
clfs['nusvc'] = {'clf': svm.NuSVC(), 'name': 'NuSVC'}
clfs['linearsvc'] = {'clf': svm.LinearSVC(), 'name': 'LinearSVC'}
clfs['SGD'] = {'clf': linear_model.SGDClassifier(), 'name': 'SGDClassifier'}
clfs['GPC'] = {'clf': gaussian_process.GaussianProcessClassifier(),
               'name': 'GaussianProcess'}
clfs['nb'] = {'clf': naive_bayes.GaussianNB(), 'name': 'GaussianNaiveBayes'}
clfs['bag'] = {'clf': ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(),
                                                 max_samples=0.5,
                                                 max_features=0.5),
               'name': 'BaggingClassifier'}
clfs['gbc'] = {'clf': ensemble.GradientBoostingClassifier(),
               'name': 'GradientBoostingClassifier'}
clfs['mlp'] = {'clf': neural_network.MLPClassifier(hidden_layer_sizes=(10, 8, 3),
                                                   alpha=1e-5,
                                                   solver='lbfgs'),
               'name': 'MultilayerPerceptron'}


# In[ ]:
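# The cell above imports cross_val_score but does not use it yet; a minimal
# sketch of how such a registry is typically evaluated, assuming a feature
# matrix X and label vector y are defined in an earlier cell:
for key, entry in clfs.items():
    scores = cross_val_score(entry['clf'], X, y, cv=5)
    print(entry['name'], scores.mean(), scores.std())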
##grid_dt_search.fit(x_train, y_train)
##print(grid_dt_search.best_params_)
##tree_model.set_params(criterion="gini", max_depth=4)
##tree_model.fit(x_train, y_train)
##feature_importance = np.array(list(zip(data.columns.values, tree_model.feature_importances_)),
##                              dtype=[('feature', 'S10'), ('importance', 'float')])
##most_important = np.sort(feature_importance, order="importance")[::-1]
##for i in most_important[0:5]:
##    print(i)

grid_para_forest = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 31),
    'n_estimators': range(10, 110, 10)
}

from sklearn import ensemble
forest_model = ensemble.RandomForestClassifier()
grid_rf_search = ms.GridSearchCV(forest_model,
                                 grid_para_forest,
                                 cv=3,
                                 n_jobs=1)
grid_rf_search.fit(x_train, y_train)
print(grid_rf_search.best_params_)

forest_model.set_params(criterion='gini', max_depth=3, n_estimators=80)
forest_model.fit(x_train, y_train)
print(forest_model.score(x_test, y_test))

feature_importance = np.array(list(zip(data.columns.values,
                                       forest_model.feature_importances_)),
                              dtype=[('feature', 'S10'), ('importance', 'float')])
most_important = np.sort(feature_importance, order="importance")[::-1]
for i in most_important[0:5]:
    print(i)
time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
log_files = path.join(LOGS_PATH, 'log_benchmark_time.txt')
logging.basicConfig(filename=log_files + str(time_stamp),
                    level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This is a log message.')

models = [
    # alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
    # eta0=0.0, fit_intercept=True, max_iter=1000, tol=None, l1_ratio=0.15,
    # learning_rate='optimal', loss='hinge'
    (LogisticRegression(C=2), {
        'C': [1, 10, 100]
    }, 'Logistic_reg_scale'),
    (ensemble.RandomForestClassifier(n_estimators=100), {
        "max_depth": [3, None],
        "max_features": ['auto', 3, 10],
        "min_samples_split": [2, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }, 'RandomForest'),
    (ensemble.GradientBoostingClassifier(), {}, 'Gradient boosting')
    #(LateFusion(model2=svm.SVC(kernel='linear', C=1, probability=True), standardscaler=preprocessing.Normalizer()), {}, 'late_fusion(GB,SVM) fusion5:5')
    # (LateFusion(model2=LogisticRegression(C=2, class_weight={0: 0.1, 1: 0.9})), {}, 'late_fusion(GB,LR(0.1,0.9)'),
    # (LateFusion(model2=ensemble.GradientBoostingClassifier()), {}, 'late_fusion(GB, GB')
]


def run_nested_cv_fold(data_type='benchmark', n_split=5, cv_fold=5):
    """
def test_classification_tasks():
    # ------------------------
    # Load latest feature sets
    # ------------------------
    latest_feature_set_time = time.strftime('0')
    for index, file in enumerate(os.listdir(OUT_PATH)):
        if file.startswith("comb_dataset_") and file.endswith(".csv"):
            file_name = os.path.splitext(file)[0].split('_')
            timestamp = file_name[2] + '_' + file_name[3]
            if timestamp > latest_feature_set_time:
                latest_feature_set_time = timestamp
    print(latest_feature_set_time)

    features_path = OUT_PATH + 'comb_dataset_' + latest_feature_set_time + ".csv"
    df = pd.read_csv(features_path, sep=',')
    # df = pd.read_csv(OUT_PATH + 'dataset_20190513_115505.csv', sep=',')

    # feature_set = ['structural', 'temporal', 'social']
    # label_set = ['true', 'false', 'unverified', 'non-rumor']
    # print(df.dtypes)
    df = df.fillna(0)

    # ----------------
    # DROP COLUMNS
    # ----------------
    # if 'temporal___longest_length' in df:
    #     df = df.drop(columns=['temporal___longest_length'])
    # if 'temp_longest_length' in df:
    #     df = df.drop(columns=['temp_longest_length'])

    # ---------------------
    # DROP ROWS (label)
    # ---------------------
    # df = df[df.label != 'unverified']
    # df = df[df.label != 'non-rumor']

    X = df.drop(columns=['tweet_id', 'label'])
    y = df['label']
    # print(df.shape, df['label'].value_counts().to_dict())
    # print(df.info())

    classifiers = {'RF': ensemble.RandomForestClassifier(),
                   'XGB': XGBClassifier(),
                   'ADAB': ensemble.AdaBoostClassifier(),
                   'GRADB': ensemble.GradientBoostingClassifier()}

    for classifier_name in list(classifiers.keys()):
        accuracy_results = []
        f1_macro_results = []
        f1_micro_results = []
        max_accuracy = 0
        for i in range(100):
            # repeated random 80/20 hold-out splits (not k-fold CV)
            X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                test_size=0.20)
            clf = classifiers[classifier_name]
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)  # predictions
            accuracy = metrics.accuracy_score(y_test, y_pred)
            f1_macro = f1_score(y_test, y_pred, average='macro')
            f1_micro = f1_score(y_test, y_pred, average='micro')
            # print("#{}: Accuracy={} F1-macro={} F1-micro={}".format(i, accuracy, round(f1_macro, 4), round(f1_micro, 4)))
            accuracy_results.append(accuracy)
            f1_macro_results.append(f1_macro)
            f1_micro_results.append(f1_micro)
            if accuracy > max_accuracy:
                max_accuracy = accuracy
                feature_importances = pd.DataFrame(
                    clf.feature_importances_,
                    index=X_train.columns,
                    columns=['importance']).sort_values('importance',
                                                        ascending=False)

        print('\n\n' + classifier_name)
        print('MEAN \t STD')
        print('ACC \t', round(st.mean(accuracy_results), 4), '+-',
              round(st.pstdev(accuracy_results), 4))
        print('F1-macro\t', round(st.mean(f1_macro_results), 4), '+-',
              round(st.pstdev(f1_macro_results), 4))
        print('F1-micro\t', round(st.mean(f1_micro_results), 4), '+-',
              round(st.pstdev(f1_micro_results), 4))
        print(max_accuracy)
        print(feature_importances)
        print("===========================" * 3)
X_train, X_test, Y_train, Y_test = train_test_split(train, a, test_size=0.3)
print("START TRAINING")

# KNN training
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
y_predict = knn.predict(X_test)
print(y_predict)
print("KNN accuracy:", accuracy_score(Y_test, y_predict))
print("KNN precision:", precision_score(Y_test, y_predict, average=None))

# build the random forest model
forest = ensemble.RandomForestClassifier(n_estimators=40)
forest_fit = forest.fit(X_train, Y_train)

# predict
test_y_predicted = forest.predict(X_test)
print(test_y_predicted)
print("RF(40) accuracy:", accuracy_score(Y_test, test_y_predicted))
print("RF(40) precision:", precision_score(Y_test, test_y_predicted, average=None))
for i in range(len(Y_test)):
    if Y_test[i] != test_y_predicted[i]:
        print(i, Y_test[i], test_y_predicted[i])
print("----------------------------------------------------------------------------------")
print("TESTING")
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

'''IMPUTE MISSING VALUES FOR TRAIN AND TEST SEPARATELY'''
x_train = missingValueImpute(x_train)
x_test = missingValueImpute(x_test)

'''CLASSIFICATION OF THE PRICE BINS'''
# Most of the time people have an idea about the price range in which their
# rental will fall. For users who have no idea about the price range, we can
# first classify which price bucket their rental can fall in and then do the
# bucket-specific regression.

# RandomForest classifier to predict the price bins
randomForestClassifier = ensemble.RandomForestClassifier(
    n_estimators=200,
    max_features='auto',
    max_depth=15,
    min_samples_leaf=7,
    random_state=25,
    class_weight='balanced')
randomForestClassifier.fit(scale(x_train), y_train['price_bins'])
print(randomForestClassifier.score(scale(x_train), y_train['price_bins']))
print(randomForestClassifier.score(scale(x_test), y_test['price_bins']))

y_pred = randomForestClassifier.predict(scale(x_test))
report = metrics.classification_report(y_test['price_bins'], y_pred)
print(report)

# Logistic regression
logistic = linear_model.LogisticRegression(random_state=23,
                                           class_weight='balanced')
logistic.fit(scale(x_train), y_train['price_bins'])
print(logistic.score(scale(x_train), y_train['price_bins']))
           kaggle_format,
           delimiter=",",
           fmt='%d,%d',
           header='Id,Category',
           comments='')

############# BUILT-IN FUNCTION #############
scoreBuffer = []
print 50 * '='
print "CROSS VALIDATION USING SCIKIT-LEARN"
print 50 * '='
for depth in depths:
    print "DEPTH:", depth
    clf = ensemble.RandomForestClassifier(n_estimators=5,
                                          criterion='entropy',
                                          max_depth=depth)
    scores = computeCV_Score(clf, crossValidation_Data,
                             crossValidation_Labels, k)
    scoreBuffer.append((scores).mean())
    print "Depth:", depth, "Accuracy: %0.2f%% (+/- %0.2f)" % (
        (scores).mean(), np.array(scores).std() / 2)
    print 50 * '-'

maxScore = np.max(scoreBuffer)
maxScore_Index = scoreBuffer.index(maxScore)
print "Best Depth Value:", depths[maxScore_Index], \
    "Accuracy for that Depth:", np.around(maxScore, 3)
print 50 * '-'
print 20 * "*", "The End", 20 * "*"
np_features = df.as_matrix()
np_keys = keys.as_matrix()
features_train, features_test, keys_train, keys_test = train_test_split(
    np_features, np_keys, test_size=0.33, random_state=42)
print "type(features_train)=", type(features_train), "features_train.shape=", features_train.shape
print "type(features_test)=", type(features_test), "features_test.shape=", features_test.shape
print "type(keys_train)=", type(keys_train), "keys_train.shape=", keys_train.shape
print "type(keys_test)=", type(keys_test), "keys_test.shape=", keys_test.shape

del df
del np_features
del np_keys

start_time = time.time()
clf = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0)
print("--- time to construct ensemble.RandomForestClassifier %s seconds ---" % (time.time() - start_time))
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#n_jobs : integer, optional (default=1)
#    The number of jobs to run in parallel for both fit and predict.
#    If -1, then the number of jobs is set to the number of cores.
#    my I7-4790 has 4 cores.

print "fitting clf.fit"
start_time = time.time()
clf.fit(features_train, keys_train)
print("--- time to execute clf.fit %s seconds ---" % (time.time() - start_time))

print "predicting"
start_time = time.time()
keys_test_predicted = clf.predict(features_test)
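# DataFrame.as_matrix() (used at the top of this snippet) was removed in
# pandas 1.0; a minimal modern equivalent of those two conversions:
np_features = df.to_numpy()
np_keys = keys.to_numpy()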
# optional PCA step, commented out with a docstring block (the opening
# triple quote was missing in the fragment and is restored here)
'''
pca = doPCA(x_train)
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)
'''

clf = ensemble.ExtraTreesClassifier(n_estimators=800, min_samples_leaf=5)
clf = clf.fit(x_train, y_train)
model = feature_selection.SelectFromModel(clf, prefit=True)
x_train = model.transform(x_train)
x_test = model.transform(x_test)
print(np.shape(x_train))

# Train random forest classifier with gini-impurity
print("Begin training random forest with gini...")
clf = ensemble.RandomForestClassifier(n_estimators=800, min_samples_leaf=5)
clf.fit(x_train, y_train)
trng_acc = clf.score(x_train, y_train)
val_acc = clf.score(x_test, y_test)
rf1_pred = clf.predict(x_test)
print("Training accuracy: %f" % trng_acc)
print("Validation accuracy: %f" % val_acc)
joblib.dump(clf, "rf_gini.pkl")

# Train random forest classifier with entropy-impurity
print("Begin training random forest with entropy...")
clf = ensemble.RandomForestClassifier(n_estimators=800,
                                      min_samples_leaf=5,
                                      criterion='entropy')
clf.fit(x_train, y_train)
trng_acc = clf.score(x_train, y_train)
val_acc = clf.score(x_test, y_test)
rf2_pred = clf.predict(x_test)
markersClassifier = None
if learnAlgo == 'LogisticRegression':
    #markersClassifier = linear_model.LogisticRegression(C=nbMarkers, penalty='l1', class_weight=classWeights)
    markersClassifier = linear_model.LogisticRegression(max_iter=10000)
    markerFeaturesSet = markerFeaturesSet.tocsr()
elif learnAlgo == 'SVM':
    markersClassifier = svm.SVC(probability=True,
                                C=nbMarkers,
                                class_weight=classWeights)
    markerFeaturesSet = markerFeaturesSet.tocsr()
elif learnAlgo == 'DecisionTreeClassifier':
    # NOTE: the min_density parameter was removed from scikit-learn long
    # ago; recent versions will reject it.
    markersClassifier = tree.DecisionTreeClassifier(min_samples_split=10,
                                                    min_density=1)
    markerFeaturesSet = markerFeaturesSet.toarray()
elif learnAlgo == 'RandomForestClassifier':
    markersClassifier = ensemble.RandomForestClassifier(min_samples_split=10,
                                                        min_density=1)
    markerFeaturesSet = markerFeaturesSet.toarray()
elif learnAlgo == 'ExtraTreesClassifier':
    markersClassifier = ensemble.ExtraTreesClassifier(min_samples_split=10,
                                                      min_density=1)
    markerFeaturesSet = markerFeaturesSet.toarray()

print(' - fit dataset')
markersClassifier.fit(markerFeaturesSet, markerTargetsSet)

print(' - save model to file')
joblib.dump(markersClassifier, corpusModel + '/model_markers.txt')


# Compute permutation cost as edit distance adapted to sequences
def getSequenceDistance(s1, s2):
    s1Len = len(s1)
    s2Len = len(s2)
# Positive and negative state of reviews
# +ve is 1, -ve is 2
for f in range(len(state)):
    if state[f] == "positive":
        state[f] = 1
    elif state[f] == "negative":
        state[f] = 2

# Splitting data into training and testing datasets
train_x, test_x, train_y, test_y = train_test_split(termdoc,
                                                    state,
                                                    test_size=0.3)

# Using the random forest classification algorithm
rfc = ensemble.RandomForestClassifier()
rfc_scores = cross_val_score(rfc, termdoc, state, cv=10)
print 'Random forest mean accuracy : %.2f' % (rfc_scores.mean())
print 'Random forest std : %.2f' % (rfc_scores.std())

rfc.fit(train_x, train_y)
predict_train_y = rfc.predict(train_x)
predict_test_y = rfc.predict(test_x)
print confusion_matrix(test_y, predict_test_y)
print 'Precision score is : %.2f' % (precision_score(test_y, predict_test_y))
print 'Recall Score is : %.2f' % (recall_score(test_y, predict_test_y))
print 'F score is : %.2f' % (f1_score(test_y, predict_test_y))

# Predicting the review states
predicted_all = rfc.predict(termdoc)
def create_model(self, trial):
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
    model = ensemble.RandomForestClassifier(
        max_depth=rf_max_depth, n_estimators=10
    )
    return model
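# A minimal sketch of how a factory like create_model plugs into an Optuna
# study; the standalone objective, the iris dataset, and the trial count
# are illustrative, not part of the original class.
import optuna
from sklearn import datasets, ensemble
from sklearn.model_selection import cross_val_score

X, y = datasets.load_iris(return_X_y=True)


def objective(trial):
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
    model = ensemble.RandomForestClassifier(max_depth=rf_max_depth,
                                            n_estimators=10)
    # score each sampled max_depth by 3-fold cross-validated accuracy
    return cross_val_score(model, X, y, cv=3).mean()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params)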
with open('../saves/exp_test_trim_1.pickle', 'rb') as f:
    test_df = pickle.load(f)

train_data = train_df
test_data = test_df

scaler = MinMaxScaler()
scaler.partial_fit(train_data[cols])
scaler.partial_fit(test_data[cols])
train_input = scaler.transform(train_data[cols])
test_input = scaler.transform(test_data[cols])

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
}
# cv_generator = GroupKFold(n_splits=3)

for i in range(50):
    CV_rfc = GridSearchCV(ensemble.RandomForestClassifier(),
                          param_grid=param_grid,
                          cv=10)
    CV_rfc.fit(train_input, train_data['true_class'])
    save_string = "../saves/cv_forest_model_aws" + str(i) + ".pickle"
    with open(save_string, 'wb') as f:
        pickle.dump(CV_rfc, f)
    print(CV_rfc.best_params_)

print('finished')
import time
import pickle

import numpy as np
from sklearn import ensemble
from sklearn.metrics import accuracy_score

filename = "ApneaData.pkl"
testPercent = 20

features = []
classes = []

t = time.time()
f = open(filename, 'rb')
data = pickle.load(f)
f.close()

np.random.shuffle(data)
for row in data:
    features.append(row[:-1])
    classes.append(row[-1])

inputLength = len(features)
# hold out testPercent of the rows for testing
testLength = int(inputLength * testPercent / 100)
train_features, train_classes = features[:-testLength], classes[:-testLength]
test_features, test_classes = features[-testLength:], classes[-testLength:]
print("preprocessing time:", (time.time() - t))

t = time.time()
clf = ensemble.RandomForestClassifier(n_estimators=30)
clf.fit(train_features, train_classes)
print("fitting time:", (time.time() - t))

t = time.time()
pred_classes = []
for e in test_features:
    pred_classes.append(clf.predict([e])[0])
score = accuracy_score(test_classes, pred_classes) * 100
print("predicting time:", (time.time() - t))
print("Accuracy:", score)
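# The per-row predict loop above is slow because each call pays sklearn's
# dispatch overhead; a minimal equivalent that scores the whole test set in
# one vectorized call:
pred_classes = clf.predict(test_features)
score = accuracy_score(test_classes, pred_classes) * 100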
import pandas as pd
from sklearn import ensemble, feature_extraction, preprocessing

# import data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample = pd.read_csv('../input/sampleSubmission.csv')

# drop ids and get labels
labels = train.target.values
train = train.drop('id', axis=1)
train = train.drop('target', axis=1)
test = test.drop('id', axis=1)

# encode labels
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(labels)

# train a random forest classifier
clf = ensemble.RandomForestClassifier(n_jobs=-1,
                                      n_estimators=100,
                                      max_features=50,
                                      verbose=2)
clf.fit(train, labels)

# predict on test set
preds = clf.predict_proba(test)

# create submission file
preds = pd.DataFrame(preds, index=sample.id.values, columns=sample.columns[1:])
preds.to_csv('benchmark.csv', index_label='id')
from sklearn import tree
from sklearn import ensemble

models = {
    "decision_tree_gini": tree.DecisionTreeClassifier(criterion="gini"),
    "decision_tree_entropy": tree.DecisionTreeClassifier(criterion="entropy"),
    "rf": ensemble.RandomForestClassifier(),
}
        base_endrow = base_endrow + ppd
        print(base_endrow)
        test_startrow = base_endrow + 1
        print(test_startrow)
        test_endrow = test_startrow + ppd
        print(test_endrow)
        day = day + 1
        print(day)
    return calendar


# scramble one
one = one.sample(frac=1)
#print(system_data_stream(one, p, t))

rf_model = skens.RandomForestClassifier(n_estimators=10,
                                        oob_score=True,
                                        criterion='entropy')

calendar = []
base_startrow = 1
base_endrow = t
test_startrow = t + 1
test_endrow = test_startrow + p
day = 1
print(base_startrow)
print(base_endrow)
print(test_startrow)
print(test_endrow)
print(day)
def train_model_random_forest(train, labels):
    # train a random forest classifier
    model = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000)
    model.fit(train, labels)
    joblib.dump(model, 'rf_model2.model')
    return model
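# A minimal sketch of reloading the model persisted by
# train_model_random_forest for later inference; assumes only the file
# written above.
import joblib


def load_model(path='rf_model2.model'):
    # joblib.load restores the fitted RandomForestClassifier from disk
    return joblib.load(path)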