def initialize_models(X_train, y_train, X_test, y_test, accuracy, fscore):
    """Run three classifiers through ``train_predict`` on growing sample sizes.

    A ``dtc``, an ``rfc`` and an ``abc`` estimator, each built with
    ``random_state=13``, are handed to ``train_predict`` together with 1%,
    10% and 100% of ``len(y_train)`` (integer division, so the counts are
    ints). The nested result dict -- keyed by estimator class name, then by
    the index 0/1/2 of the sample size -- is passed to ``vs.evaluate`` along
    with ``accuracy`` and ``fscore``.

    Returns:
        The ``abc`` estimator instance that was evaluated.
    """
    seed = 13
    learners = [dtc(random_state=seed), rfc(random_state=seed), abc(random_state=seed)]

    total = len(y_train)
    sample_sizes = [total // 100, total // 10, total]

    # Outer loop over learners, inner loop over sizes: the same call order as
    # evaluating each learner on 1%, 10%, 100% before moving to the next one.
    results = {
        learner.__class__.__name__: {
            slot: train_predict(learner, size, X_train, y_train, X_test, y_test)
            for slot, size in enumerate(sample_sizes)
        }
        for learner in learners
    }

    # Metrics visualisation for the three learners.
    vs.evaluate(results, accuracy, fscore)
    return learners[-1]
def __init__(self, pathToData):
    """Set up the boosting experiment configuration.

    Args:
        pathToData: path of the data file; only stored on the instance
            (``dataFilePath``), nothing is opened here.
    """
    self.dataFilePath = pathToData
    self.algoname = 'Boosting'
    self.datasetName = 'Letter'
    self.baseEstimater = dtc(class_weight='balanced')
    # The base estimator is passed positionally on purpose: scikit-learn 1.2
    # renamed the keyword ``base_estimator`` to ``estimator`` and 1.4 removed
    # the old name, while the first positional slot is the base estimator in
    # every release. (Assumes ``abc`` is sklearn's AdaBoostClassifier.)
    # An override capping the base tree at max_depth=15 via set_params used
    # to sit here as commented-out code and remains disabled.
    self.classifier = abc(self.baseEstimater, algorithm='SAMME')
    self.cv = 5
def train(df):
    """Fit a panel of classifiers on ``df`` and return their test-set scores.

    Features and labels come from ``preprocess_data.addFeatures``; the data
    is split with ``preprocess_data.splitDataset`` and scaled with
    ``preprocess_data.featureScaling``. Each model below is fitted on the
    training part and evaluated with ``.score`` on the held-out part:

    1. SVC, linear kernel
    2. SVC, polynomial kernel
    3. SVC, radial-basis-function kernel
    4. SVC, sigmoid kernel
    5. ``rfc(max_depth=4, random_state=0)``
    6. ``abc(n_estimators=500)``
    7. hard-voting ``VotingClassifier`` over the RBF SVC and the ``rfc`` model

    The SVC models use default hyperparameters apart from the kernel.

    Returns:
        A 7-tuple of scores in the order listed above.
    """
    X, y = preprocess_data.addFeatures(df)
    X_train, X_test, y_train, y_test = preprocess_data.splitDataset(X, y)
    X_train, X_test = preprocess_data.featureScaling(X_train, X_test)

    def fit_and_score(estimator):
        # Train on the scaled training split, score on the held-out split.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    svc_by_kernel = {}
    svc_scores = []
    for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
        svc_by_kernel[kernel] = svm.SVC(kernel=kernel)
        svc_scores.append(fit_and_score(svc_by_kernel[kernel]))

    forest = rfc(max_depth=4, random_state=0)
    forest_score = fit_and_score(forest)

    boosted_score = fit_and_score(abc(n_estimators=500))

    voter = VotingClassifier(
        estimators=[('svc', svc_by_kernel['rbf']), ('rf', forest)],
        voting='hard',
    )
    voter_score = fit_and_score(voter)

    return tuple(svc_scores) + (forest_score, boosted_score, voter_score)
def AdaBoost():
    """Fit a 300-estimator ``abc`` model, dump its predictions, print accuracy.

    Data for a fixed list of eight feature names is obtained from
    ``load_data``. After fitting, the feature importances are printed, one
    line per test entry is written to ``adaboost_predict.txt`` (the first two
    fields of the matching ``test_index`` list item followed by the predicted
    label), and the score on the test split is printed.
    """
    feature_names = [
        'common_neigh', 'check_common_time_spot', 'common_crt_time_spot',
        'dist_common_spot', 'shortest_path', 'katzB', 'adamic_adar',
        'mean_distance',
    ]
    trainX, trainY, testX, testY = load_data(feature_names)
    print('load data completely')

    booster = abc(n_estimators=300)
    booster.fit(trainX, trainY)
    print('AdaBooting completely')
    print(booster.feature_importances_)

    testDict, testList = test_index(testX)
    predictY = booster.predict(testX)
    with open('adaboost_predict.txt', 'w') as out:
        # The row count is taken from testDict, while the fields come from
        # testList and predictY at the same position.
        for row in range(len(testDict)):
            entry = testList[row]
            print(entry[0], entry[1], predictY[row], file=out)

    scores = booster.score(testX, testY)
    print('predict testing data completely')
    print('Accuracy in sample =', scores)
def train_abc(exp_depth, exp_lr):
    """Train a boosted decision-tree model and return its validation error.

    Arrays are read from ``train_vars.npy``, ``val_vars.npy``,
    ``train_labels.npy`` and ``val_labels.npy`` (relative paths); both label
    arrays are flattened with ``ravel()``.

    Args:
        exp_depth: ``max_depth`` of the ``DecisionTreeClassifier`` used as
            base estimator.
        exp_lr: ``learning_rate`` passed to ``abc``.

    Returns:
        float: ``1 - accuracy`` on the validation data (also printed).
    """
    train = np.load('train_vars.npy')
    val = np.load('val_vars.npy')
    train_labels = np.load('train_labels.npy').ravel()
    val_labels = np.load('val_labels.npy').ravel()

    # Base estimator is passed positionally: the ``base_estimator`` keyword
    # was renamed to ``estimator`` in scikit-learn 1.2 and removed in 1.4,
    # whereas the first positional slot is valid in all releases.
    # (Assumes ``abc`` is sklearn's AdaBoostClassifier.)
    abc_model = abc(DecisionTreeClassifier(max_depth=exp_depth),
                    learning_rate=exp_lr)
    abc_model.fit(train, train_labels)

    predictions = abc_model.predict(val)
    correct = np.sum(np.equal(predictions, val_labels))
    accuracy = correct / float(len(val_labels))
    result = float(1 - accuracy)

    # Single-argument parenthesised print emits the same text on Python 2
    # and Python 3; the bare print statement only parses on Python 2.
    print('Result = %f' % result)
    return result
def model_tunings_abc(X_train, y_train, X_test, y_test):
    """Grid-search an ``abc`` ensemble over ``rfc`` and compare with the untuned one.

    The search covers ``learning_rate`` in {0.02, 0.04, 0.2} and
    ``n_estimators`` in {75, 100, 150}, scored with F-beta (beta=0.5).
    Accuracy and F-score on the test data are printed for the untuned
    classifier and for the best estimator found.

    Args:
        X_train, y_train: training data; ``y_train.ravel()`` is what is
            actually passed to ``fit``.
        X_test, y_test: data the reported metrics are computed on.

    Returns:
        tuple: ``(best_clf, best_predictions)`` -- the best estimator from
        the grid search and its predictions for ``X_test``.
    """
    # Base estimator is passed positionally: the ``base_estimator`` keyword
    # was renamed to ``estimator`` in scikit-learn 1.2 and removed in 1.4,
    # whereas the first positional slot is valid in all releases.
    # (Assumes ``abc`` is sklearn's AdaBoostClassifier.)
    clf = abc(rfc(), random_state=13)

    parameters = {
        'learning_rate': [0.02, 0.04, 0.2],
        'n_estimators': [75, 100, 150],
    }
    scorer = make_scorer(fbeta_score, beta=0.5)
    y_flat = y_train.ravel()

    grid_fit = GridSearchCV(clf, parameters, scoring=scorer).fit(X_train, y_flat)
    best_clf = grid_fit.best_estimator_

    # ``clf`` itself is fitted only here, after the search has finished.
    predictions = clf.fit(X_train, y_flat).predict(X_test)
    best_predictions = best_clf.predict(X_test)

    # Before/after report.
    print("Unoptimized model\n------")
    print("Accuracy score on testing data: {0}".format(accuracy_score(y_test, predictions)))
    print("F-score on testing data: {0}".format(fbeta_score(y_test, predictions, beta=0.5)))
    print("\nOptimized Model\n------")
    print("Final accuracy score on the testing data: {0}".format(accuracy_score(y_test, best_predictions)))
    print("Final F-score on the testing data: {0}".format(fbeta_score(y_test, best_predictions, beta=0.5)))

    return best_clf,best_predictions
# In[ ]: # アルゴリズムにロジスティック回帰を採用 lr = LogisticRegression(C=1000) # fit関数で学習開始 lr.fit(X_train, y_train) y_test_pred = lr.predict(X_test) y_test_pred # In[ ]: # Adaboostなるものをためしてみる from sklearn.ensemble import AdaBoostClassifier as abc bdt = abc() bdt.fit(X_train, y_train) y_test_ada = bdt.predict(X_test) y_test_ada # In[ ]: # 次は決定木 from sklearn.tree import DecisionTreeClassifier clf = DecisionTreeClassifier(random_state=0) clf.fit(X_train, y_train) y_test_dtc = clf.predict(X_test) y_test_dtc # In[ ]:
Scale_NumCols(['Age', 'SibSp', 'Parch', 'Fare'], take_log=True))  # NOTE(review): tail of a statement that starts above this view (presumably the definition of scale_num) -- confirm

# Preprocessing pipeline: the Deal_NAs step, the Encode_CatCols step
# (configured with drop=['Name', 'Ticket']), then the scale_num step.
pipeline = Pipeline([('deal_na', Deal_NAs()),
                     ('encode_cat', Encode_CatCols(drop=['Name', 'Ticket'])),
                     scale_num])
#X_prepared = pipeline.fit_transform(X_)
# Fit the pipeline on the training split only; the validation split is
# transformed with the already-fitted pipeline.
X_train_p = pipeline.fit_transform(X_train)
X_vali_p = pipeline.transform(X_vali)

from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc

# Candidate models. Each assignment overwrites the previous one, so only the
# last uncommented line (currently the gradient-boosting model) is fitted.
model = lr(C=1)
model = dtc(min_samples_split=10, max_features=5)
model = abc(dtc(max_depth=4), n_estimators=100)
model = gbc(n_estimators=200)
#model = rfc(n_estimators=200 ,min_samples_split = 5)
model.fit(X_train_p, Y_train)
# print(model.score(X_train_p, Y_train))
# print(model.score(X_vali_p, Y_vali))
# coef_df = pd.DataFrame({'name':X_train_p.columns.tolist(), 'coef':model.coef_[0]})
# coef_df.sort_values('coef', ascending = False)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Report on the validation split.
Y_pred = model.predict(X_vali_p)
print(classification_report(Y_vali, Y_pred))
# NOTE(review): `submit` is not defined in the visible code -- confirm it exists upstream
print(submit.head())
from cross_validation import cross_validation as CV
import matplotlib.pyplot as plt
from feature_selection import feature_selection

# Load the data splits (whitespace-delimited text read with np.loadtxt).
x_train = np.loadtxt('../Data/x_train.txt')
y_train_binary = np.loadtxt('../Data/y_train_binary.txt')
x_test = np.loadtxt('../Data/x_test.txt')
y_test_binary = np.loadtxt('../Data/y_test_binary.txt')
x_orig_train = np.loadtxt('../Data/x_orig_train.txt')
y_orig_train_binary = np.loadtxt('../Data/y_orig_train_binary.txt')
x_final_test = np.loadtxt('../Data/x_final_test.txt')
y_final_test_binary = np.loadtxt('../Data/y_final_test_binary.txt')

# Classifier under evaluation (default parameters).
clf = abc()

# Feature-selection helper. Both selection calls below are disabled, so the
# classifier is trained on the unmodified features.
fs = feature_selection()
#clf,x_train,x_test,x_final_test,y_out = fs.PCASelection(x_train,y_train_binary,x_test,y_test_binary,x_final_test,clf)
#clf,x_train,x_test,x_final_test,y_out = fs.KBest(x_train,y_train_binary,x_test,y_test_binary,x_final_test,clf)
clf.fit(x_train, y_train_binary)
y_out = clf.predict(x_test)

# Print scores. The %-formatted single-argument print emits exactly what the
# former Python 2 statement `print "label ", value` did (label, separator
# space, str(value)) and also parses on Python 3.
score = clf.score(x_test, y_test_binary)
print("Score :  %s" % (score,))
print("Precision recall f-score support :  %s" % (prfs(y_test_binary, y_out),))

#Cross validation
def experiment(model, textmodel):
    """Train one classifier per (stock, horizon) and predict a label per key.

    For stock 1101 and horizons of 30 and 60 trade days, ``feature_label``
    supplies ``data`` where ``data[0]``/``data[1]`` are the training features
    and labels, ``data[2]`` maps a key to a list of feature vectors and
    ``data[3]`` maps the same key to its label. Training features are
    standardised with a scaler fitted on them; the vectors in ``data[2]`` are
    standardised with a separate scaler fitted on all of those vectors.

    For every key with a non-empty vector list two predictions are made:

    * vote   -- sum of the per-vector predictions, mapped to 1 if > 0 else -1
    * KMeans -- prediction for the single-cluster centre of the vectors,
                mapped to 1 if > 0 else -1

    Args:
        model: ``'ns'``, ``'abc'`` or ``'lr'`` -- which classifier to build.
        textmodel: forwarded unchanged to ``feature_label``.

    Returns:
        tuple: ``(all_labels, all_predict)``; each is a list with one dict
        per (stock, horizon), keyed like ``data[2]``. Label dict values come
        from ``data[3]``; prediction dict values are ``[vote, kmeans]``.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    # Lazily-evaluated factories, so that an unsupported name is rejected
    # up front instead of surfacing later as an unbound ``clf``.
    builders = {
        'ns': lambda: ns(kernel='linear'),
        'abc': lambda: abc(learning_rate=1, n_estimators=100),
        'lr': lambda: lr(),
    }
    if model not in builders:
        raise ValueError(
            "unknown model %r; expected one of %s" % (model, sorted(builders)))

    trade_days = [30, 60]
    nos = [1101]
    all_predict = []
    all_labels = []
    for stockno in nos:
        for days in trade_days:
            data = feature_label(stockno, days, textmodel)
            vectors_by_key = data[2]

            # Scaler for the prediction-time vectors, fitted on those vectors.
            feature = np.array(
                [vec for vecs in vectors_by_key.values() for vec in vecs])
            test_scaler = StandardScaler().fit(feature)

            x_train = np.array(data[0])
            y_train = np.array(data[1])
            x_train = StandardScaler().fit(x_train).transform(x_train)

            clf = builders[model]().fit(x_train, y_train)

            pv = 0
            pk = 0
            y_test = {}
            predict = {}
            for i, j in vectors_by_key.items():
                if j != []:
                    # ---- vote ----
                    tmp = 0
                    for vec in j:
                        tmp += clf.predict(
                            test_scaler.transform(vec.reshape(1, -1)))
                    pv = 1 if tmp > 0 else -1

                    # ---- KMeans ----
                    center = KMeans(n_clusters=1).fit(list(j))
                    result = test_scaler.transform(
                        np.array(center.cluster_centers_[0]).reshape(1, -1))
                    pk = 1 if clf.predict(result) > 0 else -1
                # NOTE(review): a key with no vectors records the pv/pk left
                # over from the previous key (0/0 for the first one). This
                # keeps the existing behaviour as reconstructed from the
                # collapsed source -- confirm both the nesting and the intent.
                y_test[i] = data[3][i]
                predict[i] = [pv, pk]

            all_predict.append(predict)
            all_labels.append(y_test)
            print("StockNo: {}, {} trade days done".format(stockno, days))
    return all_labels, all_predict
# NOTE(review): the indented lines below are the tail of a function whose
# `def` lies above this view (it is called as f1Score further down); they
# print the computed metrics and return the F1 score.
    print 'correct_predict =', correct_predict
    print 'precision =', precision
    print 'recall =', recall
    print 'f1_score =', f1_score
    return f1_score

# Load the pickled dataset bundle.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# use it on pickles from a trusted source.
with open('clcntt_randfix01.pickle', 'rb') as f:
    data = pickle.load(f)
train_label = data['train_label']
train_data = data['train_data']
test_label = data['test_label']
test_data = data['test_data']
column_names = data['col_names']
del data  # drop the reference to the bundle once its parts are bound

# Fit the abc model on the training part and evaluate its predictions for
# the test part with f1Score.
trainer = abc(n_estimators=100, learning_rate=0.9).fit(train_data, train_label)
tr_prediction = trainer.predict(test_data)
f1Score(tr_prediction, test_label)
# The string literal below holds disabled tree-to-PDF export code; as a bare
# string expression it has no effect at runtime.
""" dot_data=StringIO() tree.export_graphviz(trainer, out_file=dot_data, feature_names=column_names, class_names=['innocent','sin'], filled=True, rounded=True, impurity=False,max_depth=6,rotate=True ) graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) graph.write_pdf("test.pdf") """
print 'finished, file saved : ',excel_name  # NOTE(review): likely the last statement of a unit that starts above this view -- confirm its indentation

# Load the pickled dataset bundle.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# use it on pickles from a trusted source.
with open(picklename,'rb') as f:
    data=pickle.load(f)
test_labels =data['test_label']
test_dataset =data['test_data']
train_labels =data['train_label']
train_dataset =data['train_data']
column_names =data['col_names']
submit_dataset =data['submit_data']
submit_custid =data['submit_custid']
del data  # drop the reference to the bundle once its parts are bound

# Set up the trainer and its variables according to the selected condition
if trainer_select =='adaboost':
    trainer = abc(n_estimators=n_estimators,learning_rate=learning_rate).fit(train_dataset,train_labels)
elif trainer_select =='randomforest':
    trainer = rf(n_estimators=n_estimators).fit(train_dataset,train_labels)
elif trainer_select =='tree': # the tree trainer can also write a pdf file
    trainer = tree.DecisionTreeClassifier().fit(train_dataset,train_labels)
    if ifpdf==1: # export to pdf if requested
        dot_data=StringIO()
        tree.export_graphviz(trainer,
                             out_file=dot_data,
                             feature_names=column_names,
                             class_names=['innocent','sin'],
                             filled=True, rounded=True,
                             impurity=False,max_depth=6,rotate=True
                             )
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf(pdfname)
##Fourth Model Approach: Boosted Trees (BT) - How does BT compare with RF and the rest of my attempts out-of-the-box? What tuning may be needed? if (alg3): df_bst = pd.get_dummies(df, columns=['sales', 'salary']) df_train_bst = pd.get_dummies(df_train, columns=['sales', 'salary']) df_test_bst = pd.get_dummies(df_test, columns=['sales', 'salary']) X = df_train_bst[[ 'satisfaction_level', 'salary_low', 'salary_medium', 'salary_high', 'number_project' ] + [v for v in df_train_bst.columns if re.search('sales', v) is not None]] X_test = df_test_bst[[ 'satisfaction_level', 'salary_low', 'salary_medium', 'salary_high', 'number_project' ] + [v for v in df_test_bst.columns if re.search('sales', v) is not None]] c3_result = abc(random_state=1234) c3_result.fit(X, df_train_bst['left']) #Overall fit is markedly better than previous attempts #Decision boundary is implicitly 50% in score() method print('\nRandom Forest OOS Results\n') print('Average Accuracy ' + str(c3_result.score(X_test, df_test_bst['left']))) #4-Fold CV done twice each fold to check average OOS accuracy reported above rkf = RepeatedKFold(n_splits=4, n_repeats=2, random_state=12883823) cv_score_list = [] for (train, test) in rkf.split(df_bst[[ 'satisfaction_level', 'salary_low', 'salary_medium', 'salary_high', 'number_project'
# # for i in range(len(test_x)): # test_x_vec = "" # line = test_x.iloc[i] # for j in line.split(): # j = p.stem(j) # test_x_vec += j # test_x.iloc[i] = test_x_vec # ============================================================================= # TFIDF方法 vectorizer = TfidfVectorizer() x_train_tfidf = vectorizer.fit_transform(train_x) x_input_tfidf = vectorizer.transform(test_x) # 构建模型 朴素贝叶斯 model = abc(n_estimators=600) model.fit(x_train_tfidf, train_y) # predicted = model.predict(x_input_tfidf) #print(confusion_matrix(predicted, test_y)) np.savetxt("data/res.csv", predicted, delimiter=',', fmt='%s') #print('Accuracy score: ', format(accuracy_score(test_y, predicted))) #print('Precision score: ', format(precision_score(test_y, predicted))) #print('Recall score: ', format(recall_score(test_y, predicted))) #print('F1 score: ', format(f1_score(test_y, predicted))) # ============================================================================= # #output some examples # category_map = {'ham':0, 'spam':1}
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB as gnb
#from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
#from sklearn.svm import SVC as svc

# Candidate classifiers, all with default parameters.
clf1 = gnb()
#clf2 = lr()
clf3 = rfc()
clf4 = abc()
clf5 = gbc()
#clf6 = svc()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `model_selection` is its replacement. The fallback keeps the script
# importable on installations that predate `model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import metrics as mtr

# Hold out 30% of the data for testing, with a fixed seed for repeatability.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)