def onescore(X, Y, Xtest): clf = RandomForestClassifier(oob_score=True, n_jobs=-1, n_estimators=1000, max_features=300, random_state=0) clf.fit(X, Y) print "oob_score = ", clf.oob_score_ print clf.get_params() ytest = clf.predict(Xtest) output(ytest, "try_004.csv")
def test_set_params():
    """Check that ``set_params`` can replace and reconfigure sub-estimators."""
    log_reg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    naive_bayes = GaussianNB()

    # Unfitted ensemble: the named view must hand back the objects passed in.
    voter_a = VotingClassifier(
        [('lr', log_reg), ('rf', forest)],
        voting='soft',
        weights=[1, 2],
    )
    assert_true('lr' in voter_a.named_estimators)
    assert_true(voter_a.named_estimators.lr is voter_a.estimators[0][1])
    assert_true(voter_a.named_estimators.lr is voter_a.named_estimators['lr'])

    # Fitted ensemble: the same checks against the fitted attributes.
    voter_a.fit(X, y)
    assert_true('lr' in voter_a.named_estimators_)
    assert_true(voter_a.named_estimators_.lr is voter_a.estimators_[0])
    assert_true(voter_a.named_estimators_.lr is voter_a.named_estimators_['lr'])

    # Swap the 'nb' slot for the forest; both ensembles should then agree.
    voter_b = VotingClassifier(
        [('lr', log_reg), ('nb', naive_bayes)],
        voting='soft',
        weights=[1, 2],
    )
    voter_b.set_params(nb=forest).fit(X, y)
    assert_false(hasattr(voter_b, 'nb'))

    assert_array_equal(voter_a.predict(X), voter_b.predict(X))
    assert_array_almost_equal(voter_a.predict_proba(X),
                              voter_b.predict_proba(X))
    assert_equal(voter_b.estimators[0][1].get_params(), log_reg.get_params())
    assert_equal(voter_b.estimators[1][1].get_params(), forest.get_params())

    # Nested ``name__param`` keys must reach the underlying estimators.
    voter_a.set_params(lr__C=10.0)
    voter_b.set_params(nb__max_depth=5)

    assert_true(voter_a.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(voter_b.estimators[1][1].get_params()['max_depth'] == 5)

    flat_value = voter_a.get_params()["lr__C"]
    nested_value = voter_a.get_params()["lr"].get_params()['C']
    assert_equal(flat_value, nested_value)
def cross_validation(X, y): #fig = plt.figure() #ax = fig.add_subplot(111, projection='3d') assert(len(y) == len(X)) # Split the dataset in two equal parts X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42) depth = [8, 16, 32, 64] split = [1, 2, 4, 8, 16, 32, 64] best_score = 0 best_train_score = 0 best_param = None for d in depth: for s in split: model = RandomForestClassifier(n_estimators=500, criterion="entropy", max_features="sqrt", max_depth=d, min_samples_split=s, n_jobs=-1) model = model.fit(X_train, y_train) print "Depth: %d split: %d" % (d, s) print "Model trainning score:" score_train = model.score(X_train, y_train) print score_train #ax.scatter(d, s, score_train, c='b', marker='o') print "Model test score:" score_test = model.score(X_test, y_test) print score_test #ax.scatter(d, s, score_test, c='r', marker='^') if score_test > best_score: best_score = score_test best_train_score = score_train best_param = model.get_params() print "==================" print best_train_score print best_score print best_param return best_param
def fit(self,train_X,train_Y):
    """Train one random forest per chunk of label-0 rows, each paired with all label-1 rows.

    The label-0 rows are split into chunks of roughly the size of the
    label-1 set.  For every chunk a 1000-tree forest is trained on
    ``chunk + ones`` and appended to ``self.rfs``.  When ``self.weights`` is
    not ``None``, the mean 10-fold cross-validation score of each forest is
    appended to it as well.

    Args:
        train_X: 2-D array of samples (indexed as ``train_X[mask, :]``).
        train_Y: label array compared against 0 and 1 to build the masks.

    Raises:
        ZeroDivisionError: if no row is labelled 1.
    """
    # split set into ones and zeros
    zeros = train_X[train_Y == 0, :]
    ones = train_X[train_Y == 1, :]
    num_ones = ones.shape[0]

    # Number of chunks to split the zeros into.  np.array_split rejects a
    # section count of 0, which the original produced whenever there were
    # fewer zeros than ones, so always use at least one chunk.
    num_chunks = max(1, zeros.shape[0] // num_ones)
    chunks = np.array_split(zeros, num_chunks)

    # train rfs
    for i, chunk in enumerate(chunks):
        print('training random forest %s of %s' %(i,num_chunks))
        chunk_rf = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
        print(chunk_rf.get_params())

        chunk_train_X = np.concatenate([chunk, ones])
        chunk_train_Y = np.concatenate(
            [np.zeros(chunk.shape[0]), np.ones(num_ones)])

        # cross_validation
        if self.weights is not None:
            print('cross_validation')
            scores = cross_validation.cross_val_score(
                chunk_rf, chunk_train_X, chunk_train_Y, cv = 10, n_jobs = -1)
            print(scores.mean())
            self.weights.append(scores.mean())

        # train
        chunk_rf.fit(chunk_train_X, chunk_train_Y)
        self.rfs.append(chunk_rf)
def training_and_test(token, train_data, test_data, num_classes, result):
    """Fit a 20-tree entropy forest, evaluate it and record the outcome.

    Args:
        token (:obj:`str`): Key under which this run is stored in ``result``.
        train_data (:obj:`tuple` of :obj:`numpy.array`): ``(features, labels)`` used for fitting.
        test_data (:obj:`tuple` of :obj:`numpy.array`): ``(features, labels)`` used for evaluation.
        num_classes (:obj:`int`): Number of classes.
        result (:obj:`pyActLearn.performance.record.LearningResult`): Receives the
            model parameters and the confusion matrix via ``add_record``.

    Returns:
        tuple: ``(predicted labels, predicted probabilities)``; the probability
        array has ``num_classes`` columns.
    """
    train_features, train_labels = train_data[0], train_data[1].flatten()
    test_features = test_data[0]

    forest = RandomForestClassifier(n_estimators=20, criterion="entropy")
    forest.fit(train_features, train_labels)

    # Test
    labels_out = forest.predict(test_features)
    proba_out = forest.predict_proba(test_features)

    # Evaluate the test and store the result
    matrix = get_confusion_matrix(num_classes=num_classes,
                                  label=test_data[1].flatten(),
                                  predicted=labels_out)
    result.add_record(forest.get_params(), key=token, confusion_matrix=matrix)

    # If some class never appeared in training, widen the probability array
    # to num_classes columns and leave the missing classes at zero.
    if proba_out.shape[1] != num_classes:
        padded = np.zeros((proba_out.shape[0], num_classes), np.float32)
        for column, class_label in enumerate(forest.classes_):
            padded[:, class_label] = proba_out[:, column]
        proba_out = padded

    return labels_out, proba_out
def train_model_03(dataset_id):
    """Train model 03 (random forest) on ``dataset_id`` and make predictions.

    Data comes from ``prepare_data_for_training``; the configured classifier is
    handed to ``train_and_make_predictions`` with a label that embeds its
    parameters.
    """
    X, Y, test = prepare_data_for_training(dataset_id)

    forest = RandomForestClassifier(
        n_estimators=300,
        criterion="gini",
        max_depth=117,
        min_samples_split=150,
        min_samples_leaf=3,
        bootstrap=False,
        n_jobs=-1,
    )
    run_label = "RandomForest %s" % forest.get_params()
    train_and_make_predictions(forest, X, Y, test, run_label)
def tuning_randomforest(X, y): clf = RandomForestClassifier(n_estimators=10000, criterion='entropy', max_depth=6, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0, max_features=0.2, n_jobs=-1, class_weight='balanced_subsample', verbose=0) print 'parameters:', clf.get_params() skf = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0) for train_index, val_index in skf: X_train, X_val = X[train_index], X[val_index] y_train, y_val = y[train_index], y[val_index] clf.fit(X_train, y_train) print 'train accuracy', clf.score(X_train, y_train) y_val_pred = clf.predict(X_val) print 'val auc:', roc_auc_score(y_val, y_val_pred)
def test_voting_classifier_set_params():
    """Replacing an estimator via ``set_params`` must match building with it."""
    log_reg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    naive_bayes = GaussianNB()

    # Reference ensemble, built directly with the forest.
    reference = VotingClassifier(
        [('lr', log_reg), ('rf', forest)],
        voting='soft',
        weights=[1, 2],
    ).fit(X, y)

    # Same ensemble reached by swapping the 'nb' slot afterwards.
    swapped = VotingClassifier(
        [('lr', log_reg), ('nb', naive_bayes)],
        voting='soft',
        weights=[1, 2],
    )
    swapped.set_params(nb=forest).fit(X, y)

    assert_array_equal(reference.predict(X), swapped.predict(X))
    assert_array_almost_equal(reference.predict_proba(X),
                              swapped.predict_proba(X))

    first_params = swapped.estimators[0][1].get_params()
    second_params = swapped.estimators[1][1].get_params()
    assert first_params == log_reg.get_params()
    assert second_params == forest.get_params()
def do_generate_metrics_rf_optimazed_model(X_train, y_train, X_test, y_test, grid):
    """Refit a forest with the grid search's best parameters and log the results.

    Args:
        X_train, y_train: Data the final model is fitted on.
        X_test, y_test: Data used for ``calculate_metrics`` and the logged score.
        grid: Search object exposing ``best_params_`` and ``best_score_``.

    Returns:
        tuple: ``(fitted model, metrics)`` where ``metrics`` is whatever
        ``calculate_metrics`` returns.
    """
    file_operations.write_logs(FILENAME, 'Starting metrics calculation')

    forest = RandomForestClassifier(random_state=my_constants.RANDOM_VALUE,
                                    oob_score=True)
    forest.set_params(**grid.best_params_)
    forest.fit(X_train, y_train)

    metrics = calculate_metrics(forest, X_test, y_test)

    model_message = "".join([
        "Generated model params and results\n params:",
        str(forest.get_params()),
        "\nscore ",
        str(forest.score(X_test, y_test)),
    ])
    file_operations.write_logs(FILENAME, model_message)

    grid_message = "".join([
        "Search grid best params and results\n params:",
        str(grid.best_params_),
        "\nscore ",
        str(grid.best_score_),
    ])
    file_operations.write_logs(FILENAME, grid_message)

    return forest, metrics
class RandForestPS():
    """
    This classifier first builds a feature space consisting of predicted
    probabilities from a list of classifiers and then trains a random forest
    on that space. One can use it like a classifier from the sklearn package.
    """

    def __init__(self, estimators=None, *args, n_folds=8, bootstrap=False, **kwargs):
        """Create the forest and the probability space.

        Args:
            estimators: Passed to ``ProbabilitySpace``.
            *args, **kwargs: Forwarded to ``RandomForestClassifier``.
            n_folds: Passed to ``ProbabilitySpace``.
            bootstrap: Passed to ``ProbabilitySpace`` (not to the forest).
        """
        self.RF = RandomForestClassifier(*args, **kwargs)
        self.estimators = estimators
        self.n_folds = n_folds
        # Remembered so set_params can rebuild the probability space.
        self._space_bootstrap = bootstrap
        self.pbb_space = ProbabilitySpace(estimators, n_folds=n_folds, bootstrap=bootstrap)

    def fit(self, X, y):
        """Fit the probability space on ``(X, y)``, then the forest on its output."""
        Xp, yp = self.pbb_space.fit_transform(X, y)
        self.RF.fit(Xp, yp)
        return self

    def predict(self, X, y=None):
        """Return the forest's predictions for ``X`` mapped into the probability space."""
        Xp, yp = self.pbb_space.transform(X, y)
        return self.RF.predict(Xp)

    def predict_proba(self, X):
        """Return the forest's class probabilities for the transformed ``X``."""
        Xp, _ = self.pbb_space.transform(X)
        return self.RF.predict_proba(Xp)

    def set_params(self, **kwargs):
        """Set parameters on the forest and on this wrapper.

        ``estimators`` and ``n_folds`` are the two keys ``get_params`` adds on
        top of the forest's own parameters.  They are handled here rather than
        forwarded to the forest, so ``set_params(**get_params())`` round-trips
        instead of failing on a parameter the forest does not know.  All other
        keys go to the forest unchanged.  If either wrapper parameter actually
        changes, the probability space is rebuilt with the new values.
        """
        _unset = object()
        new_estimators = kwargs.pop('estimators', _unset)
        new_n_folds = kwargs.pop('n_folds', _unset)

        # Let the forest validate its keys before this object is touched.
        self.RF.set_params(**kwargs)

        rebuild_space = False
        if new_estimators is not _unset:
            rebuild_space = new_estimators is not self.estimators
            self.estimators = new_estimators
        if new_n_folds is not _unset:
            rebuild_space = rebuild_space or new_n_folds != self.n_folds
            self.n_folds = new_n_folds

        if rebuild_space:
            self.pbb_space = ProbabilitySpace(
                self.estimators,
                n_folds=self.n_folds,
                bootstrap=getattr(self, '_space_bootstrap', False))
        return self

    def get_params(self, *args, **kwargs):
        """Return the forest's parameters plus ``estimators`` and ``n_folds``."""
        params = self.RF.get_params(*args, **kwargs)
        params['estimators'] = self.estimators
        params['n_folds'] = self.n_folds
        return params

    def score(self, X, y):
        """Return the forest's score on ``(X, y)`` mapped into the probability space."""
        Xp, yp = self.pbb_space.transform(X, y)
        return self.RF.score(Xp, yp)
def do_ml(ticker):
    """Train several classifiers for ``ticker`` and print how each performs.

    A distance-weighted k-NN, a depth-4 decision tree, a default random forest
    and a voting ensemble (LinearSVC, k-NN, random forest) are fitted on a
    75/25 split of the feature sets from ``extract_featuresets``.

    Returns:
        The voting ensemble's score on the held-out split.
    """
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # --- k-nearest neighbours ---
    knn = neighbors.KNeighborsClassifier(weights='distance')
    knn.fit(X_train, y_train)
    print("\n\n")
    print("Parameters of Kneighbors", knn.get_params())
    knn_score = knn.score(X_test, y_test)
    print("Accuracy of Kneighbors", knn_score)
    knn_labels = knn.predict(X_test)
    print("Predicted Spread of Kneighbors:", Counter(knn_labels))
    print("\n\n")

    # --- decision tree ---
    print("Decision Tree")
    tree = DecisionTreeClassifier(max_depth=4)
    tree.fit(X_train, y_train)
    print("Parameters of Decision Tree", tree.get_params())
    print("Accuracy of Decision Tree", tree.score(X_test, y_test))
    print("Predicted Spread of Decision Tree", Counter(tree.predict(X_test)))
    print("\n\n")

    # --- random forest ---
    print("RandomForest")
    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)
    print("Parameters of RandomForest", forest.get_params())
    print("Accuracy of RandomForest", forest.score(X_test, y_test))
    print("Predicted Spread of RandomForest", Counter(forest.predict(X_test)))

    # --- voting ensemble ---
    print("Ensemble")
    members = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    ensemble = VotingClassifier(members)
    ensemble.fit(X_train, y_train)
    confidence = ensemble.score(X_test, y_test)
    print("Accuracy of Ensembles", confidence)
    ensemble_labels = ensemble.predict(X_test)
    print("Predicted Spread of ensembles:", Counter(ensemble_labels))

    return confidence
def train_by_RandForest():
    """Train a 30-tree forest on the first 2000 stored feature indices and print scores.

    Loads the TF-IDF train/test split, restricts both matrices to the first
    2000 entries of ``indices`` stored in ``./data/indices``, fits the forest
    and prints its parameters, two accuracy figures and the elapsed time.
    The elapsed time covers fitting, prediction and scoring.
    """
    filename = 'train_and_test_data'
    x_train, x_test, y_train, y_test = Load_Traindata_Testdata_with_Tfidf(
        filename)

    # The original left this handle open; the context manager closes it.
    # NOTE: pickle.load can execute code from the file - only load trusted data.
    with open('./data/indices', 'rb') as p:
        data = pickle.load(p)
    indices = data['indices']

    most_importance_feature = indices[:2000]
    x_train = x_train[:, most_importance_feature]
    x_test = x_test[:, most_importance_feature]
    print("Selected feature with shape:", x_train.shape)

    model = RandomForestClassifier(n_jobs=8, n_estimators=30)
    now = datetime.datetime.now()
    print("Training begin by RandForest:", now)
    model.fit(x_train, y_train)

    y_pre = model.predict(x_test)
    print(model.get_params())
    print(model.score(x_test, y_test))
    print(accuracy_score(y_test, y_pre))

    training_time = datetime.datetime.now() - now
    print("Training time(s):", training_time)
class RandomForestModel(object):
    """Thin wrapper around ``RandomForestClassifier`` with a uniform model API."""

    def __init__(self, n_estimators, max_depth, verbose=1):
        self.name = 'RandomForest'
        self.model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            verbose=verbose,
        )

    def get_params(self):
        """Return the wrapped estimator's parameters."""
        return self.model.get_params()

    def train(self, features, labels):
        """Fit the wrapped estimator; returns nothing."""
        self.model.fit(features, labels)

    def predict(self, feature):
        """Return the wrapped estimator's predictions for ``feature``."""
        return self.model.predict(feature)

    def score(self, features, labels):
        """Evaluate on ``(features, labels)``.

        Returns:
            tuple: ``(results, figures)`` - a dict with the parameters, model
            name, accuracy and macro-averaged precision/recall, and an empty
            dict of figures.
        """
        predicted = self.model.predict(features)
        accuracy = accuracy_score(labels, predicted)
        precision = precision_score(labels, predicted, average='macro')
        recall = recall_score(labels, predicted, average='macro')

        results = dict([
            ('params', self.get_params()),
            ('model', self.name),
            ('accuracy', accuracy),
            ('precision', precision),
            ('recall', recall),
        ])
        print('---> Accuracy obtained is: {0:.2f}%'.format(accuracy * 100))
        return results, {}

    def save(self, filename):
        """Pickle the wrapped estimator to ``filename``."""
        with open(filename, 'wb') as handle:
            pickle.dump(self.model, handle)
def machine(k, temp1, temp2):
    """Cross-validate a 100-tree forest and plot mean accuracy with a 95% interval.

    Runs stratified 10-fold cross-validation of the classifier on
    ``(temp1, temp2)``, prints the fold scores and the estimator parameters,
    and shows a figure marking the mean accuracy and the interval
    ``mean +/- 1.96 * std / sqrt(10)`` (all in percent).  ``k`` is not used by
    the active code.
    """
    # (The original carried a disabled manual train/test split seeded by k.)
    forest = RandomForestClassifier(n_estimators=100)
    folds = StratifiedKFold(n_splits=10)
    scores = cross_val_score(forest, temp1, temp2, cv=folds)
    print(scores)
    print(forest.get_params())

    mean = statistics.mean(scores)
    std = statistics.stdev(scores)
    # Normal-approximation half-width over the 10 fold scores.
    half_width = 1.96 * (std / 10 ** 0.5)
    left = mean - half_width
    right = mean + half_width

    plt.figure()
    plt.axvline(mean * 100, color="blue", ymax=0.75, label='Mean Accuracy')
    plt.axvline(left * 100, color="red", ymax=0.5,
                label='95% Confidence Interval')
    plt.axvline(right * 100, color="red", ymax=0.5)

    summary = (
        ' Mean Accuracy = %0.2f%% \n Lower limit of CI = %0.2f%% \n Upper Limit of CI = %0.2f%%'
        % (mean * 100, left * 100, right * 100)
    )
    plt.text(50, 0.8, summary, fontsize=12)

    plt.xlim([50, 100])
    plt.legend(fontsize=11)
    plt.xlabel('Accuracy (in %)', fontsize=12)
    plt.title(
        'Mean Accuracy and Confidence Interval \n Random Forest Classifier')
    plt.show()
class AvgPredictor():
    """Average column 1 of ``predict_proba`` from a linear SVM and a random forest."""

    def __init__(self):
        self.clf_svm = svm.SVC(kernel='linear', probability=True)
        self.clf_rf = RandomForestClassifier(n_estimators=1000, max_depth=10)

    def fit(self, X, y):
        """Fit both underlying classifiers on ``(X, y)`` and return ``self``."""
        # fit with svm
        self.clf_svm.fit(X, y)
        # fit with rf
        self.clf_rf.fit(X, y)
        return self

    def get_params(self, deep=True):
        """Return the merged parameters of both classifiers.

        With ``deep`` false an empty dict is returned.  Otherwise the forest's
        parameters are merged with the SVM's; on a shared key the SVM value
        wins (same precedence as the original ``rf | svm``).
        """
        if not deep:
            return {}
        # dict.update instead of ``|``: the dict union operator needs
        # Python 3.9+, while the rest of the class ran on older interpreters.
        merged = dict(self.clf_rf.get_params(deep))
        merged.update(self.clf_svm.get_params(deep))
        return merged

    def predict(self, t):
        """Return a list with the mean of both models' column-1 probabilities per sample."""
        # predict svm
        svm_scores = self.clf_svm.predict_proba(t)[:, 1]
        # predict rf
        rf_scores = self.clf_rf.predict_proba(t)[:, 1]

        samples = len(t)
        # ``range`` instead of ``xrange`` so this runs on Python 2 and 3.
        return [(svm_scores[i] + rf_scores[i]) / 2 for i in range(samples)]
def main_rfclassifier(n_est, max_depth, datastruct, experiment_id=None):
    """Train a balanced random forest inside an MLflow run and log the results.

    Args:
        n_est: Number of trees.
        max_depth: Maximum tree depth.
        datastruct: ``(df, train_x, train_y, test_x, test_y)``.
        experiment_id: Not used by this function.
    """
    print("Starting experiment [{}, {}]".format(n_est, max_depth))
    df, train_x, train_y, test_x, test_y = datastruct

    print("Setting up experiment")
    mlflow.set_experiment('RandomForest Classifier')

    with mlflow.start_run():
        forest = RandomForestClassifier(n_estimators=n_est,
                                        max_depth=max_depth,
                                        class_weight='balanced')

        # Time only the fit call.
        print("Training model")
        fit_started = time.time()
        forest.fit(train_x, train_y)
        fit_finished = time.time()
        print("Model trained")

        score = score_model(forest, test_x, test_y, True)

        # mlflow logging
        mlflow.log_param('model_type', str(forest.__class__))
        mlflow.log_param('features', train_x.columns)
        mlflow.log_param('sample_size', df.shape)
        mlflow.log_params(forest.get_params())

        mlflow.log_metrics({
            'roc_auc': score,
            'elapsed_time': (fit_finished - fit_started),
        })
        mlflow.sklearn.log_model(forest, "Random Forest Classifier")

    print("Completed")
from sklearn.ensemble import RandomForestClassifier if '--example' in sys.argv: trainingdata = [[1, 1], [2, 0.5], [-1, -1], [-2, -2]] traininglabel = [1, 1, -1, -1] testdata = [[1, 3], [-3, -3]] model = RandomForestClassifier() model.fit(trainingdata, traininglabel) output = model.predict(testdata) for label in output: print label probas = model.predict_proba(testdata) for label in probas: print label for weights in model.get_params(): print weights for i, gini_imp in enumerate(model.feature_importances_): print "gini係数 index = ", i, gini_imp if '--learn' in sys.argv: import json anses = [] traings = [] for line in open('./learning.json').read().split('\n'): if line.strip() == "" : continue ans_label, data = json.loads(line.strip()) anses.append(ans_label) traings.append(data) model = RandomForestClassifier()
'dataset': str(data_set), 'kmer_size': kmer_size, 'n_splits': cv_gridsearch, 'n_repeats': n_iter_grid, 'acc': acc, 'auc': auc, 'model': learn_type, 'NMF_factors': n, 'params': 'liblinear' }) elif learn_type == "lasso": k_fold = RepeatedStratifiedKFold(n_splits=cv_gridsearch, n_repeats=n_iter_grid) estimator = LassoCV(alphas = param_grid["alpha"][0], cv = k_fold, n_jobs = -1) accuracies = [] for train_i, test_i in skf.split(x, y): x_train, x_test = x[train_i], x[test_i] y_train, y_test = y[train_i], y[test_i] y_train = list(map(int, y_train)) y_test = list(map(int, y_test)) estimator.fit(x_train, y_train) accuracy = evaluate(estimator, x_test, y_test) accuracies.append(estimator.get_params()) accuracies.append(accuracy) with open('/pollard/home/abustion/deep_learning_microbiome/lasso.txt', 'w') as f: for item in accuracies: f.write("%s\n" % item)
def train(datadict, model_id, n_estimators=25, max_depth=50):
    """Train a random forest, then save the model and its statistics.

    Only ``n_estimators`` and ``max_depth`` are exposed; the remaining
    settings (``verbose=1``, ``n_jobs=-1``, ``bootstrap=False``) are fixed in
    code.  The original notes say the other hyperparameters were tuned by the
    CMU team.

    Args:
        datadict (dict): Preprocessed data under the keys ``X_train``,
            ``y_train``, ``X_test`` and ``y_test``.
        model_id (str): Timestamp used in the names of the saved files.
        n_estimators (str): Number of trees in the forest; converted to int.
        max_depth (str): Maximum tree depth; converted to int.
    """
    # Both hyperparameters may arrive as strings; the estimator needs ints.
    tree_count = int(n_estimators)
    depth_limit = int(max_depth)

    print("\nTraining model...")
    start_time = time.time()

    X_train, y_train = datadict['X_train'], datadict['y_train']
    X_test, y_test = datadict['X_test'], datadict['y_test']

    model_rf = RandomForestClassifier(n_estimators=tree_count,
                                      max_depth=depth_limit,
                                      verbose=1,
                                      n_jobs=-1,
                                      bootstrap=False)

    print("\nFitting model...")
    print("Parameters used:", model_rf.get_params())
    model_rf.fit(X_train, y_train)

    print("\nPredicting results...")
    y_pred_rf = model_rf.predict(X_test)

    print("\nCalculating accuracy...")
    accuracy_df = get_accuracy_windows(1, y_test, y_pred_rf)
    accuracy = accuracy_score(y_test, y_pred_rf) * 100

    # Both output files share an "acc-<accuracy>" prefix.
    accuracy_tag = "acc-" + f"{accuracy:.2f}"

    # Save model
    print("\nSaving model...")
    model_file = accuracy_tag + "-model_" + model_id + ".pkl.z"
    model_path = os.path.join(paths.model_dir, model_file)
    print(f"Model saved in {model_path}")
    joblib.dump(model_rf, model_path)

    # Get model stats
    feature_importance_df = get_feature_importance(model_rf, model_id)
    classification_report_df = get_classification_report(y_test, y_pred_rf)
    params_df = get_params(model_rf)

    # Save stats to excel, one sheet per table
    print("\nSaving model stats...")
    stats_file = accuracy_tag + "-stats_" + model_id + ".xlsx"
    stats_path = os.path.join(paths.output_delivery_prediction_stats_dir,
                              stats_file)
    print(f"Stats saved in {stats_path}")
    with pd.ExcelWriter(stats_path) as writer:
        accuracy_df.to_excel(writer, sheet_name='Accuracy')
        feature_importance_df.to_excel(writer,
                                       sheet_name='Feature Importance')
        classification_report_df.to_excel(writer,
                                          sheet_name='Classification Report')
        params_df.to_excel(writer, sheet_name='Model Parameters')

    utilities.print_elapsed_time(start_time)
'max_features': 'auto', 'min_samples_split': 3, 'n_estimators': 27 } rfp2 = { 'bootstrap': False, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 3, 'n_estimators': 28 } best_rf = RandomForestClassifier(**rfp, random_state=5, class_weight="balanced_subsample") print(best_rf.get_params().keys(), **sp) best_rf.fit(X_train, y_train) fscore = best_rf.score(X_test, y_test) print(f'Accuracy score for RandomForest Classifier: {fscore:.02f}', **sp) # logistic regression logReg = LogisticRegression(multi_class='multinomial', solver='lbfgs') # Fit the classifier to the training data logReg.fit(X_train, y_train) # Predict the labels of the test set: y_pred y_pred = logReg.predict(X_test) lscore = logReg.score(X_test, y_test) print(f'Accuracy score for logistic Regression Classifier: {lscore:.02f}',
# Labels come from the first column, shifted down by one; the remaining
# column names and the distinct label values are kept for the report below.
y_train = df.iloc[:, 0].values - 1
f_names = df.columns[1:].values
t_names = df.iloc[:, 0].unique()

# Per-class sample counts (based on the target column)
print('\nTraining dataset shape: ', X_train.shape, ' Number of features: ', X_train.shape[1])
num_categories = np.unique(y_train).size
sum_y = np.asarray(np.unique(y_train.astype(int), return_counts=True))
df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
print('\n', df_sum_y)

# Initialise the classifier and fit it on the training set
clf = RandomForestClassifier(verbose=1, n_jobs=-1, random_state=args.randomseed, n_estimators=100).fit(X_train, y_train)
print('\nClassifier parameters:\n')
print(clf.get_params())

# Output the feature-importance scores
df_import = eli5.explain_weights_df(clf, target_names=t_names, feature_names=f_names)
df_import.to_csv('f_weight_output.csv', index=None)
print( "\nThe importance features have been saved to 'f_weight_output.csv'.")

end_time = time.time()  # program end time
print('\n[Finished in: {0:.6f} mins = {1:.6f} seconds]\n'.format( ((end_time - start_time) / 60), (end_time - start_time)))
import numpy as np import time from sklearn.svm import SVC from sklearn.metrics import f1_score, classification_report from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier data = np.loadtxt('./data/TrainSamples.csv', delimiter=",") label = np.loadtxt('./data/TrainLabels.csv', delimiter=",") test = np.loadtxt('./data/TestSamples1.csv', delimiter=',') testLabel = np.loadtxt('./data/TestLabels1.csv', delimiter=',') start = time.time() classifier = RandomForestClassifier() print classifier.get_params(deep=True) classifier.fit(data, label) predictions = classifier.predict(test) reportname = 'RandomForestClassifier.txt' report = open('./result/' + reportname, 'w') r = classification_report(testLabel, predictions) report.write(r) end = time.time() report.write('time{0}'.format(str(end - start))) report.close()
X_test=np.loadtxt("X_test.gz",delimiter=",") #################################################################################### #################################################################################### #################################################################################### #classifier RFmodel = RandomForestClassifier( n_estimators=10, #number of trees to generate max_features='auto', #consider sqrt of number of features when splitting n_jobs=1, #run in parallel on all cores criterion="entropy" ) #train RFmodel = RFmodel.fit(X_train, Y_train) #get parameters params=RFmodel.get_params() #score on training set acc_rate=RFmodel.score(X_train,Y_train) print acc_rate #feature importances feat_imp=RFmodel.feature_importances_ #predict probabilities test_probs=RFmodel.predict_proba(X_test) #output test set probabilities to csv file columns=['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT',
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler

rfc = RandomForestClassifier()
# The returned parameter dict is discarded here.
rfc.get_params()

# Read the train & test features and labels from files and convert them to numpy arrays
X_train_df = pd.read_csv('X_train_header.txt')
y_train_df = pd.read_csv('y_train_header.txt')
X_test_df = pd.read_csv('X_test_header.txt')
y_test_df = pd.read_csv('y_test_header.txt');

n_train_samples = X_train_df.shape[0]
n_test_samples = X_test_df.shape[0]
n_features = X_train_df.shape[1]

# Features become (n_samples, n_features) arrays; labels become 1-D arrays.
X_train = np.array(X_train_df).reshape((n_train_samples,n_features))
y_train = np.array(y_train_df).reshape(n_train_samples,)
X_test = np.array(X_test_df).reshape((n_test_samples,n_features))
y_test = np.array(y_test_df).reshape(n_test_samples,)
nthread = 4, min_child_weight = 1, subsample= 0.8, seed = 1337, objective= 'multi:softprob', max_depth = 7, gamma= .2) # use the xgb interface xgb_param = clf.get_xgb_params() xgb_param['num_class'] = 5 xgb_param['eval_metric'] = 'mlogloss' Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan) cvresult = xgb.cv(xgb_param, Xg_train, num_boost_round = clf.get_params()['n_estimators'], nfold = 5, show_progress = True, early_stopping_rounds = 100) clf.set_params(n_estimators=cvresult.shape[0]) clf.fit(X_train, y_train) best_outcome_params = clf.get_params() best_outcome_score = cvresult.min() try: # predict the outcome probabilities y_pred = grid.predict_proba(X_test) except: # predict the outcome probabilities y_pred = clf.predict_proba(X_test)
def _str2bool(value):
    """Convert a command-line value to bool, treating 'false'-like text as False.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty text - including ``"False"`` - becomes ``True``.  This converter
    maps the usual negative spellings to ``False`` and otherwise keeps the
    previous truthiness rule.
    """
    if isinstance(value, bool):
        return value
    if value.strip().lower() in ('false', 'f', 'no', 'n', '0', 'off'):
        return False
    return bool(value)


def main():
    """Train and evaluate a random forest on the configured data set.

    Parses the command-line flags, then for every split (``--split_count``)
    loads the data, fits a 50-tree forest on the flattened training samples,
    scores it on the validation samples and reports the metrics through
    ``print_output``.  Hyper-parameter optimisation and full-scene
    classification run when their flags are set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--path', nargs='?', const=True, type=str,
        default='C:/Users/AliGökalp/Documents/phd/data/2013_DFTC/2013_DFTC',
        help='Input data path')
    parser.add_argument(
        '--loader_name', nargs='?', const=True, type=str,
        default='GRSS2013DataLoader',
        help='Data set loader name, Values : GRSS2013DataLoader')
    parser.add_argument('--neighborhood', nargs='?', type=int, default=5,
                        help='Neighborhood for data extraction')
    # _str2bool instead of bool: "--hyperparamopt False" must mean False.
    parser.add_argument('--hyperparamopt', nargs='?', const=True,
                        type=_str2bool, default=False,
                        help='If true, performs hyper parameter optimization.')
    parser.add_argument('--fullscene', nargs='?', const=True,
                        type=_str2bool, default=False,
                        help='If true, performs full scene classification.')
    parser.add_argument('--batch_size', nargs='?', type=int, default=20,
                        help='Batch size')
    parser.add_argument('--split_count', nargs='?', type=int, default=1,
                        help='Split count')
    parser.add_argument('--base_log_path', nargs='?', const=True, type=str,
                        default=os.path.dirname(__file__),
                        help='Base path for saving logs')
    flags, unparsed = parser.parse_known_args()

    loader_name = flags.loader_name
    data_path = flags.path
    neighborhood = flags.neighborhood

    for run_index in range(flags.split_count):
        print('Starting episode#%d' % run_index)
        data_importer = InMemoryImporter.InMemoryImporter()
        training_data_with_labels, test_data_with_labels, validation_data_with_labels, shadow_dict, class_range, scene_shape, color_list = \
            data_importer.read_data_set(loader_name=loader_name,
                                        path=data_path,
                                        test_data_ratio=0,
                                        neighborhood=neighborhood,
                                        normalize=False)

        flattened_training_data = flatten_data(training_data_with_labels.data)
        flattened_validation_data = flatten_data(
            validation_data_with_labels.data)

        start_time = time.time()
        estimator = RandomForestClassifier(n_estimators=50, n_jobs=8,
                                           max_features=int(2 * sqrt(144)),
                                           verbose=False)
        # Alternative estimators kept from the original:
        # estimator = ExtraTreesClassifier(n_estimators=10000, n_jobs=8, verbose=1)
        # estimator = SVC(kernel='poly', degree=1, cache_size=200, verbose=True)  # GRSS2013
        # estimator = SVC(kernel='rbf', gamma=1e-09, C=10000, cache_size=200)  # GRSS2013
        # estimator = SVC(kernel='rbf', gamma=1e-06, C=1000000, cache_size=1000, verbose=True)  # GULFPORT
        estimator.fit(flattened_training_data,
                      training_data_with_labels.labels)
        print('Completed training(%.3f sec)' % (time.time() - start_time))

        predicted_validation_data = estimator.predict(
            flattened_validation_data)

        overall_accuracy = accuracy_score(validation_data_with_labels.labels,
                                          predicted_validation_data)
        average_accuracy = balanced_accuracy_score(
            validation_data_with_labels.labels, predicted_validation_data)
        kappa = cohen_kappa_score(validation_data_with_labels.labels,
                                  predicted_validation_data)
        conf_matrix = confusion_matrix(validation_data_with_labels.labels,
                                       predicted_validation_data)
        print_output(estimator.get_params(), average_accuracy, conf_matrix,
                     kappa, overall_accuracy, run_index, loader_name,
                     flags.base_log_path)

        # NOTE(review): the original indentation was lost; these two steps
        # are assumed to run once per episode - confirm against the source.
        if flags.hyperparamopt:
            perform_hyperparamopt(flattened_training_data,
                                  training_data_with_labels)

        if flags.fullscene:
            perform_full_scene_classification(data_path, loader_name,
                                              neighborhood, estimator,
                                              flags.batch_size)
print('Accuracy of Extratrees classifier on test set: %0.04f' % (score_ABC)) # Accuracy of Extratrees classifier on test set: 0.8224 #****************************************************************************** #****************************************************************************** # *** Applying Machine Learning Technique #7 *** from sklearn.ensemble import RandomForestClassifier Rando = RandomForestClassifier(n_estimators=5) from pprint import pprint # Look at parameters used by our current forest print('Parameters currently in use:\n') pprint(Rando.get_params()) classifier = Rando.fit(X_train, y_train) score_RFC = Rando.score(X_test, y_test) print('Accuracy of Extratrees classifier on test set: %0.04f' % (score_RFC)) # Accuracy of Extratrees classifier on test set: 0.8137 #****************************************************************************** # HYPERPARAMETER OPTIMIZATION --> GRID SEARCH <-- from sklearn.model_selection import GridSearchCV # parameters for GridSearchCV param_grid = {
labels_test = pickle.load(data)

print(features_train.shape)
print(features_test.shape)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit

# Baseline forest, used here only to display its parameters.
rf_0 = RandomForestClassifier(random_state=8)

print('Parameters currently in use:\n')
pprint(rf_0.get_params())

# Adjust the tuning parameters
# n_estimators: five evenly spaced values from 200 to 1000
n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=5)]

# max_features
max_features = ['auto', 'sqrt']

# max_depth: five evenly spaced values from 20 to 100, plus None
max_depth = [int(x) for x in np.linspace(20, 100, num=5)]
max_depth.append(None)

# min_samples_split
min_samples_split = [2, 5, 10]

# min_samples_leaf
min_samples_leaf = [1, 2, 4]

# bootstrap
bootstrap = [True, False]
] #Defining features and prediction target X = df[features] y = df.Survived #Select random forest for classifier model. survival_model_forest = RandomForestClassifier(random_state=1) #fit model. survival_model_forest.fit(X, y) ####RANDOM GRID SEARCH #Look at parameters used by our current forest print('Parameters currently in use:\n') pprint(survival_model_forest.get_params()) # Number of trees in random forest n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)] # Number of features to consider at every split max_features = ['auto', 'sqrt'] # Maximum number of levels in tree max_depth = [int(x) for x in np.linspace(10, 110, num=11)] max_depth.append(None) # Minimum number of samples required to split a node min_samples_split = [2, 5, 10] # Minimum number of samples required at each leaf node min_samples_leaf = [1, 2, 4] # Method of selecting samples for training each tree bootstrap = [True, False] # Create the random grid
trainingSet = np.vstack((trainingSetEllipticals, trainingSetSpirals)) #using only elliptical and spiral for training np.random.shuffle(trainingSet) trainingSetLabels = trainingSet[:,12] #putting labels in separate array trainingSetLabels[trainingSetLabels == 0] = -1 #replacing all 0 with -1 to match sklearn format trainingSet = trainingSet[:, 1:11] #removing label cols from actual inputs trainingSet, testingSet, trainingSetLabels, testingSetLabels = train_test_split(trainingSet, trainingSetLabels, test_size = 0.6, random_state = 0) #fixes random_state so results reproducible startTime = time.time() print "Time before training = ", startTime clf = RandomForestClassifier() #No max depth initial, tweak as necessary later clf = clf.fit(trainingSet, trainingSetLabels) print "Params after training:" print clf.get_params() trainingAccuracy = clf.score(trainingSet, trainingSetLabels) print "Training accuracy = ", trainingAccuracy testingAccuracy = clf.score(testingSet, testingSetLabels) print "Testing accuracy = ", testingAccuracy print "Done training and testing! Time = ", time.time() - startTime, "seconds"
from sklearn.ensemble import RandomForestClassifier

# train the model
wqp_rf = RandomForestClassifier()
wqp_rf.fit(wqp_train_SX, wqp_train_y)

# predict and evaluate performance
wqp_rf_predictions = wqp_rf.predict(wqp_test_SX)
meu.display_model_performance_metrics(true_labels=wqp_test_y,
                                      predicted_labels=wqp_rf_predictions,
                                      classes=wqp_label_names)

# ## Hyperparameter tuning with Grid Search & Cross Validation

# In[23]:
print(wqp_rf.get_params())

# ### Get the best hyperparameter values

# In[24]:
from sklearn.model_selection import GridSearchCV

# 'sqrt' replaces the legacy 'auto' value: for a classifier they select the
# same number of features, and 'auto' is rejected by scikit-learn >= 1.3.
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['sqrt', None, 'log2'],
}
wqp_clf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid,
                       cv=5, scoring='accuracy')
# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

stop_words = ['in', 'of', 'at', 'a', 'the']
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words=stop_words)
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

# classifier find c
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# First 12500 samples are labelled 1, the remaining 12500 are labelled 0.
# The same label vector is used for the training and the test reviews.
target = [1 if i < 12500 else 0 for i in range(25000)]

final_model = RandomForestClassifier()
final_model.fit(X, target)
y_pred = final_model.predict(X_test)
# Despite the name this holds the estimator's hyper-parameters, not
# coefficients (a random forest has none).
model_coef = final_model.get_params()
print("Final Accuracy: %s" % accuracy_score(target, y_pred))

from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred).  The arguments used to be
# swapped, which transposed the matrix and exchanged fp and fn.
tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel()
print(str((tn, fp, fn, tp)))
# Extremely randomized trees.
clf_etree = ExtraTreesClassifier(n_estimators=1000, max_depth=None,
                                 max_features=int(math.sqrt(n_features)),
                                 min_samples_split=100, random_state=144,
                                 n_jobs=4)
clf_etree.fit(X_train, y_train)
# Single-argument print(...) produces the same text on Python 2 and 3.
print("Validation set score: ERF  %s" % clf_etree.score(X_val, y_val))

# AdaBoost over decision stumps.
clf_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                               algorithm="SAMME", n_estimators=500,
                               random_state=74494, learning_rate=0.8)
clf_boost.fit(X_train, y_train)
print("Validation set score: ABOOST  %s" % clf_boost.score(X_val, y_val))

#clf_gboost = GradientBoostingClassifier(n_estimators=int(reg), random_state=74494, learning_rate=0.2)
#clf_gboost.fit(X_train, y_train)
#print "Validation set score:LR " , clf_gboost.score(X_val, y_val)

print("Classifier:")
print("%s %s" % (clf, clf.get_params()))
print("%s %s" % (clf_etree, clf_etree.get_params()))
print("%s %s" % (clf_boost, clf_boost.get_params()))

# NOTE(review): the original indentation was lost; everything below is
# assumed to belong to the `fe == 1` branch -- confirm against the source.
if fe == 1:
    # L1 norm based feature elimination: keep only the columns whose
    # logistic-regression coefficient is non-zero.
    clf_fe = LogisticRegression(C=1000, penalty='l1', random_state=0)
    clf_fe.fit(X_train, y_train)
    X_train = X_train[:, clf_fe.coef_.ravel() != 0]
    print("Xtrain.shape:  %s" % (X_train.shape,))
    X_val = X_val[:, clf_fe.coef_.ravel() != 0]

    clf2_l = svm.SVC(kernel='linear', C=reg)
    clf2_l.fit(X_train, y_train)
    print("Lasso Validation set score filtered coeff linear:  %s"
          % clf2_l.score(X_val, y_val))

    clf2 = svm.SVC(kernel='rbf', C=reg, gamma=g)
# 5-fold cross-validated accuracy of k-NN for k = 1..25.
cross_validation_accuracy_knn = []
for i in range(1, 26):
    clf_knn = KNeighborsClassifier(n_neighbors=i)
    scores = cross_val_score(clf_knn, X, y, cv=5)
    mean_score = scores.mean()
    print(f"Average Accuracy Score when neighbours are {i} is: \t", mean_score)
    cross_validation_accuracy_knn.append(mean_score)

# Accuracy (in percent) against the number of neighbours, with each point
# annotated by its value.
plt.figure(figsize=(20, 10))
plt.plot(list(range(1, 26)),
         [acc * 100.0 for acc in cross_validation_accuracy_knn])
for i, acc in enumerate(cross_validation_accuracy_knn, start=1):
    plt.text(i, acc * 100 + 0.2, s=f'{acc * 100:.3f}%')

from sklearn.ensemble import RandomForestClassifier

# Small random forest with a fixed seed.
clf_rfc = RandomForestClassifier(n_estimators=10, random_state=42)
clf_rfc.get_params()
clf_rfc.fit(X_train, y_train)
predictions_rfc = clf_rfc.predict(X_test)
accuracy_score(y_test, predictions_rfc)

# Confusion matrix of the forest as an annotated heat map.
class_labels = ['No Disease', 'Disease']
plt.figure(figsize=(7, 7))
sns.heatmap(confusion_matrix(y_test, predictions_rfc), annot=True,
            cmap="Blues", square=True,
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Actual", fontsize=15)
#for train,test in kf:
for _ in range(1):
    #X_learn, X_valid, y_learn, y_valid = X.iloc[train], X.iloc[test], \
    #                                     y.iloc[train], y.iloc[test]
    #y_valid = pd.DataFrame({'country': y_valid})
    #y_test = pd.DataFrame({'country': y_test})

    """ RANDOM FOREST """
    # Template estimator; `classif` is a fresh, unfitted forest built from
    # the template's parameters.
    rf_settings = dict(n_estimators=300,
                       criterion='entropy',
                       random_state=0,
                       min_samples_split=1000,
                       max_depth=10,
                       min_samples_leaf=100,
                       n_jobs=-1)
    classif_base = RandomForestClassifier(**rf_settings)
    classif = RandomForestClassifier(**classif_base.get_params())

    """ GRADIENT BOOSTING """
    #classif_base = GradientBoostingClassifier(loss='deviance',
    #                                          learning_rate=0.25,
    #                                          n_estimators=20,
    #                                          max_depth=5,
    #                                          min_samples_split=50,
    #                                          min_samples_leaf=100,
    #                                          random_state=0,
    #                                          verbose=True)
    #classif = GradientBoostingClassifier(**classif_base.get_params())

    """ XGBOOST """
    # Wrap the learning and validation sets for xgboost.
    xg_train = xgb.DMatrix(X_learn, label=y_learn)
    xg_valid = xgb.DMatrix(X_valid, label=y_valid)
# Rebalance the training set with SMOTE + Tomek links.
# The sampler is no longer called `os` (that shadowed the stdlib module).
# `fit_resample` / `sampling_strategy` are the imbalanced-learn >= 0.4 API;
# `fit_sample` and the positional constructor argument were removed later.
resampler = SMOTETomek(sampling_strategy=1)
X_train_os, y_train_os = resampler.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_os)))
X_train = X_train_os
y_train = y_train_os

from sklearn.ensemble import RandomForestClassifier
# import model_selection
rf = RandomForestClassifier()

from pprint import pprint
print('Parameters currently in use:\n')
pprint(rf.get_params())

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=20)]
# Number of features to consider at every split.  For a classifier the
# legacy value 'auto' meant exactly 'sqrt', so listing both only duplicated
# candidates; 'auto' is also rejected by scikit-learn >= 1.3.
max_features = ['sqrt']
# Maximum number of levels in tree (None = unlimited)
max_depth = list(range(100, 2000, 2))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
class rfClf(BaseModel):
    """Model using random forest classifier."""

    def __init__(self, train_data_fname=None, nrows=None, **kwargs):
        """Initialize the data frame (delegates to the base class)."""
        super(rfClf, self).__init__(train_data_fname, nrows, **kwargs)

    def set_model(self, **kwargs):
        """Create the RandomForestClassifier stored in ``self.learner``.

        Recognised keyword arguments and their defaults: ``verbose`` (0),
        ``n_estimators`` (200), ``max_depth`` (None), ``bootstrap`` (True),
        ``min_samples_leaf`` (1), ``min_samples_split`` (2),
        ``max_features`` ("auto"), ``class_weight`` ("auto"), ``n_jobs`` (1),
        ``criterion`` ('entropy') and ``random_state`` (24).

        The resulting parameters are printed, one per line.
        """
        verbose = kwargs.get('verbose', 0)
        n_estimators = kwargs.get('n_estimators', 200)
        max_depth = kwargs.get('max_depth', None)
        bootstrap = kwargs.get('bootstrap', True)
        min_samples_leaf = kwargs.get('min_samples_leaf', 1)
        min_samples_split = kwargs.get('min_samples_split', 2)
        max_features = kwargs.get('max_features', "auto")
        class_weight = kwargs.get('class_weight', "auto")
        n_jobs = kwargs.get('n_jobs', 1)
        criterion = kwargs.get('criterion', 'entropy')
        random_state = kwargs.get('random_state', 24)

        self.learner = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            bootstrap=bootstrap,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            max_features=max_features,
            n_jobs=n_jobs,
            verbose=verbose,
            criterion=criterion,
            class_weight=class_weight,
            random_state=random_state)

        print('\n\nRandom forest set with parameters:')
        par_dict = self.learner.get_params()
        for ipar in par_dict.keys():
            print('{}: {}'.format(ipar, par_dict[ipar]))
        print('\n\n')

    def fitNscore(self, **kwargs):
        """Fit classifier and produce score and related plots.

        Keyword arguments read here: ``features`` (names of the columns to
        fit on) and ``bids_path`` (default 'data/bids.csv').  All keyword
        arguments are also forwarded to ``prepare_data`` and ``fitModel``.

        A single stratified 80/20 split of ``self.df_train`` is used: the
        model is fitted on the larger part and scored on the rest.  Feature
        importances, a normalized confusion matrix and a ROC curve are
        plotted, then the method blocks until the user presses enter.
        """
        col2fit = kwargs.get('features')

        # cleaning
        bids_path = kwargs.get('bids_path', 'data/bids.csv')
        # NOTE(review): indentation was lost in the source; only the two
        # statements below are assumed to be guarded by this check.
        if not self.iscleaned:
            print('Preparing the data...')
            self.prepare_data(bids_path, **kwargs)

        print('columns for fit=\n{}'.format(self.df_train.columns))

        test_size = 0.2  # fraction kept for testing
        rnd_seed = 24  # for reproducibility
        #features_train, features_test, target_train, target_test =\
        #    train_test_split(self.df_train[col2fit].values,
        #                     self.df_train['outcome'].values,
        #                     test_size=test_size,
        #                     random_state=rnd_seed)

        # Old-style StratifiedShuffleSplit: built from the labels and
        # iterated directly.  With n_iter=1 the loop body runs once.
        sss = StratifiedShuffleSplit(self.df_train['outcome'].values,
                                     n_iter=1,
                                     test_size=test_size,
                                     random_state=rnd_seed)
        for train_index, test_index in sss:
            features_train = self.df_train[col2fit].values[train_index]
            features_test = self.df_train[col2fit].values[test_index]
            target_train = self.df_train['outcome'].values[train_index]
            target_test = self.df_train['outcome'].values[test_index]

        # Fit Classifier
        self.fitModel(features_train, target_train, **kwargs)

        # Predict on the rest of the sample
        print('\nPredicting...')
        predictions = self.learner.predict(features_test)
        probas = self.learner.predict_proba(features_test)

        # Feature index ordered by importance
        ord_idx = np.argsort(self.learner.feature_importances_)
        print("Feature ranking:")
        for ifeaturindex in ord_idx[::-1]:
            print('{0} \t: {1}'.format(
                col2fit[ifeaturindex],
                round(self.learner.feature_importances_[ifeaturindex], 2)))

        # Score
        print('Score={}'.format(self.learner.score(features_test, target_test)))

        # Plots
        # Feature importances
        maxfeat2show = 30  # number of features to show in plots
        importances = self.learner.feature_importances_
        # Spread of each feature's importance across the individual trees.
        std = np.std([tree.feature_importances_
                      for tree in self.learner.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]
        indices = indices[:min(maxfeat2show, len(indices))]  # truncate if > maxfeat2show
        ordered_names = [col2fit[i] for i in indices]

        fig_import = plt.figure(figsize=(10, 10))
        plt.title("Feature importances, RF")
        plt.barh(range(len(indices)), importances[indices], color="b",
                 xerr=std[indices], align="center", ecolor='r')
        plt.yticks(range(len(indices)), ordered_names)
        plt.ylim([-1, len(indices)])
        # Flip the y axis so the most important feature is at the top.
        plt.ylim(plt.ylim()[::-1])
        plt.subplots_adjust(left=0.22)
        fig_import.show()

        # confusion matrix, row-normalized and clipped to [0, 0.5] for display
        cm = confusion_matrix(target_test.astype(int), predictions.astype(int))
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm_normalized = np.clip(cm_normalized, 0.0, 0.5)
        fig_cm = plt.figure()
        ax_cm = fig_cm.add_subplot(1, 1, 1)
        im_cm = ax_cm.imshow(cm_normalized, interpolation='nearest')
        plt.title('Normalized confusion mtx, RF')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        fig_cm.colorbar(im_cm)
        fig_cm.show()

        # ROC curve
        # This ones seems to reflect better the LB score
        #false_pos, true_pos, thr = roc_curve(target_test, predictions)
        false_pos, true_pos, thr = roc_curve(target_test, probas[:, 1])
        fig_roc = plt.figure()
        plt.plot(false_pos, true_pos,
                 label='ROC curve (area = %0.2f)' % auc(false_pos, true_pos))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")
        fig_roc.show()

        # Keep the figures open until the user confirms.  raw_input only
        # exists on Python 2; fall back to input on Python 3.
        try:
            wait_for_enter = raw_input
        except NameError:
            wait_for_enter = input
        wait_for_enter('press enter when finished...')
print("RandomForestClassifier, Cross_val_score=", results.mean())

# results sounds great but if I add max_depth=5 I have many missclassified points

# How to Visualize a Decision Tree from a Random Forest in Python using Scikit-Learn
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydotplus
from IPython.display import Image

# A small, shallow forest so that a single tree stays readable.
model = RandomForestClassifier(n_estimators=10, max_depth=5)
pprint(model.get_params())

# Train
model.fit(X_train, y_train)

# Extract single tree (any index below n_estimators works)
estimator = model.estimators_[3]

# Create DOT data
graphviz_options = dict(out_file=None,
                        feature_names=filter_col,
                        class_names=['Yes', 'No'],
                        rounded=True,
                        proportion=False,
                        precision=2,
                        filled=True)
dot_data = tree.export_graphviz(estimator, **graphviz_options)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
features_list = df.columns.values[1::]

# Fit a random forest with (mostly) default parameters to determine feature importance
forest = RandomForestClassifier(oob_score=True, n_estimators=10000)
forest.fit(X, y)
feature_importance = forest.feature_importances_

# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# Get the indexes of all features with a non-zero importance (no other
# threshold is applied here).
important_idx = np.where(feature_importance)[0]

# Get the sorted indexes of important features (descending importance)
sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
# Single-argument print(...) produces the same text on Python 2 and 3.
print("\nFeatures sorted by importance (DESC):\n%s" % (important_idx[sorted_idx],))

# Adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[important_idx][sorted_idx[::-1]], align='center')
# The labels must use the same reversed order as the bars.  The previous
# slice `[:-1]` dropped the last entry instead of reversing, leaving one
# label fewer than there are bars and mislabelling the rest.
plt.yticks(pos, important_idx[sorted_idx[::-1]])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

sorted_idx
feature_importance
forest.get_params()
# NOTE(review): in a regex, `[0, 7.896]` is a character class, not literal
# brackets -- confirm the Fare_* alternatives match the intended columns.
df.filter(regex='Survived|Age_sc|SibSp|Parch|Fare_[0, 7.896]|Fare_[7.896, 14.454]|Fare_[14.454, 31.275]|Fare_[31.275, 512.329]|Sex|Pclass|Child|FamilySize|Family|Title_id')
# Collector for run metadata and results.
output = OutputClassification("ECG Classification", "random_forest",
                              output_dir="./output",
                              file_dir="./output/images/",
                              random_state=RANDOM_STATE,
                              test_size=TEST_SIZE)
output.add_info("n_processors", 12)
output.add_info("n_folds", n_folds)

# Load the data; the "abnormal" column is the target.
csv = pd.read_csv("ecg.csv", index_col=None)
X, y = csv.drop("abnormal", axis=1), csv["abnormal"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE, test_size=TEST_SIZE)

train_start = time.time()
clf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)
output.add_model_parameter("basis", clf.get_params(deep=True))

# Without shuffle=True the folds are deterministic, so random_state had no
# effect; scikit-learn >= 0.24 raises a ValueError when it is passed anyway.
# Dropping it keeps exactly the same folds.
cv = StratifiedKFold(n_splits=n_folds)

# Per-fold true labels, predicted labels and class probabilities.
probas = []
y_tests = []
y_preds = []
for train, test in cv.split(X, y):
    y_tests.append(y.iloc[test])
    # The same estimator object is refitted on every fold.
    model = clf.fit(X.iloc[train], y.iloc[train])
    proba = model.predict_proba(X.iloc[test])
    y_pred = predict_from_proba(model, proba)
    y_preds.append(y_pred)
    probas.append(proba)
logging_level="Verbose", metric_period=100) # from catboost import cv as catcv # catpool = Pool(X_train,y_train,cat_features=categorical_features_pos) # cv_data = catcv(catpool,model.get_params(),fold_count=2) # best_cat_iterations = cv_data['test-Accuracy-mean'].idxmax() # print("Best Iteration: ",best_cat_iterations) # print("Best Score: ", cv_data['test-Accuracy-mean'][best_cat_iterations]) model = CatBoostClassifier(eval_metric='Accuracy', iterations=500, scale_pos_weight=imbalance_weight, random_seed=42, logging_level="Verbose", metric_period=100) model.fit(X, y, cat_features=categorical_features_pos) model.get_params() # cat_cv_std = cv_data.loc[cv_data['test-Accuracy-mean'].idxmax(),["train-Accuracy-mean","train-Accuracy-std"]] # print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (cat_cv_std[0],cat_cv_std[1])) # results = results.append({'Model': "Catboost",'Para': model.get_params(),'Test_Score': None, # 'CV Mean':cat_cv_std[0], 'CV STDEV': cat_cv_std[1]}, ignore_index=True) # catprobpred = model.predict_proba(test_df)[:,1] # catpred = model.predict(test_df).astype(np.int) # submission = pd.DataFrame({'PassengerId':test_df.index,'Survived':catpred}) # submission.to_csv('catboost.csv',index=False) lgtrain = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features) lgvalid = lgb.Dataset(X_test, y_test, categorical_feature=categorical_features)
# Extract the OOB accuracy from bc
oob_accuracy = bc.oob_score_
print('Test set accuracy of bc: {:.2f}'.format(acc_test))
print('OOB accuracy of bc: {:.2f}'.format(oob_accuracy))

# =========== RANDOM FOREST CLASSIFIER ==========#
# Instantiate rf
rf = RandomForestClassifier(criterion='gini', random_state=2)
# Fit rf to the training set
rf.fit(X_train, y_train)
rf.get_params()

# =========== RANDOM SEARCH ==========#
# criterion for information gain
criterion = ['gini', 'entropy']
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.  For a classifier the
# legacy value 'auto' meant exactly 'sqrt', so listing both only duplicated
# candidates; 'auto' is also rejected by scikit-learn >= 1.3.
max_features = ['sqrt']
# Maximum number of levels in tree (None = unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
lor_re = best_lor.predict_proba(x_test_s)

## Loss function value (binary log loss of the logistic regression)
import math

z = 0
# Pair labels and probability rows by position.  Indexing `y_test[i]` is a
# label lookup on a pandas Series and fails (or picks the wrong row) when
# the index is not 0..n-1, e.g. after a shuffled train/test split.
for truth, proba_row in zip(y_test, lor_re):
    z = z + truth * math.log(proba_row[1]) + (1 - truth) * math.log(proba_row[0])
log_loss_lr = -(z / len(y_test))
print("The log loss for logistic regression is ", log_loss_lr)

## Random forest classification
### Three random forest parameters are tuned here.  Since this model is
### designed to avoid overfitting, accuracy vs. complexity parameters are
### not considered.
from sklearn.ensemble import RandomForestClassifier

r_f = RandomForestClassifier(random_state=1)
r_f.get_params()

n_columns = len(x_train.columns)
param = {
    'n_estimators': [500, 1000, 1300],
    # 'auto' was dropped: for a classifier it is identical to 'sqrt' (so it
    # only duplicated candidates) and scikit-learn >= 1.3 rejects it.
    'max_features': ['sqrt', 'log2'],
    # Depth limits proportional to the number of feature columns.
    'max_depth': [round(n_columns / 6), round(n_columns / 3),
                  round(n_columns / 2), round(n_columns / 1)],
}
grid_search_rf = GridSearchCV(r_f, param_grid=param, cv=5, n_jobs=-1)
grid_search_rf.fit(x_train, y_train)
grid_search_rf.best_params_

best_rf = grid_search_rf.best_estimator_
rf_score = best_rf.score(x_test, y_test)
rf_pre = best_rf.predict_proba(x_test)
rf_pre = pd.DataFrame(rf_pre)
### Replace exact zeros in column 0 with a tiny value so the log loss can be calculated
rf_pre.loc[rf_pre[0] == 0, [0]] = 0.000000000001
## loss function value
class Trainer():
    """Load loan data, build train/cv/test sets and fit/score models.

    Regressors are stored in ``self.regr`` and classifiers in ``self.clf``;
    most methods take ``kind`` ("regression" or "classification") to select
    which one to use.  All messages use single-argument ``print(...)`` so
    the output is identical on Python 2 and Python 3.
    """

    def __init__(self):
        """Connect to the MySQL database and load the pickled loan data."""
        with open('credentials.json') as credentials_file:
            credentials = json.load(credentials_file)
        passwd = credentials['mysql']['password']
        self.con = mdb.connect(host='127.0.0.1', port=3306, user='******',
                               passwd=passwd, db='insight', autocommit=True)
        self.cur = self.con.cursor()
        print("Connected to database")
        self.load_data()

    def load_data(self):
        """Read the pickled dump into ``self.loanData`` as a DataFrame."""
        # NOTE: pickle must only be used on trusted files.
        with open('./pickles/mysql_dump.pickle', 'rb') as f:
            self.loanData = pickle.load(f)
        self.loanData = pd.DataFrame(self.loanData)

    def drop_na(self):
        """Drop rows containing missing values and renumber the index."""
        # Was `loanData.dropna()`, which raised NameError: the frame lives
        # on the instance.
        self.loanData = self.loanData.dropna()
        self.loanData = self.loanData.reset_index(drop=True)

    def drop_columns(self):
        """Drop the columns with malformed data in the mysql db."""
        self.loanData = self.loanData.drop(
            ['none', 'educational', 'IA', 'IDAHO', 'ME', 'NE',
             'other_housing', 'issue_year'], axis=1)

    def drop_prepaid_loans(self):
        """Drop loans with loan_status 1 and days_to_zero_dollars < 1000."""
        prepaid = ((self.loanData['loan_status'] == 1)
                   & (self.loanData['days_to_zero_dollars'] < 1000))
        indices_to_drop = self.loanData.index[prepaid]
        self.loanData = self.loanData.drop(indices_to_drop, axis=0)
        print("Number of prepaid loans:  %s" % len(indices_to_drop))
        print("Number of loans after dropping prepaids:  %s" % len(self.loanData))

    def define_features_targets(self, kind="regression"):
        """Split the loans into train/test frames and pick the targets.

        The test loans are those whose id is listed in the ``test_loans``
        table.  ``kind`` selects the target column: 'days_to_zero_dollars'
        for "regression", 'loan_status' for "classification".
        """
        #take out 1000 random loans with 36 month terms for testing
        #ids are already populated in test_loans for consistency
        test_ids = []
        sql_query = "select id from test_loans;"
        self.cur.execute(sql_query)
        sql_resp = self.cur.fetchall()
        print("length of sql response:  %s" % len(sql_resp))
        for val in sql_resp:
            test_ids.append(val[0])
        print("length of test_ids:  %s" % len(test_ids))

        #make the test and train data frames
        in_test = self.loanData['id'].isin(test_ids)
        self.testLoanData = self.loanData[in_test]
        self.trainLoanData = self.loanData[~in_test]
        self.testLoanData.index = range(len(self.testLoanData))
        self.trainLoanData.index = range(len(self.trainLoanData))
        print("Train Loan Data:  %s" % len(self.trainLoanData))
        print("Test Loan Data:  %s" % len(self.testLoanData))

        self.features = self.trainLoanData.drop(
            ['loan_status', 'days_to_zero_dollars', 'id'], axis=1)
        self.features = self.features.values

        #choose different target variables for regression vs classification
        if kind == "regression":
            self.targets = self.trainLoanData['days_to_zero_dollars'].values
            self.y_test = self.testLoanData['days_to_zero_dollars'].values
        elif kind == "classification":
            self.targets = self.trainLoanData['loan_status'].values
            self.y_test = self.testLoanData['loan_status'].values

    def preprocess(self):
        """Split off a 10% cv set and standardize/scale all feature sets."""
        (self.X_train, self.X_cv, self.y_train, self.y_cv) = dm.split_train_test(
            features=self.features, targets=self.targets, test_size=0.1)
        self.X_test = self.testLoanData.drop(
            ['loan_status', 'days_to_zero_dollars', 'id'], axis=1).values
        (self.X_train, self.X_cv) = dm.standardize_samples(self.X_train, self.X_cv)
        (self.X_train, self.X_cv) = dm.scale_samples_to_range(self.X_train, self.X_cv)
        # NOTE(review): the test set is passed as both arguments here rather
        # than together with the training set -- confirm this is intended.
        (self.X_test, _) = dm.standardize_samples(self.X_test, self.X_test)
        (self.X_test, _) = dm.scale_samples_to_range(self.X_test, self.X_test)

    def define_dummy_classifier(self):
        """Use a DummyClassifier as ``self.clf``."""
        self.clf = DummyClassifier()

    def define_rfr(self, n_estimators=10):
        """Use a random forest regressor (with OOB score) as ``self.regr``."""
        self.regr = RandomForestRegressor(n_estimators=n_estimators, oob_score=True)
        print(self.regr.get_params())

    def define_linear_regressor(self):
        """Use a LinearRegression as ``self.regr``."""
        self.regr = LinearRegression()
        print(self.regr.get_params())

    def define_SVR(self, C=1, gamma=0.1):
        """Use a verbose SVR with the given C and gamma as ``self.regr``."""
        self.regr = SVR(C=C, gamma=gamma, verbose=3)
        print(self.regr.get_params())

    def define_logistic_regressor(self, penalty="l2", C=1.0, class_weight=None):
        """Use a LogisticRegression as ``self.clf``."""
        self.clf = LogisticRegression(penalty=penalty, C=C, class_weight=class_weight)
        print(self.clf.get_params())

    def define_rfc(self, n_estimators=10):
        """Use a random forest classifier (with OOB score) as ``self.clf``."""
        self.clf = RandomForestClassifier(n_estimators=n_estimators, oob_score=True)
        print(self.clf.get_params())

    def train(self, kind="regression"):
        """Fit the regressor or classifier on the training set."""
        print("Fitting training data")
        if kind == "regression":
            self.regr.fit(self.X_train, self.y_train)
        elif kind == "classification":
            self.clf.fit(self.X_train, self.y_train)

    def predict(self, X, kind="regression"):
        """Store the model's prediction for ``X`` in ``self.prediction``."""
        if kind == "regression":
            self.prediction = self.regr.predict(X)
        elif kind == "classification":
            self.prediction = self.clf.predict(X)

    def score(self, X, y, kind="regression"):
        """Print the model score on ``(X, y)``.

        For classification this also prints a classification report and the
        per-class precision; both use ``self.prediction``, so ``predict``
        must have been called on the same ``X`` first.
        """
        if kind == "regression":
            score_val = self.regr.score(X, y)
            print("R2 Score:  %s" % score_val)
        elif kind == "classification":
            score_val = self.clf.score(X, y)
            print("Accuracy:  %s" % score_val)
            print(classification_report(y, self.prediction))
            self.precision = precision_score(y, self.prediction,
                                             labels=[0, 1, 2], average=None)
            print("\n\nPrecision Score:  %s \n\n" % (self.precision,))
            self.accuracy = accuracy_score(y, self.prediction)

    def test(self, kind="regression"):
        """Predict on the test loans and write the results to test_loans."""
        #run clf and regr on the test data to determine to top 100 loans
        #the top loans are the ones least likely to default
        if kind == "regression":
            pred = self.regr.predict(self.X_test)
            print("length of regression pred:  %s" % len(pred))
            for i, loan in enumerate(self.testLoanData['id']):
                sql_query = "UPDATE test_loans SET pred_days_to_zero_dollars=%s where id='%s';" % (
                    pred[i], loan)
                self.cur.execute(sql_query)
                print(i)
        elif kind == "classification":
            pred_proba = self.clf.predict_proba(self.X_test)
            for i, loan in enumerate(self.testLoanData['id']):
                sql_query = "UPDATE test_loans SET pred_default=%s, pred_paid=%s, pred_prepaid=%s where id='%s';" % (
                    pred_proba[i][0], pred_proba[i][1], pred_proba[i][2], loan)
                self.cur.execute(sql_query)
            # NOTE(review): indentation was lost in the source; the close is
            # assumed to belong to the classification branch only -- confirm.
            self.con.close()

    def run_pca(self, n_components=20):
        """Fit PCA on the training set and transform the cv and test sets."""
        self.pca = PCA(n_components=n_components)
        self.X_train = self.pca.fit_transform(self.X_train)
        print("Reduced data down to  %s  dimensions: " % self.pca.n_components_)
        print("Transforming cv data ...")
        self.X_cv = self.pca.transform(self.X_cv)
        print("Transforming test data ...")
        self.X_test = self.pca.transform(self.X_test)

    def plot_prediction(self):
        """Scatter-plot ``self.prediction`` against the cv targets."""
        plt.scatter(self.prediction, self.y_cv)
        plt.xlabel('prediction')
        plt.ylabel('y_test')
        plt.show()

    def runSVRGridSearch(self):
        """Train and score an SVR for every (C, gamma) pair of a fixed grid."""
        C_vals = [0.01, 0.1, 1, 10, 100]
        gamma_vals = [1E-2, 1E-1, 1, 1E1, 1E2, 1E3, 1E4]
        for C in C_vals:
            for gamma in gamma_vals:
                print("\n\n C:  %s  gamma:  %s" % (C, gamma))
                self.define_SVR(C=C, gamma=gamma)
                self.train()
                print("Training Scores:")
                self.predict(self.X_train)
                self.score(self.X_train, self.y_train)
                print("Testing Scores:")
                self.predict(self.X_cv)
                self.score(self.X_cv, self.y_cv)

    def roc(self):
        '''Compute ROC curve using one-vs-all technique'''
        pred_proba = self.clf.predict_proba(self.X_cv)
        fpr = []
        tpr = []
        thresholds = []
        for i in [0, 1, 2]:
            fpr_i, tpr_i, thresholds_i = roc_curve(self.y_cv, pred_proba[:, i], pos_label=i)
            fpr.append(fpr_i)
            tpr.append(tpr_i)
            thresholds.append(thresholds_i)
            print("AUC:  %s" % auc(fpr_i, tpr_i))
        # Only the curve for class 0 ("Default") is drawn, next to the
        # chance diagonal.
        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
        plt.plot(fpr[0], tpr[0], label="Default", linewidth=3)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.show()

    def pickle_algo(self, X, fileName):
        """Pickle the object ``X`` to the file ``fileName``."""
        print("pickling algorithm")
        with open(fileName, 'wb') as f:
            pickle.dump(X, f)
from sklearn.ensemble import RandomForestClassifier # train the model wqp_rf = RandomForestClassifier() wqp_rf.fit(wqp_train_SX, wqp_train_y) # predict and evaluate performance wqp_rf_predictions = wqp_rf.predict(wqp_test_SX) meu.display_model_performance_metrics(true_labels=wqp_test_y, predicted_labels=wqp_rf_predictions, classes=wqp_label_names) # ## Hyperparameter tuning with Grid Search & Cross Validation # In[23]: print(wqp_rf.get_params()) # ### Get the best hyperparameter values # In[24]: from sklearn.model_selection import GridSearchCV param_grid = { 'n_estimators': [100, 200, 300, 500], 'max_features': ['auto', None, 'log2'] } wqp_clf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5,
#!/usr/bin/env python
# coding: utf-8

# Set up the data
import pandas as pd
import numpy as np

heart_disease = pd.read_csv("/heart-disease.csv")
heart_disease

# Features matrix: every column except the target
x = heart_disease.drop(columns="target")
# Labels
y = heart_disease["target"]

# Choose the model and its hyperparameters
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.get_params()

# Hold out 20% of the rows for testing, then fit on the rest
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
clf.fit(x_train, y_train)

# Prediction
y_preds = clf.predict(x_test)
y_preds
y_test

# Evaluate the model on the training data
clf.score(x_train, y_train)
# Evaluate the model on the test data
clf.score(x_test, y_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))
confusion_matrix(y_test, y_preds)
accuracy_score(y_test, y_preds)
# Single-argument print(...) produces the same text on Python 2 and 3.
print("PLS Training error  %s" % (float(error) / yp_t.shape[0]))

# Validation error of the PLS model: predict class 1 when the first output
# column exceeds the second.
yp_new = pls.predict(Xp_v, copy=True)
yp_pred = (yp_new[:, 0] > yp_new[:, 1]).astype(int)
#print y_new, y_pred, y_v
#print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
error = ((yp_v - yp_pred) ** 2).sum()
print("PLS Validation error  %s" % (float(error) / yp_v.shape[0]))

# Random forest on the PLS-transformed features.
X_new = pls.transform(X)
rf = RandomForestClassifier(n_estimators=500, max_depth=None,
                            max_features=int(math.sqrt(n_components)),
                            min_samples_split=100, random_state=144,
                            n_jobs=4)
#print "shapes ", X_new.shape, y.shape
#print X_new,y
X_t, X_v, y_t, y_v = tts(X_new, yd, train_size=0.85)
rf.fit(X_t, y_t)
print("Random Forest Classifier:  %s" % (rf.get_params(),))
print("Covariance Classifier Training score:  %s" % rf.score(X_t, y_t))
print("Covariance Classifier Validation score:  %s" % rf.score(X_v, y_v))
#print "Class prob: ", zip(rf.predict_proba(X_v), y_v)

# Sample weights: distance of the forest's class-1 probability from 0.5.
sample_weights = rf.predict_proba(pls.transform(Xp_t))[:, 1]
print(sample_weights.shape)
sample_weights = abs(sample_weights - 0.5)

# Compare a weighted and an unweighted SGD classifier for several alphas.
# NOTE(review): indentation was lost in the source; every statement below is
# assumed to be inside the loop -- confirm.
for a in [.01, .1, .3, 1, 3, 10, 20, 30, 40, 50, 100]:
    clf = SGDClassifier(alpha=a, loss=algo, n_iter=20)
    clf.fit(Xp_t, yp_t, sample_weight=sample_weights)
    clf2 = SGDClassifier(alpha=a, loss=algo, n_iter=20)
    clf2.fit(Xp_t, yp_t)
    print("alpha:  %s" % a)
    print("Target score with weights:  %s" % clf.score(Xt, yt))
from sklearn.ensemble import RandomForestClassifier
from ray.tune.sklearn import TuneGridSearchCV
from sklearn.model_selection import train_test_split

# Load the data
data = fetch_covtype()
x = data.data
y = data.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Baseline: a forest with default hyper-parameters.
default_model = RandomForestClassifier()
default_model.fit(x_train, y_train)
default_pred = default_model.predict(x_test)
default_params = default_model.get_params()
# Accuracy = fraction of test samples predicted correctly.
is_correct = np.array(default_pred) == np.array(y_test)
default_accuracy = np.count_nonzero(is_correct) / len(default_pred)

# Grid explored by the tuned search.
parameter_grid = {
    "n_estimators": [10, 50],
    "max_depth": [5, 50, 100],
    "ccp_alpha": [0.001, 0.01],
}
tune_search = TuneGridSearchCV(RandomForestClassifier(),
                               param_grid=parameter_grid,
                               scoring="accuracy")

start = time.time()
tune_search.fit(x_train, y_train)
test_iterations = range(0,1000) average_score_sum = 0 for x in test_range: clf = RandomForestClassifier(criterion = "entropy",min_samples_leaf=4) #create the random forest classifier clf = clf.fit(features_train, labels_train) #train the classifier pred = clf.predict(features_test) #create an array of predictions from sklearn.metrics import accuracy_score acc = accuracy_score(pred, labels_test) #determine the accuracy of those predictions average_score_sum+=acc if acc > best_leaf_value[1]: best_leaf_value[0]=x #store the leaf value which yields the highest accuracy best_leaf_value[1]=acc #store the new highest average accuracy best_leaf_value[2]= clf.get_params(deep = True) average_score = average_score_sum/len(test_range) print "High Score: ", best_leaf_value[1] print "Average Score: ", average_score print "Deets: ", best_leaf_value ''' for x in test_range: print x average_score_sum = 0 for t in test_iterations: clf = RandomForestClassifier(min_samples_leaf=x) #create the random forest classifier clf = clf.fit(features_train, labels_train) #train the classifier