class SVMClassifier(BaseEstimator):
    """Linear SVM on a skewed chi-squared Fourier feature map, with Platt scaling.

    The estimator wraps a two-step pipeline: a ``SkewedChi2Sampler`` feature
    map followed by a class-balanced ``LinearSVC``. Probabilities are obtained
    by passing the SVM decision scores through ``SigmoidPredict`` using the
    parameters learnt by ``set_platt_params``.

    Parameters
    ----------
    skewedness : float
        Forwarded to ``SkewedChi2Sampler``.
    n_components : int
        Forwarded to ``SkewedChi2Sampler``.
    C : float
        Regularisation strength forwarded to ``LinearSVC``.
    rs : int, RandomState or None
        Random state shared by the feature map and the SVM.
    """

    def __init__(self, skewedness=3., n_components=85, C=100, rs=None):
        # Keep every constructor argument as a same-named attribute so that
        # get_params() / clone() / repr() can introspect the estimator
        # (assumes BaseEstimator is scikit-learn's -- TODO confirm).
        self.skewedness = skewedness
        self.n_components = n_components
        self.C = C
        self.rs = rs

        self.platt_params = []
        self.feature_map_fourier = SkewedChi2Sampler(
            skewedness=skewedness, n_components=n_components, random_state=rs)
        # random_state plays a role in LinearSVC and SVC when dual=True
        # (which is the default).
        self.clf = Pipeline([
            ('fp', self.feature_map_fourier),
            ('svm', LinearSVC(C=C, random_state=rs, class_weight='balanced')),
        ])

    def fit(self, X, y):
        """Fit the feature map and the SVM on ``(X, y)`` and return ``self``."""
        self.clf.fit(X, y)
        # Returning self allows chaining such as ``SVMClassifier().fit(X, y)``.
        return self

    def set_platt_params(self, X, y):
        """Learn the Platt-scaling parameters from decision scores on ``X``.

        ``decision_function`` is used rather than ``predict`` because Platt
        scaling maps continuous scores, not class labels, to probabilities.
        """
        y_pred = self.clf.decision_function(X)
        self.platt_params = SigmoidTrain(y_pred, y)

    def predict(self, X):
        """Return the class labels predicted by the pipeline for ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return ``SigmoidPredict`` applied to the decision scores of ``X``.

        Uses the parameters stored by ``set_platt_params``.
        """
        y_pred = self.clf.decision_function(X)
        return SigmoidPredict(y_pred, self.platt_params)
def test_36_one_class_svm(self):
    """Export a One-Class SVM pipeline to PMML and compare it with the server.

    Each server result is a ``"score,flag"`` string; the score must match the
    local decision function to six decimals and the flag must map to the
    locally predicted label.
    """
    print("\ntest 36 (One Class SVM\n")
    # Server anomaly flag -> label produced by the local model's predict().
    anomaly_to_label = {'true': -1, 'false': 1}

    train_frame = pd.read_csv("nyoka/tests/train_ocsvm.csv")
    test_frame = pd.read_csv("nyoka/tests/test_ocsvm.csv")

    pipeline_obj = Pipeline([("model", OneClassSVM(nu=0.1))])
    pipeline_obj.fit(train_frame)

    pmml_file = 'test36sklearn.pmml'
    skl_to_pmml(pipeline_obj, train_frame.columns, '', pmml_file)

    local_labels = pipeline_obj.predict(test_frame)
    local_scores = pipeline_obj.decision_function(test_frame)

    model_name = self.adapa_utility.upload_to_zserver(pmml_file)
    z_predictions = self.adapa_utility.score_in_zserver(
        model_name, 'nyoka/tests/test_ocsvm.csv', 'ANOMALY')

    def disagrees(position, raw_value):
        """Return True when the server row differs from the local result."""
        raw_score, is_anomaly = raw_value.split(",")
        server_text = "{:.6f}".format(float(raw_score))
        if server_text != "{:.6f}".format(local_scores[position]):
            return True
        return local_labels[position] != anomaly_to_label[is_anomaly]

    mismatches = sum(
        1 for position, raw_value in enumerate(z_predictions)
        if disagrees(position, raw_value))
    self.assertEqual(mismatches, 0)
def run_ngram_baseline(train_debates, test_debates):
    """Train a unigram TF-IDF + RBF SVM baseline and score every test debate.

    Parameters
    ----------
    train_debates : iterable of str
        Paths of tab-separated training files without a header row; columns
        are named with ``_COL_NAMES`` and must provide ``text`` and ``label``.
    test_debates : iterable of str
        Paths of tab-separated test files; must provide ``text`` and
        ``line_number``.

    For each test file a result file named
    ``task5_ngram_baseline_<basename>`` is written under
    ``ROOT_DIR/baselines/data`` with one ``line_number<TAB>score`` row per
    input line, where the score is the SVM decision-function value.
    """
    train_df = pd.concat([
        pd.read_csv(train_debate, index_col=None, header=None,
                    names=_COL_NAMES, sep='\t')
        for train_debate in train_debates
    ])

    # The test files are read one by one below; no concatenated test frame is
    # needed, so the files are no longer parsed twice.
    pipeline = Pipeline([
        ('ngrams', TfidfVectorizer(ngram_range=(1, 1))),
        ('clf', SVC(C=1, gamma=0.75, kernel='rbf', random_state=0))
    ])
    pipeline.fit(train_df['text'], train_df['label'])

    for test_debate in test_debates:
        test_df = pd.read_csv(test_debate, names=_COL_NAMES, sep='\t')
        results_fpath = join(
            ROOT_DIR,
            'baselines/data/task5_ngram_baseline_%s' % (os.path.basename(test_debate)))
        with open(results_fpath, "w") as results_file:
            predicted_distance = pipeline.decision_function(test_df['text'])
            for line_num, dist in zip(test_df['line_number'], predicted_distance):
                results_file.write("{}\t{}\n".format(line_num, dist))
class SupportVectorMachineClassifier(object):
    """Standard-scaled linear SVM (hinge loss) with small logging helpers."""

    def __init__(self):
        self.svc = Pipeline([
            ('scaling', StandardScaler()),
            ('classification', LinearSVC(loss='hinge')),
        ])

    def train(self, x_train, y_train):
        """Fit the pipeline on the training data and print the elapsed time."""
        print("\nStarting to train vehicle detection classifier.")
        start = time.time()
        self.svc.fit(x_train, y_train)
        # ``.5f`` sets the precision; the former ``5f`` only set a minimum
        # field width and left the default six decimals in place.
        print("Completed training in {:.5f} seconds.\n".format(time.time() - start))

    def score(self, x_test, y_test):
        """Print and return the pipeline's score on the test data.

        The returned value is whatever ``self.svc.score`` yields (a fraction
        for a classifier); only the printed message renders it as a
        percentage.
        """
        print("Testing accuracy:")
        scores = self.svc.score(x_test, y_test)
        # ``%`` presentation multiplies by 100, so a 0.75 score prints as
        # 75.000% instead of the misleading "0.750000%".
        print("Accuracy {:.3%}".format(scores))
        return scores

    def predict(self, feature):
        """Return the predicted labels for ``feature``."""
        return self.svc.predict(feature)

    def decision_function(self, feature):
        """Return the signed decision scores for ``feature``."""
        return self.svc.decision_function(feature)
def test_pipeline_methods_preprocessing_svm():
    """Check output shapes of every prediction method of preprocessing + SVC.

    Uses the module-level ``iris`` dataset and runs the same checks with a
    standard scaler and with a randomized, whitened two-component PCA.
    """
    features, labels = iris.data, iris.target
    n_samples = features.shape[0]
    n_classes = len(np.unique(labels))
    per_class_shape = (n_samples, n_classes)

    preprocessors = [
        StandardScaler(),
        PCA(n_components=2, svd_solver="randomized", whiten=True),
    ]
    svc = SVC(probability=True, random_state=0, decision_function_shape="ovr")

    for step in preprocessors:
        pipe = Pipeline([("preprocess", step), ("svc", svc)])
        pipe.fit(features, labels)

        # One label per sample; one column per class for the scoring methods.
        assert pipe.predict(features).shape == (n_samples, )
        assert pipe.predict_proba(features).shape == per_class_shape
        assert pipe.predict_log_proba(features).shape == per_class_shape
        assert pipe.decision_function(features).shape == per_class_shape

        pipe.score(features, labels)
def test_pipeline_methods_preprocessing_svm():
    """Exercise the prediction methods of a preprocessing + SVC pipeline.

    The iris data is pushed through a pipeline built once with a standard
    scaler and once with a whitened two-component RandomizedPCA; the shape of
    every prediction output is verified.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    scaler = StandardScaler()
    pca = RandomizedPCA(n_components=2, whiten=True)
    clf = SVC(probability=True, random_state=0)

    # Method name -> shape its output must have, in invocation order.
    expected_shapes = [
        ('predict', (n_samples,)),
        ('predict_proba', (n_samples, n_classes)),
        ('predict_log_proba', (n_samples, n_classes)),
        ('decision_function', (n_samples, n_classes)),
    ]

    for preprocessing in (scaler, pca):
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)
        for method_name, shape in expected_shapes:
            output = getattr(pipe, method_name)(X)
            assert_equal(output.shape, shape)
        pipe.score(X, y)
class Ngram(Model):
    """Ngram baseline model: unigram TF-IDF features fed to an RBF-kernel SVM."""

    def __init__(self, name, preprocessor):
        super().__init__(name, preprocessor)
        self.name = name
        # Built by fit(); run() requires fit() to have been called first.
        self.pipeline = None

    def fit(self, dataframe):
        """Train on labeled data.

        ``dataframe`` must provide the ``KEY_TEXT`` column (inputs) and the
        ``KEY_CHECK_WORTHINESS`` column (targets).
        """
        self.pipeline = Pipeline([
            ('ngrams', TfidfVectorizer(ngram_range=(1, 1))),
            ('clf', SVC(C=1, gamma=0.75, kernel='rbf', random_state=0))
        ])
        self.pipeline.fit(dataframe[KEY_TEXT],
                          dataframe[KEY_CHECK_WORTHINESS])

    def run(self, dataframe):
        """Score ``dataframe`` and write one result row per input row.

        Each row written to ``self.get_result_path()`` is
        ``topic_id<TAB>tweet_id<TAB>score<TAB>ngram`` where the score is the
        SVM decision-function value for that row's text.
        """
        results_fpath = self.get_result_path()
        with open(results_fpath, "w") as results_file:
            predicted_distance = self.pipeline.decision_function(
                dataframe[KEY_TEXT])
            # Pair rows with scores by position: the DataFrame index label
            # yielded by iterrows() is not a valid offset into the score
            # array when the frame has been filtered, shuffled or re-indexed.
            for (_, line), dist in zip(dataframe.iterrows(),
                                       predicted_distance):
                results_file.write("{}\t{}\t{}\t{}\n".format(
                    line[KEY_TOPIC_ID], line[KEY_TWEET_ID], dist, "ngram"))
def test_pipeline_methods_preprocessing_svm():
    """Verify output shapes of a (preprocessing + SVC) pipeline on iris.

    The pipeline is built twice, first with a standard scaler and then with a
    randomized, whitened two-component PCA, always followed by the same
    one-vs-rest probabilistic SVC.
    """
    iris = load_iris()
    data = iris.data
    target = iris.target
    n_samples = data.shape[0]
    n_classes = len(np.unique(target))

    label_shape = (n_samples, )
    class_matrix_shape = (n_samples, n_classes)

    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver='randomized', whiten=True)
    svc = SVC(probability=True, random_state=0,
              decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', svc)])
        pipe.fit(data, target)

        # Shapes of the various prediction functions.
        assert_equal(pipe.predict(data).shape, label_shape)
        assert_equal(pipe.predict_proba(data).shape, class_matrix_shape)
        assert_equal(pipe.predict_log_proba(data).shape, class_matrix_shape)
        assert_equal(pipe.decision_function(data).shape, class_matrix_shape)

        pipe.score(data, target)
def generate_test_performance_data(train_tile="b278",test_tiles=["b234","b261","b360"]):
    """
    Train three classifiers on one tile and store their precision/recall
    curves for every test tile.

    A random forest, a linear SVM and a Nystroem-approximated kernel SVM are
    fitted on ``train_tile`` using the hyperparameters returned by
    ``get_optimal_parameters_i``. For each tile in ``test_tiles`` a dict
    ``{"rf" | "svml" | "svmk": (precision, recall)}`` is pickled under
    ``EXPERIMENTS_OUTPUT_FOLDER_MS/optimize_hyperparameters``.
    """
    X, y = CARPYNCHO.retrieve_tile(train_tile)

    # Random forest
    random_forest = RandomForestClassifier(
        n_estimators=400, criterion="entropy", min_samples_leaf=2,
        max_features="sqrt", n_jobs=7)
    random_forest.fit(X, y)

    # Linear SVM
    linear_svm = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LinearSVC(verbose=3, max_iter=100000,
                          C=get_optimal_parameters_i("svml")["C"],
                          dual=False)),
    ])
    linear_svm.fit(X, y)

    # Kernel SVM approximated with a Nystroem feature map.
    # NOTE(review): gamma is read from the "svml" parameters while C comes
    # from "svmK" -- confirm this is intended.
    kernel_svm = Pipeline([
        ("scaler", StandardScaler()),
        ("feature_map", Nystroem(
            n_components=300,
            gamma=get_optimal_parameters_i("svml")["gamma"])),
        ("svm", LinearSVC(dual=False, max_iter=100000,
                          C=get_optimal_parameters_i("svmK")["C"])),
    ])
    kernel_svm.fit(X, y)

    # Curve key -> callable producing one continuous score per test sample.
    scorers = (
        ("rf", lambda data: random_forest.predict_proba(data)[:, 1]),
        ("svml", linear_svm.decision_function),
        ("svmk", kernel_svm.decision_function),
    )

    for test in test_tiles:
        Xtest, ytest = CARPYNCHO.retrieve_tile(test)
        curves = {}
        for key, score_fn in scorers:
            precision, recall, _ = metrics.precision_recall_curve(
                ytest, score_fn(Xtest))
            curves[key] = (precision, recall)

        out_path = (EXPERIMENTS_OUTPUT_FOLDER_MS
                    + '/optimize_hyperparameters/test_results_train='
                    + train_tile + "Test=" + test + ".pkl")
        with open(out_path, 'wb') as output:
            pickle.dump(curves, output, pickle.HIGHEST_PROTOCOL)
def anova_svm(df_in, ss_label, k_type, ts, f_n, svm_c, title_n, out_path):
    """Fit a SelectKBest + SVC pipeline and save its PR and ROC plots.

    The data is split with ``test_size=ts``; ``f_n`` features are kept when
    ``df_in`` has more columns than that, otherwise all of them. Two figures
    (``PR_<title>`` and ``ROC_<title>``) are saved into ``out_path``.

    Returns a pair of tab-separated strings: ``"<n selected>\\t<names>"`` and
    ``"<title part 1>\\t<title part 2>\\t<auc>"``; ``title_n`` must therefore
    contain at least one underscore.
    """
    feature_matrix = df_in.copy().values
    column_count = len(df_in.columns.tolist())
    k_best = f_n if column_count > f_n else 'all'

    x_train, x_test, y_train, y_test = train_test_split(
        feature_matrix, ss_label, test_size=ts, random_state=1)

    model = Pipeline([
        ('anova', SelectKBest(f_regression)),
        ('svc', svm.SVC(kernel=k_type, probability=True)),
    ])
    model.set_params(anova__k=k_best, svc__C=svm_c).fit(x_train, y_train)

    # Precision-recall figure, based on the signed decision scores.
    y_score = model.decision_function(x_test)
    average_precision = average_precision_score(y_test, y_score)
    precision, recall, _ = precision_recall_curve(y_test, y_score)

    selected = df_in.columns[model.named_steps.anova.get_support()]
    selection_summary = '{}\t{}'.format(len(selected), ','.join(selected))

    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('{} Precision-Recall curve: AP={:0.2f}'.format(
        title_n, average_precision))
    # File-name friendly version of the title, shared by both figures.
    file_stem = title_n.replace(' ', '_').replace('(', '').replace(')', '')
    plt.savefig(os.path.join(out_path, 'PR_' + file_stem))
    plt.gcf().clear()

    # ROC figure, based on the predicted probability of the second class.
    positive_proba = model.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=positive_proba)
    roc_auc = auc(x=fpr, y=tpr)
    title_parts = title_n.split('_', 1)
    auc_summary = '{}\t{}\t{:.2f}'.format(
        title_parts[0], title_parts[1], roc_auc)

    plt.plot(fpr, tpr, color='b', linestyle='-',
             label='{} (auc = {:.2f})'.format(title_n, roc_auc))
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)
    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.title(title_n)
    plt.grid()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.savefig(os.path.join(out_path, 'ROC_' + file_stem))
    plt.gcf().clear()

    return selection_summary, auc_summary
def train(X_train, X_test, y_train, y_test, vect, tfidf, clf):
    """Fit a vectorizer -> tf-idf -> classifier pipeline and print PR curves.

    After fitting, the class list is printed, followed by each class label
    and the precision/recall curve obtained from that class's column of the
    decision-function output on ``X_test``.
    """
    steps = [('vect', vect), ('tfidf', tfidf), ('clf', clf)]
    text_clf = Pipeline(steps)
    text_clf.fit(X_train, y_train)

    y_score = text_clf.decision_function(X_test)
    print(text_clf.classes_)

    for column, class_label in enumerate(text_clf.classes_):
        print(class_label)
        # Scores of the current class only, one value per test sample.
        class_scores = [row[column] for row in y_score]
        curve = precision_recall_curve(
            y_test, class_scores, pos_label=class_label)
        print(curve)
def svm_svc_rbf(data_train, data_test, label_train, label_test):
    """Fit a standard-scaled RBF SVC and score the test data.

    The kernel width and regularisation come from the module-level ``GAMMA``
    and ``C``. Returns ``(label_test, scores)`` where ``scores`` is the
    decision-function output for ``data_test``; ``label_test`` is passed
    through unchanged.
    """
    rbf_svc = SVC(kernel="rbf", gamma=GAMMA, C=C, random_state=6)
    svm_clf_rbf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", rbf_svc),
    ])
    svm_clf_rbf.fit(data_train, label_train)
    return label_test, svm_clf_rbf.decision_function(data_test)
def build_iforest_housing(iforest, name, **pmml_options):
    """Fit an isolation forest on the housing data and store model + results.

    The fitted PMML pipeline is pickled under ``name`` and a CSV with the
    ``decisionFunction`` and ``outlier`` ("true"/"false") columns is stored
    next to it. A sample counts as an outlier when ``predict`` returns -1.
    """
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    steps = [("mapper", mapper), ("estimator", iforest)]
    pipeline = Pipeline(steps)
    pipeline.fit(housing_X)

    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)

    decisionFunction = DataFrame(
        pipeline.decision_function(housing_X), columns=["decisionFunction"])

    is_outlier = pipeline.predict(housing_X) == -1
    outlier = DataFrame(is_outlier, columns=["outlier"])
    # Booleans are stored as lowercase strings.
    outlier = outlier.replace(True, "true").replace(False, "false")

    store_csv(pandas.concat([decisionFunction, outlier], axis=1), name)
def build_ocsvm_housing(svm, name):
    """Fit a max-abs-scaled one-class SVM on the housing data and store it.

    The fitted PMML pipeline is pickled under ``name`` and a CSV with the
    ``decisionFunction`` and ``outlier`` ("true"/"false") columns is stored
    next to it. A sample counts as an outlier when ``predict`` is <= 0.
    """
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    steps = [
        ("mapper", mapper),
        ("scaler", MaxAbsScaler()),
        ("estimator", svm),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(housing_X)

    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    store_pkl(pipeline, name)

    decisionFunction = DataFrame(
        pipeline.decision_function(housing_X), columns=["decisionFunction"])

    is_outlier = pipeline.predict(housing_X) <= 0
    outlier = DataFrame(is_outlier, columns=["outlier"])
    # Booleans are stored as lowercase strings.
    outlier = outlier.replace(True, "true").replace(False, "false")

    store_csv(pandas.concat([decisionFunction, outlier], axis=1), name)
def get_pos_distribution_features(X_test, y_test, distrofile, model_name="cross_genre", output_type="probs"):
    """Train a linear SVC on ``distrofile`` and dump per-class scores for X_test.

    Every line of ``distrofile`` is ``<text>,<language code>``; the code is
    mapped to a location label before training a TF-IDF (1-3 grams) + SVC
    pipeline. The accuracy on ``(X_test, y_test)`` is printed, and a CSV named
    ``{model_name}_{output_type}_distributional_probability_features_pos.csv``
    is written with one row per test item: the class scores followed by the
    gold label. ``output_type == "probs"`` selects ``predict_proba``; any
    other value selects ``decision_function``.
    """
    with open(distrofile, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    X_train = []
    language_codes = []
    for raw_line in lines:
        fields = raw_line.split(",")
        X_train.append(fields[0])
        language_codes.append(fields[1].strip())

    # Language code -> location label used as the training target.
    conversion = {
        "hi": "new-delhi",
        "nl": "the-netherlands",
        "es": "spain",
        "pt": "portugal",
        "pl": "poland",
        "de": "germany",
        "ru": "russia",
        "fa": "iran",
        "it": "italy"
    }
    y_train = [conversion[code] for code in language_codes]

    pipeline = Pipeline([
        ("vect", TfidfVectorizer(ngram_range=(1, 3))),
        ("clf", svm.SVC(kernel="linear", probability=True)),
    ])
    pipeline.fit(X_train, y_train)
    # TODO: pickle the fitted model

    predictions = pipeline.predict(X_test)
    print("Length", len(X_test))

    if output_type == "probs":
        score_rows = pipeline.predict_proba(X_test)
    else:
        score_rows = pipeline.decision_function(X_test)

    acc = accuracy_score(y_test, predictions)
    print("\n" + str(acc))

    out_name = f"{model_name}_{output_type}_distributional_probability_features_pos.csv"
    with open(out_name, "w", encoding="utf-8") as outfile:
        for ix, score_row in enumerate(score_rows):
            joined = ",".join(map(str, score_row))
            outfile.write(joined + "," + y_test[ix] + "\n")
def outlierDetection(X, features, N):
    """Rank samples with an isolation forest and style the N lowest-scoring.

    Missing values are imputed before fitting. A box plot of the standardised
    features is drawn as a side effect. The returned pandas Styler shows the
    ``N`` rows with the lowest decision-function value, shades the
    ``outlier`` column and applies ``highlight_1D_Outliers`` to the original
    feature columns.
    """
    steps = [
        ('imputer', impute.SimpleImputer()),
        ('estimator', IsolationForest(behaviour='new', contamination='auto')),
    ]
    clf = Pipeline(steps=steps)
    clf.fit(X)
    outlier_scores = clf.decision_function(X)

    frame = pd.DataFrame(X, columns=features)
    originalFeatures = frame.keys()

    # Standardise only for the overview box plot.
    standardised = (frame - frame.mean()) / frame.std()
    standardised.plot(kind="box", grid=False, figsize=(16, 9), rot=45)

    score_frame = pd.DataFrame({"outlier": outlier_scores})
    ranked = frame.join(score_frame).sort_values(by=['outlier'])

    cm = sns.light_palette("red", as_cmap=True, reverse=True)
    styled = ranked[:N].style.background_gradient(subset=['outlier'], cmap=cm)
    return styled.apply(subset=originalFeatures, func=highlight_1D_Outliers)
def run_ngram_baseline(train_fpath, test_fpath):
    """Train the unigram TF-IDF + RBF SVM baseline and score the test file.

    Both inputs are tab-separated files with a header row. The model is
    trained on ``tweet_text`` / ``claim_worthiness``; the result file
    ``task1_ngram_baseline_<basename of test_fpath>`` under
    ``ROOT_DIR/baselines/data`` receives one
    ``topic_id<TAB>tweet_id<TAB>score<TAB>ngram`` row per test tweet.
    """
    train_df = pd.read_csv(train_fpath, sep='\t')
    test_df = pd.read_csv(test_fpath, sep='\t')

    steps = [
        ('ngrams', TfidfVectorizer(ngram_range=(1, 1))),
        ('clf', SVC(C=1, gamma=0.75, kernel='rbf', random_state=0)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(train_df['tweet_text'], train_df['claim_worthiness'])

    result_name = 'baselines/data/task1_ngram_baseline_%s' % (
        os.path.basename(test_fpath))
    results_fpath = join(ROOT_DIR, result_name)

    with open(results_fpath, "w") as results_file:
        predicted_distance = pipeline.decision_function(test_df['tweet_text'])
        # The index label from iterrows() is used as the position in the
        # score array; this relies on the default index produced by read_csv.
        for i, row in test_df.iterrows():
            row_text = "{}\t{}\t{}\t{}\n".format(
                row['topic_id'], row['tweet_id'],
                predicted_distance[i], "ngram")
            results_file.write(row_text)
def linear_svm(X_train, y_train, X_test, y_test):
    """Fit a standard-scaled linear SVM and report test-set metrics.

    Prints the test recall, the ROC AUC and a confusion matrix (rendered with
    ``display``).

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels; ``y_test`` must support
        ``.ravel()``.

    Returns
    -------
    tuple
        ``(fpr, tpr, thresh, y_score)`` - the ROC curve arrays and the
        decision-function scores for ``X_test``.
    """
    lin_svm = Pipeline([
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(C=1, max_iter=100000, loss="hinge")),
    ])

    # fitting the model & keeping the SMOTE train sample
    lin_svm.fit(X_train, y_train)

    # Only test-set predictions are reported, so the training set is no
    # longer scored.
    test_pred = lin_svm.predict(X_test)
    print("\nRecall for test set is: {0}".format(recall_score(y_test, test_pred)))

    # Continuous scores are needed for the ROC curve.
    y_score = lin_svm.decision_function(X_test)
    fpr, tpr, thresh = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    print("\nAUC is :{0}".format(round(roc_auc, 2)))

    print('\nConfusion Matrix')
    print('----------------')
    display(pd.crosstab(y_test.ravel(), test_pred, rownames=['True'],
                        colnames=['Predicted'], margins=True))
    return fpr, tpr, thresh, y_score
def test_13_linearsvc(self):
    """Export a scaled multi-class LinearSVC to PMML and compare with the server.

    Server predictions are compared with ``predict`` and server probabilities
    with the local ``decision_function`` output.
    """
    print("\ntest 13 (LinearSVC with preprocessing) [multi-class]\n")
    (X, X_test, y, features, target,
     test_file) = self.data_utility.get_data_for_multi_class_classification()

    steps = [
        ("scaler", StandardScaler()),
        ("model", LinearSVC()),
    ]
    pipeline_obj = Pipeline(steps)
    pipeline_obj.fit(X, y)

    pmml_file = 'test13sklearn.pmml'
    skl_to_pmml(pipeline_obj, features, target, pmml_file)

    model_name = self.adapa_utility.upload_to_zserver(pmml_file)
    predictions, probabilities = self.adapa_utility.score_in_zserver(
        model_name, test_file)

    local_labels = pipeline_obj.predict(X_test)
    local_scores = pipeline_obj.decision_function(X_test)

    labels_match = self.adapa_utility.compare_predictions(
        predictions, local_labels)
    self.assertEqual(labels_match, True)
    scores_match = self.adapa_utility.compare_probability(
        probabilities, local_scores)
    self.assertEqual(scores_match, True)
def svm_svc_poly(data_train, data_test, label_train, label_test):
    """Fit a standard-scaled polynomial-kernel SVC and score the test data.

    The hyperparameters come from the module-level ``DEGREE``, ``COEF0`` and
    ``C``; they are printed before training and the training time (in
    minutes) afterwards.

    Returns ``(label_test, scores)`` where ``scores`` is the decision-function
    output for ``data_test``; ``label_test`` is passed through unchanged.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3,
    # unlike the former print statements, which are a syntax error on 3.
    print("hyper")
    print(DEGREE)
    print(COEF0)
    print(C)

    svm_clf_poly = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=DEGREE, coef0=COEF0, C=C,
                        random_state=6)),
    ])

    start_time = time.time()
    svm_clf_poly.fit(data_train, label_train)
    print("--- %s minutess ---" % ((time.time() - start_time) / 60.0))

    pred_test = svm_clf_poly.decision_function(data_test)
    return label_test, pred_test
def run_svm_decision_distance(test, train, agreement=1):
    """Train an RBF SVM and annotate test sentences with scaled distances.

    :param test: sentences to score; each must expose ``label`` and receives
        ``pred`` (decision distance min-max scaled to [0, 1]) and
        ``pred_label`` (predicted class).
    :param train: labeled sentences used to fit the feature pipeline and the
        SVM; each must expose ``label``.
    :param agreement: a sentence is a positive example when
        ``label >= agreement``.
    :return: ``test``, with ``pred`` and ``pred_label`` set on every sentence.
    """
    from sklearn.pipeline import Pipeline

    svc = Pipeline([("svm", SVC(class_weight='balanced', kernel='rbf', C=0.7,
                                gamma=0.001, random_state=0))])

    features = get_experimential_pipeline(train)
    X_train = features.fit_transform(train)
    y = [1 if sent.label >= agreement else 0 for sent in train]
    X_train, y = balance(X_train, y)

    print("Start training SVM.")
    svc.fit(X_train, y)
    print("Finished training SVM.")

    # Reuse the feature pipeline fitted on the training data. Re-fitting it
    # on the test set would learn a different feature space than the one the
    # SVM was trained on (assumes the object returned by
    # get_experimential_pipeline exposes transform() -- TODO confirm).
    X = features.transform(test)

    distances = svc.decision_function(X)
    # MinMaxScaler expects a 2-D (n_samples, n_features) array; the binary
    # decision function is 1-D, so scale it as a single column and flatten
    # back to one float per sentence.
    y_pred_proba = MinMaxScaler().fit_transform(
        distances.reshape(-1, 1)).ravel().tolist()
    y_pred = svc.predict(X)

    for sent, prob, pred_label in zip(test, y_pred_proba, y_pred):
        sent.pred = prob
        sent.pred_label = pred_label

    y_true = [1 if s.label >= agreement else 0 for s in test]
    print(average_precision_score(y_true, y_pred_proba))
    return test
def singular_lgls(pcompa = False):
    """Measure how well each single feature predicts the target.

    For every column of the training frame except ``target``, the column is
    split into a training and a hold-out part, a logistic regression
    (optionally preceded by PCA) is fitted with 5-fold stratified
    cross-validation, and the log loss on the hold-out part is recorded.

    :param pcompa: when true, a whitened two-component PCA step is put in
        front of the classifier.
        NOTE(review): each model sees a single column, so a two-component
        PCA looks unusable here -- confirm before enabling.
    :return: the mean hold-out log loss over all features. The per-feature
        losses, sorted ascending, and the mean are also printed.
    """
    # The second frame returned by load_data is not needed here.
    X, _ = load_data(original=True)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    training_target = X[['target']].values.flatten()

    features = []
    lgls = []

    for column in X.columns:
        name = str(column)
        if name == 'target':
            continue
        features.append(name)
        X_np = X[name].values

        # Split the training data into a training and a validation set.
        X_train, X_Val, train_target, val_target = train_test_split(
            X_np, training_target, test_size=0.33, random_state=4)
        # Estimators expect 2-D input: one column holding the feature.
        X_train = np.reshape(X_train, (len(X_train), 1))
        X_Val = np.reshape(X_Val, (len(X_Val), 1))

        # Dimensionality reduction, only used when pcompa is set.
        pca = PCA(n_components=2, whiten=True)

        # Further classifiers can be appended to this list for comparison.
        clfs = [LogisticRegression()]

        for clf in clfs:
            if pcompa:
                pipeline = Pipeline([('pca', pca), ('clf', clf)])
            else:
                pipeline = Pipeline([('clf', clf)])

            # Cross validation. The pipeline stays fitted on the last fold's
            # training part, which is the model scored on the hold-out set.
            skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
            scores = []  # per-fold log loss, kept for debugging only
            for train, test in skf:
                pipeline.fit(X_train[train], train_target[train])
                if hasattr(pipeline, 'predict_proba'):
                    score = log_loss(
                        train_target[test],
                        pipeline.predict_proba(X_train[test])[:, 1])
                else:
                    score = log_loss(
                        train_target[test],
                        pipeline.decision_function(X_train[test]))
                scores.append(score)

            # Test on the hold-out set.
            lgls.append(log_loss(val_target,
                                 pipeline.predict_proba(X_Val)[:, 1]))

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(sorted(zip(features, lgls), reverse=False, key=lambda x: x[1]))
    print("Average logloss per feature:  %s" % np.mean(lgls))
    return np.mean(lgls)
else: pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)]) pipeline.fit(dt_train_bucket, train_y) offline_time_fit += time.time() - start_offline_time_fit # predict separately for each prefix case preds = [] test_all_grouped = dt_test_bucket.groupby( dataset_manager.case_id_col) for _, group in test_all_grouped: start = time.time() _ = bucketer.predict(group) if cls_method == "svm": pred = pipeline.decision_function(group) else: preds_pos_label_idx = np.where( cls.classes_ == 1)[0][0] pred = pipeline.predict_proba( group)[:, preds_pos_label_idx] pipeline_pred_time = time.time() - start current_online_event_times.append(pipeline_pred_time / len(group)) preds.extend(pred) preds_all.extend(preds) test_y_all.extend(test_y) offline_total_time = offline_time_bucket + offline_time_fit + train_prefix_generation_time
steps = [('scaler', scaler), ('red_dim', pca), ('clf', svm)] pipeline = Pipeline(steps) summary = pipeline.named_steps pipeline.fit(X_train, train_labels_encoded) score_train = pipeline.score(X_train, train_labels_encoded) tot_train_score.append(score_train) score_test = pipeline.score(X_test, test_labels_encoded) tot_test_score.append(score_test) y_scores = pipeline.decision_function(X_test) auc = roc_auc_score(test_labels_encoded, y_scores) tot_auc.append(auc) y_pred = pipeline.predict(X_test) report = classification_report(test_labels_encoded, y_pred, output_dict=True) df_r = pd.DataFrame(report) df_r = df_r.transpose() #df_r.to_csv(f'/home/users/ubaldi/TESI_PA/result_CV/report_{name}/report_{i}') outname = f'report_{i}.csv'
('feature_selection', SelectKBest(f_regression, k=1000)), #('reduce_dims',PCA()), ('mnb', MultinomialNB()) ]) clf.fit(X_train, y_train) train_time = time() - t0 print("train time: %0.3fs" % train_time) t0 = time() pred = clf.predict(X_test) try: pred_prob = clf.predict_proba(X_test) except AttributeError: try: dec_f = clf.decision_function(X_test) pred_prob = np.exp(dec_f) / np.sum(np.exp(dec_f)) except AttributeError: pred_prob = LabelBinarizer().fit_transform(pred.tolist()) test_time = time() - t0 print("test time: %0.3fs" % test_time) score = metrics.accuracy_score(y_test, pred) print("accuracy: %0.3f" % score) y_test_prob = LabelBinarizer().fit_transform(y_test) log_loss = metrics.log_loss(y_test_prob, pred_prob) print("log_loss: %0.3f" % log_loss) if hasattr(clf, 'coef_'):
def main():
    """Train and evaluate an SVM and a random forest on the Reddit comments.

    For each model a learning curve is plotted on the training comments; the
    SVM is then evaluated with a precision-recall graph and the random forest
    with a printed classification report, both on the test comments.
    """

    def joined(comment_groups):
        """Return the comments of both classes as one new list."""
        return comment_groups[0][0] + comment_groups[1][0]

    def tags(comment_groups):
        """Return 1 for every first-class comment and -1 for the second."""
        return ([1] * len(comment_groups[0][0])
                + [-1] * len(comment_groups[1][0]))

    # ---- Support vector machine ----------------------------------------
    # Load the training JSON into the dict, keep only the post bodies and
    # turn them into lists of words.
    load_training_data()
    train_comments = all_key_val_to_list(TRAIN_JSON_DICTS, "body")
    words_in_list = make_words(train_comments)

    text_clf_svm = Pipeline([
        ('vect', CountVectorizer(stop_words="english")),
        ('tfdif', TfidfTransformer()),
        ('svc', sk.SVC(kernel="poly", cache_size=2048, degree=5,
                       max_iter=10000, gamma=1e-7, C=65)),
    ])
    _ = text_clf_svm.fit(words_in_list[0], words_in_list[1])

    # Learning curve on the training comments.
    svm_title = "Reddit Classifier - SVM Learning Curve"
    svm_cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    plot_learning_curve(text_clf_svm, svm_title, joined(train_comments),
                        tags(train_comments), cv=svm_cv)
    # Scores on the training comments; replaced by the test scores below
    # before anything reads them.
    score = text_clf_svm.decision_function(joined(train_comments))

    # Same steps for the test data, then the precision-recall graph.
    load_test_data()
    test_comments = all_key_val_to_list(TEST_JSON_DICTS, "body")
    score = text_clf_svm.decision_function(joined(test_comments))
    precision_recall(score, tags(test_comments), "SVM Precision Recall")

    # ---- Random forest --------------------------------------------------
    text_clf_rf = Pipeline([
        ('vect', CountVectorizer(stop_words="english")),
        ('tfdif', TfidfTransformer()),
        ('tree', RandomForestClassifier(n_jobs=-1, criterion="entropy",
                                        n_estimators=55,
                                        min_samples_split=10,
                                        max_depth=400)),
    ])

    # Redundant reload, kept as a sanity measure.
    load_training_data()
    train_comments = all_key_val_to_list(TRAIN_JSON_DICTS, "body")

    rf_cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    _ = text_clf_rf.fit(words_in_list[0], words_in_list[1])
    predicted_rf = text_clf_rf.predict(joined(train_comments))

    rf_title = "Reddit Classifier - Random Forest Learning Curve"
    plot_learning_curve(text_clf_rf, rf_title, joined(train_comments),
                        tags(train_comments), cv=rf_cv)

    # No decision function is used for the random forest; precision and
    # recall come from the classification report on the test comments.
    test_comments = all_key_val_to_list(TEST_JSON_DICTS, "body")
    predicted_rf = text_clf_rf.predict(joined(test_comments))

    # Names of the two classes.
    target_names = ["askreddit", "personalfinance"]
    print(
        classification_report(tags(test_comments), predicted_rf,
                              target_names=target_names))
class Predictor(QtCore.QThread):
    """Thread predicting how well each article matches the user's taste.

    A linear SVM is trained on the abstracts of the already-seen articles
    (liked vs. not liked) and its decision scores are rescaled into a
    percentage stored in the ``percentage_match`` column.
    """

    def __init__(self, logger, to_read_list, bdd=None):
        QtCore.QThread.__init__(self)

        self.to_read_list = to_read_list
        self.x_train = []
        self.y_train = []
        self.classifier = None

        if bdd is not None:
            self.bdd = bdd
        else:
            # No connection supplied: open the default SQLite database.
            self.bdd = QtSql.QSqlDatabase.addDatabase("QSQLITE")
            self.bdd.setDatabaseName("fichiers.sqlite")
            self.bdd.open()

        self.l = logger
        self.getStopWords()
        self.calculated_something = False

    def __del__(self):
        """Wait for the thread to finish before the object goes away."""
        self.wait()
        self.l.debug("Deleting thread")

    def getStopWords(self):
        """Build the stop-word set: English stop words plus personal ones.

        The personal words are read, one per line, from
        ``config/stop_words.txt``, located next to the executable when the
        application is frozen and in the working directory otherwise.
        """
        frozen = getattr(sys, "frozen", False)
        resource_dir = (os.path.dirname(os.path.realpath(sys.argv[0]))
                        if frozen else '.')

        stop_words_path = os.path.join(resource_dir, 'config/stop_words.txt')
        with open(stop_words_path, 'r') as config:
            personal_words = [word.rstrip() for word in config.readlines()]

        self.stop_words = text.ENGLISH_STOP_WORDS.union(personal_words)

    def initializePipeline(self):
        """Collect the training data and fit the text classifier.

        Class 0 is the liked class, class 1 the not-liked one. Returns True
        on success and None when there is not enough data to train.
        """
        start_time = datetime.datetime.now()

        query = QtSql.QSqlQuery(self.bdd)
        query.exec_("SELECT * FROM papers WHERE new=0")

        while query.next():
            record = query.record()
            abstract = record.value('topic_simple')
            id_bdd = record.value('id')

            # 'Empty' or non-text abstracts carry no information.
            if type(abstract) is not str or abstract == 'Empty':
                continue

            liked = record.value('liked')
            if type(liked) is int and liked == 1:
                category = 0
            elif id_bdd in self.to_read_list:
                # Read but not liked, yet still in the waiting list: such
                # articles are not counted as disliked.
                continue
            else:
                category = 1

            self.x_train.append(abstract)
            self.y_train.append(category)

        # Both classes are required; this also avoids
        # "RuntimeWarning: divide by zero encountered in log".
        has_both_classes = 0 in self.y_train and 1 in self.y_train
        if not self.x_train or not has_both_classes:
            self.l.error("Not enough data yet to feed the classifier")
            return

        self.classifier = Pipeline([
            ('vectorizer', CountVectorizer(stop_words=self.stop_words)),
            ('tfidf', TfidfTransformer()),
            ('clf', LinearSVC())])

        try:
            self.classifier.fit(self.x_train, self.y_train)
        except ValueError:
            self.l.error("Not enough data yet to train the classifier")
            return

        elapsed_time = datetime.datetime.now() - start_time
        self.l.debug("Initializing classifier in {0}".format(elapsed_time))
        return True

    def run(self):
        """Compute the match percentage of every article and store it.

        The decision scores of all abstracts are rescaled linearly so that
        the lowest score becomes 100 % and the highest 0 %.
        """
        self.l.debug("Starting calculations of match percentages")
        start_time = datetime.datetime.now()

        query = QtSql.QSqlQuery(self.bdd)
        query.exec_("SELECT id, topic_simple FROM papers")

        list_id = []
        abstracts = []
        while query.next():
            record = query.record()
            abstracts.append(record.value('topic_simple'))
            list_id.append(record.value('id'))

        try:
            # Range conversion keeping the ratio, see
            # http://stackoverflow.com/questions/929103/convert-a-number-range-to-another-range-maintaining-ratio
            distances = self.classifier.decision_function(abstracts)

            elapsed_time = datetime.datetime.now() - start_time
            self.l.debug("Classifier predicted proba in {}".format(elapsed_time))

            normalize_start = datetime.datetime.now()
            highest = max(distances)
            lowest = min(distances)
            list_percentages = 100 - (distances - lowest) * 100 / (highest - lowest)
            self.l.debug("Classifier normalized proba in {}".
                         format(datetime.datetime.now() - normalize_start))
        except AttributeError:
            # Raised when no classifier has been trained yet
            # (self.classifier is still None).
            self.l.error("Not enough data yet to predict probability")
            return
        except Exception as e:
            self.l.error("predictor: {}".format(e))
            self.l.error(traceback.format_exc())
            return

        self.bdd.transaction()
        query = QtSql.QSqlQuery(self.bdd)
        query.prepare("UPDATE papers SET percentage_match = ? WHERE id = ?")

        for id_bdd, percentage in zip(list_id, list_percentages):
            # The percentage is a numpy scalar; the explicit conversion to a
            # Python float is mandatory for the binding.
            query.addBindValue(float(percentage))
            query.addBindValue(id_bdd)
            query.exec_()

        if self.bdd.commit():
            elapsed_time = datetime.datetime.now() - start_time
            self.l.info("Done calculating match percentages in {0} s".format(elapsed_time))
        else:
            self.l.critical("Percentages match not correctly written in db")

        self.calculated_something = True
def create_and_evaluate_model(args):
    """Evaluate one hyper-parameter setting with cross-validation and log it.

    For each of the ``n_splits`` folds the remaining folds form the training
    set. Prefixes are bucketed, one classifier pipeline is fitted per test
    bucket, and the pooled test predictions of the fold are scored with ROC
    AUC. One result row per hyper-parameter is appended to ``fout_all``.

    Everything except ``args`` (data folds, bucketing/encoding settings,
    classifier choice, output file, trial counter) is read from module-level
    names.

    Args:
        args: mapping of hyper-parameter name to value. Depending on
            ``bucket_method`` / ``cls_method`` the keys ``n_clusters``,
            ``max_features``, ``learning_rate``, ``subsample``, ``max_depth``,
            ``colsample_bytree``, ``min_child_weight``, ``C`` and ``gamma``
            are read.

    Returns:
        dict with ``loss`` (negated mean fold AUC), ``status``
        (``STATUS_OK``) and ``model`` (the classifier built last, or ``None``
        when no bucket ever needed a trained model).
    """
    global trial_nr
    trial_nr += 1

    start = time.time()
    score = 0
    # Returned under 'model'; stays None if every bucket was answered by a
    # constant prediction, instead of failing on an unbound local.
    cls = None
    # Per-bucket AUC is summed over ALL folds and divided by n_splits when
    # written out, so it must be created once, outside the fold loop.
    scores = defaultdict(int)

    for cv_iter in range(n_splits):
        dt_test_prefixes = dt_prefixes[cv_iter]
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits):
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat(
                    [dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0)

        # Bucketing prefixes based on control flow
        bucketer_args = {
            'encoding_method': bucket_encoding,
            'case_id_col': dataset_manager.case_id_col,
            'cat_cols': [dataset_manager.activity_col],
            'num_cols': [],
            'random_state': random_state
        }
        if bucket_method == "cluster":
            bucketer_args["n_clusters"] = args["n_clusters"]
        bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
        bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
        bucket_assignments_test = bucketer.predict(dt_test_prefixes)

        preds_all = []
        test_y_all = []
        for bucket in set(bucket_assignments_test):
            relevant_train_cases_bucket = dataset_manager.get_indexes(
                dt_train_prefixes)[bucket_assignments_train == bucket]
            relevant_test_cases_bucket = dataset_manager.get_indexes(
                dt_test_prefixes)[bucket_assignments_test == bucket]
            dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_test_prefixes, relevant_test_cases_bucket)
            test_y = dataset_manager.get_label_numeric(dt_test_bucket)

            if len(relevant_train_cases_bucket) == 0:
                # No training data for this bucket: fall back to the fold's
                # class ratio as a constant prediction.
                preds = [class_ratios[cv_iter]
                         ] * len(relevant_test_cases_bucket)
            else:
                dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                    dt_train_prefixes,
                    relevant_train_cases_bucket)  # one row per event
                train_y = dataset_manager.get_label_numeric(dt_train_bucket)

                if len(set(train_y)) < 2:
                    # Only one class seen in training: predict it everywhere.
                    preds = [train_y[0]] * len(relevant_test_cases_bucket)
                else:
                    feature_combiner = FeatureUnion([
                        (method,
                         EncoderFactory.get_encoder(method,
                                                    **cls_encoder_args))
                        for method in methods
                    ])

                    if cls_method == "rf":
                        cls = RandomForestClassifier(
                            n_estimators=500,
                            max_features=args['max_features'],
                            random_state=random_state)
                    elif cls_method == "xgboost":
                        cls = xgb.XGBClassifier(
                            objective='binary:logistic',
                            n_estimators=500,
                            learning_rate=args['learning_rate'],
                            subsample=args['subsample'],
                            max_depth=int(args['max_depth']),
                            colsample_bytree=args['colsample_bytree'],
                            min_child_weight=int(args['min_child_weight']),
                            seed=random_state)
                    elif cls_method == "logit":
                        cls = LogisticRegression(C=2**args['C'],
                                                 random_state=random_state)
                    elif cls_method == "svm":
                        cls = SVC(C=2**args['C'],
                                  gamma=2**args['gamma'],
                                  random_state=random_state)

                    # Scale-sensitive classifiers get a StandardScaler step.
                    if cls_method == "svm" or cls_method == "logit":
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('scaler', StandardScaler()),
                                             ('cls', cls)])
                    else:
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('cls', cls)])
                    pipeline.fit(dt_train_bucket, train_y)

                    if cls_method == "svm":
                        # The SVC is built without probability estimates, so
                        # the signed margin is used as the ranking score.
                        preds = pipeline.decision_function(dt_test_bucket)
                    else:
                        preds_pos_label_idx = np.where(
                            cls.classes_ == 1)[0][0]
                        preds = pipeline.predict_proba(
                            dt_test_bucket)[:, preds_pos_label_idx]

            if "prefix" in method_name:
                # AUC is undefined for a single-class bucket; count it as 0.5.
                auc = 0.5
                if len(set(test_y)) == 2:
                    auc = roc_auc_score(test_y, preds)
                scores[bucket] += auc
            preds_all.extend(preds)
            test_y_all.extend(test_y)

        score += roc_auc_score(test_y_all, preds_all)

    if "prefix" in method_name:
        for k, v in args.items():
            for bucket, bucket_score in scores.items():
                fout_all.write(
                    "%s;%s;%s;%s;%s;%s;%s;%s\n" %
                    (trial_nr, dataset_name, cls_method, method_name, bucket,
                     k, v, bucket_score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name, 0,
                        "processing_time", time.time() - start, 0))
    else:
        for k, v in args.items():
            fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                           (trial_nr, dataset_name, cls_method, method_name,
                            k, v, score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name,
                        "processing_time", time.time() - start, 0))
    fout_all.flush()
    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
def show_accuracy(y_hat, y_test, parameter): pass #(4)计算LogisticRegression分类器的准确率 print("LogisticRegression-输出训练集的准确率为:", classifier.score(x_train, y_train)) y_hat = classifier.predict(x_train) show_accuracy(y_hat, y_train, '训练集') print("LogisticRegression-输出测试集的准确率为:", classifier.score(x_test, y_test)) y_hat = classifier.predict(x_test) show_accuracy(y_hat, y_test, '测试集') # LogisticRegression-输出训练集的准确率为: 0.809523809524 # LogisticRegression-输出测试集的准确率为: 0.688888888889 # 查看决策函数,可以通过decision_function()实现。decision_function中每一列的值代表距离各类别的距离。 print('decision_function:\n', classifier.decision_function(x_train)) print('\npredict:\n', classifier.predict(x_train)) predict_proba = classifier.predict_proba(x_test) #得到结果概率矩阵 print("predict_proba", predict_proba) # (5)绘制图像 # 1.确定坐标轴范围,x,y轴分别表示两个特征 x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第0列的范围 x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 第1列的范围 x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j] # 生成网格采样点 grid_test = np.stack((x1.flat, x2.flat), axis=1) # 测试点 # print 'grid_test = \n', grid_test grid_hat = classifier.predict(grid_test) # 预测分类值 grid_hat = grid_hat.reshape(x1.shape) # 使之与输入的形状相同 # 2.指定默认字体
'''doc_id=0 for s, p, r in zip(docs_test, y_predicted, y_test): print(u'----------') print(u'[Text] %s' % s) print(u'[Label] %s' % p) print(u'[Actual] %s' % r)''' # Check if the total classification is empty # If empty, fill with the first classification total_prediction.append(y_predicted) # Average Positive score: ~0.7 # Min Score: ~0.002 # Max Score: ~2.86 dec = clf.decision_function(docs_test) # Numpy array, .T = Transpose # Transpose the classification to be exported to csv file multiLabel = np.array(total_prediction).T # Save the classification to file: binaryClass.csv with open('workbook/binaryClass.csv', 'w', newline='') as z: writer = csv.writer(z) writer.writerows(multiLabel) # Save values from confusion matrix to variables to use later TP, TN, FP, FN = calcValues(testY, multiLabel) precision = TP / (TP + FP) recall = TP / (TP + FN) accuracy = (TP + TN) / (TP + TN + FP + FN)
('features', features), ('Logistic', LogisticRegression(C=0.00077426, class_weight='balanced')) ]) model2.fit(fannie_train, status_train) status_pred2 = model2.predict(fannie_test) # print('Best C is: ', model2.named_steps['Logistic'].C_) print('Coefficients: ', model2.named_steps['Logistic'].coef_) print(classification_report(status_test, status_pred2)) print(pd.DataFrame(confusion_matrix(status_test, status_pred2), index=['Actual Healthy', 'Actual Default'], columns=['Pred. Healthy', 'Pred. Default'])) print('Area under the curve is', roc_auc_score(status_test, status_pred2)) prec, rec, thres1 = precision_recall_curve(status_test, status_pred2) fpr, tpr, thres2 = roc_curve(status_test, model2.decision_function(fannie_test)) with open('log_prec_rec.dill', 'wb') as f: dill.dump((prec, rec, thres1), f) with open('log_fpr_tpr.dill', 'wb') as f: dill.dump((fpr, tpr, thres2), f) with open('log_model.dill', 'wb') as f: dill.dump(model2, f) print('finishing dumping Logistic regression results to file!') # # Support Vector Machine # features = FeatureUnion([ # ('Loan_Amount', ExtractNormalized('STATE', 'ORIG_AMT')), # #('Interest_Rate', ExtractNormalized('STATE','ORIG_RT')),
class PredictPostings:
    '''
    Applying this decorator to a beancount importer or its extract method
    will predict and auto-complete missing second postings
    of the transactions to be imported.

    Example:

    @PredictPostings(
        training_data="trainingdata.beancount",
        filter_training_data_by_account="The:Importers:Already:Known:Accountname"
    )
    class MyImporter(ImporterProtocol):
        def extract(file):
          # do the import, return list of entries
    '''

    # Implementation notes for how to write decorators for classes, see e.g.,
    # https://stackoverflow.com/a/9910180
    # https://www.codementor.io/sheena/advanced-use-python-decorators-class-function-du107nxsv
    # https://andrefsp.wordpress.com/2012/08/23/writing-a-class-decorator-in-python/

    def __init__(self,
                 *,
                 training_data: Union[_FileMemo, List[Transaction],
                                      str] = None,
                 filter_training_data_by_account: str = None,
                 predict_second_posting: bool = True,
                 suggest_accounts: bool = True):
        """Store the decorator configuration.

        Args:
            training_data: source of training transactions; handed to
                ``ml.load_training_data`` when the importer runs.
            filter_training_data_by_account: passed through to
                ``ml.load_training_data``.
            predict_second_posting: add a predicted posting to each imported
                transaction.
            suggest_accounts: attach a ranked list of candidate accounts to
                each imported transaction.
        """
        self.training_data = training_data
        self.filter_training_data_by_account = filter_training_data_by_account
        self.predict_second_posting = predict_second_posting
        self.suggest_accounts = suggest_accounts

    def __call__(self, to_be_decorated=None, *args, **kwargs):
        """Patch an importer class or a plain ``extract`` function.

        NOTE(review): anything that is neither a class nor a function falls
        through and yields ``None``. Kept as is; confirm whether callers rely
        on it.
        """
        if inspect.isclass(to_be_decorated):
            logger.debug('The Decorator was applied to a class.')
            return self.patched_importer_class(to_be_decorated)
        elif inspect.isfunction(to_be_decorated):
            logger.debug('The Decorator was applied to an instancemethod.')
            return self.patched_extract_function(to_be_decorated)

    def patched_importer_class(self, importer_class):
        """Replace ``importer_class.extract`` with the enhancing wrapper."""
        importer_class.extract = self.patched_extract_function(
            importer_class.extract)
        return importer_class

    def patched_extract_function(self, original_extract_function):
        """Return a wrapper that runs the importer and enhances its output."""
        decorator = self

        @wraps(original_extract_function)
        def wrapper(self, file, existing_entries=None):
            decorator.existing_entries = existing_entries

            logger.debug(
                f"About to call the importer's extract function to receive entries to be imported..."
            )
            # Only forward existing_entries to importers that accept it.
            if 'existing_entries' in inspect.signature(
                    original_extract_function).parameters:
                decorator.imported_transactions = original_extract_function(
                    self, file, existing_entries)
            else:
                decorator.imported_transactions = original_extract_function(
                    self, file)

            return decorator.enhance_transactions()

        return wrapper

    def enhance_transactions(self):
        """Train on the configured data and enrich the imported transactions.

        Returns:
            The imported transactions, with predicted postings and/or account
            suggestions added when a model could be trained; otherwise the
            imported transactions unchanged.
        """
        # load training data
        self.training_data = ml.load_training_data(
            self.training_data,
            filter_training_data_by_account=self.
            filter_training_data_by_account,
            existing_entries=self.existing_entries)

        # convert training data to a list of TxnPostingAccounts
        self.converted_training_data = [
            ml.TxnPostingAccount(t, p, pRef.account)
            for t in self.training_data for pRef in t.postings
            for p in t.postings if p.account != pRef.account
        ]

        # train the machine learning model
        self._trained = False
        if not self.converted_training_data:
            logger.warning("Cannot train the machine learning model "
                           "because the training data is empty.")
        elif len(self.converted_training_data) < 2:
            logger.warning(
                "Cannot train the machine learning model "
                "because the training data consists of less than two elements."
            )
        else:
            transformers = []
            transformer_weights = {}
            transformers.append(
                ('narration',
                 Pipeline([
                     ('getNarration', ml.GetNarration()),
                     ('vect', CountVectorizer(ngram_range=(1, 3))),
                 ])))
            transformer_weights['narration'] = 0.8
            transformers.append(
                ('account',
                 Pipeline([
                     ('getReferencePostingAccount',
                      ml.GetReferencePostingAccount()),
                     ('vect', CountVectorizer(ngram_range=(1, 3))),
                 ])))
            transformer_weights['account'] = 0.8

            # The payee feature is only added when more than one distinct
            # payee occurs in the training data.
            distinctPayees = set(
                map(lambda trx: trx.txn.payee, self.converted_training_data))
            if len(distinctPayees) > 1:
                transformers.append(
                    ('payee',
                     Pipeline([
                         ('getPayee', ml.GetPayee()),
                         ('vect', CountVectorizer(ngram_range=(1, 3))),
                     ])))
                transformer_weights['payee'] = 0.5

            transformers.append((
                'dayOfMonth',
                Pipeline([
                    ('getDayOfMonth', ml.GetDayOfMonth()),
                    ('caster',
                     ml.ArrayCaster()),  # need for issue with data shape
                ])))
            transformer_weights['dayOfMonth'] = 0.1

            self.pipeline = Pipeline([
                ('union',
                 FeatureUnion(transformer_list=transformers,
                              transformer_weights=transformer_weights)),
                ('svc', SVC(kernel='linear')),
            ])
            logger.debug("About to train the machine learning model...")
            self.pipeline.fit(
                self.converted_training_data,
                ml.GetPostingAccount().transform(self.converted_training_data))
            logger.info("Finished training the machine learning model.")
            self._trained = True

        if not self._trained:
            logger.warning(
                "Cannot generate predictions or suggestions "
                "because there is no trained machine learning model.")
            return self.imported_transactions

        # predict missing second postings
        self.transactions = self.imported_transactions
        if self.predict_second_posting:
            logger.debug(
                "About to generate predictions for missing second postings...")
            predicted_accounts: List[str]
            predicted_accounts = self.pipeline.predict(
                self.imported_transactions)
            self.transactions = [
                ml.add_posting_to_transaction(*t_a)
                for t_a in zip(self.transactions, predicted_accounts)
            ]
            logger.debug(
                "Finished adding predicted accounts to the transactions to be imported."
            )

        # suggest accounts that are likely involved in the transaction
        if self.suggest_accounts:
            # get values from the SVC decision function
            logger.debug(
                "About to generate suggestions about related accounts...")
            decision_values = self.pipeline.decision_function(
                self.imported_transactions)
            accounts = self.pipeline.classes_

            # With exactly two known accounts the decision function yields a
            # single signed value per sample (positive means the second
            # class) instead of one column per class. Expand it to one score
            # per class so the ranking below works for that case as well.
            if getattr(decision_values, 'ndim', 2) == 1:
                decision_values = [[-value, value]
                                   for value in decision_values]

            # add a human-readable class label (i.e., account name) to each value, and sort by value:
            suggestions = [[
                account for _, account in sorted(list(
                    zip(distance_values, accounts)),
                                                 key=lambda x: x[0],
                                                 reverse=True)
            ] for distance_values in decision_values]

            # add the suggested accounts to each transaction:
            self.transactions = [
                ml.add_suggested_accounts_to_transaction(*t_s)
                for t_s in zip(self.transactions, suggestions)
            ]
            logger.debug(
                "Finished adding suggested accounts to the transactions to be imported."
            )

        return self.transactions
def combinations_lgls(pcompa = False, differences = True, addition = False, multiplication = False, division = False):
    """Rank pairwise feature combinations by hold-out log loss.

    For every ordered pair of distinct columns ``feature1`` .. ``feature21``
    one derived feature is built, a logistic-regression pipeline is fitted on
    it, and its log loss on a 33 % hold-out split is recorded.

    Args:
        pcompa: when true, a PCA step is placed in front of the classifier.
        differences, addition, multiplication, division: choose how the two
            columns are combined. The first flag that is true, in this order,
            wins.

    Returns:
        List of ``(label, log_loss)`` tuples sorted by ascending log loss,
        limited to combinations that score below ``singular_lgls()``.

    Raises:
        ValueError: if none of the four operation flags is set (this
            previously surfaced as an unbound-local error inside the loop).
    """
    # Resolve the combining operation once instead of once per column pair.
    if differences:
        symbol, combine = "-", lambda a, b: a - b
    elif addition:
        symbol, combine = "+", lambda a, b: a + b
    elif multiplication:
        symbol, combine = "x", lambda a, b: a * b
    elif division:
        symbol, combine = "/", lambda a, b: a.div(b)
    else:
        raise ValueError(
            "one of differences, addition, multiplication or division "
            "must be true")

    X, _ = load_data(original=True)
    # .values is the long-standing equivalent of the deprecated as_matrix().
    training_target = X[['target']].values.flatten()

    ### INCLUDE ALL NOT JUST THESE 5 ###
    columns = ['feature%d' % x for x in range(1, 22)]

    features = []
    lgls = []
    for f in columns:
        for g in columns:
            if f == g:
                continue

            features.append(f + symbol + g)
            X_np = combine(X[f], X[g]).values

            # split training data into training and validation set
            X_train, X_Val, train_target, val_target = train_test_split(
                X_np, training_target, test_size=0.33, random_state=4)
            # The estimators expect 2-D input: one column holding the feature.
            X_train = np.reshape(X_train, (len(X_train), 1))
            X_Val = np.reshape(X_Val, (len(X_Val), 1))

            clfs = [LogisticRegression()]
            for clf in clfs:
                if pcompa:
                    pipeline = Pipeline([
                        ('pca', PCA(n_components=2, whiten=True)),
                        ('clf', clf),
                    ])
                else:
                    pipeline = Pipeline([('clf', clf)])

                # cross validation
                skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
                scores = []
                for train, test in skf:
                    pipeline.fit(X_train[train], train_target[train])
                    if hasattr(pipeline, 'predict_proba'):
                        score = log_loss(
                            train_target[test],
                            pipeline.predict_proba(X_train[test])[:, 1])
                    else:
                        score = log_loss(
                            train_target[test],
                            pipeline.decision_function(X_train[test]))
                    scores.append(score)

                # NOTE: the fold scores are not part of the ranking. The
                # hold-out loss comes from the pipeline as left by the last
                # fold's fit, so the fold loop must stay.
                lgls.append(
                    log_loss(val_target,
                             pipeline.predict_proba(X_Val)[:, 1]))

    combination_scores = sorted(zip(features, lgls), key=lambda x: x[1])
    single_f_average = singular_lgls()
    return [x for x in combination_scores if x[1] < single_f_average]
y_testt.append([0, 0, 0, 0, 1]) # In[84]: # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() # In[85]: y_testt = np.array(y_testt) # In[87]: y_score = text_clf.decision_function(X_test) # In[88]: for i in range(5): fpr[i], tpr[i], _ = roc_curve(y_testt[:, i], y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) # In[89]: # Compute micro-average ROC curve and ROC area fpr["micro"], tpr["micro"], _ = roc_curve(y_testt.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) n_classes = 5 # Compute macro-average ROC curve and ROC area
def CV_holdout(pcompa = False):
    """Cross-validate each classifier in ``clfs`` and report its hold-out log loss.

    Loads the data with ``load_data()``, removes the ``t_id`` and ``target``
    columns, holds out 33 % of the rows (no fixed seed), runs a 5-fold
    stratified cross-validation on the remainder and prints the results.
    Nothing is returned.

    pcompa: when True the pipeline is PCA + classifier, otherwise
        SelectKBest(chi2, k=20) + classifier.
    """
    #X, training_target, Y_test, Y_test_id = load_data()
    X, Y = load_data()
    # NOTE(review): test_id and Y_np are computed but not read again in this function.
    test_id = Y[['t_id']].as_matrix()
    test_id = test_id.flatten()
    Y = Y.drop( 't_id', axis = 1 )
    training_target = X[['target']].as_matrix()
    training_target = training_target.flatten()
    X = X.drop( 'target', axis = 1)
    X_np = X.as_matrix()
    Y_np = Y.as_matrix()
    # split training data into training and validation set
    X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33)
    #X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33, random_state=4)
    # feature selection
    select = SelectKBest(chi2, k=20)
    # dimensionality reduction ( PCA)
    pca = PCA(n_components=2, whiten=True)
    # randomized grid search???
    # Candidate classifiers; only LogisticRegression is active, the rest are
    # disabled alternatives.
    clfs = [
        LogisticRegression()]
        #xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
        #KNeighborsClassifier(n_neighbors=100),
        #RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
        #RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
        #RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
        #AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
        #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
        #ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
        #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
        #GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]
    for j, clf in enumerate(clfs):
        print j, clf.__class__.__name__
        # pipeline with feature selection, pca and classifier
        if pcompa==True:
            #pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
            pipeline = Pipeline([('pca', pca), ('clf', clf)])
        else:
            #pipeline = Pipeline([('clf', clf)])
            pipeline = Pipeline([('select', select), ('clf', clf)])
        # cross validation
        skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
        scores = []
        for k, (train, test) in enumerate(skf):
            pipeline.fit(X_train[train], train_target[train])
            if hasattr(pipeline, 'predict_proba'):
                score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
                # Show the first ten predictions next to the true labels.
                print pipeline.predict(X_train[test])[:10], train_target[test][:10]
            else:
                # NOTE(review): this feeds raw decision values to log_loss,
                # which expects probabilities; confirm before relying on it.
                score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))
            scores.append(score)
            #print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)
        # NOTE(review): labelled "accuracy", but the values are per-fold log losses.
        print 'CV accuracy: %.3f +/- %.3f ' %( np.mean(scores), np.std(scores))
        ## test on the hold out set
        # Uses the pipeline as fitted on the last fold's training indices.
        print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
class Annotator:
    """Interactive sample annotator backed by a web API, for Jupyter use.

    Constructor arguments:
        experiment: identifier of the experiment to annotate.
        printify: callable that turns a sample into the string shown to the
            user; defaults to ``str``.
        ipython: when true, ``annotate`` drives the Jupyter widget UI.
        access: backend selector. Only ``'web'`` is acted on; it goes through
            the API hooks under ``web_root``.
        web_root: base URL of the server exposing the ``annotator/...`` hooks.
        start_labels: initial set of possible labels, can be updated over time.
        annotator: name of the person annotating.
        shuffle: if true, samples are requested in random order.
        use_classifier: if true, a text classifier is trained on the labelled
            samples and used to order the suggested labels.
        close_choice_mode: if true, the choices are known in advance and come
            from ``choice_function``.
        choice_function: maps a sample to its list of displayed choices.

    All console messages are written with single-argument ``print(...)`` so
    the output is the same whether ``print`` is a statement or a function.
    """

    experiment = ""
    labels = []
    final_choice = ""
    current_query = None
    ipython = False
    annotator = ''
    # staticmethod keeps the default from being bound as an instance method,
    # which would pass ``self`` as the sample and fail with a TypeError.
    printify = staticmethod(lambda x: str(x))
    start_labels = []
    access = ""
    web_root = ""
    shuffle = True
    classifier = None
    use_classifier = False
    classifier_trained = False

    def __init__(self, experiment, printify=None, ipython=False, access='web',
                 web_root='', start_labels=None, annotator='', shuffle=True,
                 use_classifier=False, close_choice_mode=False,
                 choice_function=None):
        if printify is not None:
            self.printify = printify
        self.experiment = experiment
        self.ipython = ipython
        self.access = access
        self.web_root = web_root
        # A fresh list per instance: a shared default list (or the class-level
        # ``labels``) would leak labels from one annotator to the next.
        self.start_labels = [] if start_labels is None else start_labels
        self.labels = []
        self.annotator = annotator  # Name of the person annotating
        self.shuffle = shuffle
        self.use_classifier = use_classifier
        self.classifier_trained = False
        if self.use_classifier:
            self.classifier = Pipeline([
                ('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
            ])
        self.close_choice_mode = close_choice_mode
        self.choice_function = choice_function

    def clean_html(self, raw_html):
        """Return ``raw_html`` with every ``<...>`` tag removed."""
        cleanr = re.compile('<.*?>')
        return re.sub(cleanr, '', raw_html)

    def query2json(self, query):
        """GET ``web_root + query`` and return the decoded JSON body."""
        url = self.web_root + query
        # print url
        r = requests.get(url)
        return r.json()

    # Web api hooks

    def list_experiments_web(self):
        results = self.query2json("annotator/list_experiments")
        for d in results:
            print("Experiment: %s has %s samples" % (d['_id'], d['count']))

    def insert_samples_web(self, samples):
        R = requests.post(self.web_root + "annotator/insert_samples",
                          json=samples)
        print(R.text)

    def save_annotation_web(self):
        print("hellllo")
        self.query2json("annotator/save_annotation/" +
                        str(self.current_query['_id']) + "/label/" +
                        self.final_choice + "/annotator/" + self.annotator)

    def get_counts_web(self):
        return self.query2json("annotator/get_counts/" + self.experiment +
                               "/annotator/" + self.annotator)

    def load_example_web(self):
        return self.query2json("annotator/next_example/" + self.experiment +
                               "/annotator/" + self.annotator + "/shuffle/" +
                               str(1 if self.shuffle else 0))

    def reload_labels_web(self):
        self.labels = self.query2json("annotator/reload_labels/" +
                                      self.experiment)

    def detailed_statistics_web(self):
        return self.query2json("annotator/detailed_statistics/" +
                               self.experiment)

    def relabel_web(self, from_label, to_label):
        return self.query2json("annotator/relabel/" + self.experiment +
                               "/from/" + from_label + "/" + to_label +
                               "/annotator/" + self.annotator)

    def load_labeled_dataset_web(self):
        return self.query2json("annotator/load_training/" + self.experiment)

    def export_web(self):
        return self.query2json("annotator/export/" + self.experiment)

    # Main functionality

    def list_experiments(self):
        """Print the experiments known to the backend."""
        # ``self`` was missing from the signature, so calling this on an
        # instance always raised a TypeError.
        if self.access == 'web':
            self.list_experiments_web()

    def insert_samples(self, samples, pre_annotated=False):
        """Validate ``samples``, tag them with the experiment and upload them.

        Returns -1, -2 or -3 (after printing the reason) when a sample is not
        a dict, carries a ``label`` without ``pre_annotated``, or names a
        different experiment; otherwise returns None.
        """
        for sample in samples:
            if type(sample) is not dict:
                print("One sample or more is not a dict, which it should be. Please reformat")
                return -1
            if 'label' in sample and not pre_annotated:
                print("The key `label` should not be used in the samples")
                return -2
            if 'experiment' in sample and sample[
                    'experiment'] != self.experiment:
                print("The key `experiment` is inconsistent with the experiment ID")
                return -3
            sample['experiment'] = self.experiment
        if self.access == 'web':
            self.insert_samples_web(samples)

    def save_annotation(self):
        if self.access == 'web':
            self.save_annotation_web()

    def load_unannotated_example(self):
        if self.access == 'web':
            self.current_query = self.load_example_web()

    def reload_labels(self):
        if self.access == 'web':
            self.reload_labels_web()
        self.labels.extend(self.start_labels)
        self.labels = sorted(self.labels)

    def relabel(self, from_label, to_label):
        if self.access == 'web':
            self.relabel_web(from_label, to_label)

    def export(self):
        if self.access == 'web':
            return self.export_web()

    def finish(self):
        # Print a message once we are done
        print("Done annotating for now")

    def train_classifier(self):
        """Refit the label classifier once more than 20 labelled samples exist."""
        labeled_dataset = []
        if self.access == 'web':
            labeled_dataset = self.load_labeled_dataset_web()
        if len(labeled_dataset) > 20:
            labels = [d['label']['label'] for d in labeled_dataset]
            text = [self.clean_html(self.printify(d)) for d in labeled_dataset]
            cross_val_accuracy = np.mean(
                cross_val_score(self.classifier, text, y=labels,
                                scoring="accuracy", cv=5))
            print("Classifier retrained ( %s  samples). Cross val accuracy: %s" %
                  (len(labeled_dataset),
                   "{0:.2f}%".format(100.0 * cross_val_accuracy)))
            self.classifier.fit(text, y=labels)
            self.classifier_trained = True
        else:
            self.classifier_trained = False

    def get_ordered_labels(self):
        """Return ``(choices, preselected_choice)`` for the current sample.

        In closed-choice mode the choices come from ``choice_function``. With
        a trained classifier they are the known labels ordered by score and
        suffixed with ``" | Score: <pct>"``. Otherwise the plain label list is
        returned with its first entry preselected.
        """
        if self.close_choice_mode:
            first_choice = ""
            text_labels = self.choice_function(self.current_query)
            if len(text_labels) > 0:
                first_choice = text_labels[0]
            return text_labels, first_choice

        if self.use_classifier:
            if not self.classifier_trained:
                self.train_classifier()
            if self.classifier_trained:
                sample_text = self.clean_html(
                    self.printify(self.current_query))
                labels = self.classifier.named_steps['clf'].classes_
                scores = np.atleast_1d(
                    self.classifier.decision_function([sample_text])[0])
                if len(scores) == 1 and len(labels) == 2:
                    # Two labels: the classifier reports one signed margin
                    # (positive means the second label). Expand it to one
                    # score per label.
                    scores = np.array([-scores[0], scores[0]])
                zero_min = scores - np.min(scores)
                total = np.sum(zero_min)
                if total > 0:
                    normalized_scores = zero_min / total
                else:
                    # All scores equal: report 0 rather than dividing by zero.
                    normalized_scores = np.zeros(len(zero_min))
                lab2p = {
                    lab: score
                    for lab, score in zip(labels, normalized_scores)
                }
                sorted_labels = sorted(labels, key=lambda x: -lab2p[x]) + [
                    labs for labs in self.start_labels if labs not in labels
                ]
                text_labels = [
                    lab + " | Score: " +
                    "{0:.1f}".format(100.0 * lab2p.get(lab, 0.0))
                    for lab in sorted_labels
                ]
                return text_labels, text_labels[0]

        best_label = ""
        if len(self.labels) > 0:
            best_label = self.labels[0]
        return self.labels, best_label

    def load_example_ipython(self):
        """Fetch the next sample and render the annotation widgets."""
        self.load_unannotated_example()
        if self.current_query is None:
            self.finish()
            return None  # We stop here
        if not self.close_choice_mode:
            self.reload_labels()  # In case something new has been added...
        display(HTML(self.printify(self.current_query)))
        TextField = widgets.Text(value='', placeholder='New class label',
                                 disabled=False)
        TextField.observe(self.on_change_jupyter)
        text_labels, self.final_choice = self.get_ordered_labels()
        Radio = widgets.RadioButtons(options=text_labels,
                                     value=self.final_choice,
                                     description="", disabled=False)
        Radio.observe(self.on_change_jupyter)
        B = widgets.Button(description='Submit annotation')
        B.on_click(self.on_submit_jupyter)
        count_annotated, count_total = self.get_counts()
        if count_annotated % 10 == 0:
            self.classifier_trained = False  # Force retrain
        display(
            HTML("<div class='toDel'>" + str(count_annotated) + "/" +
                 str(count_total) + "</div>"))
        display(TextField)
        display(Radio)
        display(B)

    def cleanup_jupyter(self):
        display(
            HTML(
                "<div class='js_stuff'><script>$('#example, .js_stuff').parent().parent().remove(); $('#desc_rows, .output_area, .toDel').remove(); $('.widget-subarea').html('');</script></div>"
            ))

    def on_change_jupyter(self, change):
        if change['type'] == 'change' and change['name'] == 'value':
            self.final_choice = change['new']

    def on_submit_jupyter(self, change):
        # Save this annotation
        print("coucouchou")
        # Drop the " | Score: ..." suffix added for display.
        self.final_choice = self.final_choice.split("|")[0].strip()
        print(self.final_choice)
        self.save_annotation()
        self.cleanup_jupyter()
        self.load_example_ipython()

    def annotate(self):
        if self.ipython:
            self.load_example_ipython()

    # Analysis tools

    def get_counts(self):
        if self.access == 'web':
            return self.get_counts_web()
        return 0, 0

    def status(self):
        count_annotated, count_total = self.get_counts()
        return "[" + self.experiment + "] Total samples: " + str(
            count_total) + " | Annotated: " + str(count_annotated)

    def detailed_statistics(self):
        count_annotated, count_total = self.get_counts()
        for d in self.detailed_statistics_web():
            percentage = "{0:.2f}".format(100.0 * d['count'] / count_annotated)
            print("%s %s  /  %s  ( %s  %% )" %
                  (d['_id'], d['count'], count_annotated, percentage))
metrics.classification_report( Y_test, logistic_classifier.predict(X_test)))) print 'classes : ',classifier.classes_ print 'RBM and Logistic regression : ', classifier.predict(X_test) print 'Raw Logistic regression', logistic_classifier.predict(X_test) logistic_proba = logistic_classifier.predict_proba(X_test) print 'logistic_classifier decision function : \n',logistic_classifier.decision_function(X_test) print 'logistic_classifier predict_proba : \n', logistic_proba classifier_proba = classifier.predict_proba(X_test) print 'classifier decision function : \n',classifier.decision_function(X_test) print 'classifier decision predict_proba : \n',classifier_proba if classifier_proba[0][1] < 0.6: print 'classifier ___________ led is acting strange' print 'current value : ',led_status[end-start-1] print 'desired value : ',X[0][end-start-1] f = open('transmit_confirm.txt','w') f.write(str(1)) f.close() print 'set led to : ', X[0][end-start-1] f = open('set_led.txt','w') f.write(str(X[0][end-start-1]))
def cross_validation(data_x, data_y):
    """Run a 10-fold stratified cross-validation and report AUC / Brier loss.

    Each fold fits a RobustScaler -> PCA -> RobustScaler -> MLP pipeline on
    the resampled training indices, plots that fold's calibration curve and
    prints train/test AUC and Brier loss. After the last fold the calibration
    plot is saved to ``calibration.png`` and the mean and sample standard
    deviation of every metric are printed. Nothing is returned.

    Args:
        data_x: feature array, indexed with the integer index arrays from the
            splitter.
        data_y: label array, indexed the same way.
    """
    rs1 = RobustScaler()
    rs2 = RobustScaler()
    pca = PCA(n_components=10, svd_solver='full', whiten=False,
              random_state=42)
    # Alternative classifier, kept for reference (it has no predict_proba,
    # which is what the decision_function branch below is for):
    # clf = SVC(kernel='rbf', C=1e-8, gamma='auto', cache_size=1000,
    #           probability=False, class_weight='balanced',
    #           decision_function_shape='ovr', random_state=42)
    clf = MLPClassifier(solver='adam', activation='relu',
                        hidden_layer_sizes=(50, ), alpha=0.1, beta_1=0.9,
                        beta_2=0.999, epsilon=1e-4, learning_rate_init=1e-3,
                        max_iter=200, early_stopping=True,
                        validation_fraction=0.2, random_state=42)
    # No random_state: the splitter does not shuffle, so a seed has no effect
    # on the folds, and current scikit-learn rejects the combination.
    kfold = StratifiedKFold(n_splits=10)

    cv_test_auc = []
    cv_train_auc = []
    cv_test_brier = []
    cv_train_brier = []

    fig = plt.figure(figsize=(10, 10))
    plt.ylim([-0.05, 1.05])
    plt.plot([0, 1], [0, 1], 'k:', label='Perfect')

    for i, (train, test) in enumerate(kfold.split(data_x, data_y)):
        print('CV-Split {}'.format(i + 1))
        train = resample(train, data_y, oversample=False)
        model = Pipeline(steps=[('rs1', rs1), ('pca', pca), ('rs2', rs2),
                                ('clf', clf)])
        model.fit(data_x[train], data_y[train])

        # Get all your statistics here.
        # For example: AUC, Brier loss and the calibration curve.
        if hasattr(model, 'predict_proba'):
            p_test = model.predict_proba(data_x[test])[:, 1]
            p_train = model.predict_proba(data_x[train])[:, 1]
        else:
            # Models without probabilities: use the decision values after
            # passing them through norm().
            p_test = model.decision_function(data_x[test])
            p_train = model.decision_function(data_x[train])
            p_test = norm(p_test)
            p_train = norm(p_train)

        p_pos, s_mean = calibration_curve(data_y[test], p_test, n_bins=10)
        plt.plot(s_mean, p_pos, 's-', label='CV fold {}'.format(i + 1))

        test_auc = roc_auc_score(data_y[test], p_test)
        train_auc = roc_auc_score(data_y[train], p_train)
        test_brier = brier_score_loss(data_y[test], p_test)
        train_brier = brier_score_loss(data_y[train], p_train)

        print('###')
        print('Train AUC: {}'.format(train_auc))
        print('Test AUC: {}'.format(test_auc))
        print('Train Brier Loss: {}'.format(train_brier))
        print('Test Brier Loss: {}'.format(test_brier))
        print('###')

        cv_test_auc.append(test_auc)
        cv_train_auc.append(train_auc)
        cv_test_brier.append(test_brier)
        cv_train_brier.append(train_brier)

    plt.title('Calibration plot (reliability curve)')
    plt.ylabel('Fraction of positives')
    plt.xlabel('Mean predicted value')
    plt.legend(loc='best', ncol=2)
    plt.savefig('calibration.png')
    plt.close(fig)

    # Mean and sample standard deviation (ddof=1) across the folds.
    test_auc_stats = np.mean(cv_test_auc), np.std(cv_test_auc, ddof=1)
    train_auc_stats = np.mean(cv_train_auc), np.std(cv_train_auc, ddof=1)
    test_brier_stats = np.mean(cv_test_brier), np.std(cv_test_brier, ddof=1)
    train_brier_stats = np.mean(cv_train_brier), np.std(cv_train_brier,
                                                        ddof=1)

    print('###')
    print('CV Train AUC: {0[0]} ({0[1]})'.format(train_auc_stats))
    print('CV Test AUC: {0[0]} ({0[1]})'.format(test_auc_stats))
    print('CV Train Brier Loss: {0[0]} ({0[1]})'.format(train_brier_stats))
    print('CV Test Brier Loss: {0[0]} ({0[1]})'.format(test_brier_stats))
    print('###')
stop_words='english', ngram_range=(1, 2), max_df=1.0, max_features=100000 ) print "Create pipeline for vectorizer => classifier" vect_clf = Pipeline([('vect', marisa_uni_vect), ('clf', LinearSVC())]) print "Train Model" vect_clf = vect_clf.fit(train_resume_text, train_labels) print "Predict test samples" predicted_score = vect_clf.predict(test_resume_text) predicted_decision = vect_clf.decision_function(test_resume_text) # accuracy = np.mean(predicted_score == test_labels) # p = precision_score(test_labels, predicted_score, average='macro') # r = recall_score(test_labels, predicted_score, average='macro') # # print accuracy # print p # print r # print classification_report([t for t in test_labels], [p for p in predicted_score]) predicted = [] actual_vs_predicted = [] for i in range(len(test_labels)):
def train_and_evaluate(writer, train_data, dev_data, test_data):
    """Grid-search a TF-IDF + hinge-loss SGD classifier and score it on test data.

    One worker process is started per hyper-parameter combination; each is
    expected to put a ``(args, acc, f1)`` tuple on the shared queue.  The
    combination with the highest dev F1 is then retrained on train + dev and
    evaluated on the test set.

    Args:
        writer: Not used by this function; kept for interface compatibility.
        train_data: Training split, converted with ``_data2list``.
        dev_data: Development split, converted with ``_data2list``.
        test_data: Test split, converted with ``_data2list``.

    Returns:
        Tuple ``(acc, f1, roc, tpr)``: test accuracy, test F1, the area under
        the interpolated ROC curve, and the true-positive rates interpolated
        onto a 100-point false-positive-rate grid.  Recall and precision are
        only printed.
    """
    train_text, train_label = _data2list(train_data)
    dev_text, dev_label = _data2list(dev_data)
    test_text, test_label = _data2list(test_data)

    hyper_parms = {
        'ngram_range': [(1, 2), (1, 3), (1, 4)],
        'sublinear_tf': [True, False],
        'penalty': ['l2'],
        'alpha': [1e-4, 1e-5],
    }
    # Cartesian product of all value lists, re-keyed by parameter name.
    all_hyper_parms = it.product(*(hyper_parms[k] for k in hyper_parms))
    all_hyper_parms_dict = [
        dict(zip(hyper_parms, arg)) for arg in all_hyper_parms
    ]

    best_dev = 0
    best_args = None

    queue = multiprocessing.Queue()
    workers = []
    for cur_args in all_hyper_parms_dict:
        proc = multiprocessing.Process(
            target=_worker,
            args=(queue, cur_args, train_text, train_label,
                  dev_text, dev_label))
        proc.start()
        workers.append(proc)

    # Results arrive in completion order, one per started worker.
    for _ in all_hyper_parms_dict:
        cur_args, acc, f1 = queue.get()
        res_str = ''.join(
            key + ': ' + str(value) + '. ' for key, value in cur_args.items())
        score_str = 'Dev acc: %.4f, f1: %.4f' % (acc, f1)

        # Accept the first result unconditionally so best_args is never None,
        # even when every configuration scores an F1 of 0.
        if best_args is None or f1 > best_dev:
            best_dev = f1
            best_args = cur_args
            res_str += colored(score_str, 'red')
        else:
            res_str += score_str

        print(res_str)

    # Join only after the queue has been drained: joining a process that
    # still has unconsumed items buffered on a Queue can block.
    for proc in workers:
        proc.join()

    text_clf = Pipeline([
        ('vect', TfidfVectorizer(ngram_range=best_args['ngram_range'],
                                 sublinear_tf=best_args['sublinear_tf'])),
        ('clf', SGDClassifier(loss='hinge', penalty=best_args['penalty'],
                              max_iter=5, alpha=best_args['alpha'],
                              random_state=1)),
    ])

    # Retrain on train + dev with the selected hyper-parameters.
    # Assumes _data2list returns sequences that concatenate with `+` -- confirm.
    train_text = train_text + dev_text
    train_label = train_label + dev_label
    text_clf.fit(train_text, train_label)

    test_pred = text_clf.predict(test_text)
    acc, f1, recall, precision = _compute_score(
        y_pred=test_pred, y_true=test_label, num_classes=2)

    # ROC from the raw decision scores, resampled onto a fixed FPR grid.
    scores = text_clf.decision_function(test_text)
    fpr, tpr, thresholds = metrics.roc_curve(test_label, scores, pos_label=1)
    mean_fpr = np.linspace(0, 1, 100)
    # `interp` comes from module scope; presumably numpy/scipy linear
    # interpolation -- confirm against the file's imports.
    tpr = interp(mean_fpr, fpr, tpr)
    tpr[0] = 0.0  # force the curve to start at the origin
    roc = metrics.auc(mean_fpr, tpr)

    print("End of training")
    print("Best dev f1: %.4f" % best_dev)
    print(
        "Test acc: %.4f, f1: %.4f, recall: %.4f, precision: %.4f, roc: %.4f" %
        (acc, f1, recall, precision, roc))

    return acc, f1, roc, tpr
def _blend_classifiers():
    """Return fresh instances of the base classifiers in their fixed order.

    The index of each classifier in this list is embedded in the CSV file
    names that ``blend_clfs_CV`` writes and reads, so the order must stay
    stable.
    """
    return [
        LogisticRegression(),
        SVC(kernel='rbf', gamma=1.0, C=0.1, probability=True, verbose=True,
            random_state=1),
        xgb.XGBClassifier(objective='binary:logistic', max_depth=3,
                          n_estimators=300, learning_rate=0.05),
        KNeighborsClassifier(n_neighbors=100),
        RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1,
                               criterion='gini', random_state=1),
        RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1,
                               criterion='entropy', random_state=1),
        AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                           algorithm="SAMME", learning_rate=0.01,
                           n_estimators=50, random_state=1),
        ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1,
                             criterion='gini', random_state=1),
        ExtraTreesClassifier(n_estimators=100, max_depth=3,
                             min_samples_split=5, min_samples_leaf=5,
                             n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1,
                             criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.01, subsample=0.8,
                                   loss='exponential', max_depth=6,
                                   n_estimators=50),
    ]


def _append_layer1_predictions(X, Y_test, training_csv, test_csv):
    """Append saved predictions to the feature matrices as extra columns.

    Every column of ``training_csv`` is appended to ``X``; only the
    ``probability`` column of ``test_csv`` is appended to ``Y_test``.
    Returns the widened ``(X, Y_test)`` pair.
    """
    # `.values` is the equivalent of the removed `DataFrame.as_matrix()`.
    test_preds_np = pd.read_csv(test_csv)[["probability"]].values
    training_preds_np = pd.read_csv(training_csv).values
    X = np.concatenate((X, training_preds_np), axis=1)
    Y_test = np.concatenate((Y_test, test_preds_np), axis=1)
    return X, Y_test


def blend_clfs_CV(f_number = 80, pcompa = True, layer = 1, cycles=9):
    """Cross-validate each base classifier and save its predicted probabilities.

    For every classifier the function runs 5-fold stratified CV on the
    training split (printing the log loss per fold), then writes three files:
    training-set probabilities, test-set probabilities keyed by ``t_id``, and
    a text file with the validation log loss appended.

    Args:
        f_number: Passed to ``load_data`` and embedded in all file names.
        pcompa: When equal to ``True`` a 2-component whitened PCA step is
            placed in front of each classifier.
        layer: ``1`` uses the data returned by ``load_data`` as-is; ``2``
            first widens the features with the layer-1 prediction CSVs found
            in the working directory and re-splits train/validation.
        cycles: Number of ``CV_blended_False_*`` prediction files read in
            layer 2.

    Raises:
        ValueError: If ``layer`` is neither 1 nor 2.
    """
    if layer not in (1, 2):
        raise ValueError('layer must be 1 or 2, got %r' % (layer,))

    clfs = _blend_classifiers()
    feature_tag = '_feature_' + str(f_number)

    if layer == 1:
        X, X_target, X_train, X_Val, train_target, val_target, Y_test, Y_test_id = load_data(f_number=f_number)
    else:
        X, X_target, _, _, _, _, Y_test, Y_test_id = load_data(f_number=f_number)

        # Single blended keras prediction.
        X, Y_test = _append_layer1_predictions(
            X, Y_test,
            'CV_blended_True_training_layer_1_keras1' + feature_tag + '.csv',
            'CV_blended_True_layer_1_keras1' + feature_tag + '.csv')

        # One keras prediction file per cycle.
        for i in range(cycles):
            X, Y_test = _append_layer1_predictions(
                X, Y_test,
                'CV_blended_False_training_layer_1_keras1_' + str(i) + feature_tag + '.csv',
                'CV_blended_False_layer_1_keras1_' + str(i) + feature_tag + '.csv')

        # Layer-1 outputs of the base classifiers themselves.
        # NOTE(review): the test-prediction file read here carries a
        # '_pca_<pcompa>' suffix, but the test submission written at the end
        # of this function has no such suffix -- confirm the intended name.
        for i, c in enumerate(clfs):
            clf_tag = str(c.__class__.__name__) + str(i) + feature_tag + '_pca_' + str(pcompa)
            X, Y_test = _append_layer1_predictions(
                X, Y_test,
                'CV_training_layer_1_' + clf_tag + '.csv',
                'CV_layer_1_' + clf_tag + '.csv')

        X_train, X_Val, train_target, val_target = train_test_split(X, X_target, test_size=0.33, random_state=4)

    # Replace -inf entries with 0 in every matrix before fitting/predicting.
    # NOTE(review): applied for both layers here; the collapsed original does
    # not show whether this was limited to layer 2 -- confirm.
    X[X == -inf] = 0
    X_train[X_train == -inf] = 0
    X_Val[X_Val == -inf] = 0
    Y_test[Y_test == -inf] = 0

    pca = PCA(n_components=2, whiten=True)

    for j, clf in enumerate(clfs):
        print('%s %s' % (j, clf))

        if pcompa == True:
            pipeline = Pipeline([('pca', pca), ('clf', clf)])
        else:
            pipeline = Pipeline([('clf', clf)])

        # Stratified 5-fold CV on the training split, scored by log loss.
        skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
        scores = []
        for k, (train, test) in enumerate(skf):
            pipeline.fit(X_train[train], train_target[train])
            if hasattr(pipeline, 'predict_proba'):
                score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
            else:
                score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))
            scores.append(score)
            print('Fold: %s, Class dist: %s, Log loss: %.3f ' % (k+1, np.bincount(train_target[train]), score))

        print('CV accuracy: %.3f +/- %.3f ' % (np.mean(scores), np.std(scores)))

        # NOTE(review): the pipeline is not refit here, so everything below
        # uses the model fitted on the last fold's training subset.
        name_tag = str(clf.__class__.__name__) + str(j) + feature_tag

        # Probabilities for the full training matrix (input to the next layer).
        training_probs = pipeline.predict_proba(X)[:, 1]
        training_probs_df = pd.DataFrame(data=training_probs, columns=["probability"])
        training_submission = 'CV_training_layer_' + str(layer) + '_' + name_tag + '_pca_' + str(pcompa)
        training_probs_df.to_csv(training_submission + '.csv', index=False)

        # Hold-out validation score, computed once and reused below.
        val_loss = log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1])
        print('Log Loss: %.5f ' % val_loss)

        # Probabilities for the real test set, saved next to their ids.
        test_predictions = pipeline.predict_proba(Y_test)[:, 1]
        test_predictions_df = pd.DataFrame(data=test_predictions, columns=["probability"])
        Y_test_id.columns = ["t_id"]
        pred_submission = pd.concat((Y_test_id, test_predictions_df), axis = 1)
        submission = 'CV_layer_' + str(layer) + '_' + name_tag
        pred_submission.to_csv(submission + '.csv', index = False)

        # Append run statistics; the context manager closes the file even if
        # a write fails.
        with open(submission + '.txt', 'a') as submission_stats:
            submission_stats.write(str(clf) + '\n')
            submission_stats.write('pca = ' + str(pcompa) + '\n')
            submission_stats.write('Log Loss on Validation set: %.5f ' % val_loss + '\n')
            submission_stats.write(' ' + '\n')