from sklearn.base import BaseEstimator
from sklearn.kernel_approximation import SkewedChi2Sampler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


class SVMClassifier(BaseEstimator):
	def __init__(self, skewedness=3., n_components=85, C=100, rs=None):
		self.platt_params = []
		self.feature_map_fourier = SkewedChi2Sampler(skewedness=skewedness, n_components=n_components, random_state=rs)
		# random_state matters for LinearSVC and SVC when dual=True (the default).
		self.clf = Pipeline([('fp', self.feature_map_fourier),
							 ('svm', LinearSVC(C=C, random_state=rs, class_weight='balanced'))
							])

	def fit(self, X, y):
		self.clf.fit(X, y)
		return self

	def set_platt_params(self, X, y):
		# Use self.clf.decision_function() instead of self.clf.predict():
		# the former returns a continuous score, while the latter returns the class label directly.
		# Platt scaling transforms these scores into probabilities.
		# y_pred = self.clf.predict(X)
		y_pred = self.clf.decision_function(X)
		self.platt_params = SigmoidTrain(y_pred, y)

	def predict(self, X):
		return self.clf.predict(X)

	def predict_proba(self, X):
		# y_pred = self.clf.predict(X)
		y_pred = self.clf.decision_function(X)
		return SigmoidPredict(y_pred, self.platt_params)
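
SigmoidTrain and SigmoidPredict are not defined in the snippet above. A minimal sketch of Platt scaling, reusing LogisticRegression in place of Platt's Newton procedure; the names and signatures mirror the snippet's usage but are otherwise assumptions:

import numpy as np
from sklearn.linear_model import LogisticRegression

def SigmoidTrain(scores, y):
    # Fit p(y=1 | s) = 1 / (1 + exp(A*s + B)) on the decision scores.
    lr = LogisticRegression()
    lr.fit(np.asarray(scores).reshape(-1, 1), y)
    # LogisticRegression models 1 / (1 + exp(-(w*s + b))), so A = -w, B = -b.
    return -lr.coef_[0][0], -lr.intercept_[0]

def SigmoidPredict(scores, platt_params):
    A, B = platt_params
    return 1.0 / (1.0 + np.exp(A * np.asarray(scores) + B))
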
    def test_36_one_class_svm(self):
        print("\ntest 36 (One Class SVM)\n")
        detection_map = {
            'true': -1,
            'false': 1
        }
        df = pd.read_csv("nyoka/tests/train_ocsvm.csv")
        df_test = pd.read_csv("nyoka/tests/test_ocsvm.csv")
        features = df.columns
        model = OneClassSVM(nu=0.1)
        pipeline_obj = Pipeline([
            ("model", model)
        ])
        pipeline_obj.fit(df)
        file_name = 'test36sklearn.pmml'
        skl_to_pmml(pipeline_obj, features, '', file_name)
        model_pred = pipeline_obj.predict(df_test)
        model_scores = pipeline_obj.decision_function(df_test)
        model_name = self.adapa_utility.upload_to_zserver(file_name)
        z_predictions = self.adapa_utility.score_in_zserver(model_name, 'nyoka/tests/test_ocsvm.csv', 'ANOMALY')
        cnt = 0
        for idx, value in enumerate(z_predictions):
            score, is_anomaly = value.split(",")
            score = float(score)
            if "{:.6f}".format(score) != "{:.6f}".format(model_scores[idx]) or model_pred[idx] != detection_map[is_anomaly]:
                cnt += 1
        self.assertEqual(cnt, 0)
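
The detection_map above relies on OneClassSVM's convention that predict returns -1 for anomalies and +1 for inliers, with decision_function negative for predicted anomalies. A quick standalone check on synthetic data (not part of the test suite):

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))
ocsvm = OneClassSVM(nu=0.1).fit(X)
pred = ocsvm.predict(X)              # -1 = anomaly, +1 = inlier
scores = ocsvm.decision_function(X)
# negative decision scores correspond to predicted anomalies
print(((scores < 0) == (pred == -1)).all())  # True (barring scores exactly at 0)
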
Example #3
def run_ngram_baseline(train_debates, test_debates):

    train_list = []
    for train_debate in train_debates:
        df = pd.read_csv(train_debate, index_col=None, header=None, names=_COL_NAMES, sep='\t')
        train_list.append(df)
    train_df = pd.concat(train_list)

    test_list = []
    for test_debate in test_debates:
        df = pd.read_csv(test_debate, index_col=None, header=None, names=_COL_NAMES, sep='\t')
        test_list.append(df)
    test_df = pd.concat(test_list)

    pipeline = Pipeline([
        ('ngrams', TfidfVectorizer(ngram_range=(1, 1))),
        ('clf', SVC(C=1, gamma=0.75, kernel='rbf', random_state=0))
    ])
    pipeline.fit(train_df['text'], train_df['label'])
    for test_debate in test_debates:
        test_df = pd.read_csv(test_debate, names=_COL_NAMES, sep='\t')
        results_fpath = join(ROOT_DIR, 'baselines/data/task5_ngram_baseline_%s'%(os.path.basename(test_debate)))
        with open(results_fpath, "w") as results_file:
            predicted_distance = pipeline.decision_function(test_df['text'])
            for line_num, dist in zip(test_df['line_number'], predicted_distance):
                results_file.write("{}\t{}\n".format(line_num, dist))
Example #4
class SupportVectorMachineClassifier(object):
    def __init__(self):
        self.svc = Pipeline([
            ('scaling', StandardScaler()),
            ('classification', LinearSVC(loss='hinge')),
        ])

    def train(self, x_train, y_train):
        print("\nStarting to train vehicle detection classifier.")
        start = time.time()
        self.svc.fit(x_train, y_train)
        print("Completed training in {:5f} seconds.\n".format(time.time() -
                                                              start))

    def score(self, x_test, y_test):
        print("Testing accuracy:")
        scores = self.svc.score(x_test, y_test)
        print("Accuracy: {:.3f}%".format(scores * 100))
        return scores

    def predict(self, feature):
        return self.svc.predict(feature)

    def decision_function(self, feature):
        return self.svc.decision_function(feature)
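
For reference, a minimal usage sketch of this wrapper; the feature vectors here are random stand-ins for the HOG-style features a vehicle-detection pipeline would supply:

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.rand(500, 32)                  # stand-in feature vectors
y = (X[:, 0] > 0.5).astype(int)        # stand-in labels
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = SupportVectorMachineClassifier()
clf.train(x_train, y_train)
clf.score(x_test, y_test)
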
Example #5
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver="randomized", whiten=True)
    clf = SVC(probability=True, random_state=0, decision_function_shape="ovr")

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert predict.shape == (n_samples, )

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)
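
The shape assertion on decision_function holds because SVC is constructed with decision_function_shape="ovr", which yields one column per class; with "ovo" the output has n_classes * (n_classes - 1) / 2 pairwise columns instead. A small illustration on a 4-class toy problem (parameters chosen only to satisfy make_classification's constraints):

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X4, y4 = make_classification(n_samples=200, n_classes=4, n_informative=6, random_state=0)
ovr = SVC(decision_function_shape="ovr").fit(X4, y4)
ovo = SVC(decision_function_shape="ovo").fit(X4, y4)
print(ovr.decision_function(X4).shape)  # (200, 4): one column per class
print(ovo.decision_function(X4).shape)  # (200, 6): pairwise columns
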
Example #6
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = RandomizedPCA(n_components=2, whiten=True)
    clf = SVC(probability=True, random_state=0)

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples,))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
Example #7
class Ngram(Model):
    """
    Ngram baseline model.
    """
    def __init__(self, name, preprocessor):
        super().__init__(name, preprocessor)
        self.name = name
        self.pipeline = None

    def fit(self, dataframe):
        """
        Trains on given labeled data.
        """
        self.pipeline = Pipeline([
            ('ngrams', TfidfVectorizer(ngram_range=(1, 1))),
            ('clf', SVC(C=1, gamma=0.75, kernel='rbf', random_state=0))
        ])
        self.pipeline.fit(dataframe[KEY_TEXT], dataframe[KEY_CHECK_WORTHINESS])

    def run(self, dataframe):
        """
        Model is fed inputs, writing outputs in the result file.
        """
        results_fpath = self.get_result_path()
        with open(results_fpath, "w") as results_file:
            predicted_distance = self.pipeline.decision_function(
                dataframe[KEY_TEXT])
            for i, line in dataframe.iterrows():
                dist = predicted_distance[i]
                results_file.write("{}\t{}\t{}\t{}\n".format(
                    line[KEY_TOPIC_ID], line[KEY_TWEET_ID], dist, "ngram"))
Example #8
def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver='randomized', whiten=True)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples, ))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
Example #9
def generate_test_performance_data(train_tile="b278", test_tiles=["b234", "b261", "b360"]):
    """
    Train a classifier and calculate its performance on the test tiles using
    the hyperparameters estimated in this section.
    """
    # RF
    X, y = CARPYNCHO.retrieve_tile(train_tile)
    clf = RandomForestClassifier(n_estimators=400, criterion="entropy", min_samples_leaf=2, max_features="sqrt", n_jobs=7)
    clf.fit(X, y)

    # SVM
    clf2 = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LinearSVC(verbose=3, max_iter=100000, C=get_optimal_parameters_i("svml")["C"], dual=False))])

    clf2.fit(X, y)

    # SVM-K
    nystroem_approx_svm = Pipeline(
        [("scaler", StandardScaler()),
         ("feature_map", Nystroem(n_components=300, gamma=get_optimal_parameters_i("svml")["gamma"])),
         ("svm", LinearSVC(dual=False, max_iter=100000, C=get_optimal_parameters_i("svmK")["C"]))])

    nystroem_approx_svm.fit(X, y)
        
    for test in test_tiles:
        Xtest, ytest = CARPYNCHO.retrieve_tile(test)
        curves = {}

        # RF
        test_predictions = clf.predict_proba(Xtest)[:, 1]
        precision, recall, thresh = metrics.precision_recall_curve(ytest, test_predictions)
        curves["rf"] = (precision, recall)

        # SVM-L
        test_predictions = clf2.decision_function(Xtest)
        precision, recall, thresh = metrics.precision_recall_curve(ytest, test_predictions)
        curves["svml"] = (precision, recall)

        # SVM-K
        test_predictions = nystroem_approx_svm.decision_function(Xtest)
        precision, recall, thresh = metrics.precision_recall_curve(ytest, test_predictions)
        curves["svmk"] = (precision, recall)

        with open(EXPERIMENTS_OUTPUT_FOLDER_MS + '/optimize_hyperparameters/test_results_train=' + train_tile + "Test=" + test + ".pkl", 'wb') as output:
            pickle.dump(curves, output, pickle.HIGHEST_PROTOCOL)
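
The SVM-K pipeline above stands in for a kernelized SVM: Nystroem approximates the RBF feature map so a LinearSVC can be trained in the mapped space. The same pattern in compact form on toy data (parameter values illustrative only):

from sklearn.datasets import make_classification
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, random_state=0)
approx_svm = Pipeline([
    ("scaler", StandardScaler()),
    ("feature_map", Nystroem(n_components=100, gamma=0.1, random_state=0)),
    ("svm", LinearSVC(dual=False)),
]).fit(X, y)
print(approx_svm.decision_function(X[:5]))
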
Example #10
def anova_svm(df_in, ss_label, k_type, ts, f_n, svm_c, title_n, out_path):
    x = df_in.copy()
    x = x.values
    f_list = df_in.columns.tolist()
    fn = f_n if len(f_list) > f_n else 'all'
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        ss_label,
                                                        test_size=ts,
                                                        random_state=1)

    anova_filter = SelectKBest(f_regression)
    clf = svm.SVC(kernel=k_type, probability=True)
    an_sv = Pipeline([('anova', anova_filter), ('svc', clf)])
    an_sv.set_params(anova__k=fn, svc__C=svm_c).fit(x_train, y_train)
    y_score = an_sv.decision_function(x_test)
    average_precision = average_precision_score(y_test, y_score)
    precision, recall, _ = precision_recall_curve(y_test, y_score)

    mask = an_sv.named_steps.anova.get_support()
    m_mir_list = df_in.columns[mask]
    f_list = ','.join(m_mir_list)
    tmp1 = '{}\t{}'.format(len(m_mir_list), f_list)

    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('{} Precision-Recall curve: AP={:0.2f}'.format(
        title_n, average_precision))
    fn = 'PR_' + title_n.replace(' ', '_').replace('(', '').replace(')', '')
    plt.savefig(os.path.join(out_path, fn))
    plt.gcf().clear()

    y_pre = an_sv.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pre)
    roc_auc = auc(x=fpr, y=tpr)
    lf = title_n.split('_', 1)
    tmp2 = '{}\t{}\t{:.2f}'.format(lf[0], lf[1], roc_auc)
    plt.plot(fpr,
             tpr,
             color='b',
             linestyle='-',
             label='{} (auc = {:.2f})'.format(title_n, roc_auc))
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)
    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.title(title_n)
    plt.grid()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    fn = 'ROC_' + title_n.replace(' ', '_').replace('(', '').replace(')', '')
    plt.savefig(os.path.join(out_path, fn))
    plt.gcf().clear()
    return tmp1, tmp2
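
The named_steps.anova.get_support() call above returns a boolean mask over the input columns, which is what lets the function recover the names of the selected features. A compact illustration on toy data:

import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression

rng = np.random.RandomState(1)
X = rng.rand(50, 10)
y = X[:, 3] + 0.1 * rng.rand(50)
sel = SelectKBest(f_regression, k=2).fit(X, y)
print(sel.get_support())                   # boolean mask, True for kept columns
print(np.flatnonzero(sel.get_support()))   # indices of the selected features
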
Example #11
def train(X_train, X_test, y_train, y_test, vect, tfidf, clf):
    text_clf = Pipeline([('vect', vect), ('tfidf', tfidf), ('clf', clf)])
    text_clf.fit(X_train, y_train)
    y_score = text_clf.decision_function(X_test)
    print(text_clf.classes_)
    for i, j in enumerate(text_clf.classes_):
        print(j)
        y_score_one = [l[i] for l in y_score]
        print(precision_recall_curve(y_test, y_score_one, pos_label=j))
Example #12
def svm_svc_rbf(data_train, data_test, label_train, label_test):

    svm_clf_rbf = Pipeline([("scaler", StandardScaler()),
                            ("svm_clf",
                             SVC(kernel="rbf",
                                 gamma=GAMMA,
                                 C=C,
                                 random_state=6))])

    svm_clf_rbf.fit(data_train, label_train)

    pred_test = svm_clf_rbf.decision_function(data_test)

    return label_test, pred_test
Example #13
def build_iforest_housing(iforest, name, **pmml_options):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    outlier = DataFrame(pipeline.predict(housing_X) == -1, columns=["outlier"]) \
        .replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1), name)
Example #14
def build_ocsvm_housing(svm, name):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("scaler", MaxAbsScaler()),
                         ("estimator", svm)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    store_pkl(pipeline, name)
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    outlier = DataFrame(pipeline.predict(housing_X) <= 0, columns=["outlier"]) \
        .replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1), name)
Example #15
def get_pos_distribution_features(X_test,
                                  y_test,
                                  distrofile,
                                  model_name="cross_genre",
                                  output_type="probs"):
    with open(distrofile, "r", encoding="utf-8") as infile:
        data = infile.readlines()
        X_train = [item.split(",")[0] for item in data]
        y_train = [item.split(",")[1].strip() for item in data]
        #dataset_items = infile_dataset.readlines()

    conversion = {
        "hi": "new-delhi",
        "nl": "the-netherlands",
        "es": "spain",
        "pt": "portugal",
        "pl": "poland",
        "de": "germany",
        "ru": "russia",
        "fa": "iran",
        "it": "italy"
    }
    y_train = [conversion[item] for item in y_train]
    clf = svm.SVC(kernel="linear", probability=True)
    vect = TfidfVectorizer(ngram_range=(1, 3))
    pipeline = Pipeline([("vect", vect), ("clf", clf)])
    pipeline.fit(X_train, y_train)
    ## to do: pickle model
    predictions = pipeline.predict(X_test)
    print("Length", len(X_test))
    if output_type == "probs":
        probabilities = pipeline.predict_proba(X_test)
    else:
        probabilities = pipeline.decision_function(X_test)
    acc = accuracy_score(y_test, predictions)

    print("\n" + str(acc))
    with open(
            f"{model_name}_{output_type}_distributional_probability_features_pos.csv",
            "w",
            encoding="utf-8") as outfile:

        for ix, instance in enumerate(probabilities):
            instance_items = ",".join([str(item) for item in instance])
            #print(instance_items)
            outfile.write(instance_items + "," + y_test[ix] + "\n")
Example #16
def outlierDetection(X, features, N):
    clf = Pipeline(steps=[('imputer', impute.SimpleImputer()),
                          ('estimator', IsolationForest(behaviour='new', contamination='auto'))])
    clf.fit(X)
    outliers = clf.decision_function(X)
    df = pd.DataFrame(X, columns=features)
    originalFeatures = df.keys()
    normalized_df = (df - df.mean()) / df.std()
    normalized_df.plot(kind="box", grid=False, figsize=(16, 9), rot=45)
    #plotCombinations = combinations(df.keys(), 2)
    dfo = pd.DataFrame({"outlier": outliers})
    df = df.join(dfo)
    df = df.sort_values(by=['outlier'])
    cm = sns.light_palette("red", as_cmap=True, reverse=True)
    return (df[:N].style
            .background_gradient(subset=['outlier'], cmap=cm)
            .apply(subset=originalFeatures, func=highlight_1D_Outliers))
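
The ascending sort on the 'outlier' column puts the most anomalous rows first because IsolationForest.decision_function assigns lower (more negative) scores to more anomalous points. A minimal sketch of that convention on synthetic data (the behaviour argument is omitted here, as it was removed in later scikit-learn releases):

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X = np.vstack([rng.normal(size=(95, 2)), rng.normal(6, 1, size=(5, 2))])
iso = IsolationForest(contamination='auto', random_state=42).fit(X)
scores = iso.decision_function(X)
print(np.argsort(scores)[:5])  # lowest-scoring (most anomalous) samples first
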
Example #17
def run_ngram_baseline(train_fpath, test_fpath):
    train_df = pd.read_csv(train_fpath, sep='\t')
    test_df = pd.read_csv(test_fpath, sep='\t')

    pipeline = Pipeline([('ngrams', TfidfVectorizer(ngram_range=(1, 1))),
                         ('clf',
                          SVC(C=1, gamma=0.75, kernel='rbf', random_state=0))])
    pipeline.fit(train_df['tweet_text'], train_df['claim_worthiness'])

    results_fpath = join(
        ROOT_DIR, 'baselines/data/task1_ngram_baseline_%s' %
        (os.path.basename(test_fpath)))
    with open(results_fpath, "w") as results_file:
        predicted_distance = pipeline.decision_function(test_df['tweet_text'])
        for i, line in test_df.iterrows():
            dist = predicted_distance[i]
            results_file.write("{}\t{}\t{}\t{}\n".format(
                line['topic_id'], line['tweet_id'], dist, "ngram"))
Example #18
def linear_svm(X_train, y_train, X_test, y_test):
    lin_svm = Pipeline([("scaler", StandardScaler()), ("linear_svc", LinearSVC(C=1, max_iter=100000, loss="hinge"))])
    # fitting the model & keeping the SMOTE train sample
    lin_svm.fit(X_train, y_train)
    # getting predictions
    train_pred = lin_svm.predict(X_train)
    test_pred = lin_svm.predict(X_test)
    print("\nRecall for test set is: {0}".format(recall_score(y_test, test_pred)))
    # getting scores
    y_score = lin_svm.decision_function(X_test)
    # Plotting roc curve
    fpr, tpr, thresh = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    print("\nAUC is :{0}".format(round(roc_auc, 2)))
    print('\nConfusion Matrix')
    print('----------------')
    display(pd.crosstab(y_test.ravel(), test_pred, rownames=['True'], colnames=['Predicted'], margins=True))
    
    return fpr, tpr, thresh, y_score
Example #19
    def test_13_linearsvc(self):
        print("\ntest 13 (LinearSVC with preprocessing) [multi-class]\n")
        X, X_test, y, features, target, test_file = self.data_utility.get_data_for_multi_class_classification()

        model = LinearSVC()
        pipeline_obj = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model)
        ])
        pipeline_obj.fit(X, y)
        file_name = 'test13sklearn.pmml'

        skl_to_pmml(pipeline_obj, features, target, file_name)
        model_name = self.adapa_utility.upload_to_zserver(file_name)
        predictions, probabilities = self.adapa_utility.score_in_zserver(model_name, test_file)
        model_pred = pipeline_obj.predict(X_test)
        model_prob = pipeline_obj.decision_function(X_test)
        self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
        self.assertEqual(self.adapa_utility.compare_probability(probabilities, model_prob), True)
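
Note that LinearSVC exposes no predict_proba, so the "probabilities" compared here are raw decision-function scores. A quick check on toy data:

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X3, y3 = make_classification(n_samples=100, n_classes=3, n_informative=4, random_state=0)
clf = LinearSVC().fit(X3, y3)
print(hasattr(clf, "predict_proba"))        # False
print(clf.decision_function(X3[:2]).shape)  # (2, 3): one score per class
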
Example #20
def svm_svc_poly(data_train, data_test, label_train, label_test):

    print("hyper")
    print(DEGREE)
    print(COEF0)
    print(C)

    svm_clf_poly = Pipeline([("scaler", StandardScaler()),
                             ("svm_clf",
                              SVC(kernel="poly",
                                  degree=DEGREE,
                                  coef0=COEF0,
                                  C=C,
                                  random_state=6))])

    start_time = time.time()
    svm_clf_poly.fit(data_train, label_train)
    print("--- %s minutess ---" % ((time.time() - start_time) / 60.0))
    pred_test = svm_clf_poly.decision_function(data_test)

    return label_test, pred_test
Example #21
def run_svm_decision_distance(test, train, agreement=1):
    """
    :param test:
    :param train:
    :param agreement:
    :return:
    """
    from sklearn.pipeline import Pipeline
    svc = Pipeline([("svm",
                     SVC(class_weight='balanced',
                         kernel='rbf',
                         C=0.7,
                         gamma=0.001,
                         random_state=0))])

    features = get_experimential_pipeline(train)
    X_train = features.fit_transform(train)

    y = [1 if sent.label >= agreement else 0 for sent in train]
    X_train, y = balance(X_train, y)

    print("Start training SVM.")
    svc.fit(X_train, y)
    print("Finished training SVM.")
    X = features.fit_transform(test)

    y_pred_proba = svc.decision_function(X)
    # decision_function returns a 1-D array; MinMaxScaler expects a 2-D input
    y_pred_proba = MinMaxScaler().fit_transform(y_pred_proba.reshape(-1, 1)).ravel().tolist()

    y_pred = svc.predict(X)

    for sent, prob, pred_label in zip(test, y_pred_proba, y_pred):
        sent.pred = prob
        sent.pred_label = pred_label

    y_true = [1 if s.label >= agreement else 0 for s in test]

    print(average_precision_score(y_true, y_pred_proba))
    return test
Example #22
def singular_lgls(pcompa=False):
    #X, training_target, Y_test, Y_test_id = load_data()
    X, Y = load_data(original=True)
    
    test_id = Y[['t_id']].as_matrix()
    test_id = test_id.flatten()
   
    training_target = X[['target']].as_matrix()
    training_target = training_target.flatten()
    
    features = []
    lgls = []

    for i in X.columns:
        if str(i) == 'target':
            pass
        else:
            #print "Feature %s " %(str(i))
            features.append(str(i))
            feature_X = X[str(i)]
            feature_Y = Y[str(i)]
            X_np = feature_X.as_matrix() 
            Y_np = feature_Y.as_matrix() 

        # split training data into training and validation set
        X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33, random_state=4)
        X_train = np.reshape(X_train, (len(X_train), 1))
        X_Val = np.reshape(X_Val, (len(X_Val), 1))
        np.reshape(train_target, (len(train_target), 1))
        np.reshape(val_target, (len(val_target), 1))

        # feature selection
        select = SelectKBest(chi2, k=20)

        # dimensionality reduction ( PCA)
        pca = PCA(n_components=2, whiten=True)

        # randomized grid search???

        clfs = [
                LogisticRegression()]
                #xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
                #KNeighborsClassifier(n_neighbors=100),
                #RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
                #RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
                #RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
                #AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
                #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
                #ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
                #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
                #GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]

        for j, clf in enumerate(clfs):
            #print j, clf.__class__.__name__
            # pipeline with feature selection, pca and classifier
            if pcompa==True:
                #pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
                pipeline = Pipeline([('pca', pca), ('clf', clf)])
            else:
                pipeline = Pipeline([('clf', clf)])
                #pipeline = Pipeline([('select', select), ('clf', clf)])

            # cross validation
            skf = StratifiedKFold(train_target, n_folds=5, random_state=1)

            scores = []

            for k, (train, test) in enumerate(skf):
                pipeline.fit(X_train[train], train_target[train])
                if hasattr(pipeline, 'predict_proba'):
                    score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
                else:
                    score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))

                scores.append(score)

                #print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)

            #print 'CV accuracy: %.3f +/- %.3f ' %(
            #                    np.mean(scores), np.std(scores))

            ## test on the hold out set
            #print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
            lgls.append(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))

            ## Learning curves
            #train_sizes, train_scores, test_scores = \
            #        learning_curve(estimator=pipeline,
            #                       X=X_train,
            #                       y=train_target,
            #                       train_sizes=np.linspace(.1, 1.0, 5),
            #                       cv=5,
            #                       scoring='log_loss',
            #                       n_jobs=1)

            #train_mean = np.mean(train_scores, axis=1)
            #train_std = np.std(train_scores, axis=1)

            #test_mean = np.mean(test_scores, axis=1)
            #test_std = np.std(test_scores, axis=1)
    #print sorted(zip(features, lgls), reverse=False, key=lambda x: x[1])[:5]
    print(sorted(zip(features, lgls), reverse=False, key=lambda x: x[1]))
    print("Average logloss per feature: ", np.mean(lgls))
    return np.mean(lgls)
Example #23
                    else:
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('cls', cls)])

                    pipeline.fit(dt_train_bucket, train_y)
                    offline_time_fit += time.time() - start_offline_time_fit

                    # predict separately for each prefix case
                    preds = []
                    test_all_grouped = dt_test_bucket.groupby(
                        dataset_manager.case_id_col)
                    for _, group in test_all_grouped:
                        start = time.time()
                        _ = bucketer.predict(group)
                        if cls_method == "svm":
                            pred = pipeline.decision_function(group)
                        else:
                            preds_pos_label_idx = np.where(
                                cls.classes_ == 1)[0][0]
                            pred = pipeline.predict_proba(
                                group)[:, preds_pos_label_idx]

                        pipeline_pred_time = time.time() - start
                        current_online_event_times.append(pipeline_pred_time /
                                                          len(group))
                        preds.extend(pred)

            preds_all.extend(preds)
            test_y_all.extend(test_y)

        offline_total_time = offline_time_bucket + offline_time_fit + train_prefix_generation_time
Example #24
    steps = [('scaler', scaler), ('red_dim', pca), ('clf', svm)]

    pipeline = Pipeline(steps)

    summary = pipeline.named_steps

    pipeline.fit(X_train, train_labels_encoded)

    score_train = pipeline.score(X_train, train_labels_encoded)
    tot_train_score.append(score_train)

    score_test = pipeline.score(X_test, test_labels_encoded)
    tot_test_score.append(score_test)

    y_scores = pipeline.decision_function(X_test)

    auc = roc_auc_score(test_labels_encoded, y_scores)

    tot_auc.append(auc)

    y_pred = pipeline.predict(X_test)

    report = classification_report(test_labels_encoded,
                                   y_pred,
                                   output_dict=True)
    df_r = pd.DataFrame(report)
    df_r = df_r.transpose()
    #df_r.to_csv(f'/home/users/ubaldi/TESI_PA/result_CV/report_{name}/report_{i}')

    outname = f'report_{i}.csv'
Example #25
      ('feature_selection', SelectKBest(f_regression, k=1000)),
      #('reduce_dims',PCA()),
      ('mnb', MultinomialNB())
        ])
clf.fit(X_train, y_train)

train_time = time() - t0
print("train time: %0.3fs" % train_time)

t0 = time()
pred = clf.predict(X_test)
try:
    pred_prob = clf.predict_proba(X_test)
except AttributeError:
    try:
        dec_f = clf.decision_function(X_test)
        pred_prob = np.exp(dec_f) / np.sum(np.exp(dec_f))
    except AttributeError:
        pred_prob = LabelBinarizer().fit_transform(pred.tolist())

test_time = time() - t0
print("test time:  %0.3fs" % test_time)

score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

y_test_prob = LabelBinarizer().fit_transform(y_test)
log_loss = metrics.log_loss(y_test_prob, pred_prob)
print("log_loss:   %0.3f" % log_loss)

if hasattr(clf, 'coef_'):
Example #26
def main():
    load_training_data()  # Loads the training data from the json into the
    # dict

    # Converting that dictionary into a list where the content is only the body
    # of the posts.
    comments_in_list = all_key_val_to_list(TRAIN_JSON_DICTS, "body")

    # Making a list of words out of those comments.
    words_in_list = make_words(comments_in_list)

    # initializing the pipeline for the SVC
    text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words="english")),
                             ('tfdif', TfidfTransformer()),
                             ('svc',
                              sk.SVC(kernel="poly",
                                     cache_size=2048,
                                     degree=5,
                                     max_iter=10000,
                                     gamma=1e-7,
                                     C=65))])

    # Fitting the training data.
    _ = text_clf_svm.fit(words_in_list[0], words_in_list[1])

    # Making the list of tags for predictions/testing purposes.
    tag_for_all = [1] * len(comments_in_list[0][0]) + [-1] * \
        len(comments_in_list[1][0])

    title = "Reddit Classifier - SVM Learning Curve"

    # Generating the learning curve data.
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    all_comments = comments_in_list[0][0] + comments_in_list[1][0]
    tag_for_all = [1] * len(comments_in_list[0][0]) + [-1] * \
        len(comments_in_list[1][0])
    plot_learning_curve(text_clf_svm, title, all_comments, tag_for_all, cv=cv)

    score = text_clf_svm.decision_function(comments_in_list[0][0] +
                                           comments_in_list[1][0])

    # Loading the test data up into the correct dictionary
    load_test_data()

    # Repeating the same steps as before but for the testing dictionary
    comments_in_list = all_key_val_to_list(TEST_JSON_DICTS, "body")

    all_comments = comments_in_list[0][0] + comments_in_list[1][0]
    tag_for_all = [1] * len(comments_in_list[0][0]) + [-1] * \
        len(comments_in_list[1][0])

    # Getting the decision function for testing with this data.
    score = text_clf_svm.decision_function(comments_in_list[0][0] +
                                           comments_in_list[1][0])

    # Making the precision-recall graph.
    precision_recall(score, tag_for_all, "SVM Precision Recall")

    # This is the pipeline for the Random Forests.
    text_clf_rf = Pipeline([('vect', CountVectorizer(stop_words="english")),
                            ('tfdif', TfidfTransformer()),
                            ('tree',
                             RandomForestClassifier(n_jobs=-1,
                                                    criterion="entropy",
                                                    n_estimators=55,
                                                    min_samples_split=10,
                                                    max_depth=400))])

    # Redundant calls but in here for sanity.
    load_training_data()

    # Loading up the values just like before.
    comments_in_list = all_key_val_to_list(TRAIN_JSON_DICTS, "body")

    # Making the shuffle split for the learning curve.
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    _ = text_clf_rf.fit(words_in_list[0], words_in_list[1])
    predicted_rf = text_clf_rf.predict(comments_in_list[0][0] +
                                       comments_in_list[1][0])

    # Setting up input for the learning curve and calling to it.
    title = "Reddit Classifier - Random Forest Learning Curve"
    all_comments = comments_in_list[0][0] + comments_in_list[1][0]
    tag_for_all = [1] * len(comments_in_list[0][0]) + [-1] * \
        len(comments_in_list[1][0])
    plot_learning_curve(text_clf_rf, title, all_comments, tag_for_all, cv=cv)

    # Getting the testing data loaded back up for the precision-recall tests.
    comments_in_list = all_key_val_to_list(TEST_JSON_DICTS, "body")

    # Since random forests don't expose a decision function, the easiest way to
    # get precision and recall data is the classification report, so this is
    # all the setup for that.
    predicted_rf = text_clf_rf.predict(comments_in_list[0][0] +
                                       comments_in_list[1][0])
    all_comments = comments_in_list[0][0] + comments_in_list[1][0]
    tag_for_all = [1] * len(comments_in_list[0][0]) + [-1] * \
        len(comments_in_list[1][0])

    # These are the names for the two different classes.
    target_names = ["askreddit", "personalfinance"]
    print(
        classification_report(tag_for_all,
                              predicted_rf,
                              target_names=target_names))
Example #27
class Predictor(QtCore.QThread):

    """Object to predict the percentage match of an article,
    based on its abstract"""

    def __init__(self, logger, to_read_list, bdd=None):

        QtCore.QThread.__init__(self)

        self.to_read_list = to_read_list

        self.x_train = []
        self.y_train = []
        self.classifier = None

        if bdd is None:
            self.bdd = QtSql.QSqlDatabase.addDatabase("QSQLITE")
            self.bdd.setDatabaseName("fichiers.sqlite")
            self.bdd.open()
        else:
            self.bdd = bdd

        self.l = logger

        self.getStopWords()

        self.calculated_something = False


    def __del__(self):

        """Method to destroy the thread properly"""

        self.wait()
        self.l.debug("Deleting thread")


    def getStopWords(self):

        """Method to get english stop words
        + a list of personnal stop words"""

        my_additional_stop_words = []

        if getattr(sys, "frozen", False):
            resource_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
        else:
            resource_dir = '.'

        with open(os.path.join(resource_dir, 'config/stop_words.txt'), 'r') as config:
            for word in config.readlines():
                my_additional_stop_words.append(word.rstrip())

        self.stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)


    def initializePipeline(self):

        """Initialize the pipeline for text analysis. 0 is the liked class"""

        start_time = datetime.datetime.now()

        query = QtSql.QSqlQuery(self.bdd)

        query.exec_("SELECT * FROM papers WHERE new=0")

        while query.next():
            record = query.record()
            abstract = record.value('topic_simple')
            id_bdd = record.value('id')

            # Do not use 'Empty' abstracts
            if type(abstract) is not str or abstract == 'Empty':
                continue

            liked = record.value('liked')
            if type(liked) is int and liked == 1:
                category = 0
            else:
                # Do not count the read and not liked articles if the articles
                # are in the waiting list
                if id_bdd not in self.to_read_list:
                    category = 1
                else:
                    continue

            self.x_train.append(abstract)
            self.y_train.append(category)

        # Guard against RuntimeWarning: divide by zero encountered in log
        if (not self.x_train or 0 not in self.y_train or
                1 not in self.y_train):
            self.l.error("Not enough data yet to feed the classifier")
            return

        self.classifier = Pipeline([
            ('vectorizer', CountVectorizer(stop_words=self.stop_words)),
            ('tfidf', TfidfTransformer()),
            ('clf', LinearSVC())])

        try:
            self.classifier.fit(self.x_train, self.y_train)
        except ValueError:
            self.l.error("Not enough data yet to train the classifier")
            return

        elapsed_time = datetime.datetime.now() - start_time
        self.l.debug("Initializing classifier in {0}".format(elapsed_time))

        return True


    # @profile
    # def calculatePercentageMatch(self):
    def run(self):

        """Calculate the match percentage for each article,
        based on the abstract text and the liked articles"""

        self.l.debug("Starting calculations of match percentages")
        start_time = datetime.datetime.now()

        query = QtSql.QSqlQuery(self.bdd)

        query.exec_("SELECT id, topic_simple FROM papers")

        list_id = []
        x_test = []

        while query.next():
            record = query.record()
            abstract = record.value('topic_simple')
            x_test.append(abstract)
            list_id.append(record.value('id'))

        try:
            # Normalize to percentages: the closest match (the lowest decision
            # score, since class 0 is the liked class) is set to 100%
            # http://stackoverflow.com/questions/929103/convert-a-number-range-to-another-range-maintaining-ratio
            x_test = self.classifier.decision_function(x_test)

            elapsed_time = datetime.datetime.now() - start_time
            self.l.debug("Classifier predicted proba in {}".format(elapsed_time))
            diff_time = datetime.datetime.now()

            maximum = max(x_test)
            minimum = min(x_test)
            list_percentages = 100 - (x_test - minimum) * 100 / (maximum - minimum)

            self.l.debug("Classifier normalized proba in {}".
                         format(datetime.datetime.now() - diff_time))

        except AttributeError:
            self.l.error("Not enough data yet to predict probability")
            return
        except Exception as e:
            self.l.error("predictor: {}".format(e))
            self.l.error(traceback.format_exc())
            return

        self.bdd.transaction()
        query = QtSql.QSqlQuery(self.bdd)

        query.prepare("UPDATE papers SET percentage_match = ? WHERE id = ?")

        for id_bdd, percentage in zip(list_id, list_percentages):

            # Convert the percentage to a float, because the number is
            # probably a type used by numpy. MANDATORY
            params = (float(percentage), id_bdd)

            for value in params:
                query.addBindValue(value)

            query.exec_()

        # # Set the percentage_match to 0 if the abstract is 'Empty' or empty
        # query.prepare("UPDATE papers SET percentage_match = 0 WHERE abstract = 'Empty' OR abstract = ''")
        # query.exec_()

        if not self.bdd.commit():
            self.l.critical("Percentages match not correctly written in db")
        else:
            elapsed_time = datetime.datetime.now() - start_time
            self.l.info("Done calculating match percentages in {0} s".format(elapsed_time))

        self.calculated_something = True
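
The normalization in run() maps decision scores into a 0-100 "match percentage": because class 0 is the liked class, more negative decision values mean a closer match, so the formula sends the minimum score to 100% and the maximum to 0%. The arithmetic in isolation (toy values):

import numpy as np

scores = np.array([-1.5, 0.0, 2.0])  # toy decision values; class 0 = liked
pct = 100 - (scores - scores.min()) * 100 / (scores.max() - scores.min())
print(pct)  # [100.  57.14...  0.] -- most negative score maps to 100% match
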
Example #28
def create_and_evaluate_model(args):
    global trial_nr
    trial_nr += 1

    start = time.time()
    score = 0
    for cv_iter in range(n_splits):

        dt_test_prefixes = dt_prefixes[cv_iter]
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits):
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat(
                    [dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0)

        # Bucketing prefixes based on control flow
        bucketer_args = {
            'encoding_method': bucket_encoding,
            'case_id_col': dataset_manager.case_id_col,
            'cat_cols': [dataset_manager.activity_col],
            'num_cols': [],
            'random_state': random_state
        }
        if bucket_method == "cluster":
            bucketer_args["n_clusters"] = args["n_clusters"]
        bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
        bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
        bucket_assignments_test = bucketer.predict(dt_test_prefixes)

        preds_all = []
        test_y_all = []
        if "prefix" in method_name:
            scores = defaultdict(int)
        for bucket in set(bucket_assignments_test):
            relevant_train_cases_bucket = dataset_manager.get_indexes(
                dt_train_prefixes)[bucket_assignments_train == bucket]
            relevant_test_cases_bucket = dataset_manager.get_indexes(
                dt_test_prefixes)[bucket_assignments_test == bucket]
            dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_test_prefixes, relevant_test_cases_bucket)
            test_y = dataset_manager.get_label_numeric(dt_test_bucket)
            if len(relevant_train_cases_bucket) == 0:
                preds = [class_ratios[cv_iter]
                         ] * len(relevant_test_cases_bucket)
            else:
                dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                    dt_train_prefixes,
                    relevant_train_cases_bucket)  # one row per event
                train_y = dataset_manager.get_label_numeric(dt_train_bucket)

                if len(set(train_y)) < 2:
                    preds = [train_y[0]] * len(relevant_test_cases_bucket)
                else:
                    feature_combiner = FeatureUnion([
                        (method,
                         EncoderFactory.get_encoder(method,
                                                    **cls_encoder_args))
                        for method in methods
                    ])

                    if cls_method == "rf":
                        cls = RandomForestClassifier(
                            n_estimators=500,
                            max_features=args['max_features'],
                            random_state=random_state)

                    elif cls_method == "xgboost":
                        cls = xgb.XGBClassifier(
                            objective='binary:logistic',
                            n_estimators=500,
                            learning_rate=args['learning_rate'],
                            subsample=args['subsample'],
                            max_depth=int(args['max_depth']),
                            colsample_bytree=args['colsample_bytree'],
                            min_child_weight=int(args['min_child_weight']),
                            seed=random_state)

                    elif cls_method == "logit":
                        cls = LogisticRegression(C=2**args['C'],
                                                 random_state=random_state)

                    elif cls_method == "svm":
                        cls = SVC(C=2**args['C'],
                                  gamma=2**args['gamma'],
                                  random_state=random_state)

                    if cls_method == "svm" or cls_method == "logit":
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('scaler', StandardScaler()),
                                             ('cls', cls)])
                    else:
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('cls', cls)])
                    pipeline.fit(dt_train_bucket, train_y)

                    if cls_method == "svm":
                        preds = pipeline.decision_function(dt_test_bucket)
                    else:
                        preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                        preds = pipeline.predict_proba(
                            dt_test_bucket)[:, preds_pos_label_idx]

            if "prefix" in method_name:
                auc = 0.5
                if len(set(test_y)) == 2:
                    auc = roc_auc_score(test_y, preds)
                scores[bucket] += auc
            preds_all.extend(preds)
            test_y_all.extend(test_y)

        score += roc_auc_score(test_y_all, preds_all)

    if "prefix" in method_name:
        for k, v in args.items():
            for bucket, bucket_score in scores.items():
                fout_all.write(
                    "%s;%s;%s;%s;%s;%s;%s;%s\n" %
                    (trial_nr, dataset_name, cls_method, method_name, bucket,
                     k, v, bucket_score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name, 0,
                        "processing_time", time.time() - start, 0))
    else:
        for k, v in args.items():
            fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                           (trial_nr, dataset_name, cls_method, method_name, k,
                            v, score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name,
                        "processing_time", time.time() - start, 0))
    fout_all.flush()
    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
Example #29
def show_accuracy(y_hat, y_test, parameter):
    pass


# (4) Compute the accuracy of the LogisticRegression classifier
print("LogisticRegression - training set accuracy:", classifier.score(x_train, y_train))
y_hat = classifier.predict(x_train)
show_accuracy(y_hat, y_train, 'training set')
print("LogisticRegression - test set accuracy:", classifier.score(x_test, y_test))
y_hat = classifier.predict(x_test)
show_accuracy(y_hat, y_test, 'test set')
# LogisticRegression - training set accuracy: 0.809523809524
# LogisticRegression - test set accuracy: 0.688888888889

# Inspect the decision function via decision_function(); each column holds the signed distance to the corresponding class.
print('decision_function:\n', classifier.decision_function(x_train))
print('\npredict:\n', classifier.predict(x_train))
predict_proba = classifier.predict_proba(x_test)  # probability matrix of the predictions
print("predict_proba", predict_proba)

# (5) Plot the results
# 1. Determine the axis ranges; the x and y axes represent the two features
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]  # generate grid sampling points
grid_test = np.stack((x1.flat, x2.flat), axis=1)  # test points
# print 'grid_test = \n', grid_test
grid_hat = classifier.predict(grid_test)  # predict class labels
grid_hat = grid_hat.reshape(x1.shape)  # reshape to match the grid

# 2. Set the default font
Example #30
        '''doc_id=0
        for s, p, r in zip(docs_test, y_predicted, y_test):
            print(u'----------')
            print(u'[Text] %s' % s)
            print(u'[Label] %s' % p)
            print(u'[Actual] %s' % r)'''

        # Check if the total classification is empty
        # If empty, fill with the first classification
        total_prediction.append(y_predicted)


        # Average Positive score: ~0.7
        # Min Score: ~0.002
        # Max Score: ~2.86
        dec = clf.decision_function(docs_test)

    # Numpy array, .T = Transpose
    # Transpose the classification to be exported to csv file
    multiLabel = np.array(total_prediction).T

    # Save the classification to file: binaryClass.csv
    with open('workbook/binaryClass.csv', 'w', newline='') as z:
        writer = csv.writer(z)
        writer.writerows(multiLabel)

    # Save values from confusion matrix to variables to use later
    TP, TN, FP, FN = calcValues(testY, multiLabel)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    accuracy = (TP + TN) / (TP + TN + FP + FN)
Example #31
model2 = Pipeline([
    ('features', features),
    ('Logistic', LogisticRegression(C=0.00077426, class_weight='balanced'))
])

model2.fit(fannie_train, status_train)
status_pred2 = model2.predict(fannie_test)
# print('Best C is: ', model2.named_steps['Logistic'].C_)
print('Coefficients: ', model2.named_steps['Logistic'].coef_)

print(classification_report(status_test, status_pred2))
print(pd.DataFrame(confusion_matrix(status_test, status_pred2), index=['Actual Healthy',
                                                                       'Actual Default'],
                   columns=['Pred. Healthy', 'Pred. Default']))
print('Area under the curve is', roc_auc_score(status_test, status_pred2))
prec, rec, thres1 = precision_recall_curve(status_test, status_pred2)
fpr, tpr, thres2 = roc_curve(status_test, model2.decision_function(fannie_test))
with open('log_prec_rec.dill', 'wb') as f:
    dill.dump((prec, rec, thres1), f)

with open('log_fpr_tpr.dill', 'wb') as f:
    dill.dump((fpr, tpr, thres2), f)

with open('log_model.dill', 'wb') as f:
    dill.dump(model2, f)

print('finishing dumping Logistic regression results to file!')

# # Support Vector Machine
# features = FeatureUnion([
#     ('Loan_Amount', ExtractNormalized('STATE', 'ORIG_AMT')),
#     #('Interest_Rate', ExtractNormalized('STATE','ORIG_RT')),
Example #32
class PredictPostings:
    '''
    Applying this decorator to a beancount importer or its extract method
    will predict and auto-complete missing second postings
    of the transactions to be imported.

    Example:

    @PredictPostings(
        training_data="trainingdata.beancount",
        filter_training_data_by_account="The:Importers:Already:Known:Accountname"
    )
    class MyImporter(ImporterProtocol):
        def extract(file):
          # do the import, return list of entries
    '''

    # Implementation notes for how to write decorators for classes, see e.g.,
    # https://stackoverflow.com/a/9910180
    # https://www.codementor.io/sheena/advanced-use-python-decorators-class-function-du107nxsv
    # https://andrefsp.wordpress.com/2012/08/23/writing-a-class-decorator-in-python/

    def __init__(self,
                 *,
                 training_data: Union[_FileMemo, List[Transaction],
                                      str] = None,
                 filter_training_data_by_account: str = None,
                 predict_second_posting: bool = True,
                 suggest_accounts: bool = True):
        self.training_data = training_data
        self.filter_training_data_by_account = filter_training_data_by_account
        self.predict_second_posting = predict_second_posting
        self.suggest_accounts = suggest_accounts

    def __call__(self, to_be_decorated=None, *args, **kwargs):

        if inspect.isclass(to_be_decorated):
            logger.debug('The Decorator was applied to a class.')
            return self.patched_importer_class(to_be_decorated)

        elif inspect.isfunction(to_be_decorated):
            logger.debug('The Decorator was applied to an instancemethod.')
            return self.patched_extract_function(to_be_decorated)

    def patched_importer_class(self, importer_class):
        importer_class.extract = self.patched_extract_function(
            importer_class.extract)
        return importer_class

    def patched_extract_function(self, original_extract_function):
        decorator = self

        @wraps(original_extract_function)
        def wrapper(self, file, existing_entries=None):
            decorator.existing_entries = existing_entries

            logger.debug(
                f"About to call the importer's extract function to receive entries to be imported..."
            )
            if 'existing_entries' in inspect.signature(
                    original_extract_function).parameters:
                decorator.imported_transactions = original_extract_function(
                    self, file, existing_entries)
            else:
                decorator.imported_transactions = original_extract_function(
                    self, file)

            return decorator.enhance_transactions()

        return wrapper

    def enhance_transactions(self):  # load training data
        self.training_data = ml.load_training_data(
            self.training_data,
            filter_training_data_by_account=self.
            filter_training_data_by_account,
            existing_entries=self.existing_entries)

        # convert training data to a list of TxnPostingAccounts
        self.converted_training_data = [
            ml.TxnPostingAccount(t, p, pRef.account)
            for t in self.training_data for pRef in t.postings
            for p in t.postings if p.account != pRef.account
        ]

        # train the machine learning model
        self._trained = False
        if not self.converted_training_data:
            logger.warning("Cannot train the machine learning model "
                           "because the training data is empty.")
        elif len(self.converted_training_data) < 2:
            logger.warning(
                "Cannot train the machine learning model "
                "because the training data consists of less than two elements."
            )
        else:
            transformers = []
            transformer_weights = {}
            transformers.append(
                ('narration',
                 Pipeline([
                     ('getNarration', ml.GetNarration()),
                     ('vect', CountVectorizer(ngram_range=(1, 3))),
                 ])))
            transformer_weights['narration'] = 0.8
            transformers.append(
                ('account',
                 Pipeline([
                     ('getReferencePostingAccount',
                      ml.GetReferencePostingAccount()),
                     ('vect', CountVectorizer(ngram_range=(1, 3))),
                 ])))
            transformer_weights['account'] = 0.8

            distinctPayees = set(
                map(lambda trx: trx.txn.payee, self.converted_training_data))
            if len(distinctPayees) > 1:
                transformers.append(
                    ('payee',
                     Pipeline([
                         ('getPayee', ml.GetPayee()),
                         ('vect', CountVectorizer(ngram_range=(1, 3))),
                     ])))
                transformer_weights['payee'] = 0.5

            transformers.append((
                'dayOfMonth',
                Pipeline([
                    ('getDayOfMonth', ml.GetDayOfMonth()),
                    ('caster',
                     ml.ArrayCaster()),  # need for issue with data shape
                ])))
            transformer_weights['dayOfMonth'] = 0.1

            self.pipeline = Pipeline([
                ('union',
                 FeatureUnion(transformer_list=transformers,
                              transformer_weights=transformer_weights)),
                ('svc', SVC(kernel='linear')),
            ])
            logger.debug("About to train the machine learning model...")
            self.pipeline.fit(
                self.converted_training_data,
                ml.GetPostingAccount().transform(self.converted_training_data))
            logger.info("Finished training the machine learning model.")
            self._trained = True

        if not self._trained:
            logger.warning(
                "Cannot generate predictions or suggestions "
                "because there is no trained machine learning model.")
            return self.imported_transactions

        # predict missing second postings
        self.transactions = self.imported_transactions
        if self.predict_second_posting:
            logger.debug(
                "About to generate predictions for missing second postings...")
            predicted_accounts: List[str]
            predicted_accounts = self.pipeline.predict(
                self.imported_transactions)
            self.transactions = [
                ml.add_posting_to_transaction(*t_a)
                for t_a in zip(self.transactions, predicted_accounts)
            ]
            logger.debug(
                "Finished adding predicted accounts to the transactions to be imported."
            )

        # suggest accounts that are likely involved in the transaction
        if self.suggest_accounts:
            # get values from the SVC decision function
            logger.debug(
                "About to generate suggestions about related accounts...")
            decision_values = self.pipeline.decision_function(
                self.imported_transactions)

            # add a human-readable class label (i.e., account name) to each value, and sort by value:
            suggestions = [[
                account for _, account in sorted(zip(distance_values,
                                                     self.pipeline.classes_),
                                                 key=lambda x: x[0],
                                                 reverse=True)
            ] for distance_values in decision_values]

            # add the suggested accounts to each transaction:
            self.transactions = [
                ml.add_suggested_accounts_to_transaction(*t_s)
                for t_s in zip(self.transactions, suggestions)
            ]
            logger.debug(
                "Finished adding suggested accounts to the transactions to be imported."
            )

        return self.transactions
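
The suggestion step above works because, for a multiclass SVC, decision_function returns one score per class, which can be zipped with classes_ and sorted. A minimal standalone sketch of the same ranking trick, on toy data with made-up account names:

import numpy as np
from sklearn.svm import SVC

X = np.array([[0.0], [0.2], [1.0], [1.2], [2.0], [2.2]])
y = np.array(['Expenses:Food', 'Expenses:Food', 'Expenses:Rent',
              'Expenses:Rent', 'Income:Salary', 'Income:Salary'])
clf = SVC(kernel='linear').fit(X, y)

# one row of decision values per sample, one column per class ('ovr' shape)
decision_values = clf.decision_function(X[:2])
for row in decision_values:
    ranked = [account for _, account in
              sorted(zip(row, clf.classes_), key=lambda x: x[0], reverse=True)]
    print(ranked)  # accounts ordered from most to least likely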
Example #33
def combinations_lgls(pcompa = False, differences = True, addition = False, multiplication = False, division = False):
    #X, training_target, Y_test, Y_test_id = load_data()
    X, Y = load_data(original=True)
    
    test_id = Y[['t_id']].as_matrix()
    test_id = test_id.flatten()
   
    training_target = X[['target']].as_matrix()
    training_target = training_target.flatten()
   
    ### INCLUDE ALL NOT JUST THESE 5 ###

    f_s = [ 'feature%d' %x for x in range(1,22)]
    g_s = [ 'feature%d' %x for x in range(1,22)]

    features = []
    lgls = []

    for f in f_s:
        for g in g_s:
            if f == g:
                pass
            else:
                if differences: 
                    features.append(str(f)+"-"+str(g))
                    feature_X = X[str(f)]-X[str(g)]
                    feature_Y = Y[str(f)]-Y[str(g)]
                elif addition:
                    features.append(str(f)+"+"+str(g))
                    feature_X = X[str(f)]+X[str(g)]
                    feature_Y = Y[str(f)]+Y[str(g)]
                elif multiplication:
                    features.append(str(f)+"x"+str(g))
                    feature_X = X[str(f)]*X[str(g)]
                    feature_Y = Y[str(f)]*Y[str(g)]
                elif division:
                    features.append(str(f)+"/"+str(g))
                    feature_X = X[str(f)].div(X[str(g)])
                    feature_Y = Y[str(f)].div(Y[str(g)])

                X_np = feature_X.as_matrix() 
                Y_np = feature_Y.as_matrix() 

                # split training data into training and validation sets
                X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33, random_state=4)
                X_train = np.reshape(X_train, (len(X_train), 1))
                X_Val = np.reshape(X_Val, (len(X_Val), 1))

                # feature selection
                select = SelectKBest(chi2, k=20)

                # dimensionality reduction ( PCA)
                pca = PCA(n_components=2, whiten=True)

                # randomized grid search???

                clfs = [
                        LogisticRegression()]
                        #xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),

                for j, clf in enumerate(clfs):
                    #print j, clf.__class__.__name__
                    # pipeline with feature selection, pca and classifier
                    if pcompa==True:
                        #pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
                        pipeline = Pipeline([('pca', pca), ('clf', clf)])
                    else:
                        pipeline = Pipeline([('clf', clf)])
                        #pipeline = Pipeline([('select', select), ('clf', clf)])

                    # cross validation
                    skf = StratifiedKFold(train_target, n_folds=5, random_state=1)

                    scores = []

                    for k, (train, test) in enumerate(skf):
                        pipeline.fit(X_train[train], train_target[train])
                        if hasattr(pipeline, 'predict_proba'):
                            score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
                        else:
                            score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))

                        scores.append(score)

                    # validation log loss: append once per feature pair (not once
                    # per fold) so that zip(features, lgls) below pairs up one-to-one
                    lgls.append(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))

    combination_scores = sorted(zip(features, lgls), key=lambda x: x[1])
    single_f_average = singular_lgls()
    
    return [x for x in combination_scores if x[1]<single_f_average]
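
A caveat on the decision_function fallback above: log_loss expects probabilities in [0, 1], while decision_function returns unbounded margins, so strictly the scores should be squashed through a sigmoid first (which is what Platt scaling does). A standalone sketch, assuming scipy is available:

import numpy as np
from scipy.special import expit  # logistic sigmoid
from sklearn.metrics import log_loss

margins = np.array([-2.0, -0.5, 0.5, 2.0])  # hypothetical decision_function output
y_true = np.array([0, 0, 1, 1])
print(log_loss(y_true, expit(margins)))     # squash margins into (0, 1) first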
Example #34
        y_testt.append([0, 0, 0, 0, 1])

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

y_testt = np.array(y_testt)

y_score = text_clf.decision_function(X_test)

for i in range(5):
    fpr[i], tpr[i], _ = roc_curve(y_testt[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_testt.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

n_classes = 5
# Compute macro-average ROC curve and ROC area
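
The fragment stops at the macro-average comment. The standard recipe (as in the scikit-learn ROC example) interpolates every per-class curve onto a common false-positive-rate grid and averages; a sketch, assuming the fpr/tpr/roc_auc dicts and n_classes defined above:

all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])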
Example #35
def CV_holdout(pcompa = False):
    #X, training_target, Y_test, Y_test_id = load_data()
    X, Y = load_data()

    test_id = Y[['t_id']].as_matrix()
    test_id = test_id.flatten()
    Y = Y.drop( 't_id', axis = 1 )
    training_target = X[['target']].as_matrix()
    training_target = training_target.flatten()
    X = X.drop( 'target', axis = 1)
    X_np = X.as_matrix()
    Y_np = Y.as_matrix()

    # split training data into training and validation sets
    X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33)
    #X_train, X_Val, train_target, val_target = train_test_split(X_np, training_target, test_size=0.33, random_state=4)

    # feature selection
    select = SelectKBest(chi2, k=20)

    # dimensionality reduction ( PCA)
    pca = PCA(n_components=2, whiten=True)

    # randomized grid search???

    clfs = [
            LogisticRegression()]
            #xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
            #KNeighborsClassifier(n_neighbors=100),
            #RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
            #RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
            #RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
            #AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
            #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
            #ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
            #ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
            #GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]

    for j, clf in enumerate(clfs):
        print j, clf.__class__.__name__
        # pipeline with feature selection, pca and classifier
        if pcompa==True:
            #pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
            pipeline = Pipeline([('pca', pca), ('clf', clf)])
        else:
            #pipeline = Pipeline([('clf', clf)])
            pipeline = Pipeline([('select', select), ('clf', clf)])

        # cross validation
        skf = StratifiedKFold(train_target, n_folds=5, random_state=1)

        scores = []

        for k, (train, test) in enumerate(skf):
            pipeline.fit(X_train[train], train_target[train])
            if hasattr(pipeline, 'predict_proba'):
                score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
                print pipeline.predict(X_train[test])[:10], train_target[test][:10]
            else:
                score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))

            scores.append(score)

            #print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)

        print 'CV accuracy: %.3f +/- %.3f ' %(
                            np.mean(scores), np.std(scores))

        ## test on the hold out set
        
        print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
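
Both this function and combinations_lgls above use the legacy sklearn.cross_validation API, where StratifiedKFold took the label array directly; note also that random_state only has an effect when shuffling. Under current scikit-learn the equivalent setup would be:

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for k, (train, test) in enumerate(skf.split(X_train, train_target)):
    ...  # same train/test index arrays as before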
Example #36
class Annotator:
    experiment = ""
    labels = []

    final_choice = ""
    current_query = None
    ipython = False
    annotator = ''
    printify = staticmethod(lambda x: str(x))  # staticmethod so self.printify(sample) works when not overridden
    start_labels = []
    access = ""
    web_root = ""
    shuffle = True
    classifier = None
    use_classifier = False
    classifier_trained = False

    def __init__(self,
                 experiment,
                 printify=None,
                 ipython=False,
                 access='web',
                 web_root='',
                 start_labels=[],
                 annotator='',
                 shuffle=True,
                 use_classifier=False,
                 close_choice_mode=False,
                 choice_function=None):
        # access can be:
        # 	- 'file': provide a file (file) for the experiment
        # 	- 'web': provide a URL (web_root) to the Flask server with API hooks

        # printify is a callable that takes a sample and produces the string shown to the user
        # start_labels: initial set of possible labels; can be updated over time
        # shuffle: if True, the samples are annotated in a random order. Default: True
        # use_classifier: if True, retrains a classifier every few samples and suggests the classes in ranked order

        # close_choice_mode: use when the available choices are known in advance; must provide `choice_function`
        # choice_function: if close_choice_mode is set, a function mapping a sample to the list of displayed choices

        if printify is not None:
            self.printify = printify
        self.experiment = experiment
        self.ipython = ipython
        self.access = access
        self.web_root = web_root
        self.start_labels = start_labels
        self.annotator = annotator  # Name of the person annotating
        self.shuffle = shuffle

        self.use_classifier = use_classifier
        self.classifier_trained = False
        if self.use_classifier:
            self.classifier = Pipeline([('vect',
                                         CountVectorizer(ngram_range=(1, 2))),
                                        ('tfidf', TfidfTransformer()),
                                        ('clf', LinearSVC())])

        self.close_choice_mode = close_choice_mode
        self.choice_function = choice_function

    def clean_html(self, raw_html):
        cleanr = re.compile('<.*?>')
        return re.sub(cleanr, '', raw_html)

    def query2json(self, query):
        url = self.web_root + query
        # print url
        r = requests.get(url)
        return r.json()

    # Web api hooks: 7 functions
    def list_experiments_web(self):
        results = self.query2json("annotator/list_experiments")
        for d in results:
            print "Experiment:", d['_id'], "has", d['count'], "samples"

    def insert_samples_web(self, samples):
        R = requests.post(self.web_root + "annotator/insert_samples",
                          json=samples)
        print R.text

    def save_annotation_web(self):
        print "hellllo"
        self.query2json("annotator/save_annotation/" +
                        str(self.current_query['_id']) + "/label/" +
                        self.final_choice + "/annotator/" + self.annotator)

    def get_counts_web(self):
        return self.query2json("annotator/get_counts/" + self.experiment +
                               "/annotator/" + self.annotator)

    def load_example_web(self):
        return self.query2json("annotator/next_example/" + self.experiment +
                               "/annotator/" + self.annotator + "/shuffle/" +
                               str(1 if self.shuffle else 0))

    def reload_labels_web(self):
        self.labels = self.query2json("annotator/reload_labels/" +
                                      self.experiment)

    def detailed_statistics_web(self):
        return self.query2json("annotator/detailed_statistics/" +
                               self.experiment)

    def relabel_web(self, from_label, to_label):
        return self.query2json("annotator/relabel/" + self.experiment +
                               "/from/" + from_label + "/" + to_label +
                               "/annotator/" + self.annotator)

    def load_labeled_dataset_web(self):
        return self.query2json("annotator/load_training/" + self.experiment)

    def export_web(self):
        return self.query2json("annotator/export/" + self.experiment)

    # Main functionality
    def list_experiments(self):
        if self.access == 'web':
            self.list_experiments_web()

    def insert_samples(self, samples, pre_annotated=False):
        for sample in samples:
            if type(sample) is not dict:
                print "One sample or more is not a dict, which it should be. Please reformat"
                return -1
            if 'label' in sample and not pre_annotated:
                print "The key `label` should not be used in the samples"
                return -2
            if 'experiment' in sample and sample[
                    'experiment'] != self.experiment:
                print "The key `experiment` is inconsistent with the experiment ID"
                return -3
            sample['experiment'] = self.experiment
        if self.access == 'web':
            self.insert_samples_web(samples)

    def save_annotation(self):
        if self.access == 'web':
            self.save_annotation_web()

    def load_unannotated_example(self):
        if self.access == 'web':
            self.current_query = self.load_example_web()

    def reload_labels(self):
        if self.access == 'web':
            self.reload_labels_web()

        self.labels.extend(self.start_labels)
        self.labels = sorted(self.labels)

    def relabel(self, from_label, to_label):
        if self.access == 'web':
            self.relabel_web(from_label, to_label)

    def export(self):
        if self.access == 'web':
            return self.export_web()

    def finish(self):
        # Print a message once we are done
        print "Done annotating for now"

    def train_classifier(self):
        labeled_dataset = []
        if self.access == 'web':
            labeled_dataset = self.load_labeled_dataset_web()
        if len(labeled_dataset) > 20:
            labels = [d['label']['label'] for d in labeled_dataset]
            text = [self.clean_html(self.printify(d)) for d in labeled_dataset]

            cross_val_accuracy = np.mean(
                cross_val_score(self.classifier,
                                text,
                                y=labels,
                                scoring="accuracy",
                                cv=5))
            print "Classifier retrained (", len(
                labeled_dataset
            ), " samples). Cross val accuracy:", "{0:.2f}%".format(
                100.0 * cross_val_accuracy)

            self.classifier.fit(text, y=labels)

            self.classifier_trained = True
        else:
            self.classifier_trained = False

    def get_ordered_labels(self):
        # Current example has been loaded, get the label either through the classifier
        # or take the first label
        if self.close_choice_mode:
            first_choice = ""
            text_labels = self.choice_function(self.current_query)
            if len(text_labels) > 0:
                first_choice = text_labels[0]
            return text_labels, first_choice

        if self.use_classifier:
            if not self.classifier_trained:
                self.train_classifier()
            if self.classifier_trained:
                best_label = self.classifier.predict(
                    [self.clean_html(self.printify(self.current_query))])[0]
                scores = self.classifier.decision_function(
                    [self.clean_html(self.printify(self.current_query))])[0]
                zero_min = (scores - np.min(scores))
                normalized_scores = zero_min / np.sum(zero_min)
                labels = self.classifier.named_steps['clf'].classes_
                lab2p = {
                    lab: score
                    for lab, score in zip(labels, normalized_scores)
                }
                sorted_labels = sorted(labels, key=lambda x: -lab2p[x]) + [
                    labs for labs in self.start_labels if labs not in labels
                ]
                text_labels = [
                    lab + " | Score: " +
                    "{0:.1f}".format(100.0 * lab2p.get(lab, 0.0))
                    for lab in sorted_labels
                ]

                return text_labels, text_labels[0]

        best_label = ""
        if len(self.labels) > 0:
            best_label = self.labels[0]

        return self.labels, best_label

    def load_example_ipython(self):

        self.load_unannotated_example()
        if self.current_query is None:
            self.finish()
            return None  # We stop here

        if not self.close_choice_mode:
            self.reload_labels()  # In case something new has been added...

        display(HTML(self.printify(self.current_query)))

        TextField = widgets.Text(value='',
                                 placeholder='New class label',
                                 disabled=False)
        TextField.observe(self.on_change_jupyter)

        text_labels, self.final_choice = self.get_ordered_labels()

        Radio = widgets.RadioButtons(options=text_labels,
                                     value=self.final_choice,
                                     description="",
                                     disabled=False)
        Radio.observe(self.on_change_jupyter)

        B = widgets.Button(description='Submit annotation')
        B.on_click(self.on_submit_jupyter)

        count_annotated, count_total = self.get_counts()
        if count_annotated % 10 == 0:
            self.classifier_trained = False  # Force retrain
        display(
            HTML("<div class='toDel'>" + str(count_annotated) + "/" +
                 str(count_total) + "</div>"))
        display(TextField)
        display(Radio)
        display(B)

    def cleanup_jupyter(self):
        display(
            HTML(
                "<div class='js_stuff'><script>$('#example, .js_stuff').parent().parent().remove(); $('#desc_rows, .output_area, .toDel').remove(); $('.widget-subarea').html('');</script></div>"
            ))

    def on_change_jupyter(self, change):
        if change['type'] == 'change' and change['name'] == 'value':
            self.final_choice = change['new']

    def on_submit_jupyter(self, change):
        # Save this annotation

        print "coucouchou"
        self.final_choice = self.final_choice.split("|")[0].strip()

        print self.final_choice
        self.save_annotation()
        self.cleanup_jupyter()
        self.load_example_ipython()

    def annotate(self):
        if self.ipython:
            self.load_example_ipython()

    # Analysis tools
    def get_counts(self):
        if self.access == 'web':
            return self.get_counts_web()
        return 0, 0

    def status(self):
        count_annotated, count_total = self.get_counts()
        return "[" + self.experiment + "] Total samples: " + str(
            count_total) + " | Annotated: " + str(count_annotated)

    def detailed_statistics(self):
        count_annotated, count_total = self.get_counts()

        for d in self.detailed_statistics_web():
            percentage = "{0:.2f}".format(100.0 * d['count'] / count_annotated)
            print d['_id'], d[
                'count'], " / ", count_annotated, " (", percentage, " % )"
Example #37
     metrics.classification_report(
         Y_test,
         logistic_classifier.predict(X_test))))
 
 print 'classes : ',classifier.classes_
 print 'RBM and Logistic regression : ', classifier.predict(X_test) 
 print 'Raw Logistic regression', logistic_classifier.predict(X_test)
 
 logistic_proba = logistic_classifier.predict_proba(X_test)
 
 print 'logistic_classifier decision function : \n',logistic_classifier.decision_function(X_test)
 print 'logistic_classifier predict_proba : \n', logistic_proba
 
 classifier_proba = classifier.predict_proba(X_test)
 
 print 'classifier decision function : \n',classifier.decision_function(X_test)
 print 'classifier decision predict_proba : \n',classifier_proba
 
 
 if classifier_proba[0][1] < 0.6:
     print 'classifier ___________ led is acting strange'
     print 'current value : ',led_status[end-start-1]
     print 'desired value : ',X[0][end-start-1]
     
     f = open('transmit_confirm.txt','w')
     f.write(str(1))
     f.close()
     
     print 'set led to : ', X[0][end-start-1]
     f = open('set_led.txt','w')
     f.write(str(X[0][end-start-1]))
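
The fragment above follows the familiar scikit-learn RBM recipe, where `classifier` chains BernoulliRBM features into logistic regression. Its construction is not shown here, but it presumably looks something like this sketch (all hyperparameters illustrative):

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline

rbm = BernoulliRBM(n_components=64, learning_rate=0.06, n_iter=10, random_state=0)
logistic_classifier = LogisticRegression(C=100.0)
classifier = Pipeline([('rbm', rbm), ('logistic', LogisticRegression(C=100.0))])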
Example #38
def cross_validation(data_x, data_y):
    rs1 = RobustScaler()
    rs2 = RobustScaler()
    pca = PCA(n_components=10,
              svd_solver='full',
              whiten=False,
              random_state=42)
    '''
    clf = SVC(kernel='rbf',
              C=1e-8,
              gamma='auto',
              cache_size=1000,
              probability=False,
              class_weight='balanced',
              decision_function_shape='ovr',
              random_state=42)
    '''
    clf = MLPClassifier(solver='adam',
                        activation='relu',
                        hidden_layer_sizes=(50, ),
                        alpha=0.1,
                        beta_1=0.9,
                        beta_2=0.999,
                        epsilon=1e-4,
                        learning_rate_init=1e-3,
                        max_iter=200,
                        early_stopping=True,
                        validation_fraction=0.2,
                        random_state=42)

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    cv_test_auc = []
    cv_train_auc = []
    cv_test_brier = []
    cv_train_brier = []

    fig = plt.figure(figsize=(10, 10))

    plt.ylim([-0.05, 1.05])
    plt.plot([0, 1], [0, 1], 'k:', label='Perfect')

    for i, (train, test) in enumerate(kfold.split(data_x, data_y)):
        print('CV-Split {}'.format(i + 1))

        train = resample(train, data_y, oversample=False)

        model = Pipeline(steps=[('rs1', rs1), ('pca', pca),
                                ('rs2', rs2), ('clf', clf)])

        model.fit(data_x[train], data_y[train])

        '''
        Get all your statistics here.
        For example: AUC, Brier loss and the calibration curve.
        '''
        if hasattr(model, 'predict_proba'):
            p_test = model.predict_proba(data_x[test])[:, 1]
            p_train = model.predict_proba(data_x[train])[:, 1]
        else:
            p_test = model.decision_function(data_x[test])
            p_train = model.decision_function(data_x[train])
            p_test = norm(p_test)
            p_train = norm(p_train)

        p_pos, s_mean = calibration_curve(data_y[test], p_test, n_bins=10)

        plt.plot(s_mean, p_pos, 's-', label='CV fold {}'.format(i + 1))

        test_auc = roc_auc_score(data_y[test], p_test)
        train_auc = roc_auc_score(data_y[train], p_train)

        test_brier = brier_score_loss(data_y[test], p_test)
        train_brier = brier_score_loss(data_y[train], p_train)

        print('###')
        print('Train AUC: {}'.format(train_auc))
        print('Test AUC: {}'.format(test_auc))
        print('Train Brier Loss: {}'.format(train_brier))
        print('Test Brier Loss: {}'.format(test_brier))
        print('###')

        cv_test_auc.append(test_auc)
        cv_train_auc.append(train_auc)
        cv_test_brier.append(test_brier)
        cv_train_brier.append(train_brier)

    plt.title('Calibration plot  (reliability curve)')
    plt.ylabel('Fraction of positives')
    plt.xlabel('Mean predicted value')
    plt.legend(loc='best', ncol=2)
    plt.savefig('calibration.png')
    plt.close(fig)

    test_auc_stats = np.mean(cv_test_auc), np.std(cv_test_auc, ddof=1)
    train_auc_stats = np.mean(cv_train_auc), np.std(cv_train_auc, ddof=1)
    test_brier_stats = np.mean(cv_test_brier), np.std(cv_test_brier, ddof=1)
    train_brier_stats = np.mean(cv_train_brier), np.std(cv_train_brier, ddof=1)

    print('###')
    print('CV Train AUC: {0[0]} ({0[1]})'.format(train_auc_stats))
    print('CV Test AUC: {0[0]} ({0[1]})'.format(test_auc_stats))
    print('CV Train Brier Loss: {0[0]} ({0[1]})'.format(train_brier_stats))
    print('CV Test Brier Loss: {0[0]} ({0[1]})'.format(test_brier_stats))
    print('###')
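
The else branch above squeezes raw margins through a user-defined norm() helper before feeding calibration_curve and brier_score_loss. The off-the-shelf alternative is CalibratedClassifierCV, which wraps a margin-based estimator and fits sigmoid (Platt) or isotonic calibration on held-out folds; a minimal sketch:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

calibrated = CalibratedClassifierCV(LinearSVC(), method='sigmoid', cv=5)
# calibrated.fit(data_x[train], data_y[train])
# p_test = calibrated.predict_proba(data_x[test])[:, 1]  # proper probabilities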
Example #39
        stop_words='english',
        ngram_range=(1, 2),
        max_df=1.0,
        max_features=100000
    )

    print "Create pipeline for vectorizer => classifier"
    vect_clf = Pipeline([('vect', marisa_uni_vect),
                         ('clf', LinearSVC())])

    print "Train Model"
    vect_clf = vect_clf.fit(train_resume_text, train_labels)

    print "Predict test samples"
    predicted_score = vect_clf.predict(test_resume_text)
    predicted_decision = vect_clf.decision_function(test_resume_text)

    # accuracy = np.mean(predicted_score == test_labels)
    # p = precision_score(test_labels, predicted_score, average='macro')
    # r = recall_score(test_labels, predicted_score, average='macro')
    #
    # print accuracy
    # print p
    # print r

    # print classification_report([t for t in test_labels], [p for p in predicted_score])
    predicted = []

    actual_vs_predicted = []

    for i in range(len(test_labels)):
Example #40
def train_and_evaluate(writer, train_data, dev_data, test_data):
    train_text, train_label = _data2list(train_data)
    dev_text, dev_label = _data2list(dev_data)
    test_text, test_label = _data2list(test_data)

    hyper_parms = {
        'ngram_range': [(1, 2), (1, 3), (1, 4)],
        'sublinear_tf': [True, False],
        'penalty': ['l2'],
        'alpha': [1e-4, 1e-5],
    }
    all_hyper_parms = it.product(*(hyper_parms[k] for k in hyper_parms))
    all_hyper_parms_dict = [
        dict(zip(hyper_parms, arg)) for arg in all_hyper_parms
    ]

    best_dev = 0
    best_args = None

    queue = multiprocessing.Queue()
    for cur_args in all_hyper_parms_dict:
        multiprocessing.Process(target=_worker,
                                args=(queue, cur_args, train_text, train_label,
                                      dev_text, dev_label)).start()

    results = []
    for _ in all_hyper_parms_dict:
        cur_args, acc, f1 = queue.get()

        res_str = ''
        for key, value in cur_args.items():
            res_str = res_str + key + ': ' + str(value) + '. '

        if f1 > best_dev:
            best_dev = f1
            best_args = cur_args
            res_str += colored(('Dev acc: %.4f, f1: %.4f' % (acc, f1)), 'red')
        else:
            res_str += ('Dev acc: %.4f, f1: %.4f' % (acc, f1))
        print(res_str)

    text_clf = Pipeline([
        ('vect',
         TfidfVectorizer(ngram_range=best_args['ngram_range'],
                         sublinear_tf=best_args['sublinear_tf'])),
        ('clf',
         SGDClassifier(loss='hinge',
                       penalty=best_args['penalty'],
                       max_iter=5,
                       alpha=best_args['alpha'],
                       random_state=1))
    ])

    train_text = train_text + dev_text
    train_label = train_label + dev_label

    text_clf.fit(train_text, train_label)
    test_pred = text_clf.predict(test_text)
    acc, f1, recall, precision = _compute_score(y_pred=test_pred,
                                                y_true=test_label,
                                                num_classes=2)

    scores = text_clf.decision_function(test_text)
    fpr, tpr, thresholds = metrics.roc_curve(test_label, scores, pos_label=1)

    mean_fpr = np.linspace(0, 1, 100)
    tpr = interp(mean_fpr, fpr, tpr)
    tpr[0] = 0.0
    roc = metrics.auc(mean_fpr, tpr)

    print("End of training")
    print("Best dev f1: %.4f" % best_dev)
    print(
        "Test acc: %.4f, f1: %.4f, recall: %.4f, precision: %.4f, roc: %.4f" %
        (acc, f1, recall, precision, roc))

    return acc, f1, roc, tpr
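
Note that roc_curve only needs scores that rank the samples, so the raw hinge margins from decision_function are fine here without any probability calibration. A standalone sketch:

import numpy as np
from sklearn import metrics
from sklearn.linear_model import SGDClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
clf = SGDClassifier(loss='hinge', random_state=1).fit(X, y)
scores = clf.decision_function(X)                # unbounded margins
fpr, tpr, _ = metrics.roc_curve(y, scores, pos_label=1)
print(metrics.auc(fpr, tpr))                     # ranking is all ROC needs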
Example #41
def blend_clfs_CV(f_number = 80, pcompa = True, layer = 1, cycles=9):
    if layer == 1:
        X, X_target, X_train, X_Val, train_target, val_target, Y_test, Y_test_id = load_data(f_number=f_number)
    elif layer == 2:
        num_clfs = [
        LogisticRegression(),
        SVC(kernel='rbf', gamma=1.0, C=0.1, probability=True, verbose=True, random_state=1),
        xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
        KNeighborsClassifier(n_neighbors=100),
        RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
        #RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
        RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
        AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
        ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
        ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]

        X, X_target, _, _, _, _, Y_test, Y_test_id = load_data(f_number=f_number)
        #newX = np.zeros((X.shape[0], cycles+len(num_clfs)))
        #newY = np.zeros((Y_test.shape[0], cycles+len(num_clfs)))
        
        #for i in range(cycles):
        test_preds     = 'CV_blended_True_layer_1_keras1_feature_' + str(f_number) + '.csv'
        training_preds = 'CV_blended_True_training_layer_1_keras1_feature_' + str(f_number) + '.csv'
        test_preds_df = pd.read_csv(test_preds)
        training_preds_df = pd.read_csv(training_preds)
        test_preds_np = test_preds_df[["probability"]].as_matrix()
        training_preds_np = training_preds_df.as_matrix()
        X = np.concatenate((X, training_preds_np), axis=1)
        Y_test = np.concatenate((Y_test, test_preds_np), axis=1)
        
        for i in range(cycles):
            test_preds     = 'CV_blended_False_layer_1_keras1_' + str(i) + '_feature_' + str(f_number) + '.csv'
            training_preds = 'CV_blended_False_training_layer_1_keras1_' + str(i) + '_feature_' + str(f_number) + '.csv'
            test_preds_df = pd.read_csv(test_preds)
            training_preds_df = pd.read_csv(training_preds)
            test_preds_np = test_preds_df[["probability"]].as_matrix()
            training_preds_np = training_preds_df.as_matrix()
            X = np.concatenate((X, training_preds_np), axis=1)
            Y_test = np.concatenate((Y_test, test_preds_np), axis=1)

        for i,c in enumerate(num_clfs):
            test_preds     = 'CV_layer_1_' + str(c.__class__.__name__) + str(i) + '_feature_' + str(f_number) + '_pca_' + str(pcompa) + '.csv'
            training_preds = 'CV_training_layer_1_' +  str(c.__class__.__name__) + str(i) + '_feature_' + str(f_number) + '_pca_' + str(pcompa) + '.csv'
            test_preds_df = pd.read_csv(test_preds)
            training_preds_df = pd.read_csv(training_preds)
            test_preds_np = test_preds_df[["probability"]].as_matrix()
            training_preds_np = training_preds_df.as_matrix()
            X = np.concatenate((X, training_preds_np), axis=1)
            Y_test = np.concatenate((Y_test, test_preds_np), axis=1)
    
        X_train, X_Val, train_target, val_target = train_test_split(X, X_target, test_size=0.33, random_state=4)
    
    X[X == -inf] = 0
    X_train[X_train == -inf] = 0
    X_Val[X_Val == -inf] = 0
    Y_test[Y_test == -inf] = 0

    #print "Number of total training samples: ", len(X)
    #print "Number of sub-training samples: ", len(X_train)
    #print "Number of validation samples: :", len(X_Val)

    # feature selection
    #select = SelectKBest(chi2, k=7)

    # dimensionality reduction ( PCA)
    pca = PCA(n_components=2, whiten=True)

    # randomized grid search???

    clfs = [
            LogisticRegression(),
            SVC(kernel='rbf', gamma=1.0, C=0.1, probability=True, verbose=True, random_state=1),
            xgb.XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=300, learning_rate=0.05),
            KNeighborsClassifier(n_neighbors=100),
            RandomForestClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
            #RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='entropy', random_state=1)
            RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, criterion='entropy', random_state=1),
            AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", learning_rate=0.01, n_estimators=50, random_state=1),
            ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='gini', random_state=1),
            ExtraTreesClassifier(n_estimators=100, max_depth=3, min_samples_split=5, min_samples_leaf=5, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=50, max_depth=6, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.01, subsample=0.8, loss='exponential', max_depth=6, n_estimators=50)]

    #C_range = 10.0 ** np.arange(-2, 3)
    #gamma_range = 10.0 ** np.arange(-2, 3)
    #param_grid = {"gamma": gamma_range.tolist(), "C": C_range.tolist(), "kernel": ['rbf', 'linear', 'sigmoid', 'poly']}
    #grid = GridSearchCV(SVC(), param_grid, n_jobs=-1, verbose=2)
    #grid = RandomizedSearchCV(SVC(), param_grid, n_iter=20, n_jobs=-1, verbose=2)
    #grid.fit(X, X_target)
    #print("The best classifier is: ", grid.best_estimator_)
    #print(grid.grid_scores_)

    for j, clf in enumerate(clfs):
        print j, clf
        # pipeline with feature selection, pca and classifier
        if pcompa==True:
            #pipeline = Pipeline([('select', select), ('pca', pca), ('clf', clf)])
            pipeline = Pipeline([('pca', pca), ('clf', clf)])
        else:
            pipeline = Pipeline([('clf', clf)])

        # cross validation
        skf = StratifiedKFold(train_target, n_folds=5, random_state=1)
        
        scores = []

        for k, (train, test) in enumerate(skf):
            pipeline.fit(X_train[train], train_target[train])
            if hasattr(pipeline, 'predict_proba'):
                score = log_loss(train_target[test], pipeline.predict_proba(X_train[test])[:, 1])
            else:
                score = log_loss(train_target[test], pipeline.decision_function(X_train[test]))
            
            scores.append(score)
            
            print 'Fold: %s, Class dist: %s, Log loss: %.3f ' %(k+1, np.bincount(train_target[train]), score)

        print 'CV accuracy: %.3f +/- %.3f ' %(
                            np.mean(scores), np.std(scores))

        ## Learning curves
        #train_sizes, train_scores, test_scores = \
        #        learning_curve(estimator=pipeline,
        #                       X=X_train,
        #                       y=train_target,
        #                       train_sizes=np.linspace(.1, 1.0, 5),
        #                       cv=5,
        #                       scoring='log_loss',
        #                       n_jobs=1)

        #train_mean = np.mean(train_scores, axis=1)
        #train_std = np.std(train_scores, axis=1)

        #test_mean = np.mean(test_scores, axis=1)
        #test_std = np.std(test_scores, axis=1)
        
        #total_training_probabilities
        training_probs = pipeline.predict_proba(X)[:,1]
        training_probs_df = pd.DataFrame(data=training_probs, columns=["probability"])
        training_submission = 'CV_training_layer_' + str(layer) + '_' + str(clf.__class__.__name__) + str(j) + '_feature_' + str(f_number) + '_pca_' + str(pcompa) 
        training_probs_df.to_csv(training_submission + '.csv', index=False)

        ## test on the hold out set
        print 'Log Loss: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1]))
        
        ## test on real test set, save submission
        test_predictions = pipeline.predict_proba(Y_test)[:,1]
        test_predictions_df = pd.DataFrame(data=test_predictions, columns=["probability"])
        Y_test_id.columns = ["t_id"]
        pred_submission = pd.concat((Y_test_id, test_predictions_df), axis = 1)
        submission = 'CV_layer_' + str(layer) + '_' + str(clf.__class__.__name__) + str(j) + '_feature_' + str(f_number)
        pred_submission.to_csv(submission + '.csv', index = False)
        submission_stats = open(submission + '.txt', 'a')
        submission_stats.write(str(clf) + '\n')
        submission_stats.write('pca = ' + str(pcompa) + '\n')
        submission_stats.write('Log Loss on Validation set: %.5f ' %(log_loss(val_target, pipeline.predict_proba(X_Val)[:, 1])) + '\n')
        submission_stats.write(' ' + '\n')
        submission_stats.close()
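
This two-layer setup is essentially hand-rolled stacking: out-of-fold predictions from the layer-1 models become extra features for layer 2. Modern scikit-learn ships the same pattern as StackingClassifier; a minimal sketch, illustrative rather than a drop-in replacement for the CSV-based plumbing above:

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

stack = StackingClassifier(
    estimators=[('lr', LogisticRegression()),
                ('rf', RandomForestClassifier(n_estimators=50, random_state=1))],
    final_estimator=LogisticRegression(),
    cv=5)  # the final estimator is trained on out-of-fold layer-1 predictions
# stack.fit(X_train, train_target); stack.predict_proba(X_Val)[:, 1]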