def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import accuracy_score
    np.random.seed(1)
    dummy = DummyClassifier()
    dummy.fit(train_embeds, train_labels)
    log = SGDClassifier(loss="log_loss", n_jobs=55)  # "log" was renamed to "log_loss" in scikit-learn 1.1
    log.fit(train_embeds, train_labels)
    print("Test scores")
    print(accuracy_score(test_labels, log.predict(test_embeds)))
    print("Train scores")
    print(accuracy_score(train_labels, log.predict(train_embeds)))
    print("Random baseline")
    print(accuracy_score(test_labels, dummy.predict(test_embeds)))
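A minimal sketch of how run_regression might be invoked, using synthetic data in place of real embeddings (the shapes and class count below are illustrative assumptions, not from the original source):

import numpy as np
rng = np.random.RandomState(0)
train_embeds, test_embeds = rng.randn(800, 32), rng.randn(200, 32)
train_labels = rng.randint(0, 3, size=800)   # 3 hypothetical classes
test_labels = rng.randint(0, 3, size=200)
run_regression(train_embeds, train_labels, test_embeds, test_labels)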
def get_scores(X, y):
    nfolds = 200
    cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.2)
    dumb = DummyClassifier(strategy="most_frequent")
    # the SVC and the kernel grid originally defined here were immediately
    # overwritten, so only the logistic regression and its C grid remain
    clf = linear_model.LogisticRegression()
    param_dist = {"C": [1e6, 1e5, 1e4, 1e3, 1e2, 10, 1, 0.1, 0.01, 0.001]}
    search = GridSearchCV(clf, param_grid=param_dist, scoring="neg_mean_absolute_error")  # modern scorer name
    test_scores, train_scores, dummy_scores = [], [], []
    preds, true_labels = [], []
    for oidx, (train, test) in enumerate(cv.split(X, y)):
        y_train, y_test = y[train], y[test]
        X_train, X_test = X[train, :], X[test, :]

        search.fit(X_train, y_train)
        clf = search.best_estimator_
        print(search.best_params_)

        clf.fit(X_train, y_train)
        train_scores.append(accuracy_score(clf.predict(X_train), y_train))
        test_scores.append(accuracy_score(clf.predict(X_test), y_test))
        dumb.fit(X_train, y_train)
        dummy_scores.append(accuracy_score(dumb.predict(X_test), y_test))
        preds += list(clf.predict(X_test))
        true_labels += list(y_test)
    return test_scores, train_scores, dummy_scores, preds, true_labels
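A toy invocation of get_scores (a sketch only: it assumes the imports the function relies on, such as StratifiedShuffleSplit, GridSearchCV, linear_model, DummyClassifier, and accuracy_score, are in scope, and the synthetic arrays below are illustrative):

import numpy as np
rng = np.random.RandomState(0)
X = rng.randn(100, 4)
y = rng.randint(0, 2, size=100)
test_s, train_s, dummy_s, preds, truth = get_scores(X, y)
print(np.mean(test_s), np.mean(dummy_s))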
Example 3
def get_scores(X, y):
    nfolds = 40
    cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=.05)
    dumb = DummyClassifier(strategy='most_frequent')
    clf = svm.SVC(class_weight='balanced')  # 'auto' was renamed to 'balanced'
    param_dist = {"C": [.1, 1, 10],
                  "kernel": ['rbf', 'linear', 'poly']
                  }
    search = GridSearchCV(clf, param_grid=param_dist,
                          scoring='neg_mean_absolute_error')  # modern scorer name
    stest, strain, sdummy = [], [], []
    for nfeats in range(X.shape[1]):
        test_scores, train_scores, dummy_scores = [], [], []
        # figure out our possible feature combinations
        feats = itertools.combinations(range(X.shape[1]), nfeats + 1)
        for my_feats in feats:
            idx = np.array(my_feats)
            for oidx, (train, test) in enumerate(cv.split(X, y)):
                y_train, y_test = y[train], y[test]
                X_train, X_test = X[train, :], X[test, :]

                # tune on the same feature subset that is evaluated below
                search.fit(X_train[:, idx], y_train)
                clf = search.best_estimator_

                clf.fit(X_train[:, idx], y_train)
                train_scores.append(accuracy_score(clf.predict(X_train[:, idx]), y_train))
                test_scores.append(accuracy_score(clf.predict(X_test[:, idx]), y_test))
                dumb.fit(X_train[:, idx], y_train)
                dummy_scores.append(accuracy_score(dumb.predict(X_test[:, idx]), y_test))
        sdummy.append(np.mean(dummy_scores))
        strain.append(np.mean(train_scores))
        stest.append(np.mean(test_scores))
    return stest, strain, sdummy
def _run_dummy_detection(x_train, x_test, y_train, y_test):
    clf = DummyClassifier(strategy='most_frequent')

    print "Training Dummy..."
    clf.fit(x_train, y_train)
    print "Predicting Test Set..."
    print "Score for test set: {}".format(clf.score(x_test, y_test))
Example 5
# `strategy` is supplied by a pytest.mark.parametrize decorator not shown in this excerpt
def test_dtype_of_classifier_probas(strategy):
    y = [0, 2, 1, 1]
    X = np.zeros(4)
    model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    probas = model.fit(X, y).predict_proba(X)

    assert probas.dtype == np.float64
Example 6
def do_cross_validation(labels):
    """Perform the k-fold cross validation.

    Perform the k-fold cross validation, collect the results and return the
    single test instance predictions, as well as the classification results
    for each single fold and for the combination of all folds.

    Keyword arguments:
    labels -- all labels
    """
    skf = StratifiedKFold(n_splits=NO_OF_FOLDS)
    single_predictions = []  # Store each single classification decision

    # Store classification results for each fold and for the entire task
    # (i.e., the entire cross validation).
    classification_result = np.zeros((NO_OF_FOLDS + 1, 5))

    # DummyClassifier ignores X, so a zero placeholder of the right length
    # stands in for the feature matrix.
    dummy_X = np.zeros((len(labels), 1))
    for cur_fold, (train_idx, test_idx) in enumerate(skf.split(dummy_X, labels)):
        model = DummyClassifier(strategy='most_frequent')
        model.fit(dummy_X[train_idx], labels[train_idx])
        pred_labels = model.predict(np.zeros(labels[test_idx].shape[0]))

        fold_array = np.empty(test_idx.shape[0])
        fold_array.fill(cur_fold)
        single_predictions.append(np.transpose(np.vstack((fold_array, test_idx,
                labels[test_idx], pred_labels))))
        classification_result[cur_fold, :] = get_classification_result(cur_fold,
                labels[test_idx], pred_labels)

    single_predictions = np.vstack(single_predictions)
    return single_predictions, classification_result
Example 7
def test_dummy_classifier_on_nan_value():
    X = [[np.nan]]  # np.NaN was removed in NumPy 2.0
    y = [1]
    y_expected = [1]
    clf = DummyClassifier()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
Example 8
def test_most_frequent_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]

    clf = DummyClassifier(strategy="most_frequent", random_state=0)
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.ones(len(X)))
    _check_predict_proba(clf, X, y)
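Several of the tests on this page call a _check_predict_proba helper that the excerpts do not include. A rough reconstruction of what such a helper could verify (each row of predict_proba is a probability distribution and agrees with predict_log_proba); the actual helper in scikit-learn's test suite may differ:

import numpy as np

def _check_predict_proba(clf, X, y):
    # Hypothetical sketch, not the original helper.
    proba = clf.predict_proba(X)
    log_proba = clf.predict_log_proba(X)
    # multioutput classifiers return a list of per-output arrays
    proba = proba if isinstance(proba, list) else [proba]
    log_proba = log_proba if isinstance(log_proba, list) else [log_proba]
    for p, lp in zip(proba, log_proba):
        assert np.allclose(p.sum(axis=1), 1.0)  # each row sums to one
        assert np.allclose(np.exp(lp), p)       # log-probas match probas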
Example 9
def test_constant_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]])

    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]))
    _check_predict_proba(clf, X, y)
Example 10
def test_constant_strategy_sparse_target():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]]))

    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert sp.issparse(y_pred)  # assert_true was removed from sklearn.utils.testing
    assert_array_equal(y_pred.toarray(), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]))
Example 11
def test_dummy_classifier_on_3D_array():
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = [2, 2, 2]
    y_expected = [2, 2, 2]
    y_proba_expected = [[1], [1], [1]]
    cls = DummyClassifier()
    cls.fit(X, y)
    y_pred = cls.predict(X)
    y_pred_proba = cls.predict_proba(X)
    assert_array_equal(y_pred, y_expected)
    assert_array_equal(y_pred_proba, y_proba_expected)
Example 12
def test_most_frequent_and_prior_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])

    n_samples = len(X)

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]))
        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)
Example 13
def test_stratified_strategy():
    X = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 1000
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 3. / 5, decimal=1)
    assert_almost_equal(p[2], 2. / 5, decimal=1)
    _check_predict_proba(clf, X, y)
Example 14
def test_uniform_strategy():
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 0.5, decimal=1)
    assert_almost_equal(p[2], 0.5, decimal=1)
    _check_predict_proba(clf, X, y)
Example 15
def test_most_frequent_and_prior_strategy_sparse_target():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]]))

    n_samples = len(X)
    y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)

        y_pred = clf.predict(X)
        assert sp.issparse(y_pred)
        assert_array_equal(y_pred.toarray(), y_expected)
Example 16
def run_ML_leave_one_subject_out(config, filename, question, clf, cols, return_arr=None, return_index=-1):
    working_directory = config['DATA_DIRECTORY']
    data_X, data_y = load_data(working_directory, filename, cols, question)
    data = leave_one_subject_out(data_X, data_y, 'User')
    score = 0
    score_dummy_mf = 0
    score_dummy_sf = 0
    dummy_clf_mf = DummyClassifier(strategy='most_frequent')  # strategy is keyword-only in recent scikit-learn
    dummy_clf_sf = DummyClassifier(strategy='stratified')
    for (training_X, training_y), (testing_X, testing_y) in data:
        clf.fit(training_X, training_y)
        dummy_clf_mf.fit(training_X, training_y)
        dummy_clf_sf.fit(training_X, training_y)

        single_score = clf.score(testing_X, testing_y)
        single_score_dummy_mf = dummy_clf_mf.score(testing_X, testing_y)
        single_score_dummy_sf = dummy_clf_sf.score(testing_X, testing_y)
        #print 'Single run score: ' + ("%0.2f" % single_score.mean())
        #print 'Single run score (dummy most frequent): ' + ("%0.2f" % single_score_dummy_mf.mean())
        #print 'Single run score (dummy stratified): ' + ("%0.2f" % single_score_dummy_sf.mean())

        score = score + single_score.mean()
        score_dummy_mf = score_dummy_mf + single_score_dummy_mf.mean()
        score_dummy_sf = score_dummy_sf + single_score_dummy_sf.mean()
    score = round(float(score / len(data)), 2)
    score_dummy_mf = round(float(score_dummy_mf / len(data)), 2)
    score_dummy_sf = round(float(score_dummy_sf / len(data)), 2)
    #print 'Total score: ' + str(score)
    #print 'Total score (dummy most frequent): ' + str(score_dummy_mf)
    #print 'Total score (dummy stratified): ' + str(score_dummy_sf)
    if return_index == -1:
        return score, score_dummy_mf, score_dummy_sf
    else:
        return_arr[return_index] = (score, score_dummy_mf, score_dummy_sf)
def find_best_dummy_classification(X, y, test_size=0.3, random_state=0, thresh=0.5, target_names=None, n=1):
    """Try all dummy models."""
    X = X.reshape((len(X), -1))
    # y = y.reshape((len(y) ,-1))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    dummy_scores = []
    for i in range(n):
        for strategy in ['most_frequent', 'uniform', 'prior', 'stratified']:
            clf = DummyClassifier(strategy=strategy)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            score = clf.score(X_test, y_test)

            matthews_corrcoef = sklearn.metrics.matthews_corrcoef(y_test > thresh, y_pred > thresh)

            report = parse_classification_report(sklearn.metrics.classification_report(y_test > thresh, y_pred > thresh, target_names=target_names))

            dummy_scores.append(
                collections.OrderedDict(
                    strategy='classifier_' + strategy,
                    matthews_corrcoef=matthews_corrcoef,
                    score=score,
                    report=report
                )
            )

        for strategy in ['mean', 'median']:
            clf = DummyRegressor(strategy=strategy)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            score = clf.score(X_test, y_test)

            matthews_corrcoef = sklearn.metrics.matthews_corrcoef(y_test > thresh, y_pred > thresh)

            report = parse_classification_report(sklearn.metrics.classification_report(y_test > thresh, y_pred > thresh, target_names=target_names))

            dummy_scores.append(
                collections.OrderedDict(
                    strategy='regressor_' + strategy,
                    matthews_corrcoef=matthews_corrcoef,
                    score=score,
                    report=report
                )
            )

    df = pd.DataFrame(dummy_scores)
    df = df.sort_values('matthews_corrcoef', ascending=False)
    return df, df.iloc[0].to_dict()
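A hypothetical call to find_best_dummy_classification (assuming parse_classification_report and the sklearn/pandas/collections imports used above are available; the synthetic data is illustrative):

import numpy as np
rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = (rng.rand(200) > 0.7).astype(int)  # imbalanced binary target
df_scores, best = find_best_dummy_classification(X, y, n=3)
print(best['strategy'], best['matthews_corrcoef'])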
Example 18
def main(training_set, language, gold_standard, gazetteer):
    """ Searches for the best hyperparameters """

    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    logger.info('Building training set')
    extractor = FactExtractorFeatureExtractor(language)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    logger.info('Finalizing training set')
    x, y = extractor.get_features()

    logger.info('Searching for the best model parameters')
    svc = LinearSVC()
    search = GridSearchCV(
        svc,
        param_grid=[{
            'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
            'multi_class': ['ovr', 'crammer_singer'],
        }],
        scoring='f1_weighted',
        cv=10)
    search.fit(x, y)

    logger.info('The best model (weighted-averaged F1 of %.4f) has parameters %s',
                search.best_score_, search.best_params_)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')
    for row in gold_standard:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'])
    x_gold, y_gold = extractor.get_features()

    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x, y)

    y_dummy = dummy.predict(x_gold)
    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                metrics.f1_score(y_gold, y_dummy, average='weighted'))

    y_best = search.predict(x_gold)
    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                metrics.f1_score(y_gold, y_best, average='weighted'))
Example 19
def test_uniform_strategy_sparse_target_warning():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[2, 1], [2, 2], [1, 4], [4, 2], [1, 1]]))

    clf = DummyClassifier(strategy="uniform", random_state=0)
    assert_warns_message(UserWarning, "the uniform strategy would not save memory", clf.fit, X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 1 / 3, decimal=1)
        assert_almost_equal(p[2], 1 / 3, decimal=1)
        assert_almost_equal(p[4], 1 / 3, decimal=1)
Example 20
def main(training_set, language, gold_standard, gazetteer, n_folds, n_jobs,
         scoring, output, test, word2vec_model, independent_lus):
    """ Searches for the best hyperparameters """

    logger.info('Searching for the best model and parameters')

    training_sets = get_training_sets(training_set, language, gazetteer, word2vec_model, independent_lus)
    models = get_models(test)

    search = MultimodelGridSearchCV(*models, cv=n_folds, n_jobs=n_jobs,
                                    scoring=Scorer(scoring, True))
    (x_tr, y_tr, best_training_meta), best_score, best_params, best_model = search.fit(training_sets)

    logger.info('Evaluation Results')
    logger.info('  Best model: %s', best_model.__class__.__name__)
    logger.info('  Score: %f', best_score)
    logger.info('  Parameters: %s', best_params)
    logger.info('  Gazetteer: %s', best_training_meta['gazetteer'])
    logger.info('  Extractor: %s', best_training_meta['extractor_cls'].__name__)
    logger.info('  Extractor args: %s', best_training_meta['extractor_args'])

    joblib.dump((best_model, best_training_meta), output)
    logger.info("Done, dumped model to '%s'", output)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')

    extractor = best_training_meta['extractor']
    gazetteer = best_training_meta['gazetteer']

    extractor.start()
    for row in gold_standard:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['lu'], data['fes'],
                                   add_unknown=False, gazetteer=gazetteer)
    x_gold, y_gold = extractor.get_features(refit=False)

    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x_tr, y_tr)

    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                Scorer(scoring, True)(dummy, x_gold, y_gold))

    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                Scorer(scoring, True)(best_model, x_gold, y_gold))
Example 21
def test_uniform_strategy_multioutput():
    X = [[0]] * 4  # ignored
    y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 0.5, decimal=1)
        assert_almost_equal(p[2], 0.5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)
Example 22
def train_on_data(self, train):
    """\
    Train model on the specified training data set (which must be a loaded
    DataSet object).
    """
    log_info('Preparing data set...')
    self.data_headers = train.get_headers()
    self.attr_mask = self.get_attr_mask()
    train_vect = self.__vectorize(train)
    train_classes = self.get_classes(train)
    # if all the training data have the same class, use a dummy classifier
    if train.get_attrib(self.class_attr).num_values == 1:
        self.feature_filter = None
        self.classifier = DummyClassifier(strategy='most_frequent')
    # filter features
    log_info('Filtering...')
    train_filt = self.__filter_features(train_vect, train_classes)
    # train the classifier
    log_info('Training...')
    if self.use_weights:
        self.classifier.fit(train_filt, train_classes,
                            sample_weight=train.inst_weights)
    else:
        self.classifier.fit(train_filt, train_classes)
    self.classifier_trained = True
    log_info('Training done.')
Example 23
def test_stratified_strategy_sparse_target():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]]))

    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    assert sp.issparse(y_pred)
    y_pred = y_pred.toarray()

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 3.0 / 5, decimal=1)
        assert_almost_equal(p[0], 1.0 / 5, decimal=1)
        assert_almost_equal(p[4], 1.0 / 5, decimal=1)
Example 24
def main(args):
    X, y, names = loadData(args.mat)

    t = numpy.transpose(X)
    ls = []
    for name, col in zip(names, t):
        ls.append( (name, col) )

    for col1,col2 in itertools.combinations(ls, 2):
        name1, row1 = col1
        name2, row2 = col2
        c,p = scipy.stats.pearsonr(row1, row2)
        print('debugCor', name1, name2, c, p)


    depth = 6
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(X, y)
    dot_data = StringIO()
    tree.export_graphviz(clf, feature_names=names, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]  # pydot >= 1.2 returns a list
    graph.write_pdf(args.plotFile)

    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=442)
    for train_index, test_index in sss.split(X, y):
        clf = tree.DecisionTreeClassifier(max_depth=depth)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        print(metrics.confusion_matrix(y_test, preds))  # print instead of discarding the matrix
        print(metrics.classification_report(y_test, clf.predict(X_test)))

    print('\ndummy\n')
    for train_index, test_index in sss.split(X, y):
        clf = DummyClassifier()
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        print(metrics.confusion_matrix(y_test, preds))
        print('dummy', end=' ')
        print(metrics.classification_report(y_test, clf.predict(X_test)))
Example 25
# `strategy` is supplied by a pytest.mark.parametrize decorator not shown in this excerpt
def test_classifier_prediction_independent_of_X(strategy):
    y = [0, 2, 1, 1]
    X1 = [[0]] * 4
    clf1 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    clf1.fit(X1, y)
    predictions1 = clf1.predict(X1)

    X2 = [[1]] * 4
    clf2 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    clf2.fit(X2, y)
    predictions2 = clf2.predict(X2)

    assert_array_equal(predictions1, predictions2)
def train_clf(vectorizer, classifier, train, test, topic):    
    train_X = vectorizer.transform(train["sentence"]).toarray()     #Vectorize Training Features
    test_X = vectorizer.transform(test["sentence"]).toarray()       #Vectorize Testing Feature 
    
    train_y = train[topic]                                          #Create Training Label Vector
    test_y = test[topic]                                            #Create Testing Label Vector
    
    dummy_clf = DummyClassifier(strategy="most_frequent").fit(train_X, train_y) #Train a Dummy Classifier (for comparison)
    clf = classifier.fit(train_X, train_y)                                      #Train Actual Classifier
    

    #Test Classifiers & Output Accuracy, Confusion Matrix Statistics
    dummy_accuracy = accuracy_score(test_y, dummy_clf.predict(test_X))
    accuracy = accuracy_score(test_y, clf.predict(test_X))
    cm = confusion_matrix(test_y, clf.predict(test_X))
    
    print topic+" Dummy Accuracy: "+str(dummy_accuracy)
    print topic+" Accuracy:       "+str(accuracy)
    print topic+" Confusion Matrix: "
    print cm
    print ""
Example 27
def test_most_frequent_and_prior_strategy_with_2d_column_y():
    # non-regression test added in
    # https://github.com/scikit-learn/scikit-learn/pull/13545
    X = [[0], [0], [0], [0]]
    y_1d = [1, 2, 1, 1]
    y_2d = [[1], [2], [1], [1]]

    for strategy in ("most_frequent", "prior"):
        clf_1d = DummyClassifier(strategy=strategy, random_state=0)
        clf_2d = DummyClassifier(strategy=strategy, random_state=0)

        clf_1d.fit(X, y_1d)
        clf_2d.fit(X, y_2d)
        assert_array_equal(clf_1d.predict(X), clf_2d.predict(X))
Example 28
def svm_ssp_metrics(inputfile):
	"""
	This is essentially a helper function which returns all the metrics of an SVM's performance.
	Returns accuracy, precision, recall, F1 Score, confusion matrix

	:type inputfile: string
	:param inputfile: samples file
	"""

	x, y, labels = load_csv_svm(inputfile)
	x_scaled = preprocessing.scale(x)

	if USE_PCA:
		pca = PCA(n_components=PCA_COMPONENTS)
		x = pca.fit_transform(x_scaled)
		print(pca.explained_variance_ratio_) 
	else:
		x = x_scaled

	x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234,test_size=0.3)
	clf = svm.SVC(gamma=GAMMA, C=C, class_weight=WEIGHT, kernel=KERNEL, cache_size=400)	# gamma=.01, C=.01, 
	y_pred = clf.fit(x_train, y_train).predict(x_test)

	dummy_clf = DummyClassifier(strategy='stratified',random_state=0) # most_frequent, uniform, stratified
	dummy_y_pred = dummy_clf.fit(x_train, y_train).predict(x_test)

	print("\nClassification report for classifier %s:\n\n%s" % (clf, metrics.classification_report(y_test, y_pred)))
	print('Accuracy: {0}\n'.format(accuracy_score(y_test, y_pred)))
	print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, y_pred))
	
	if KERNEL == 'linear':
		print('\nfeature_weights: {0}'.format(clf.coef_))

	print("\nClassification report for classifier %s:\n\n%s" % (dummy_clf, metrics.classification_report(y_test, dummy_y_pred)))
	print('Accuracy: {0}\n'.format(accuracy_score(y_test, dummy_y_pred)))
	print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, dummy_y_pred))
Example 29
def test_constant_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [2, 1, 2, 2]

    clf = DummyClassifier(strategy="constant", random_state=0, constant=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.ones(len(X)))
    _check_predict_proba(clf, X, y)

    X = [[0], [0], [0], [0]]  # ignored
    y = ['two', 'one', 'two', 'two']
    clf = DummyClassifier(strategy="constant", random_state=0, constant='one')
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.array(['one'] * 4))
    _check_predict_proba(clf, X, y)
Example 30
def kfolds_evaluation(folds, model, scoring, skip_majority, x, y):
    kf = KFold(n_splits=folds, shuffle=True)
    scorer = Scorer(scoring, skip_majority)

    scores_dummy, scores_test, scores_train = [], [], []
    for train_index, test_index in kf.split(x):
        x_train, y_train = x[train_index], y[train_index]
        x_test, y_test = x[test_index], y[test_index]

        model.fit(x_train, y_train)
        dummy = DummyClassifier()
        dummy.fit(x_train, y_train)

        scores_test.append(scorer(model, x_test, y_test))
        scores_dummy.append(scorer(dummy, x_test, y_test))
        scores_train.append(scorer(model, x_train, y_train))

    logger.info("%d-folds cross evaluation results", folds)
    logger.info("    minimum test %f  dummy %f  training %f", min(scores_test), min(scores_dummy), min(scores_train))

    logger.info("    maximum test %f  dummy %f  training %f", max(scores_test), max(scores_dummy), max(scores_train))
    logger.info(
        "    average test %f  dummy %f  training %f",
        np.average(scores_test),
        np.average(scores_dummy),
        np.average(scores_train),
    )
    logger.info(
        "    median  test %f  dummy %f  training %f",
        np.median(scores_test),
        np.median(scores_dummy),
        np.median(scores_train),
    )

    logger.debug("full test scores: %s", scores_test)
    logger.debug("full dummy scores: %s", scores_dummy)
    logger.debug("full train scores: %s", scores_train)
Example 31
# Build the Pipeline combining the ColumnTransformer and the classifier
pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
    ('svm', SVC(random_state=random_state, class_weight=class_weight))
])

# Inner CV (5-times repeated stratified 2-fold GridSearchCV to find the best parameters)
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=random_state)  # inner
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=scoring, cv=rskf)

# Outer CV (stratified 5-fold cross-validation to estimate accuracy)
scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=scoring)  # outer
print('Scores: {}' .format(scores['test_score']))
print('Mean score: {}' .format(np.mean(scores['test_score'])))

# Build a dummy classifier and evaluate it with the same 5-fold cross-validation for a more realistic comparison
dummy_clf = DummyClassifier(strategy='most_frequent', random_state=random_state)
dummy_scores = cross_validate(estimator=dummy_clf, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=scoring)
print('Dummy scores: {}' .format(dummy_scores['test_score']))
print('Dummy mean score: {}' .format(np.mean(dummy_scores['test_score'])))

# Confusion matrix
results = cross_val_predict(grid_search, X=X, y=y, cv=5)
conf_m = confusion_matrix(y, results, labels=[1, 0])
print(conf_m)

# F1 score
print(f1_score(y, results))
Example 32
                                 header=0)

X = feature_vectors_df.drop(columns=['class', 'buggy'], axis=1)
y = feature_vectors_df.buggy

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=0,
                                                    test_size=0.2)
''' Default version '''
clf = DecisionTreeClassifier()
gnb = GaussianNB()
lsvc = LinearSVC()
mlpc = MLPClassifier()
rfc = RandomForestClassifier()
biased = DummyClassifier(strategy='constant', constant=1)
''' Fine-tuned version '''
# clf = DecisionTreeClassifier(criterion='entropy', splitter='random', presort=True)
# gnb = GaussianNB(var_smoothing=1e-3)
# lsvc = LinearSVC(loss='hinge', random_state=1)
# mlpc = MLPClassifier(hidden_layer_sizes=1000, activation='tanh', solver='sgd', learning_rate='adaptive')
# rfc = RandomForestClassifier(criterion='entropy', oob_score=True, warm_start=True)
# biased = DummyClassifier(strategy='constant', constant=1)

y_pred_clf = clf.fit(X_train, y_train).predict(X_test)
y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)
y_pred_lsvc = lsvc.fit(X_train, y_train).predict(X_test)
y_pred_mlpc = mlpc.fit(X_train, y_train).predict(X_test)
y_pred_rfc = rfc.fit(X_train, y_train).predict(X_test)
y_pred_biased = biased.fit(X_train, y_train).predict(X_test)
        ypred = model.predict(X[test])
        accuracies.append(accuracy_score(y[test], ypred))
    avg_accuracy = sum(accuracies) / len(accuracies)
    print("Decision Tree Model average accuracy: ", avg_accuracy)

    if curr_best[0] is None or curr_best[1] < avg_accuracy:
        curr_best = (model, avg_accuracy, lang)

    model_names.append("DecisionTree")
    model_accuracies.append(avg_accuracy)

    # Dummy Model
    accuracies = []
    kf = KFold(n_splits=5)
    for train, test in kf.split(X):
        model = DummyClassifier(strategy="most_frequent").fit(
            X[train], y[train])
        ypred = model.predict(X[test])
        accuracies.append(accuracy_score(y[test], ypred))
    avg_accuracy = sum(accuracies) / len(accuracies)
    dummy_models.append((avg_accuracy, lang))
    print("Dummy Model average accuracy: ", avg_accuracy)

    if curr_best[0] is None or curr_best[1] < avg_accuracy:
        curr_best = (model, avg_accuracy, lang)

    model_names.append("Dummy")
    model_accuracies.append(avg_accuracy)

    # RidgeClassifier Model
    best_ridge_accuracy = (-1, -1)
    mean_error = []
Example 34
      "\n\n")

## Naive Bayes Classification
print("Number of classes used:", nr_classes)
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
priors = np.array([hist / y.shape[0] for hist in histo])
nb_clf = GaussianNB(priors=priors)
nb_score = cross_val_score(nb_clf, X, binned_y, cv=3)
print("Naive Bayes Scores:\n ", nb_score)

## KNN Classification
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(nr_classes)
knn_score = cross_val_score(knn_clf, X, binned_y, cv=5)
print("\n\nK-Nearest Neighbor Scores: \n", knn_score)
#
# ## Dummy classifier
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_score = cross_val_score(dummy_clf, X, binned_y, cv=5)
print("\n\nPredict most frequent class: \n", dummy_score)

## Plot histogram of data binning
# plt.bar([x for x in range(1,histo.shape[0]+1)],histo)
# plt.xlabel('bins')
# plt.ylabel('number of training examples')
# plt.savefig('histogram.jpg')
df = pd.read_csv(
    "https://www.openml.org/data/get_csv/1595261/adult-census.csv")

# %%
target_name = "class"
target = df[target_name].to_numpy()
data = df.drop(columns=[target_name, "fnlwgt"])
numerical_columns = [
    c for c in data.columns if data[c].dtype.kind in ["i", "f"]]
data_numeric = data[numerical_columns]

# %%
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

high_revenue_clf = DummyClassifier(strategy="constant",
                                   constant=" >50K")
scores = cross_val_score(high_revenue_clf, data_numeric, target)
print(f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %%
low_revenue_clf = DummyClassifier(strategy="constant",
                                  constant=" <=50K")
scores = cross_val_score(low_revenue_clf, data_numeric, target)
print(f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %%
most_freq_revenue_clf = DummyClassifier(strategy="most_frequent")
scores = cross_val_score(most_freq_revenue_clf, data_numeric, target)
print(f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %% [markdown]
Example 36
                        random_state=1,
                        solver='liblinear',
                        multi_class='ovr')
lr.fit(train_x, train_y)
accuracy = lr.score(validate_x, validate_y)
print("Accuracy: %.3f" % accuracy)

print("\nModel 4: SVM, C=1.0")
svm = SVC(kernel='linear', C=1.0, random_state=1)
svm.fit(train_x, train_y)
accuracy = svm.score(validate_x, validate_y)
print("Accuracy: %.3f" % accuracy)

print("\n>>> Beginning Baseline model training...")
print("Baseline Model 1: Strategy = \"stratified\"")
dummy = DummyClassifier(strategy="stratified")
dummy.fit(train_x, train_y)
dummy.predict(validate_x)
accuracy = dummy.score(validate_x, validate_y)
print("Accuracy : %.3f" % accuracy)

print("\nBaseline Model 2: Strategy = \"uniform\"")
dummy = DummyClassifier(strategy="uniform")
dummy.fit(train_x, train_y)
dummy.predict(validate_x)
accuracy = dummy.score(validate_x, validate_y)
print("Accuracy : %.3f" % accuracy)

print(
    "\n>>> Model Analysis: Logistic Regression and SVM give very similar results; best model is most likely Model 1"
)
Example 37
# Init selected scenario
if SCENARIO == 1:
    scenario_name = 'naive_bayes'
    clf = GaussianNB()
elif SCENARIO == 2:
    scenario_name = 'knn'
    clf = KNeighborsClassifier(n_neighbors=KNN_N_NEIGH)
elif SCENARIO == 3:
    scenario_name = 'random_forrest'
    clf = RandomForestClassifier()
elif SCENARIO == 4:
    scenario_name = 'adaboost'
    clf = AdaBoostClassifier()
elif SCENARIO == 5:
    scenario_name = 'decision_tree'
    clf = DecisionTreeClassifier()
elif SCENARIO == 6:
    scenario_name = 'most_frequent'
    clf = DummyClassifier(strategy='most_frequent')
elif SCENARIO == 7:
    scenario_name = 'random'
    clf = DummyClassifier(strategy='uniform')
else:
    print("\n!!!")
    print("Selected invalid scenario, defaulting to Naive Bayes.")
    print("!!!\n")
    scenario_name = 'naive_bayes'
    clf = GaussianNB()

evaluate_model(clf, x, y, scenario_name, 'default_params', SAMPLE_SIZE, scale)
rfresults = {}
for evalname, _ in evaluation_scores:  # avoid shadowing the built-in `eval`
    dummyresults[evalname] = []
    lrresults[evalname] = []
    lsvmresults[evalname] = []
    mlpresults[evalname] = []
    rfresults[evalname] = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    print()

    dummy_clf = DummyClassifier(strategy="most_frequent")
    dummy_clf.fit(X_train, y_train)
    y_pred_dummy = dummy_clf.predict(X_test)

    for (evalname, evaluator) in evaluation_scores:
        print("DUMMY " + evalname + ":", evaluator(y_test, y_pred_dummy))
        dummyresults[evalname].append(evaluator(y_test, y_pred_dummy))

    lr_clf = LogisticRegression(solver="lbfgs")
    lr_clf.fit(X_train, y_train)
    y_pred_lr = lr_clf.predict(X_test)
    for evalname, evaluator in evaluation_scores:
        print("Logistic regression " + evalname + ":",
              evaluator(y_test, y_pred_lr))
        lrresults[evalname].append(evaluator(y_test, y_pred_lr))
def calculateBaselines(df, target, intervalType, MachineLearningMethod,
                       FeeModel):
    X = df[["Open", "Low", "High", "Close", "Volume"]]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        target,
                                                        test_size=0.2,
                                                        train_size=0.8,
                                                        shuffle=False,
                                                        random_state=42)

    # BASELINES FOR REGRESSION
    if MachineLearningMethod == "Regression":
        print(
            '------------- No-Change baseline: Predict same Target as last timestamp ------------- '
        )
        # No-change baseline:
        # add a 'Predicted' column holding the previous timestamp's target
        df['Predicted'] = target.shift(1)
        y_predict_NOCHANGE = df['Predicted'][len(X_train):]
        MAE_nochange = metrics.mean_absolute_error(y_test, y_predict_NOCHANGE)
        print('Mean Absolute Error:', MAE_nochange)
        print('Mean Squared Error:',
              metrics.mean_squared_error(y_test, y_predict_NOCHANGE))
        print('Root Mean Squared Error:',
              np.sqrt(metrics.mean_squared_error(y_test, y_predict_NOCHANGE)))

        print(
            '------------- DUMMY BASELINE MODEL “mean”: always predicts the mean of the training set ------------- '
        )
        # Dummy classifier to create baseline to compare to the real models
        dummy_clf = DummyRegressor(strategy="mean")
        dummy_clf_ = dummy_clf.fit(X_train, y_train)
        y_predict_MEAN = dummy_clf_.predict(X_test)

        MAE_mean = metrics.mean_absolute_error(y_test, y_predict_MEAN)
        print('Mean Absolute Error:', MAE_mean)
        print('Mean Squared Error:',
              metrics.mean_squared_error(y_test, y_predict_MEAN))
        print('Root Mean Squared Error:',
              np.sqrt(metrics.mean_squared_error(y_test, y_predict_MEAN)))

        print(
            '------------- DUMMY BASELINE MODEL “median”: always predicts the median of the training set ------------- '
        )
        dummy_clf = DummyRegressor(strategy="median")
        dummy_clf_ = dummy_clf.fit(X_train, y_train)
        y_predict_median = dummy_clf.predict(X_test)

        MAE_median = metrics.mean_absolute_error(y_test, y_predict_median)
        print('Mean Absolute Error:', MAE_median)
        print('Mean Squared Error:',
              metrics.mean_squared_error(y_test, y_predict_median))
        print('Root Mean Squared Error:',
              np.sqrt(metrics.mean_squared_error(y_test, y_predict_median)))
    # BASELINES FOR CLASSIFICATION
    elif MachineLearningMethod == "Classification":
        print(
            '------------- No-Change baseline: Predict same Target as last timestamp ------------- '
        )
        # No-change baseline:
        # predict True when the previous timestamp's target was also True
        df['Predicted'] = np.where(target.shift(1) == True, True, False)
        y_predict_NOCHANGE = df['Predicted'][len(X_train):]
        print('Accuracy score:')
        accuracy_nochange = round(
            accuracy_score(y_test, y_predict_NOCHANGE) * 100, 2)
        print(accuracy_nochange)
        print('Classification report:')
        print(classification_report(y_test, y_predict_NOCHANGE))

        print(
            '------------- DUMMY BASELINE MODEL (Stratified) generates predictions by respecting the training set’s class distribution. Random ------------- '
        )
        # Dummy classifier to create baseline to compare to the real models
        dummy_clf = DummyClassifier(strategy="stratified", random_state=42)
        dummy_clf_ = dummy_clf.fit(X_train, y_train)
        y_predict_STRATIFIED = dummy_clf.predict(X_test)

        print('Accuracy score:')
        accuracy_stratified = round(
            accuracy_score(y_test, y_predict_STRATIFIED) * 100, 2)
        print(accuracy_stratified)
        print('Classification report:')
        print(classification_report(y_test, y_predict_STRATIFIED))

        print(
            '------------- DUMMY BASELINE MODEL (Most Frequent)  always predicts the most frequent label in the training set. ------------- '
        )
        dummy_clf = DummyClassifier(strategy="most_frequent", random_state=42)
        dummy_clf_ = dummy_clf.fit(X_train, y_train)
        y_predict_MOSTFREQ = dummy_clf.predict(X_test)

        print('Accuracy score:')
        accuracy_mostfrequent = round(
            accuracy_score(y_test, y_predict_MOSTFREQ) * 100, 2)
        print(accuracy_mostfrequent)
        print('Classification report:')
        print(classification_report(y_test, y_predict_MOSTFREQ))

        print(
            '------------- DUMMY BASELINE MODEL (Prior) always predicts the class that maximizes the class prior (like “most_frequent”) and predict_proba returns the class prior. ------------- '
        )
        dummy_clf = DummyClassifier(strategy="prior", random_state=42)
        dummy_clf_ = dummy_clf.fit(X_train, y_train)
        y_predict_PRIOR = dummy_clf.predict(X_test)
        print('Accuracy score:')
        accuracy_prior = round(
            accuracy_score(y_test, y_predict_PRIOR) * 100, 2)
        print(accuracy_prior)
        print('Classification report:')
        print(classification_report(y_test, y_predict_PRIOR))

    # Build a Matplotlib boxplot from the baseline results plus the results of all our own models
    if MachineLearningMethod == "Regression":
        if intervalType == "Day":
            DEFAULT = readTxtResults(
                '../accuraciesOutput/default/BTCUSD_1Day.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            RF_OPTIMIZED = readTxtResults(
                '../accuraciesOutput/rf_optimized/BTCUSD_1Day.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            NO_RELATIONS = readTxtResults(
                '../accuraciesOutput/no_relations/BTCUSD_1Day.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            RELATIONS = readTxtResults(
                '../accuraciesOutput/relations/BTCUSD_1Day.csv_Regression_Difference_TRUE_RELATIONS.txt'
            )
        elif intervalType == "Hour":
            DEFAULT = readTxtResults(
                '../accuraciesOutput/default/bitfinex_tBTCUSD_1h.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            RF_OPTIMIZED = readTxtResults(
                '../accuraciesOutput/rf_optimized/bitfinex_tBTCUSD_1h.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            NO_RELATIONS = readTxtResults(
                '../accuraciesOutput/no_relations/bitfinex_tBTCUSD_1h.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            RELATIONS = readTxtResults(
                '../accuraciesOutput/relations/bitfinex_tBTCUSD_1h.csv_Regression_Difference_TRUE_RELATIONS.txt'
            )
        elif intervalType == "Minute":
            DEFAULT = readTxtResults(
                '../accuraciesOutput/default/bitfinex_tBTCUSD_1m.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            RF_OPTIMIZED = readTxtResults(
                '../accuraciesOutput/rf_optimized/bitfinex_tBTCUSD_1m.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            NO_RELATIONS = readTxtResults(
                '../accuraciesOutput/no_relations/bitfinex_tBTCUSD_1m.csv_Regression_Difference_FALSE_RELATIONS.txt'
            )
            RELATIONS = readTxtResults(
                '../accuraciesOutput/relations/bitfinex_tBTCUSD_1m.csv_Regression_Difference_TRUE_RELATIONS.txt'
            )
        xLabelNames = [
            'Default', 'RF Optimized', '68 Hyp..', '14 Hyp..', 'No-change',
            'Mean', 'Median'
        ]
        data = [
            DEFAULT, RF_OPTIMIZED, NO_RELATIONS, RELATIONS, [MAE_nochange],
            [MAE_mean], [MAE_median]
        ]
    elif MachineLearningMethod == "Classification":
        if FeeModel == "OFF":
            if intervalType == "Day":
                DEFAULT = readTxtResults(
                    '../accuraciesOutput/default/BTCUSD_1Day.csv_Classification_FALSE_RELATIONS.txt'
                )
                RF_OPTIMIZED = readTxtResults(
                    '../accuraciesOutput/rf_optimized/BTCUSD_1Day.csv_Classification_FALSE_RELATIONS.txt'
                )
                NO_RELATIONS = readTxtResults(
                    '../accuraciesOutput/no_relations/BTCUSD_1Day.csv_Classification_FALSE_RELATIONS.txt'
                )
                RELATIONS = readTxtResults(
                    '../accuraciesOutput/relations/BTCUSD_1Day.csv_Classification_TRUE_RELATIONS.txt'
                )
            elif intervalType == "Hour":
                DEFAULT = readTxtResults(
                    '../accuraciesOutput/default/bitfinex_tBTCUSD_1h.csv_Classification_FALSE_RELATIONS.txt'
                )
                RF_OPTIMIZED = readTxtResults(
                    '../accuraciesOutput/rf_optimized/bitfinex_tBTCUSD_1h.csv_Classification_FALSE_RELATIONS.txt'
                )
                NO_RELATIONS = readTxtResults(
                    '../accuraciesOutput/no_relations/bitfinex_tBTCUSD_1h.csv_Classification_FALSE_RELATIONS.txt'
                )
                RELATIONS = readTxtResults(
                    '../accuraciesOutput/relations/bitfinex_tBTCUSD_1h.csv_Classification_TRUE_RELATIONS.txt'
                )
            elif intervalType == "Minute":
                DEFAULT = readTxtResults(
                    '../accuraciesOutput/default/bitfinex_tBTCUSD_1m.csv_Classification_FALSE_RELATIONS.txt'
                )
                RF_OPTIMIZED = readTxtResults(
                    '../accuraciesOutput/rf_optimized/bitfinex_tBTCUSD_1m.csv_Classification_FALSE_RELATIONS.txt'
                )
                NO_RELATIONS = readTxtResults(
                    '../accuraciesOutput/no_relations/bitfinex_tBTCUSD_1m.csv_Classification_FALSE_RELATIONS.txt'
                )
                RELATIONS = readTxtResults(
                    '../accuraciesOutput/relations/bitfinex_tBTCUSD_1m.csv_Classification_TRUE_RELATIONS.txt'
                )
        elif FeeModel == "ON":
            if intervalType == "Day":
                DEFAULT = readTxtResults(
                    '../accuraciesOutput_feeModel/default/BTCUSD_1Day.csv_Classification_FALSE_RELATIONS.txt'
                )
                RF_OPTIMIZED = readTxtResults(
                    '../accuraciesOutput_feeModel/rf_optimized/BTCUSD_1Day.csv_Classification_FALSE_RELATIONS.txt'
                )
                NO_RELATIONS = readTxtResults(
                    '../accuraciesOutput_feeModel/no_relations/BTCUSD_1Day.csv_Classification_FALSE_RELATIONS.txt'
                )
                RELATIONS = readTxtResults(
                    '../accuraciesOutput_feeModel/relations/BTCUSD_1Day.csv_Classification_TRUE_RELATIONS.txt'
                )
            elif intervalType == "Hour":
                DEFAULT = readTxtResults(
                    '../accuraciesOutput_feeModel/default/bitfinex_tBTCUSD_1h.csv_Classification_FALSE_RELATIONS.txt'
                )
                RF_OPTIMIZED = readTxtResults(
                    '../accuraciesOutput_feeModel/rf_optimized/bitfinex_tBTCUSD_1h.csv_Classification_FALSE_RELATIONS.txt'
                )
                NO_RELATIONS = readTxtResults(
                    '../accuraciesOutput_feeModel/no_relations/bitfinex_tBTCUSD_1h.csv_Classification_FALSE_RELATIONS.txt'
                )
                RELATIONS = readTxtResults(
                    '../accuraciesOutput_feeModel/relations/bitfinex_tBTCUSD_1h.csv_Classification_TRUE_RELATIONS.txt'
                )
            elif intervalType == "Minute":
                DEFAULT = readTxtResults(
                    '../accuraciesOutput_feeModel/default/bitfinex_tBTCUSD_1m.csv_Classification_FALSE_RELATIONS.txt'
                )
                RF_OPTIMIZED = readTxtResults(
                    '../accuraciesOutput_feeModel/rf_optimized/bitfinex_tBTCUSD_1m.csv_Classification_FALSE_RELATIONS.txt'
                )
                NO_RELATIONS = readTxtResults(
                    '../accuraciesOutput_feeModel/no_relations/bitfinex_tBTCUSD_1m.csv_Classification_FALSE_RELATIONS.txt'
                )
                RELATIONS = readTxtResults(
                    '../accuraciesOutput_feeModel/relations/bitfinex_tBTCUSD_1m.csv_Classification_TRUE_RELATIONS.txt'
                )
        xLabelNames = [
            'Default', 'RF Optimized', '68 Hyp..', '14 Hyp..', 'No-change',
            'Stratified', 'Most Frequent', 'Prior'
        ]
        data = [
            DEFAULT, RF_OPTIMIZED, NO_RELATIONS,
            RELATIONS, [accuracy_nochange], [accuracy_stratified],
            [accuracy_mostfrequent], [accuracy_prior]
        ]
    # Export the predicted model
    df = df[len(X_train):]
    if MachineLearningMethod == "Classification":
        df['Predicted'] = y_predict_MOSTFREQ
    elif MachineLearningMethod == "Regression":
        df['Predicted'] = y_predict_MEAN
    dfClose = df['Close']
    df['Change'] = dfClose.pct_change(periods=1)  # Contains percentage change
    df = df[['Timestamp', 'Close', 'Change', 'Target', 'Predicted']]
    nameOfExportedModel = intervalType + "_" + MachineLearningMethod
    df.to_csv("../PredictedModels/baselineModels/" + nameOfExportedModel +
              ".csv")

    matplotlib.use("pgf")
    matplotlib.rcParams.update({
        "pgf.texsystem": "pdflatex",
        'font.family': 'serif',
        'text.usetex': True,
        'pgf.rcfonts': False,
    })

    fig, ax = plt.subplots(figsize=(6.69, 4))
    ax.set_xticklabels(xLabelNames)
    ax.set_title(MachineLearningMethod + ' Baselines for interval: ' +
                 intervalType)
    ax.boxplot(data)
    #plt.savefig('histogram.png')
    plt.savefig('histogram.pgf')
    plt.show()
    exit()
Example 40
imprime_titulo("Comparación de clasificación")

randomf_clasif = [("Random Forest", RandomForestClassifier(n_estimators=100))]

clasificador_randomf = Pipeline(preprocesado + randomf_clasif)

with mensaje("Ajustando modelo de clasificación Random Forest"):
  clasificador_randomf.fit(digits_tra_x, digits_tra_y)

y_clasif_randomf = clasificador_randomf.predict(digits_test_x)
muestra_confusion(digits_test_y, y_clasif_randomf, "Random Forest")

estima_error_clasif(clasificador_randomf, digits_tra_x, digits_tra_y,
                    digits_test_x, digits_test_y, "RandomForest")

dummy_clasif = DummyClassifier(strategy="stratified")
dummy_clasif.fit(digits_tra_x, digits_tra_y)
estima_error_clasif(dummy_clasif, digits_tra_x, digits_tra_y, digits_test_x,
                    digits_test_y, "Estratificado (Dummy)")
espera()

imprime_titulo("Comparación de regresión")

randomf_regr = [("Random Forest", RandomForestRegressor(n_estimators=100))]
regresor_randomf = Pipeline(preprocesado + randomf_regr)

with mensaje("Ajustando modelo de regresión Random Forest"):
  regresor_randomf.fit(airfoil_tra_x, airfoil_tra_y)

estima_error_regresion(regresor_randomf, airfoil_tra_x, airfoil_tra_y,
                       airfoil_test_x, airfoil_test_y, "RandomForest")
	print(spam)

	print(messages.groupby('label').describe())

	print('\nThe dataset is imbalanced: ham - 4825, spam - 747; there are far fewer spam examples')

	# convert str to int (ham -> 0, spam -> 1)
	messages['label'] = messages['label'].map({'ham': 0, 'spam': 1}).astype(int)

	# Vectorization
	bow = CountVectorizer()
	bow.fit_transform(messages['message'])
	bowed_messages = bow.transform(messages['message'])

	# Fit the DummyClassifier
	clf = DummyClassifier(strategy='most_frequent', random_state=0)
	clf = clf.fit(bowed_messages, messages['label'])

	# Report the DummyClassifier results
	print(classification_report(messages['label'], clf.predict(bowed_messages)))
	print('A dummy classifier that assigns the ham class to every new observation gets 75% precision, 87% recall and an 80% f-score')

	# print('\nNaive Bayes 1')
	# naive_model = MultinomialNB()
	# naive_model.fit(bowed_messages, messages['label'])
	# # print(len(msg_train), len(msg_test))
	# cv_results = cross_val_score(naive_model, bowed_messages, messages['label'], cv=10, scoring='accuracy')
	# print(cv_results.mean(), cv_results.std())
	# print(classification_report(messages['label'], naive_model.predict(bowed_messages)))

	msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2)  # split the data 80:20
Example 42
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier

from nlp4musa2020.dataloaders.alf200k import ALF200KLoader, genre_target_labels
import nlp4musa2020.evaluators as evaluators
from nlp4musa2020.models.simplenn_genre import SimpleGenreNN

dataloader = ALF200KLoader('data/processed/dataset-lfm-genres.pickle',
                           load_feature_groups=[
                               'explicitness',
                           ],
                           text_vectorizers=None,
                           target=genre_target_labels())

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DummyClassifier(strategy="uniform")),
])

evaluator = GridEvaluator(
    parameters={
        "model__random_state": [42],
    },
    grid_parameters=evaluators.grid_parameters_genres(),
)

result_handlers = [
    result_handlers.print_gridsearch_results,
]
# In[107]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# ## Dummy classifier

# In[109]:

clfDummy = DummyClassifier(strategy='most_frequent', random_state=0)
clfDummy.fit(X_train, y_train)

# In[110]:

clfDummy.score(X_test, y_test)

# In[111]:

dump(clfDummy, 'dummyClf.joblib')

# ## Ridge classifier

# In[122]:

#hyperparameter tuning
Example 44
if __name__ == '__main__':

    ###########################################################
    # Settings

    #!!!
    USED_EXAMPLES_NUMBER = None  # 'None' means that all examples are used; otherwise randomly selected

    #!!!
    OBJECTIVE_NAME = 'Sex'  # e.g. 'BMIgr', 'Sex', 'cl_sleep_interval' #!!!!
    sample_name = OBJECTIVE_NAME + '_1'  # train-test filename
    SEED = 0

    classifiers = [
        (
            "Dummy", DummyClassifier(strategy='stratified')
        ),  # see http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html
        ("Nearest Neighbors", KNeighborsClassifier(3)),
        # ("Linear SVM", SVC(kernel="linear", C=0.025)),
        # ("RBF SVM", SVC(gamma=2, C=1)),
        # ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
        ("Random Forest",
         RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
        # ("AdaBoost", AdaBoostClassifier()),
        ("Naive Bayes", GaussianNB())
    ]  # TODO: xgboost

    ###############################################################
    # Initial configuration
    np.random.seed(SEED)
    logg.configure_logging(
Example 45
df['Target'] = 0
df.loc[df['FutureReturn'] > buy_threshold, 'Target'] = 1
df.loc[df['FutureReturn'] < sell_threshold, 'Target'] = -1

# Train/test split
X = df.iloc[:, :-2]  # all columns except Target (.ix was removed from pandas)
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)  # Default: train 75%, test 25%

# Fit ensemble classifier
clf = VotingClassifier([('knn', KNeighborsClassifier()),
                        ('rfor', RandomForestClassifier(random_state=0)),
                        ('lsvc', LinearSVC())]).fit(X_train, y_train)

dummy = DummyClassifier(strategy='stratified').fit(X_train, y_train)

print('Accuracy of Ensemble classifier on training set: {:.2f}'.format(
    clf.score(X_train, y_train)))
print('Accuracy of Ensemble classifier on test set: {:.2f}'.format(
    clf.score(X_test, y_test)))
print('Prediction Spread:', Counter(clf.predict(X_test)))
print('Accuracy of Dummy classifier on test set: {:.2f}'.format(
    dummy.score(X_test, y_test)))
print('Dummy Prediction Spread:', Counter(dummy.predict(X_test)))

# Plot predictions
X_test = X_test.sort_index()
df['predictions'] = pd.DataFrame(clf.predict(X_test), index=X_test.index)

fig, ax = plt.subplots()
Example n. 46
                                                    k: int(v / self.factor)
                                                    for k, v in counts.items()
                                                })
        X_resampled, y_resampled = self.undersampler_.fit_resample(X, y)
        if self.oversampler is not None:
            self.oversampler_ = clone(self.oversampler).set_params(
                random_state=self.random_state, sampling_strategy=dict(counts))
            X_resampled, y_resampled = self.oversampler_.fit_resample(
                X_resampled, y_resampled)
        return X_resampled, y_resampled
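
# Added standalone sketch (names and ratios are illustrative, not from the
# original source): the same undersample-then-oversample flow built from
# stock imbalanced-learn components.
from collections import Counter

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=0)
print('before:', Counter(y_demo))
# shrink the majority class until minority/majority = 0.5 ...
X_res, y_res = RandomUnderSampler(sampling_strategy=0.5,
                                  random_state=0).fit_resample(X_demo, y_demo)
# ... then synthesize minority samples up to parity
X_res, y_res = SMOTE(random_state=0).fit_resample(X_res, y_res)
print('after:', Counter(y_res))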


SCORERS['geometric_mean_score'] = make_scorer(geometric_mean_score)
CONFIG = {
    'classifiers': [
        ('CONSTANT CLASSIFIER', DummyClassifier(strategy='constant',
                                                constant=0), {}),
        ('LR', LogisticRegression(solver='liblinear', multi_class='auto'), {}),
        ('KNN', KNeighborsClassifier(), {
            'n_neighbors': [3, 5]
        }),
        ('DT', DecisionTreeClassifier(), {
            'max_depth': [3, 6]
        }),
        ('GBC', GradientBoostingClassifier(), {
            'max_depth': [3, 6],
            'n_estimators': [50, 100]
        }),
    ],
    'scoring': ['accuracy', 'geometric_mean_score'],
    'n_splits': 5,
Example n. 47
benchmark_dir = os.environ["AMM_DATASET_DIR"]

for p in BENCHMARK_FULL_SET:
    pname = p["name"]
    print("Loading {}".format(pname))
    df = pd.read_pickle(os.path.join(benchmark_dir, p["data_pickle"]))
    target = p["target"]
    ltype = p["problem_type"]
    if ltype == AMM_REG_NAME:
        kf = KFold(n_splits=5, random_state=18012019, shuffle=True)
        estimator = DummyRegressor(strategy="mean")
        scoring = "neg_mean_absolute_error"
        multiplier = -1
    elif ltype == AMM_CLF_NAME:
        kf = StratifiedKFold(n_splits=5, random_state=18012019, shuffle=True)
        estimator = DummyClassifier(strategy="stratified")
        multiplier = 1
        scoring = "roc_auc"
    else:
        raise ValueError("problem type {} is not known.".format(ltype))

    cvs = cross_val_score(estimator,
                          df.drop(columns=[target]),
                          y=df[target],
                          scoring=scoring,
                          cv=kf)

    cvs = multiplier * cvs
    mean_cvs = np.mean(cvs)
    print(pname, mean_cvs)
Example n. 48
def test_classifier_exceptions():
    clf = DummyClassifier(strategy="unknown")
    assert_raises(ValueError, clf.fit, [], [])

    assert_raises(ValueError, clf.predict, [])
    assert_raises(ValueError, clf.predict_proba, [])
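
# Equivalent check with pytest (added sketch; recent scikit-learn raises a
# ValueError subclass for an unknown strategy, so the assertion still holds):
import pytest

def test_classifier_unknown_strategy_pytest():
    clf = DummyClassifier(strategy="unknown")
    with pytest.raises(ValueError):
        clf.fit([[0], [1]], [0, 1])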

from sklearn.feature_extraction.text import CountVectorizer

c = CountVectorizer(stop_words='english')
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

predict(X, y, c, lr)
"""#### Accuracy is around 93.9% - not bad. However we notice that some of those significant coefficients are not meaningful, e.g. 280mg."""

from sklearn.dummy import DummyClassifier

### calling function for dummy classifier
text_fit(X, y, c, DummyClassifier(), 0)
"""#### Logistic regression model on TFIDF"""

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
text_fit(X, y, tfidf, LogisticRegression())

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
predict(X, y, tfidf, LogisticRegression())
"""Accurany is roughly the same - 93.5%. However we notice that the significant words make much more sense now, with higher coefficient magnitude as well!"""
"""#### Upvote prediction

We will be focusing on score 5 reviews, and get rid of comments with neutral votes
Example n. 50
def test_classifier_score_with_None(y, y_test):
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(None, y)
    assert_equal(clf.score(None, y_test), 0.5)
Example n. 51
knn.fit(X_train, y_train)
prediction = knn.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, prediction[:, 1])
auc_score = roc_auc_score(y_test, prediction[:, 1])
print("AUC Score:", auc_score)
plt.plot(fpr, tpr, color='red', label='K-Neighbours')

knn.fit(X_train, y_train)
preds_train = knn.predict(X_train)
preds_test = knn.predict(X_test)

print_results(preds_train, y_train, "KNN train")
print_results(preds_test, y_test, "KNN test")

# baseline classifier
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
preds_train = dummy.predict(X_train)
preds_test = dummy.predict(X_test)

print_results(preds_train, y_train, "Dummy train")
print_results(preds_test, y_test, "Dummy test")

# baseline confusion matrix for the reference point on the ROC plot
matrix = confusion_matrix(y_train, preds_train)

most_freq_fpr = matrix[0][1] / (matrix[0][1] + matrix[0][0])  # FP / (FP + TN)
most_freq_tpr = matrix[1][1] / (matrix[1][1] + matrix[1][0])  # TP / (TP + FN)
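
# Equivalent unpacking via ravel() (added sketch): for a binary problem
# sklearn's confusion matrix is [[TN, FP], [FN, TP]], so ravel() yields the
# four cells in exactly that order.
tn, fp, fn, tp = matrix.ravel()
assert most_freq_fpr == fp / (fp + tn)
assert most_freq_tpr == tp / (tp + fn)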

plt.plot(most_freq_fpr,
         most_freq_tpr,
Example n. 52
def test_string_labels():
    X = [[0]] * 5
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), ["paris"] * 5)
import pickle

with open('multi_tfidf.sav', 'wb') as f:
    pickle.dump(clf, f)
pred_tfidf_balanced = clf.predict(vect_tfidf.transform(X_test))
print(" Tf-idf, Balanced accuracy score = " + str(balanced_accuracy_score(y_test, pred_tfidf_balanced)))
print(" Tf-idf, Accuracy score = " + str(accuracy_score(y_test, pred_tfidf_balanced)))
report_tfidf = classification_report(y_test, pred_tfidf_balanced)

# count balanced accuracy: 42
# count accuracy: 43.5
# tf-idf balanced accuracy: 43.5
# tf-idf accuracy: 41.5

'''
-------------------- Baseline Models -------------------
'''

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_clf.predict(X_train)
dummy_clf.score(X_test, y_test)

# Multilabel
# Most frequent: 19.7 %
# Stratified: 14.2 %
# Uniform: 11.3 %

# Binary 50.46 %
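
# Added sketch (not in the original script): the baseline numbers above can
# be approximated by looping over the built-in strategies.
for strategy in ('most_frequent', 'stratified', 'uniform'):
    baseline = DummyClassifier(strategy=strategy, random_state=0)
    baseline.fit(X_train, y_train)
    print(strategy, round(100 * baseline.score(X_test, y_test), 1), '%')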


'''
------------------- Feature importance -----------------
'''
    action = args.action

    # Get the full data set, instances, and outcomes.
    dataset, instances, outcomes = get_data(filename)

    # Organize the data
    instances, outcomes = data_organizer(instances, outcomes)

    assert len(instances) == len(outcomes)

    # Generate labels array from the outcome data
    labels = generate_labels(outcomes)

    # Split data into training and dev sets
    size_of_test_set = 0.3
    instance_train, instance_test, labels_train, labels_test = \
        train_test_split(instances, labels, test_size=size_of_test_set)

    assert len(instance_train) == len(labels_train) and len(
        instance_test) == len(labels_test)

    # Classify the training set
    classifier = NBclassify(instance_train, labels_train)

    # Baseline
    baseline = DummyClassifier(strategy='uniform')
    dumb_clf = baseline.fit(instance_train, labels_train)

    # Evaluate the classification
    evaluate(classifier, dumb_clf, instance_test, labels_test)
Esempio n. 55
0
dataset=load_digits()
X,y = dataset.data, dataset.target
for class_name,class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name, class_count)
y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced !=1] = 0
print('Original labels:\t', y[1:30])
print('New binary labels:\t', y_binary_imbalanced[1:30])
np.bincount(y_binary_imbalanced)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)
svm = SVC(kernel="rbf", C=1).fit(X_train,y_train)
svm.score(X_test,y_test)

# Dummy classifiers serve as a sanity check, not as real classifiers
from sklearn.dummy import DummyClassifier
dummy_majority = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
# strategies: most_frequent, stratified (random draws from the training class
# distribution), uniform (uniform random), constant (useful when the positive
# class is the minority)

y_dummy_pred=dummy_majority.predict(X_test)
y_dummy_pred
dummy_majority.score(X_test,y_test)
### dummy classifiers: for sanity checks
### dummy regressors
## strategies: mean, median, quantile, constant
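
# Minimal DummyRegressor sketch (added illustration; fit on the same arrays
# purely to show the API, even though the target here is binary):
from sklearn.dummy import DummyRegressor
dummy_median = DummyRegressor(strategy="median").fit(X_train, y_train)
dummy_median.predict(X_test[:3])  # constant prediction: the training median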

# confusion matrix
from sklearn.metrics import confusion_matrix
dummy_majority = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_majority_pred = dummy_majority.predict(X_test)
confusion_dummy = confusion_matrix(y_test, y_majority_pred)
confusion_dummy  # layout: [[TN, FP], [FN, TP]]
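
# Tiny added check of that layout (illustration only): with
# y_true = [0, 0, 1, 1] and y_pred = [0, 1, 1, 1] there is one TN, one FP,
# zero FN and two TP.
confusion_matrix([0, 0, 1, 1], [0, 1, 1, 1])  # -> [[1, 1], [0, 2]]
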
Example n. 56
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score  # needed for the score below

dummy = DummyClassifier()
dummy.fit(X_train, y_train)
print(accuracy_score(y_train, dummy.predict(X_train)))
Example n. 57
    def fit(self, X, y, sample_weight=None):
        """Fit all base estimators.

        Parameters
        ----------
        X : 2d numpy array or sparse matrix of shape [n_samples, n_features]
            Training data
        y : 1d numpy array of shape [n_samples]
            Target values.
        sample_weight : 1d numpy array of shape [n_samples]
            Individual weights for each sample.
            Passed to fit method of each estimator.
            Note: will be split automatically for each fold.

        Returns
        -------
        self : object
            Fitted StackingTransformer instance.
        """
        # ---------------------------------------------------------------------
        # Validation
        # ---------------------------------------------------------------------

        # ---------------------------------------------------------------------
        # Check input data
        # ---------------------------------------------------------------------
        # Check X and y
        # ``check_estimator`` does not allow ``force_all_finite=False``
        X, y = check_X_y(
            X,
            y,
            accept_sparse=['csr'],  # allow csr, cast all others to csr
            force_all_finite=True,  # do not allow nan and inf
            multi_output=False)  # allow only one column in y_train

        # Check X and sample_weight
        # X is already checked, but we need it to compare length of sample_weight
        if sample_weight is not None:
            X, sample_weight = check_X_y(X,
                                         sample_weight,
                                         accept_sparse=['csr'],
                                         force_all_finite=True,
                                         multi_output=False)

        # ---------------------------------------------------------------------
        # Check ``estimators``
        # ---------------------------------------------------------------------
        if self.estimators is None:
            if self.regression:
                self.estimators_ = [('dumregr',
                                     DummyRegressor(strategy='constant',
                                                    constant=5.5))]
            else:
                self.estimators_ = [('dumclf',
                                     DummyClassifier(strategy='constant',
                                                     constant=1))]
            # warnings.warn('No estimators were specified. '
            #               'Using single dummy estimator as demo.', UserWarning)
        else:
            if 0 == len(self.estimators):
                raise ValueError('List of estimators is empty')
            else:
                # Clone
                self.estimators_ = [(name, clone(estim))
                                    for name, estim in self.estimators]
                # Check names of estimators
                names, estims = zip(*self.estimators_)
                self._validate_names(names)
                # Check if all estimators support ``sample_weight``
                if sample_weight is not None:
                    for name, estim in self.estimators_:
                        if not has_fit_parameter(estim, 'sample_weight'):
                            raise ValueError(
                                'Underlying estimator [%s] does not '
                                'support sample weights.' % name)

        # ---------------------------------------------------------------------
        # Check other StackingTransformer parameters
        # ---------------------------------------------------------------------

        # ``variant``
        if self.variant not in ['A', 'B']:
            raise ValueError('Parameter ``variant`` must be set properly')

        # ``n_folds``
        if not isinstance(self.n_folds, int):
            raise ValueError('Parameter ``n_folds`` must be integer')
        if not self.n_folds > 1:
            raise ValueError('Parameter ``n_folds`` must be not less than 2')

        # ``verbose``
        if self.verbose not in [0, 1, 2]:
            raise ValueError('Parameter ``verbose`` must be 0, 1, or 2')

        # Additional check for inapplicable parameter combinations
        # If ``regression=True`` we ignore classification-specific
        # parameters and issue user warning
        if self.regression and (self.needs_proba or self.stratified):
            warn_str = ('This is a regression task, hence classification-specific '
                        'parameters set to ``True`` were ignored:')
            if self.needs_proba:
                self.needs_proba = False
                warn_str += ' ``needs_proba``'
            if self.stratified:
                self.stratified = False
                warn_str += ' ``stratified``'
            warnings.warn(warn_str, UserWarning)

        # ---------------------------------------------------------------------
        # Compute attributes (basic properties of data, number of estimators, etc.)
        # ---------------------------------------------------------------------
        self.train_shape_ = X.shape
        self.n_train_examples_ = X.shape[0]
        self.n_features_ = X.shape[1]
        if not self.regression:
            self.n_classes_ = len(np.unique(y))
        else:
            self.n_classes_ = None
        self.n_estimators_ = len(self.estimators_)
        self.train_footprint_ = self._get_footprint(X)

        # ---------------------------------------------------------------------
        # Specify default metric
        # ---------------------------------------------------------------------
        if self.metric is None and self.regression:
            self.metric_ = mean_absolute_error
        elif self.metric is None and not self.regression:
            if self.needs_proba:
                self.metric_ = log_loss
            else:
                self.metric_ = accuracy_score
        else:
            self.metric_ = self.metric
        # ---------------------------------------------------------------------
        # Create report header strings and print report header
        # ---------------------------------------------------------------------
        if self.verbose > 0:
            if self.regression:
                task_str = 'task:         [regression]'
            else:
                task_str = 'task:         [classification]'
                n_classes_str = 'n_classes:    [%d]' % self.n_classes_
            metric_str = 'metric:       [%s]' % self.metric_.__name__
            variant_str = 'variant:      [%s]' % self.variant
            n_estimators_str = 'n_estimators: [%d]' % self.n_estimators_

            print(task_str)
            if not self.regression:
                print(n_classes_str)
            print(metric_str)
            print(variant_str)
            print(n_estimators_str + '\n')
        # ---------------------------------------------------------------------
        # Initialize cross-validation split
        # Stratified can be used only for classification
        # ---------------------------------------------------------------------
        if not self.regression and self.stratified:
            self.kf_ = StratifiedKFold(n_splits=self.n_folds,
                                       shuffle=self.shuffle,
                                       random_state=self.random_state)
            # Save target to be able to create stratified split in ``transform`` method
            # This is more efficient than to save split indices
            self._y_ = y.copy()
        else:
            self.kf_ = KFold(n_splits=self.n_folds,
                             shuffle=self.shuffle,
                             random_state=self.random_state)
            self._y_ = None

        # ---------------------------------------------------------------------
        # Compute implicit number of classes to create appropriate empty arrays.
        # !!! Important. In order to unify array creation
        # variable ``n_classes_implicit_`` is always equal to 1, except the case
        # when we are performing a classification task with ``needs_proba=True``
        # ---------------------------------------------------------------------
        if not self.regression and self.needs_proba:
            self.n_classes_implicit_ = len(np.unique(y))
            self.action_ = 'predict_proba'
        else:
            self.n_classes_implicit_ = 1
            self.action_ = 'predict'

        # ---------------------------------------------------------------------
        # Create empty numpy array for train predictions (OOF)
        # !!! Important. We have to implicitly predict during fit
        # in order to compute CV scores, because
        # the most reasonable place to print out CV scores is the fit method
        # ---------------------------------------------------------------------
        S_train = np.zeros(
            (X.shape[0], self.n_estimators_ * self.n_classes_implicit_))

        # ---------------------------------------------------------------------
        # Prepare (clone) estimators for fitting and storing
        # We need models_A_ for both variant A and variant B
        # We need models_B_ for variant B only (in variant A attribute models_B_ is None)
        # ---------------------------------------------------------------------

        self.models_A_ = []
        self.models_B_ = None

        for n, est in self.estimators_:
            self.models_A_.append([clone(est) for _ in range(self.n_folds)])

        if self.variant in ['B']:
            self.models_B_ = [clone(est) for n, est in self.estimators_]

        # ---------------------------------------------------------------------
        # Create empty numpy array to store scores for each estimator and each fold
        # ---------------------------------------------------------------------
        self.scores_ = np.zeros((self.n_estimators_, self.n_folds))

        # ---------------------------------------------------------------------
        # Create empty list to store name, mean and std for each estimator
        # ---------------------------------------------------------------------
        self.mean_std_ = []

        # ---------------------------------------------------------------------
        # MAIN FIT PROCEDURE
        # ---------------------------------------------------------------------
        # Loop across estimators
        # ---------------------------------------------------------------------
        for estimator_counter, (name,
                                estimator) in enumerate(self.estimators_):
            if self.verbose > 0:
                estimator_str = 'estimator %2d: [%s: %s]' % (
                    estimator_counter, name, estimator.__class__.__name__)
                print(estimator_str)

            # -----------------------------------------------------------------
            # Loop across folds
            # -----------------------------------------------------------------
            for fold_counter, (tr_index,
                               te_index) in enumerate(self.kf_.split(X, y)):
                # Split data and target
                X_tr = X[tr_index]
                y_tr = y[tr_index]
                X_te = X[te_index]
                y_te = y[te_index]

                # Split sample weights accordingly (if passed)
                if sample_weight is not None:
                    sample_weight_tr = sample_weight[tr_index]
                    # sample_weight_te = sample_weight[te_index]
                else:
                    sample_weight_tr = None
                    # sample_weight_te = None

                # Fit estimator
                _ = self._estimator_action(
                    self.models_A_[estimator_counter][fold_counter],
                    X_tr,
                    y_tr,
                    None,
                    sample_weight=sample_weight_tr,
                    action='fit',
                    transform=self.transform_target)

                # Predict out-of-fold part of train set
                if 'predict_proba' == self.action_:
                    col_slice_estimator = slice(
                        estimator_counter * self.n_classes_implicit_,
                        estimator_counter * self.n_classes_implicit_ +
                        self.n_classes_implicit_)
                else:
                    col_slice_estimator = estimator_counter
                S_train[te_index,
                        col_slice_estimator] = self._estimator_action(
                            self.models_A_[estimator_counter][fold_counter],
                            None,
                            None,
                            X_te,
                            action=self.action_,
                            transform=self.transform_pred)
                # Compute score
                score = self.metric_(y_te, S_train[te_index,
                                                   col_slice_estimator])
                self.scores_[estimator_counter, fold_counter] = score

                # Print fold score
                if self.verbose > 1:
                    fold_str = '    fold %2d:  [%.8f]' % (fold_counter, score)
                    print(fold_str)

            # Compute mean and std and save in dict
            estim_name = self.estimators_[estimator_counter][0]
            estim_mean = np.mean(self.scores_[estimator_counter])
            estim_std = np.std(self.scores_[estimator_counter])
            self.mean_std_.append((estim_name, estim_mean, estim_std))

            if self.verbose > 1:
                sep_str = '    ----'
                print(sep_str)

            # Compute mean + std (and full)
            if self.verbose > 0:
                mean_str = '    MEAN:     [%.8f] + [%.8f]\n' % (estim_mean,
                                                                estim_std)
                print(mean_str)

            # Fit estimator on full train set
            if self.variant in ['B']:
                if self.verbose > 0:
                    print('    Fitting on full train set...\n')
                _ = self._estimator_action(self.models_B_[estimator_counter],
                                           X,
                                           y,
                                           None,
                                           sample_weight=sample_weight,
                                           action='fit',
                                           transform=self.transform_target)

        # ---------------------------------------------------------------------
        # ---------------------------------------------------------------------

        # Return fitted StackingTransformer instance
        return self
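
# Added usage sketch for the transformer above (hedged: it uses only the
# parameters validated in this fit method and assumes X_train/y_train/X_test
# numpy arrays exist; defaults in the released package may differ):
from sklearn.ensemble import ExtraTreesClassifier

estimators = [('et', ExtraTreesClassifier(n_estimators=100, random_state=0))]
stack = StackingTransformer(estimators=estimators, regression=False,
                            variant='A', n_folds=4, shuffle=True,
                            random_state=0, verbose=1)
stack = stack.fit(X_train, y_train)
S_train = stack.transform(X_train)  # training set recognized via footprint -> OOF predictions
S_test = stack.transform(X_test)    # predictions from the per-fold models
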
def classify(data, descriptors, descriptors_to_labels, print_predictions, \
    print_results):
    # number of data points
    num_data = data.shape[0]

    # randomly shuffle the data vectors
    np.random.shuffle(data)

    # extract the vectors, labels, and indices
    vectors = data[:, :-2]
    labels = data[:, -2]
    indices = data[:, -1]

    # number of vectors to use as training data
    num_training = int(TRAINING_SET_PROP * num_data)
    num_test = num_data - num_training

    # extract first 'num_training' training vectors and labels
    training_vectors = vectors[:num_training, :]
    training_labels = labels[:num_training]
    training_indices = indices[:num_training]

    # extract remaining test vectors and labels
    test_vectors = vectors[num_training:, :]
    test_labels = labels[num_training:]
    test_indices = indices[num_training:]

    # set up logistic regression classifier and fit to data
    log_reg_clf = LogisticRegression(solver='newton-cg', max_iter=50,
                                     random_state=0, multi_class='multinomial',
                                     verbose=0).fit(training_vectors,
                                                    training_labels)

    # get classifier's predictions and decision function for each vector
    predictions = log_reg_clf.predict(test_vectors)
    confidences = log_reg_clf.decision_function(test_vectors)

    # to record number of correctly classified instances
    num_correct_inst = 0

    # to hold mapping from descriptor to list of predictions for that
    # descriptor
    descriptor_classifications = {}

    # go through all test data indices
    for i in range(num_test):

        # actual label number and string
        actual = test_labels[i]
        actual_str = [k for k, v in label_nums.items() if v == actual][0]

        # predicted label number and string
        prediction = predictions[i]
        prediction_str = [k for k,v in label_nums.items() if \
            v == prediction][0]

        # prediction confidence for prediction
        confidence = confidences[i][int(prediction)]

        if prediction != actual:
            if print_predictions:
                print("***Incorrect Prediction (" + str(confidence) + "):" +
                      descriptors[i])
                print("\t Actual - " + actual_str)
                print("\t Predicted - " + prediction_str)
        else:
            if print_predictions:
                print("---Correct Prediction: (" + str(confidence) + "):" +
                      descriptors[i])
                print("\t Actual - " + actual_str)
                print("\t Predicted - " + prediction_str)

            num_correct_inst += 1

        # get corresponding descriptor for this point
        descriptor = descriptors[int(test_indices[i])]

        # there is not an entry in the dictionary yet for this descriptor
        if descriptor not in descriptor_classifications:
            descriptor_classifications[descriptor] = []

        # add this prediction to the list of predictions for this descriptor,
        # along with the confidence
        descriptor_classifications[descriptor].append((predictions[i],\
            float(confidences[i][int(predictions[i])])))

    # to record number of correctly classified descriptors
    num_correct_desc = 0

    # go through all descriptors
    for descriptor in descriptor_classifications:
        # extract list of (classification,confidence) pairs for this descriptor
        classifications = descriptor_classifications[descriptor]

        # to store the sum of the confidences for each label
        confidences_per_label = {}

        # go through all labels
        for label in label_nums:
            # initialize the confidence for this label as 0
            num = label_nums[label]
            confidences_per_label[num] = 0

        # set confidences by summing confidences of classifications
        for classification in classifications:
            confidences_per_label[classification[0]] += classification[1]

        # take a confidence vote
        descriptor_label_prediction = max(confidences_per_label,\
            key=confidences_per_label.get)
        confidence = max(confidences_per_label.values())

        # extract actual and predicted descriptor labels
        actual_str = [k for k,v in label_nums.items() if v == \
            descriptors_to_labels[descriptor]][0]
        prediction_str = [k for k,v in label_nums.items() if v == \
            descriptor_label_prediction][0]

        if (descriptor_label_prediction != descriptors_to_labels[descriptor]):
            if print_predictions:
                print("*Incorrect Prediction (" + str(confidence) + "):" + \
                    descriptor)
                print("\t Actual - " + actual_str)
                print("\t Predicted - " + prediction_str)
        else:
            if print_predictions:
                print("-Correct Prediction (" + str(confidence) + "):" + \
                    descriptor)
                print("\t Actual - " + actual_str)
                print("\t Predicted - " + prediction_str)
            num_correct_desc += 1

    log_reg_result = float(num_correct_inst) / float(num_data - num_training)

    desc_classify_result = float(num_correct_desc) / \
            float(len(descriptor_classifications))

    dummy_clf = DummyClassifier(strategy='stratified')
    dummy_clf.fit(training_vectors, training_labels)
    dummy_result = dummy_clf.score(test_vectors, test_labels)

    ridge_clf = RidgeClassifier(solver='auto')
    ridge_clf.fit(training_vectors, training_labels)
    ridge_result = ridge_clf.score(test_vectors, test_labels)

    if print_results:
        print('Ridge result: ' + str(ridge_result))
        print("Logistic Regression result: " + str(log_reg_result))
        print("Descriptor classification result: " + str(desc_classify_result))
        print('Random result: ' + str(dummy_result))

    # return relevant scores
    ret_dict = {
        "ridge": ridge_result,
        "log_reg": log_reg_result,
        "desc_classify": desc_classify_result,
        "dummy": dummy_result,
    }
    return ret_dict
Example n. 59
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %% [markdown] tags=["solution"]
# Using an arbitrary mapping from string labels to integers as done here causes
# the linear model to make bad assumptions on the relative ordering of
# categories.
#
# This prevents the model from learning anything predictive enough and the
# cross-validated score is even lower than the baseline we obtained by ignoring
# the input data and just constantly predicting the most frequent class:

# %% tags=["solution"]
from sklearn.dummy import DummyClassifier

cv_results = cross_validate(DummyClassifier(strategy="most_frequent"),
                            data_categorical, target)
scores = cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

# %% [markdown]
# Now, we would like to compare the generalization performance of our previous
# model with a new model where instead of using an `OrdinalEncoder`, we will
# use a `OneHotEncoder`. Repeat the model evaluation using cross-validation.
# Compare the score of both models and conclude on the impact of choosing a
# specific encoding strategy when using a linear model.

# %%
from sklearn.preprocessing import OneHotEncoder
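
# %%
# A sketch of one possible completion (added; not the official solution; it
# assumes ``cross_validate``, ``data_categorical`` and ``target`` from the
# cells above):
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(OneHotEncoder(handle_unknown="ignore"),
                      LogisticRegression(max_iter=500))
cv_results = cross_validate(model, data_categorical, target)
scores = cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")
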
                         final_decision='only',
                         allow_empty='False',
                         pretrained_weights='scibert_scivocab_uncased',
                         remove_duplicates=True,
                         remove_stopwords=False)

embeddings_input = data_loader.read_embeddigns_from_file()
number_of_reviews = torch.tensor(
    [reviews.shape[0] for reviews in embeddings_input])
embeddings_input = rnn.pad_sequence(
    embeddings_input,
    batch_first=True).numpy()  # pad the reviews to form a tensor
print(embeddings_input.shape)
labels = data_loader.read_labels().numpy()

majority_clf = DummyClassifier(strategy='most_frequent')

preds = cross_val_predict(majority_clf, embeddings_input, labels, cv=5)

print('5-CV Majority Classifier:\n',
      classification_report(labels, preds, output_dict=True))

valid_size = 0.1

num_train = embeddings_input.shape[0]
indices = list(range(num_train))
split = int(np.floor(valid_size * num_train))

train_idx, test_idx = indices[split:], indices[:split]

test_embeddings_input = embeddings_input[test_idx, :, :]