def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    import numpy as np
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import accuracy_score
    dummy = DummyClassifier()
    dummy.fit(train_embeds, train_labels)
    log = SGDClassifier(loss="log", n_jobs=55)
    log.fit(train_embeds, train_labels)
    print("Test scores")
    print(accuracy_score(test_labels, log.predict(test_embeds)))
    print("Train scores")
    print(accuracy_score(train_labels, log.predict(train_embeds)))
    print("Random baseline")
    print(accuracy_score(test_labels, dummy.predict(test_embeds)))
def get_scores(X, y):
    nfolds = 40
    cv = StratifiedShuffleSplit(y, n_iter=nfolds, test_size=.05)
    dumb = DummyClassifier(strategy='most_frequent')
    clf = svm.SVC(class_weight='auto')
    param_dist = {"C": [.1, 1, 10],
                  "kernel": ['rbf', 'linear', 'poly']
                  }
    search = GridSearchCV(clf, param_grid=param_dist,
                          scoring='mean_absolute_error')
    stest, strain, sdummy = [], [], []
    for nfeats in range(X.shape[1]):
        test_scores, train_scores, dummy_scores = [], [], []
        # figure out our possible feature combinations
        feats = itertools.combinations(range(X.shape[1]), nfeats + 1)
        for my_feats in feats:
            for oidx, (train, test) in enumerate(cv):
                idx = np.array(my_feats)
                y_train, y_test = y[train], y[test]
                X_train, X_test = X[train, :], X[test, :]

                search.fit(X_train, y_train)
                clf = search.best_estimator_
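                # Note: hyperparameters are tuned on the full feature matrix here; the chosen
                # estimator is then refit below on the current feature subset only.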

                clf.fit(X_train[:, idx], y_train)
                train_scores.append(accuracy_score(clf.predict(X_train[:, idx]), y_train))
                test_scores.append(accuracy_score(clf.predict(X_test[:, idx]), y_test))
                dumb.fit(X_train[:, idx], y_train)
                dummy_scores.append(accuracy_score(dumb.predict(X_test[:, idx]), y_test))
        sdummy.append(np.mean(dummy_scores))
        strain.append(np.mean(train_scores))
        stest.append(np.mean(test_scores))
    return stest, strain, sdummy
Example #3
def do_cross_validation(labels):
    """Perform the k-fold cross validation.

    Perform the k-fold cross validation, collect the result and return the
    single test instance predictions, as well as the classification results for
    each single fold and for the combination of all folds.

    Keyword arguments:
    features -- all features
    labels -- all labels
    """
    skf = StratifiedKFold(labels, NO_OF_FOLDS)
    single_predictions = []  # Store each single classification decision

    # Store classification results for each fold and for the entire task (i.e.,
    # entire cross validation).
    classification_result = np.zeros((NO_OF_FOLDS + 1, 5))

    for cur_fold, (train_idx, test_idx) in enumerate(skf):
        model = DummyClassifier(strategy='most_frequent')
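        # The most_frequent strategy looks only at the training labels, so this snippet
        # passes None in place of a feature matrix when fitting the baseline.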
        model.fit(None, labels[train_idx])
        pred_labels = model.predict(np.zeros(labels[test_idx].shape[0]))

        fold_array = np.empty(test_idx.shape[0])
        fold_array.fill(cur_fold)
        single_predictions.append(np.transpose(np.vstack((fold_array, test_idx,
                labels[test_idx], pred_labels))))
        classification_result[cur_fold, :] = get_classification_result(cur_fold,
                labels[test_idx], pred_labels)

    single_predictions = np.vstack(single_predictions)
    return single_predictions, classification_result
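
# Hypothetical usage sketch (not from the original project): NO_OF_FOLDS and a stand-in
# get_classification_result are defined below only to make the call self-contained, and the
# call relies on the pre-0.18 scikit-learn API (StratifiedKFold(y, n_folds)) used above.
NO_OF_FOLDS = 2

def get_classification_result(fold, y_true, y_pred):
    # stand-in scorer: accuracy plus four zero placeholders to fill the 5-column result row
    return np.array([np.mean(y_true == y_pred), 0.0, 0.0, 0.0, 0.0])

labels = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
predictions, results = do_cross_validation(labels)
print(predictions.shape, results.shape)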
def get_scores(X, y):
    nfolds = 200
    cv = StratifiedShuffleSplit(y, n_iter=nfolds, test_size=0.2)
    dumb = DummyClassifier(strategy="most_frequent")
    clf = svm.SVC(class_weight="auto")
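    # Note: the SVC above is immediately replaced by LogisticRegression, and the kernel grid
    # below is replaced by the C-only grid; the search therefore tunes LogisticRegression's C.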
    clf = linear_model.LogisticRegression()
    param_dist = {"C": [0.1, 1, 10], "kernel": ["rbf", "linear", "poly"]}
    param_dist = {"C": [1e6, 1e5, 1e4, 1e3, 1e2, 10, 1, 0.1, 0.01, 0.001]}
    search = GridSearchCV(clf, param_grid=param_dist, scoring="mean_absolute_error")
    test_scores, train_scores, dummy_scores = [], [], []
    preds, true_labels = [], []
    for oidx, (train, test) in enumerate(cv):
        y_train, y_test = y[train], y[test]
        X_train, X_test = X[train, :], X[test, :]

        search.fit(X_train, y_train)
        clf = search.best_estimator_
        print search.best_params_

        clf.fit(X_train, y_train)
        train_scores.append(accuracy_score(clf.predict(X_train), y_train))
        test_scores.append(accuracy_score(clf.predict(X_test), y_test))
        dumb.fit(X_train, y_train)
        dummy_scores.append(accuracy_score(dumb.predict(X_test), y_test))
        preds += list(clf.predict(X_test))
        true_labels += list(y_test)
    return test_scores, train_scores, dummy_scores, preds, true_labels
Example #5
def run_ML_leave_one_subject_out(config, filename, question, clf, cols, return_arr=None, return_index=-1):
    working_directory = config['DATA_DIRECTORY']
    data_X, data_y = load_data(working_directory, filename, cols, question)
    data = leave_one_subject_out(data_X, data_y, 'User')
    score = 0
    score_dummy_mf = 0
    score_dummy_sf = 0
    dummy_clf_mf = DummyClassifier('most_frequent')
    dummy_clf_sf = DummyClassifier('stratified')
    for (training_X, training_y), (testing_X, testing_y) in data:
        clf.fit(training_X, training_y)
        dummy_clf_mf.fit(training_X, training_y)
        dummy_clf_sf.fit(training_X, training_y)

        single_score = clf.score(testing_X, testing_y)
        single_score_dummy_mf = dummy_clf_mf.score(testing_X, testing_y)
        single_score_dummy_sf = dummy_clf_sf.score(testing_X, testing_y)
        #print 'Single run score: ' + ("%0.2f" % single_score.mean())
        #print 'Single run score (dummy most frequent): ' + ("%0.2f" % single_score_dummy_mf.mean())
        #print 'Single run score (dummy stratified): ' + ("%0.2f" % single_score_dummy_sf.mean())

        score = score + single_score.mean()
        score_dummy_mf = score_dummy_mf + single_score_dummy_mf.mean()
        score_dummy_sf = score_dummy_sf + single_score_dummy_sf.mean()
    score = round(float(score / len(data)), 2)
    score_dummy_mf = round(float(score_dummy_mf / len(data)), 2)
    score_dummy_sf = round(float(score_dummy_sf / len(data)), 2)
    #print 'Total score: ' + str(score)
    #print 'Total score (dummy most frequent): ' + str(score_dummy_mf)
    #print 'Total score (dummy stratified): ' + str(score_dummy_sf)
    if return_index == -1:
        return score, score_dummy_mf, score_dummy_sf
    else:
        return_arr[return_index] = (score, score_dummy_mf, score_dummy_sf)
def _run_dummy_detection(x_train, x_test, y_train, y_test):
    clf = DummyClassifier(strategy='most_frequent')

    print "Training Dummy..."
    clf.fit(x_train, y_train)
    print "Predicting Test Set..."
    print "Score for test set: {}".format(clf.score(x_test, y_test))
Example #7
def test_dummy_classifier_on_nan_value():
    X = [[np.NaN]]
    y = [1]
    y_expected = [1]
    clf = DummyClassifier()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
Example #8
def test_most_frequent_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]

    clf = DummyClassifier(strategy="most_frequent", random_state=0)
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.ones(len(X)))
    _check_predict_proba(clf, X, y)
def test_constant_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]])

    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]))
    _check_predict_proba(clf, X, y)
Example #10
def test_dummy_classifier_on_3D_array():
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = [2, 2, 2]
    y_expected = [2, 2, 2]
    y_proba_expected = [[1], [1], [1]]
    cls = DummyClassifier()
    cls.fit(X, y)
    y_pred = cls.predict(X)
    y_pred_proba = cls.predict_proba(X)
    assert_array_equal(y_pred, y_expected)
    assert_array_equal(y_pred_proba, y_proba_expected)
def test_constant_strategy_sparse_target():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]]))

    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_true(sp.issparse(y_pred))
    assert_array_equal(y_pred.toarray(), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]))
Example #12
def test_stratified_strategy():
    X = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 1000
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 3. / 5, decimal=1)
    assert_almost_equal(p[2], 2. / 5, decimal=1)
    _check_predict_proba(clf, X, y)
Example #13
def test_uniform_strategy():
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 0.5, decimal=1)
    assert_almost_equal(p[2], 0.5, decimal=1)
    _check_predict_proba(clf, X, y)
def test_most_frequent_and_prior_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])

    n_samples = len(X)

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]))
        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)
Example #15
def test_classifier_prediction_independent_of_X(strategy):
    y = [0, 2, 1, 1]
    X1 = [[0]] * 4
    clf1 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    clf1.fit(X1, y)
    predictions1 = clf1.predict(X1)

    X2 = [[1]] * 4
    clf2 = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    clf2.fit(X2, y)
    predictions2 = clf2.predict(X2)

    assert_array_equal(predictions1, predictions2)
def test_most_frequent_and_prior_strategy_sparse_target():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]]))

    n_samples = len(X)
    y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)

        y_pred = clf.predict(X)
        assert_true(sp.issparse(y_pred))
        assert_array_equal(y_pred.toarray(), y_expected)
def main(training_set, language, gold_standard, gazetteer):
    """ Searches for the best hyperparameters """

    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    logger.info('Building training set')
    extractor = FactExtractorFeatureExtractor(language)
    for row in training_set:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'],
                                   add_unknown=True, gazetteer=gazetteer)

    logger.info('Finalizing training set')
    x, y = extractor.get_features()

    logger.info('Searching for the best model parameters')
    svc = LinearSVC()
    search = GridSearchCV(
        svc,
        param_grid=[{
            'C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
            'multi_class': ['ovr', 'crammer_singer'],
        }],
        scoring='f1_weighted',
        cv=10)
    search.fit(x, y)

    logger.info('The best model (weighted-averaged F1 of %.4f) has parameters %s',
                search.best_score_, search.best_params_)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')
    for row in gold_standard:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['fes'])
    x_gold, y_gold = extractor.get_features()

    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x, y)

    y_dummy = dummy.predict(x_gold)
    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                metrics.f1_score(y_gold, y_dummy, average='weighted'))

    y_best = search.predict(x_gold)
    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                metrics.f1_score(y_gold, y_best, average='weighted'))
def test_most_frequent_and_prior_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), np.ones(len(X)))
        _check_predict_proba(clf, X, y)

        if strategy == "prior":
            assert_array_equal(clf.predict_proba(X[0]), clf.class_prior_.reshape((1, -1)))
        else:
            assert_array_equal(clf.predict_proba(X[0]), clf.class_prior_.reshape((1, -1)) > 0.5)
Example #19
def test_most_frequent_and_prior_strategy_with_2d_column_y():
    # non-regression test added in
    # https://github.com/scikit-learn/scikit-learn/pull/13545
    X = [[0], [0], [0], [0]]
    y_1d = [1, 2, 1, 1]
    y_2d = [[1], [2], [1], [1]]

    for strategy in ("most_frequent", "prior"):
        clf_1d = DummyClassifier(strategy=strategy, random_state=0)
        clf_2d = DummyClassifier(strategy=strategy, random_state=0)

        clf_1d.fit(X, y_1d)
        clf_2d.fit(X, y_2d)
        assert_array_equal(clf_1d.predict(X), clf_2d.predict(X))
def find_best_dummy_classification(X, y, test_size=0.3, random_state=0, thresh=0.5, target_names=None, n=1):
    """Try all dummy models."""
    X = X.reshape((len(X), -1))
    # y = y.reshape((len(y) ,-1))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    dummy_scores = []
    for i in range(n):
        for strategy in ['most_frequent', 'uniform', 'prior', 'stratified']:
            clf = DummyClassifier(strategy=strategy)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            score = clf.score(X_test, y_test)

            matthews_corrcoef = sklearn.metrics.matthews_corrcoef(y_test > thresh, y_pred > thresh)

            report = parse_classification_report(sklearn.metrics.classification_report(y_test > thresh, y_pred > thresh, target_names=target_names))

            dummy_scores.append(
                collections.OrderedDict(
                    strategy='classifier_' + strategy,
                    matthews_corrcoef=matthews_corrcoef,
                    score=score,
                    report=report
                )
            )

        for strategy in ['mean', 'median']:
            clf = DummyRegressor(strategy=strategy)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            score = clf.score(X_test, y_test)

            matthews_corrcoef = sklearn.metrics.matthews_corrcoef(y_test > thresh, y_pred > thresh)

            report = parse_classification_report(sklearn.metrics.classification_report(y_test > thresh, y_pred > thresh, target_names=target_names))

            dummy_scores.append(
                collections.OrderedDict(
                    strategy='regressor_' + strategy,
                    matthews_corrcoef=matthews_corrcoef,
                    score=score,
                    report=report
                )
            )

    df = pd.DataFrame(dummy_scores)
    df = df.sort_values('matthews_corrcoef', ascending=False)
    return df, df[:1].iloc[0].to_dict()
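
# Hypothetical usage sketch (assumes the parse_classification_report helper used above is
# importable, along with the numpy/pandas/sklearn imports these snippets rely on): random
# features and binary labels are enough to exercise every dummy strategy.
X_demo = np.random.rand(200, 6)
y_demo = np.random.randint(0, 2, size=200)
scores_df, best = find_best_dummy_classification(X_demo, y_demo, test_size=0.3, n=3)
print(best['strategy'], best['matthews_corrcoef'])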
Example #21
def test_constant_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [2, 1, 2, 2]

    clf = DummyClassifier(strategy="constant", random_state=0, constant=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.ones(len(X)))
    _check_predict_proba(clf, X, y)

    X = [[0], [0], [0], [0]]  # ignored
    y = ['two', 'one', 'two', 'two']
    clf = DummyClassifier(strategy="constant", random_state=0, constant='one')
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.array(['one'] * 4))
    _check_predict_proba(clf, X, y)
Example #22
def main(training_set, language, gold_standard, gazetteer, n_folds, n_jobs,
         scoring, output, test, word2vec_model, independent_lus):
    """ Searches for the best hyperparameters """

    logger.info('Searching for the best model and parameters')

    training_sets = get_training_sets(training_set, language, gazetteer, word2vec_model, independent_lus)
    models = get_models(test)

    search = MultimodelGridSearchCV(*models, cv=n_folds, n_jobs=n_jobs,
                                    scoring=Scorer(scoring, True))
    (x_tr, y_tr, best_training_meta), best_score, best_params, best_model = search.fit(training_sets)

    logger.info('Evaluation Results')
    logger.info('  Best model: %s', best_model.__class__.__name__)
    logger.info('  Score: %f', best_score)
    logger.info('  Parameters: %s', best_params)
    logger.info('  Gazetteer: %s', best_training_meta['gazetteer'])
    logger.info('  Extractor: %s', best_training_meta['extractor_cls'].__name__)
    logger.info('  Extractor args: %s', best_training_meta['extractor_args'])

    joblib.dump((best_model, best_training_meta), output)
    logger.info("Done, dumped model to '%s'", output)

    if not gold_standard:
        logger.info('Skipping gold standard evaluation')
        return

    logger.info('Evaluating on the gold standard')

    extractor = best_training_meta['extractor']
    gazetteer = best_training_meta['gazetteer']

    extractor.start()
    for row in gold_standard:
        data = json.loads(row)
        extractor.process_sentence(data['sentence'], data['lu'], data['fes'],
                                   add_unknown=False, gazetteer=gazetteer)
    x_gold, y_gold = extractor.get_features(refit=False)

    dummy = DummyClassifier(strategy='stratified')
    dummy.fit(x_tr, y_tr)

    logger.info('Dummy model has a weighted-averaged F1 on the gold standard of %.4f',
                Scorer(scoring, True)(dummy, x_gold, y_gold))

    logger.info('Best model has a weighted-averaged F1 on the gold standard of %.4f',
                Scorer(scoring, True)(best_model, x_gold, y_gold))
Example #23
def test_dtype_of_classifier_probas(strategy):
    y = [0, 2, 1, 1]
    X = np.zeros(4)
    model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    probas = model.fit(X, y).predict_proba(X)

    assert probas.dtype == np.float64
def test_uniform_strategy_multioutput():
    X = [[0]] * 4  # ignored
    y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 0.5, decimal=1)
        assert_almost_equal(p[2], 0.5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)
def test_stratified_strategy_sparse_target():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]]))

    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    assert_true(sp.issparse(y_pred))
    y_pred = y_pred.toarray()

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 3.0 / 5, decimal=1)
        assert_almost_equal(p[0], 1.0 / 5, decimal=1)
        assert_almost_equal(p[4], 1.0 / 5, decimal=1)
Example #26
    def compare_dummy_classification(self):
        """ Compares classifier to dummy classifiers. Return results (resultscores_tuple, N.A., N.A.)"""
        X_train = self.train_vectors
        y_train = self.train_tweetclasses
        X_test = self.test_vectors
        y_test = self.test_tweetclasses

        dummy_results = []

        dummy = DummyClassifier(strategy="most_frequent", random_state=0)
        dummy.fit(X_train, y_train)
        y_true, y_preddum = y_test, dummy.predict(X_test)
        tuples = precision_recall_fscore_support(y_true, y_preddum)

        dummy1 = DummyClassifier(strategy="stratified", random_state=0)
        dummy1.fit(X_train, y_train)
        y_true, y_preddum1 = y_test, dummy1.predict(X_test)
        tuples1 = precision_recall_fscore_support(y_true, y_preddum1)

        dummy2 = DummyClassifier(strategy="uniform", random_state=0)
        dummy2.fit(X_train, y_train)
        y_true, y_preddum2 = y_test, dummy2.predict(X_test)
        tuples2 = precision_recall_fscore_support(y_true, y_preddum2)

        resulttuple = ("dummy freq", "N.A.", "N.A.", "N.A.", "N.A.", tuples)
        resulttuple1 = ("dummy strat", "N.A.", "N.A.", "N.A.", "N.A.", tuples1)
        resulttuple2 = ("dummy uni", "N.A.", "N.A.", "N.A.", "N.A.", tuples2)

        dummy_results.append(resulttuple)
        dummy_results.append(resulttuple1)
        dummy_results.append(resulttuple2)

        return dummy_results
Example #27
	def compare_dummy(self):
		""" Compares classifier to dummy classifiers"""
		#print "\nDetailed classification report:\n"
		#print "The model is trained on the full development set.\n"
		#print "The scores are computed on the full evaluation set.\n"

		X_train = self.train_vectors
		y_train = self.train_tweetclasses
		X_test = self.test_vectors
		y_test = self.test_tweetclasses

		dummy = DummyClassifier(strategy='most_frequent',random_state=0)
		dummy.fit(X_train, y_train)
		y_true, y_preddum = y_test, dummy.predict(X_test)
		tuples = precision_recall_fscore_support(y_true, y_preddum)

		dummy1 = DummyClassifier(strategy='stratified',random_state=0)
		dummy1.fit(X_train, y_train)
		y_true, y_preddum1 = y_test, dummy1.predict(X_test)
		tuples1 = precision_recall_fscore_support(y_true, y_preddum1)

		dummy2 = DummyClassifier(strategy='uniform',random_state=0)
		dummy2.fit(X_train, y_train)
		y_true, y_preddum2 = y_test, dummy2.predict(X_test)
		tuples2 = precision_recall_fscore_support(y_true, y_preddum2)

		return (tuples, tuples1,tuples2)
def eval_against_dumm(FS, aut_target, myclf, folder):
	real_acc = []
	dummy1_acc, dummy2_acc, dummy3_acc = [], [], []
	clf = copy.deepcopy(myclf)
	for train_index, test_index in folder:
		clf.fit(FS[train_index, :],aut_target[train_index])
		labels = np.asarray(clf.predict(FS[test_index, :]))
		acc = np.mean(aut_target[test_index] == labels)
		real_acc.append(acc)

		clf = DummyClassifier("stratified")
		clf.fit(FS[train_index, :], aut_target[train_index])
		labels = np.asarray(clf.predict(FS[test_index, :]))
		acc = np.mean(aut_target[test_index] == labels)
		dummy1_acc.append(acc)

		clf = DummyClassifier("most_frequent")
		clf.fit(FS[train_index, :], aut_target[train_index])
		labels = np.asarray(clf.predict(FS[test_index, :]))
		acc = np.mean(aut_target[test_index] == labels)
		dummy2_acc.append(acc)

		clf = DummyClassifier("uniform")
		clf.fit(FS[train_index, :], aut_target[train_index])
		labels = np.asarray(clf.predict(FS[test_index, :]))
		acc = np.mean(aut_target[test_index] == labels)
		dummy3_acc.append(acc)

	return np.mean(real_acc), np.mean(dummy1_acc), np.mean(dummy2_acc),\
		np.mean(dummy3_acc)
Example #29
def kfolds_evaluation(folds, model, scoring, skip_majority, x, y):
    kf = KFold(x.shape[0], folds, shuffle=True)
    scorer = Scorer(scoring, skip_majority)

    scores_dummy, scores_test, scores_train = [], [], []
    for train_index, test_index in kf:
        x_train, y_train = x[train_index], y[train_index]
        x_test, y_test = x[test_index], y[test_index]

        model.fit(x_train, y_train)
        dummy = DummyClassifier()
        dummy.fit(x_train, y_train)

        scores_test.append(scorer(model, x_test, y_test))
        scores_dummy.append(scorer(dummy, x_test, y_test))
        scores_train.append(scorer(model, x_train, y_train))

    logger.info("%d-folds cross evaluation results", folds)
    logger.info("    minimum test %f  dummy %f  training %f", min(scores_test), min(scores_dummy), min(scores_train))

    logger.info("    maximum test %f  dummy %f  training %f", max(scores_test), max(scores_dummy), max(scores_train))
    logger.info(
        "    average test %f  dummy %f  training %f",
        np.average(scores_test),
        np.average(scores_dummy),
        np.average(scores_train),
    )
    logger.info(
        "    median  test %f  dummy %f  training %f",
        np.median(scores_test),
        np.median(scores_dummy),
        np.median(scores_train),
    )

    logger.debug("full test scores: %s", scores_test)
    logger.debug("full dummy scores: %s", scores_dummy)
    logger.debug("full train scores: %s", scores_train)
Example #30
def get_xs_ys_predictions(embeddings_dict, classifier):
    """
    Run a classifier of type 'classifier' (one of: majority vote baseline,
    stratified sampling baseline, 10-NN classifier).

    Return:
        - xs: the word embeddings
        - ys: the gold standard labels
        - y_pred: the predicted labels
    """
    assert classifier in ['majority_vote', 'stratified', '10-NN']

    pos_ints = {'v': 0, 'n': 1, 'adj': 2, 'fn': 3}

    ys = []
    xs = []

    words = sorted(embeddings_dict.keys())
    for w in words:
        xs.append(embeddings_dict[w])
        # get the embedding's POS tag and look up the tag's unique integer
        label = pos_ints[get_pos_tag(w)]
        ys.append(label)

    clf = None
    if classifier == 'majority_vote':
        clf = DummyClassifier(strategy='most_frequent', random_state=0)
    elif classifier == 'stratified':
        clf = DummyClassifier(strategy='stratified', random_state=0)
    elif classifier == '10-NN':
        clf = KNeighborsClassifier(n_neighbors=10, algorithm='ball_tree')

    clf.fit(xs, ys)
    y_pred = clf.predict(xs)

    return xs, ys, y_pred
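
# Hypothetical usage sketch (assumes a get_pos_tag helper, as referenced above, that maps a
# word to one of 'v', 'n', 'adj', 'fn'):
demo_embeddings = {'run': [0.1, 0.2], 'dog': [0.3, 0.1], 'blue': [0.0, 0.5]}
xs, ys, y_pred = get_xs_ys_predictions(demo_embeddings, 'majority_vote')
print('baseline accuracy:', np.mean(np.array(ys) == y_pred))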
Example #31
# Train and test data split from training data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)

from sklearn.dummy import DummyClassifier

# create model
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

# train model
model_dummy.fit(X_train, y_train)

# print(f'Accuracy for baseline model : {model_dummy.score(X_test, y_test)}')

# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# print(f'Accuracy for baseline model : {accuracy_score(y_test, model_dummy.predict(X_test) )}')
# print(f'Accuracy for baseline model : {confusion_matrix(y_test, model_dummy.predict(X_test) )}')
# print(f'Accuracy for baseline model : {precision_score(y_test, model_dummy.predict(X_test) )}')
# print(f'Accuracy for baseline model : {recall_score(y_test, model_dummy.predict(X_test) )}')

# import logistic regression from sklearn
from sklearn.linear_model import LogisticRegression

# create model
    def get_dummy_classifier(self):
        """ Return a dummy classifier object. """
        clf = DummyClassifier()
        return clf.fit(self.x_train, self.y_train)
Example #33
    y_temp = y_temp.to_numpy()
    y = []
    for i in range(len(y_temp)):
        if np.array_equal(y_temp[i], np.array([0, 0])):
            y.append(0)
        elif np.array_equal(y_temp[i], np.array([1, 0])):
            y.append(1)
        elif np.array_equal(y_temp[i], np.array([0, 1])):
            y.append(2)
        elif np.array_equal(y_temp[i], np.array([1, 1])):
            y.append(3)

    y = np.array(y)

    model = DummyClassifier(strategy="most_frequent")
    model.fit(X, y)
    y_pred = model.predict(X)
    accuracy1 = accuracy_score(y, y_pred)
    print('Base Accuracy', accuracy1, file=f)

    model = DummyClassifier(strategy="stratified")
    model.fit(X, y)
    y_pred = model.predict(X)
    accuracy2 = accuracy_score(y, y_pred)
    print('Stratified Class Base Accuracy', accuracy2, file=f)

    conf_matrix_list_of_arrays = []
    scores = []
    for i in range(10):
        for fold_ind, (train_index, test_index) in enumerate(
                stratified_group_k_fold(X, y, ids, k=8)):
Example #34
def main():
    data = util.load_data(filenameX, filenamey, header=1)

    X, y = data.X, data.y
    print data.Xnames
    set_data_weights(y, data)

    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=None)

    metric_list = [
        "accuracy", "f1_score", "auroc", "precision", "sensitivity",
        "specificity"
    ]

    max_f1_linear = 0

    # for train, test in kf.split(X, y):
    #     X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    #     train_weights, test_weights = data.weights[train], data.weights[test]

    #     print select_param_rbf(X_train, y_train, kf, metric="f1_score")

    #     max_f1_linear = max(max_f1_linear, select_param_linear(X, y, kf, metric="f1_score"))

    print 'max_f1_linear', max_f1_linear

    C, gamma = (10.0, 0.1)

    X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
        X, y, data.weights, test_size=0.2, stratify=y)

    dumclf = DummyClassifier(strategy="most_frequent")
    dumclf.fit(X_train, y_train, sample_weight=weight_train)

    rbfclf = SVC(C=C, gamma=gamma, class_weight="balanced")
    rbfclf.fit(X_train, y_train)
    y_pred = rbfclf.predict(X_test)

    # compute classifier performance
    for metric in metric_list:
        print metric + ":", performance(y_test, y_pred, metric)

    svc_train_score = rbfclf.score(X_train, y_train)
    svc_test_score = rbfclf.score(X_test, y_test)
    dummy_train_score = dumclf.score(X_train, y_train, weight_train)
    dummy_test_score = dumclf.score(X_test, y_test, weight_test)

    print metrics.confusion_matrix(y_test,
                                   y_pred,
                                   labels=[1, 0],
                                   sample_weight=weight_test)

    print "RBFSVC train accuracy: %.6f" % (svc_train_score)
    print "Dummy train accuracy: %.6f" % (dummy_train_score)
    print "RBFSVC test accuracy: %.6f" % (svc_test_score)
    print "Dummy test accuracy: %.6f" % (dummy_test_score)

    max_f1_linear = 10.0
    linclf = SVC(C=max_f1_linear, kernel="linear")

    linclf.fit(X_train, y_train)

    print "Linear SVC train accuracy: %.6f" % (linclf.score(X_train, y_train))
    print "Linear SVC test accuracy: %.6f" % (linclf.score(X_test, y_test))

    y_pred = linclf.predict(X_test)

    print "Linear SVC test F1 score: %.6f" % (performance(
        y_test, y_pred, metric="f1_score"))

    print "Top ten features (probably) in order from largest to smallest:"
    indices = linclf.coef_[0].argsort()[-10:][::-1]
    print[data.Xnames[i] for i in indices]

    print "RBF F1/accuracy score with each of the top ten features removed."
    for i in indices:
        X_train_mod = np.delete(X_train, i, 1)
        X_test_mod = np.delete(X_test, i, 1)
        rbfclf.fit(X_train_mod, y_train)
        y_train_pred = rbfclf.predict(X_train_mod)
        y_test_pred = rbfclf.predict(X_test_mod)

        print "%s:" % (data.Xnames[i])
        print "\tAccuracy: "
        print "\t\tTrain: %.6f" % (rbfclf.score(X_train_mod, y_train))
        print "\t\tTest: %.6f" % (rbfclf.score(X_test_mod, y_test))
        print "\tF1 Score:"
        print "\t\tTrain: %.6f" % (performance(
            y_train, y_train_pred, metric="f1_score"))
        print "\t\tTest: %.6f" % (performance(
            y_test, y_test_pred, metric="f1_score"))

    print "RBF Least predictive ten features (probably) in order from smallest to largest:"
    indices = linclf.coef_[0].argsort()[:10]
    print[data.Xnames[i] for i in indices]

    print "F1/accuracy score with each of the bottom ten features removed cumulatively"
    for i in indices:
        X_train_mod = np.delete(X_train, indices[:i + 1], 1)
        X_test_mod = np.delete(X_test, indices[:i + 1], 1)
        rbfclf.fit(X_train_mod, y_train)
        y_train_pred = rbfclf.predict(X_train_mod)
        y_test_pred = rbfclf.predict(X_test_mod)

        print "%s:" % (data.Xnames[i])
        print "\tAccuracy: "
        print "\t\tTrain: %.6f" % (rbfclf.score(X_train_mod, y_train))
        print "\t\tTest: %.6f" % (rbfclf.score(X_test_mod, y_test))
        print "\tF1 Score:"
        print "\t\tTrain: %.6f" % (performance(
            y_train, y_train_pred, metric="f1_score"))
        print "\t\tTest: %.6f" % (performance(
            y_test, y_test_pred, metric="f1_score"))
Example #35
train_X, test_X = X[train_idx], X[test_idx]
train_Y, test_Y = Y[train_idx], Y[test_idx]
train_m_id, test_m_id = m_id[train_idx], m_id[test_idx]

train_m_id = set(train_m_id.tolist())
print len(train_X), len(test_X)

train_label_freq = Counter(train_Y)
print train_label_freq, len(train_label_freq)

test_label_freq = Counter(test_Y)
print test_label_freq, len(test_label_freq)

#majority classifier
maj_clf = DummyClassifier(strategy='most_frequent')
maj_clf.fit(train_X, train_Y)

maj_pred_Y = maj_clf.predict(test_X)
maj_label = maj_pred_Y[0]

print pred_eval(test_Y, maj_pred_Y), maj_label
# print Counter(maj_pred_Y)

#linear svm
svc_clf = LinearSVC(penalty='l2', C=10.0, dual=False, multi_class='ovr')
svc_clf.fit(train_X, train_Y)

svc_pred_Y = svc_clf.predict(test_X)

print pred_eval(test_Y, svc_pred_Y)
svc_label_freq = Counter(svc_pred_Y)
scores = cross_val_score(clf, X, y, cv=5)
end = time.time()
accuracy_all.append(accuracy_score(prediction,test_labels))
cvs_all.append(np.mean(scores))

#print("1-GaussianNB accuracy:",clf.score(test,test_labels))
print("1-GaussianNB accuracy :",accuracy_score(prediction,test_labels))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds \n".format(end-start))


# 2-Initialize our dummy classifier
start = time.time()
dummy=DummyClassifier()
# Train our classifier
dummy.fit(train, train_labels)
prediction = dummy.predict(test)
scores = cross_val_score(dummy, X, y, cv=5)
end = time.time()
accuracy_all.append(accuracy_score(prediction,test_labels))
cvs_all.append(np.mean(scores))

#print("2-Dummy accuracy:",dummy.score(test,test_labels))
print("2-Dummy Accuracy: {0:.2%}".format(accuracy_score(prediction,test_labels)))
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))
print("Execution time: {0:.5} seconds \n".format(end-start))


# 3-Initialize our KNeighbors classifier
start = time.time()
clf = neighbors.KNeighborsClassifier()
Example #37
    y_temp=y_temp.to_numpy()
    y=[]
    for i in range(len(y_temp)):
        if np.array_equal(y_temp[i],np.array([0,0])):
            y.append(0)
        elif np.array_equal(y_temp[i],np.array([1,0])):
            y.append(1)
        elif np.array_equal(y_temp[i],np.array([0,1])):
            y.append(2)
        elif np.array_equal(y_temp[i],np.array([1,1])):
            y.append(3)

    y=np.array(y)

    model = DummyClassifier(strategy="most_frequent")
    model.fit(X, y)
    y_pred = model.predict(X)
    accuracy1 = accuracy_score(y, y_pred)
    print('Majority Class Base Accuracy',accuracy1,file=f)

    model = DummyClassifier(strategy="stratified")
    model.fit(X, y)
    y_pred = model.predict(X)
    accuracy2 = accuracy_score(y, y_pred)
    print('Stratified Class Base Accuracy',accuracy2,file=f)

    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.pipeline import Pipeline

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
Example #38
    #         model_mem = InMemoryModel(model.predict_proba, examples=test_data)
    #         interpreter.feature_importance.plot_feature_importance(model_mem, ascending=False, ax=ax)
    #         ax.set_title(f"{title} on fold {fold}")
    #         print("\n")
    #         modelno += 1
    #     fold += 1
    # plt.tight_layout()

    for train_index, test_index in kFold.split(yeastAttrib):
        print(f"------------" f"Fold {fold}")
        modelno = 1
        train_data, train_target = yeastAttrib[train_index], yeastTarget[
            train_index]
        test_data, test_target = yeastAttrib[test_index], yeastTarget[
            test_index]
        dummy.fit(train_data, train_target)
        prediction = dummy.predict(test_data)
        print("Dummy prediction")
        print(classification_report(test_target, prediction))
        for model, title in zip(models, titles):
            clf = model.fit(train_data, train_target)
            prediction = clf.predict(test_data)
            print(f"{title}")
            print(classification_report(test_target, prediction))
            print(
                f"Confusion Matrix: \n {confusion_matrix(test_target, prediction)}"
            )

            # ax = axs[modelno - 1, fold - 1]
            interpreter = Interpretation(test_data,
                                         feature_names=featureNames[1:9])
Example #39
kfold = KFold(10, True, 1)
fold_number = 1

for train, test in kfold.split(data):
    print "........... Fold %d ..........." % fold_number

    training_corpus = build_corpus(train)

    train_labels = build_labels(train)

    test_corpus = build_corpus(test)

    test_labels = build_labels(test)

    #Generating dummy accuracies for each fold.
    dummy_clf.fit(training_corpus, train_labels)
    dummy_accuracies.append(dummy_clf.score(test_corpus, test_labels) * 100)

    vectorizer = CountVectorizer(ngram_range=args.ngrange,
                                 stop_words=stop_words,
                                 binary=args.onehot,
                                 analyzer='word',
                                 token_pattern=r'\b[^\W\d]+\b')

    vectors = vectorizer.fit_transform(training_corpus).toarray()

    sums = vectors.sum(axis=0)

    j = 0
    print "Cleaning up the TDM according to the supplied cutoff value:"
    for i in tqdm(vectorizer.vocabulary_.items()):
def ModelRandomGuessing(hog_features, labels, pp):
    model = "RandomGuessing"
    clf = DummyClassifier()
    clf.fit(hog_features, labels)
    joblib.dump((clf, pp), "model0randomguessing.pkl", compress=3)
    return (model, clf)
Example #41
    def train_rf(self, features, labels):

        print('Training random forest ...')

        self.model = RandomForestRegressor(n_estimators=100,
                                           max_features='sqrt',
                                           max_depth=np.ceil(
                                               len(features[0]) / 5),
                                           min_samples_leaf=3,
                                           n_jobs=-1)

        self.model2 = RandomForestClassifier(n_estimators=100,
                                             max_features='sqrt',
                                             max_depth=np.ceil(
                                                 len(features[0]) / 5),
                                             min_samples_leaf=3,
                                             n_jobs=-1)

        self.lr0 = linear_model.TheilSenRegressor()
        self.lr1 = linear_model.TheilSenRegressor()

        reg_dummy = DummyRegressor()
        clf_dummy = DummyClassifier()

        kfold = KFold(n_splits=self.kfold, shuffle=True)
        kfold2 = KFold(n_splits=self.kfold, shuffle=True)

        features, labels = shuffle(features, labels)

        import matplotlib.pyplot as plt
        import seaborn as sns
        sns.set(style='whitegrid', context='paper')

        for ifold, (train, test) in enumerate(kfold.split(labels)):
            self.model.fit(features[train], labels[train])
            score_train = self.model.score(features[train], labels[train])
            score_test = self.model.score(features[test], labels[test])
            reg_dummy.fit(features[train], labels[train])
            score_dummy = reg_dummy.score(features[test], labels[test])
            print('Fold %d: %.4f / %.4f (%.4f)' %
                  (ifold, score_test, score_train, score_dummy))

            labels_t = labels.transpose()
            y_pred = self.model.predict(features)
            y_pred_t = y_pred.transpose()
            # self.lr0.fit(labels_t[0][train].reshape(-1, 1), y_pred_t[0][train])
            self.lr1.fit(labels_t[1][train].reshape(-1, 1), y_pred_t[1][train])
            y_lr = self.lr1.predict(labels_t[1][test].reshape(-1, 1))
            dy = np.abs(y_pred_t[1][test] - y_lr) < 0.2
            print('\t%d / %d' % (np.sum(dy), np.sum(1 - dy)))
            for jfold, (train2, test2) in enumerate(kfold2.split(dy)):
                self.model2.fit(features[test[train2]], dy[train2])
                y_pred2 = self.model2.predict(features[test[test2]])
                score_train2 = precision_score(dy[train2],
                                               self.model2.predict(
                                                   features[test[train2]]),
                                               average='binary')
                score_test2 = precision_score(dy[test2],
                                              y_pred2,
                                              average='binary')
                clf_dummy.fit(features[test[train2]], dy[train2])
                score_dummy = precision_score(dy[test2],
                                              clf_dummy.predict(
                                                  features[test[test2]]),
                                              average='binary')
                print('\tFold %d: %.4f / %.4f (%.4f)' %
                      (jfold, score_test2, score_train2, score_dummy))

                score_final_train = self.model.score(features[test[train2]],
                                                     labels[test[train2]])
                score_final_test = self.model.score(
                    features[test[test2[y_pred2]]],
                    labels[test[test2[y_pred2]]])
                print('\tFinal: %.4f / %.4f' %
                      (score_final_test, score_final_train))

            fig, axs = plt.subplots(2, 2)
            train_truth = labels[train].transpose()
            train_pred = self.model.predict(features[train]).transpose()
            test_truth = labels[test].transpose()
            test_pred = y_pred[test].transpose()
            sns.scatterplot(x=train_truth[0], y=train_pred[0], ax=axs[0, 0])
            sns.scatterplot(x=train_truth[1], y=train_pred[1], ax=axs[0, 1])
            sns.scatterplot(x=test_truth[0][test2[y_pred2]],
                            y=test_pred[0][test2[y_pred2]],
                            ax=axs[1, 0])
            sns.scatterplot(x=test_truth[1][test2[y_pred2]],
                            y=test_pred[1][test2[y_pred2]],
                            ax=axs[1, 1])
            plt.draw()

        plt.show()

        return
Example #42
class ModelDummyClassifier:

    def __init__(self):

        self.version = 'dummy_classifier__' + datetime.datetime.today().strftime("%Y%m%d")

        self.global_config = dict()

        self.vardict = self.get_model_vardict()

        self.model_config = {
            'strategy': 'prior'
        }

        self.model = DummyClassifier(**self.model_config)

        self.time_for_training = 0

    def get_model_vardict(self):

        vardict = dict()

        # Target
        vardict["target"] = "result"

        # Numerical
        vardict["numerical"] = [
            #"nb_characters_german",
            #"nb_characters_english",
            "levenshtein_distance_german_english",
            #"previous_score",
            #"previous_question_time",
            "difficulty_category",
        ]

        # Difference in time
        vardict["diff_time"] = [
            #"days_since_last_occurrence_same_language",
            "days_since_first_occur_any_language",
        ]

        # Boolean
        vardict["boolean"] = [
            #"previous_result",
            "is_noun",
        ]

        # Categorical
        vardict["categorical"] = [
            #"language_asked",
            "previous_language_asked",
        ]

        # vardict['all'] = vardict['numerical'] + vardict['diff_time'] + vardict['boolean'] + vardict['categorical']

        return vardict

    def preprocessing_training(self, dataset):

        self.vardict["into_model"] = (
                self.vardict['numerical'] +
                self.vardict['diff_time'] +
                self.vardict['boolean'] +
                self.vardict['categorical']
        )

        return dataset

    def train(self, dataset):

        X_train = dataset[self.vardict["into_model"]]
        y_train = dataset[self.vardict["target"]]

        start = time.time()
        self.model.fit(X_train, y_train)
        end = time.time()

        self.time_for_training = end - start

    def preprocessing_inference(self, dataset):

        return dataset

    def predict(self, dataset, target_present=False):

        X_valid = dataset[self.vardict["into_model"]].copy()

        predictions = X_valid.copy()
        predictions["y_pred"] = self.model.predict(X_valid)
        predictions["y_proba"] = [x[1] for x in self.model.predict_proba(X_valid)]

        if target_present:
            predictions["y_true"] = dataset[self.vardict["target"]].copy()

        return predictions
# We will look at confusion matrices of the different predictions
# %% [markdown]
# |                   |Predicted Negative|Predicted Positive|
# |-------------------|------------------|-----------------|
# |**Actual Negative**|True Negative     |False Positive   |
# |**Actual Positive**|False Negative    |True Positive    |
# %% [markdown]
# ### Dummy Classifier
# %% [markdown]
# **Fit and predict**

# %%
dummy = DummyClassifier()

## fit on the training data
dummy.fit(X_train, y_train)

## make predictions on test data
dummy_test_pred = dummy.predict(X_test)

## fit on the scaled training data
dummy.fit(X_train_scale, y_train)

## make predictions on scaled test data
dummy_test_pred_scale = dummy.predict(X_test_scale)

# %% [markdown]
# **Confusion matrix**

# %%
dummy_matrix = confusion_matrix(y_test, dummy_test_pred)
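
# %% [markdown]
# A quick, hedged sketch of reading the matrix above (assuming a binary 0/1 target): with
# scikit-learn's rows = actual, columns = predicted convention, `ravel()` unpacks the four
# cells in the order TN, FP, FN, TP.

# %%
tn, fp, fn, tp = dummy_matrix.ravel()
print(f"TN={tn}  FP={fp}  FN={fn}  TP={tp}")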
Example #44
    def _train_local_classifier(self, X, y, node_id):
        if self.graph_.out_degree(node_id) == 0:
            # Leaf node
            if self.algorithm == "lcpn":
                # Leaf nodes do not get a classifier assigned in LCPN algorithm mode.
                self.logger.debug(
                    "_train_local_classifier() - skipping leaf node %s when algorithm is 'lcpn'",
                    node_id,
                )
                return

        X = self.graph_.node[node_id]["X"]
        nnz_rows = nnz_rows_ix(X)
        X_ = X[nnz_rows, :]

        y_rolled_up = rollup_nodes(
            graph=self.graph_,
            source=node_id,
            targets=[y[idx] for idx in nnz_rows],
        )

        if self.is_tree_:
            y_ = flatten_list(y_rolled_up)
        else:
            # Class hierarchy graph is a DAG
            X_, y_ = apply_rollup_Xy(X_, y_rolled_up)

        num_targets = len(np.unique(y_))

        self.logger.debug(
            "_train_local_classifier() - Training local classifier for node: %s, X_.shape: %s, len(y): %s, n_targets: %s",  # noqa:E501
            node_id,
            X_.shape,
            len(y_),
            num_targets,
        )

        if X_.shape[0] == 0:
            # No training data could be materialized for current node
            # TODO: support a 'strict' mode flag to explicitly enable/disable fallback logic here?
            self.logger.warning(
                "_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node %s",  # noqa:E501
                node_id,
            )
            return
        elif num_targets == 1:
            # Training data could be materialized for only a single target at current node
            # TODO: support a 'strict' mode flag to explicitly enable/disable fallback logic here?
            constant = y_[0]
            self.logger.debug(
                "_train_local_classifier() - only a single target (child node) available to train classifier for node %s, Will trivially predict %s",  # noqa:E501
                node_id,
                constant,
            )

            clf = DummyClassifier(strategy="constant", constant=constant)
        else:
            clf = self._base_estimator_for(node_id)

        clf.fit(X=X_, y=y_)
        self.graph_.node[node_id][CLASSIFIER] = clf
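
# A minimal, standalone sketch of the single-target fallback above (assuming numpy and
# DummyClassifier are imported as in the other examples): when only one child label is
# reachable from a node, a constant-strategy DummyClassifier simply echoes that label.
fallback = DummyClassifier(strategy="constant", constant="node_A")
fallback.fit(np.zeros((3, 4)), ["node_A", "node_A", "node_A"])
print(fallback.predict(np.zeros((2, 4))))  # -> ['node_A' 'node_A']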
def optimal_models_performance(X, y, optimal_k, optimal_C, y_label):
    ''' Grid search for optimal NLP classifier models (SVM and kNN). Plot ROC curves, generate confusion matrices and classification reports. '''
    
    #1. Split the data
    testSizeX = 0.33 #67:33 split
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= testSizeX, random_state=42) 
    
    #SVM
    svm_model = Pipeline([('vect', CountVectorizer(stop_words = nltk.corpus.stopwords.words('english'))),
                      ('tfidf', TfidfTransformer()),
                      ('clf', LinearSVC())])
    
    #Knn
    knn_model = Pipeline([('vect', CountVectorizer(stop_words = nltk.corpus.stopwords.words('english'))),
                      ('tfidf', TfidfTransformer()),
                      ('clf', KNeighborsClassifier(n_neighbors = optimal_k, weights= 'uniform'))])
    
    #Dummy classifier
    dummy_model = DummyClassifier(strategy='most_frequent').fit(Xtrain, ytrain)
    
    
    #Grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False)} 
    
    #************************************************
    #Svm: Train svm model
    svm_gs = GridSearchCV(svm_model, parameters, n_jobs=-1)

    #Performance - best performing
    print('*********************************************')
    print('====== \n Results for svm grid search model:')
    
    svm_gs = svm_gs.fit(Xtrain, ytrain)  
    print(svm_gs.best_params_)       
    predicted = svm_gs.predict(Xtest)
    
    print(confusion_matrix(ytest, predicted))
    print(classification_report(ytest, predicted))
    
    #************************************************
    #Train knn model
    knn_gs = GridSearchCV(knn_model, parameters, n_jobs=-1)

    #Performance - best performing 
    print('*********************************************')    
    print('====== \n Results for knn grid search model:')
    
    knn_gs = knn_gs.fit(Xtrain, ytrain) 
    print(knn_gs.best_params_) 
    predicted = knn_gs.predict(Xtest)
    
    print(confusion_matrix(ytest, predicted))
    print(classification_report(ytest, predicted))
    
    #**********************************************
    #Dummy model
    print('*********************************************') 
    print('====== \n Results for dummy model:')
       
    dummy_model_fitted = dummy_model.fit(Xtrain, ytrain)  
    predicted = dummy_model_fitted.predict(Xtest)
    
    print(confusion_matrix(ytest, predicted))
    print(classification_report(ytest, predicted))
    
    #**********************************************
    #ROC plots
    plt.figure()
    
    #svm model 
    scores = svm_gs.decision_function(Xtest)
    fpr, tpr, _= roc_curve(ytest, scores)
    plt.plot(fpr,tpr, label = 'SVM')
    print('SVM AUC = {}'.format(auc(fpr, tpr)))

    #knn model
    scores = knn_gs.predict_proba(Xtest)[:,1]
    fpr, tpr, _= roc_curve(ytest, scores)
    plt.plot(fpr,tpr, color = 'r', label = 'knn')
    print('knn AUC = {}'.format(auc(fpr, tpr)))

    #Baseline Model
    scores_bl = dummy_model_fitted.predict_proba(Xtest)
    fpr, tpr, _= roc_curve(ytest, scores_bl[:, 1])
    plt.plot(fpr,tpr, color = 'orange', label = 'baseline model')
    print('AUC = {}'.format(auc(fpr, tpr)))
    
    #Random Choice
    plt.plot([0, 1], [0, 1],'g--') 

    #Labels
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve. X = review text, y = {}'.format(y_label))

    plt.legend(['Svm', 'Knn', 'Baseline (most freq)','Random Classifier']) 
    plt.savefig('./roc_{}'.format(y_label))
    plt.show()
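
# Hypothetical call (assumes the NLTK English stopword list is available locally and that X is
# a list or Series of review texts with a binary 0/1 label vector y):
docs = ["great product", "terrible service", "loved it", "awful quality"] * 10
sentiment = np.array([1, 0, 1, 0] * 10)
optimal_models_performance(docs, sentiment, optimal_k=3, optimal_C=1.0, y_label='sentiment')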
Example #46
def eval_zero_rule(args):
    samples_dir_path = args.samples_dir
    class_count = args.class_count
    sent_count = args.sent_count
    split_dir_path = args.split_dir

    #
    # Check that (input) POWER Samples Directory exists
    #

    logging.info('Check that (input) POWER Samples Directory exists ...')

    samples_dir = SamplesDir(Path(samples_dir_path))
    samples_dir.check()

    #
    # Check that (input) POWER Split Directory exists
    #

    logging.info('Check that (input) POWER Split Directory exists ...')

    split_dir = SplitDir(Path(split_dir_path))
    split_dir.check()

    #
    # Load entity/relation labels
    #

    logging.info('Load entity/relation labels ...')

    ent_to_lbl = split_dir.entities_tsv.load()
    rel_to_lbl = split_dir.relations_tsv.load()

    #
    # Load datasets
    #

    logging.info('Load test dataset ...')

    test_set = samples_dir.test_samples_tsv.load(class_count, sent_count)

    #
    # Calc class frequencies
    #

    logging.info('Calc class frequencies ...')

    _, _, test_classes_stack, _ = zip(*test_set)
    test_freqs = np.array(test_classes_stack).mean(axis=0)

    #
    # Evaluate
    #

    logging.info(f'test_freqs = {test_freqs}')

    for strategy in ('uniform', 'stratified', 'most_frequent', 'constant'):
        logging.info(strategy)

        mean_metrics = []
        for i, gt in tqdm(enumerate(np.array(test_classes_stack).T)):

            if strategy == 'constant':
                classifier = DummyClassifier(strategy='constant', constant=1)
                classifier.fit([0, 1], [0, 1])
            else:
                classifier = DummyClassifier(strategy=strategy)
                classifier.fit(gt, gt)
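                # DummyClassifier ignores its X argument, so the ground-truth vector doubles
                # as a placeholder feature matrix here purely to satisfy fit()'s signature.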

            metrics_list = []
            for _ in range(10):
                pred = classifier.predict(gt)

                acc = accuracy_score(gt, pred)
                prec, recall, f1, _ = precision_recall_fscore_support(
                    gt, pred, labels=[1], zero_division=1)

                metrics_list.append((acc, prec[0], recall[0], f1[0]))

            mean_metrics.append(np.mean(metrics_list, axis=0))

        logging.info(mean_metrics[0])
        logging.info(mean_metrics[-1])
        logging.info(np.mean(mean_metrics, axis=0))
def test_string_labels():
    X = [[0]] * 5
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), ["paris"] * 5)
Example #48
    print(npz_file.keys())

with np.load('mnist-6k.npz', allow_pickle=False) as npz_file:
    X = npz_file['data']
    y = npz_file['labels']

X_tr, X_te, y_tr, y_te = train_test_split(X,
                                          y,
                                          stratify=y,
                                          test_size=1 / 6,
                                          random_state=0)
print(X_tr.shape, X_te.shape, y_tr.shape, y_te.shape)

# Dummy classifier
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_tr, y_tr)

# Accuracy on test set
accuracy = dummy.score(X_te, y_te)
print('Baseline accuracy: {:.3f}'.format(accuracy))

# k-NN classifier

scaler = StandardScaler()

# grid search for optimal k:
k_values = np.arange(1, 50, 5)

test_curve = []

for k in k_values:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.67,
                                                        random_state=i)

    vect = CountVectorizer(stop_words='english',
                           analyzer="word",
                           min_df=2,
                           max_df=0.8)
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    feat_dtm = vect.get_feature_names()

    clf = DummyClassifier()
    clf.fit(X_train_dtm, y_train)
    y_pred = clf.predict(X_test_dtm)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    #print(accuracy)
    arr_Accu.append(accuracy)

#Vectorize
vect = CountVectorizer(stop_words='english',
                       analyzer="word",
                       min_df=2,
                       max_df=0.8)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
feat_dtm = vect.get_feature_names()
    vector_strs = []
    with open(VECTORS_FOLDER + (EP_NUMBER_FORMAT % ep_num), 'r') as file:
        vector_strs = file.read().splitlines()

    for vector_str in vector_strs:
        test_data_vec.append(ast.literal_eval(vector_str))

test_data = np.array(test_data_vec)

test_data_vecs = test_data[:, :-1]
test_data_labels = test_data[:, -1]

# set up logistic regression classifier and fit to data
log_reg_clf = LogisticRegression(solver='newton-cg', max_iter=50,\
    random_state=0, multi_class='multinomial',\
    verbose=0).fit(train_data_vecs, train_data_labels)

print('Logistic Regression Accuracy: ' + \
    str(log_reg_clf.score(test_data_vecs, test_data_labels)))

ridge_clf = RidgeClassifier(solver='auto')
ridge_clf.fit(train_data_vecs, train_data_labels)

print('Ridge Regression Accuracy: ' + \
    str(ridge_clf.score(test_data_vecs, test_data_labels)))

dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(train_data_vecs, train_data_labels)
print('Stratified Random Accuracy: ' + \
    str(dummy_clf.score(test_data_vecs, test_data_labels)))
Example #51
0
# assumption: the truncated call above split features x and labels y with train_test_split
treino_x, teste_x, treino_y, teste_y = train_test_split(x, y,
                                                        random_state=SEED,
                                                        stratify=y)

modelo = LinearSVC(random_state=SEED)
print("We will train with %d elements and test with %d elements" %
      (len(treino_x), len(teste_x)))
modelo.fit(treino_x, treino_y)

previsoes = modelo.predict(teste_x)
taxa_de_acerto = accuracy_score(teste_y, previsoes)


print("Accuracy rate %.2f%%" % (taxa_de_acerto * 100))


dummy = DummyClassifier()
dummy.fit(treino_x, treino_y)
previsoes = dummy.predict(teste_x)

acuracia = accuracy_score(teste_y, previsoes)

print("The accuracy of the Dummy algorithm was %.2f%%" % (acuracia * 100))


dummy = DummyClassifier(random_state=SEED)
dummy.fit(treino_x, treino_y)
acuracia = dummy.score(teste_x, teste_y)

print("The accuracy of the Dummy algorithm was %.2f%%" % (acuracia * 100))


SEED = 8
Example #52
0
# assumption: the truncated call above split x and y with train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    random_state=0)

#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
"""#Handling the imbalance dataset
from imblearn.combine import SMOTETomek
smk= SMOTETomek()
x_res,y_res = smk.fit_sample(x,y)"""
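# Note: in recent imbalanced-learn versions the commented-out resampling call
# above would be smk.fit_resample(x, y) rather than smk.fit_sample(x, y).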

#Building Model
#Baseline dummy classifier
reg = DummyClassifier()
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)

#confusion matrix
from sklearn.metrics import confusion_matrix
con = confusion_matrix(y_test, y_pred)
print(con)

#checking Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
def run_classifier(feature_path, labelled_path):
    training_data = pd.read_csv(feature_path)
    labelled_data = pd.read_csv(labelled_path)
    feature_df = training_data
    headers = feature_df.columns
    labels_df = np.array(labelled_data['label'], dtype=int)
    feature_df = feature_df.to_numpy()
    num_trees = 100
    max_features = int(math.sqrt(feature_df.shape[1]))
    model = RandomForestClassifier(n_estimators=num_trees,
                                   max_features=max_features)
    dummy_model = DummyClassifier(constant=None,
                                  random_state=0,
                                  strategy='most_frequent')
    scores = cross_val_score(model,
                             feature_df,
                             labels_df,
                             cv=ShuffleSplit(n_splits=3,
                                             train_size=0.7,
                                             random_state=0))
    # scores = cross_val_score(model, feature_df , labels_df, cv=10, verbose=1)
    dummy_scores = cross_val_score(dummy_model,
                                   feature_df,
                                   labels_df,
                                   cv=ShuffleSplit(n_splits=3,
                                                   train_size=0.7,
                                                   random_state=0))
    print('randomforest_scores=', np.mean(scores), np.std(scores))
    print('dummy_scores=', np.mean(dummy_scores), np.std(dummy_scores))
    # shuffle features and labels together so they stay aligned
    shuffle_idx = np.random.permutation(feature_df.shape[0])
    feature_df = feature_df[shuffle_idx]
    labels_df = labels_df[shuffle_idx]
    dummy_model = dummy_model.fit(feature_df[:400, :], labels_df[:400])
    model = model.fit(feature_df[:400, :], labels_df[:400])
    # print(model.predict(feature_df[:100,:]))
    print('randomforest', model.score(feature_df[400:, :], labels_df[400:]))
    print('dummy', dummy_model.score(feature_df[400:, :], labels_df[400:]))
    # return scores, model

    ########################
    # MANU ADDED from HERE #
    ########################
    # We set the features we want to use
    # features_to_use = ['var_mgw', 'motif_scores', 'bDNA','compA_d', 'compT_d', 'compG_d', 'compC_d', 'compA_u', 'compT_u', 'compG_u', 'compC_u', 'tfs_D_fw', 'tfs_D_rv', 'tfs_U_fw', 'tfs_U_fw.1', 'intergenetic']
    features_to_use = [
        'bDNA', 'var_mgw', 'motif_scores', 'tfs_D_fw', 'tfs_D_rv', 'tfs_U_fw',
        'tfs_U_fw.1'
    ]
    seqs = ['seq' + str(i) for i in range(21)]
    features_to_use += seqs

    # Load features from the files in the format required by scikit-learn,
    # filtering down to the desired features
    features, labels, headers = prepare_data_for_classifier(
        feature_path,
        labelled_path,
        randomize=True,
        only_columns=features_to_use)

    # DIFFERENT WAYS of running cross-validation!
    # Running the custom cross-validation just one time, with feature importance
    # analysis
    run_custom_cross(features, labels, headers, single_run=True)
    # We can also run it in verbose mode (will print all the confusion matrices
    # and scores)
    run_custom_cross(features, labels, headers, single_run=True, verbose=True)
    # Running the custom cross-validation multiple times (using single_run=False)
    # This can be used to calculate average behaviour
    for i in range(10):
        print(run_custom_cross(features, labels, headers, single_run=False))

    return None, None
Example #54
0
print(" Tf-idf, Balanced accuracy score = " +
      str(balanced_accuracy_score(y_test, pred_tfidf_balanced)))
print(" Tf-idf, Accuracy score = " +
      str(accuracy_score(y_test, pred_tfidf_balanced)))
report_tfidf = classification_report(y_test, pred_tfidf_balanced)

# count balanced accuracy: 42
# count accuracy: 43.5
# tf-idf balanced accuracy: 43.5
# tf-idf accuracy: 41.5
'''
-------------------- Baseline Models -------------------
'''

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
# DummyClassifier(strategy='uniform')  # alternative strategy, not used here
dummy_clf.predict(X_train)
print(dummy_clf.score(X_test, y_test))

# Multilabel
# Most frequent: 19.7 %
# Stratified: 14.2 %
# Uniform: 11.3

# Binary 50.46 %
'''
------------------- Feature importance -----------------
'''

Example #55
0
y = pd.read_csv(path)

print(y.shape)
print(X.shape)
"""##### KFold cross validation"""

kf = KFold(n_splits=10, shuffle=True, random_state=4)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=4)

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
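# NOTE: this loop only keeps the train/test indices of the last fold;
# a full cross-validation would fit and score a model inside the loop.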
"""###### Baseline Model"""

dummy_model = DummyClassifier(strategy='most_frequent', random_state=0)
dummy_model.fit(X_train, y_train)

print('score for baseline model : {0:.2f}'.format(
    dummy_model.score(X_test, y_test)))
print('accuracy for baseline model : {0:.2f}'.format(
    accuracy_score(y_test, dummy_model.predict(X_test))))

print('confusion matrix for baseline model: \n {0}'.format(
    confusion_matrix(y_test, dummy_model.predict(X_test))))

print('precision for baseline model : {0:.2f}'.format(
    precision_score(y_test, dummy_model.predict(X_test))))
print('recall for baseline model : {0:.2f}'.format(
    recall_score(y_test, dummy_model.predict(X_test))))
"""###### Logistic regression model"""
test_report = classification_report(y_test, y_pred, output_dict=True)

start_time = time.time()
y_pred = model.predict(X_train)
print(classification_report(y_train, y_pred))
# print('Accuracy score:', accuracy_score(y_train, y_pred))
trainingtime = (time.time() - start_time + training_time)

train_report = classification_report(y_train, y_pred, output_dict=True)

metric_list = ['precision', 'recall', 'f1-score']
avg_list = ['micro avg', 'macro avg', 'weighted avg']

test_str_output = "DT_sentiment\t" + f"{size}\t" + "test\t"
train_str_output = "DT_sentiment\t" + f"{size}\t" + "train\t"

for m in metric_list:
    for a in avg_list:
        test_str_output = test_str_output + f"{test_report[a][m]:.3f}\t"
        train_str_output = train_str_output + f"{train_report[a][m]:.3f}\t"
test_str_output += f"{testtime:.4f}"
train_str_output += f"{trainingtime:.4f}"
print(test_str_output.rstrip())
print(train_str_output.rstrip())

baselineClf = DummyClassifier(strategy="most_frequent")
baseline = baselineClf.fit(X_train, y_train)
y_pred_base = baseline.predict(X_test)
print(classification_report(y_test, y_pred_base))
print('Accuracy score:', accuracy_score(y_test, y_pred_base))
Example #57
0
def create_models(headlines):
    headline = headlines['headline']
    label = headlines['label']
    arr_Accu = []

    results = dict()

    for i in range(1, 20):
        headline_train, headline_test, label_train, label_test = train_test_split(
            headline, label, test_size=0.10, random_state=i)
        vect = CountVectorizer(max_features=1000, binary=True)
        headline_train_vector = vect.fit_transform(headline_train)
        headline_test_vector = vect.transform(headline_test)

        # Note: an attempt was made to balance the dataset, but the accuracy in the tests below did not improve
        # balancing = SMOTE()
        # headline_train_balanced, label_train_balanced = balancing.fit_sample(headline_train_vector, label_train)
        # oversampled_headlines, counts = np.unique(label_train_balanced, return_counts=True)
        # print(list(zip(oversampled_headlines, counts)))

        dummy = DummyClassifier()
        dummy.fit(headline_train_vector, label_train)
        prediction = dummy.predict(headline_test_vector)
        accuracy = metrics.accuracy_score(label_test, prediction)
        # print(accuracy)
        arr_Accu.append(accuracy)
    print(max(arr_Accu))
    max_random_state = arr_Accu.index(max(arr_Accu)) + 1
    print(max_random_state)
    for j in range(1, 20):
        print("Random State : ", j, "   Accuracy : ", arr_Accu[j - 1])

    # Trial with k-fold to find a suitable value of K for maximum accuracy
    # Note: the accuracy here is worse than before

    # arr_Accu = []
    # for i in range(3, 15):
    #     vect = CountVectorizer(stop_words='english', analyzer="word", min_df=2, max_df=0.8)
    #     headline_train_vector = vect.fit_transform(headline)
    #
    #     dummy = DummyClassifier()
    #     accuracy = cross_val_score(dummy, headline_train_vector, label, cv=i, scoring='accuracy')
    #
    #     arr_Accu.append(np.mean(accuracy))
    #
    # # print(arr_Accu)
    # for j in range(3, 15):
    #     print("K-Fold : ", j, "   Accuracy : ", arr_Accu[j - 3])

    # Start building the models with the best random state

    headline_train, headline_test, label_train, label_test = train_test_split(
        headline, label, test_size=0.10, random_state=max_random_state)
    print("random state chosen: ")
    print(max_random_state)
    vect = CountVectorizer(max_features=1000, binary=True)
    headline_train_vector = vect.fit_transform(headline_train)
    headline_test_vector = vect.transform(headline_test)
    # the training headlines are fit_transformed to fit the vectorizer
    # the test headlines are only transformed for testing

    # Multinomial Bayes
    mbayes = MultinomialNB()
    mbayes.fit(headline_train_vector, label_train)
    # print(mbayes.score(headline_train_vector, label_train))

    # actual testing with the test set we split off
    prediction = mbayes.predict(headline_test_vector)
    # print(prediction)
    accuracy = metrics.accuracy_score(label_test, prediction)
    #print('MBayes Accuracy : ', accuracy)
    results["bayes_accuracy"] = accuracy

    log_regression = LogisticRegression()
    log_regression.fit(headline_train_vector, label_train)
    prediction = log_regression.predict(headline_test_vector)
    accuracy = metrics.accuracy_score(label_test, prediction)
    print('LogisticRegression Accuracy : ', accuracy)

    results["Logistic_regression"] = accuracy

    decision_tree = DecisionTreeClassifier(criterion='entropy')
    decision_tree.fit(headline_train_vector, label_train)
    prediction = decision_tree.predict(headline_test_vector)
    accuracy = metrics.accuracy_score(label_test, prediction)
    print('DecisionTree Accuracy : ', accuracy)

    random_forest = RandomForestClassifier(criterion='entropy')
    random_forest.fit(headline_train_vector, label_train)
    prediction = random_forest.predict(headline_test_vector)
    accuracy = metrics.accuracy_score(label_test, prediction)
    print('RandomForestClassifier Accuracy : ', accuracy)

    adaboost = AdaBoostClassifier()
    adaboost.fit(headline_train_vector, label_train)
    prediction = adaboost.predict(headline_test_vector)
    accuracy = metrics.accuracy_score(label_test, prediction)
    print('Adaboost Accuracy : ', accuracy)

    bernoulli_bayes = BernoulliNB()
    bernoulli_bayes.fit(headline_train_vector, label_train)
    prediction = bernoulli_bayes.predict(headline_test_vector)
    accuracy = metrics.accuracy_score(label_test, prediction)
    print('BernoulliNB Accuracy : ', accuracy)

    linear_SVC = LinearSVC()
    linear_SVC.fit(headline_train_vector, label_train)
    prediction = linear_SVC.predict(headline_test_vector)
    accuracy = metrics.accuracy_score(label_test, prediction)
    print('Linear_SVC Accuracy : ', accuracy)

    # passive_aggressive = PassiveAggressiveClassifier()
    # passive_aggressive.fit(headline_train_vector, label_train)
    # prediction = passive_aggressive.predict(headline_test_vector)
    # accuracy = metrics.accuracy_score(label_test, prediction)
    # print('PassiveAggressiveClassifier Accuracy : ', accuracy)
    return results
Example #58
0
class_counts = pd.Series(Counter(y_train))
class_counts /= class_counts.sum()
class_counts

# %% [markdown]
# We can observe that the positive class, `'donated'`, comprises only 24% of
# the samples. The good accuracy of our classifier is therefore tied to its
# ability to correctly predict the negative class `'not donated'`, which may
# or may not be relevant, depending on the application. We can illustrate the
# issue using a dummy classifier as a baseline.

# %%
from sklearn.dummy import DummyClassifier

dummy_classifier = DummyClassifier(strategy="constant", constant="not donated")
dummy_classifier.fit(X_train, y_train).score(X_test, y_test)

# %% [markdown]
# With the dummy classifier, which always predicts the negative class
# `'not donated'`, we obtain an accuracy score of 76%. This means that the
# classifier, without learning anything from the data `X`, is capable of
# predicting as accurately as our logistic regression model.
#
# The problem illustrated above is also known as the class imbalance problem.
# When the classes are imbalanced, accuracy alone should not be used. In this
# case, one should either use the precision, recall, or F1 score as presented
# above, or the balanced accuracy score, instead of plain accuracy.
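
# %% [markdown]
# As a minimal sketch (synthetic labels, not from the original dataset), the
# next cell contrasts plain accuracy with balanced accuracy for a
# most-frequent dummy baseline on an 80/20 imbalanced label vector.

# %%
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score

y_imb = np.array([0] * 80 + [1] * 20)  # hypothetical imbalanced labels
X_imb = np.zeros((100, 1))             # features are ignored by the dummy

baseline = DummyClassifier(strategy="most_frequent").fit(X_imb, y_imb)
pred = baseline.predict(X_imb)
print(accuracy_score(y_imb, pred))           # 0.80 - looks deceptively good
print(balanced_accuracy_score(y_imb, pred))  # 0.50 - the baseline is uninformative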

# %%
def denseNN_grid_search(*, 
    dataset_name,
    method_name,
    module_name,
    PATH_encoded,
    train_subset_names,
    test_subset_names,
    # ...                    
    class_encoding,
    grid,
    store_predictions=True,
    track_progres=True,
    verbose=False,
    plot_history=False # applied only if verbose==True, 
):
    
    # dicts to store results,
    model_acc_and_parameters_list = list()
    model_predictions_dict = dict()
    model_history_dict = dict()
    class_decoding = dict(zip(list(list(class_encoding.values())), list(class_encoding.keys()))) # reverse of class_encoding,
    
    # .. 
    if track_progres==True:
        print(f"{module_name} _________________________________________ {pd.to_datetime('now')}")
    else:
        pass

    # Grid search, 
    model_ID = -1 # id number for each model, its predictions, I started with -1 so the first id will be 0 !
    for params in grid:    

        # PARAMETERS, ...................................
        model_ID +=1
        Xy_names = ["train", "valid", "test"] # internal names for the datasets created by this function;
        # the external datasets they are built from may have the same or different names,
            
        if track_progres==True:
            print('.', end="")
        else:
            pass
        
        
        
        # LOAD & PREPARE THE DATA ,......................
        
        # find any logfile created while saving img files, 
        os.chdir(PATH_encoded)
        logfiles = []
        for file in glob.glob(f"{''.join([module_name,'_',dataset_name])}*_logfile.csv"):
            logfiles.append(file)
                
        # Load train data, 
        X_tot, batch_labels = load_encoded_imgbatch_using_logfile(logfile_name=logfiles[0], load_datasetnames=train_subset_names)
        X_tot = X_tot.astype(float)
        y_tot = pd.Series(batch_labels.classname).map(class_encoding).values.astype("int")
        
        # Load test data, 
        X_te, batch_labels = load_encoded_imgbatch_using_logfile(logfile_name=logfiles[0], load_datasetnames=test_subset_names)
        X_te = X_te.astype(float)
        y_te = pd.Series(batch_labels.classname).map(class_encoding).values.astype("int")
        idx_y_te = np.arange(y_te.shape[0]) # kept for compatibility

        # ... Split data into train/validation sets
        """ here it is done to prepare the script for future applications"""
        X_tr, X_valid, y_tr, y_valid = train_test_split(
                X_tot, y_tot, 
                train_size=params["train_test_split__train_size"], 
                test_size=(1-params["train_test_split__train_size"]),
                random_state=params["random_state"]
        )     
        
         # ... get xy_idx to identify raw images in train/valid datasets, 
        _, _, idx_y_tr, idx_y_valid = train_test_split(
                X_tot, np.arange(X_tot.shape[0], dtype="int"), 
                train_size=params["train_test_split__train_size"], 
                test_size=(1-params["train_test_split__train_size"]),
                random_state=params["random_state"]
        )

        # place all in dict,
        X_dct = dict(zip(Xy_names, [X_tr, X_valid, X_te]))
        y_dct = dict(zip(Xy_names, [y_tr, y_valid, y_te]))
        idx_y_dct = dict(zip(Xy_names, [idx_y_tr, idx_y_valid, idx_y_te]))            

        # SHUFFLE , ................................... 
        'only in case X_tot is used for NN training'
            
        # shuffle the samples in X_tot - otherwise the model will load batches smaller than a class,
        # i.e. one batch will often contain samples from only one class,
        # ... which quickly leads to overfitting, with low accuracy and a huge loss on the validation set,
        idx = np.arange(X_tot.shape[0])
        my_seed = np.random.RandomState(params["random_state"])
        idx_mix = my_seed.choice(a=idx, size=idx.shape[0], replace=False)
        X_tot = X_tot[idx_mix,:].copy()
        y_tot = y_tot[idx_mix].copy()        
        

        # INFO , ................................... 
        if verbose==True:
            print(f"\n{''.join(['-']*40)}"); print(f"{''.join(['-']*40)}");print(f"{''.join(['-']*40)}")
            print(f'{model_ID}: {module_name}, logfie: {logfiles[0]}'); print(f"{''.join(['-']*40)}")
            print("PARAMETERS:"); print(f'{model_ID}: {params}')
            print("INPUT DATA DIMENSIONS:");
            for xyname in Xy_names:
                print(f"{xyname}: {X_dct[xyname].shape}")
        else:
            pass

        
        # BASELINE, ...............................
        'Create most-frequent baseline - done mainly for backward compatibility'
        dummy = DummyClassifier(strategy='most_frequent')
        dummy.fit(X_dct["train"].astype(float), y_dct["train"].astype(int))
        # ..
        baseline_acc = dict()
        for xyname in Xy_names:
            baseline_acc[f"baseline_acc_{xyname}"] = dummy.score(X_dct[xyname], y_dct[xyname])

        if verbose==True:
            print(" --- ", model_ID, baseline_acc)    
        else:
            pass
            

        # CREATE AND TRAIN THE MODEL ,................
        "params dict is used here to provide inputs for parameter values"
               
        # from keras import backend as K
        K.clear_session()    
            
        # create model
        if params["model"]=="one_layer":
            model = create_keras_one_layer_dense_model(
                input_size = X_tot.shape[1],
                output_size = len(list(class_encoding.keys())),
                verbose = verbose,
                **params
                )            
                  
        if params["model"]=="two_layers":
            model = create_keras_two_layer_dense_model(
                input_size = X_tot.shape[1],
                output_size = len(list(class_encoding.keys())),
                verbose = verbose,
                **params
                )

        # define early stopping - End training when acc stops improving (optional)
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', 
            patience=params["EarlyStopping__patience"], 
            restore_best_weights=True
        )
        
        # Fit model
        history = model.fit(
            x=X_tot, # samples are subdivided internally, 
            y=y_tot,
            validation_split=params['fit__validation_split'], 
            batch_size=params['fit__batch_size'], 
            epochs=params["fit__epoch"],
            shuffle=True, # Shuffle training samples
            callbacks=[early_stopping],
            verbose=0  # no info,
        )
        
        

        # EVALUTE MODEL ACC, .......................... 
        model_acc = dict()
        loss_acc = dict()
        # ...
        n = params["EarlyStopping__patience"]  # number of final epochs (the early-stopping patience) to average over,
        acc_results = pd.DataFrame(history.history).iloc[-n::,:].mean(axis=0)
        model_acc["model_acc_train"] = acc_results.loc["acc"]  
        model_acc["model_acc_valid"] = acc_results.loc["val_acc"]
        model_acc["model_loss_train"] = acc_results.loc["loss"]
        model_acc["model_loss_valid"] = acc_results.loc["val_loss"]     
        # ...
        loss, acc = model.evaluate(X_dct["test"], y_dct["test"], verbose=0)
        model_acc["model_acc_test"] = acc
        model_acc["model_loss_test"] = loss  
    
        # COLLECT THE RESULTS ,..............................  
        'acc_restuls_and_params is attached to every result object in case there are later doubts about where a result came from,'

        # 1. acc_restuls_and_params
        acc_restuls_and_params = {
                 "random_state_nr": params["random_state"], # for backcompatibility, 
                 "model_ID": model_ID,
                 "method": method_name,
                 "module": module_name,
                 **baseline_acc,
                 **model_acc,
                 **params
        }
        model_acc_and_parameters_list.append(acc_restuls_and_params) # in list, so it can be used as pd.df immediately, 


        # 2. save model history, 
        model_history_dict[model_ID] = {
            "model_history": pd.DataFrame(history.history),
            "acc_restuls_and_params":  acc_restuls_and_params}
        
        # 3. Model predictions, 
        """collect all model predictions also for test and valid datasets 
           to have nice comparisons on errors and problematic files"""
        if store_predictions==True:
            one_model_predictions = dict()
            for xyname in Xy_names:
                # make predictions and decode them,
                predictions               = model.predict_classes(X_dct[xyname])
                decoded_predictions       = pd.Series(predictions).map(class_decoding).values
                model_predictions_proba   = model.predict_proba(X_dct[xyname])
                decoded_y_labels          = pd.Series(y_dct[xyname]).map(class_decoding).values
                    # ...
                one_model_predictions[xyname] = {
                        "idx_in_batch":            idx_y_dct[xyname],
                        "original_labels":         decoded_y_labels, 
                        "model_predictions":       decoded_predictions, 
                        "model_predictions_proba": model_predictions_proba,
                        "acc_restuls_and_params":  acc_restuls_and_params,
                        "class_decoding":          class_decoding
                }# added in case there are later doubts about where a result came from,

            # and finally, add this to the big dict wiht all the results, 
            model_predictions_dict[model_ID] = one_model_predictions
            
        else:
            model_predictions_dict[model_ID] = None

            
            
        # PLOT THE RESULTS ,......................    
                    
        if verbose==True and plot_history==True:
        
            #.. figure, axes, 
            fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
            fig.suptitle(f"{params}")

            #.. Plot loss values
            ax1.plot(history.history['loss'], label='train loss')
            ax1.plot(history.history['val_loss'], label='val loss')
            ax1.set_title('Validation loss {:.3f} (mean last 3)'.format(
                np.mean(history.history['val_loss'][-3:]) # last three values
            ))
            ax1.set_xlabel('epoch')
            ax1.set_ylabel('loss value')
            ax1.grid(ls="--", color="grey")
            ax1.legend()

            #.. Plot accuracy values
            ax2.plot(history.history['acc'], label='train acc')
            ax2.plot(history.history['val_acc'], label='val acc')
            ax2.set_title('Validation accuracy {:.3f} (mean last 3)'.format(
                np.mean(history.history['val_acc'][-3:]) # last three values
            ))
            ax2.set_xlabel('epoch')
            ax2.set_ylabel('accuracy')
            ax2.set_ylim(0,1)
            ax2.grid(ls="--", color="grey")
            ax2.legend()
            plt.show()
        
        else:
            pass
  

    if track_progres==True:
        print(f"\nDONE _________________________________________ {pd.to_datetime('now')}",end="\n\n")
    else:
        pass

    # ..................................................
    return model_acc_and_parameters_list, model_predictions_dict, model_history_dict
def run_custom_cross(features,
                     labels,
                     headers,
                     single_run=True,
                     verbose=False):

    ############################
    #   FOREST   customization #
    # --> change from here <-- #
    ############################
    # Number of trees in the forest
    number_of_trees = 300
    # Number of features to train each tree
    max_number_of_features = 'sqrt'  # can be 'log2'
    # Class Weight
    # If not given, all classes are supposed to have weight one
    #
    # The “balanced” mode uses the values of y to automatically adjust weights
    # inversely proportional to class frequencies in the input data as
    # n_samples / (n_classes * np.bincount(y))
    #
    # The “balanced_subsample” mode is the same as “balanced” except that
    # weights are computed based on the bootstrap sample for every tree grown.
    class_weight = 'balanced'
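    # Worked example of the 'balanced' formula (hypothetical labels):
    # with y = [0, 0, 0, 1] -> n_samples = 4, n_classes = 2, np.bincount(y) = [3, 1],
    # so the weights are 4 / (2 * [3, 1]) = [0.67, 2.0]; each minority-class
    # sample counts roughly three times as much as a majority-class sample.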
    ############################
    # -->      to here     <-- #
    ############################

    ############################
    #   DUMMY    customization #
    # --> change from here <-- #
    ############################
    dummy_strategy = 'stratified'
    ############################
    # -->      to here     <-- #
    ############################

    ############################
    #  CROSS   customization   #
    # --> change from here <-- #
    ############################
    number_of_splits = 5
    ############################
    # -->      to here     <-- #
    ############################

    if single_run:
        print("Using features:")
        print(headers)

    # We create TWO estimators
    forest = RandomForestClassifier(n_estimators=number_of_trees,
                                    max_features=max_number_of_features,
                                    class_weight=class_weight)
    dummy = DummyClassifier(constant=None,
                            random_state=0,
                            strategy=dummy_strategy)
    # We create a crossvalidator
    cross_stratified_kfold = StratifiedKFold(n_splits=number_of_splits,
                                             shuffle=True,
                                             random_state=None)
    # Some list to store results
    forest_scores_list = []
    dummy_scores_list = []
    forest_cmatrix_list = []
    dummy_cmatrix_list = []
    forest_features_importance_list = []
    forest_features_std_list = []

    # Now we do the actual training and classification
    for train_index, test_index in cross_stratified_kfold.split(
            features, labels):
        # We define training and test subset, as conducted by the cross-validator
        train_features = features[train_index]
        train_labels = labels[train_index]
        test_features = features[test_index]
        test_labels = labels[test_index]
        # We fit the models
        forest = forest.fit(train_features, train_labels)
        dummy = dummy.fit(train_features, train_labels)
        # We use them to classify
        predicted_labels_forest = forest.predict(test_features)
        predicted_labels_dummy = dummy.predict(test_features)
        # We get the MCC scores (1 is perfect classification, 0 is random, -1 is inverse prediction)
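        # e.g. matthews_corrcoef([0, 1, 1, 0], [0, 1, 1, 0]) == 1.0,
        #      matthews_corrcoef([0, 1, 1, 0], [1, 0, 0, 1]) == -1.0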
        forest_score = matthews_corrcoef(test_labels,
                                         predicted_labels_forest,
                                         sample_weight=None)
        fpr, tpr, _ = roc_curve(test_labels, predicted_labels_forest)
        # print(roc_auc_score(test_labels, predicted_labels_forest))
        plt.plot(fpr, tpr)
        dummy_score = matthews_corrcoef(test_labels,
                                        predicted_labels_dummy,
                                        sample_weight=None)
        d_fpr, d_tpr, _ = roc_curve(test_labels, predicted_labels_dummy)
        # plt.plot(fpr, tpr)
        # plt.show()

        # We generate the Confusion Matrix
        # True negatives is C_{0,0}
        # False negatives is C_{1,0}
        # True positives is C_{1,1}
        # False positives is C_{0,1}
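        # e.g. confusion_matrix([0, 1, 1], [0, 0, 1]) == [[1, 0],
        #                                                 [1, 1]]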
        forest_matrix = confusion_matrix(test_labels,
                                         predicted_labels_forest,
                                         labels=None,
                                         sample_weight=None)
        dummy_matrix = confusion_matrix(test_labels,
                                        predicted_labels_dummy,
                                        labels=None,
                                        sample_weight=None)

        # We store everything in the appropriate lists
        forest_scores_list.append(forest_score)
        dummy_scores_list.append(dummy_score)
        forest_cmatrix_list.append(forest_matrix)
        dummy_cmatrix_list.append(dummy_matrix)
        forest_features_importance_list.append(forest.feature_importances_)

        forest_feature_std = np.std(
            [tree.feature_importances_ for tree in forest.estimators_], axis=0)
        forest_features_std_list.append(forest_feature_std)

        if verbose and single_run:
            # We print everything
            print("Forest score", forest_score)
            print("Dummy score", dummy_score)
            print("\nForest Matrix")
            print(forest_matrix)
            print("\nDummy Matrix")
            print(dummy_matrix)

    # Now we calculate summarizing scores:
    # best, worst, average and variance of MCC for both classifiers
    forest_score_array = np.array(forest_scores_list)
    forest_max_score = forest_score_array.max()
    forest_min_score = forest_score_array.min()
    forest_avg_score = np.mean(forest_score_array)
    forest_var_score = np.var(forest_score_array)

    dummy_score_array = np.array(dummy_scores_list)
    dummy_max_score = dummy_score_array.max()
    dummy_min_score = dummy_score_array.min()
    dummy_avg_score = np.mean(dummy_score_array)
    dummy_var_score = np.var(dummy_score_array)

    forest_scores_tuple = (forest_max_score, forest_min_score,
                           forest_avg_score, forest_var_score)

    if single_run:
        print('# Scores from cross-validation')
        print(('max', 'min', 'average', 'variance'))
        print('Forest: ')
        print(forest_scores_tuple)
        print('Dummy: ')
        print((dummy_max_score, dummy_min_score, dummy_avg_score,
               dummy_var_score))

        # We pick the best run and we extract the importance of features
        bestp_index = np.argmax(forest_score_array)
        bestp_features_imp = forest_features_importance_list[bestp_index]
        bestp_features_std = forest_features_std_list[bestp_index]
        # And we print the Feature Importance (with plot)
        print('The following data is from the best-performing forest')
        feature_importance_analysis(bestp_features_imp, bestp_features_std,
                                    features, headers, True)
        # And we also print the confusion matrix for that forest and that dummy
        print('Best Forest matrix:')
        print(forest_cmatrix_list[bestp_index])
        print(forest_scores_list[bestp_index])
        print('Corresponding Dummy matrix:')
        print(dummy_cmatrix_list[bestp_index])
        print(dummy_scores_list[bestp_index])

    return forest_scores_tuple