def test_classifier_chain_fit_and_predict_with_sparse_data():
    """Predictions from a chain fitted on CSR input must match a dense fit."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_csr = sp.csr_matrix(X)

    chain_sparse = ClassifierChain(LogisticRegression())
    chain_sparse.fit(X_csr, Y)
    pred_sparse = chain_sparse.predict(X_csr)

    chain_dense = ClassifierChain(LogisticRegression())
    chain_dense.fit(X, Y)
    pred_dense = chain_dense.predict(X)

    # Sparse and dense representations of the same data must agree.
    assert_array_equal(pred_sparse, pred_dense)
def train_model(model, df):
    """Train a multilabel model on the first 13 columns of ``df``.

    The targets are every column whose name starts with ``'notes'``.

    Parameters
    ----------
    model : str
        ``'multi'`` for a MultiOutputClassifier wrapping a random forest, or
        ``'chain'`` for a ClassifierChain wrapping a random forest fitted
        with a fixed label order.
    df : pandas.DataFrame
        Input frame; columns 0-12 are features.

    Returns
    -------
    tuple
        ``(fitted_estimator, list_of_target_column_names)``.

    Raises
    ------
    ValueError
        If ``model`` is neither ``'multi'`` nor ``'chain'`` (the original
        silently returned ``None`` in that case).
    """
    X = df.iloc[:, 0:13]
    y = df[list(filter(lambda x: str(x).startswith('notes'), df.columns))]

    if model == 'multi':
        multi = MultiOutputClassifier(RandomForestClassifier()).fit(X, y)
        return (multi, list(y.columns))
    elif model == 'chain':
        # Map each target column name to its positional index.
        columns = {value: index for index, value in enumerate(y.columns)}
        constant = [
            'notes_type_0', 'notes_lineIndex_0', 'notes_lineLayer_0',
            'notes_cutDirection_0', 'notes_type_1', 'notes_lineIndex_1',
            'notes_lineLayer_1', 'notes_cutDirection_1', 'notes_type_3',
            'notes_lineIndex_3', 'notes_lineLayer_3', 'notes_cutDirection_3'
        ]
        # Chain order expressed as column indices of the fixed list above.
        order = [columns[x] for x in constant]
        chain = ClassifierChain(RandomForestClassifier(),
                                order=order).fit(X, y)
        return (chain, constant)
    raise ValueError("model must be 'multi' or 'chain', got %r" % (model,))
Esempio n. 3
0
def test_randForest(df, truth, eval_type):
    """Grid-search a ClassifierChain of random forests via 10-fold CV.

    Every combination of the hyper-parameter grid is cross-validated with
    ``cross_val_score``; per-fold score arrays are collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix (``df.values`` is passed to the estimator).
    truth : array-like
        Multilabel ground truth.
    eval_type : str
        Scoring string understood by ``cross_val_score``.

    Returns
    -------
    tuple(list, list)
        Parallel lists of combination labels and per-fold score arrays.
    """
    param_randForest = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 50, 80, 100],
        # NOTE(review): 'auto' was an alias for 'sqrt' and was removed in
        # scikit-learn 1.3 -- confirm the installed version accepts it.
        'max_features': ['auto', 'sqrt', 'log2'],
        'criterion': ['gini', 'entropy']
    }
    combinations_randForest = it.product(*(param_randForest[Name]
                                           for Name in param_randForest))

    # Test the combinations for Random Forest with cross validation
    results_randForest = []
    keys = []
    for index, values in enumerate(combinations_randForest):
        key = "RF" + "-".join([str(item) for item in values])

        clf = RandomForestClassifier(n_estimators=values[0],
                                     max_depth=values[1],
                                     max_features=values[2],
                                     criterion=values[3])
        classifier = ClassifierChain(clf)
        # BUG FIX: random_state only has an effect when shuffle=True; recent
        # scikit-learn raises ValueError for random_state without shuffle.
        kfold = KFold(n_splits=10, shuffle=True, random_state=26)
        scores = cross_val_score(classifier,
                                 df.values,
                                 truth,
                                 cv=kfold,
                                 scoring=eval_type)
        keys.append(key)
        results_randForest.append(scores)

        msg = "%s: %f (%f)" % (key, scores.mean(), scores.std())
        print(msg)

    return keys, results_randForest
Esempio n. 4
0
def test_AdaBoost(df, truth, eval_type):
    """Grid-search a ClassifierChain of AdaBoost models via 10-fold CV.

    Returns
    -------
    tuple(list, list)
        Parallel lists of combination labels and per-fold score arrays.
    """
    param_adaBoost = {'n_estimators': [50, 100]}
    combinations_adaBoost = it.product(*(param_adaBoost[Name]
                                         for Name in param_adaBoost))

    # Test the combinations for AdaBoost with cross validation
    results_adaBoost = []
    keys = []
    for index, values in enumerate(combinations_adaBoost):
        key = "ADA" + "-".join([str(item) for item in values])

        clf = AdaBoostClassifier(n_estimators=values[0])
        classifier = ClassifierChain(clf)
        # BUG FIX: random_state only has an effect when shuffle=True; recent
        # scikit-learn raises ValueError for random_state without shuffle.
        kfold = KFold(n_splits=10, shuffle=True, random_state=26)
        scores = cross_val_score(classifier,
                                 df.values,
                                 truth,
                                 cv=kfold,
                                 scoring=eval_type)
        keys.append(key)
        results_adaBoost.append(scores)

        msg = "%s: %f (%f)" % (key, scores.mean(), scores.std())
        print(msg)

    return keys, results_adaBoost
Esempio n. 5
0
def train_and_pred(dictTrainMats, Trainlabel, dictTestMats, lian):
    """Stack predictions of ``lian`` classifier chains into a meta model.

    Each chain is fitted on one of eight feature matrices (cycled via
    ``i % 8``); its train/test predictions are concatenated column-wise and
    used to fit a second OneVsRest model, whose test predictions are pickled.
    """
    chain = OneVsRestClassifier(ExtraTreesClassifier(bootstrap=True,
                                                     n_estimators=120),
                                n_jobs=8)
    chains = [ClassifierChain(chain, order="random") for _ in range(lian)]
    model = OneVsRestClassifier(ExtraTreesClassifier(bootstrap=True,
                                                     n_estimators=200),
                                n_jobs=8)
    fea_train = np.array([])
    fea_test = np.array([])
    for idx in range(lian):
        # Cycle through the 8 available feature matrices.
        train_mat = dictTrainMats[idx % 8]
        test_mat = dictTestMats[idx % 8]
        clf = chains[idx]
        clf.fit(train_mat, Trainlabel)
        pred_test = clf.predict(test_mat)
        pred_train = clf.predict(train_mat)
        if idx == 0:
            fea_train, fea_test = pred_train, pred_test
        else:
            fea_train = np.hstack([fea_train, pred_train])
            fea_test = np.hstack([fea_test, pred_test])
    print(fea_train.shape, fea_test.shape)
    model.fit(fea_train, Trainlabel)
    y_pred = model.predict(fea_test)
    print(y_pred.shape)
    save_tmp(y_pred, "./data/mlamp_train_710Test.pickle")
Esempio n. 6
0
def test_svm(df, truth, eval_type):
    """Cross-validate a ClassifierChain of SVCs over a small parameter grid.

    Returns
    -------
    tuple(list, list)
        Parallel lists of combination labels and per-fold score arrays.
    """
    # Reduced grid; the full sweep also covered C in {1, 10, 1000},
    # kernel='rbf' and gamma='scale'.
    param_svm = {'C': [100], 'kernel': ['linear'], 'gamma': ['auto']}
    combinations_svm = it.product(*(param_svm[Name] for Name in param_svm))

    # Test the combinations for SVM with cross validation
    results_svm = []
    keys = []
    for index, values in enumerate(combinations_svm):
        key = "SVM" + "-".join([str(item) for item in values])

        clf = svm.SVC(C=values[0], kernel=values[1], gamma=values[2])
        classifier = ClassifierChain(clf)
        # BUG FIX: random_state only has an effect when shuffle=True; recent
        # scikit-learn raises ValueError for random_state without shuffle.
        kfold = KFold(n_splits=10, shuffle=True, random_state=26)
        scores = cross_val_score(classifier,
                                 df.values,
                                 truth,
                                 cv=kfold,
                                 scoring=eval_type)
        keys.append(key)
        results_svm.append(scores)

        msg = "%s: %f (%f)" % (key, scores.mean(), scores.std())
        print(msg)

    return keys, results_svm
Esempio n. 7
0
def train_baseline(ds_name, train_input, train_labels):
    """Fit two baselines for dataset ``ds_name``'s training split.

    A random-order ClassifierChain of logistic regressions (grid-searched
    over the ordering seed) and a grid-searched random forest.

    Returns
    -------
    tuple
        ``(("classifier_chain", best_chain), ("random_forest", best_forest))``.
    """
    chain_grid = {"random_state": list(np.arange(10))}
    base_lr = LogisticRegression(C=1,
                                 max_iter=500,
                                 fit_intercept=True,
                                 tol=1e-15,
                                 class_weight="balanced")
    gs_chain = GridSearchCV(ClassifierChain(base_lr, order="random"),
                            chain_grid,
                            cv=3,
                            scoring=scorer)
    gs_chain.fit(train_input, train_labels)

    print("best order according to grid search is %s" %
          gs_chain.best_estimator_.order_)
    print("best order according to grid search is %s" % gs_chain.best_score_)

    from sklearn.ensemble import RandomForestClassifier

    forest_grid = {
        "n_estimators": list(np.arange(1, 100, 10)),
        "max_depth": list(np.arange(1, 100, 10)),
    }
    gs_forest = GridSearchCV(RandomForestClassifier(random_state=1),
                             forest_grid,
                             cv=3,
                             scoring=scorer)
    gs_forest.fit(train_input, train_labels)

    return (
        ("classifier_chain", gs_chain.best_estimator_),
        ("random_forest", gs_forest.best_estimator_),
    )
Esempio n. 8
0
def calc_Fitness(train_d):
    """Score a logistic-regression classifier chain on TF-IDF features.

    Fits on ``train_d`` and evaluates on the module-level ``test``
    DataFrame, returning the mean of accuracy, (1 - hamming loss) and
    weighted precision.
    """
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 ngram_range=(1, 3),
                                 norm='l2')

    x_train = vectorizer.fit_transform(train_d.comment_text)
    y_train = train_d.drop(labels=['id', 'comment_text'], axis=1)
    # NOTE(review): ``test`` is a module-level DataFrame, not a parameter.
    x_test = vectorizer.transform(test.comment_text)
    y_test = test.drop(labels=['id', 'comment_text'], axis=1)

    from sklearn.multioutput import ClassifierChain
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, hamming_loss, precision_score

    # One logistic regression per label, chained.
    classifier = ClassifierChain(LogisticRegression())
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    # Average three complementary quality measures.
    acc = accuracy_score(y_test, predictions)
    ham = 1 - hamming_loss(y_test, predictions)
    prec = precision_score(y_test, predictions, average='weighted')
    return (acc + ham + prec) / 3
Esempio n. 9
0
def CommonFunction(est, Xtrain, Ytrain, Xtest, parameters):
    """Grid-search ``est`` wrapped in a ClassifierChain and predict ``Xtest``.

    Prints the grid size, runs a cross-validated grid search with a custom
    percentage-agreement scorer, reports the best result via ``printer``
    and returns the test-set decisions.
    """
    n_cv = 5  # Number of cross-validations
    multi_est = ClassifierChain(est)
    n_trainsamples, n_decisions = Ytrain.shape

    # Total size of the hyper-parameter grid.
    tparameters = 1
    for grid_values in parameters.values():
        tparameters *= len(grid_values)
    print('Total number of hyperparameter combinations to be tested is',
          str(tparameters))

    def ScoreFunction(decision, Y):  # Scoring function
        # Share of matching label decisions, scaled to a percentage and n_cv.
        return round(
            numpy.sum(decision == Y) / (n_trainsamples * n_decisions) * 100 *
            n_cv, 2)

    score = make_scorer(ScoreFunction, greater_is_better=True)
    multi_est_GS = dcv.GridSearchCV(multi_est,
                                    param_grid=parameters,
                                    scoring=score,
                                    cv=n_cv,
                                    n_jobs=-1).fit(Xtrain, Ytrain)
    decision = multi_est_GS.predict(Xtest)
    printer(multi_est_GS.best_score_, multi_est_GS.best_params_, tparameters)
    return decision
def build_model():
    """Build a grid-searched text-classification pipeline.

    TF-IDF features feed a stacking ensemble of five random-order
    RandomForest classifier chains whose outputs are combined by a
    multi-output AdaBoost meta learner.

    Returns
    -------
    GridSearchCV
        Unfitted model, ready for ``fit``.
    """
    print('=============================')
    print('Building Model:')
    print('-----------------------------')

    # Five chains over random label orders; their predictions become the
    # inputs of the meta classifier.
    print('Creating ClassifierChains...')
    chains = []
    for _ in range(5):
        chains.append(
            ClassifierChain(
                base_estimator=RandomForestClassifier(n_estimators=100),
                order='random',
                random_state=42))

    # The meta classifier learns how much weight each chain deserves.
    print('Adding Meta Classifier...')
    meta_clf = MultiOutputClassifier(AdaBoostClassifier())

    print('Stacking Meta Classifier on top of ClassifierChains...')
    sclf = StackingClassifier(classifiers=chains, meta_classifier=meta_clf)

    print('Building Pipeline...')
    text_pipeline = Pipeline([
        ('tfidf_vect', TfidfVectorizer(tokenizer=tokenize)),
    ])
    features = FeatureUnion([('text_pipeline', text_pipeline)])
    pipeline = Pipeline([('features', features), ('sclf', sclf)])

    parameters = {
        'features__text_pipeline__tfidf_vect__ngram_range': ((1, 2), (1, 10))
    }

    print('Initializing GridSearchCV...')
    return GridSearchCV(pipeline, param_grid=parameters, cv=5)
def test_classifier_chain_fit_and_predict_with_sparse_data_and_cv():
    """A cv-fitted chain accepts sparse input for both fit and predict."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_csr = sp.csr_matrix(X)
    chain = ClassifierChain(LogisticRegression(), cv=3)
    chain.fit(X_csr, Y)
    prediction = chain.predict(X_csr)
    assert_equal(prediction.shape, Y.shape)
def chain_classifiers(x, y):
    """Compare an independent OneVsRest model, ten random-order logistic
    regression classifier chains, and their ensemble, plotting the per-model
    Jaccard similarity scores.

    Returns the last fitted chain.
    """
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    base_lr = LogisticRegression()
    ovr = OneVsRestClassifier(base_lr)
    ovr.fit(x_train, y_train)
    y_pred_ovr = ovr.predict(x_test)

    from sklearn.metrics import jaccard_score
    ovr_jaccard_score = jaccard_score(y_test, y_pred_ovr, average='samples')

    from sklearn.multioutput import ClassifierChain
    # Ten chains, each with a different random label order.
    chains = [ClassifierChain(base_lr, order='random', random_state=seed)
              for seed in range(10)]
    for chain in chains:
        chain.fit(x_train, y_train)

    y_pred_chains = np.array([chain.predict(x_test) for chain in chains])
    chain_jaccard_scores = [
        jaccard_score(y_test, pred >= 0.5, average='samples')
        for pred in y_pred_chains
    ]

    # Ensemble = mean of the chains' predictions, thresholded at 0.5.
    y_pred_ensemble = y_pred_chains.mean(axis=0)
    ensemble_jaccard_score = jaccard_score(y_test,
                                           y_pred_ensemble >= 0.5,
                                           average='samples')

    model_scores = [ovr_jaccard_score]
    model_scores.extend(chain_jaccard_scores)
    model_scores.append(ensemble_jaccard_score)

    model_names = ('Independent', 'Chain 1', 'Chain 2', 'Chain 3', 'Chain 4',
                   'Chain 5', 'Chain 6', 'Chain 7', 'Chain 8', 'Chain 9',
                   'Chain 10', 'Ensemble')
    x_pos = np.arange(len(model_names))

    # Bar chart of all Jaccard scores (note: y axis does not start at 0).
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.grid(True)
    ax.set_title('Classifier Chain Ensemble Performance Comparison')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(model_names, rotation='vertical')
    ax.set_ylabel('Jaccard Similarity Score')
    ax.set_ylim([min(model_scores) * .9, max(model_scores) * 1.1])
    colors = ['r'] + ['b'] * len(chain_jaccard_scores) + ['g']
    ax.bar(x_pos, model_scores, alpha=0.5, color=colors)
    plt.tight_layout()
    plt.show()

    return chains[-1]
Esempio n. 13
0
def cc():
    """Fit ten random-order logistic-regression classifier chains on DNN
    features and report the mean per-chain log likelihood on the test split.

    Fixed from the scraped original: Python 2 ``print`` statements and an
    identifier censored to ``f**k`` (a syntax error) are replaced with
    valid Python 3.
    """
    print('reading npy...')

    data = np.load('../data/1st.npy')
    feature_data = np.load('dnn_feature.npy')

    train_order = np.load('../data/train.npy')
    validation_order = np.load('../data/validation.npy')
    test_order = np.load('../data/test.npy')

    train_nlcd = get_data.get_feature(feature_data, train_order)
    train_label = get_data.get_label(data, train_order)

    test_nlcd = get_data.get_feature(feature_data, test_order)
    test_label = get_data.get_label(data, test_order)

    print('chaining')
    # Fit an ensemble of logistic regression classifier chains and take the
    # average prediction of all the chains.
    chains = [
        ClassifierChain(LogisticRegression(), order='random', random_state=i)
        for i in range(10)
    ]

    # Counter replaces the original censored identifier; prints 1..10.
    for chain_index, chain in enumerate(chains):
        print(chain_index + 1)
        chain.fit(train_nlcd, train_label)

    print('testing')
    scores = []
    for chain in chains:
        pre = chain.predict_proba(test_nlcd)
        chain_score = log_likelihood(test_label, pre)
        print(chain_score)
        scores.append(chain_score)
    scores = np.array(scores)
    print('mean:')
    print(np.mean(scores))
Esempio n. 14
0
def test_classifier_chain_tuple_invalid_order():
    """Fitting must fail when the order references a nonexistent label."""
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    bad_order = (1, 2)  # label index 2 is out of range for two labels

    chain = ClassifierChain(RandomForestClassifier(), order=bad_order)

    with pytest.raises(ValueError, match='invalid order'):
        chain.fit(X, y)
def test_classifier_chain_crossval_fit_and_predict():
    """Chains fitted with and without cv=3 should both predict well, yet
    produce different predictions."""
    X, Y = generate_multilabel_dataset_with_correlations()

    chain_cv = ClassifierChain(LogisticRegression(), cv=3)
    chain_cv.fit(X, Y)

    chain_plain = ClassifierChain(LogisticRegression())
    chain_plain.fit(X, Y)

    pred_cv = chain_cv.predict(X)
    pred_plain = chain_plain.predict(X)

    assert_equal(pred_cv.shape, Y.shape)
    # The cv-fitted chain must still reach a reasonable Jaccard score...
    assert_greater(jaccard_similarity_score(Y, pred_cv), 0.4)
    # ...while differing from the chain fitted without cross-validation.
    assert_not_equal(jaccard_similarity_score(Y, pred_cv),
                     jaccard_similarity_score(Y, pred_plain))
Esempio n. 16
0
 def _set_estimators_reset_fitted(self):
     """Create ``self.k_`` fresh random-order chains around clones of the
     base estimator, reseed them, and mark the ensemble as unfitted."""
     estimators = []
     for _ in range(self.k_):
         estimators.append(
             ClassifierChain(clone(self.base_estimator),
                             order="random",
                             cv=self.cv,
                             random_state=None))
     self.estimators_ = estimators
     self._set_random_state_of_estimators()
     self.fitted_ = False
def test_base_chain_fit_and_predict_with_sparse_data_and_cv():
    """Both chain flavours accept sparse input when fitted with cv=3."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_csr = sp.csr_matrix(X)
    for chain in (ClassifierChain(LogisticRegression(), cv=3),
                  RegressorChain(Ridge(), cv=3)):
        chain.fit(X_csr, Y)
        prediction = chain.predict(X_csr)
        assert_equal(prediction.shape, Y.shape)
Esempio n. 18
0
def test_naiveBayes(df, truth, eval_type):
    """Cross-validate a multinomial naive Bayes ClassifierChain.

    Returns
    -------
    tuple(list, list)
        ``(["NB"], [scores])`` to match the other grid-test helpers.
    """
    clf = MultinomialNB()
    classifier = ClassifierChain(clf)
    # BUG FIX: random_state only has an effect when shuffle=True; recent
    # scikit-learn raises ValueError for random_state without shuffle.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    scores = cross_val_score(classifier,
                             df.values,
                             truth,
                             cv=kfold,
                             scoring=eval_type)

    return ["NB"], [scores]
Esempio n. 19
0
 def test_chainclassifier(implementation):
     """Round-trip a fitted ClassifierChain(LinearSVC) through the given
     persistence implementation and check predictions survive save/load."""
     name = "test_ls_cc"
     features, labels = make_multilabel_classification()
     x_train, x_test, y_train, y_test = train_test_split(features, labels)
     fitted = ClassifierChain(LinearSVC())
     fitted.fit(x_train, y_train)
     implementation.save(fitted, name)
     restored = implementation.load(name)
     expected = fitted.predict(x_test)
     got = restored.predict(x_test)
     assert_array_equal(got, expected)
Esempio n. 20
0
def test_classifier_chain_tuple_order(order_type):
    """A chain accepts an explicit label order of the given container type."""
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]

    chain = ClassifierChain(RandomForestClassifier(),
                            order=order_type([1, 0]))
    chain.fit(X, y)

    # A point seen during training must be reproduced exactly.
    assert_array_almost_equal(chain.predict([[1.5, 2.5, 3.5]]), [[3, 2]])
Esempio n. 21
0
def test_best_AdaBoost(df, truth, eval_type):
    """Cross-validate the best AdaBoost ClassifierChain configuration.

    Returns
    -------
    numpy.ndarray
        Per-fold scores from 10-fold cross-validation.
    """
    clf = AdaBoostClassifier(n_estimators=50)
    classifier = ClassifierChain(clf)
    # BUG FIX: random_state only has an effect when shuffle=True; recent
    # scikit-learn raises ValueError for random_state without shuffle.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    print("Start crossvalidation...")
    scores = cross_val_score(classifier,
                             df.values,
                             truth,
                             cv=kfold,
                             scoring=eval_type)
    print(f"Crossvalidation done. Mean: {np.mean(scores)}")
    return scores
def run(classifier, train_test_set):
    """Fit a random-order ClassifierChain around ``classifier``.

    Returns ``(y_test, y_pred)`` for the supplied train/test split tuple.
    """
    X_train, X_test, y_train, y_test = train_test_set

    chain = ClassifierChain(classifier, order='random', random_state=0)
    chain.fit(X_train, y_train)
    y_pred = chain.predict(X_test)

    print('\n--------Classifier chains with {:}'.format(classifier))
    return y_test, y_pred
Esempio n. 23
0
    def fit(self, train_x, train_y):
        """Fit ``self._no_of_estimators`` decision-tree classifier chains,
        each with an independently shuffled label order.

        Parameters
        ----------
        train_x : array-like
            Training features.
        train_y : array-like of shape (n_samples, n_labels)
            Multilabel targets.

        Returns
        -------
        self
        """
        self._estimators = []
        self._feature_number = train_y.shape[1]
        for _ in range(self._no_of_estimators):
            # BUG FIX: the original drew one random order for the print and a
            # *different* one for the estimator, so the logged order was wrong.
            order = random.sample(range(0, self._feature_number),
                                  self._feature_number)
            print(order)
            estimator = ClassifierChain(DecisionTreeClassifier(), order=order)
            estimator.fit(train_x, train_y)

            self._estimators.append(estimator)

        return self
def chaining_svm(X, Y, max_iter=-1):
    """Sweep the SVM ``C`` parameter on a log scale with a ClassifierChain.

    For each ``C`` an RBF-kernel SVC chain (cv=2, random order) is fitted on
    an 80/20 split; accuracy/F1/recall/precision are printed and collected,
    pickled, and finally plotted against ``C``.

    Parameters
    ----------
    X, Y : array-like
        Features and multilabel targets.
    max_iter : int, optional
        Iteration cap forwarded to ``SVC`` (-1 = unlimited).
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        random_state=0)

    Cs = np.logspace(-2, 10, 30)
    res = []
    print(f'Trying Cs: {Cs}')
    # BUG FIX: the header listed precision before recall but the values were
    # printed recall-first; the header now matches the printed order.
    print('C \t accuracy \t f1 \t recall \t precision')
    for C in Cs:
        base_clf = SVC(C=C, kernel='rbf', max_iter=max_iter)

        chain = ClassifierChain(base_clf, cv=2, order='random', random_state=0)
        chain.fit(X_train, Y_train)
        y_pred = chain.predict(X_test)
        # Compute each metric once (the original evaluated every metric
        # twice: once for the result list, once again for the print).
        accuracy = get_accuracy(Y_test, y_pred)
        f1_val = get_f1(Y_test, y_pred)
        recall_val = get_recall(Y_test, y_pred)
        precision_val = get_precision(Y_test, y_pred)
        res.append([[accuracy, f1_val, recall_val, precision_val], C])
        print(f'{C}\t{accuracy}\t{f1_val}\t{recall_val}\t{precision_val}')

    store_data_as_pickle(res, f'svm-chain-logscale-values')

    # Split the collected metrics into (value, C) pairs for plotting.
    acc = np.asarray([[a[0][0], a[1]] for a in res])
    f1 = np.asarray([[a[0][1], a[1]] for a in res])
    recall = np.asarray([[a[0][2], a[1]] for a in res])
    precision = np.asarray([[a[0][3], a[1]] for a in res])

    print("Max acc without question at default_dist: ",
          acc[np.argmax(acc[:, 0]), 1], " ", np.max(acc[:, 0]))
    print("Max f1 without question at default_dist: ",
          f1[np.argmax(f1[:, 0]), 1], " ", np.max(f1[:, 0]))
    print("Max recall without question at default_dist: ",
          recall[np.argmax(recall[:, 0]), 1], " ", np.max(recall[:, 0]))
    print("Max precision without question at default_dist: ",
          precision[np.argmax(precision[:, 0]), 1], " ",
          np.max(precision[:, 0]))
    plt.plot(acc[:, 1], acc[:, 0], label='Accuracy')
    plt.plot(f1[:, 1], f1[:, 0], label='F1-Score')
    plt.plot(recall[:, 1], recall[:, 0], label='Recall')
    plt.plot(precision[:, 1], precision[:, 0], label='Precision')
    plt.legend()
    plt.xscale('log')
    plt.xlabel("C regularization parameter")
    plt.title("SVM with ClassifierChain 10 folds")
    plt.show()
def test_classifier_chain_random_order():
    """A randomly-ordered chain must behave like a fixed chain with the
    same (fitted) order."""
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    # BUG FIX: the original inspected ``.order`` (the constructor parameter,
    # i.e. the string 'random') instead of ``.order_``, the fitted
    # permutation, so the first assertion was vacuous.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
def chaining_adaboost(X, Y):
    """Evaluate a random-order ClassifierChain of SAMME AdaBoost models on a
    held-out 20% split, printing accuracy/F1/recall/precision."""
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        random_state=0)

    booster = AdaBoostClassifier(algorithm="SAMME", n_estimators=200)
    chain = ClassifierChain(booster, cv=2, order='random', random_state=0)
    chain.fit(X_train, Y_train)
    y_pred = chain.predict(X_test)
    print(
        f'{get_accuracy(Y_test, y_pred)}\t{get_f1(Y_test, y_pred)}\t{get_recall(Y_test, y_pred)}\t{get_precision(Y_test, y_pred)}'
    )
Esempio n. 27
0
def XGBoostChain(X_train, y_train, X_test):
    """Fit ten random-order classifier chains around an XGBoost classifier
    and print the averaged predicted probabilities for ``X_test``."""
    print("fitting the data")
    booster = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

    # One chain per random seed so each sees a different label order.
    chains = [ClassifierChain(booster, order='random', random_state=seed)
              for seed in range(10)]
    for chain in chains:
        chain.fit(X_train, y_train)

    all_probs = np.array([chain.predict_proba(X_test) for chain in chains])
    ensemble_probs = all_probs.mean(axis=0)
    print(ensemble_probs)
def test_classifier_chain_fit_and_predict_with_linear_svc():
    """With LinearSVC the chain predicts via decision_function and must not
    expose predict_proba."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LinearSVC())
    chain.fit(X, Y)

    Y_pred = chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    # Thresholding the decision function at zero reproduces predict().
    Y_decision = chain.decision_function(X)
    assert_array_equal(Y_decision >= 0, Y_pred)
    assert not hasattr(chain, 'predict_proba')
Esempio n. 29
0
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    """Chain predictions equal thresholded probabilities, and each estimator
    sees one extra feature per preceding label in the chain."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LogisticRegression())
    chain.fit(X, Y)

    Y_pred = chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    # Probabilities thresholded at 0.5 must reproduce predict().
    Y_prob = chain.predict_proba(X)
    assert_array_equal(Y_prob >= .5, Y_pred)

    # Estimator k is trained on X plus the k previously-predicted labels.
    assert_equal([c.coef_.size for c in chain.estimators_],
                 list(range(X.shape[1], X.shape[1] + Y.shape[1])))
Esempio n. 30
0
def test_best_rf(df, truth, eval_type):
    """Cross-validate the best random-forest ClassifierChain configuration.

    Returns
    -------
    numpy.ndarray
        Per-fold scores from 10-fold cross-validation.
    """
    # max_features='sqrt' is identical to the old 'auto' for classifiers and
    # survives the removal of 'auto' in scikit-learn 1.3.
    clf = RandomForestClassifier(n_estimators=200,
                                 max_depth=50,
                                 max_features='sqrt',
                                 criterion='entropy')
    classifier = ClassifierChain(clf)
    # BUG FIX: random_state only has an effect when shuffle=True; recent
    # scikit-learn raises ValueError for random_state without shuffle.
    kfold = KFold(n_splits=10, shuffle=True, random_state=26)
    print("Start crossvalidation...")
    scores = cross_val_score(classifier,
                             df.values,
                             truth,
                             cv=kfold,
                             scoring=eval_type)
    print(f"Crossvalidation done. Mean: {np.mean(scores)}")
    return scores