def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    # NOTE(review): fetch_mldata was removed in scikit-learn 0.22; on
    # modern versions this needs fetch_openml('yeast') -- confirm the
    # pinned scikit-learn version before relying on this test.
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    # Fixed 2000-sample train / remainder test split.
    X_train = X[:2000, :]
    X_test = X[2000:, :]
    Y_train = Y[:2000, :]
    Y_test = Y[2000:, :]

    # Baseline: N independent one-vs-rest models, one per label.
    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    # Chain with a fixed interleaved label order for the 14 yeast labels.
    chain = ClassifierChain(LogisticRegression(),
                            order=np.array([0, 2, 4, 6, 8, 10,
                                            12, 1, 3, 5, 7, 9,
                                            11, 13]))
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    # The chain exploits label correlations, so it should score higher.
    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
Esempio n. 2
0
def calc_Fitness(train_d):
    """Score a LogisticRegression classifier chain on the comment data.

    Vectorizes `train_d.comment_text` with word 1-3-gram TF-IDF, trains a
    classifier chain, and returns the mean of accuracy, (1 - Hamming loss)
    and weighted precision on the module-level `test` DataFrame.
    """
    from sklearn.multioutput import ClassifierChain
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, hamming_loss, precision_score

    tfidf = TfidfVectorizer(strip_accents='unicode',
                            analyzer='word',
                            ngram_range=(1, 3),
                            norm='l2')

    x_train = tfidf.fit_transform(train_d.comment_text)
    y_train = train_d.drop(labels=['id', 'comment_text'], axis=1)
    # NOTE: `test` is a module-level DataFrame, not a parameter.
    x_test = tfidf.transform(test.comment_text)
    y_test = test.drop(labels=['id', 'comment_text'], axis=1)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(x_train, y_train)
    predictions = chain.predict(x_test)

    # Average three complementary multi-label metrics into one fitness value.
    return (accuracy_score(y_test, predictions) +
            (1 - hamming_loss(y_test, predictions)) +
            precision_score(y_test, predictions, average='weighted')) / 3
def test_classifier_chain_fit_and_predict_with_sparse_data_and_cv():
    """A chain trained with cv=3 accepts sparse input and keeps Y's shape."""
    X, Y = generate_multilabel_dataset_with_correlations()
    sparse_X = sp.csr_matrix(X)
    chain = ClassifierChain(LogisticRegression(), cv=3)
    chain.fit(sparse_X, Y)
    predictions = chain.predict(sparse_X)
    assert_equal(predictions.shape, Y.shape)
def test_classifier_chain_fit_and_predict_with_sparse_data_and_cv():
    # Fit classifier chain with sparse data cross_val_predict
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)
    classifier_chain = ClassifierChain(LogisticRegression(), cv=3)
    classifier_chain.fit(X_sparse, Y)
    Y_pred = classifier_chain.predict(X_sparse)
    # Predictions keep the (n_samples, n_labels) shape of the targets.
    assert_equal(Y_pred.shape, Y.shape)
Esempio n. 5
0
def test_classifier_chain_tuple_invalid_order():
    """An order that is not a permutation of the label indices is rejected."""
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    # (1, 2) is invalid for two labels: indices must be a permutation of 0..1.
    bad_order = (1, 2)

    chain = ClassifierChain(RandomForestClassifier(), order=bad_order)
    with pytest.raises(ValueError, match='invalid order'):
        chain.fit(X, y)
Esempio n. 6
0
def test_classifier_chain_tuple_order(order_type):
    # The chain's `order` argument may be any sequence type; `order_type`
    # is presumably supplied by pytest parametrization (list/tuple/array).
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    order = order_type([1, 0])

    chain = ClassifierChain(RandomForestClassifier(), order=order)

    chain.fit(X, y)
    # The third training row reappears as the test point, so a fitted
    # forest should reproduce its labels exactly.
    X_test = [[1.5, 2.5, 3.5]]
    y_test = [[3, 2]]
    assert_array_almost_equal(chain.predict(X_test), y_test)
Esempio n. 7
0
class Multi_classes_classifier_on_column(BaseEstimator):
    """Multi-label classifier chain trained on the text of one DataFrame column.

    The text column is cleaned and vectorized with helpers from `prepro`,
    then fed to a ClassifierChain wrapping `base_classifier`.
    """

    def __init__(self, base_classifier, column):
        # `column` may be a positional index; fit() resolves it to a name.
        self.column = column
        self.classifier = ClassifierChain(base_classifier)
        self.vectorizer = None

    def _get_vectors(self, X):
        """Clean the configured text column of X and return dense features."""
        text_data = X[self.column]
        text_data = [prepro.clean_text(text)
                     for text in text_data]  # text cleaning
        feature_vector = self.vectorizer.transform(text_data).toarray()
        return feature_vector

    def fit(self, X, y):
        """Fit the vectorizer (once) and the underlying chain; return self."""
        # isinstance() instead of the original `type(...) == type(int(1))`.
        if isinstance(self.column, int):
            self.column = list(X.columns)[self.column]
        # `is None` instead of the original `type(...) == type(None)`.
        if self.vectorizer is None:
            self.vectorizer = prepro.get_text_vectorizer(X, self.column)

        feature_vector = self._get_vectors(X)
        self.classifier.fit(feature_vector, y)
        return self

    def predict(self, X):
        """Predict the multi-label matrix for the text column of X."""
        feature_vector = self._get_vectors(X)
        result = self.classifier.predict(feature_vector)
        return result

    def predict_proba(self, X: pd.DataFrame):
        """Per-label probability estimates for the text column of X."""
        feature_vector = self._get_vectors(X)
        result = self.classifier.predict_proba(feature_vector)
        return result

    def partial_fit(self, X, y):
        """Delegate incremental fitting to the wrapped chain."""
        feature_vector = self._get_vectors(X)

        result = self.classifier.partial_fit(feature_vector, y)
        return result

    def score(self, X, y):
        """Mean accuracy of the wrapped chain on (X, y)."""
        feature_vector = self._get_vectors(X)
        result = self.classifier.score(feature_vector, y)
        return result

    def set_params(self, **params):
        self.classifier.set_params(**params)
        return self

    def get_params(self, deep=True):
        # Default added: scikit-learn (e.g. clone()) calls get_params()
        # without arguments; the original signature raised a TypeError.
        result = self.classifier.get_params(deep)
        return result

    def set_vectorizer(self, vectorizer):
        """Inject a pre-fitted vectorizer (skips fitting one in fit())."""
        self.vectorizer = vectorizer
Esempio n. 8
0
 def test_chainclassifier(implementation):
     # Round-trip a fitted ClassifierChain(LinearSVC()) through the given
     # persistence `implementation` and check predictions survive save/load.
     name = "test_ls_cc"
     x, y = make_multilabel_classification()
     x_train, x_test, y_train, y_test = train_test_split(x, y)
     valid_cc = ClassifierChain(LinearSVC())
     valid_cc.fit(x_train, y_train)
     implementation.save(valid_cc, name)
     test_cc = implementation.load(name)
     # Loaded model must predict exactly like the in-memory one.
     expected = valid_cc.predict(x_test)
     got = test_cc.predict(x_test)
     assert_array_equal(got, expected)
def run(classifier, train_test_set):
    """Fit a randomly ordered classifier chain and return (y_test, y_pred).

    `train_test_set` is the 4-tuple produced by train_test_split.
    """
    X_train, X_test, y_train, y_test = train_test_set

    # Random label order, fixed seed for reproducibility.
    model = ClassifierChain(classifier, order='random', random_state=0)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print('\n--------Classifier chains with {:}'.format(classifier))

    return y_test, y_pred
Esempio n. 10
0
    def fit(self, train_x, train_y):
        """Fit an ensemble of classifier chains with random label orders.

        Trains `self._no_of_estimators` DecisionTree-based chains, each
        with an independently shuffled label order, and returns self.
        """
        self._estimators = []
        self._feature_number = train_y.shape[1]
        for i in range(self._no_of_estimators):
            X, y = train_x, train_y
            # BUG FIX: the original called random.sample() twice -- once for
            # the print and once for the estimator -- so the logged order was
            # NOT the order the chain actually used.  Sample once and reuse.
            order = random.sample(range(0, self._feature_number),
                                  self._feature_number)
            print(order)
            estimator = ClassifierChain(DecisionTreeClassifier(), order=order)
            estimator.fit(X, y)

            self._estimators.append(estimator)

        return self
def chaining_svm(X, Y, max_iter=-1):
    """Sweep the SVC C parameter for a classifier chain and plot the metrics.

    For 30 log-spaced C values, fits an RBF-SVC classifier chain (cv=2,
    random order) on an 80/20 split, records accuracy/F1/recall/precision,
    pickles the raw results and plots each metric versus C on a log axis.
    `max_iter=-1` means no iteration limit for the SVC solver.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        random_state=0)

    # 30 candidate C values from 1e-2 to 1e10.
    Cs = np.logspace(-2, 10, 30)
    res = []
    print(f'Trying Cs: {Cs}')
    print('C \t accuracy \t f1 \t precision \t recall')
    for C in Cs:
        base_clf = SVC(C=C, kernel='rbf', max_iter=max_iter)

        chain = ClassifierChain(base_clf, cv=2, order='random', random_state=0)
        chain.fit(X_train, Y_train)
        y_pred = chain.predict(X_test)
        # Each entry is ([acc, f1, recall, precision], C).
        res.append([[
            get_accuracy(Y_test, y_pred),
            get_f1(Y_test, y_pred),
            get_recall(Y_test, y_pred),
            get_precision(Y_test, y_pred)
        ], C])
        print(
            f'{C}\t{get_accuracy(Y_test, y_pred)}\t{get_f1(Y_test, y_pred)}\t{get_recall(Y_test, y_pred)}\t{get_precision(Y_test, y_pred)}'
        )

    store_data_as_pickle(res, f'svm-chain-logscale-values')

    # Reshape into per-metric arrays of (metric_value, C) pairs.
    acc = np.asarray([[a[0][0], a[1]] for a in res])
    f1 = np.asarray([[a[0][1], a[1]] for a in res])
    recall = np.asarray([[a[0][2], a[1]] for a in res])
    precision = np.asarray([[a[0][3], a[1]] for a in res])

    # Report the best C per metric.
    print("Max acc without question at default_dist: ",
          acc[np.argmax(acc[:, 0]), 1], " ", np.max(acc[:, 0]))
    print("Max f1 without question at default_dist: ", f1[np.argmax(f1[:, 0]),
                                                          1], " ",
          np.max(f1[:, 0]))
    print("Max recall without question at default_dist: ",
          recall[np.argmax(recall[:, 0]), 1], " ", np.max(recall[:, 0]))
    print("Max precision without question at default_dist: ",
          precision[np.argmax(precision[:, 0]), 1], " ", np.max(precision[:,
                                                                          0]))
    plt.plot(acc[:, 1], acc[:, 0], label='Accuracy')
    plt.plot(f1[:, 1], f1[:, 0], label='F1-Score')
    plt.plot(recall[:, 1], recall[:, 0], label='Recall')
    plt.plot(precision[:, 1], precision[:, 0], label='Precision')
    plt.legend()
    plt.xscale('log')
    plt.xlabel("C regularization parameter")
    plt.title("SVM with ClassifierChain 10 folds")
    plt.show()
def chaining_adaboost(X, Y):
    # Train one randomly ordered classifier chain of AdaBoost (SAMME,
    # 200 estimators) on an 80/20 split and print tab-separated
    # accuracy / F1 / recall / precision.
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        random_state=0)

    base_clf = AdaBoostClassifier(algorithm="SAMME", n_estimators=200)
    chain = ClassifierChain(base_clf, cv=2, order='random', random_state=0)
    chain.fit(X_train, Y_train)
    y_pred = chain.predict(X_test)
    print(
        f'{get_accuracy(Y_test, y_pred)}\t{get_f1(Y_test, y_pred)}\t{get_recall(Y_test, y_pred)}\t{get_precision(Y_test, y_pred)}'
    )
Esempio n. 13
0
def test_classifier_chain_fit_and_predict_with_linear_svc():
    """Chain of LinearSVC estimators: predict() matches decision_function."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LinearSVC())
    chain.fit(X, Y)

    predictions = chain.predict(X)
    assert_equal(predictions.shape, Y.shape)

    # Thresholding the decision function at zero reproduces predict().
    decision = chain.decision_function(X)
    assert_array_equal(decision >= 0, predictions)

    # LinearSVC has no predict_proba, so neither should the chain.
    assert not hasattr(chain, 'predict_proba')
Esempio n. 14
0
def test_classifier_chain_fit_and_predict_with_sparse_data():
    """Sparse and dense inputs must yield identical chain predictions."""
    X, Y = generate_multilabel_dataset_with_correlations()
    sparse_X = sp.csr_matrix(X)

    sparse_chain = ClassifierChain(LogisticRegression())
    sparse_chain.fit(sparse_X, Y)
    pred_sparse = sparse_chain.predict(sparse_X)

    dense_chain = ClassifierChain(LogisticRegression())
    dense_chain.fit(X, Y)
    pred_dense = dense_chain.predict(X)

    assert_array_equal(pred_sparse, pred_dense)
def test_classifier_chain_fit_and_predict_with_linear_svc():
    # Fit classifier chain and verify predict performance using LinearSVC
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain = ClassifierChain(LinearSVC())
    classifier_chain.fit(X, Y)

    Y_pred = classifier_chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    Y_decision = classifier_chain.decision_function(X)

    # Thresholding the decision function at zero must reproduce predict().
    Y_binary = (Y_decision >= 0)
    assert_array_equal(Y_binary, Y_pred)
    # LinearSVC exposes no predict_proba, so the chain must not either.
    assert not hasattr(classifier_chain, 'predict_proba')
def test_classifier_chain_fit_and_predict_with_sparse_data():
    # Fit classifier chain with sparse data
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)

    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X_sparse, Y)
    Y_pred_sparse = classifier_chain.predict(X_sparse)

    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)
    Y_pred_dense = classifier_chain.predict(X)

    # Sparse and dense inputs must yield identical predictions.
    assert_array_equal(Y_pred_sparse, Y_pred_dense)
Esempio n. 17
0
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    # Fit classifier chain and verify predict performance
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)

    Y_pred = classifier_chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    # Thresholding probabilities at 0.5 must reproduce predict().
    Y_prob = classifier_chain.predict_proba(X)
    Y_binary = (Y_prob >= .5)
    assert_array_equal(Y_binary, Y_pred)

    # Each link in the chain sees the original features plus the predictions
    # of all previous labels, so coef_ grows by one weight per link.
    assert_equal([c.coef_.size for c in classifier_chain.estimators_],
                 list(range(X.shape[1], X.shape[1] + Y.shape[1])))
Esempio n. 18
0
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    # Fit classifier chain and verify predict performance
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)

    Y_pred = classifier_chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    Y_prob = classifier_chain.predict_proba(X)
    Y_binary = (Y_prob >= .5)
    assert_array_equal(Y_binary, Y_pred)

    # Link i of the chain is trained on X plus i previous label predictions,
    # hence the arithmetic progression of coefficient sizes.
    assert_equal([c.coef_.size for c in classifier_chain.estimators_],
                 list(range(X.shape[1], X.shape[1] + Y.shape[1])))
def multi_label(x_train, y_train, name="MP"):
    # Train a LinearSVC classifier chain for a multi-label problem and wrap
    # the fitted artifacts (binarizer, model, vectorizer) in a SKLearnProblem.
    logger.info(f"Multi label problem: [{name}] - {len(x_train)}... ")
    le = MultiLabelBinarizer(sparse_output=True)
    vct = get_new_vectorizer()

    logger.info(f"[{name}] Vectorizing inputs...")
    x_train = vct.fit_transform(x_train)
    logger.info(f"[{name}] Vectorizing outputs...")
    y_train = le.fit_transform(y_train)

    logger.info(f"[{name}] Data shapes:")
    logger.info(f"[{name}] x_train: {x_train.shape}")
    logger.info(f"[{name}] y_train: {y_train.shape}")

    model = ClassifierChain(LinearSVC(random_state=0))
    # todense(): the binarizer emitted a sparse label matrix, but the chain
    # needs a dense Y here.
    model.fit(x_train, y_train.todense())

    return SKLearnProblem(name, le, model, vct)
def test_classifier_chain_crossval_fit_and_predict():
    """cv-trained chain predicts well but differs from the plainly fit one."""
    X, Y = generate_multilabel_dataset_with_correlations()

    chain_cv = ClassifierChain(LogisticRegression(), cv=3)
    chain_cv.fit(X, Y)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X, Y)

    pred_cv = chain_cv.predict(X)
    pred = chain.predict(X)

    assert_equal(pred_cv.shape, Y.shape)
    # The cv chain should still score reasonably on the training data.
    assert_greater(jaccard_similarity_score(Y, pred_cv), 0.4)

    # Using cross_val_predict for the chain features changes the model,
    # so the two scores must not coincide.
    assert_not_equal(jaccard_similarity_score(Y, pred_cv),
                     jaccard_similarity_score(Y, pred))
Esempio n. 21
0
def test_classifier_chain_crossval_fit_and_predict():
    # Fit classifier chain with cross_val_predict and verify predict
    # performance
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_cv = ClassifierChain(LogisticRegression(), cv=3)
    classifier_chain_cv.fit(X, Y)

    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)

    Y_pred_cv = classifier_chain_cv.predict(X)
    Y_pred = classifier_chain.predict(X)

    assert_equal(Y_pred_cv.shape, Y.shape)
    assert_greater(jaccard_similarity_score(Y, Y_pred_cv), 0.4)

    # cross_val_predict-based chain features yield a different model, so
    # the two scores must differ.
    assert_not_equal(jaccard_similarity_score(Y, Y_pred_cv),
                     jaccard_similarity_score(Y, Y_pred))
Esempio n. 22
0
def test_classifier_chain_vs_independent_models():
    """A classifier chain should beat N independent one-vs-rest models."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_train, X_test = X[:600, :], X[600:, :]
    Y_train, Y_test = Y[:600, :], Y[600:, :]

    # Baseline: one independent logistic regression per label.
    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    ovr_pred = ovr.predict(X_test)

    # Chain: each link also sees the previous labels' predictions.
    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    chain_pred = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, chain_pred),
                   jaccard_similarity_score(Y_test, ovr_pred))
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    X, Y = generate_multilabel_dataset_with_correlations()
    X_train = X[:600, :]
    X_test = X[600:, :]
    Y_train = Y[:600, :]
    Y_test = Y[600:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    # Uses the modern jaccard_score API (per-sample averaging), unlike the
    # deprecated jaccard_similarity_score used by the sibling copy.
    assert_greater(jaccard_score(Y_test, Y_pred_chain, average='samples'),
                   jaccard_score(Y_test, Y_pred_ovr, average='samples'))
Esempio n. 24
0
def test_classifier_chain_random_order():
    """A random-order chain matches a fixed-order chain given the same order."""
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    # BUG FIX: the original asserted on `.order` -- the constructor argument,
    # which here is the string 'random' -- so the inequality passed trivially.
    # The fitted permutation lives in the `order_` attribute.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
def test_classifier_chain_random_order():
    """A random-order chain matches a fixed-order chain given the same order."""
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    # BUG FIX: `.order` is the constructor argument (the string 'random'),
    # so the original first assertion was vacuous; compare the fitted
    # permutation `order_` instead.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
Esempio n. 26
0
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    # NOTE(review): fetch_mldata was removed in scikit-learn 0.22; this
    # needs fetch_openml('yeast') on modern versions -- confirm the pinned
    # scikit-learn version before relying on this test.
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    X_train = X[:2000, :]
    X_test = X[2000:, :]
    Y_train = Y[:2000, :]
    Y_test = Y[2000:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    # Fixed interleaved order over the 14 yeast labels.
    chain = ClassifierChain(
        LogisticRegression(),
        order=np.array([0, 2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11, 13]))
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
Esempio n. 27
0
def naive_base(params):
    """Active-learning loop over building sensor metadata.

    params is a 5-tuple:
        building_list, n_list, target_building, inc_num, iter_num.
    Each iteration trains a RandomForest classifier chain on TF-IDF features
    of tokenized sensor names, evaluates on the target building's held-out
    srcids, then picks the `inc_num` highest-entropy unlabeled samples
    (one per cluster) to add to the training set for the next iteration.
    Returns (accuracy_list, macro_f1_list), one entry per iteration (in %).
    """
    building_list = params[0]
    n_list = params[1]
    target_building = params[2]
    inc_num = params[3]
    iter_num = params[4]
    accuracy_list = list()
    micro_f1_list = list()
    macro_f1_list = list()
    for iter_i in range(0, iter_num):
        sentence_dict = dict()
        truth_dict = dict()
        if iter_i == 0:
            learning_srcids = list()
        for building, n in zip(building_list, n_list):
            # The target building's sample budget grows each iteration.
            if building == target_building:
                n += iter_i * inc_num
            # 'ghc' has pre-tokenized sentences on disk; others are parsed.
            if building != 'ghc':
                (sensorDF, srcid_list, name_list, jciname_list, desc_list,
                 unit_list, bacnettype_list) = toker.parse_sentences(building)
                for srcid, name, jciname, desc in \
                        zip(srcid_list, name_list, jciname_list, desc_list):
                    sentence_dict[srcid] = list(
                        map(replacer, name + jciname + desc))
            else:
                with open(
                        'metadata/{0}_sentence_dict_justseparate.json'.format(
                            building), 'r') as fp:
                    curr_sentence_dict = json.load(fp)

                curr_sentence_dict = dict([
                    (srcid, list(map(replacer, sentence)))
                    for srcid, sentence in curr_sentence_dict.items()
                ])
                sentence_dict.update(curr_sentence_dict)

            with open('metadata/{0}_ground_truth.json'.format(building),
                      'r') as fp:
                truth_dict.update(json.load(fp))
            label_dict = get_label_dict(building)
            srcids = list(truth_dict.keys())

            # First iteration: random seed set; later: add the actively
            # queried srcids (tripled to upweight them).
            if iter_i == 0:
                learning_srcids += select_random_samples(
                    building,
                    srcids,
                    n,
                    True,
                    token_type='justseparate',
                    reverse=True,
                    cluster_dict=None,
                    shuffle_flag=False)
            else:
                learning_srcids += new_srcids * 3
                pass
            # Test set: all labeled target-building srcids not used for
            # training.
            if building == target_building:
                test_srcids = [
                    srcid for srcid in label_dict.keys()
                    if srcid not in learning_srcids
                ]

        binarizer = MultiLabelBinarizer().fit(truth_dict.values())
        vectorizer = TfidfVectorizer(tokenizer=tokenizer).fit(
            list(map(joiner, sentence_dict.values())))
        learning_doc = [
            ' '.join(sentence_dict[srcid]) for srcid in learning_srcids
        ]
        learning_vect_doc = vectorizer.transform(learning_doc)

        learning_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in learning_srcids])

        #classifier = RandomForestClassifier(n_estimators=200, n_jobs=1)
        classifier = ClassifierChain(RandomForestClassifier())
        classifier.fit(learning_vect_doc, learning_truth_mat)

        test_doc = [' '.join(sentence_dict[srcid]) for srcid in test_srcids]
        test_vect_doc = vectorizer.transform(test_doc)

        pred_mat = classifier.predict(test_vect_doc)
        prob_mat = classifier.predict_proba(test_vect_doc)

        # Query Stage for Active Learning
        # Rank test samples by prediction entropy (most uncertain first).
        entropies = [get_entropy(prob) for prob in prob_mat]
        sorted_entropies = sorted([(test_srcids[i], entropy)
                                   for i, entropy in enumerate(entropies)],
                                  key=itemgetter(1),
                                  reverse=True)
        added_cids = set()
        """
        for srcid in learning_srcids:
            cid = find_keys(srcid, cluster_dict, crit=lambda x,y:x in y)[0]
            added_cids.add(cid)
            """

        # Pick the inc_num most uncertain srcids, at most one per cluster,
        # to label in the next iteration.
        new_srcids = []
        new_srcid_cnt = 0
        cluster_dict = get_cluster_dict(target_building)
        for srcid, entropy in sorted_entropies:
            if srcid not in learning_srcids:
                the_cid = None
                for cid, cluster in cluster_dict.items():
                    if srcid in cluster:
                        the_cid = cid
                        break
                if the_cid in added_cids:
                    continue
                added_cids.add(the_cid)
                new_srcids.append(srcid)
                new_srcid_cnt += 1
                if new_srcid_cnt == inc_num:
                    break

        pred_tagsets_list = binarizer.inverse_transform(pred_mat)
        pred_tagsets_dict = dict([
            (srcid, pred_tagset)
            for srcid, pred_tagset in zip(test_srcids, pred_tagsets_list)
        ])

        # Exact-set-match counting (computed but not returned).
        correct_cnt = 0
        incorrect_cnt = 0
        for i, srcid in enumerate(test_srcids):
            pred = pred_tagsets_dict[srcid]
            true = truth_dict[srcid]
            if set(pred_tagsets_dict[srcid]) != set(truth_dict[srcid]):
                incorrect_cnt += 1
            else:
                correct_cnt += 1

        test_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in test_srcids])

        # Densify sparse matrices before the metric helpers.
        if not isinstance(pred_mat, np.ndarray):
            pred_mat = pred_mat.toarray()
        if not isinstance(test_truth_mat, np.ndarray):
            test_truth_mat = test_truth_mat.toarray()

        accuracy = get_accuracy(test_truth_mat, pred_mat)
        micro_f1 = get_micro_f1(test_truth_mat, pred_mat)
        #_, _, macro_f1, _ = precision_recall_fscore_support(test_truth_mat,
        #                                            pred_mat, average='macro')
        macro_f1 = get_macro_f1(test_truth_mat, pred_mat)
        accuracy_list.append(accuracy * 100)
        micro_f1_list.append(micro_f1 * 100)
        macro_f1_list.append(macro_f1 * 100)

    # NOTE(review): micro_f1_list is collected but never returned.
    return accuracy_list, macro_f1_list
# Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Baseline: independent one-vs-rest logistic regressions.
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
pred_ovr = ovr.predict(X_test)

# NOTE(review): jaccard_similarity_score was removed from scikit-learn
# (use jaccard_score with an `average` argument on modern versions).
from sklearn.metrics import jaccard_similarity_score
ovr_score = jaccard_similarity_score(Y_test, pred_ovr)
ovr_score

from sklearn.multioutput import ClassifierChain

# Single chain with one random label order.
cc = ClassifierChain(LogisticRegression(), order='random', random_state=42)
cc.fit(X_train, Y_train)
pred_cc = cc.predict(X_test)
cc_score = jaccard_similarity_score(Y_test, pred_cc)
cc_score

# Ensemble of 10 chains, each with a different random order.
chains = [
    ClassifierChain(LogisticRegression(), order='random', random_state=42 + i)
    for i in range(10)
]
for chain in chains:
    chain.fit(X_train, Y_train)

pred_chains = np.array([chain.predict(X_test) for chain in chains])
chain_scores = [
    jaccard_similarity_score(Y_test, pred_chain) for pred_chain in pred_chains
]
Esempio n. 29
0
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import MultiLabelBinarizer

# Train an AdaBoost classifier chain on pre-extracted features and report a
# confusion matrix over the argmax label per sample.
ir_data = pd.read_csv("../../data/extracted_Features.csv")
ir_data.drop('Unnamed: 0', inplace=True, axis=1)

label = list(ir_data["label"])
y_lab = [lab.split(" ") for lab in label]
# Renamed from `bin`, which shadowed the builtin bin() function.
label_binarizer = MultiLabelBinarizer()
y = label_binarizer.fit_transform(y_lab)

ir_data.drop("label", inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(ir_data, y, test_size=0.2)

clf = AdaBoostClassifier(n_estimators=50)
classifier = ClassifierChain(clf)

model = classifier.fit(X=X_train, Y=y_train)
predictions = classifier.predict(X=X_test)

# Collapse the multi-label outputs to one class index per sample so a
# standard confusion matrix can be built.
cm = confusion_matrix(y_true=y_test.argmax(axis=1),
                      y_pred=predictions.argmax(axis=1))

print(cm)
print(label_binarizer.classes_)

print(predictions.argmax(axis=1))
Esempio n. 30
0
# Binarize the space-separated label column and split the features 80/20.
label = list(ir_data["label"])
y_lab = [lab.split(" ") for lab in label]
# Renamed from `bin`, which shadowed the builtin bin() function.
label_binarizer = MultiLabelBinarizer()
y = label_binarizer.fit_transform(y_lab)

ir_data.drop("label", inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(ir_data, y, test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


def list_comparison(list1, list2):
    """Return True when every element of list1 equals the element of list2
    at the same index (list1 drives the iteration)."""
    for idx, value in enumerate(list1):
        if value != list2[idx]:
            return False

    return True


# Fit an AdaBoost-based classifier chain and predict the held-out labels.
clf = AdaBoostClassifier(n_estimators=50)
classifier = ClassifierChain(clf)

classifier.fit(X_train, y_train)
predicted_labels = classifier.predict(X_test)

Esempio n. 31
0
def train_model(X_train,
                y_train,
                seed,
                ccru_version,
                base_classifier,
                X_val,
                y_val,
                feature_subsets_per_cc=[]):
    """Train one ensemble member selected by `ccru_version`.

    ccru_version: 'standard' -> sklearn ClassifierChain; 'eccru'/'eccru2'/
    'eccru3' -> project CCRU chain; 'binary_relevance' -> a single linear
    SVC trained on an undersampled balanced subset of the first label.
    `seed` doubles as random state and as the index into
    `feature_subsets_per_cc` when feature subsets are used.
    Returns the fitted model.
    NOTE(review): the default `feature_subsets_per_cc=[]` is a mutable
    default argument; it is only read here, but refactor with care.
    """
    pid = os.getpid()
    print('The id of ' + str(seed) + ' is :' + str(pid))
    # print('Train ecc: '+str(seed)+' started')

    if ccru_version == 'standard':
        model = ClassifierChain(base_classifier,
                                order='random',
                                random_state=seed)
    elif ccru_version == 'eccru' or ccru_version == 'eccru2' or ccru_version == 'eccru3':
        model = CCRU(base_classifier, order='random', random_state=seed)
    elif ccru_version == 'binary_relevance':
        model = SVC(gamma='auto', kernel='linear')
    else:
        print('Cannot recoginize ccru version!!!!')

    # Labels are encoded either {0, 1} or {-1, 1}.
    class_1 = 1
    class_2 = 0
    if -1 in y_train:
        class_2 = -1

    if ccru_version == 'binary_relevance':

        # Balance the two classes of the first label by undersampling the
        # majority class down to the minority count.
        class_1_counter = np.count_nonzero(y_train[:, 0] == class_1)
        class_2_counter = np.count_nonzero(y_train[:, 0] == class_2)
        # class_1_counter = y_train.flatten().tolist()[0].count(class_1)
        # class_2_counter = y_train.flatten().tolist()[0].count(class_2)

        if class_1_counter <= class_2_counter:
            minority_class = class_1
            majority_class = class_2
            minority_counter = class_1_counter
        else:
            minority_class = class_2
            majority_class = class_1
            minority_counter = class_2_counter

        sampled_index = [
            index for index, label in enumerate(y_train)
            if label == minority_class
        ]
        sampled_y = [minority_class] * minority_counter

        temp_sampled_index = [
            index for index, label in enumerate(y_train)
            if label == majority_class
        ]

        sampled_index.extend(
            random.sample(temp_sampled_index, minority_counter))
        sampled_y.extend([majority_class] * minority_counter)
        print('Train binary_relevance: ' + str(seed) + ' started')

        print('training on ' + str(len(sampled_y)))
        if len(feature_subsets_per_cc) != 0:
            trained_model = model.fit(
                X_train[np.array(sampled_index), feature_subsets_per_cc[seed]],
                y_train, X_val, y_val)
        else:
            trained_model = model.fit(X_train[np.array(sampled_index), :],
                                      sampled_y)
    else:
        print('Train ecc: ' + str(seed) + ' started ')
        # CCRU.fit presumably accepts validation data -- sklearn's
        # ClassifierChain.fit does not take X_val/y_val; verify upstream.
        if len(feature_subsets_per_cc) != 0:
            trained_model = model.fit(X_train[:, feature_subsets_per_cc[seed]],
                                      y_train, X_val, y_val)
        else:
            trained_model = model.fit(X_train, y_train, X_val, y_val)
    print('Train model: ' + str(seed) + ' ended')
    return trained_model
Esempio n. 32
0

# Evaluate a RandomForest classifier chain 10 times on random 80/20 splits
# of combined content + structural features; report mean +- std per metric.
X = combine_2_feats('content', 'structural')
y = load_labels()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

accs = np.zeros(10)
precs = np.zeros(10)
recs = np.zeros(10)
f1s = np.zeros(10)
for i in range(10):
    # model = ClassifierChain(LinearSVC(C=1, max_iter=1000, fit_intercept=True))
    # model = ClassifierChain(AdaBoostClassifier())
    # model = MLkNN(k=3, s=0.1)
    model = ClassifierChain(
        RandomForestClassifier(n_estimators=1500,
                               min_samples_split=7,
                               min_samples_leaf=7,
                               max_features='sqrt'))

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # Sample-averaged Jaccard score used as "accuracy".
    accs[i] = jaccard_score(y_test, pred, average='samples')
    precs[i], recs[i], f1s[i], _ = precision_recall_fscore_support(
        y_test, pred, average='samples')

print(f"Accuracy: {accs.mean()} +- {accs.std()}")
print(f"Precision: {precs.mean()} +- {precs.std()}")
print(f"Recall: {recs.mean()} +- {recs.std()}")
print(f"F1: {f1s.mean()} +- {f1s.std()}")
Esempio n. 33
0
# Build bag-of-token feature frames for train/validation text, fit one
# randomly ordered LogisticRegression chain, pickle it, and predict.
features = [item.split(" ") for item in train_df]
col_dicts = [make_dict(entry) for entry in features]

features_val = [item.split(" ") for item in val_df]
col_dicts_val = [make_dict(entry) for entry in features_val]

features_df = pd.DataFrame(col_dicts)
features_df_val = pd.DataFrame(col_dicts_val)

# Missing token counts become 0 instead of NaN.
features_df = features_df.fillna(0)
features_df_val = features_df_val.fillna(0)
print('done cleanning')
X_train = np.array(features_df)
Y_train = np.array(encoded_labels_df)
x_val = np.array(features_df_val)
y_val = np.array(encoded_labels_df_val)

base_lr = LogisticRegression(max_iter=MAX_ITER, n_jobs=-1, verbose=1)

int_rand = np.random.randint(1000)
chain = ClassifierChain(base_lr, order='random', random_state=int_rand)

chain.fit(X_train, Y_train)
# BUG FIX: the original `MAX_ITER + "_" + int_ran + ".pickle"` raised a
# NameError (`int_ran` is undefined) and would raise TypeError anyway
# (int + str); format both values into the filename instead.
filename = f"{MAX_ITER}_{int_rand}.pickle"
pickle.dump(chain, open(filename, 'wb'))

#loaded_model = pickle.load(open(filename, 'rb'))
print('start predict')
# BUG FIX: `chains` was undefined here -- only a single chain is trained,
# so wrap it in a one-element list.
Y_pred_chains = np.array([chain.predict_proba(x_val)])
Esempio n. 34
0
# Compare one-vs-rest vs classifier-chain wrappers around a kNN model,
# timing each and measuring element-wise Hamming loss.
# NOTE(review): `clock` is presumably time.clock, which was removed in
# Python 3.8 -- confirm interpreter version or switch to perf_counter.
from sklearn.multiclass import OneVsRestClassifier
t0 = clock()
onerest = OneVsRestClassifier(knn)
onerest.fit(X_train, Y_train)
Y_pred = onerest.predict(X_test)
t_onerest = clock() - t0
#print(Y_test)
#print(Y_pred)
# Mean of element-wise mismatches == Hamming loss for binary labels.
loss_onerest = np.mean(Y_pred != Y_test)
print("Hamming loss for One vs Rest classifier: ", loss_onerest)

from sklearn.multioutput import ClassifierChain
t0 = clock()
classfierchain = ClassifierChain(knn)
classfierchain.fit(X_train, Y_train)
Y_pred = classfierchain.predict(X_test)
t_chain = clock() - t0
#print(Y_test)
#print(Y_pred)
loss_chain = np.mean(Y_pred != Y_test)
print("Hamming loss for classifier chain: ", loss_chain)

# Epochs were sampled every 10 steps when time_h was recorded.
arr_epoch = np.arange(1, len(time_h) + 1) * 10

plt.figure(figsize=(12, 9))
plt.plot(arr_epoch, time_h, label='my network', c='k')
plt.axhline(t_nn, c='r', label='Default network of Sklearn')
plt.axhline(
    t_knn,
    c='g',