def test_classifier_chain_fit_and_predict_with_sparse_data():
    # A chain trained and evaluated on a CSR matrix must produce the
    # same predictions as one trained and evaluated on the dense array.
    X, Y = generate_multilabel_dataset_with_correlations()
    X_csr = sp.csr_matrix(X)

    chain_sparse = ClassifierChain(LogisticRegression())
    chain_sparse.fit(X_csr, Y)
    pred_sparse = chain_sparse.predict(X_csr)

    chain_dense = ClassifierChain(LogisticRegression())
    chain_dense.fit(X, Y)
    pred_dense = chain_dense.predict(X)

    assert_array_equal(pred_sparse, pred_dense)
def test_classifier_chain_vs_independent_models():
    # A classifier chain, which conditions each label on the previous
    # predictions, should beat N independent one-vs-rest models on
    # Jaccard similarity for the yeast dataset.
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    n_train = 2000
    X_train, X_test = X[:n_train, :], X[n_train:, :]
    Y_train, Y_test = Y[:n_train, :], Y[n_train:, :]

    independent = OneVsRestClassifier(LogisticRegression())
    independent.fit(X_train, Y_train)
    Y_pred_ovr = independent.predict(X_test)

    # Interleaved label order: even indices first, then odd ones.
    label_order = np.array([0, 2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11, 13])
    chain = ClassifierChain(LogisticRegression(), order=label_order)
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
def test_classifier_chain_fit_and_predict_with_sparse_data_and_cv():
    # A chain built on cross-validated predictions (cv=3) must accept
    # sparse input and predict a label matrix of the expected shape.
    X, Y = generate_multilabel_dataset_with_correlations()
    X_csr = sp.csr_matrix(X)
    chain = ClassifierChain(LogisticRegression(), cv=3)
    chain.fit(X_csr, Y)
    predictions = chain.predict(X_csr)
    assert_equal(predictions.shape, Y.shape)
def test_classifier_chain_crossval_fit_and_predict():
    # Compare a chain trained on cross-validated predictions (cv=3)
    # against one trained on its own in-sample predictions.
    X, Y = generate_multilabel_dataset_with_correlations()

    chain_cv = ClassifierChain(LogisticRegression(), cv=3)
    chain_cv.fit(X, Y)
    chain_plain = ClassifierChain(LogisticRegression())
    chain_plain.fit(X, Y)

    pred_cv = chain_cv.predict(X)
    pred_plain = chain_plain.predict(X)

    assert_equal(pred_cv.shape, Y.shape)
    # The cv-trained chain must still perform reasonably in-sample ...
    assert_greater(jaccard_similarity_score(Y, pred_cv), 0.4)
    # ... while scoring differently from the plainly trained chain.
    assert_not_equal(jaccard_similarity_score(Y, pred_cv),
                     jaccard_similarity_score(Y, pred_plain))
def test_classifier_chain_random_order():
    """Fit a chain with order='random' and check the fitted order is a
    true 4-label permutation that reproduces a fixed-order chain."""
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    # BUG FIX: the original compared ``.order`` (the constructor argument,
    # here the string 'random') rather than the fitted ``.order_``
    # attribute, which made this inequality trivially true.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
def test_classifier_chain_fit_and_predict_with_linear_svc():
    # LinearSVC exposes no predict_proba: the chain must rely on
    # decision_function, and predictions must match its sign.
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LinearSVC())
    chain.fit(X, Y)

    predictions = chain.predict(X)
    assert_equal(predictions.shape, Y.shape)

    decisions = chain.decision_function(X)
    assert_array_equal(decisions >= 0, predictions)
    assert not hasattr(chain, 'predict_proba')
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    # Fit a chain of logistic regressions and verify the prediction
    # contract: shape, probability thresholding, and feature growth.
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LogisticRegression())
    chain.fit(X, Y)

    predictions = chain.predict(X)
    assert_equal(predictions.shape, Y.shape)

    probabilities = chain.predict_proba(X)
    assert_array_equal(probabilities >= .5, predictions)

    # Each estimator sees the original features plus one extra column per
    # label already predicted, so coefficient counts grow along the chain.
    expected_sizes = list(range(X.shape[1], X.shape[1] + Y.shape[1]))
    actual_sizes = [estimator.coef_.size for estimator in chain.estimators_]
    assert_equal(actual_sizes, expected_sizes)
def test_classifier_chain_vs_independent_models():
    # A chain that conditions each label on earlier predictions should
    # outperform N independent one-vs-rest models on Jaccard similarity.
    X, Y = generate_multilabel_dataset_with_correlations()
    split = 600
    X_train, X_test = X[:split, :], X[split:, :]
    Y_train, Y_test = Y[:split, :], Y[split:, :]

    independent = OneVsRestClassifier(LogisticRegression())
    independent.fit(X_train, Y_train)
    score_ovr = jaccard_similarity_score(Y_test, independent.predict(X_test))

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    score_chain = jaccard_similarity_score(Y_test, chain.predict(X_test))

    assert_greater(score_chain, score_ovr)
# Esempio n. 9
# 0
def test_classifier_chain_vs_independent_models():
    # The chain exploits label correlations, so its samples-averaged
    # Jaccard score should exceed that of N independent models.
    X, Y = generate_multilabel_dataset_with_correlations()
    split = 600
    X_train, X_test = X[:split, :], X[split:, :]
    Y_train, Y_test = Y[:split, :], Y[split:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    ovr_score = jaccard_score(Y_test, ovr.predict(X_test), average="samples")

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    chain_score = jaccard_score(Y_test, chain.predict(X_test),
                                average="samples")

    assert chain_score > ovr_score
# Esempio n. 10
# 0
def test_classifier_chain_vs_independent_models():
    # On the yeast dataset, a chain with an interleaved label order
    # should beat independent one-vs-rest models on Jaccard similarity.
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    n_train = 2000
    X_train, Y_train = X[:n_train, :], Y[:n_train, :]
    X_test, Y_test = X[n_train:, :], Y[n_train:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    # Even-indexed labels are fitted first, then the odd-indexed ones.
    label_order = np.array([0, 2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11, 13])
    chain = ClassifierChain(LogisticRegression(), order=label_order)
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
# Esempio n. 11
# 0
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import MultiLabelBinarizer

# Load the extracted IR features and drop the CSV's stray index column.
ir_data = pd.read_csv("../../data/extracted_Features.csv")
ir_data.drop('Unnamed: 0', inplace=True, axis=1)

# The "label" column holds space-separated label names; binarize them
# into a multi-label indicator matrix.
label = list(ir_data["label"])
y_lab = [lab.split(" ") for lab in label]
# Renamed from ``bin`` to avoid shadowing the ``bin`` builtin.
label_binarizer = MultiLabelBinarizer()
y = label_binarizer.fit_transform(y_lab)

ir_data.drop("label", inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(ir_data, y, test_size=0.2)

clf = AdaBoostClassifier(n_estimators=50)
classifier = ClassifierChain(clf)

model = classifier.fit(X=X_train, Y=y_train)  # fit returns the fitted chain
predictions = classifier.predict(X=X_test)

# NOTE(review): argmax collapses the multi-label outputs to one label per
# sample, so this confusion matrix ignores any secondary labels — confirm
# this is intended.
cm = confusion_matrix(y_true=y_test.argmax(axis=1),
                      y_pred=predictions.argmax(axis=1))

print(cm)
print(label_binarizer.classes_)

print(predictions.argmax(axis=1))
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Baseline: one independent logistic regression per label.
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
pred_ovr = ovr.predict(X_test)

from sklearn.metrics import jaccard_similarity_score
ovr_score = jaccard_similarity_score(Y_test, pred_ovr)
ovr_score

from sklearn.multioutput import ClassifierChain

# A single classifier chain with a random label order.
cc = ClassifierChain(LogisticRegression(), order='random', random_state=42)
cc.fit(X_train, Y_train)
pred_cc = cc.predict(X_test)
cc_score = jaccard_similarity_score(Y_test, pred_cc)
cc_score

# Ensemble of ten chains, each with its own random label order.
chains = []
for seed_offset in range(10):
    chains.append(ClassifierChain(LogisticRegression(), order='random',
                                  random_state=42 + seed_offset))
for chain in chains:
    chain.fit(X_train, Y_train)

pred_chains = np.array([chain.predict(X_test) for chain in chains])
chain_scores = []
for pred_chain in pred_chains:
    chain_scores.append(jaccard_similarity_score(Y_test, pred_chain))
# Esempio n. 13
# 0

# Evaluate a random-forest classifier chain over ten random train/test
# splits and report mean +- std of each samples-averaged metric.
X = combine_2_feats('content', 'structural')
y = load_labels()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

n_runs = 10
accs = np.zeros(n_runs)
precs = np.zeros(n_runs)
recs = np.zeros(n_runs)
f1s = np.zeros(n_runs)
for i in range(n_runs):
    # Alternative models kept for reference:
    # model = ClassifierChain(LinearSVC(C=1, max_iter=1000, fit_intercept=True))
    # model = ClassifierChain(AdaBoostClassifier())
    # model = MLkNN(k=3, s=0.1)
    model = ClassifierChain(
        RandomForestClassifier(n_estimators=1500,
                               min_samples_split=7,
                               min_samples_leaf=7,
                               max_features='sqrt'))
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accs[i] = jaccard_score(y_test, pred, average='samples')
    precs[i], recs[i], f1s[i], _ = precision_recall_fscore_support(
        y_test, pred, average='samples')

print(f"Accuracy: {accs.mean()} +- {accs.std()}")
print(f"Precision: {precs.mean()} +- {precs.std()}")
print(f"Recall: {recs.mean()} +- {recs.std()}")
print(f"F1: {f1s.mean()} +- {f1s.std()}")
# Esempio n. 14
# 0
from sklearn.svm import LinearSVC

# Compare three multi-label strategies — independent one-vs-rest (1),
# a classifier chain (2), and multi-output k-NN (3) — by Hamming loss
# on both the test and the training split.
classifier_1 = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
    X_train, Y_train)
classifier_2 = ClassifierChain(LinearSVC(random_state=0)).fit(X_train, Y_train)
classifier_3 = KNeighborsClassifier().fit(X_train, Y_train)

Y_pred_1 = classifier_1.predict(X_test)
loss_1 = np.mean(Y_pred_1 != Y_test)
print("Hamming loss with classifier 1 on testing set: ", loss_1)

Y_pred_1_bis = classifier_1.predict(X_train)
loss_1_bis = np.mean(Y_pred_1_bis != Y_train)
print("Hamming loss with classifier 1 on training set: ", loss_1_bis)

# BUG FIX: the test-set predictions for "classifier 2" were mistakenly
# taken from classifier_3; use classifier_2, matching the training-set
# evaluation below and the printed label.
Y_pred_2 = classifier_2.predict(X_test)
loss_2 = np.mean(Y_pred_2 != Y_test)
print("Hamming loss with classifier 2 on testing set: ", loss_2)

Y_pred_2_bis = classifier_2.predict(X_train)
loss_2_bis = np.mean(Y_pred_2_bis != Y_train)
print("Hamming loss with classifier 2 on training set: ", loss_2_bis)

Y_pred_3 = classifier_3.predict(X_test)
loss_3 = np.mean(Y_pred_3 != Y_test)
print("Hamming loss with classifier 3 on testing set: ", loss_3)

Y_pred_3_bis = classifier_3.predict(X_train)
loss_3_bis = np.mean(Y_pred_3_bis != Y_train)
print("Hamming loss with classifier 3 on training set: ", loss_3_bis)
# Esempio n. 15
# 0
from sklearn.multiclass import OneVsRestClassifier
# Time and evaluate a one-vs-rest wrapper around the externally defined
# ``knn`` estimator.
# NOTE(review): ``clock`` is presumably ``time.clock``, which was removed
# in Python 3.8 — confirm and migrate to ``time.perf_counter`` if so.
t0 = clock()
onerest = OneVsRestClassifier(knn)
onerest.fit(X_train, Y_train)
Y_pred = onerest.predict(X_test)
t_onerest = clock() - t0
#print(Y_test)
#print(Y_pred)
# Mean fraction of mismatched label entries (Hamming loss).
loss_onerest = np.mean(Y_pred != Y_test)
print("Hamming loss for One vs Rest classifier: ", loss_onerest)

from sklearn.multioutput import ClassifierChain
# Same timing and loss measurement for a classifier chain over ``knn``.
t0 = clock()
classfierchain = ClassifierChain(knn)
classfierchain.fit(X_train, Y_train)
Y_pred = classfierchain.predict(X_test)
t_chain = clock() - t0
#print(Y_test)
#print(Y_pred)
loss_chain = np.mean(Y_pred != Y_test)
print("Hamming loss for classifier chain: ", loss_chain)

arr_epoch = np.arange(1, len(time_h) + 1) * 10

plt.figure(figsize=(12, 9))
plt.plot(arr_epoch, time_h, label='my network', c='k')
plt.axhline(t_nn, c='r', label='Default network of Sklearn')
plt.axhline(
    t_knn,
    c='g',
    label='K-nearst neighbor classifier',
# Esempio n. 16
# 0
# Binarize the space-separated multi-labels from the "label" column and
# split features/targets into train and test sets.
label = list(ir_data["label"])
y_lab = [lab.split(" ") for lab in label]
# Renamed from ``bin`` to avoid shadowing the builtin of the same name.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_lab)

ir_data.drop("label", inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(ir_data, y, test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


def list_comparison(list1, list2):
    """Return True if every element of ``list1`` equals the element at the
    same position in ``list2``; the comparison runs over ``list1``'s length.

    FIX: the original indexed ``list2`` directly and raised IndexError
    when ``list2`` was shorter than ``list1``; that case now returns
    False.  A longer ``list2`` with a matching prefix still returns True,
    as before.
    """
    if len(list2) < len(list1):
        return False
    return all(a == b for a, b in zip(list1, list2))


# Train an AdaBoost-based classifier chain on the binarized labels and
# predict the label matrix for the held-out split.
clf = AdaBoostClassifier(n_estimators=50)
classifier = ClassifierChain(clf)

classifier.fit(X_train, y_train)
predicted_labels = classifier.predict(X_test)

# Esempio n. 17
# 0
def naive_base(params):
    """Iterative multi-label classification of sensor metadata with an
    entropy-based active-learning query loop.

    ``params`` is a sequence of five items:
      [0] building_list   : buildings providing labelled examples
      [1] n_list          : initial number of samples per building
      [2] target_building : building that is tested and queried for labels
      [3] inc_num         : number of srcids added per iteration
      [4] iter_num        : number of active-learning iterations

    Returns ``(accuracy_list, macro_f1_list)`` — one percentage per
    iteration.  NOTE(review): ``micro_f1_list`` is computed but never
    returned; confirm whether that is intentional.
    """
    building_list = params[0]
    n_list = params[1]
    target_building = params[2]
    inc_num = params[3]
    iter_num = params[4]
    accuracy_list = list()
    micro_f1_list = list()
    macro_f1_list = list()
    for iter_i in range(0, iter_num):
        # Sentence and ground-truth dictionaries are rebuilt every
        # iteration; only the pool of learning srcids persists across them.
        sentence_dict = dict()
        truth_dict = dict()
        if iter_i == 0:
            learning_srcids = list()
        for building, n in zip(building_list, n_list):
            # The target building's sample budget grows each iteration.
            if building == target_building:
                n += iter_i * inc_num
            if building != 'ghc':
                # Tokenize raw metadata via the external ``toker`` helper.
                (sensorDF, srcid_list, name_list, jciname_list, desc_list,
                 unit_list, bacnettype_list) = toker.parse_sentences(building)
                for srcid, name, jciname, desc in \
                        zip(srcid_list, name_list, jciname_list, desc_list):
                    sentence_dict[srcid] = list(
                        map(replacer, name + jciname + desc))
            else:
                # 'ghc' sentences come pre-tokenized from a JSON dump.
                with open(
                        'metadata/{0}_sentence_dict_justseparate.json'.format(
                            building), 'r') as fp:
                    curr_sentence_dict = json.load(fp)

                curr_sentence_dict = dict([
                    (srcid, list(map(replacer, sentence)))
                    for srcid, sentence in curr_sentence_dict.items()
                ])
                sentence_dict.update(curr_sentence_dict)

            with open('metadata/{0}_ground_truth.json'.format(building),
                      'r') as fp:
                truth_dict.update(json.load(fp))
            label_dict = get_label_dict(building)
            srcids = list(truth_dict.keys())

            if iter_i == 0:
                # Initial seed: random sample of srcids for this building.
                learning_srcids += select_random_samples(
                    building,
                    srcids,
                    n,
                    True,
                    token_type='justseparate',
                    reverse=True,
                    cluster_dict=None,
                    shuffle_flag=False)
            else:
                # ``new_srcids`` were selected by the query stage at the
                # end of the previous iteration (defined further below);
                # they are tripled to up-weight the newly labelled samples.
                learning_srcids += new_srcids * 3
                pass
            # Everything in the target building not used for training
            # becomes the test set.
            if building == target_building:
                test_srcids = [
                    srcid for srcid in label_dict.keys()
                    if srcid not in learning_srcids
                ]

        # Vectorize sentences (TF-IDF) and binarize the label sets.
        binarizer = MultiLabelBinarizer().fit(truth_dict.values())
        vectorizer = TfidfVectorizer(tokenizer=tokenizer).fit(
            list(map(joiner, sentence_dict.values())))
        learning_doc = [
            ' '.join(sentence_dict[srcid]) for srcid in learning_srcids
        ]
        learning_vect_doc = vectorizer.transform(learning_doc)

        learning_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in learning_srcids])

        #classifier = RandomForestClassifier(n_estimators=200, n_jobs=1)
        classifier = ClassifierChain(RandomForestClassifier())
        classifier.fit(learning_vect_doc, learning_truth_mat)

        test_doc = [' '.join(sentence_dict[srcid]) for srcid in test_srcids]
        test_vect_doc = vectorizer.transform(test_doc)

        pred_mat = classifier.predict(test_vect_doc)
        prob_mat = classifier.predict_proba(test_vect_doc)

        # Query Stage for Active Learning
        # Rank test srcids by prediction entropy, most uncertain first.
        entropies = [get_entropy(prob) for prob in prob_mat]
        sorted_entropies = sorted([(test_srcids[i], entropy)
                                   for i, entropy in enumerate(entropies)],
                                  key=itemgetter(1),
                                  reverse=True)
        added_cids = set()
        """
        for srcid in learning_srcids:
            cid = find_keys(srcid, cluster_dict, crit=lambda x,y:x in y)[0]
            added_cids.add(cid)
            """

        # Pick up to ``inc_num`` uncertain srcids, at most one per cluster,
        # to be labelled and added in the next iteration.
        new_srcids = []
        new_srcid_cnt = 0
        cluster_dict = get_cluster_dict(target_building)
        for srcid, entropy in sorted_entropies:
            if srcid not in learning_srcids:
                the_cid = None
                for cid, cluster in cluster_dict.items():
                    if srcid in cluster:
                        the_cid = cid
                        break
                if the_cid in added_cids:
                    continue
                added_cids.add(the_cid)
                new_srcids.append(srcid)
                new_srcid_cnt += 1
                if new_srcid_cnt == inc_num:
                    break

        pred_tagsets_list = binarizer.inverse_transform(pred_mat)
        pred_tagsets_dict = dict([
            (srcid, pred_tagset)
            for srcid, pred_tagset in zip(test_srcids, pred_tagsets_list)
        ])

        # Exact-set-match counting; not used in the returned metrics.
        correct_cnt = 0
        incorrect_cnt = 0
        for i, srcid in enumerate(test_srcids):
            pred = pred_tagsets_dict[srcid]
            true = truth_dict[srcid]
            if set(pred_tagsets_dict[srcid]) != set(truth_dict[srcid]):
                incorrect_cnt += 1
            else:
                correct_cnt += 1

        test_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in test_srcids])

        # Densify sparse matrices before computing the metrics.
        if not isinstance(pred_mat, np.ndarray):
            pred_mat = pred_mat.toarray()
        if not isinstance(test_truth_mat, np.ndarray):
            test_truth_mat = test_truth_mat.toarray()

        accuracy = get_accuracy(test_truth_mat, pred_mat)
        micro_f1 = get_micro_f1(test_truth_mat, pred_mat)
        #_, _, macro_f1, _ = precision_recall_fscore_support(test_truth_mat,
        #                                            pred_mat, average='macro')
        macro_f1 = get_macro_f1(test_truth_mat, pred_mat)
        accuracy_list.append(accuracy * 100)
        micro_f1_list.append(micro_f1 * 100)
        macro_f1_list.append(macro_f1 * 100)

    return accuracy_list, macro_f1_list