def test_classifier_chain_fit_and_predict_with_sparse_data():
    """Sparse and dense inputs must yield identical chain predictions."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)

    # Fit one chain on the CSR matrix ...
    chain_sparse = ClassifierChain(LogisticRegression())
    chain_sparse.fit(X_sparse, Y)
    Y_pred_sparse = chain_sparse.predict(X_sparse)

    # ... and an identically configured chain on the dense array.
    chain_dense = ClassifierChain(LogisticRegression())
    chain_dense.fit(X, Y)
    Y_pred_dense = chain_dense.predict(X)

    assert_array_equal(Y_pred_sparse, Y_pred_dense)
def test_classifier_chain_vs_independent_models():
    """On the yeast multilabel dataset, a chain with a hand-picked label
    order should score higher (Jaccard similarity) than N independent
    one-vs-rest models."""
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()

    X_train, X_test = X[:2000, :], X[2000:, :]
    Y_train, Y_test = Y[:2000, :], Y[2000:, :]

    # Baseline: independent per-label models.
    independent = OneVsRestClassifier(LogisticRegression())
    independent.fit(X_train, Y_train)
    Y_pred_ovr = independent.predict(X_test)

    # Chain with a manually chosen label ordering.
    label_order = np.array([0, 2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11, 13])
    chain = ClassifierChain(LogisticRegression(), order=label_order)
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
def test_classifier_chain_fit_and_predict_with_sparse_data_and_cv():
    """A chain using cross_val_predict (cv=3) must accept sparse input and
    produce predictions of the expected shape."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)

    chain = ClassifierChain(LogisticRegression(), cv=3)
    chain.fit(X_sparse, Y)
    predictions = chain.predict(X_sparse)

    assert_equal(predictions.shape, Y.shape)
def test_classifier_chain_crossval_fit_and_predict():
    """Compare a chain trained on cross-validated label features (cv=3)
    against a plain chain: right output shape, a reasonable Jaccard score,
    and predictions that differ from the plain chain's."""
    X, Y = generate_multilabel_dataset_with_correlations()

    chain_cv = ClassifierChain(LogisticRegression(), cv=3)
    chain_cv.fit(X, Y)
    plain_chain = ClassifierChain(LogisticRegression())
    plain_chain.fit(X, Y)

    Y_pred_cv = chain_cv.predict(X)
    Y_pred = plain_chain.predict(X)

    assert_equal(Y_pred_cv.shape, Y.shape)
    assert_greater(jaccard_similarity_score(Y, Y_pred_cv), 0.4)
    # cv-generated features should lead to a different model than training
    # each link on the true previous labels.
    assert_not_equal(jaccard_similarity_score(Y, Y_pred_cv),
                     jaccard_similarity_score(Y, Y_pred))
def test_classifier_chain_random_order():
    """Fitting with order='random' must yield a valid label permutation,
    and a fixed-order chain built from that permutation must reproduce
    the random chain's predictions exactly."""
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)
    # BUG FIX: the original asserted on the constructor parameter `order`
    # (the string 'random'), which trivially differs from list(range(4)) —
    # the fitted permutation lives in the `order_` attribute.
    assert_not_equal(list(classifier_chain_random.order_), list(range(4)))
    # `order_` must be a permutation of the 4 labels.
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order
    # chain with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
def test_classifier_chain_fit_and_predict_with_linear_svc():
    """A chain over LinearSVC exposes decision_function but no
    predict_proba; thresholding decisions at zero reproduces predict()."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LinearSVC())
    chain.fit(X, Y)

    Y_pred = chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    Y_decision = chain.decision_function(X)
    assert_array_equal(Y_decision >= 0, Y_pred)
    # LinearSVC has no predict_proba, so neither does the chain.
    assert not hasattr(chain, 'predict_proba')
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    """Basic fit/predict round-trip with LogisticRegression links."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LogisticRegression())
    chain.fit(X, Y)

    Y_pred = chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    # Thresholding the probabilities at 0.5 must match predict().
    Y_prob = chain.predict_proba(X)
    assert_array_equal(Y_prob >= .5, Y_pred)

    # Each link sees the original features plus one extra column per
    # preceding label, so coefficient counts grow by one along the chain.
    n_features = X.shape[1]
    expected_sizes = list(range(n_features, n_features + Y.shape[1]))
    assert_equal([est.coef_.size for est in chain.estimators_],
                 expected_sizes)
def test_classifier_chain_vs_independent_models():
    """A single classifier chain should exploit label correlations and beat
    N independent one-vs-rest models on Jaccard similarity."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_train, Y_train = X[:600, :], Y[:600, :]
    X_test, Y_test = X[600:, :], Y[600:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    ovr_preds = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    chain_preds = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, chain_preds),
                   jaccard_similarity_score(Y_test, ovr_preds))
def test_classifier_chain_vs_independent_models():
    """Verify a classifier chain achieves a higher samples-averaged Jaccard
    score than N independent one-vs-rest models."""
    X, Y = generate_multilabel_dataset_with_correlations()
    split = 600
    X_train, X_test = X[:split, :], X[split:, :]
    Y_train, Y_test = Y[:split, :], Y[split:, :]

    independent = OneVsRestClassifier(LogisticRegression()).fit(
        X_train, Y_train)
    chained = ClassifierChain(LogisticRegression()).fit(X_train, Y_train)

    score_ovr = jaccard_score(Y_test, independent.predict(X_test),
                              average="samples")
    score_chain = jaccard_score(Y_test, chained.predict(X_test),
                                average="samples")
    assert score_chain > score_ovr
def test_classifier_chain_vs_independent_models():
    """Yeast-dataset variant: a chain with a manually chosen label order
    must outperform independent one-vs-rest models on Jaccard similarity."""
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()

    split = 2000
    X_train, Y_train = X[:split, :], Y[:split, :]
    X_test, Y_test = Y[split:, :], Y[split:, :]

    baseline = OneVsRestClassifier(LogisticRegression()).fit(X_train, Y_train)
    Y_pred_ovr = baseline.predict(X_test)

    label_order = np.array([0, 2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11, 13])
    chain = ClassifierChain(LogisticRegression(), order=label_order)
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import MultiLabelBinarizer

# Load the pre-extracted feature table; drop the CSV's stray index column.
ir_data = pd.read_csv("../../data/extracted_Features.csv")
ir_data.drop('Unnamed: 0', inplace=True, axis=1)

# Each row's "label" cell holds space-separated tags; split them into token
# lists and binarize into a multilabel indicator matrix.
label = list(ir_data["label"])
y_lab = [lab.split(" ") for lab in label]
# NOTE(review): `bin` shadows the builtin of the same name — consider renaming.
bin = MultiLabelBinarizer()
y = bin.fit_transform(y_lab)
ir_data.drop("label", inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(ir_data, y, test_size=0.2)

# Chain of AdaBoost base estimators: each label's model also sees the
# predictions for the preceding labels as extra features.
clf = AdaBoostClassifier(n_estimators=50)
classifier = ClassifierChain(clf)
model = classifier.fit(X=X_train, Y=y_train)
predictions = classifier.predict(X=X_test)

# argmax() collapses each multilabel indicator row to a single class index,
# so this confusion matrix compares only the "dominant" label per sample.
cm = confusion_matrix(y_true=y_test.argmax(axis=1),
                      y_pred=predictions.argmax(axis=1))
print(cm)
print(bin.classes_)
print(predictions.argmax(axis=1))
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Baseline: N independent one-vs-rest logistic regressions.
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
pred_ovr = ovr.predict(X_test)

# NOTE(review): jaccard_similarity_score is deprecated in newer sklearn;
# jaccard_score(..., average='samples') is the replacement.
from sklearn.metrics import jaccard_similarity_score
ovr_score = jaccard_similarity_score(Y_test, pred_ovr)
ovr_score  # bare expression: displays the score in a notebook cell

from sklearn.multioutput import ClassifierChain

# Single classifier chain with a random label order.
cc = ClassifierChain(LogisticRegression(), order='random', random_state=42)
cc.fit(X_train, Y_train)
pred_cc = cc.predict(X_test)
cc_score = jaccard_similarity_score(Y_test, pred_cc)
cc_score  # notebook display

# Ensemble of 10 chains, each with a different random label ordering.
chains = [
    ClassifierChain(LogisticRegression(), order='random', random_state=42 + i)
    for i in range(10)
]
for chain in chains:
    chain.fit(X_train, Y_train)
pred_chains = np.array([chain.predict(X_test) for chain in chains])
chain_scores = [
    jaccard_similarity_score(Y_test, pred_chain)
    for pred_chain in pred_chains
]
# Build the design matrix from two feature groups plus multilabel targets.
X = combine_2_feats('content', 'structural')
y = load_labels()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Metric accumulators for 10 repeated train/evaluate runs.
accs = np.zeros(10)
precs = np.zeros(10)
recs = np.zeros(10)
f1s = np.zeros(10)
for i in range(10):
    # model = ClassifierChain(LinearSVC(C=1, max_iter=1000, fit_intercept=True))
    # model = ClassifierChain(AdaBoostClassifier())
    # model = MLkNN(k=3, s=0.1)
    model = ClassifierChain(
        RandomForestClassifier(n_estimators=1500,
                               min_samples_split=7,
                               min_samples_leaf=7,
                               max_features='sqrt'))
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    # Samples-averaged Jaccard score is used as the "accuracy" metric.
    accs[i] = jaccard_score(y_test, pred, average='samples')
    precs[i], recs[i], f1s[i], _ = precision_recall_fscore_support(
        y_test, pred, average='samples')

# NOTE(review): the split is drawn once above, so the 10 runs only vary
# through the forest's internal randomness.
print(f"Accuracy: {accs.mean()} +- {accs.std()}")
print(f"Precision: {precs.mean()} +- {precs.std()}")
print(f"Recall: {recs.mean()} +- {recs.std()}")
print(f"F1: {f1s.mean()} +- {f1s.std()}")
from sklearn.svm import LinearSVC

# Three multilabel classifiers trained on the same split.
classifier_1 = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
    X_train, Y_train)
classifier_2 = ClassifierChain(LinearSVC(random_state=0)).fit(X_train, Y_train)
classifier_3 = KNeighborsClassifier().fit(X_train, Y_train)

# Hamming loss = fraction of label entries predicted incorrectly.
Y_pred_1 = classifier_1.predict(X_test)
loss_1 = np.mean(Y_pred_1 != Y_test)
print("Hamming loss with classifier 1 on testing set: ", loss_1)
Y_pred_1_bis = classifier_1.predict(X_train)
loss_1_bis = np.mean(Y_pred_1_bis != Y_train)
print("Hamming loss with classifier 1 on training set: ", loss_1_bis)

# BUG FIX: the test-set predictions for "classifier 2" were taken from
# classifier_3, so the printed testing score actually belonged to the KNN
# model while the training score came from the chain.
Y_pred_2 = classifier_2.predict(X_test)
loss_2 = np.mean(Y_pred_2 != Y_test)
print("Hamming loss with classifier 2 on testing set: ", loss_2)
Y_pred_2_bis = classifier_2.predict(X_train)
loss_2_bis = np.mean(Y_pred_2_bis != Y_train)
print("Hamming loss with classifier 2 on training set: ", loss_2_bis)

Y_pred_3 = classifier_3.predict(X_test)
loss_3 = np.mean(Y_pred_3 != Y_test)
print("Hamming loss with classifier 3 on testing set: ", loss_3)
Y_pred_3_bis = classifier_3.predict(X_train)
loss_3_bis = np.mean(Y_pred_3_bis != Y_train)
print("Hamming loss with classifier 3 on training set: ", loss_3_bis)
from sklearn.multiclass import OneVsRestClassifier

# Time and score a one-vs-rest wrapper around the kNN base estimator.
# NOTE(review): time.clock() was removed in Python 3.8 — this presumably
# relies on `from time import clock` elsewhere; time.perf_counter() is the
# modern replacement.
t0 = clock()
onerest = OneVsRestClassifier(knn)
onerest.fit(X_train, Y_train)
Y_pred = onerest.predict(X_test)
t_onerest = clock() - t0
#print(Y_test)
#print(Y_pred)
loss_onerest = np.mean(Y_pred != Y_test)
print("Hamming loss for One vs Rest classifier: ", loss_onerest)

from sklearn.multioutput import ClassifierChain

# Same timing/score measurement for a classifier chain over the same kNN.
t0 = clock()
classfierchain = ClassifierChain(knn)
classfierchain.fit(X_train, Y_train)
Y_pred = classfierchain.predict(X_test)
t_chain = clock() - t0
#print(Y_test)
#print(Y_pred)
loss_chain = np.mean(Y_pred != Y_test)
print("Hamming loss for classifier chain: ", loss_chain)

# Training-time comparison plot; each entry of time_h apparently covers 10
# epochs of the hand-written network — TODO confirm against where time_h is
# filled. NOTE(review): the final plt.axhline call is truncated in this chunk.
arr_epoch = np.arange(1, len(time_h) + 1) * 10
plt.figure(figsize=(12, 9))
plt.plot(arr_epoch, time_h, label='my network', c='k')
plt.axhline(t_nn, c='r', label='Default network of Sklearn')
plt.axhline(
    t_knn,
    c='g',
    label='K-nearst neighbor classifier',
# The "label" column holds space-separated tags; split and binarize them
# into a multilabel indicator matrix.
label = list(ir_data["label"])
y_lab = [lab.split(" ") for lab in label]
# NOTE(review): `bin` shadows the builtin of the same name.
bin = MultiLabelBinarizer()
y = bin.fit_transform(y_lab)
ir_data.drop("label", inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(ir_data, y, test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


def list_comparison(list1, list2):
    """Return True iff list1 and list2 agree element-wise over list1's length."""
    for ind in range(0, len(list1)):
        if list1[ind] != list2[ind]:
            return False
    return True


# Chain of AdaBoost models, one link per label.
clf = AdaBoostClassifier(n_estimators=50)
classifier = ClassifierChain(clf)
classifier.fit(X_train, y_train)
predicted_labels = classifier.predict(X_test)
def naive_base(params):
    """Iterative (active-learning) naive baseline for multilabel metadata
    classification across buildings.

    params is a 5-element sequence:
      [0] building_list   -- buildings to draw labeled examples from
      [1] n_list          -- initial sample count per building
      [2] target_building -- building whose points are classified/tested
      [3] inc_num         -- examples added per active-learning iteration
      [4] iter_num        -- number of active-learning iterations

    Returns (accuracy_list, macro_f1_list), one entry (in percent) per
    iteration. NOTE(review): micro_f1_list is computed but never returned.
    """
    building_list = params[0]
    n_list = params[1]
    target_building = params[2]
    inc_num = params[3]
    iter_num = params[4]
    accuracy_list = list()
    micro_f1_list = list()
    macro_f1_list = list()
    for iter_i in range(0, iter_num):
        sentence_dict = dict()
        truth_dict = dict()
        if iter_i == 0:
            # Labeled pool persists and grows across iterations.
            learning_srcids = list()
        for building, n in zip(building_list, n_list):
            if building == target_building:
                # Grow the target building's sample budget each iteration.
                n += iter_i * inc_num
            if building != 'ghc':
                # Tokenize raw point metadata (names / JCI names / descriptions).
                (sensorDF,
                 srcid_list,
                 name_list,
                 jciname_list,
                 desc_list,
                 unit_list,
                 bacnettype_list) = toker.parse_sentences(building)
                for srcid, name, jciname, desc in \
                        zip(srcid_list, name_list, jciname_list, desc_list):
                    sentence_dict[srcid] = list(
                        map(replacer, name + jciname + desc))
            else:
                # 'ghc' has pre-tokenized sentences cached on disk.
                with open(
                        'metadata/{0}_sentence_dict_justseparate.json'.format(
                            building), 'r') as fp:
                    curr_sentence_dict = json.load(fp)
                curr_sentence_dict = dict([
                    (srcid, list(map(replacer, sentence)))
                    for srcid, sentence in curr_sentence_dict.items()
                ])
                sentence_dict.update(curr_sentence_dict)
            with open('metadata/{0}_ground_truth.json'.format(building),
                      'r') as fp:
                truth_dict.update(json.load(fp))
            label_dict = get_label_dict(building)
            srcids = list(truth_dict.keys())
            if iter_i == 0:
                # Seed the labeled pool with random samples.
                learning_srcids += select_random_samples(
                    building,
                    srcids,
                    n,
                    True,
                    token_type='justseparate',
                    reverse=True,
                    cluster_dict=None,
                    shuffle_flag=False)
            else:
                # Later iterations add the srcids queried in the previous
                # iteration's active-learning stage below, oversampled x3.
                learning_srcids += new_srcids * 3
                pass
            if building == target_building:
                # Everything not in the labeled pool is the test set.
                test_srcids = [
                    srcid for srcid in label_dict.keys()
                    if srcid not in learning_srcids
                ]
        binarizer = MultiLabelBinarizer().fit(truth_dict.values())
        vectorizer = TfidfVectorizer(tokenizer=tokenizer).fit(
            list(map(joiner, sentence_dict.values())))
        learning_doc = [
            ' '.join(sentence_dict[srcid]) for srcid in learning_srcids
        ]
        learning_vect_doc = vectorizer.transform(learning_doc)
        learning_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in learning_srcids])
        #classifier = RandomForestClassifier(n_estimators=200, n_jobs=1)
        classifier = ClassifierChain(RandomForestClassifier())
        classifier.fit(learning_vect_doc, learning_truth_mat)
        test_doc = [' '.join(sentence_dict[srcid]) for srcid in test_srcids]
        test_vect_doc = vectorizer.transform(test_doc)
        pred_mat = classifier.predict(test_vect_doc)
        prob_mat = classifier.predict_proba(test_vect_doc)

        # Query Stage for Active Learning: rank test points by prediction
        # entropy, highest (most uncertain) first.
        entropies = [get_entropy(prob) for prob in prob_mat]
        sorted_entropies = sorted([(test_srcids[i], entropy)
                                   for i, entropy in enumerate(entropies)],
                                  key=itemgetter(1),
                                  reverse=True)
        added_cids = set()
        """
        for srcid in learning_srcids:
            cid = find_keys(srcid, cluster_dict, crit=lambda x,y:x in y)[0]
            added_cids.add(cid)
        """
        new_srcids = []
        new_srcid_cnt = 0
        cluster_dict = get_cluster_dict(target_building)
        for srcid, entropy in sorted_entropies:
            if srcid not in learning_srcids:
                # Take at most one srcid per cluster to diversify queries.
                the_cid = None
                for cid, cluster in cluster_dict.items():
                    if srcid in cluster:
                        the_cid = cid
                        break
                if the_cid in added_cids:
                    continue
                added_cids.add(the_cid)
                new_srcids.append(srcid)
                new_srcid_cnt += 1
                if new_srcid_cnt == inc_num:
                    break
        pred_tagsets_list = binarizer.inverse_transform(pred_mat)
        pred_tagsets_dict = dict([
            (srcid, pred_tagset)
            for srcid, pred_tagset in zip(test_srcids, pred_tagsets_list)
        ])
        # Exact-set-match counts (computed for inspection only).
        correct_cnt = 0
        incorrect_cnt = 0
        for i, srcid in enumerate(test_srcids):
            pred = pred_tagsets_dict[srcid]
            true = truth_dict[srcid]
            if set(pred_tagsets_dict[srcid]) != set(truth_dict[srcid]):
                incorrect_cnt += 1
            else:
                correct_cnt += 1
        test_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in test_srcids])
        # Some estimators return sparse matrices; densify before metrics.
        if not isinstance(pred_mat, np.ndarray):
            pred_mat = pred_mat.toarray()
        if not isinstance(test_truth_mat, np.ndarray):
            test_truth_mat = test_truth_mat.toarray()
        accuracy = get_accuracy(test_truth_mat, pred_mat)
        micro_f1 = get_micro_f1(test_truth_mat, pred_mat)
        #_, _, macro_f1, _ = precision_recall_fscore_support(test_truth_mat,
        #                                                    pred_mat, average='macro')
        macro_f1 = get_macro_f1(test_truth_mat, pred_mat)
        accuracy_list.append(accuracy * 100)
        micro_f1_list.append(micro_f1 * 100)
        macro_f1_list.append(macro_f1 * 100)
    return accuracy_list, macro_f1_list