def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    yeast = fetch_mldata('yeast')
    X = yeast['data']
    Y = yeast['target'].transpose().toarray()
    X_train = X[:2000, :]
    X_test = X[2000:, :]
    Y_train = Y[:2000, :]
    Y_test = Y[2000:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(
        LogisticRegression(),
        order=np.array([0, 2, 4, 6, 8, 10, 12, 1, 3, 5, 7, 9, 11, 13]))
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
def calc_Fitness(train_d):
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                                 ngram_range=(1, 3), norm='l2')
    x_train = vectorizer.fit_transform(train_d.comment_text)
    y_train = train_d.drop(labels=['id', 'comment_text'], axis=1)
    x_test = vectorizer.transform(test.comment_text)
    y_test = test.drop(labels=['id', 'comment_text'], axis=1)

    # using classifier chains
    from sklearn.multioutput import ClassifierChain
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, hamming_loss, precision_score

    # initialize classifier chains multi-label classifier
    classifier = ClassifierChain(LogisticRegression())

    # Training logistic regression model on train data
    classifier.fit(x_train, y_train)

    # predict
    predictions = classifier.predict(x_test)

    # accuracy
    quality = (accuracy_score(y_test, predictions)
               + (1 - hamming_loss(y_test, predictions))
               + precision_score(y_test, predictions, average='weighted')) / 3
    return quality
def test_classifier_chain_fit_and_predict_with_sparse_data_and_cv():
    # Fit classifier chain with sparse data cross_val_predict
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)
    classifier_chain = ClassifierChain(LogisticRegression(), cv=3)
    classifier_chain.fit(X_sparse, Y)
    Y_pred = classifier_chain.predict(X_sparse)
    assert_equal(Y_pred.shape, Y.shape)
def test_classifier_chain_tuple_invalid_order():
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    order = tuple([1, 2])
    chain = ClassifierChain(RandomForestClassifier(), order=order)

    with pytest.raises(ValueError, match='invalid order'):
        chain.fit(X, y)
def test_classifier_chain_tuple_order(order_type):
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    order = order_type([1, 0])
    chain = ClassifierChain(RandomForestClassifier(), order=order)

    chain.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    y_test = [[3, 2]]
    assert_array_almost_equal(chain.predict(X_test), y_test)
class Multi_classes_classifier_on_column(BaseEstimator):

    def __init__(self, base_classifier, column):
        self.column = column
        self.classifier = ClassifierChain(base_classifier)
        self.vectorizer = None

    def _get_vectors(self, X):
        text_data = X[self.column]
        text_data = [prepro.clean_text(text) for text in text_data]  # text cleaning
        feature_vector = self.vectorizer.transform(text_data).toarray()
        return feature_vector

    def fit(self, X, y):
        # the column may be given by position or by name
        if isinstance(self.column, int):
            self.column = list(X.columns)[self.column]
        if self.vectorizer is None:
            self.vectorizer = prepro.get_text_vectorizer(X, self.column)
        feature_vector = self._get_vectors(X)
        self.classifier.fit(feature_vector, y)
        return self

    def predict(self, X):
        feature_vector = self._get_vectors(X)
        result = self.classifier.predict(feature_vector)
        return result

    def predict_proba(self, X: pd.DataFrame):
        feature_vector = self._get_vectors(X)
        result = self.classifier.predict_proba(feature_vector)
        return result

    def partial_fit(self, X, y):
        feature_vector = self._get_vectors(X)
        result = self.classifier.partial_fit(feature_vector, y)
        return result

    def score(self, X, y):
        feature_vector = self._get_vectors(X)
        result = self.classifier.score(feature_vector, y)
        return result

    def set_params(self, **params):
        self.classifier.set_params(**params)
        return self

    def get_params(self, deep=True):
        result = self.classifier.get_params(deep)
        return result

    def set_vectorizer(self, vectorizer):
        self.vectorizer = vectorizer
def test_chainclassifier(implementation):
    name = "test_ls_cc"
    x, y = make_multilabel_classification()
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    valid_cc = ClassifierChain(LinearSVC())
    valid_cc.fit(x_train, y_train)

    implementation.save(valid_cc, name)
    test_cc = implementation.load(name)

    expected = valid_cc.predict(x_test)
    got = test_cc.predict(x_test)
    assert_array_equal(got, expected)
def run(classifier, train_test_set):
    X_train, X_test, y_train, y_test = train_test_set

    # init model and fit to train data
    chain = ClassifierChain(classifier, order='random', random_state=0)
    chain.fit(X_train, y_train)

    # make predictions
    y_pred = chain.predict(X_test)

    print('\n--------Classifier chains with {:}'.format(classifier))
    return y_test, y_pred
def fit(self, train_x, train_y):
    self._estimators = []
    self._feature_number = train_y.shape[1]
    for i in range(self._no_of_estimators):
        X, y = train_x, train_y
        # draw one random label order per chain and log the order actually used
        order = random.sample(range(0, self._feature_number), self._feature_number)
        print(order)
        estimator = ClassifierChain(DecisionTreeClassifier(), order=order)
        estimator.fit(X, y)
        self._estimators.append(estimator)
    return self
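# The fit() above only stores the fitted chains in self._estimators; no predict
# step is shown in the source. The helper below is a minimal, hypothetical sketch
# of how such an ensemble of chains could be combined -- the simple averaging rule
# and the 0.5 threshold are illustrative assumptions, not part of the original class.
import numpy as np


def ensemble_predict(estimators, X, threshold=0.5):
    # stack per-chain predictions into shape (n_chains, n_samples, n_labels)
    all_preds = np.array([est.predict(X) for est in estimators])
    # average across chains and binarize at the chosen threshold
    return (all_preds.mean(axis=0) >= threshold).astype(int)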
def chaining_svm(X, Y, max_iter=-1):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                        random_state=0)
    Cs = np.logspace(-2, 10, 30)
    res = []
    print(f'Trying Cs: {Cs}')
    print('C \t accuracy \t f1 \t recall \t precision')
    for C in Cs:
        base_clf = SVC(C=C, kernel='rbf', max_iter=max_iter)
        chain = ClassifierChain(base_clf, cv=2, order='random', random_state=0)
        chain.fit(X_train, Y_train)
        y_pred = chain.predict(X_test)
        res.append([[
            get_accuracy(Y_test, y_pred),
            get_f1(Y_test, y_pred),
            get_recall(Y_test, y_pred),
            get_precision(Y_test, y_pred)
        ], C])
        print(
            f'{C}\t{get_accuracy(Y_test, y_pred)}\t{get_f1(Y_test, y_pred)}\t'
            f'{get_recall(Y_test, y_pred)}\t{get_precision(Y_test, y_pred)}'
        )

    store_data_as_pickle(res, 'svm-chain-logscale-values')

    acc = np.asarray([[a[0][0], a[1]] for a in res])
    f1 = np.asarray([[a[0][1], a[1]] for a in res])
    recall = np.asarray([[a[0][2], a[1]] for a in res])
    precision = np.asarray([[a[0][3], a[1]] for a in res])

    print("Max acc without question at default_dist: ",
          acc[np.argmax(acc[:, 0]), 1], " ", np.max(acc[:, 0]))
    print("Max f1 without question at default_dist: ",
          f1[np.argmax(f1[:, 0]), 1], " ", np.max(f1[:, 0]))
    print("Max recall without question at default_dist: ",
          recall[np.argmax(recall[:, 0]), 1], " ", np.max(recall[:, 0]))
    print("Max precision without question at default_dist: ",
          precision[np.argmax(precision[:, 0]), 1], " ", np.max(precision[:, 0]))

    plt.plot(acc[:, 1], acc[:, 0], label='Accuracy')
    plt.plot(f1[:, 1], f1[:, 0], label='F1-Score')
    plt.plot(recall[:, 1], recall[:, 0], label='Recall')
    plt.plot(precision[:, 1], precision[:, 0], label='Precision')
    plt.legend()
    plt.xscale('log')
    plt.xlabel("C regularization parameter")
    plt.title("SVM with ClassifierChain 10 folds")
    plt.show()
def chaining_adaboost(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2,
                                                        random_state=0)
    base_clf = AdaBoostClassifier(algorithm="SAMME", n_estimators=200)
    chain = ClassifierChain(base_clf, cv=2, order='random', random_state=0)
    chain.fit(X_train, Y_train)
    y_pred = chain.predict(X_test)
    print(
        f'{get_accuracy(Y_test, y_pred)}\t{get_f1(Y_test, y_pred)}\t'
        f'{get_recall(Y_test, y_pred)}\t{get_precision(Y_test, y_pred)}'
    )
def test_classifier_chain_fit_and_predict_with_linear_svc():
    # Fit classifier chain and verify predict performance using LinearSVC
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain = ClassifierChain(LinearSVC())
    classifier_chain.fit(X, Y)

    Y_pred = classifier_chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    Y_decision = classifier_chain.decision_function(X)

    Y_binary = (Y_decision >= 0)
    assert_array_equal(Y_binary, Y_pred)
    assert not hasattr(classifier_chain, 'predict_proba')
def test_classifier_chain_fit_and_predict_with_sparse_data():
    # Fit classifier chain with sparse data
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)

    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X_sparse, Y)
    Y_pred_sparse = classifier_chain.predict(X_sparse)

    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)
    Y_pred_dense = classifier_chain.predict(X)

    assert_array_equal(Y_pred_sparse, Y_pred_dense)
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    # Fit classifier chain and verify predict performance
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)

    Y_pred = classifier_chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    Y_prob = classifier_chain.predict_proba(X)
    Y_binary = (Y_prob >= .5)
    assert_array_equal(Y_binary, Y_pred)

    assert_equal([c.coef_.size for c in classifier_chain.estimators_],
                 list(range(X.shape[1], X.shape[1] + Y.shape[1])))
def multi_label(x_train, y_train, name="MP"): logger.info(f"Multi label problem: [{name}] - {len(x_train)}... ") le = MultiLabelBinarizer(sparse_output=True) vct = get_new_vectorizer() logger.info(f"[{name}] Vectorizing inputs...") x_train = vct.fit_transform(x_train) logger.info(f"[{name}] Vectorizing outputs...") y_train = le.fit_transform(y_train) logger.info(f"[{name}] Data shapes:") logger.info(f"[{name}] x_train: {x_train.shape}") logger.info(f"[{name}] y_train: {y_train.shape}") model = ClassifierChain(LinearSVC(random_state=0)) model.fit(x_train, y_train.todense()) return SKLearnProblem(name, le, model, vct)
def test_classifier_chain_crossval_fit_and_predict():
    # Fit classifier chain with cross_val_predict and verify predict
    # performance
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_cv = ClassifierChain(LogisticRegression(), cv=3)
    classifier_chain_cv.fit(X, Y)

    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)

    Y_pred_cv = classifier_chain_cv.predict(X)
    Y_pred = classifier_chain.predict(X)

    assert_equal(Y_pred_cv.shape, Y.shape)
    assert_greater(jaccard_similarity_score(Y, Y_pred_cv), 0.4)
    assert_not_equal(jaccard_similarity_score(Y, Y_pred_cv),
                     jaccard_similarity_score(Y, Y_pred))
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    X, Y = generate_multilabel_dataset_with_correlations()
    X_train = X[:600, :]
    X_test = X[600:, :]
    Y_train = Y[:600, :]
    Y_test = Y[600:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_similarity_score(Y_test, Y_pred_chain),
                   jaccard_similarity_score(Y_test, Y_pred_ovr))
def test_classifier_chain_vs_independent_models():
    # Verify that an ensemble of classifier chains (each of length
    # N) can achieve a higher Jaccard similarity score than N independent
    # models
    X, Y = generate_multilabel_dataset_with_correlations()
    X_train = X[:600, :]
    X_test = X[600:, :]
    Y_train = Y[:600, :]
    Y_test = Y[600:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert_greater(jaccard_score(Y_test, Y_pred_chain, average='samples'),
                   jaccard_score(Y_test, Y_pred_ovr, average='samples'))
def test_classifier_chain_random_order():
    # Fit classifier chain with random order
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain_random = ClassifierChain(LogisticRegression(),
                                              order='random',
                                              random_state=42)
    classifier_chain_random.fit(X, Y)
    Y_pred_random = classifier_chain_random.predict(X)

    assert_not_equal(list(classifier_chain_random.order), list(range(4)))
    assert_equal(len(classifier_chain_random.order_), 4)
    assert_equal(len(set(classifier_chain_random.order_)), 4)

    classifier_chain_fixed = \
        ClassifierChain(LogisticRegression(),
                        order=classifier_chain_random.order_)
    classifier_chain_fixed.fit(X, Y)
    Y_pred_fixed = classifier_chain_fixed.predict(X)

    # Randomly ordered chain should behave identically to a fixed order chain
    # with the same order.
    assert_array_equal(Y_pred_random, Y_pred_fixed)
def naive_base(params):
    building_list = params[0]
    n_list = params[1]
    target_building = params[2]
    inc_num = params[3]
    iter_num = params[4]

    accuracy_list = list()
    micro_f1_list = list()
    macro_f1_list = list()

    for iter_i in range(0, iter_num):
        sentence_dict = dict()
        truth_dict = dict()
        if iter_i == 0:
            learning_srcids = list()

        for building, n in zip(building_list, n_list):
            if building == target_building:
                n += iter_i * inc_num
            if building != 'ghc':
                (sensorDF, srcid_list, name_list, jciname_list, desc_list,
                 unit_list, bacnettype_list) = toker.parse_sentences(building)
                for srcid, name, jciname, desc in \
                        zip(srcid_list, name_list, jciname_list, desc_list):
                    sentence_dict[srcid] = list(
                        map(replacer, name + jciname + desc))
            else:
                with open('metadata/{0}_sentence_dict_justseparate.json'.format(
                        building), 'r') as fp:
                    curr_sentence_dict = json.load(fp)
                curr_sentence_dict = dict([
                    (srcid, list(map(replacer, sentence)))
                    for srcid, sentence in curr_sentence_dict.items()
                ])
                sentence_dict.update(curr_sentence_dict)

            with open('metadata/{0}_ground_truth.json'.format(building),
                      'r') as fp:
                truth_dict.update(json.load(fp))
            label_dict = get_label_dict(building)
            srcids = list(truth_dict.keys())

            if iter_i == 0:
                learning_srcids += select_random_samples(
                    building, srcids, n, True,
                    token_type='justseparate',
                    reverse=True,
                    cluster_dict=None,
                    shuffle_flag=False)
            else:
                learning_srcids += new_srcids * 3

            if building == target_building:
                test_srcids = [
                    srcid for srcid in label_dict.keys()
                    if srcid not in learning_srcids
                ]

        binarizer = MultiLabelBinarizer().fit(truth_dict.values())
        vectorizer = TfidfVectorizer(tokenizer=tokenizer).fit(
            list(map(joiner, sentence_dict.values())))

        learning_doc = [
            ' '.join(sentence_dict[srcid]) for srcid in learning_srcids
        ]
        learning_vect_doc = vectorizer.transform(learning_doc)
        learning_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in learning_srcids])

        #classifier = RandomForestClassifier(n_estimators=200, n_jobs=1)
        classifier = ClassifierChain(RandomForestClassifier())
        classifier.fit(learning_vect_doc, learning_truth_mat)

        test_doc = [' '.join(sentence_dict[srcid]) for srcid in test_srcids]
        test_vect_doc = vectorizer.transform(test_doc)

        pred_mat = classifier.predict(test_vect_doc)
        prob_mat = classifier.predict_proba(test_vect_doc)

        # Query Stage for Active Learning
        entropies = [get_entropy(prob) for prob in prob_mat]
        sorted_entropies = sorted([(test_srcids[i], entropy)
                                   for i, entropy in enumerate(entropies)],
                                  key=itemgetter(1),
                                  reverse=True)
        added_cids = set()
        """
        for srcid in learning_srcids:
            cid = find_keys(srcid, cluster_dict, crit=lambda x,y: x in y)[0]
            added_cids.add(cid)
        """
        new_srcids = []
        new_srcid_cnt = 0
        cluster_dict = get_cluster_dict(target_building)
        for srcid, entropy in sorted_entropies:
            if srcid not in learning_srcids:
                the_cid = None
                for cid, cluster in cluster_dict.items():
                    if srcid in cluster:
                        the_cid = cid
                        break
                if the_cid in added_cids:
                    continue
                added_cids.add(the_cid)
                new_srcids.append(srcid)
                new_srcid_cnt += 1
                if new_srcid_cnt == inc_num:
                    break

        pred_tagsets_list = binarizer.inverse_transform(pred_mat)
        pred_tagsets_dict = dict([
            (srcid, pred_tagset)
            for srcid, pred_tagset in zip(test_srcids, pred_tagsets_list)
        ])

        correct_cnt = 0
        incorrect_cnt = 0
        for i, srcid in enumerate(test_srcids):
            pred = pred_tagsets_dict[srcid]
            true = truth_dict[srcid]
            if set(pred_tagsets_dict[srcid]) != set(truth_dict[srcid]):
                incorrect_cnt += 1
            else:
                correct_cnt += 1

        test_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in test_srcids])
        if not isinstance(pred_mat, np.ndarray):
            pred_mat = pred_mat.toarray()
        if not isinstance(test_truth_mat, np.ndarray):
            test_truth_mat = test_truth_mat.toarray()

        accuracy = get_accuracy(test_truth_mat, pred_mat)
        micro_f1 = get_micro_f1(test_truth_mat, pred_mat)
        #_, _, macro_f1, _ = precision_recall_fscore_support(
        #    test_truth_mat, pred_mat, average='macro')
        macro_f1 = get_macro_f1(test_truth_mat, pred_mat)
        accuracy_list.append(accuracy * 100)
        micro_f1_list.append(micro_f1 * 100)
        macro_f1_list.append(macro_f1 * 100)

    return accuracy_list, macro_f1_list
# Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
pred_ovr = ovr.predict(X_test)

from sklearn.metrics import jaccard_similarity_score
ovr_score = jaccard_similarity_score(Y_test, pred_ovr)
ovr_score

from sklearn.multioutput import ClassifierChain

cc = ClassifierChain(LogisticRegression(), order='random', random_state=42)
cc.fit(X_train, Y_train)
pred_cc = cc.predict(X_test)
cc_score = jaccard_similarity_score(Y_test, pred_cc)
cc_score

chains = [
    ClassifierChain(LogisticRegression(), order='random', random_state=42 + i)
    for i in range(10)
]
for chain in chains:
    chain.fit(X_train, Y_train)

pred_chains = np.array([chain.predict(X_test) for chain in chains])
chain_scores = [
    jaccard_similarity_score(Y_test, pred_chain)
    for pred_chain in pred_chains
]
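# The script above scores each of the ten chains separately. The usual next step,
# as in scikit-learn's classifier chain example, is to average the chains'
# probability outputs into a single ensemble prediction. Below is a self-contained
# sketch of that step on synthetic data; the dataset, the 0.5 threshold and the use
# of jaccard_score (the newer name for jaccard_similarity_score) are illustrative
# assumptions, not taken from the original script.
import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain

# small synthetic multi-label problem, purely for illustration
X, Y = make_multilabel_classification(n_samples=300, n_classes=5, random_state=0)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

# ten chains with different random label orders, as in the script above
chains = [ClassifierChain(LogisticRegression(), order='random',
                          random_state=42 + i) for i in range(10)]
for chain in chains:
    chain.fit(X_train, Y_train)

# average the per-chain probabilities and binarize at 0.5
Y_pred_proba = np.array([chain.predict_proba(X_test) for chain in chains])
Y_pred_ensemble = Y_pred_proba.mean(axis=0) >= 0.5

print(jaccard_score(Y_test, Y_pred_ensemble, average='samples'))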
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import MultiLabelBinarizer

ir_data = pd.read_csv("../../data/extracted_Features.csv")
ir_data.drop('Unnamed: 0', inplace=True, axis=1)

label = list(ir_data["label"])
y_lab = [lab.split(" ") for lab in label]
bin = MultiLabelBinarizer()
y = bin.fit_transform(y_lab)

ir_data.drop("label", inplace=True, axis=1)
X_train, X_test, y_train, y_test = train_test_split(ir_data, y, test_size=0.2)

clf = AdaBoostClassifier(n_estimators=50)
classifier = ClassifierChain(clf)
model = classifier.fit(X=X_train, Y=y_train)
predictions = classifier.predict(X=X_test)

cm = confusion_matrix(y_true=y_test.argmax(axis=1),
                      y_pred=predictions.argmax(axis=1))
print(cm)
print(bin.classes_)
print(predictions.argmax(axis=1))
label = list(ir_data["label"]) y_lab = [lab.split(" ") for lab in label] bin = MultiLabelBinarizer() y = bin.fit_transform(y_lab) ir_data.drop("label", inplace=True, axis=1) X_train, X_test, y_train, y_test = train_test_split(ir_data, y, test_size=0.2) print(X_train.shape) print(X_test.shape) print(y_train.shape) print(y_test.shape) def list_comparison(list1, list2): for ind in range(0, len(list1)): if list1[ind] != list2[ind]: return False return True clf = AdaBoostClassifier(n_estimators=50) classifier = ClassifierChain(clf) classifier.fit(X_train, y_train) predicted_labels = classifier.predict(X_test)
def train_model(X_train, y_train, seed, ccru_version, base_classifier,
                X_val, y_val, feature_subsets_per_cc=[]):
    pid = os.getpid()
    print('The id of ' + str(seed) + ' is :' + str(pid))
    # print('Train ecc: '+str(seed)+' started')

    if ccru_version == 'standard':
        model = ClassifierChain(base_classifier, order='random',
                                random_state=seed)
    elif ccru_version == 'eccru' or ccru_version == 'eccru2' \
            or ccru_version == 'eccru3':
        model = CCRU(base_classifier, order='random', random_state=seed)
    elif ccru_version == 'binary_relevance':
        model = SVC(gamma='auto', kernel='linear')
    else:
        print('Cannot recognize ccru version!!!!')

    class_1 = 1
    class_2 = 0
    if -1 in y_train:
        class_2 = -1

    if ccru_version == 'binary_relevance':
        class_1_counter = np.count_nonzero(y_train[:, 0] == class_1)
        class_2_counter = np.count_nonzero(y_train[:, 0] == class_2)
        # class_1_counter = y_train.flatten().tolist()[0].count(class_1)
        # class_2_counter = y_train.flatten().tolist()[0].count(class_2)
        if class_1_counter <= class_2_counter:
            minority_class = class_1
            majority_class = class_2
            minority_counter = class_1_counter
        else:
            minority_class = class_2
            majority_class = class_1
            minority_counter = class_2_counter

        sampled_index = [
            index for index, label in enumerate(y_train)
            if label == minority_class
        ]
        sampled_y = [minority_class] * minority_counter
        temp_sampled_index = [
            index for index, label in enumerate(y_train)
            if label == majority_class
        ]
        sampled_index.extend(
            random.sample(temp_sampled_index, minority_counter))
        sampled_y.extend([majority_class] * minority_counter)

        print('Train binary_relevance: ' + str(seed) + ' started')
        print('training on ' + str(len(sampled_y)))
        if len(feature_subsets_per_cc) != 0:
            trained_model = model.fit(
                X_train[np.array(sampled_index), feature_subsets_per_cc[seed]],
                y_train, X_val, y_val)
        else:
            trained_model = model.fit(X_train[np.array(sampled_index), :],
                                      sampled_y)
    else:
        print('Train ecc: ' + str(seed) + ' started ')
        if len(feature_subsets_per_cc) != 0:
            trained_model = model.fit(X_train[:, feature_subsets_per_cc[seed]],
                                      y_train, X_val, y_val)
        else:
            trained_model = model.fit(X_train, y_train, X_val, y_val)

    print('Train model: ' + str(seed) + ' ended')
    return trained_model
X = combine_2_feats('content', 'structural')
y = load_labels()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

accs = np.zeros(10)
precs = np.zeros(10)
recs = np.zeros(10)
f1s = np.zeros(10)

for i in range(10):
    # model = ClassifierChain(LinearSVC(C=1, max_iter=1000, fit_intercept=True))
    # model = ClassifierChain(AdaBoostClassifier())
    # model = MLkNN(k=3, s=0.1)
    model = ClassifierChain(
        RandomForestClassifier(n_estimators=1500,
                               min_samples_split=7,
                               min_samples_leaf=7,
                               max_features='sqrt'))
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accs[i] = jaccard_score(y_test, pred, average='samples')
    precs[i], recs[i], f1s[i], _ = precision_recall_fscore_support(
        y_test, pred, average='samples')

print(f"Accuracy: {accs.mean()} +- {accs.std()}")
print(f"Precision: {precs.mean()} +- {precs.std()}")
print(f"Recall: {recs.mean()} +- {recs.std()}")
print(f"F1: {f1s.mean()} +- {f1s.std()}")
features = [item.split(" ") for item in train_df] col_dicts = [make_dict(entry) for entry in features] features_val = [item.split(" ") for item in val_df] col_dicts_val = [make_dict(entry) for entry in features_val] features_df = pd.DataFrame(col_dicts) features_df_val = pd.DataFrame(col_dicts_val) features_df = features_df.fillna(0) features_df_val = features_df_val.fillna(0) print('done cleanning') X_train = np.array(features_df) Y_train = np.array(encoded_labels_df) x_val = np.array(features_df_val) y_val = np.array(encoded_labels_df_val) base_lr = LogisticRegression(max_iter=MAX_ITER, n_jobs=-1, verbose=1) int_rand = np.random.randint(1000) chain = ClassifierChain(base_lr, order='random', random_state=int_rand) chain.fit(X_train, Y_train) filename = MAX_ITER + "_" + int_ran + ".pickle" pickle.dump(chain, open(filename, 'wb')) #loaded_model = pickle.load(open(filename, 'rb')) print('start predict') Y_pred_chains = np.array([chain.predict_proba(x_val) for chain in chains])
from sklearn.multiclass import OneVsRestClassifier

t0 = clock()
onerest = OneVsRestClassifier(knn)
onerest.fit(X_train, Y_train)
Y_pred = onerest.predict(X_test)
t_onerest = clock() - t0
# print(Y_test)
# print(Y_pred)
loss_onerest = np.mean(Y_pred != Y_test)
print("Hamming loss for One vs Rest classifier: ", loss_onerest)

from sklearn.multioutput import ClassifierChain

t0 = clock()
classfierchain = ClassifierChain(knn)
classfierchain.fit(X_train, Y_train)
Y_pred = classfierchain.predict(X_test)
t_chain = clock() - t0
# print(Y_test)
# print(Y_pred)
loss_chain = np.mean(Y_pred != Y_test)
print("Hamming loss for classifier chain: ", loss_chain)

arr_epoch = np.arange(1, len(time_h) + 1) * 10
plt.figure(figsize=(12, 9))
plt.plot(arr_epoch, time_h, label='my network', c='k')
plt.axhline(t_nn, c='r', label='Default network of Sklearn')
plt.axhline(
    t_knn,
    c='g',