def test_intermediate_node_training_data():
    r"""Verify that labels on intermediate (non-leaf) nodes are honoured when fitting.

    The clothing fixture is modified so that "Bottoms" gains a child node
    ("Pants") and therefore becomes an intermediate node, while the training
    labels still contain "Bottoms" and never "Pants". After fitting, the local
    classifier stored on the "Mens" node must list "Bottoms" among its classes.

    """
    graph, (X, y) = make_clothing_graph_and_data(root=ROOT)

    # Attaching a child to "Bottoms" turns it into an intermediate node
    # that nevertheless has training data of its own.
    graph.add_edge("Bottoms", "Pants")

    # Sanity-check the fixture: only the intermediate label occurs in y.
    pants_present = any(label == "Pants" for label in y)
    assert_that(pants_present, is_(False))
    bottoms_present = any(label == "Bottoms" for label in y)
    assert_that(bottoms_present, is_(True))

    estimator = LogisticRegression(
        solver="lbfgs",
        max_iter=1_000,
        multi_class="multinomial",
    )
    hierarchical_clf = HierarchicalClassifier(
        estimator,
        class_hierarchy=graph,
        algorithm="lcpn",
        root=ROOT,
    )
    hierarchical_clf.fit(X, y)

    # The non-terminal node with training data must be one of the classes
    # known to its parent's local classifier.
    mens_node = hierarchical_clf.graph_.nodes()["Mens"]
    parent_classes = mens_node["classifier"].classes_
    assert_that(parent_classes, has_item("Bottoms"))
def classify_digits():
    r"""Fit and evaluate a hierarchical classifier on a subset of the digits data.

    Demonstrates leaf classification over a nontrivial hierarchy built on top
    of the handwritten digits dataset:

                <ROOT>
               /      \
              A        B
             / \       |  \
            1   7      C   9
                     /   \
                    3     8

    Prints the raw data, a flat classification report and the hierarchical
    F-beta score computed through the multi-label wrapper.

    """
    hierarchy = {
        ROOT: ["A", "B"],
        "A": ["1", "7"],
        "B": ["C", "9"],
        "C": ["3", "8"],
    }

    # Each node of the hierarchy gets its own SVD + RBF-SVM pipeline.
    reducer = TruncatedSVD(n_components=24)
    svc = svm.SVC(gamma=0.001, kernel="rbf", probability=True)
    pipeline = make_pipeline(reducer, svc)

    model = HierarchicalClassifier(
        base_estimator=pipeline,
        class_hierarchy=hierarchy,
    )

    X, y = make_digits_dataset(targets=[1, 7, 3, 8, 9], as_str=False)
    print(type(X), X.shape, X)

    # Targets are cast to strings so label typing is consistent with the
    # (string) node ids used throughout the hierarchy.
    y = y.astype(str)
    print(type(y[0]), y.shape, y)

    splits = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=RANDOM_STATE,
    )
    X_train, X_test, y_train, y_test = splits

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)

    # Hierarchical metrics operate on multi-label encodings of the targets,
    # which the multi_labeled context manager provides.
    with multi_labeled(y_test, y_pred, model.graph_) as (y_test_, y_pred_, graph_):
        h_fbeta = h_fbeta_score(y_test_, y_pred_, graph_)
        print("h_fbeta_score: ", h_fbeta)
def classify_digits():
    r"""Fit and evaluate a hierarchical classifier on a subset of the digits data.

    Builds the following class hierarchy on top of the handwritten digits
    dataset, trains one SVD + RBF-SVM pipeline per node and prints a flat
    classification report for a held-out 20% test split:

                <ROOT>
               /      \
              A        B
             / \     / | \
            1   7   3  8  9

    The docstring is a raw string because the ASCII tree contains
    backslashes, which would otherwise be parsed as (invalid) escape
    sequences.

    """
    # Leaf ids are strings, like the inner node ids, so that every label in
    # the hierarchy has the same type.
    class_hierarchy = {
        ROOT: ["A", "B"],
        "A": ["1", "7"],
        "B": ["3", "8", "9"],
    }
    base_estimator = make_pipeline(
        TruncatedSVD(n_components=24),
        svm.SVC(
            gamma=0.001,
            kernel="rbf",
            probability=True,
        ),
    )
    clf = HierarchicalClassifier(
        base_estimator=base_estimator,
        class_hierarchy=class_hierarchy,
    )

    X, y = make_digits_dataset(
        targets=[1, 7, 3, 8, 9],
        as_str=False,
    )
    # Cast the targets to strings so we have consistent typing of labels
    # across the hierarchy (inner nodes "A"/"B" are strings).
    y = y.astype(str)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=RANDOM_STATE,
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("Classification Report:\n", classification_report(y_test, y_pred))
) print("training classifier") print(len(x_train_str), len(y_train)) X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(x_train_str, y_train, random_state=42, test_size=0.2) print('X_train_s.shape', len(X_train_s), len(X_train_s[0])) for x in X_train_s: print(len(x)) sys.exit(1) print('y_train_s.shape', len(y_train_s), y_train_s[0].shape) clf.fit(X_train_s, y_train_s) print("predicting") y_pred_scores_s = clf.predict_proba(X_test_s) y_pred_scores_s[np.where(y_pred_scores_s == 0)] = -10 y_pred_s = y_pred_scores_s > -0.25 print( 'f1 micro:', f1_score(y_true=y_test_s, y_pred=y_pred_s[:, :y_test_s.shape[1]], average='micro')) print( 'f1 macro:', f1_score(y_true=y_test_s,
def main():
    """Train and evaluate hierarchical thick/thin/glue classifiers over a grid of C.

    Steps (all paths are hard-coded relative to the working directory):

    1. If the complete annotation database does not exist yet, create it by
       copying the incomplete one and applying the thick/thin annotations.
    2. Load the train/validation split from ``train_val_split.p`` or create
       and cache it.
    3. Load flakes and features from ``train_val_data.p`` or compute and
       cache them.
    4. Standardise the features with the training mean/std and save those
       statistics to ``normfea.p``.
    5. For every C in the grid, load a cached ``HierarchicalClassifier`` or
       fit and cache a new one, then print accuracies, row-normalised
       confusion matrices and per-class average precision, and save a
       precision/recall plot.

    The classifier type is read from the module-level
    ``hyperparams['clf_method']`` (``'hielinearsvm'`` or ``'hierbfsvm'``).

    Raises:
        NotImplementedError: if a classifier has to be fitted and
            ``clf_method`` is not one of the supported values.
    """
    import shutil

    from sklearn.metrics import (
        accuracy_score,
        average_precision_score,
        confusion_matrix,
        precision_recall_curve,
    )
    from sklearn.pipeline import make_pipeline
    from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
    from sklearn_hierarchical_classification.constants import ROOT

    subexp_name = 'YoungJaeShinSamples/4'
    anno_incomplete_file = '../data/data_jan2019_anno/anno_all_incomplete_YoungJaeShinSamples_4_useryoungjae.db'
    anno_complete_file = '../data/data_jan2019_anno/anno_all_YoungJaeShinSamples_4_useryoungjae.db'
    # anno_complete_file = '../data/data_jan2019_anno/anno_all_YoungJaeShinSamples_4_usertest0123.db'
    thickthin_anno_file = '../data/data_jan2019_anno/anno_thickthin_v2_YoungJaeShinSamples_4_useryoungjae.db'
    # thickthin_anno_file = '../data/data_jan2019_anno/anno_thickthin_v2_YoungJaeShinSamples_4_usertest0123.db'

    data_path = os.path.join('../data/data_jan2019', subexp_name)
    result_path = os.path.join('../results/data_jan2019_script/mat', subexp_name)
    clf_path = os.path.join(
        '../results/data_jan2019_script/thickthinglue_clf_complete',
        subexp_name)
    os.makedirs(clf_path, exist_ok=True)

    # Merge the annotations: start from the incomplete database and apply the
    # thick/thin annotations on top of it.
    # NOTE(review): the update step is assumed to belong to this one-time
    # merge (i.e. it only runs when the complete file is first created) --
    # confirm against the intended workflow.
    if not os.path.exists(anno_complete_file):
        # shutil.copy raises on failure instead of silently continuing the
        # way a failed shell `cp` would.
        shutil.copy(anno_incomplete_file, anno_complete_file)
        thickthin_oriname_newflakeids = readthickthindb(thickthin_anno_file)
        n_thickthin_flakes = len(thickthin_oriname_newflakeids)
        print(n_thickthin_flakes)
        updatedb(anno_complete_file, thickthin_oriname_newflakeids)

    # Get the train/val split (cached on disk).
    # NOTE: the pickle caches below are files this script wrote itself; never
    # point these paths at untrusted data.
    split_name = os.path.join(clf_path, 'train_val_split.p')
    if os.path.exists(split_name):
        with open(split_name, 'rb') as f:
            to_load = pickle.load(f)
        train_names = to_load['train_names']
        train_labels = to_load['train_labels']
        val_names = to_load['val_names']
        val_labels = to_load['val_labels']
    else:
        itemname_labels = readdb(anno_complete_file)
        train_names, train_labels, val_names, val_labels = split_trainval(
            itemname_labels)
        to_save = {
            'train_names': train_names,
            'train_labels': train_labels,
            'val_names': val_names,
            'val_labels': val_labels,
        }
        with open(split_name, 'wb') as f:
            pickle.dump(to_save, f)

    # Load flakes and their features (cached on disk).
    flake_save_name = os.path.join(clf_path, 'train_val_data.p')
    if os.path.exists(flake_save_name):
        with open(flake_save_name, 'rb') as f:
            to_load = pickle.load(f)
        train_flakes = to_load['train_flakes']
        train_feats = to_load['train_feats']
        val_flakes = to_load['val_flakes']
        val_feats = to_load['val_feats']
    else:
        img_names = os.listdir(data_path)
        img_names.sort()
        # Each image is paired with its result file: same name with the last
        # four characters (the extension) replaced by '.p'.
        img_flakes = Parallel(n_jobs=8)(
            delayed(load_one_image)(
                os.path.join(data_path, img_name),
                os.path.join(result_path, img_name[:-4] + '.p'))
            for img_name in img_names)

        # Pick out the flakes that belong to each split.
        train_flakes, train_feats = locate_flakes(train_names, img_flakes,
                                                  img_names)
        val_flakes, val_feats = locate_flakes(val_names, img_flakes,
                                              img_names)
        to_save = {
            'train_flakes': train_flakes,
            'train_feats': train_feats,
            'val_flakes': val_flakes,
            'val_feats': val_feats,
        }
        with open(flake_save_name, 'wb') as f:
            pickle.dump(to_save, f)
    print('loading done')

    # Normalize the data with statistics of the training set only, and save
    # those statistics alongside the classifiers.
    mean_feat = np.mean(train_feats, axis=0, keepdims=True)
    std_feat = np.std(train_feats, axis=0, keepdims=True)
    norm_fea = {'mean': mean_feat, 'std': std_feat}
    with open(os.path.join(clf_path, 'normfea.p'), 'wb') as f:
        pickle.dump(norm_fea, f)
    train_feats -= mean_feat
    train_feats = train_feats / std_feat
    val_feats -= mean_feat
    val_feats = val_feats / std_feat

    # Run the classifiers. C is swept over the grid below, so
    # hyperparams['C'] is not used.
    method = hyperparams['clf_method']
    print(method)
    Cs = [
        0.001,
        0.01,
        0.1,
        0.25,
        0.5,
        0.75,
        1,
        2.5,
        5,
    ]
    kernel_by_method = {'hielinearsvm': 'linear', 'hierbfsvm': 'rbf'}

    # Classes "0" and "1" share the inner node "A"; class "2" hangs directly
    # off the root.
    class_hierarchy = {
        ROOT: ["A", "2"],
        "A": ["0", "1"],
    }

    uniquelabels = [0, 1, 2]
    legends = ['thick', 'thin', 'glue']

    for C in Cs:
        clf_save_path = os.path.join(
            clf_path, 'feanorm_weighted_classifier-%s-%f.p' % (method, C))
        if os.path.exists(clf_save_path):
            with open(clf_save_path, 'rb') as f:
                clf = pickle.load(f)
        else:
            kernel = kernel_by_method.get(method)
            if kernel is None:
                raise NotImplementedError(
                    'unsupported clf_method: %r' % (method,))
            base_estimator = make_pipeline(
                SVC(gamma='auto', kernel=kernel, probability=True, C=C),
            )
            clf = HierarchicalClassifier(
                base_estimator=base_estimator,
                class_hierarchy=class_hierarchy,
            )
            # Node ids in the hierarchy are strings, so the labels must be too.
            train_labels_str = [str(_) for _ in train_labels]
            clf.fit(train_feats, train_labels_str)
            with open(clf_save_path, 'wb') as f:
                pickle.dump(clf, f)

        # Predictions come back as string node ids; convert to int so they
        # compare against the integer ground-truth labels.
        train_pred_cls = [int(_) for _ in clf.predict(train_feats)]
        train_pred_scores = clf.predict_proba(train_feats)
        val_pred_cls = [int(_) for _ in clf.predict(val_feats)]
        val_pred_scores = clf.predict_proba(val_feats)

        clf_vis_path = os.path.join(clf_path, subexp_name, 'vis',
                                    'feanorm_weighted_%s-%f' % (method, C))
        os.makedirs(clf_vis_path, exist_ok=True)

        train_acc = accuracy_score(train_labels, train_pred_cls)
        val_acc = accuracy_score(val_labels, val_pred_cls)
        # Row-normalise so each row shows the distribution of predictions
        # for one true class.
        train_conf = confusion_matrix(train_labels, train_pred_cls)
        train_conf = train_conf / np.sum(train_conf, 1, keepdims=True)
        val_conf = confusion_matrix(val_labels, val_pred_cls)
        val_conf = val_conf / np.sum(val_conf, 1, keepdims=True)
        print(train_conf)
        print(val_conf)

        # The score columns follow clf.classes_, which also contains the
        # inner node "A", so column k is not necessarily class k. Look the
        # column up by class label instead of indexing by the raw label.
        class_columns = {
            str(cls): col for col, cls in enumerate(clf.classes_)
        }

        # Per-class average precision and precision/recall curves.
        train_aps = []
        val_aps = []
        fig = plt.figure()
        ax = fig.add_subplot(111)
        for label, legend in zip(uniquelabels, legends):
            col = class_columns[str(label)]
            l_train_labels = [_ == label for _ in train_labels]
            l_val_labels = [_ == label for _ in val_labels]

            train_aps.append(
                average_precision_score(l_train_labels,
                                        train_pred_scores[:, col]))
            val_aps.append(
                average_precision_score(l_val_labels,
                                        val_pred_scores[:, col]))

            precision_l, recall_l, _ = precision_recall_curve(
                np.array(l_val_labels, dtype=np.uint8),
                val_pred_scores[:, col])
            print(label,
                  getPrecisionAtRecall(precision_l, recall_l, 0.90),
                  getPrecisionAtRecall(precision_l, recall_l, 0.95))
            ax.plot(recall_l, precision_l, label=legend)

        plt.legend()
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.savefig(os.path.join(clf_path,
                                 'feanorm_weighted_%s-%f.png' % (method, C)),
                    dpi=300)
        plt.close(fig)

        print(train_aps)
        print(val_aps)
        print('%s-%f: train: %.4f, val: %.4f, ap train: %4f, ap val: %4f' %
              (method, C, train_acc, val_acc, np.mean(train_aps),
               np.mean(val_aps)))
def svm_train():
    """Train and evaluate a hierarchical TF-IDF + calibrated linear-SVM classifier.

    Reads the train/test/prediction sets from the module-level ``train_path``,
    ``test_path`` and ``pred_path``, fits a TF-IDF vectorizer on all three
    text collections, and trains a ``HierarchicalClassifier`` over the
    module-level ``label1_label2`` hierarchy on the training labels.

    Logs accuracy and a classification report for the training set and for
    the test set (against ``test_y2``), then logs a "top-2" evaluation on the
    test set in which a sample counts as correct when its true label is among
    the two highest-scoring classes.

    Persisting the vectorizer and the model to disk is currently disabled.
    """
    from sklearn.metrics import classification_report

    train_x, train_y, _, _ = get_data_set(train_path)
    test_x, _, _, test_y2 = get_data_set(test_path)
    pred_x, _, _, _ = get_data_set(pred_path)

    logging.info('train {} test{}'.format(len(train_x), len(test_x)))
    t = time.time()

    # The vocabulary/IDF statistics are fitted on every available text
    # (train, test and prediction sets). Only the fitted state is needed
    # here, so fit() is used rather than fit_transform().
    data_set = train_x + test_x + pred_x
    vec = TfidfVectorizer(ngram_range=(1, 3),
                          min_df=10,
                          max_df=0.9,
                          use_idf=True,
                          smooth_idf=True,
                          sublinear_tf=True)
    vec.fit(data_set)

    trn_term_doc = vec.transform(train_x)
    print(label1_label2)
    tfidf_time = time.time()
    logging.info('time spend {}'.format(tfidf_time - t))

    logging.info('begin svm ')
    # LinearSVC has no predict_proba; the calibration wrapper supplies it.
    lin_clf = svm.LinearSVC(C=1)
    lin_clf = CalibratedClassifierCV(lin_clf)
    clf = HierarchicalClassifier(
        base_estimator=lin_clf,
        class_hierarchy=label1_label2,
    )
    clf.fit(trn_term_doc, train_y)
    print(clf.classes_)
    logging.info('end svm ')

    # Training-set evaluation.
    train_preds = clf.predict(trn_term_doc)
    train_preds_prob = clf.predict_proba(trn_term_doc)
    print(len((clf.classes_)), train_preds_prob.shape)
    for reg, prob in zip(train_preds, train_preds_prob):
        # Predicted label next to the index of the highest-scoring class.
        print(reg, list(prob.argsort()[-1:][::-1]))

    logging.info('train {} accuracy_score {}, \n {}'.format(
        'train', accuracy_score(train_y, train_preds),
        classification_report(train_y, train_preds)))
    t2 = time.time()
    logging.info('time spend {}'.format(t2 - t))

    # Test-set evaluation; accuracy and report use the same labels and
    # predictions so the logged numbers are consistent with each other.
    test_term_doc = vec.transform(test_x)
    test_preds_1 = clf.predict_proba(test_term_doc)
    test_preds = clf.predict(test_term_doc)
    logging.info('train {} accuracy_score {}, \n {}'.format(
        'test', accuracy_score(test_y2, test_preds),
        classification_report(test_y2, test_preds)))

    # Top-2 evaluation: for each sample take the indices (into clf.classes_)
    # of the two highest scores, best first.
    top2_indices = [list(prob.argsort()[-2:][::-1]) for prob in test_preds_1]

    test_y_name = []
    test_preds_name = []
    for real, pred in zip(test_y2, top2_indices):
        # Default to the best-scoring class; switch to the true class if it
        # appears anywhere in the top two.
        prd = pred[0]
        print(real, pred)
        for pr in pred:
            if real == clf.classes_[pr]:
                prd = pr
        test_y_name.append(real)
        test_preds_name.append(clf.classes_[prd])

    logging.info('{} model on {} data accuracy_score {} top2 test\n {}'.format(
        "train", test_path, accuracy_score(test_y_name, test_preds_name),
        classification_report(test_y_name, test_preds_name)))
del hierarchy["ROOT"] class_hierarchy = hierarchy bclf = OneVsRestClassifier(LinearSVC()) base_estimator = make_pipeline(vectorizer, bclf) clf = HierarchicalClassifier(base_estimator=base_estimator, class_hierarchy=class_hierarchy, algorithm="lcn", training_strategy="siblings", preprocessing=True, mlb=mlb, use_decision_function=True) print("training classifier") clf.fit(X_train_raw, y_train[:, :]) print("predicting") y_pred_scores = clf.predict_proba(X_dev_raw) y_pred_scores[np.where(y_pred_scores == 0)] = -10 y_pred = y_pred_scores > -0.25 if train == 1: print('f1 micro:', f1_score(y_true=y_dev, y_pred=y_pred, average='micro')) print('f1 macro:', f1_score(y_true=y_dev, y_pred=y_pred, average='macro')) print(classification_report(y_true=y_dev, y_pred=y_pred)) else: import networkx as nx