def main(input_dir='./datasets/rcv1org', output_dir=settings.data_dir_20ng,
         split_randomly=True):
    logger = logging.getLogger(__name__)
    logger.info(logconfig.key_log(logconfig.DATA_NAME, input_dir))

    # Collect the per-depth '.filtered' files produced by filter_multilabels.
    paths = []
    for file_name in os.listdir(input_dir):
        if file_name.endswith('filtered'):
            paths.append(os.path.join(input_dir, file_name))
    paths.sort()

    logger.info(logconfig.key_log(logconfig.FUNCTION_NAME, 'build_class_tree'))
    deltas, classes = build_class_tree(paths, output_dir)

    logger.info(logconfig.key_log(logconfig.FUNCTION_NAME, 'split'))
    if split_randomly:
        data = tools.load(os.path.join(output_dir, 'depth%d.txt' % len(paths)))
        train_idx, test_idx = split_train_test(data, classes[-1],
                                               settings.train_ratio, output_dir)
    else:
        shutil.copyfile(os.path.join(input_dir, 'train_test_idx.npz'),
                        os.path.join(output_dir, 'train_test_idx.npz'))

    logger.info(logconfig.key_log(logconfig.FUNCTION_NAME, 'generate_hier_info'))
    generate_hier_info(deltas, classes, output_dir)

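# build_class_tree, split_train_test and generate_hier_info are repo helpers
# defined elsewhere.  As an illustration only, a stratified split in the
# spirit of split_train_test might look like the sketch below; the real
# signature and on-disk output are assumptions.
import numpy as np

def split_train_test_sketch(labels, train_ratio, seed=0):
    """Return (train_idx, test_idx), sampling train_ratio of each class."""
    labels = np.asarray(labels)
    rng = np.random.RandomState(seed)
    train_idx, test_idx = [], []
    for c in np.unique(labels):
        idx = np.flatnonzero(labels == c)
        rng.shuffle(idx)
        cut = int(len(idx) * train_ratio)
        train_idx.extend(idx[:cut])
        test_idx.extend(idx[cut:])
    return np.asarray(train_idx), np.asarray(test_idx)
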
def main(input_dir=settings.data_dir_20ng, label_ratio=0.1, time=0,
         sparse_format=False):
    # `time` indexes the random repetition and doubles as the split seed.
    logger = logging.getLogger(__name__)
    logger.info(logconfig.key_log(logconfig.DATA_NAME, input_dir))

    # Load the deepest 'depth*' file, the class lists and the train/test split.
    depth_files = []
    for file_name in os.listdir(input_dir):
        if file_name.startswith('depth'):
            depth_files.append(file_name)
    depth_files.sort()
    data = tools.load(os.path.join(input_dir, depth_files[-1]))
    classes = tools.load(os.path.join(input_dir, settings.classes_file))
    train_test_idx = tools.load(
        os.path.join(input_dir, settings.train_test_idx_file))
    train_idx = train_test_idx['train_idx']

    output_dir = os.path.join(input_dir, str(label_ratio), str(time))
    logger.info(logconfig.key_log(logconfig.FUNCTION_NAME, 'split_label_unlabel'))
    label_idx, unlabel_idx = split_label_unlabel(data, train_idx, classes[-1],
                                                 label_ratio, output_dir,
                                                 seed=time)

    logger.info(logconfig.key_log(logconfig.FUNCTION_NAME, 'process_dataset'))
    [labeled_data_manager, unlabeled_data_manager, test_data_manager], vocab_info = \
        process_dataset(input_dir, output_dir, sparse_format=sparse_format)
    logger.info(logconfig.key_log('VocabularySize', str(len(vocab_info['stoi']))))

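# Assumed on-disk layout after this step (file names come from settings):
#   <input_dir>/
#       depth1.txt ... depthK.txt              # per-depth corpora
#       <settings.classes_file>, <settings.train_test_idx_file>, ...
#       <label_ratio>/<time>/                  # outputs of split_label_unlabel
#                                              # and process_dataset
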
def main(input_dir=settings.data_dir_20ng, label_ratio=0.1, time=0,
         output_dir=None):
    logger = logging.getLogger(__name__)
    if output_dir is None:
        output_dir = input_dir
    tools.make_sure_path_exists(output_dir)

    deltas = tools.load(os.path.join(input_dir, settings.deltas_file))
    classes = tools.load(os.path.join(input_dir, settings.classes_file))
    nos, hier_tree = generate_hier_info(deltas, classes, input_dir)

    sub_dir = os.path.join(input_dir, str(label_ratio), str(time))
    output_dir = os.path.join(output_dir, str(label_ratio), str(time))
    tools.make_sure_path_exists(output_dir)
    logger.info(logconfig.key_log('Input dir', sub_dir))
    logger.info(logconfig.key_log('Output dir', output_dir))

    cat_hier_file = os.path.join(output_dir, settings.cat_hier_file)
    if not os.path.exists(cat_hier_file):
        shutil.copyfile(os.path.join(input_dir, settings.cat_hier_file),
                        cat_hier_file)

    # Skip the conversion if all three svmlight files already exist.
    if os.path.exists(os.path.join(output_dir, settings.labeled_svmlight_file)) and \
            os.path.exists(os.path.join(output_dir, settings.dataless_svmlight_file)) and \
            os.path.exists(os.path.join(output_dir, settings.test_svmlight_file)):
        return

    data_managers = load_data_managers(sub_dir)
    generate_svmlight_format(data_managers, nos[-1], output_dir)

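# generate_svmlight_format is repo code; for reference, scikit-learn can
# write the same sparse format directly.  A minimal, self-contained example
# (toy data, not the repo's pipeline):
from sklearn.datasets import dump_svmlight_file
from scipy.sparse import csr_matrix
import numpy as np

X_toy = csr_matrix(np.eye(3))      # toy feature matrix
y_toy = np.array([0, 1, 2])        # toy leaf-class ids
dump_svmlight_file(X_toy, y_toy, 'toy.svmlight', zero_based=True)
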
def run_level(data_managers, deltas, C=1.0, method='LR_labeled'):
    logger = logging.getLogger(__name__)
    model_name = "level" + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))
    model_list = []
    test_pres = []

    # Per-level targets: gold label indicators in the labeled setting,
    # (normalized or hardmaxed) label-description similarities otherwise.
    if 'labeled' in method:
        sims = data_managers[0].deltas
    elif 'dataless' in method:
        if settings.soft_sim:
            sims = list(map(lambda sim: normalize(sim, axis=1),
                            data_managers[0].sims))
        else:
            sims = list(map(lambda sim: hardmax(sim, axis=1),
                            data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    model_list = train_level(data_managers[0].xit, sims, C, method)
    logger.info("training time: " + str(time.time() - start))

    start = time.time()
    if 'tf-idf' in method:
        # train_level fits its models on tf-idf features, so refit the same
        # (deterministic) transformer here to transform the test features;
        # the original code referenced an undefined `tf_idf` at this point.
        tf_idf = TfidfTransformer()
        tf_idf.fit(data_managers[0].xit)
    for depth in range(len(sims)):
        if 'tf-idf' in method:
            test_pre = model_list[depth].predict(
                tf_idf.transform(data_managers[2].xit))
        else:
            test_pre = model_list[depth].predict(data_managers[2].xit)
        test_pres.append(test_pre)
    logger.info("predicting time: " + str(time.time() - start))
    return model_list, test_pres

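# hardmax is a repo helper; a plausible implementation (an assumption, not
# the repo's code) turns each row of a similarity matrix into a one-hot
# vector at its argmax:
import numpy as np

def hardmax_sketch(sim, axis=1):
    idx = np.expand_dims(np.argmax(sim, axis=axis), axis)
    out = np.zeros_like(sim)
    np.put_along_axis(out, idx, 1.0, axis=axis)
    return out
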
def run_PC(data_managers, deltas, C=1.0, method='LR_labeled', path_weights=None):
    logger = logging.getLogger(__name__)
    model_name = "PC" + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    if 'labeled' in method:
        sims = data_managers[0].deltas
    elif 'dataless' in method:
        if settings.soft_sim:
            sims = list(map(lambda sim: normalize(sim, axis=1),
                            data_managers[0].sims))
        else:
            sims = list(map(lambda sim: hardmax(sim, axis=1),
                            data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    # Train a single path classifier (PC) on root-to-leaf path scores.
    path_score = compute_path_score(sims, deltas, path_weights=path_weights)
    model = train_PC(data_managers[0].xit, path_score, C, method)
    logger.info("training time: " + str(time.time() - start))

    start = time.time()
    test_pres = predict_label_PC_pathscore(model, data_managers[2].xit, deltas)
    logger.info("predicting time: " + str(time.time() - start))
    return model, test_pres

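# compute_path_score is repo code.  One natural reading (an assumption, not
# the repo's implementation): a leaf's score is the weighted sum of the
# document's similarity to every class on the leaf's root-to-leaf path,
# where deltas[d] is the child-indicator matrix from depth d to depth d+1.
import numpy as np

def compute_path_score_sketch(sims, deltas, path_weights=None):
    depth = len(sims)
    if path_weights is None:
        path_weights = np.ones(depth)
    score = path_weights[depth - 1] * sims[depth - 1]
    for d in range(depth - 1):
        # Chain the child indicators to map depth-d classes onto leaves.
        to_leaf = deltas[d]
        for dd in range(d + 1, depth - 1):
            to_leaf = to_leaf.dot(deltas[dd])
        score = score + path_weights[d] * sims[d].dot(to_leaf)
    return score
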
def run_PSO_WD(data_managers, deltas, C=1.0, method='LR_labeled',
               soft_pathscore=True, path_weights=None, nos=None):
    logger = logging.getLogger(__name__)
    model_name = 'WD(PSO)_' + ("soft_" if soft_pathscore else "hard_") + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    if 'labeled' in method:
        sims = data_managers[0].deltas
        labels = data_managers[0].labels
    elif 'dataless' in method:
        if settings.soft_sim:
            sims = list(map(lambda sim: normalize(sim, axis=1),
                            data_managers[0].sims))
        else:
            sims = list(map(lambda sim: hardmax(sim, axis=1),
                            data_managers[0].sims))
        labels = list(map(lambda sim: np.argmax(sim, axis=-1),
                          data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    model_list = train_WD(data_managers[0].xit, sims, C, method)

    def score_function(path_weights):
        # PSO fitness: the configured F1 metric of the weight-dependent (WD)
        # path-score predictions on the labeled training data.
        labeled_pres = predict_label_WD_pathscore(
            model_list,
            data_managers[0].xit,
            deltas=(None if soft_pathscore else deltas),
            path_weights=path_weights)
        return compute_overall_p_r_f1(labels, labeled_pres,
                                      nos)[2][settings.main_metric]

    # Tune the per-depth path weights with particle swarm optimization.
    pso = PSO(path_weights, score_function,
              group_size=settings.pso_group_size,
              min_x=settings.pso_min_x, max_x=settings.pso_max_x)
    pso.update(c1=settings.pso_c1, c2=settings.pso_c2, w=settings.pso_w,
               max_iter=settings.pso_max_iter, patience=settings.pso_patience)
    path_weights = pso.get_best_x()
    logger.info("training time: " + str(time.time() - start))
    logger.info('best_path_weight: %s' % str(path_weights))

    start = time.time()
    test_pres = predict_label_WD_pathscore(
        model_list,
        data_managers[2].xit,
        deltas=(None if soft_pathscore else deltas),
        path_weights=path_weights)
    logger.info("predicting time: " + str(time.time() - start))
    return model_list, test_pres

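# The PSO class is repo code.  For reference, the canonical particle-swarm
# update that the c1/c2/w hyperparameters above control is the following
# (illustrative sketch with hypothetical names):
#   v <- w*v + c1*r1*(pbest - x) + c2*r2*(gbest - x)
#   x <- clip(x + v, min_x, max_x)
import numpy as np

def pso_step(x, v, pbest, gbest, c1, c2, w, min_x, max_x, rng):
    """One canonical PSO velocity/position update (illustration only)."""
    r1, r2 = rng.random(x.shape), rng.random(x.shape)
    v = w * v + c1 * r1 * (pbest - x) + c2 * r2 * (gbest - x)
    x = np.clip(x + v, min_x, max_x)
    return x, v
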
def run_level(data_managers, deltas, method='LR_labeled', dual=True):
    logger = logging.getLogger(__name__)
    model_name = method + "_level"
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))
    model_list = []
    y_pres = []

    if 'labeled' in method:
        sims = data_managers[0].deltas
    elif 'dataless' in method:
        if settings.soft_sim:
            sims = list(map(lambda sim: normalize(sim, axis=1),
                            data_managers[0].sims))
        else:
            sims = list(map(lambda sim: hardmax(sim, axis=1),
                            data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    max_depth = len(sims)
    # non_zero_indices = np.nonzero(data_managers[0].xit)
    # non_zero_columns = sorted(set(non_zero_indices[1]))
    if 'tf-idf' in method:
        tf_idf = TfidfTransformer()
        tf_idf.fit(data_managers[0].xit)
    else:
        tf_idf = None

    # One flat classifier per depth of the hierarchy.
    for depth in range(max_depth):
        if 'LR' in method:
            # The original hardcoded dual=True here; pass the parameter
            # through, matching the LinearSVC branch.
            model = LogisticRegression(dual=dual, solver='liblinear')
        elif 'SVM' in method:
            model = LinearSVC(dual=dual)
        else:
            raise NotImplementedError
        if 'tf-idf' in method:
            model.fit(tf_idf.transform(data_managers[0].xit),
                      np.argmax(sims[depth], axis=1))
        else:
            model.fit(data_managers[0].xit, np.argmax(sims[depth], axis=1))
        model_list.append(model)
    logger.info("training time: " + str(time.time() - start))

    start = time.time()
    for depth in range(max_depth):
        if 'tf-idf' in method:
            y_pre = model_list[depth].predict(
                tf_idf.transform(data_managers[2].xit))
        else:
            y_pre = model_list[depth].predict(data_managers[2].xit)
        y_pres.append(y_pre)
    logger.info("predicting time: " + str(time.time() - start))
    return model_list, y_pres

def filter_multilabels(input_dir):
    logger = logging.getLogger(__name__)
    logger.info(logconfig.key_log(logconfig.DATA_NAME, input_dir))

    paths = []
    for file_name in os.listdir(input_dir):
        if os.path.splitext(file_name)[-1].startswith('.depth'):
            paths.append(os.path.join(input_dir, file_name))
    paths.sort()

    valid_id_counter = Counter()
    for depth in range(len(paths)):
        # doc_topic_id[text][topic] is the set of raw lines that share the
        # same document text and the same topic at this depth.
        doc_topic_id = defaultdict(lambda: defaultdict(set))
        with open(paths[depth], 'r', encoding='utf-8') as f:
            line = f.readline()
            while line:
                line = line.strip()
                if line:
                    line_sp = line.split('\t')
                    topics = line_sp[2].split(';')
                    # A trailing ';' leaves an empty string at the end, so a
                    # single-label document yields exactly two elements.
                    if len(topics) == 2:
                        doc_topic_id[line_sp[1]][topics[0]].add(line)
                line = f.readline()

        with open(paths[depth] + '.filtered', 'w', encoding='utf-8') as f:
            for doc, y in doc_topic_id.items():
                # Drop documents that are multi-labeled at this depth.
                if len(y) > 1:
                    continue
                for xx, yy in y.items():
                    # Keep a single representative line per (text, topic) pair.
                    lines = sorted(yy)
                    line = lines[0]
                    doc_id = line.split('\t', 1)[0]
                    # Bit `depth` of valid_id_counter[doc_id] records survival
                    # at this depth; keep the document only if it also
                    # survived the previous depth.
                    if depth == 0 or (valid_id_counter[doc_id] & (1 << (depth - 1))):
                        valid_id_counter[doc_id] += (1 << depth)
                        f.write(line)
                        f.write('\n')
                    break
        logger.info(logconfig.key_log(logconfig.DEPTH, str(depth)))

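# Tiny illustration of the bitmask bookkeeping above: a document is kept at
# depth d only if bit d-1 is already set, so only documents that survive
# every depth accumulate a full mask of (1 << num_depths) - 1.
mask = 0
for d in range(3):                 # a document surviving depths 0..2
    if d == 0 or (mask & (1 << (d - 1))):
        mask += 1 << d
assert mask == 0b111
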
def run_leaf(data_managers, deltas, C=1.0, method='LR_labeled'):
    logger = logging.getLogger(__name__)
    model_name = "flat" + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    if 'labeled' in method:
        sims = data_managers[0].deltas
    elif 'dataless' in method:
        if settings.soft_sim:
            sims = list(map(lambda sim: normalize(sim, axis=1),
                            data_managers[0].sims))
        else:
            sims = list(map(lambda sim: hardmax(sim, axis=1),
                            data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    # Flat baseline: a single multi-class model over the leaf classes only.
    if 'LR' in method:
        model = LogisticRegression(C=C, solver='lbfgs',
                                   multi_class='multinomial')
    elif 'SVM' in method:
        model = LinearSVC(C=C, multi_class='crammer_singer')
    else:
        raise NotImplementedError
    if 'tf-idf' in method:
        tf_idf = TfidfTransformer()
        tf_idf.fit(data_managers[0].xit)
        model.fit(tf_idf.transform(data_managers[0].xit),
                  np.argmax(sims[-1], axis=1))
    else:
        model.fit(data_managers[0].xit, np.argmax(sims[-1], axis=1))
    logger.info("training time: " + str(time.time() - start))

    start = time.time()
    if 'tf-idf' in method:
        test_pre = model.predict(tf_idf.transform(data_managers[2].xit))
    else:
        test_pre = model.predict(data_managers[2].xit)
    logger.info("predicting time: " + str(time.time() - start))
    return model, test_pre

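# Quick illustration of the soft/hard similarity targets used throughout:
# normalize keeps a distribution-like row, hardmax commits to the top class.
# Note sklearn's normalize defaults to the L2 norm; whether the repo intends
# L1 or L2 here is an assumption.
from sklearn.preprocessing import normalize
import numpy as np

sim_demo = np.array([[0.2, 0.5, 0.3]])
print(normalize(sim_demo, axis=1))   # soft: row rescaled to unit norm
print(np.argmax(sim_demo, axis=1))   # hard: class index 1
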
def main(input_dir=settings.data_dir_20ng, label_ratio=0.1, times=1,
         classifier_names=None):
    logger = logging.getLogger(__name__)
    vocab_info = tools.load(os.path.join(input_dir, settings.vocab_file))
    vocab_size = len(vocab_info["stoi"])
    del vocab_info
    nos, hier_tree = get_hier_info(input_dir)
    if not classifier_names:
        classifier_names = [
            'LIBLINEAR_LR_primal', 'LIBLINEAR_LR_dual',
            'LIBLINEAR_SVC_primal', 'LIBLINEAR_SVC_dual'
        ]
    if label_ratio == 1.0:
        times = 1

    for mode in ["labeled", "dataless"]:
        # Axes: (time, [leaf, overall], classifier,
        #        [precision, recall, f1], [macro, micro]).
        metrics_result = np.zeros((times, 2, len(classifier_names), 3, 2))
        for i in range(times):
            sub_dir = os.path.join(input_dir, str(label_ratio), str(i))
            logger.info(logconfig.key_log(logconfig.START_PROGRAM, sub_dir))
            model_dirs = train_HierCost(sub_dir, vocab_size, classifier_names,
                                        mode)
            metrics_list = predict_HierCost(sub_dir, model_dirs, vocab_size,
                                            nos, hier_tree)
            metrics_result[i] = metrics_list

        avg_M_metrics_result = np.mean(metrics_result[:, :, :, :, 0], axis=0)
        std_M_metrics_result = np.std(metrics_result[:, :, :, :, 0], axis=0)
        avg_m_metrics_result = np.mean(metrics_result[:, :, :, :, 1], axis=0)
        std_m_metrics_result = np.std(metrics_result[:, :, :, :, 1], axis=0)

        with open(os.path.join(input_dir, str(label_ratio),
                               'LIBLINEAR_%s.csv' % mode), 'w') as f:
            csv_writer = csv.writer(f)
            # Same row layout as before, generated by loops instead of
            # twenty-four hand-written writerow calls.
            for section, title in enumerate(['Leaf', 'Overall']):
                csv_writer.writerow([title] + classifier_names)
                for metric, name in enumerate(['precision', 'recall', 'f1']):
                    csv_writer.writerow(
                        ['Macro %s avg' % name] +
                        list(avg_M_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Macro %s std' % name] +
                        list(std_M_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Micro %s avg' % name] +
                        list(avg_m_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Micro %s std' % name] +
                        list(std_m_metrics_result[section, :, metric]))
                csv_writer.writerow([])
    logger.info(logconfig.key_log(logconfig.END_PROGRAM, input_dir))

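# Reading the metrics tensor: for repetition i and classifier j,
#   metrics_result[i, 0, j, 2, 0]  -> leaf macro-F1
#   metrics_result[i, 1, j, 2, 1]  -> overall micro-F1
# (axis 1: 0 = leaf, 1 = overall; axis 3: 0/1/2 = precision/recall/f1;
#  axis 4: 0 = macro, 1 = micro).
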
def main(input_dir=settings.data_dir_20ng, label_ratio=0.1, times=1,
         classifier_names=None, C=1.0):
    logger = logging.getLogger(__name__)
    if label_ratio == 1.0:
        times = 1
    classes = tools.load(os.path.join(input_dir, settings.classes_file))
    deltas = tools.load(os.path.join(input_dir, settings.deltas_file))
    if not classifier_names:
        classifier_names = ['flatLR', 'levelLR', 'flatSVM', 'levelSVM']

    # Geometric per-depth weights for path scoring.
    path_weights = [1.0]
    for i in range(1, len(classes)):
        path_weights.append(path_weights[-1] * settings.path_weight)
    path_weights = np.asarray(path_weights)

    nos, hier_tree = get_hier_info(input_dir)
    kw = {'deltas': deltas, 'path_weights': path_weights, 'nos': nos}

    for mode in ["labeled", "dataless"]:
        # Axes: (time, [leaf, overall], classifier x C grid,
        #        [precision, recall, f1], [macro, micro]).
        metrics_result = np.zeros(
            (times, 2, len(classifier_names) * len(settings.Cs), 3, 2))
        for i in range(times):
            sub_dir = os.path.join(input_dir, str(label_ratio), str(i))
            logger.info(logconfig.key_log(logconfig.START_PROGRAM, sub_dir))
            data_managers = load_data_managers(sub_dir)
            if settings.reduce_features:
                # Keep only vocabulary columns that occur in the labeled data.
                non_zero_indices = np.nonzero(data_managers[0].xit)
                non_zero_columns = sorted(set(non_zero_indices[1]))
                for data_manager in data_managers:
                    data_manager.xit = data_manager.xit[:, non_zero_columns]
            if mode == "dataless" and np.max(
                    data_managers[2].sims[0][0]) == 0.0:
                continue
            for j, classifier_name in enumerate(classifier_names):
                for k, C in enumerate(settings.Cs):
                    kw['C'] = C
                    result = run_classifiers(classifier_name, data_managers,
                                             mode, **kw)
                    if len(data_managers[2].labels) == len(result[1]):
                        # Per-level predictions: leaf metrics from the last
                        # level, overall metrics across all levels.
                        metrics_result[i, 0, j * len(settings.Cs) + k] = \
                            compute_p_r_f1(data_managers[2].labels[-1],
                                           result[1][-1])
                        metrics_result[i, 1, j * len(settings.Cs) + k] = \
                            compute_overall_p_r_f1(data_managers[2].labels,
                                                   result[1], nos)
                    else:
                        # Leaf-only predictions: derive hierarchical metrics
                        # from the leaf labels and the tree.
                        metrics_result[i, 0, j * len(settings.Cs) + k] = \
                            compute_p_r_f1(data_managers[2].labels[-1],
                                           result[1])
                        metrics_result[i, 1, j * len(settings.Cs) + k] = \
                            compute_hier_p_r_f1(data_managers[2].labels[-1],
                                                result[1], nos, hier_tree)

        avg_M_metrics_result = np.mean(metrics_result[:, :, :, :, 0], axis=0)
        std_M_metrics_result = np.std(metrics_result[:, :, :, :, 0], axis=0)
        avg_m_metrics_result = np.mean(metrics_result[:, :, :, :, 1], axis=0)
        std_m_metrics_result = np.std(metrics_result[:, :, :, :, 1], axis=0)

        headers = []
        for classifier_name in classifier_names:
            for C in settings.Cs:
                headers.append('%s_C_%.2f' % (classifier_name, C))

        with open(os.path.join(input_dir, str(label_ratio),
                               'LR_SVM_%s.csv' % mode), 'w') as f:
            csv_writer = csv.writer(f)
            for section, title in enumerate(['Leaf', 'Overall']):
                csv_writer.writerow([title] + headers)
                for metric, name in enumerate(['precision', 'recall', 'f1']):
                    csv_writer.writerow(
                        ['Macro %s avg' % name] +
                        list(avg_M_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Macro %s std' % name] +
                        list(std_M_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Micro %s avg' % name] +
                        list(avg_m_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Micro %s std' % name] +
                        list(std_m_metrics_result[section, :, metric]))
                csv_writer.writerow([])
    logger.info(logconfig.key_log(logconfig.END_PROGRAM, sub_dir))

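# Hypothetical driver (values illustrative, not fixed by the repo): sweep
# label ratios, with the repetition count matching the <label_ratio>/<time>
# sub-directories produced by the preprocessing step.
if __name__ == '__main__':
    for ratio in (0.01, 0.1, 1.0):
        main(input_dir=settings.data_dir_20ng, label_ratio=ratio, times=5)
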
def run_TD(data_managers, deltas, C=1.0, method='LR_labeled'):
    logger = logging.getLogger(__name__)
    # 'BU' methods reuse this driver under their own name.
    if 'BU' in method:
        model_name = method
    else:
        model_name = 'TD' + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))
    model_lists = []
    test_pres = []

    if 'labeled' in method:
        labels = data_managers[0].labels
    elif 'dataless' in method:
        labels = list(map(lambda sim: np.argmax(sim, axis=1),
                          data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    max_depth = len(deltas)
    if 'tf-idf' in method:
        tf_idf = TfidfTransformer()
        tf_idf.fit(data_managers[0].xit)
    else:
        tf_idf = None

    # Depth-0 managers cover every document; deeper managers are built from
    # the documents routed to each class at the previous depth.
    data_managers_d0 = [
        DataManager(data_managers[0].name + '_d0',
                    xit=data_managers[0].xit,
                    labels=data_managers[0].labels,
                    deltas=data_managers[0].deltas,
                    sims=data_managers[0].sims,
                    true_idx=None),
        None,
        DataManager(data_managers[2].name + '_d0',
                    xit=data_managers[2].xit,
                    labels=data_managers[2].labels,
                    deltas=data_managers[2].deltas,
                    sims=data_managers[2].sims,
                    true_idx=None)
    ]
    data_managers_list = [data_managers_d0]

    for depth in range(max_depth):
        # unlabeled_pre is returned by train_one_depth but currently unused.
        model_list, unlabeled_pre, test_pre = train_one_depth(
            data_managers_list, depth, deltas, C, method)
        model_lists.append(model_list)
        test_pres.append(test_pre)

        # Prepare the per-class document groups for the next depth.
        if depth == max_depth - 1:
            break
        class_depth_no = deltas[depth].shape[0]
        labeled_true_idx_list = [[] for _ in range(class_depth_no)]
        test_true_idx_list = [[] for _ in range(class_depth_no)]
        for i, l in enumerate(labels[depth]):
            labeled_true_idx_list[l].append(i)
        for i, t in enumerate(test_pre):
            test_true_idx_list[t].append(i)
        data_managers_list.clear()
        for i in range(class_depth_no):
            data_managers_list.append([
                build_subdata_manager(
                    data_managers_d0[0],
                    data_managers[0].name + '_d%d_c%d' % (depth, i),
                    labeled_true_idx_list[i]),
                None,
                build_subdata_manager(
                    data_managers_d0[2],
                    data_managers[2].name + '_d%d_c%d' % (depth, i),
                    test_true_idx_list[i])
            ])
    logger.info("training and predicting time: " + str(time.time() - start))
    return model_lists, test_pres

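# build_subdata_manager is repo code; a plausible reading (an assumption,
# not the repo's implementation) is a row-subsetting copy of a DataManager,
# using the constructor keywords seen above:
def build_subdata_manager_sketch(dm, name, idx):
    return DataManager(name,
                       xit=dm.xit[idx],
                       labels=[l[idx] for l in dm.labels] if dm.labels else None,
                       deltas=[d[idx] for d in dm.deltas] if dm.deltas else None,
                       sims=[s[idx] for s in dm.sims] if dm.sims else None,
                       true_idx=idx)
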
def main(input_dir=settings.data_dir_20ng, label_ratio=0.1, times=1,
         classifier_names=None, dual=True):
    logger = logging.getLogger(__name__)
    if label_ratio == 1.0:
        times = 1
    classes = tools.load(os.path.join(input_dir, settings.classes_file))
    deltas = tools.load(os.path.join(input_dir, settings.deltas_file))
    kw = {'deltas': deltas, 'dual': dual}
    if not classifier_names:
        classifier_names = ['flatLR', 'levelLR', 'flatSVM', 'levelSVM']

    path_weights = [1.0]
    for i in range(1, len(classes)):
        path_weights.append(path_weights[-1] * settings.path_weight)

    nos, hier_tree = get_hier_info(input_dir)
    for mode in ["labeled", "dataless"]:
        # Axes: (time, [leaf, overall], classifier,
        #        [precision, recall, f1], [macro, micro]).
        metrics_result = np.zeros((times, 2, len(classifier_names), 3, 2))
        for i in range(times):
            sub_dir = os.path.join(input_dir, str(label_ratio), str(i))
            logger.info(logconfig.key_log(logconfig.START_PROGRAM, sub_dir))
            data_managers = load_data_managers(sub_dir)
            if mode == "dataless" and np.max(
                    data_managers[2].sims[0][0]) == 0.0:
                continue
            for j, classifier_name in enumerate(classifier_names):
                result = run_classifiers(classifier_name, data_managers, mode,
                                         **kw)
                if len(data_managers[2].labels) == len(result[1]):
                    metrics_result[i, 0, j] = compute_p_r_f1(
                        data_managers[2].labels[-1], result[1][-1])
                    metrics_result[i, 1, j] = compute_overall_p_r_f1(
                        data_managers[2].labels, result[1], nos)
                else:
                    metrics_result[i, 0, j] = compute_p_r_f1(
                        data_managers[2].labels[-1], result[1])
                    metrics_result[i, 1, j] = compute_hier_p_r_f1(
                        data_managers[2].labels[-1], result[1], nos,
                        hier_tree)

        avg_M_metrics_result = np.mean(metrics_result[:, :, :, :, 0], axis=0)
        std_M_metrics_result = np.std(metrics_result[:, :, :, :, 0], axis=0)
        avg_m_metrics_result = np.mean(metrics_result[:, :, :, :, 1], axis=0)
        std_m_metrics_result = np.std(metrics_result[:, :, :, :, 1], axis=0)

        with open(os.path.join(input_dir, str(label_ratio),
                               'LR_SVM_%s.csv' % mode), 'w') as f:
            csv_writer = csv.writer(f)
            for section, title in enumerate(['Leaf', 'Overall']):
                csv_writer.writerow([title] + classifier_names)
                for metric, name in enumerate(['precision', 'recall', 'f1']):
                    csv_writer.writerow(
                        ['Macro %s avg' % name] +
                        list(avg_M_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Macro %s std' % name] +
                        list(std_M_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Micro %s avg' % name] +
                        list(avg_m_metrics_result[section, :, metric]))
                    csv_writer.writerow(
                        ['Micro %s std' % name] +
                        list(std_m_metrics_result[section, :, metric]))
                csv_writer.writerow([])
    logger.info(logconfig.key_log(logconfig.END_PROGRAM, input_dir))