コード例 #1
0
def main(input_dir='./datasets/rcv1org',
         output_dir=settings.data_dir_20ng,
         split_randomly=True):
    """Build the class hierarchy for a dataset and produce a train/test split.

    Files in *input_dir* whose names end with 'filtered' feed
    build_class_tree; the split is either computed randomly or copied from
    a precomputed index file shipped with the data.
    """
    logger = logging.getLogger(__name__)
    logger.info(logconfig.key_log(logconfig.DATA_NAME, input_dir))

    # Collect the per-depth '*filtered' files in deterministic order.
    paths = sorted(
        os.path.join(input_dir, name) for name in os.listdir(input_dir)
        if name.endswith('filtered'))

    logger.info(logconfig.key_log(logconfig.FUNCTION_NAME, 'build_class_tree'))
    deltas, classes = build_class_tree(paths, output_dir)

    logger.info(logconfig.key_log(logconfig.FUNCTION_NAME, 'split'))
    if split_randomly:
        deepest_file = os.path.join(output_dir, 'depth%d.txt' % (len(paths)))
        data = tools.load(deepest_file)
        train_idx, test_idx = split_train_test(data, classes[-1],
                                               settings.train_ratio,
                                               output_dir)
    else:
        # Reuse the precomputed train/test split instead of resampling.
        copyfile(os.path.join(input_dir, 'train_test_idx.npz'),
                 os.path.join(output_dir, 'train_test_idx.npz'))

    logger.info(
        logconfig.key_log(logconfig.FUNCTION_NAME, 'generate_hier_info'))
    generate_hier_info(deltas, classes, output_dir)
コード例 #2
0
ファイル: build_data_managers.py プロジェクト: seanliu96/PCHC
def main(input_dir=settings.data_dir_20ng,
         label_ratio=0.1,
         time=0,
         sparse_format=False):
    """Split the training set into labeled/unlabeled parts and build managers."""
    logger = logging.getLogger(__name__)
    logger.info(logconfig.key_log(logconfig.DATA_NAME, input_dir))

    # 'depthN' files sort so the last entry is the deepest hierarchy level.
    depth_files = sorted(name for name in os.listdir(input_dir)
                         if name.startswith('depth'))

    data = tools.load(os.path.join(input_dir, depth_files[-1]))
    classes = tools.load(os.path.join(input_dir, settings.classes_file))
    train_test_idx = tools.load(
        os.path.join(input_dir, settings.train_test_idx_file))
    train_idx = train_test_idx['train_idx']

    output_dir = os.path.join(input_dir, str(label_ratio), str(time))

    logger.info(
        logconfig.key_log(logconfig.FUNCTION_NAME, 'split_label_unlabel'))
    # 'time' doubles as the random seed so repeated runs are reproducible.
    label_idx, unlabel_idx = split_label_unlabel(data,
                                                 train_idx,
                                                 classes[-1],
                                                 label_ratio,
                                                 output_dir,
                                                 seed=time)

    logger.info(logconfig.key_log(logconfig.FUNCTION_NAME, 'process_dataset'))
    [labeled_data_manager, unlabeled_data_manager, test_data_manager], vocab_info = \
        process_dataset(input_dir, output_dir, sparse_format=sparse_format)
    logger.info(
        logconfig.key_log('VocabularySize', str(len(vocab_info['stoi']))))
コード例 #3
0
def main(input_dir=settings.data_dir_20ng,
         label_ratio=0.1,
         time=0,
         output_dir=None):
    """Generate SVMlight-format files for one (label_ratio, time) split."""
    logger = logging.getLogger(__name__)

    if output_dir is None:
        output_dir = input_dir
    tools.make_sure_path_exists(output_dir)
    deltas = tools.load(os.path.join(input_dir, settings.deltas_file))
    classes = tools.load(os.path.join(input_dir, settings.classes_file))
    nos, hier_tree = generate_hier_info(deltas, classes, input_dir)
    sub_dir = os.path.join(input_dir, str(label_ratio), str(time))
    output_dir = os.path.join(output_dir, str(label_ratio), str(time))
    tools.make_sure_path_exists(output_dir)

    logger.info(logconfig.key_log('Input dir', sub_dir))
    logger.info(logconfig.key_log('Output dir', output_dir))

    # Copy the category-hierarchy file into the split directory once.
    cat_hier_file = os.path.join(output_dir, settings.cat_hier_file)
    if not os.path.exists(cat_hier_file):
        shutil.copyfile(os.path.join(input_dir, settings.cat_hier_file),
                        os.path.join(output_dir, settings.cat_hier_file))

    # Skip the conversion when all three output files already exist.
    expected_outputs = (settings.labeled_svmlight_file,
                        settings.dataless_svmlight_file,
                        settings.test_svmlight_file)
    if all(
            os.path.exists(os.path.join(output_dir, name))
            for name in expected_outputs):
        return
    data_managers = load_data_managers(sub_dir)
    generate_svmlight_format(data_managers, nos[-1], output_dir)
コード例 #4
0
ファイル: LR_SVM.py プロジェクト: seanliu96/PCHC
def run_level(data_managers, deltas, C=1.0, method='LR_labeled'):
    """Train one flat classifier per hierarchy level and predict the test set.

    Args:
        data_managers: [labeled, unlabeled, test] data managers.
        deltas: hierarchy matrices (unused here; kept for a uniform run_*
            signature).
        C: regularization strength forwarded to train_level.
        method: contains 'labeled' or 'dataless', optionally 'tf-idf'.

    Returns:
        (model_list, test_pres): one model and one prediction array per level.
    """
    logger = logging.getLogger(__name__)
    model_name = "level" + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    test_pres = []
    if 'labeled' in method:
        sims = data_managers[0].deltas
    elif 'dataless' in method:
        # Soft similarities are row-normalized; hard ones are one-hot.
        if settings.soft_sim:
            sims = list(
                map(lambda sim: normalize(sim, axis=1), data_managers[0].sims))
        else:
            sims = list(
                map(lambda sim: hardmax(sim, axis=1), data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    model_list = train_level(data_managers[0].xit, sims, C, method)
    logger.info("training time: " + str(time.time() - start))
    start = time.time()
    if 'tf-idf' in method:
        # BUG FIX: tf_idf was referenced below but never defined in this
        # function, raising NameError for 'tf-idf' methods.  Fit it on the
        # labeled training matrix, the same way run_leaf does.
        tf_idf = TfidfTransformer()
        tf_idf.fit(data_managers[0].xit)
    for depth in range(len(sims)):
        if 'tf-idf' in method:
            test_pre = model_list[depth].predict(
                tf_idf.transform(data_managers[2].xit))
        else:
            test_pre = model_list[depth].predict(data_managers[2].xit)
        test_pres.append(test_pre)
    logger.info("predicting time: " + str(time.time() - start))
    return model_list, test_pres
コード例 #5
0
ファイル: LR_SVM.py プロジェクト: seanliu96/PCHC
def run_PC(data_managers,
           deltas,
           C=1.0,
           method='LR_labeled',
           path_weights=None):
    """Train a path-classification model and predict test labels via path scores."""
    logger = logging.getLogger(__name__)
    model_name = "PC" + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    if 'labeled' in method:
        sims = data_managers[0].deltas
    elif 'dataless' in method:
        # Row-normalize when soft similarities are enabled, otherwise one-hot.
        sim_transform = normalize if settings.soft_sim else hardmax
        sims = [sim_transform(sim, axis=1) for sim in data_managers[0].sims]
    else:
        raise NotImplementedError
    start = time.time()
    path_score = compute_path_score(sims, deltas, path_weights=path_weights)
    model = train_PC(data_managers[0].xit, path_score, C, method)
    logger.info("training time: " + str(time.time() - start))
    start = time.time()
    test_pres = predict_label_PC_pathscore(model, data_managers[2].xit, deltas)
    logger.info("predicting time: " + str(time.time() - start))
    return model, test_pres
コード例 #6
0
ファイル: LR_SVM.py プロジェクト: seanliu96/PCHC
def run_PSO_WD(data_managers,
               deltas,
               C=1.0,
               method='LR_labeled',
               soft_pathscore=True,
               path_weights=None,
               nos=None):
    """Train per-depth WD models, then tune path weights with PSO.

    Particle-swarm optimization searches for the path-weight vector that
    maximizes the configured main metric (settings.main_metric) on the
    labeled training predictions; the tuned weights are then used to
    predict the test set.

    Args:
        data_managers: [labeled, unlabeled, test] data managers.
        deltas: hierarchy matrices; only consulted for hard path scores.
        C: regularization strength forwarded to train_WD.
        method: contains 'labeled' or 'dataless' plus the classifier kind.
        soft_pathscore: if True, path-score prediction ignores deltas.
        path_weights: initial path-weight vector seeding the PSO search.
        nos: per-level class counts used by compute_overall_p_r_f1.

    Returns:
        (model_list, test_pres): trained per-depth models and the test
        predictions made with the PSO-tuned path weights.
    """
    logger = logging.getLogger(__name__)
    model_name = 'WD(PSO)_' + ("soft_" if soft_pathscore else "hard_") + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    if 'labeled' in method:
        sims = data_managers[0].deltas
        labels = data_managers[0].labels
    elif 'dataless' in method:
        # Soft similarities are row-normalized; hard ones are one-hot.
        if settings.soft_sim:
            sims = list(
                map(lambda sim: normalize(sim, axis=1), data_managers[0].sims))
        else:
            sims = list(
                map(lambda sim: hardmax(sim, axis=1), data_managers[0].sims))
        # Dataless pseudo-labels: argmax over the raw similarities.
        labels = list(
            map(lambda sim: np.argmax(sim, axis=-1), data_managers[0].sims))
    else:
        raise NotImplementedError
    start = time.time()
    model_list = train_WD(data_managers[0].xit, sims, C, method)

    def score_function(path_weights):
        # PSO fitness: predict the labeled training set with the candidate
        # weights and score it with the configured main metric.
        labeled_pres = predict_label_WD_pathscore(
            model_list,
            data_managers[0].xit,
            deltas=(None if soft_pathscore else deltas),
            path_weights=path_weights)
        return compute_overall_p_r_f1(labels, labeled_pres,
                                      nos)[2][settings.main_metric]

    pso = PSO(path_weights,
              score_function,
              group_size=settings.pso_group_size,
              min_x=settings.pso_min_x,
              max_x=settings.pso_max_x)
    pso.update(c1=settings.pso_c1,
               c2=settings.pso_c2,
               w=settings.pso_w,
               max_iter=settings.pso_max_iter,
               patience=settings.pso_patience)
    # Replace the seed weights with the best vector found by the swarm.
    path_weights = pso.get_best_x()
    logger.info("training time: " + str(time.time() - start))
    logger.info('best_path_weight: %s' % (str(path_weights)))
    start = time.time()
    test_pres = predict_label_WD_pathscore(
        model_list,
        data_managers[2].xit,
        deltas=(None if soft_pathscore else deltas),
        path_weights=path_weights)
    logger.info("predicting time: " + str(time.time() - start))
    return model_list, test_pres
コード例 #7
0
def run_level(data_managers, deltas, method='LR_labeled', dual=True):
    """Train one flat LR/SVM classifier per hierarchy level.

    Args:
        data_managers: [labeled, unlabeled, test] data managers.
        deltas: hierarchy matrices (unused here; kept for a uniform run_*
            signature).
        method: contains 'LR' or 'SVM', 'labeled' or 'dataless', and
            optionally 'tf-idf'.
        dual: solve the dual formulation (liblinear).

    Returns:
        (model_list, y_pres): one fitted model and one test-prediction
        array per hierarchy level.
    """
    logger = logging.getLogger(__name__)
    model_name = method + "_level"
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    model_list = []
    y_pres = []
    if 'labeled' in method:
        sims = data_managers[0].deltas
    elif 'dataless' in method:
        # Soft similarities are row-normalized; hard ones are one-hot.
        if settings.soft_sim:
            sims = list(
                map(lambda sim: normalize(sim, axis=1), data_managers[0].sims))
        else:
            sims = list(
                map(lambda sim: hardmax(sim, axis=1), data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    max_depth = len(sims)
    if 'tf-idf' in method:
        tf_idf = TfidfTransformer()
        tf_idf.fit(data_managers[0].xit)
    else:
        tf_idf = None
    # Hoist the loop-invariant feature transforms out of the per-depth loops.
    train_x = (tf_idf.transform(data_managers[0].xit)
               if tf_idf is not None else data_managers[0].xit)
    for depth in range(max_depth):
        if 'LR' in method:
            # BUG FIX: dual was hard-coded to True here while LinearSVC
            # below honored the parameter; pass it through consistently.
            model = LogisticRegression(dual=dual, solver='liblinear')
        elif 'SVM' in method:
            model = LinearSVC(dual=dual)
        else:
            # Previously fell through with 'model' undefined (NameError).
            raise NotImplementedError
        # Level targets: argmax over this depth's similarity matrix.
        model.fit(train_x, np.argmax(sims[depth], axis=1))
        model_list.append(model)
    logger.info("training time: " + str(time.time() - start))
    start = time.time()
    test_x = (tf_idf.transform(data_managers[2].xit)
              if tf_idf is not None else data_managers[2].xit)
    for depth in range(max_depth):
        y_pres.append(model_list[depth].predict(test_x))
    logger.info("predicting time: " + str(time.time() - start))
    return model_list, y_pres
コード例 #8
0
def filter_multilabels(input_dir):
    """Filter per-depth topic files down to single-label documents.

    For every file in *input_dir* whose extension starts with '.depth',
    writes a '.filtered' sibling keeping only documents that carry exactly
    one topic at that depth AND whose id already survived the previous
    depth's filtering, so the kept set is consistent along the hierarchy.
    """
    logger = logging.getLogger(__name__)
    logger.info(logconfig.key_log(logconfig.DATA_NAME, input_dir))

    paths = []
    for file_name in os.listdir(input_dir):
        # Match files whose extension begins with '.depth' (e.g. x.depth0).
        if os.path.splitext(file_name)[-1].startswith('.depth'):
            paths.append(os.path.join(input_dir, file_name))
    paths.sort()

    # valid_id_counter[doc_id] is a bitmask: bit d set means the document
    # survived filtering at depth d.
    valid_id_counter = Counter()
    for depth in range(len(paths)):
        doc_topic_id = defaultdict(
            lambda: defaultdict(lambda: set())
        )  # doc_topic[i][j] means a set about a document with [doc_text i] and [topic j]
        with open(paths[depth], 'r', encoding='utf-8') as f:
            line = f.readline()
            while line:
                line = line.strip()
                if line:
                    # Tab-separated line; field 1 appears to be the document
                    # text and field 2 a ';'-terminated topic list — TODO
                    # confirm against the dataset format.
                    line_sp = line.split('\t')
                    topics = line_sp[2].split(';')
                    if len(topics) == 2:  # an empty str will be at the last
                        doc_topic_id[line_sp[1]][topics[0]].add(line)
                line = f.readline()
        with open(paths[depth] + '.filtered', 'w', encoding='utf-8') as f:
            for doc, y in doc_topic_id.items():
                # multi-label
                if len(y) > 1:
                    continue
                for xx, yy in y.items():
                    # just keep one document
                    lines = sorted(list(yy))
                    line = lines[0]
                    doc_id = line.split('\t', 1)[0]
                    # Keep the doc only at the root depth, or if bit
                    # (depth-1) is set, i.e. it survived the previous depth.
                    if depth == 0 or (valid_id_counter[doc_id] &
                                      (1 << (depth - 1))):
                        valid_id_counter[doc_id] += (1 << depth)
                        f.write(line)
                        f.write('\n')
                    break
        logger.info(logconfig.key_log(logconfig.DEPTH, str(depth)))
コード例 #9
0
ファイル: LR_SVM.py プロジェクト: seanliu96/PCHC
def run_leaf(data_managers, deltas, C=1.0, method='LR_labeled'):
    """Train a single flat classifier on the leaf (deepest) level only.

    Args:
        data_managers: [labeled, unlabeled, test] data managers.
        deltas: hierarchy matrices (unused here; kept for a uniform run_*
            signature).
        C: regularization strength for the classifier.
        method: contains 'LR' or 'SVM', 'labeled' or 'dataless', and
            optionally 'tf-idf'.

    Returns:
        (model, test_pre): the fitted model and its test predictions.
    """
    logger = logging.getLogger(__name__)
    model_name = "flat" + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    if 'labeled' in method:
        sims = data_managers[0].deltas
    elif 'dataless' in method:
        # Soft similarities are row-normalized; hard ones are one-hot.
        if settings.soft_sim:
            sims = list(
                map(lambda sim: normalize(sim, axis=1), data_managers[0].sims))
        else:
            sims = list(
                map(lambda sim: hardmax(sim, axis=1), data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    if 'LR' in method:
        model = LogisticRegression(C=C,
                                   solver='lbfgs',
                                   multi_class='multinomial')
    elif 'SVM' in method:
        model = LinearSVC(C=C, multi_class='crammer_singer')
    else:
        # Previously fell through with 'model' undefined (NameError).
        raise NotImplementedError
    # Leaf targets: argmax over the deepest similarity matrix.
    if 'tf-idf' in method:
        tf_idf = TfidfTransformer()
        tf_idf.fit(data_managers[0].xit)
        model.fit(tf_idf.transform(data_managers[0].xit),
                  np.argmax(sims[-1], axis=1))
    else:
        model.fit(data_managers[0].xit, np.argmax(sims[-1], axis=1))
    logger.info("training time: " + str(time.time() - start))
    start = time.time()
    if 'tf-idf' in method:
        test_pre = model.predict(tf_idf.transform(data_managers[2].xit))
    else:
        test_pre = model.predict(data_managers[2].xit)
    logger.info("predicting time: " + str(time.time() - start))
    return model, test_pre
コード例 #10
0
ファイル: LIBLINEAR.py プロジェクト: seanliu96/PCHC
def main(input_dir=settings.data_dir_20ng,
         label_ratio=0.1,
         times=1,
         classifier_names=None):
    """Run the LIBLINEAR (HierCost) experiments and dump metric CSVs.

    For each mode ('labeled'/'dataless') the experiment is repeated
    *times* times; leaf-level and overall macro/micro precision, recall
    and f1 are averaged over the repetitions and written to
    LIBLINEAR_<mode>.csv under the label-ratio directory.
    """
    logger = logging.getLogger(__name__)

    vocab_info = tools.load(os.path.join(input_dir, settings.vocab_file))
    vocab_size = len(vocab_info["stoi"])
    del vocab_info  # free the vocabulary; only its size is needed
    nos, hier_tree = get_hier_info(input_dir)
    if not classifier_names:
        classifier_names = [
            'LIBLINEAR_LR_primal', 'LIBLINEAR_LR_dual', 'LIBLINEAR_SVC_primal',
            'LIBLINEAR_SVC_dual'
        ]
    if label_ratio == 1.0:
        times = 1  # a full label set leaves nothing to resample

    def write_section(csv_writer, title, avg_M, std_M, avg_m, std_m, section):
        # One CSV section: header row, then avg/std rows for every
        # (macro/micro) x (precision/recall/f1) combination.  Replaces 26
        # hand-duplicated writerow calls while emitting identical rows.
        csv_writer.writerow([title] + classifier_names)
        for k, metric in enumerate(('precision', 'recall', 'f1')):
            csv_writer.writerow(['Macro %s avg' % metric] +
                                list(avg_M[section, :, k]))
            csv_writer.writerow(['Macro %s std' % metric] +
                                list(std_M[section, :, k]))
            csv_writer.writerow(['Micro %s avg' % metric] +
                                list(avg_m[section, :, k]))
            csv_writer.writerow(['Micro %s std' % metric] +
                                list(std_m[section, :, k]))
        csv_writer.writerow([])

    for mode in ["labeled", "dataless"]:
        # times x (leaf/overall) x classifier x (p/r/f1) x (macro/micro)
        metrics_result = np.zeros((times, 2, len(classifier_names), 3, 2))

        for i in range(times):
            sub_dir = os.path.join(input_dir, str(label_ratio), str(i))
            logger.info(logconfig.key_log(logconfig.START_PROGRAM, sub_dir))
            model_dirs = train_HierCost(sub_dir, vocab_size, classifier_names,
                                        mode)
            metrics_list = predict_HierCost(sub_dir, model_dirs, vocab_size,
                                            nos, hier_tree)
            metrics_result[i] = metrics_list
        avg_M_metrics_result = np.mean(metrics_result[:, :, :, :, 0], axis=0)
        std_M_metrics_result = np.std(metrics_result[:, :, :, :, 0], axis=0)
        avg_m_metrics_result = np.mean(metrics_result[:, :, :, :, 1], axis=0)
        std_m_metrics_result = np.std(metrics_result[:, :, :, :, 1], axis=0)
        with open(
                os.path.join(input_dir, str(label_ratio),
                             'LIBLINEAR_%s.csv' % (mode)), 'w') as f:
            csv_writer = csv.writer(f)
            write_section(csv_writer, 'Leaf', avg_M_metrics_result,
                          std_M_metrics_result, avg_m_metrics_result,
                          std_m_metrics_result, 0)
            write_section(csv_writer, 'Overall', avg_M_metrics_result,
                          std_M_metrics_result, avg_m_metrics_result,
                          std_m_metrics_result, 1)

    logger.info(logconfig.key_log(logconfig.END_PROGRAM, input_dir))
コード例 #11
0
ファイル: LR_SVM.py プロジェクト: seanliu96/PCHC
def main(input_dir=settings.data_dir_20ng,
         label_ratio=0.1,
         times=1,
         classifier_names=None,
         C=1.0):
    """Run LR/SVM classifiers over the C grid in settings.Cs and dump CSVs.

    For each mode ('labeled'/'dataless') the experiment runs *times*
    times; leaf-level and overall macro/micro precision, recall and f1
    are averaged over the runs and written to LR_SVM_<mode>.csv.

    Note: the C parameter is retained for interface compatibility, but
    the values actually used come from settings.Cs.
    """
    logger = logging.getLogger(__name__)

    if label_ratio == 1.0:
        times = 1  # a full label set leaves nothing to resample
    classes = tools.load(os.path.join(input_dir, settings.classes_file))
    deltas = tools.load(os.path.join(input_dir, settings.deltas_file))

    if not classifier_names:
        classifier_names = ['flatLR', 'levelLR', 'flatSVM', 'levelSVM']
    # Path weights decay geometrically with depth.
    path_weights = [1.0]
    for i in range(1, len(classes)):
        path_weights.append(path_weights[-1] * settings.path_weight)
    path_weights = np.asarray(path_weights)
    nos, hier_tree = get_hier_info(input_dir)
    kw = {'deltas': deltas, 'path_weights': path_weights, 'nos': nos}

    # CSV column headers: one per (classifier, C) combination.
    headers = [
        '%s_C_%.2f' % (classifier_name, C_value)
        for classifier_name in classifier_names for C_value in settings.Cs
    ]

    def write_section(csv_writer, title, avg_M, std_M, avg_m, std_m, section):
        # One CSV section: header row, then avg/std rows for every
        # (macro/micro) x (precision/recall/f1) combination.  Replaces 26
        # hand-duplicated writerow calls while emitting identical rows.
        csv_writer.writerow([title] + headers)
        for k, metric in enumerate(('precision', 'recall', 'f1')):
            csv_writer.writerow(['Macro %s avg' % metric] +
                                list(avg_M[section, :, k]))
            csv_writer.writerow(['Macro %s std' % metric] +
                                list(std_M[section, :, k]))
            csv_writer.writerow(['Micro %s avg' % metric] +
                                list(avg_m[section, :, k]))
            csv_writer.writerow(['Micro %s std' % metric] +
                                list(std_m[section, :, k]))
        csv_writer.writerow([])

    for mode in ["labeled", "dataless"]:
        # times x (leaf/overall) x (classifier, C) x (p/r/f1) x (macro/micro)
        metrics_result = np.zeros(
            (times, 2, len(classifier_names) * len(settings.Cs), 3, 2))

        for i in range(times):
            sub_dir = os.path.join(input_dir, str(label_ratio), str(i))
            logger.info(logconfig.key_log(logconfig.START_PROGRAM, sub_dir))

            data_managers = load_data_managers(sub_dir)
            if settings.reduce_features:
                # Drop feature columns that are all-zero in the labeled set.
                non_zero_indices = np.nonzero(data_managers[0].xit)
                non_zero_columns = sorted(set(non_zero_indices[1]))
                for data_manager in data_managers:
                    data_manager.xit = data_manager.xit[:, non_zero_columns]

            # Skip dataless mode when no similarities are available.
            if mode == "dataless" and np.max(
                    data_managers[2].sims[0][0]) == 0.0:
                continue

            for j, classifier_name in enumerate(classifier_names):
                # Use a distinct loop name so the C parameter isn't shadowed.
                for k, C_value in enumerate(settings.Cs):
                    kw['C'] = C_value
                    result = run_classifiers(classifier_name, data_managers,
                                             mode, **kw)
                    col = j * len(settings.Cs) + k
                    if len(data_managers[2].labels) == len(result[1]):
                        # Per-level predictions: leaf + overall metrics.
                        metrics_result[i, 0, col] = compute_p_r_f1(
                            data_managers[2].labels[-1], result[1][-1])
                        metrics_result[i, 1, col] = compute_overall_p_r_f1(
                            data_managers[2].labels, result[1], nos)
                    else:
                        # Leaf-only predictions: hierarchical metrics for
                        # the overall section.
                        metrics_result[i, 0, col] = compute_p_r_f1(
                            data_managers[2].labels[-1], result[1])
                        metrics_result[i, 1, col] = compute_hier_p_r_f1(
                            data_managers[2].labels[-1], result[1], nos,
                            hier_tree)

        avg_M_metrics_result = np.mean(metrics_result[:, :, :, :, 0], axis=0)
        std_M_metrics_result = np.std(metrics_result[:, :, :, :, 0], axis=0)
        avg_m_metrics_result = np.mean(metrics_result[:, :, :, :, 1], axis=0)
        std_m_metrics_result = np.std(metrics_result[:, :, :, :, 1], axis=0)

        with open(
                os.path.join(input_dir, str(label_ratio),
                             'LR_SVM_%s.csv' % (mode)), 'w') as f:
            csv_writer = csv.writer(f)
            write_section(csv_writer, 'Leaf', avg_M_metrics_result,
                          std_M_metrics_result, avg_m_metrics_result,
                          std_m_metrics_result, 0)
            write_section(csv_writer, 'Overall', avg_M_metrics_result,
                          std_M_metrics_result, avg_m_metrics_result,
                          std_m_metrics_result, 1)
    logger.info(logconfig.key_log(logconfig.END_PROGRAM, sub_dir))
コード例 #12
0
ファイル: LR_SVM.py プロジェクト: seanliu96/PCHC
def run_TD(data_managers, deltas, C=1.0, method='LR_labeled'):
    """Top-down hierarchical classification.

    At each depth, documents are partitioned by their (true or predicted)
    class at that depth, and a child classifier is trained on each
    partition for the next depth.

    Args:
        data_managers: [labeled, unlabeled, test] data managers.
        deltas: per-depth hierarchy matrices; len(deltas) sets max depth.
        C: regularization strength forwarded to train_one_depth.
        method: contains 'labeled' or 'dataless'; a 'BU' marker only
            changes the logged model name.

    Returns:
        (model_lists, test_pres): per-depth lists of trained models and
        per-depth test predictions.
    """
    logger = logging.getLogger(__name__)
    if 'BU' in method:
        model_name = method
    else:
        model_name = 'TD' + method
    logger.info(logconfig.key_log(logconfig.MODEL_NAME, model_name))

    model_lists = []
    # NOTE(review): unlabeled_pres and tf_idf are assigned below but never
    # used in this function's visible body — presumably consumed inside
    # train_one_depth or left over from an earlier version; confirm.
    unlabeled_pres = []
    test_pres = []
    if 'labeled' in method:
        labels = data_managers[0].labels
    elif 'dataless' in method:
        # Dataless pseudo-labels: argmax over the raw similarities.
        labels = list(
            map(lambda sim: np.argmax(sim, axis=1), data_managers[0].sims))
    else:
        raise NotImplementedError

    start = time.time()
    max_depth = len(deltas)

    if 'tf-idf' in method:
        tf_idf = TfidfTransformer()
        tf_idf.fit(data_managers[0].xit)
    else:
        tf_idf = None
    # Depth-0 wrappers around the full labeled (index 0) and test (index 2)
    # sets; the unlabeled slot is unused here (None).
    data_managers_d0 = [
        DataManager(data_managers[0].name + '_d0',
                    xit=data_managers[0].xit,
                    labels=data_managers[0].labels,
                    deltas=data_managers[0].deltas,
                    sims=data_managers[0].sims,
                    true_idx=None), None,
        DataManager(data_managers[2].name + '_d0',
                    xit=data_managers[2].xit,
                    labels=data_managers[2].labels,
                    deltas=data_managers[2].deltas,
                    sims=data_managers[2].sims,
                    true_idx=None)
    ]
    data_managers_list = [data_managers_d0]
    for depth in range(max_depth):
        model_list, unlabeled_pre, test_pre = train_one_depth(
            data_managers_list, depth, deltas, C, method)
        model_lists.append(model_list)
        # unlabeled_pres.append(unlabeled_pre)
        test_pres.append(test_pre)
        # prepare for the next depth
        if depth == max_depth - 1:
            break
        # Partition documents by their class at this depth: labeled docs by
        # their true/pseudo label, test docs by this depth's prediction.
        class_depth_no = deltas[depth].shape[0]
        labeled_true_idx_list = [[] for i in range(class_depth_no)]
        # unlabeled_true_idx_list = [[] for i in range(class_depth_no)]
        test_true_idx_list = [[] for i in range(class_depth_no)]

        for i, l in enumerate(labels[depth]):
            labeled_true_idx_list[l].append(i)
        # for i, u in enumerate(unlabeled_pre):
        #     unlabeled_true_idx_list[u].append(i)
        for i, t in enumerate(test_pre):
            test_true_idx_list[t].append(i)
        data_managers_list.clear()
        # One sub-data-manager pair (labeled, test) per class at this depth.
        for i in range(class_depth_no):
            data_managers_list.append([
                build_subdata_manager(
                    data_managers_d0[0],
                    data_managers[0].name + '_d%d_c%d' % (depth, i),
                    labeled_true_idx_list[i]), None,
                build_subdata_manager(
                    data_managers_d0[2],
                    data_managers[2].name + '_d%d_c%d' % (depth, i),
                    test_true_idx_list[i])
            ])
    logger.info("training and predicting time: " + str(time.time() - start))
    return model_lists, test_pres
コード例 #13
0
def main(input_dir=settings.data_dir_20ng,
         label_ratio=0.1,
         times=1,
         classifier_names=None,
         dual=True):
    """Run flat/level LR and SVM classifiers and dump metric CSVs.

    For each mode ('labeled'/'dataless') the experiment runs *times*
    times; leaf-level and overall macro/micro precision, recall and f1
    are averaged over the runs and written to LR_SVM_<mode>.csv.
    """
    logger = logging.getLogger(__name__)

    if label_ratio == 1.0:
        times = 1  # a full label set leaves nothing to resample
    classes = tools.load(os.path.join(input_dir, settings.classes_file))
    deltas = tools.load(os.path.join(input_dir, settings.deltas_file))
    kw = {'deltas': deltas, 'dual': dual}
    if not classifier_names:
        classifier_names = ['flatLR', 'levelLR', 'flatSVM', 'levelSVM']
    # NOTE(review): path_weights is computed but never used in this
    # variant; kept for parity with the C-grid version of main — confirm
    # before deleting.
    path_weights = [1.0]
    for i in range(1, len(classes)):
        path_weights.append(path_weights[-1] * settings.path_weight)

    nos, hier_tree = get_hier_info(input_dir)

    def write_section(csv_writer, title, avg_M, std_M, avg_m, std_m, section):
        # One CSV section: header row, then avg/std rows for every
        # (macro/micro) x (precision/recall/f1) combination.  Replaces 26
        # hand-duplicated writerow calls while emitting identical rows.
        csv_writer.writerow([title] + classifier_names)
        for k, metric in enumerate(('precision', 'recall', 'f1')):
            csv_writer.writerow(['Macro %s avg' % metric] +
                                list(avg_M[section, :, k]))
            csv_writer.writerow(['Macro %s std' % metric] +
                                list(std_M[section, :, k]))
            csv_writer.writerow(['Micro %s avg' % metric] +
                                list(avg_m[section, :, k]))
            csv_writer.writerow(['Micro %s std' % metric] +
                                list(std_m[section, :, k]))
        csv_writer.writerow([])

    for mode in ["labeled", "dataless"]:
        # times x (leaf/overall) x classifier x (p/r/f1) x (macro/micro)
        metrics_result = np.zeros((times, 2, len(classifier_names), 3, 2))

        for i in range(times):
            sub_dir = os.path.join(input_dir, str(label_ratio), str(i))
            logger.info(logconfig.key_log(logconfig.START_PROGRAM, sub_dir))

            data_managers = load_data_managers(sub_dir)

            # Skip dataless mode when no similarities are available.
            if mode == "dataless" and np.max(
                    data_managers[2].sims[0][0]) == 0.0:
                continue

            for j, classifier_name in enumerate(classifier_names):
                result = run_classifiers(classifier_name, data_managers, mode,
                                         **kw)
                if len(data_managers[2].labels) == len(result[1]):
                    # Per-level predictions: leaf + overall metrics.
                    metrics_result[i, 0, j] = compute_p_r_f1(
                        data_managers[2].labels[-1], result[1][-1])
                    metrics_result[i, 1, j] = compute_overall_p_r_f1(
                        data_managers[2].labels, result[1], nos)
                else:
                    # Leaf-only predictions: hierarchical metrics instead.
                    metrics_result[i, 0, j] = compute_p_r_f1(
                        data_managers[2].labels[-1], result[1])
                    metrics_result[i, 1, j] = compute_hier_p_r_f1(
                        data_managers[2].labels[-1], result[1], nos, hier_tree)

        avg_M_metrics_result = np.mean(metrics_result[:, :, :, :, 0], axis=0)
        std_M_metrics_result = np.std(metrics_result[:, :, :, :, 0], axis=0)
        avg_m_metrics_result = np.mean(metrics_result[:, :, :, :, 1], axis=0)
        std_m_metrics_result = np.std(metrics_result[:, :, :, :, 1], axis=0)

        with open(
                os.path.join(input_dir, str(label_ratio),
                             'LR_SVM_%s.csv' % (mode)), 'w') as f:
            csv_writer = csv.writer(f)
            write_section(csv_writer, 'Leaf', avg_M_metrics_result,
                          std_M_metrics_result, avg_m_metrics_result,
                          std_m_metrics_result, 0)
            write_section(csv_writer, 'Overall', avg_M_metrics_result,
                          std_M_metrics_result, avg_m_metrics_result,
                          std_m_metrics_result, 1)
    logger.info(logconfig.key_log(logconfig.END_PROGRAM, input_dir))