Example #1
    def libact_first_try_second_run(self, enriched_train_df, extractor,
                                    ideal_df, lbr, quota, validation_data_df,
                                    return_dict):

        trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
        qs = UncertaintySampling(trn_ds,
                                 method='lc',
                                 model=LogisticRegression())
        E_out1 = []
        E_out1 = np.append(
            E_out1,
            run_classifier(trn_ds.extract_labeled_dataframe(),
                           validation_data_df).f1)
        for i in range(quota):
            if len(trn_ds.get_unlabeled_entries()) == 0:
                break  # finished labeling all examples
            ask_id = qs.make_query()
            lb = lbr.label(trn_ds.extract_sentence(ask_id))
            self.assertEqual(lb, ideal_df[cn.tag_col][ask_id])
            trn_ds.update(ask_id, lb)
            # model.train(trn_ds)
            E_out1 = np.append(
                E_out1,
                run_classifier(trn_ds.extract_labeled_dataframe(),
                               validation_data_df).f1)
        return_dict[2] = E_out1
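This helper drives the uncertainty-sampling loop by hand: it scores the initially labeled data, then repeatedly queries the least-confident example, fetches its label from the ideal labeler, checks it against the expected tag in ideal_df, updates the dataset, and appends the new validation F1, so E_out1 is a learning curve with one point per added example. The curve is stored under key 2 of return_dict, which suggests it is meant to be compared with the curve produced by libact_first_try_first_run (Example #3). A minimal sketch of such a comparison, assuming the same test fixtures; the shared multiprocessing.Manager().dict() is an assumption, not shown in the source:

    # hypothetical caller inside the same TestCase; assumes "import multiprocessing"
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    self.libact_first_try_first_run(enriched_train_df, extractor, lbr, quota,
                                    validation_data_df, return_dict)
    self.libact_first_try_second_run(enriched_train_df, extractor, ideal_df,
                                     lbr, quota, validation_data_df, return_dict)
    # the two learning curves can then be compared point by point
    np.testing.assert_array_almost_equal(return_dict[1], return_dict[2])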
Example #2
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple

    gen_pool_df = labeled_pool_df.copy(deep=True)
    gen_pool_df[cn.col_names.tag] = [np.nan] * len(gen_pool_df)  # clear all tags
    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)

    extractor = cn.Feature_Extractor(
        enriched_train_df, cn.col_names)  # build the feature extractor

    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    scoring_fun = lambda ds: run_classifier(ds.extract_labeled_dataframe(),
                                            validation_data_df)
    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))  # label all df

    return ex_added_list, res_list
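A minimal usage sketch (the three dataframes are hypothetical, cn's parameters are assumed to be loaded beforehand, and f1_curve is just an illustrative name):

    ex_added_list, res_list = score_per_add_al(labeled_pool_df,
                                               base_training_df,
                                               validation_data_df)
    # one classifier result per number of added examples; the result object
    # exposes .f1 and .acc, as used in the other examples
    f1_curve = [res.f1 for res in res_list]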
Example #3
    def libact_first_try_first_run(self, enriched_train_df, extractor, lbr,
                                   quota, validation_data_df, return_dict):

        trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
        qs = UncertaintySampling(trn_ds,
                                 method='lc',
                                 model=LogisticRegression())
        scoring_fun = lambda ds: run_classifier(ds.extract_labeled_dataframe(),
                                                validation_data_df).f1
        query_num, E_out1 = run_active_learning(trn_ds, scoring_fun, lbr, qs,
                                                quota)
        return_dict[1] = E_out1
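This is the same experiment as Example #1, but driven through the run_active_learning helper instead of an explicit query loop; the resulting score list is stored in slot 1 of return_dict, presumably so the calling test can compare it with the manually built curve stored in slot 2.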
Example #4
def scores_per_add_default(labeled_pool_df, base_training_df,
                           validation_data_df):
    # type: (pd.DataFrame, pd.DataFrame, pd.DataFrame) -> (list, list)
    """
    Get the results of gradually adding the examples in $labeled_pool_df to
    $base_training_df, training on the enriched set and scoring it each time.
    :param labeled_pool_df: df containing the new examples we asked the expert for
    :param base_training_df: the original training examples we had
    :param validation_data_df: df containing the validation data we test our model's performance on
    :return: (ex_added_list, res_list), a list with the number of examples added at each step
                            and a list with the corresponding classifier result
    """
    # return score_per_add_al(labeled_pool_df, base_training_df, validation_data_df)
    return score_per_addition_results(
        labeled_pool_df, base_training_df,
        lambda en_df: run_classifier(en_df, validation_data_df))
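This default variant delegates to score_per_addition_results with a scoring callback built from run_classifier and the validation data; unlike the commented-out score_per_add_al path (Example #2), it presumably adds the pool examples in their given order rather than in an uncertainty-sampling order.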
Example #5
    def test_heuristic_test_data_gain_works(self):
        cn.load_codementor_sentiment_analysis_parameters()
        kb_helper.load_WordNet_model()

        base_train_df, pool_df, validation_data_df = prepare_pool_based_dataset()
        all_sents = list(base_train_df[cn.col_names.text])
        tns = 2

        pool_name, prep_pools = (
            "orig pool",
            lambda *_: (pool_df.iloc[:tns + 5], cn.labeled_pool_df.iloc[:tns + 5]))
        train_with_pool_df = pd.concat([base_train_df, pool_df],
                                       ignore_index=True)
        generated_pool_df, labeled_pool_df = prep_pools(
            all_sents, train_with_pool_df, cn.col_names, tns)
        cn.experiment_purpose += pool_name + " "

        trn_ds, lbr, extractor = prepare_trn_ds(base_train_df,
                                                generated_pool_df,
                                                labeled_pool_df)
        final_scoring_fun = partial(
            lambda en_df: run_classifier(en_df, validation_data_df).acc)

        table_headers = ['#added examples']
        data = [range(0, tns + 1)]
        compared_heuristics = [("test-data-gain",
                                lambda: SynStateTestDataGain),
                               ("random", lambda: "random")]

        for (heuristic_name, prepare_usage) in compared_heuristics:
            ss_type = prepare_usage()
            table_headers.append(heuristic_name)
            print heuristic_name
            _, heur_scores = insert_in_AL_fashion(trn_ds,
                                                  final_scoring_fun,
                                                  lbr,
                                                  ss_type,
                                                  labeled_pool_df,
                                                  quota=tns)
            data.append(heur_scores)

        print data[1]
        print data[2]
        self.assertEqual(data[1][0], data[2][0], "starts are same")
        self.assertGreater(data[1][-1], data[2][-1],
                           "test-data-gain should be better than random")
Example #6
def effect_of_size_of_semantic_environment():
    experiment_name = 'effect_of_size_of_semantic_environment'
    print experiment_name

    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset(
        print_expert_acc=False)
    all_sents = list(balanced_train_df[cn.col_names.text])
    tns = 50

    prepare_pools_funcs = list()
    prepare_pools_funcs.append(
        local_search_gen_template("uncertainty lc LogReg", 5))
    prepare_pools_funcs.append(local_search_gen_template("random-score", 5))
    prepare_pools_funcs.append(generate_pool_using_random_synthesis())
    final_scoring_fun = partial(
        lambda en_df: run_classifier(en_df, validation_data_df).acc)
    insertion_order_heuristic = find_heuristic("random-score")()
    print cn.experiment_purpose
    table_headers = ['size of semantic environment']
    data = [[]]

    for env_size in range(1, 5, 1) + range(5, 35, 5):
        data[0].append(env_size)
        for i, (pool_name, prepare_pool_fun) in enumerate(prepare_pools_funcs):
            if pool_name not in table_headers:
                table_headers.append(pool_name)
                data.append([])
                assert len(data) == i + 2, "meaning i+1 is our index"
            print pool_name
            cn.distance_measure = env_size
            gen_pool_df, labeled_pool_df = prepare_pool_fun(
                all_sents, balanced_train_df, cn.col_names, tns)
            trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df,
                                                    gen_pool_df,
                                                    labeled_pool_df)
            query_num, pool_insr_scores = insert_in_AL_fashion(
                trn_ds,
                final_scoring_fun,
                lbr,
                insertion_order_heuristic,
                labeled_pool_df,
                quota=tns)
            data[i + 1].append(pool_insr_scores[-1])

    return experiment_name, table_headers, data, plot_effect_of_size_of_semantic_environment
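Each outer iteration fixes cn.distance_measure to the current environment size (1-4, then 5 to 30 in steps of 5), regenerates the pools with each preparation strategy, inserts the pool with insert_in_AL_fashion (quota=tns) under the random-score insertion heuristic, and keeps only the final score. data therefore ends up as one row of environment sizes followed by one row of final accuracies per strategy, which plot_effect_of_size_of_semantic_environment is expected to render.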
Example #7
def effect_of_num_of_operators():
    experiment_name = 'effect_of_num_of_operators'
    print experiment_name

    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset(
        print_expert_acc=False)
    all_sents = list(balanced_train_df[cn.col_names.text])
    tns = 50

    final_scoring_fun = partial(
        lambda en_df: run_classifier(en_df, validation_data_df).acc)
    insertion_order_heuristic = find_heuristic("random-score")()
    print cn.experiment_purpose
    table_headers = ['num of operators']
    data = [[1] + range(2, 12, 2)]

    for i, heur in enumerate(["uncertainty lc LogReg",
                              "random-score"]):  # "uncertainty lc LogReg",
        table_headers.append(heur)
        data.append(list())
        prepare_pools_funcs = list()
        prepare_pools_funcs.append(local_search_gen_template(heur, 1))
        for j in range(2, 12, 2):
            prepare_pools_funcs.append(local_search_gen_template(heur, j))

        for pool_name, prepare_pool_fun in prepare_pools_funcs:
            gen_pool_df, labeled_pool_df = prepare_pool_fun(
                all_sents, balanced_train_df, cn.col_names, tns)
            trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df,
                                                    gen_pool_df,
                                                    labeled_pool_df)
            print pool_name
            query_num, pool_insr_scores = insert_in_batch_AL(
                trn_ds,
                final_scoring_fun,
                lbr,
                insertion_order_heuristic,
                labeled_pool_df,
                batch_num=len(labeled_pool_df))
            data[1 + i].append(pool_insr_scores[-1])

    return experiment_name, table_headers, data, plot_effect_of_num_of_operators
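This experiment mirrors Example #6, except that the swept variable is the number of local-search operators (1, 2, 4, 6, 8, 10) and the pool is inserted via insert_in_batch_AL with batch_num set to the pool size rather than through the per-example insert_in_AL_fashion loop. Only the last score of each run is kept, giving one accuracy per operator count for each of the two generation heuristics, to be rendered by plot_effect_of_num_of_operators.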