Ejemplo n.º 1
0
def load_word2vec_model(
        relpath='word2vec_models/GoogleNews-vectors-negative300_trimmed.bin'):
    """
    Switch the module-level knowledge base to a Word2Vec one.

    Sets the global ``kb_type`` to ``'w2v'``, calls ``save_previous_model()``,
    registers the type as an experiment parameter and finally rebinds the
    global ``k_base`` to a freshly constructed ``Word2VecKB``.

    :param relpath: path of the model file, handed to ``Word2VecKB`` together
        with ``knowledge_bases_foldpath``
    """
    global kb_type
    global k_base

    kb_type = 'w2v'
    save_previous_model()
    cn.add_experiment_param(kb_type)

    word2vec_kb = Word2VecKB(
        cn.distance_measure,
        knowledge_bases_foldpath,
        relpath,
    )
    k_base = word2vec_kb
def curr_pool_synthesis_from_sents(base_sents, base_training_df, col_names, total_new_sents=None,
                                   choose_bulk_method=choose_best_by_uncertainty):
    # type: (list, DataFrame, ColumnNames, int, callable) -> DataFrame
    """
    Synthesizes new sentences from $base_sents by repeatedly expanding a growing pool.

    Each round synthesizes candidates from every sentence currently in the pool, lets
    $choose_bulk_method pick some of them, and adds the picked ones back to the pool.
    The pool is over-generated (4 times the requested amount) and a final call to
    $choose_bulk_method selects $total_new_sents rows from it.

    :param base_sents: list of base sents, from which we generate new ones
    :param base_training_df: DataFrame containing all labeled sentences
    :param col_names: contains the names of the columns in the output DataFrame
    :param total_new_sents: indicates the number of sentences we want to synthesize;
                            it is first passed through pool_size() together with $base_sents
    :param choose_bulk_method: a method for choosing sentences to be sent for generation,
                               called as (df, col_names, amount, sentences_to_exclude)
    :return: a DataFrame (index reset) with the rows selected from the synthesized pool
    """
    print "start curr pool search map"
    total_new_sents = pool_size(total_new_sents, base_sents)
    # over-generate: aim for 4x the requested amount, then pick the best at the end
    wanted_new_sents = int(total_new_sents * 4)
    # number of sentences requested from choose_bulk_method per round
    # (integer division under Python 2 when wanted_new_sents is an int)
    choose_amount = wanted_new_sents / 8 + 1
    cn.add_experiment_param("choose_amount_"+str(choose_amount))
    if "choose_amount" not in cn.experiment_purpose:
        cn.experiment_purpose += "curr_pool choose_amount="+str(choose_amount)+", "

    from ResearchNLP.knowledge_bases import kb_helper
    kb_helper.k_base.load_knowledgebase()  # explicitly load to help processes share memory
    # print kb_helper.kb_type

    didnt_advance_count = 0  # consecutive rounds in which nothing was chosen
    sent_pool = set(base_sents)  # every sentence seen so far, for fast membership tests
    current_pool = list(base_sents)  # sentences used as synthesis seeds in the next round
    sent_pool_df = base_training_df.copy(deep=True)  # grows by one row per chosen sentence
    print "total new sentences: " + str(wanted_new_sents)
    # keep going until more than wanted_new_sents sentences were added to the pool
    while len(sent_pool) - len(base_sents) <= wanted_new_sents:
        all_new_tuples = synthesize_tree_depth1_bulk(current_pool, sent_pool_df, col_names)
        # note: candidates are appended to base_training_df, not to the growing sent_pool_df
        combined_df = add_sentences_and_histories_to_df(base_training_df, col_names, all_new_tuples)
        combined_df = prepare_df_columns(combined_df, col_names)
        chosen_idxs = choose_bulk_method(combined_df, col_names, choose_amount, sent_pool)
        if len(chosen_idxs) == 0:
            didnt_advance_count += 1

        for idx in chosen_idxs:
            # append the chosen row at the end of sent_pool_df
            sent_pool_df.loc[len(sent_pool_df)] = combined_df.loc[idx].copy(deep=True)
            # add new example to sent pools
            new_sent = combined_df[col_names.text][idx]
            assert new_sent not in sent_pool, "the new sentence should not appear beforehand"
            current_pool.append(new_sent)
            sent_pool.add(new_sent)
            print_progress(len(sent_pool) - len(base_sents), total=wanted_new_sents)
            didnt_advance_count = 0  # progress was made, reset the stall counter

        sent_pool_df = prepare_df_columns(sent_pool_df, col_names)

        # give up after 50 consecutive rounds without a single chosen sentence
        if didnt_advance_count >= 50:
            print "didn't advance, stopping synthesis"
            break

    # use the already filled sent_pool_df
    # an empty exclusion set is passed, so no sentence is filtered out by text here
    final_chosen_idxs = choose_bulk_method(sent_pool_df, col_names, total_new_sents, set())
    new_sents_df = sent_pool_df.iloc[final_chosen_idxs].reset_index(drop=True)
    print "\ngenerated", len(new_sents_df), "sentences"
    return new_sents_df
Ejemplo n.º 3
0
def get_sentence_representation(sent):
    """
    Represent a sentence as the mean of the GloVe vectors of its words.

    Only the words returned by ``all_in_vocab(sent)`` are averaged. When none
    remain, a plain list of zeros with the length of one word vector is
    returned instead. The module-level counters ``total_diff`` (number of
    words dropped so far) and ``diff_count`` (number of non-empty sentences
    processed) are updated as a side effect.

    :param sent: the sentence text
    :return: the averaged vector, or a list of ``0.0`` for an empty reduction
    """
    global total_diff, diff_count
    import ResearchNLP.Constants as cn

    cn.add_experiment_param('glove300')

    known_words = all_in_vocab(sent).split()
    if not known_words:
        # print "reduced sent: " + str(sent)
        vector_size = len(glove_model.obj.word_vectors[0])
        return [0.0] * vector_size  # zeros representation

    total_diff += len(sent.split()) - len(known_words)
    diff_count += 1

    word_arrays = [
        glove_model.obj.word_vectors[glove_model.obj.dictionary[word]].__array__()
        for word in known_words
    ]
    return sum(word_arrays) / len(known_words)
def _choose_best_by_heuristic_fun(sent_df, col_names, count, ss_type, sent_pool):
    # type: (DataFrame, ColumnNames, int, SynState, set) -> list
    """
    Choose $count unlabeled rows of $sent_df according to a heuristic score.

    Rows whose tag column is null are the candidates; candidates whose text is
    already in $sent_pool are dropped. The remaining ones are scored with
    calculate_heuristic_bulk() and the final pick is delegated to
    _choose_by_heuristic_score_diverse_origins().

    :param sent_df: DataFrame holding labeled and unlabeled sentences
    :param col_names: contains the names of the columns of $sent_df
    :param count: how many rows to choose
    :param ss_type: the state type handed to the scoring function; its
                    ``__name__`` is recorded as an experiment parameter
    :param sent_pool: texts that must not be chosen
    :return: whatever _choose_by_heuristic_score_diverse_origins() returns
    """
    cn.add_experiment_param(ss_type.__name__)

    untagged_positions = pd.np.where(sent_df[col_names.tag].isnull())[0]
    untagged_texts = sent_df[col_names.text][untagged_positions]
    # keep only the indices whose sentence was not seen before
    candidate_idxs = [
        row_idx
        for row_idx, text in untagged_texts.iteritems()
        if text not in sent_pool
    ]
    assert len(candidate_idxs) >= count, "Not enough unlabeled instances to choose from (after filtering)"

    scored_candidates = calculate_heuristic_bulk(sent_df, col_names, ss_type, candidate_idxs)

    return _choose_by_heuristic_score_diverse_origins(sent_df, col_names, count, scored_candidates)
Ejemplo n.º 5
0
        def kfold_gain(train_set, dev_set, state_df, col_names):
            """
            Weighted F1 gain of adding row 0 of $state_df under both labels.

            Row 0 is tagged 0 and then 1 (in place, so $state_df is left tagged 1);
            for each tag the gain is the difference between the second and the
            first F1 score reported by scores_per_add_default(). The two gains are
            combined using p0 and p1, which come from an outer scope.
            """
            def gain_with_tag(tag_value):
                # tag the candidate row, then measure how F1 moves when it is added
                state_df.loc[0, col_names.tag] = tag_value
                _, score_results = scores_per_add_default(
                    state_df, train_set, dev_set)
                f1_scores = ExprScores.list_to_f1(score_results)
                # difference in f1 score. NOT NORMALIZED, but its supposed to be OK
                return f1_scores[1] - f1_scores[0]

            gain_if_tag0 = gain_with_tag(0)
            gain_if_tag1 = gain_with_tag(1)
            cn.add_experiment_param("5_spits_with_prob_kfold_gain")
            return p0 * gain_if_tag0 + p1 * gain_if_tag1
Ejemplo n.º 6
0
    def __init__(self, sent_df, col_names, init_text_state=None):
        # type: (pd.DataFrame, ColumnNames, str) -> None
        """
        Set up the problem over a pool of sentences.

        Increments the global instance counter ``cn.inst_count`` and records the
        name of ``cn.ss_type`` as an experiment parameter. ``self.initial_state``
        is only assigned when $init_text_state is given.

        :param sent_df: DataFrame with the sentence pool
        :param col_names: contains the names of the columns of $sent_df
        :param init_text_state: optional text of the sentence to start from; it
                                must appear in the text column of $sent_df
        """
        super(BestInstanceProblem, self).__init__()

        cn.inst_count += 1
        self.sent_pool_df = sent_df
        self.col_names = col_names

        state_type_name = cn.ss_type.__name__
        cn.add_experiment_param(state_type_name)

        if init_text_state is not None:
            text_matches = sent_df[col_names.text] == init_text_state
            matching_rows = sent_df[text_matches]
            assert len(matching_rows) != 0, "init_text_state not in send_df"
            # initial_state is used in BestInstanceProblem
            first_match_idx = matching_rows.index[0]
            self.initial_state = cn.ss_type(first_match_idx,
                                            self.sent_pool_df, col_names)
        self.init_states = None
Ejemplo n.º 7
0
def compare_generation_methods_pools():
    """
    Experiment: compare sentence-pool generation methods on a small training set.

    For every registered pool generator, a pool is generated from the balanced
    training sentences and then inserted example by example with
    insert_in_AL_fashion(); the scores collected along the way become one column
    of the result table.

    :return: a tuple (experiment_name, table_headers, data, plot function), where
             data[0] holds the "#added examples" axis and every following entry
             holds the scores of one pool generator
    """
    experiment_name = 'small_train_compare_generation_methods_pools'
    print experiment_name

    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset(
        print_expert_acc=False)
    all_sents = list(balanced_train_df[cn.col_names.text])
    tns = 40  # passed to every pool generator and used as the insertion quota

    # every entry is unpacked below as a (pool_name, prepare_pool_fun) pair;
    # the commented-out lines are alternative generators that can be re-enabled
    prepare_pools_funcs = list()
    # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg"))
    #
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 2))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 2))
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10))
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 0, use_enhanced=True))
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True))
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 2))
    # # prepare_pools_funcs.append(local_search_gen_template("random-score", 0))
    #
    # # prepare_pools_funcs.append(curr_pool_gen_template("test-data-gain"))
    # prepare_pools_funcs.append(generate_pool_using_random_synthesis())
    # prepare_pools_funcs.append(prepare_orig_examples_pools())
    # prepare_pools_funcs.append(generate_pool_lecun_augmentation())
    prepare_pools_funcs.append(generate_sents_using_lstm_generator())

    # score of a training set = accuracy of the classifier on the validation split
    final_scoring_fun = partial(
        lambda en_df: run_classifier(en_df, validation_data_df).acc)
    # find_heuristic() returns a factory; calling it yields the heuristic itself
    insertion_order_heuristic = find_heuristic("uncertainty lc LogReg")()
    # insertion_order_heuristic = find_heuristic("test-data-gain")()
    cn.add_experiment_param(insertion_order_heuristic.__name__)
    cn.experiment_purpose += "insertion order using " + insertion_order_heuristic.__name__ + " "
    print cn.experiment_purpose
    table_headers = ['#added examples']
    data = [[0]]  # data[0] is the x-axis; replaced below by the longest query_num seen

    for pool_name, prepare_pool_fun in prepare_pools_funcs:
        # baseline score, before any generated example is added
        init_score = final_scoring_fun(balanced_train_df)
        print pool_name
        gen_pool_df, labeled_pool_df = prepare_pool_fun(
            all_sents, balanced_train_df, cn.col_names, tns)
        trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, gen_pool_df,
                                                labeled_pool_df)
        print pool_name
        query_num, pool_insr_scores = insert_in_AL_fashion(
            trn_ds,
            final_scoring_fun,
            lbr,
            insertion_order_heuristic,
            labeled_pool_df,
            quota=tns)
        # query_num, pool_insr_scores = insert_in_batch_AL(trn_ds, final_scoring_fun, lbr, insertion_order_heuristic,
        #                                                  labeled_pool_df, quota=tns, batch_num=5)
        # the first reported score is overwritten with the baseline computed above
        pool_insr_scores[0] = init_score
        # keep the longest x-axis among all pools
        data[0] = query_num if len(data[0]) < len(query_num) else data[0]

        table_headers.append(pool_name)
        data.append(pool_insr_scores)

    return experiment_name, table_headers, data, plot_compare_generation_methods
Ejemplo n.º 8
0
def compare_pool_generation_methods_proper_al():
    """
    Experiment: compare pool generation methods inside an iterative AL loop.

    For every registered pool generator the training set is enriched step by
    step: each step generates a fresh pool from the current training sentences,
    selects examples from it, adds their labeled versions to the training set and
    records the resulting score.

    Relies on total_new_sents, pool_size_each_step and examples_at_each_step,
    which are not defined in this function and are expected from an outer scope.

    :return: a tuple (experiment_name, table_headers, data, plot function), where
             data[0] holds the "#added examples" axis and every following entry
             holds the list of scores of one pool generator
    """
    experiment_name = 'compare_pool_generation_methods_proper_AL'
    print experiment_name

    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset()
    all_sents = list(balanced_train_df[cn.col_names.text])

    # every entry is unpacked below as a (pool_name, prepare_pool_fun) pair;
    # the commented-out lines are alternative generators that can be re-enabled
    prepare_pools_funcs = list()
    # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg"))
    # prepare_pools_funcs.append(curr_pool_gen_template("random-score"))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 5))
    # prepare_pools_funcs.append(generate_pool_using_random_synthesis())

    # prepare_pools_funcs.append(prepare_orig_examples_pools())
    # prepare_pools_funcs.append(generate_pool_lecun_augmentation())
    prepare_pools_funcs.append(generate_sents_using_lstm_generator())

    # score of a training set = accuracy of the classifier on the validation split
    final_scoring_fun = partial(
        lambda en_df: run_classifier(en_df, validation_data_df).acc)

    cn.add_experiment_param("tns_" + str(total_new_sents))
    cn.add_experiment_param("pool_size" + str(pool_size_each_step))
    cn.add_experiment_param("batch_size" + str(examples_at_each_step))
    print cn.experiment_purpose

    def do_one_AL_cycle(pool_gen_fun, curr_training_df):
        """
        Runs a single step: generate a pool, select from it, enrich the training set.

        :return: (enriched training DataFrame, its score, done_generating), where
                 done_generating is True when the generator produced fewer than
                 examples_at_each_step sentences
        """
        done_generating = False
        sent_pool = set(curr_training_df[cn.col_names.text])
        gen_pool_df, labeled_pool_df = pool_gen_fun(list(sent_pool),
                                                    curr_training_df,
                                                    cn.col_names,
                                                    pool_size_each_step)
        if len(gen_pool_df) > examples_at_each_step:
            # NOTE(review): selection is given balanced_train_df (the initial
            # training set), not curr_training_df - confirm this is intended
            selected_instance_idxs = select_from_pool_uncertainty(
                gen_pool_df, balanced_train_df, cn.col_names, sent_pool,
                examples_at_each_step)
            labeled_instances_df = labeled_pool_df.iloc[
                selected_instance_idxs].copy(deep=True).reset_index(drop=True)
        else:
            labeled_instances_df = labeled_pool_df  # all there is, close enough.
            if len(gen_pool_df) < examples_at_each_step:
                done_generating = True

        enriched_train_df = pd.concat([curr_training_df, labeled_instances_df],
                                      ignore_index=True)
        return enriched_train_df, final_scoring_fun(
            enriched_train_df), done_generating

    table_headers = ['#added examples']
    # x-axis: 0, examples_at_each_step, 2 * examples_at_each_step, ...
    data = [
        range(0, total_new_sents + examples_at_each_step,
              examples_at_each_step)
    ]

    for pool_name, prepare_pool_fun in prepare_pools_funcs:
        start_time = time.time()
        print "starting {0} - {1}".format(pool_name, cn.data_name)

        curr_training_df = balanced_train_df.copy(deep=True)
        # first entry is the baseline score, before any example is added
        res_list = [final_scoring_fun(curr_training_df)]
        for i in range(0, total_new_sents,
                       examples_at_each_step):  # has to be serial
            print_progress(i, total=total_new_sents)

            sa = time.time()
            curr_training_df, curr_add_res, done = do_one_AL_cycle(
                prepare_pool_fun, curr_training_df)
            # when the generator ran dry the last score is not recorded,
            # so res_list can be shorter than data[0]
            if done:
                break
            res_list.append(curr_add_res)
            print "AL cycle took {0:.2f} s".format(time.time() - sa)

        print "{0} run time: {1:.2f} minutes - {2}".format(
            pool_name, (time.time() - start_time) / 60.0, cn.data_name)

        table_headers.append(pool_name)
        data.append(res_list)

    return experiment_name, table_headers, data, plot_compare_pool_generation_methods_proper_al
Ejemplo n.º 9
0
def parmap(f,
           X,
           nprocs=multiprocessing.cpu_count(),
           force_parallel=False,
           chunk_size=1):
    from ResearchNLP import Constants as cn
    from ResearchNLP.util_files import function_cache

    if len(X) == 0:
        return []  # like map

    # nprocs = min(nprocs, cn.max_procs)
    if nprocs != multiprocessing.cpu_count() and len(X) < nprocs * chunk_size:
        chunk_size = 1  # use chunk_size = 1 if there is enough procs for a batch size of 1
    nprocs = max(1, min(nprocs, len(X) / chunk_size))  # at least 1
    if len(X) < nprocs:
        if cn.verbose and nprocs != multiprocessing.cpu_count():
            print "parmap too much procs"
        nprocs = len(X)  # too much procs

    if nprocs == 1 or (cn.serial_parmap and not force_parallel
                       ):  # we want it serial (maybe for profiling)
        return map(f, X)

    def _spawn_fun(input, func):
        import random, numpy
        from ResearchNLP import Constants as cn2
        from ResearchNLP.util_files import function_cache as function_cache2
        random.seed(1554 + i)
        numpy.random.seed(42 + i)  # set random seeds
        try:
            res = func(input)
            res_dict = dict()
            res_dict["res"] = res
            res_dict["functions_dict"] = function_cache2.caches_dicts
            res_dict["experiment_purpose"] = cn2.experiment_purpose
            res_dict["curr_params_list"] = cn2.curr_experiment_params_list
            return res_dict
        except:
            import traceback
            traceback.print_exc()
            raise  # re-raise exception

    # if chunk_size == 1:
    #     chunk_size = math.ceil(float(len(X)) / nprocs)  # all procs work on an equal chunk

    try:  # try-catch hides bugs
        global proc_count
        old_proc_count = proc_count
        proc_count = nprocs
        p = Pool(nprocs)
        p.restart(force=True)
        retval_par = p.map(
            _spawn_fun, X, [f] * len(X),
            chunk_size=chunk_size)  # can throw if current proc is daemon
        p.terminate()
        for res_dict in retval_par:  # add all experiments params we missed
            curr_params_list = res_dict["curr_params_list"]
            for param in curr_params_list:
                cn.add_experiment_param(param)
        cn.experiment_purpose = retval_par[0][
            "experiment_purpose"]  # use the "experiment_purpose" from the fork
        function_cache.merge_cache_dicts_from_parallel_runs(
            map(lambda a: a["functions_dict"], retval_par))  # merge all
        retval = map(lambda res_dict: res_dict["res"],
                     retval_par)  # make it like the original map
        proc_count = old_proc_count
        global i
        i += 1
    except AssertionError as e:
        if e.message == "daemonic processes are not allowed to have children":
            retval = map(f, X)  # can't have pool inside pool
        else:
            print "error message is: " + str(e.message)
            raise  # re-raise orig exception
    return retval
Ejemplo n.º 10
0
def load_GloVe_model(relpath='glove_models/glove.6B.100d.txt'):
    """
    Switch the module-level knowledge base to a GloVe one.

    Sets the global ``kb_type`` to ``'GloVe'``, calls ``save_previous_model()``,
    registers the type as an experiment parameter and finally rebinds the
    global ``k_base`` to a freshly constructed ``GloveKB``.

    :param relpath: path of the model file, handed to ``GloveKB`` together
        with ``knowledge_bases_foldpath``
    """
    global kb_type
    global k_base

    kb_type = 'GloVe'
    save_previous_model()
    cn.add_experiment_param(kb_type)

    glove_kb = GloveKB(
        cn.distance_measure,
        knowledge_bases_foldpath,
        relpath,
    )
    k_base = glove_kb
Ejemplo n.º 11
0
def load_WordNet_model():
    """
    Switch the module-level knowledge base to a WordNet one.

    Sets the global ``kb_type`` to ``'WordNet'``, calls
    ``save_previous_model()``, registers the type as an experiment parameter
    and finally rebinds the global ``k_base`` to a freshly constructed
    ``WordNetKB``.
    """
    global kb_type
    global k_base

    kb_type = 'WordNet'
    save_previous_model()
    cn.add_experiment_param(kb_type)

    wordnet_kb = WordNetKB(cn.distance_measure)
    k_base = wordnet_kb
Ejemplo n.º 12
0
def load_dep_word2vec_model(relpath='word2vec_models/deps_trimmed.words'):
    """
    Switch the module-level knowledge base to a dependency-based Word2Vec one.

    Sets the global ``kb_type`` to ``'dep_w2v'``, calls
    ``save_previous_model()``, registers the type as an experiment parameter
    and finally rebinds the global ``k_base`` to a freshly constructed
    ``Word2VecKB``.

    :param relpath: path of the model file, handed to ``Word2VecKB`` together
        with ``knowledge_bases_foldpath``
    """
    global kb_type
    global k_base

    kb_type = 'dep_w2v'
    save_previous_model()
    cn.add_experiment_param(kb_type)

    dep_word2vec_kb = Word2VecKB(
        cn.distance_measure,
        knowledge_bases_foldpath,
        relpath,
    )
    k_base = dep_word2vec_kb