Example #1
def load_word2vec_model(
        relpath='word2vec_models/GoogleNews-vectors-negative300_trimmed.bin'):
    global k_base, kb_type
    kb_type = 'w2v'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = Word2VecKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
Example #2
def curr_pool_synthesis_from_sents(base_sents, base_training_df, col_names, total_new_sents=None,
                                   choose_bulk_method=choose_best_by_uncertainty):
    # type: (list, DataFrame, ColumnNames, int, callable) -> DataFrame
    """
    generates new examples based on $base_sents using a generic algorithm
    :param base_sents: list of base sents, from which we generate new ones
    :param base_training_df: DataFrame containing all labeled sentences
    :param col_names: contains the names of the columns in the output DataFrame
    :param total_new_sents: indicates the number of sentences we want to synthesize
    :param choose_bulk_method: a method for choosing sentences to be sent for generation
    :return: an unlabeled DataFrame with the new sentences generated
    """
    print "start curr pool search map"
    total_new_sents = pool_size(total_new_sents, base_sents)
    wanted_new_sents = int(total_new_sents * 4)
    choose_amount = wanted_new_sents / 8 + 1
    cn.add_experiment_param("choose_amount_"+str(choose_amount))
    if "choose_amount" not in cn.experiment_purpose:
        cn.experiment_purpose += "curr_pool choose_amount="+str(choose_amount)+", "

    from ResearchNLP.knowledge_bases import kb_helper
    kb_helper.k_base.load_knowledgebase()  # explicitly load to help processes share memory
    # print kb_helper.kb_type

    didnt_advance_count = 0
    sent_pool = set(base_sents)
    current_pool = list(base_sents)
    sent_pool_df = base_training_df.copy(deep=True)
    print "total new sentences: " + str(wanted_new_sents)
    while len(sent_pool) - len(base_sents) <= wanted_new_sents:  # generate 4x the request; the best quarter is chosen at the end
        all_new_tuples = synthesize_tree_depth1_bulk(current_pool, sent_pool_df, col_names)
        combined_df = add_sentences_and_histories_to_df(base_training_df, col_names, all_new_tuples)
        combined_df = prepare_df_columns(combined_df, col_names)
        chosen_idxs = choose_bulk_method(combined_df, col_names, choose_amount, sent_pool)
        if len(chosen_idxs) == 0:
            didnt_advance_count += 1

        for idx in chosen_idxs:
            sent_pool_df.loc[len(sent_pool_df)] = combined_df.loc[idx].copy(deep=True)
            # add new example to sent pools
            new_sent = combined_df[col_names.text][idx]
            assert new_sent not in sent_pool, "the new sentence should not appear beforehand"
            current_pool.append(new_sent)
            sent_pool.add(new_sent)
            print_progress(len(sent_pool) - len(base_sents), total=wanted_new_sents)
            didnt_advance_count = 0

        sent_pool_df = prepare_df_columns(sent_pool_df, col_names)

        if didnt_advance_count >= 50:
            print "didn't advance, stopping synthesis"
            break

    # use the already filled sent_pool_df
    final_chosen_idxs = choose_bulk_method(sent_pool_df, col_names, total_new_sents, set())
    new_sents_df = sent_pool_df.iloc[final_chosen_idxs].reset_index(drop=True)
    print "\ngenerated", len(new_sents_df), "sentences"
    return new_sents_df
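# Hedged usage sketch (not part of the original source): assumes cn.col_names and a
# labeled base_training_df are set up, and reuses helpers that appear elsewhere in
# this file set (get_all_positive_sentences, label_df_with_expert); 40 is arbitrary.
def _example_curr_pool_usage(base_training_df):
    pos_sents = pandas_util.get_all_positive_sentences(
        base_training_df, cn.col_names.text, cn.col_names.tag, cn.pos_tags)
    new_sents_df = curr_pool_synthesis_from_sents(
        pos_sents, base_training_df, cn.col_names, total_new_sents=40)
    return label_df_with_expert(new_sents_df, cn.col_names)  # label the unlabeled output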
Example #3
    def test_libact_first_try_results_are_the_same(self):
        """
        test that the first libact example works the same way as the original example taken from GitHub

        very long test!
        """

        # self.skipTest(reason="too long")

        cn.Inner_PredictionModel = SvmModel
        cn.Feature_Extractor = AvgGloveExtractor
        cn.load_codementor_sentiment_analysis_parameters()
        kb_helper.load_WordNet_model()

        quota = 5  # ask labeler to label 5 samples (tops)
        base_training_df, validation_data_df = prepare_balanced_dataset()
        pos_sents = pandas_util.get_all_positive_sentences(
            base_training_df, cn.col_names.text, cn.col_names.tag, cn.pos_tags)

        # prepare all data
        generated_pool_df = sg.generate_sents_using_random_synthesis(
            pos_sents, base_training_df, cn.col_names)
        labeled_pool_df = label_df_with_expert(generated_pool_df, cn.col_names)

        enriched_train_df = pd.concat([base_training_df, generated_pool_df],
                                      ignore_index=True)
        ideal_df = pd.concat([base_training_df, labeled_pool_df],
                             ignore_index=True)

        extractor = cn.Feature_Extractor(
            enriched_train_df, cn.col_names)  # build the feature extractor

        lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        jobs = []
        # first job
        p = multiprocessing.Process(target=self.libact_first_try_first_run,
                                    args=(enriched_train_df, extractor, lbr,
                                          quota, validation_data_df,
                                          return_dict))
        jobs.append(p)
        p.start()
        # second job
        p = multiprocessing.Process(target=self.libact_first_try_second_run,
                                    args=(enriched_train_df, extractor,
                                          ideal_df, lbr, quota,
                                          validation_data_df, return_dict))
        jobs.append(p)
        p.start()

        for proc in jobs:
            proc.join()

        self.assertTrue(np.array_equal(return_dict[1], return_dict[2]))
Example #4
def get_sentence_representation(sent):
    import ResearchNLP.Constants as cn
    cn.add_experiment_param('glove300')
    reduced_sent = all_in_vocab(sent).split()
    if len(reduced_sent) == 0:
        # print "reduced sent: " + str(sent)
        return [0.0] * len(glove_model.obj.word_vectors[0])  # zeros representation
    global total_diff, diff_count
    total_diff += len(sent.split()) - len(reduced_sent)
    diff_count += 1
    return sum(map(lambda word: glove_model.obj.word_vectors[glove_model.obj.dictionary[word]].__array__(), reduced_sent)) \
             / len(reduced_sent)
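# A self-contained sketch of the same averaging idea with a toy vocabulary; the
# glove_model / all_in_vocab machinery above is project-specific, so numpy stands in:
import numpy as np

def avg_word_vectors(sent, word_vectors, dim):
    in_vocab = [w for w in sent.split() if w in word_vectors]  # drop OOV words
    if len(in_vocab) == 0:
        return np.zeros(dim)  # zeros representation, as above
    return sum(word_vectors[w] for w in in_vocab) / len(in_vocab)

# avg_word_vectors('good movie', {'good': np.ones(3), 'movie': np.zeros(3)}, 3)
# -> array([ 0.5,  0.5,  0.5])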
Example #5
def _choose_best_by_heuristic_fun(sent_df, col_names, count, ss_type, sent_pool):
    # type: (DataFrame, ColumnNames, int, SynState, set) -> list
    cn.add_experiment_param(ss_type.__name__)

    unlabeled_idxs = pd.np.where(sent_df[col_names.tag].isnull())[0]
    idx_text_col = list(sent_df[col_names.text][unlabeled_idxs].iteritems())
    filtered_tpls = filter(lambda (idx, s): s not in sent_pool, idx_text_col)
    filtered_idxs = map(lambda (idx, s): idx, filtered_tpls)
    assert len(filtered_idxs) >= count, "Not enough unlabeled instances to choose from (after filtering)"

    score_idx_list = calculate_heuristic_bulk(sent_df, col_names, ss_type, filtered_idxs)

    return _choose_by_heuristic_score_diverse_origins(sent_df, col_names, count, score_idx_list)
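# Self-contained sketch of the unlabeled-row filtering done above, on a tiny
# DataFrame; the column names 'text'/'tag' are hypothetical stand-ins for col_names:
import numpy as np
import pandas as pd

toy_df = pd.DataFrame({'text': ['a b', 'c d', 'e f'], 'tag': [1.0, None, None]})
unlabeled_idxs = np.where(toy_df['tag'].isnull())[0]  # -> array([1, 2])
seen_pool = {'c d'}  # sentences already in the pool
fresh_idxs = [i for i in unlabeled_idxs if toy_df['text'][i] not in seen_pool]  # -> [2]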
Example #6
        def kfold_gain(train_set, dev_set, state_df, col_names):
            def depth1_gain(labeled_state_df):
                ex_added_list, res_list = scores_per_add_default(
                    labeled_state_df, train_set, dev_set)
                f1_list = ExprScores.list_to_f1(res_list)
                return f1_list[1] - f1_list[0]  # difference in f1 score. NOT NORMALIZED, but it's supposed to be OK

            state_df.loc[0, col_names.tag] = 0
            change0 = depth1_gain(state_df)
            state_df.loc[0, col_names.tag] = 1
            change1 = depth1_gain(state_df)
            cn.add_experiment_param("5_spits_with_prob_kfold_gain")
            return p0 * change0 + p1 * change1
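# The expected-gain idea above in isolation: label the candidate both ways, measure
# the f1 delta each labeling causes, and weight by the label probabilities p0/p1.
# A toy numeric check (all numbers made up):
p0, p1 = 0.3, 0.7                      # P(label=0), P(label=1)
change0, change1 = -0.02, 0.05         # f1 deltas when labeled 0 / 1
expected_gain = p0 * change0 + p1 * change1  # = 0.029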
Example #7
    def classify_df(self, unlabeled_df):
        # type: (pd.DataFrame) -> pd.DataFrame
        # if cn.balance_dataset:
        #     self.all_data_df = pandas_util.imbalance_dataset(cn.data_df, cn.tag_col, 0.5,
        #                                                      cn.pos_tags, cn.neg_tags)

        # build the feature extractor
        combined_df = pd.concat([self.all_data_df, unlabeled_df])
        extractor_cls = cn.Expert_PredictionModel.get_FeatureExtractor_cls()
        if extractor_cls is None:
            extractor_cls = cn.Expert_FeatureExtractor

        extractor = extractor_cls(combined_df, self.col_names)
        X_all = extractor.transform(
            combined_df, self.col_names)  # use pre-extracted features

        # extract all the features
        X_unlabeled = X_all[len(self.all_data_df[self.col_names.text]):]
        X_train = X_all[:len(self.all_data_df[self.col_names.text])]
        y_train = self.all_data_df[self.col_names.tag].tolist()

        model = cn.Expert_PredictionModel(
            X_train,
            y_train)  # expert model trains on all data (makes it an expert)

        y_pred = model.train_model_and_predict(X_unlabeled)
        # print y_pred.tolist()

        labeled_df = unlabeled_df.copy(deep=True)
        labeled_df[self.col_names.tag] = map(
            float,
            y_pred)  # add predictions to the df (simulates a human label)
        labeled_df = prepare_df_columns(labeled_df, self.col_names)

        return labeled_df  # return the now tagged df
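# Hedged usage sketch (the wiring is an assumption; load_machine_expert appears in
# the test setup below): the expert trains on all labeled data, then classify_df
# simulates a human labeler on an unlabeled pool.
# labeled_pool_df = expert.classify_df(generated_pool_df)
# labeled_pool_df[cn.col_names.tag].notnull().all()  # every row now has a float tag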
Example #8
    def __init__(self, state_idx, sent_df, col_names, prev_state=None):
        # type: (int, DataFrame, ColumnNames, SynStateFurthestFromPos) -> None
        super(SynStateFurthestFeatureSp,
              self).__init__(state_idx, sent_df, col_names, prev_state)

        relevant_idxs = list(
            self.sent_df[self.sent_df[col_names.tag].notnull()].index)
        # relevant_idxs.remove(state_idx)
        # relevant_idxs = np.array(relevant_idxs)
        if self.col_names.feature_repr in self.sent_df.columns:
            self.sent_repr = self.sent_df[self.col_names.feature_repr][
                self.state_idx]
            self.min_dist = np.min(map(
                lambda (i, r): LA.norm(
                    r[col_names.feature_repr] - self.sent_repr, 1)**2,
                self.sent_df.iloc[relevant_idxs].iterrows()), axis=0)
        else:
            extractor = cn.Feature_Extractor(
                sent_df,
                col_names)  # this is actually faster than DataFrame.append()
            X_all = extractor.transform(sent_df, col_names)
            self.sent_repr = X_all[self.state_idx]

            self.min_dist = np.min(
                map(lambda feat: LA.norm(feat - self.sent_repr, 1)**2,
                    X_all[relevant_idxs]))
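# The distance computation above, reduced to plain numpy: squared L1 distance from
# one representation to its nearest labeled neighbour (toy vectors, no project code):
import numpy as np
from numpy import linalg as LA

sent_repr = np.array([1.0, 0.0])
labeled_reprs = np.array([[0.0, 0.0], [3.0, 1.0]])
min_dist = np.min([LA.norm(row - sent_repr, 1) ** 2 for row in labeled_reprs])  # 1.0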
Example #9
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple

    gen_pool_df = labeled_pool_df.copy(deep=True)
    gen_pool_df[cn.col_names.tag] = [np.NaN] * len(
        gen_pool_df)  # clear all tags
    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)

    extractor = cn.Feature_Extractor(
        enriched_train_df, cn.col_names)  # build the feature extractor

    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    scoring_fun = lambda ds: run_classifier(ds.extract_labeled_dataframe(),
                                            validation_data_df)
    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))  # label all df

    return ex_added_list, res_list
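# Hedged usage sketch, mirroring the test code elsewhere in this file set
# (prepare_balanced_dataset and label_df_with_expert are assumed available):
# base_df, validation_df = prepare_balanced_dataset()
# labeled_pool_df = label_df_with_expert(generated_pool_df, cn.col_names)
# ex_added_list, res_list = score_per_add_al(labeled_pool_df, base_df, validation_df)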
Example #10
    def test_heuristic_test_data_gain_works(self):
        cn.load_codementor_sentiment_analysis_parameters()
        kb_helper.load_WordNet_model()

        base_train_df, pool_df, validation_data_df = prepare_pool_based_dataset()
        all_sents = list(base_train_df[cn.col_names.text])
        tns = 2

        pool_name, prep_pools = ("orig pool", lambda *_: (
            pool_df.iloc[:tns + 5], cn.labeled_pool_df.iloc[:tns + 5]))
        train_with_pool_df = pd.concat([base_train_df, pool_df],
                                       ignore_index=True)
        generated_pool_df, labeled_pool_df = prep_pools(
            all_sents, train_with_pool_df, cn.col_names, tns)
        cn.experiment_purpose += pool_name + " "

        trn_ds, lbr, extractor = prepare_trn_ds(base_train_df,
                                                generated_pool_df,
                                                labeled_pool_df)
        final_scoring_fun = partial(
            lambda en_df: run_classifier(en_df, validation_data_df).acc)

        table_headers = ['#added examples']
        data = [range(0, tns + 1)]
        compared_heuristics = [("test-data-gain",
                                lambda: SynStateTestDataGain),
                               ("random", lambda: "random")]

        for (heuristic_name, prepare_usage) in compared_heuristics:
            ss_type = prepare_usage()
            table_headers.append(heuristic_name)
            print heuristic_name
            _, heur_scores = insert_in_AL_fashion(trn_ds,
                                                  final_scoring_fun,
                                                  lbr,
                                                  ss_type,
                                                  labeled_pool_df,
                                                  quota=tns)
            data.append(heur_scores)

        print data[1]
        print data[2]
        self.assertEqual(data[1][0], data[2][0], "starts are the same")
        self.assertGreater(data[1][-1], data[2][-1],
                           "test-data-gain should be better than random")
Example #11
    def __init__(self, sent_df, col_names, init_text_state=None):
        # type: (pd.DataFrame, ColumnNames, str) -> None
        super(BestInstanceProblem, self).__init__()

        cn.inst_count += 1
        self.sent_pool_df = sent_df
        self.col_names = col_names

        cn.add_experiment_param(cn.ss_type.__name__)

        if init_text_state is not None:
            init_row = sent_df[sent_df[col_names.text] == init_text_state]
            assert len(init_row) != 0, "init_text_state not in sent_df"
            # initial_state is used in BestInstanceProblem
            self.initial_state = cn.ss_type(init_row.index[0],
                                            self.sent_pool_df, col_names)
        self.init_states = None
Example #12
def prepare_trn_ds(balanced_train_df, generated_pool_df, labeled_pool_df):
    enriched_train_df = pd.concat([balanced_train_df, generated_pool_df], ignore_index=True)

    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)  # build the feature extractor
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

    ideal_df = pd.concat([balanced_train_df, labeled_pool_df], ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))
    return trn_ds, lbr, extractor
Example #13
def prepare_classifier(train_data_df, validation_data_df, col_names):
    # type: (pd.DataFrame, pd.DataFrame, ColumnNames) -> (PredictionModel, np.ndarray, np.ndarray)

    # build the feature extractor
    extractor = cn.Feature_Extractor(train_data_df, col_names)  # this is actually faster than DataFrame.append()

    # extract all the features
    combined_df = pd.concat([train_data_df, validation_data_df])
    X_all = extractor.transform(combined_df, col_names)  # save time by using fit_transform

    X_train = X_all[:len(train_data_df[col_names.text])]
    y_train = train_data_df[col_names.tag].tolist()
    X_test = X_all[len(train_data_df[col_names.text]):]
    y_test = validation_data_df[col_names.tag]

    model = cn.Inner_PredictionModel(X_train, y_train)  # expert model trains on all data (makes it an expert)

    return model, X_test, np.array(y_test, dtype=int)
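# Hedged usage sketch: train on the returned split and score it; accuracy_score here
# stands in for whatever metric the project's run_classifier computes, and
# train_model_and_predict is the method name used in classify_df above.
# from sklearn.metrics import accuracy_score
# model, X_test, y_test = prepare_classifier(train_df, validation_df, cn.col_names)
# print accuracy_score(y_test, model.train_model_and_predict(X_test))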
Example #14
    def generate_random_state(self):
        # type: () -> SynState
        if self.init_states is None:
            positive_df = pandas_util.all_positive_rows_df(
                self.sent_pool_df, self.col_names.tag, cn.pos_tags)
            self.init_states = map(
                lambda (idx, text_state): cn.ss_type(idx, self.sent_pool_df,
                                                     self.col_names),
                positive_df[self.col_names.text].iteritems())
        return random.sample(self.init_states, 1)[0]  # not used in hill climbing
Example #15
def compare_generation_methods_pools():
    experiment_name = 'small_train_compare_generation_methods_pools'
    print experiment_name

    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset(
        print_expert_acc=False)
    all_sents = list(balanced_train_df[cn.col_names.text])
    tns = 40

    prepare_pools_funcs = list()
    # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg"))
    #
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 2))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 2))
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10))
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 0, use_enhanced=True))
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True))
    # # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 2))
    # # prepare_pools_funcs.append(local_search_gen_template("random-score", 0))
    #
    # # prepare_pools_funcs.append(curr_pool_gen_template("test-data-gain"))
    # prepare_pools_funcs.append(generate_pool_using_random_synthesis())
    # prepare_pools_funcs.append(prepare_orig_examples_pools())
    # prepare_pools_funcs.append(generate_pool_lecun_augmentation())
    prepare_pools_funcs.append(generate_sents_using_lstm_generator())

    final_scoring_fun = partial(
        lambda en_df: run_classifier(en_df, validation_data_df).acc)
    insertion_order_heuristic = find_heuristic("uncertainty lc LogReg")()
    # insertion_order_heuristic = find_heuristic("test-data-gain")()
    cn.add_experiment_param(insertion_order_heuristic.__name__)
    cn.experiment_purpose += "insertion order using " + insertion_order_heuristic.__name__ + " "
    print cn.experiment_purpose
    table_headers = ['#added examples']
    data = [[0]]

    for pool_name, prepare_pool_fun in prepare_pools_funcs:
        init_score = final_scoring_fun(balanced_train_df)
        print pool_name
        gen_pool_df, labeled_pool_df = prepare_pool_fun(
            all_sents, balanced_train_df, cn.col_names, tns)
        trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, gen_pool_df,
                                                labeled_pool_df)
        print pool_name
        query_num, pool_insr_scores = insert_in_AL_fashion(
            trn_ds,
            final_scoring_fun,
            lbr,
            insertion_order_heuristic,
            labeled_pool_df,
            quota=tns)
        # query_num, pool_insr_scores = insert_in_batch_AL(trn_ds, final_scoring_fun, lbr, insertion_order_heuristic,
        #                                                  labeled_pool_df, quota=tns, batch_num=5)
        pool_insr_scores[0] = init_score
        data[0] = query_num if len(data[0]) < len(query_num) else data[0]

        table_headers.append(pool_name)
        data.append(pool_insr_scores)

    return experiment_name, table_headers, data, plot_compare_generation_methods
Example #16
def compare_pool_generation_methods_proper_al():
    experiment_name = 'compare_pool_generation_methods_proper_AL'
    print experiment_name

    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset()
    all_sents = list(balanced_train_df[cn.col_names.text])

    prepare_pools_funcs = list()
    # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg"))
    # prepare_pools_funcs.append(curr_pool_gen_template("random-score"))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 5))
    # prepare_pools_funcs.append(generate_pool_using_random_synthesis())

    # prepare_pools_funcs.append(prepare_orig_examples_pools())
    # prepare_pools_funcs.append(generate_pool_lecun_augmentation())
    prepare_pools_funcs.append(generate_sents_using_lstm_generator())

    final_scoring_fun = partial(
        lambda en_df: run_classifier(en_df, validation_data_df).acc)

    cn.add_experiment_param("tns_" + str(total_new_sents))
    cn.add_experiment_param("pool_size" + str(pool_size_each_step))
    cn.add_experiment_param("batch_size" + str(examples_at_each_step))
    print cn.experiment_purpose

    def do_one_AL_cycle(pool_gen_fun, curr_training_df):
        done_generating = False
        sent_pool = set(curr_training_df[cn.col_names.text])
        gen_pool_df, labeled_pool_df = pool_gen_fun(list(sent_pool),
                                                    curr_training_df,
                                                    cn.col_names,
                                                    pool_size_each_step)
        if len(gen_pool_df) > examples_at_each_step:
            selected_instance_idxs = select_from_pool_uncertainty(
                gen_pool_df, balanced_train_df, cn.col_names, sent_pool,
                examples_at_each_step)
            labeled_instances_df = labeled_pool_df.iloc[
                selected_instance_idxs].copy(deep=True).reset_index(drop=True)
        else:
            labeled_instances_df = labeled_pool_df  # all there is, close enough.
            if len(gen_pool_df) < examples_at_each_step:
                done_generating = True

        enriched_train_df = pd.concat([curr_training_df, labeled_instances_df],
                                      ignore_index=True)
        return enriched_train_df, final_scoring_fun(
            enriched_train_df), done_generating

    table_headers = ['#added examples']
    data = [
        range(0, total_new_sents + examples_at_each_step,
              examples_at_each_step)
    ]

    for pool_name, prepare_pool_fun in prepare_pools_funcs:
        start_time = time.time()
        print "starting {0} - {1}".format(pool_name, cn.data_name)

        curr_training_df = balanced_train_df.copy(deep=True)
        res_list = [final_scoring_fun(curr_training_df)]
        for i in range(0, total_new_sents,
                       examples_at_each_step):  # has to be serial
            print_progress(i, total=total_new_sents)

            sa = time.time()
            curr_training_df, curr_add_res, done = do_one_AL_cycle(
                prepare_pool_fun, curr_training_df)
            if done:
                break
            res_list.append(curr_add_res)
            print "AL cycle took {0:.2f} s".format(time.time() - sa)

        print "{0} run time: {1:.2f} minutes - {2}".format(
            pool_name, (time.time() - start_time) / 60.0, cn.data_name)

        table_headers.append(pool_name)
        data.append(res_list)

    return experiment_name, table_headers, data, plot_compare_pool_generation_methods_proper_al
Example #17
def parmap(f,
           X,
           nprocs=multiprocessing.cpu_count(),
           force_parallel=False,
           chunk_size=1):
    from ResearchNLP import Constants as cn
    from ResearchNLP.util_files import function_cache

    if len(X) == 0:
        return []  # like map

    # nprocs = min(nprocs, cn.max_procs)
    if nprocs != multiprocessing.cpu_count() and len(X) < nprocs * chunk_size:
        chunk_size = 1  # use chunk_size = 1 if there are enough procs for a batch size of 1
    nprocs = max(1, min(nprocs, len(X) / chunk_size))  # at least 1
    if len(X) < nprocs:
        if cn.verbose and nprocs != multiprocessing.cpu_count():
            print "parmap too much procs"
        nprocs = len(X)  # too much procs

    if nprocs == 1 or (cn.serial_parmap and not force_parallel):  # we want it serial (maybe for profiling)
        return map(f, X)

    def _spawn_fun(input, func):
        import random, numpy
        from ResearchNLP import Constants as cn2
        from ResearchNLP.util_files import function_cache as function_cache2
        random.seed(1554 + i)
        numpy.random.seed(42 + i)  # set random seeds
        try:
            res = func(input)
            res_dict = dict()
            res_dict["res"] = res
            res_dict["functions_dict"] = function_cache2.caches_dicts
            res_dict["experiment_purpose"] = cn2.experiment_purpose
            res_dict["curr_params_list"] = cn2.curr_experiment_params_list
            return res_dict
        except:
            import traceback
            traceback.print_exc()
            raise  # re-raise exception

    # if chunk_size == 1:
    #     chunk_size = math.ceil(float(len(X)) / nprocs)  # all procs work on an equal chunk

    try:  # try-catch hides bugs
        global proc_count
        old_proc_count = proc_count
        proc_count = nprocs
        p = Pool(nprocs)
        p.restart(force=True)
        retval_par = p.map(
            _spawn_fun, X, [f] * len(X),
            chunk_size=chunk_size)  # can throw if current proc is daemon
        p.terminate()
        for res_dict in retval_par:  # add all experiments params we missed
            curr_params_list = res_dict["curr_params_list"]
            for param in curr_params_list:
                cn.add_experiment_param(param)
        cn.experiment_purpose = retval_par[0]["experiment_purpose"]  # use the "experiment_purpose" from the fork
        function_cache.merge_cache_dicts_from_parallel_runs(
            map(lambda a: a["functions_dict"], retval_par))  # merge all
        retval = map(lambda res_dict: res_dict["res"],
                     retval_par)  # make it like the original map
        proc_count = old_proc_count
        global i
        i += 1
    except AssertionError as e:
        if e.message == "daemonic processes are not allowed to have children":
            retval = map(f, X)  # can't have pool inside pool
        else:
            print "error message is: " + str(e.message)
            raise  # re-raise orig exception
    return retval
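# Minimal usage sketch: parmap behaves like the built-in map, splitting X across
# worker processes (a named function is used here since workers must pickle it):
if __name__ == '__main__':
    def _square(x):
        return x * x
    print parmap(_square, range(10), nprocs=2)  # [0, 1, 4, 9, ..., 81]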
Example #18
    @classmethod
    def setUpClass(cls):
        cn.load_codementor_sentiment_analysis_parameters()
        ce.load_machine_expert(cn.col_names, cn.relevant_tags, cn.data_df)
Example #19
    @classmethod
    def setUpClass(cls):
        cn.load_codementor_sentiment_analysis_parameters(d_measure=10)
        cn.data_df = pretokenize_df(cn.data_df, cn.col_names)
Example #20
def load_GloVe_model(relpath='glove_models/glove.6B.100d.txt'):
    global k_base, kb_type
    kb_type = 'GloVe'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = GloveKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
Example #21
def load_WordNet_model():
    global k_base, kb_type
    kb_type = 'WordNet'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = WordNetKB(cn.distance_measure)
Example #22
def load_dep_word2vec_model(relpath='word2vec_models/deps_trimmed.words'):
    global k_base, kb_type
    kb_type = 'dep_w2v'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = Word2VecKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
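# These loaders all share one pattern: save the previous model, record the kb type
# as an experiment param, and rebind the global k_base. Hedged usage, as seen in
# the tests above (via the kb_helper module):
# from ResearchNLP.knowledge_bases import kb_helper
# kb_helper.load_WordNet_model()          # or load_GloVe_model() / load_word2vec_model()
# kb_helper.k_base.load_knowledgebase()   # explicit load helps processes share memory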
Example #23
def build_feature_extractor(sent_df, col_names):
    # type: (pd.DataFrame, ColumnNames) -> FeatureExtractor
    """
    Builds and returns a feature extractor using sent_df
    """
    return cn.Feature_Extractor(sent_df, col_names)  # build the feature extractor