def load_word2vec_model(relpath='word2vec_models/GoogleNews-vectors-negative300_trimmed.bin'):
    global k_base, kb_type
    kb_type = 'w2v'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = Word2VecKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
def curr_pool_synthesis_from_sents(base_sents, base_training_df, col_names, total_new_sents=None,
                                   choose_bulk_method=choose_best_by_uncertainty):
    # type: (list, DataFrame, ColumnNames, int, callable) -> DataFrame
    """
    Generates new examples based on $base_sents using a generic algorithm.

    :param base_sents: list of base sentences from which we generate new ones
    :param base_training_df: DataFrame containing all labeled sentences
    :param col_names: contains the names of the columns in the output DataFrame
    :param total_new_sents: the number of sentences we want to synthesize
    :param choose_bulk_method: a method for choosing sentences to be sent for generation
    :return: an unlabeled DataFrame with the newly generated sentences
    """
    print "start curr pool search map"
    total_new_sents = pool_size(total_new_sents, base_sents)
    wanted_new_sents = int(total_new_sents * 4)
    choose_amount = wanted_new_sents / 8 + 1
    cn.add_experiment_param("choose_amount_" + str(choose_amount))
    if "choose_amount" not in cn.experiment_purpose:
        cn.experiment_purpose += "curr_pool choose_amount=" + str(choose_amount) + ", "

    from ResearchNLP.knowledge_bases import kb_helper
    kb_helper.k_base.load_knowledgebase()  # explicitly load to help processes share memory
    # print kb_helper.kb_type

    didnt_advance_count = 0
    sent_pool = set(base_sents)
    current_pool = list(base_sents)
    sent_pool_df = base_training_df.copy(deep=True)
    print "total new sentences: " + str(wanted_new_sents)
    while len(sent_pool) - len(base_sents) <= wanted_new_sents:  # gen quarter size
        all_new_tuples = synthesize_tree_depth1_bulk(current_pool, sent_pool_df, col_names)
        combined_df = add_sentences_and_histories_to_df(base_training_df, col_names, all_new_tuples)
        combined_df = prepare_df_columns(combined_df, col_names)
        chosen_idxs = choose_bulk_method(combined_df, col_names, choose_amount, sent_pool)
        if len(chosen_idxs) == 0:
            didnt_advance_count += 1
        for idx in chosen_idxs:
            sent_pool_df.loc[len(sent_pool_df)] = combined_df.loc[idx].copy(deep=True)  # add new example to sent pools
            new_sent = combined_df[col_names.text][idx]
            assert new_sent not in sent_pool, "the new sentence should not appear beforehand"
            current_pool.append(new_sent)
            sent_pool.add(new_sent)
            print_progress(len(sent_pool) - len(base_sents), total=wanted_new_sents)
            didnt_advance_count = 0
        sent_pool_df = prepare_df_columns(sent_pool_df, col_names)
        if didnt_advance_count >= 50:
            print "didn't advance, stopping synthesis"
            break

    # use the already-filled sent_pool_df
    final_chosen_idxs = choose_bulk_method(sent_pool_df, col_names, total_new_sents, set())
    new_sents_df = sent_pool_df.iloc[final_chosen_idxs].reset_index(drop=True)
    print "\ngenerated", len(new_sents_df), "sentences"
    return new_sents_df
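# Hedged usage sketch (illustrative only, not part of the original module). It assumes
# cn.col_names exposes the text/tag column names used throughout the repo; the toy
# DataFrame, the function name below, and the chosen total_new_sents are made up.
def _example_curr_pool_synthesis():
    import pandas as pd
    from ResearchNLP import Constants as cn
    toy_df = pd.DataFrame({cn.col_names.text: ["good movie", "awful movie"],
                           cn.col_names.tag: [1.0, 0.0]})
    base_sents = list(toy_df[cn.col_names.text])
    # ask for 10 new unlabeled sentences, chosen by the default uncertainty heuristic
    return curr_pool_synthesis_from_sents(base_sents, toy_df, cn.col_names, total_new_sents=10)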
def get_sentence_representation(sent):
    import ResearchNLP.Constants as cn
    cn.add_experiment_param('glove300')
    reduced_sent = all_in_vocab(sent).split()
    if len(reduced_sent) == 0:
        # print "reduced sent: " + str(sent)
        return [0.0] * len(glove_model.obj.word_vectors[0])  # zeros representation
    global total_diff, diff_count
    total_diff += len(sent.split()) - len(reduced_sent)
    diff_count += 1
    return sum(map(lambda word: glove_model.obj.word_vectors[glove_model.obj.dictionary[word]].__array__(),
                   reduced_sent)) / len(reduced_sent)
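# Hedged usage sketch (illustrative only): the helper name and input sentence below are
# invented for demonstration; the call simply shows the expected output shape, i.e. an
# averaged GloVe vector, or an all-zeros list when no word is in-vocabulary.
def _example_sentence_representation():
    vec = get_sentence_representation("the movie was surprisingly good")
    return len(vec)  # the GloVe embedding dimension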
def _choose_best_by_heuristic_fun(sent_df, col_names, count, ss_type, sent_pool):
    # type: (DataFrame, ColumnNames, int, SynState, set) -> list
    cn.add_experiment_param(ss_type.__name__)
    unlabeled_idxs = pd.np.where(sent_df[col_names.tag].isnull())[0]
    idx_text_col = list(sent_df[col_names.text][unlabeled_idxs].iteritems())
    filtered_tpls = filter(lambda (idx, s): s not in sent_pool, idx_text_col)
    filtered_idxs = map(lambda (idx, s): idx, filtered_tpls)
    assert len(filtered_idxs) >= count, "Not enough unlabeled instances to choose from (after filtering)"
    score_idx_list = calculate_heuristic_bulk(sent_df, col_names, ss_type, filtered_idxs)
    return _choose_by_heuristic_score_diverse_origins(sent_df, col_names, count, score_idx_list)
def kfold_gain(train_set, dev_set, state_df, col_names):
    def depth1_gain(labeled_state_df):
        ex_added_list, res_list = scores_per_add_default(labeled_state_df, train_set, dev_set)
        f1_list = ExprScores.list_to_f1(res_list)
        return f1_list[1] - f1_list[0]  # difference in F1 score. NOT NORMALIZED, but it's supposed to be OK

    state_df.loc[0, col_names.tag] = 0
    change0 = depth1_gain(state_df)
    state_df.loc[0, col_names.tag] = 1
    change1 = depth1_gain(state_df)
    cn.add_experiment_param("5_splits_with_prob_kfold_gain")
    return p0 * change0 + p1 * change1
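# Note (assumption, not stated in the original code): p0 and p1 are expected to be defined
# at an enclosing scope as the class-probability weights for labels 0 and 1, so kfold_gain
# returns the probability-weighted expected F1 gain of labeling the candidate row either way.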
def __init__(self, sent_df, col_names, init_text_state=None):
    # type: (pd.DataFrame, ColumnNames, str) -> None
    super(BestInstanceProblem, self).__init__()
    cn.inst_count += 1
    self.sent_pool_df = sent_df
    self.col_names = col_names
    cn.add_experiment_param(cn.ss_type.__name__)
    if init_text_state is not None:
        init_row = sent_df[sent_df[col_names.text] == init_text_state]
        assert len(init_row) != 0, "init_text_state not in sent_df"
        # initial_state is used in BestInstanceProblem
        self.initial_state = cn.ss_type(init_row.index[0], self.sent_pool_df, col_names)
    self.init_states = None
def compare_generation_methods_pools():
    experiment_name = 'small_train_compare_generation_methods_pools'
    print experiment_name
    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset(print_expert_acc=False)
    all_sents = list(balanced_train_df[cn.col_names.text])
    tns = 40

    prepare_pools_funcs = list()
    # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg"))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 2))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 0, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 2))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 0))
    # prepare_pools_funcs.append(curr_pool_gen_template("test-data-gain"))
    # prepare_pools_funcs.append(generate_pool_using_random_synthesis())
    # prepare_pools_funcs.append(prepare_orig_examples_pools())
    # prepare_pools_funcs.append(generate_pool_lecun_augmentation())
    prepare_pools_funcs.append(generate_sents_using_lstm_generator())

    final_scoring_fun = partial(lambda en_df: run_classifier(en_df, validation_data_df).acc)
    insertion_order_heuristic = find_heuristic("uncertainty lc LogReg")()
    # insertion_order_heuristic = find_heuristic("test-data-gain")()
    cn.add_experiment_param(insertion_order_heuristic.__name__)
    cn.experiment_purpose += "insertion order using " + insertion_order_heuristic.__name__ + " "
    print cn.experiment_purpose

    table_headers = ['#added examples']
    data = [[0]]
    for pool_name, prepare_pool_fun in prepare_pools_funcs:
        init_score = final_scoring_fun(balanced_train_df)
        print pool_name
        gen_pool_df, labeled_pool_df = prepare_pool_fun(all_sents, balanced_train_df, cn.col_names, tns)
        trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, gen_pool_df, labeled_pool_df)
        print pool_name
        query_num, pool_insr_scores = insert_in_AL_fashion(trn_ds, final_scoring_fun, lbr,
                                                           insertion_order_heuristic, labeled_pool_df, quota=tns)
        # query_num, pool_insr_scores = insert_in_batch_AL(trn_ds, final_scoring_fun, lbr, insertion_order_heuristic,
        #                                                  labeled_pool_df, quota=tns, batch_num=5)
        pool_insr_scores[0] = init_score
        data[0] = query_num if len(data[0]) < len(query_num) else data[0]
        table_headers.append(pool_name)
        data.append(pool_insr_scores)

    return experiment_name, table_headers, data, plot_compare_generation_methods
def compare_pool_generation_methods_proper_al():
    experiment_name = 'compare_pool_generation_methods_proper_AL'
    print experiment_name
    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset()
    all_sents = list(balanced_train_df[cn.col_names.text])

    prepare_pools_funcs = list()
    # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg"))
    # prepare_pools_funcs.append(curr_pool_gen_template("random-score"))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 5))
    # prepare_pools_funcs.append(generate_pool_using_random_synthesis())
    # prepare_pools_funcs.append(prepare_orig_examples_pools())
    # prepare_pools_funcs.append(generate_pool_lecun_augmentation())
    prepare_pools_funcs.append(generate_sents_using_lstm_generator())

    final_scoring_fun = partial(lambda en_df: run_classifier(en_df, validation_data_df).acc)
    cn.add_experiment_param("tns_" + str(total_new_sents))
    cn.add_experiment_param("pool_size" + str(pool_size_each_step))
    cn.add_experiment_param("batch_size" + str(examples_at_each_step))
    print cn.experiment_purpose

    def do_one_AL_cycle(pool_gen_fun, curr_training_df):
        done_generating = False
        sent_pool = set(curr_training_df[cn.col_names.text])
        gen_pool_df, labeled_pool_df = pool_gen_fun(list(sent_pool), curr_training_df,
                                                    cn.col_names, pool_size_each_step)
        if len(gen_pool_df) > examples_at_each_step:
            selected_instance_idxs = select_from_pool_uncertainty(gen_pool_df, balanced_train_df, cn.col_names,
                                                                  sent_pool, examples_at_each_step)
            labeled_instances_df = labeled_pool_df.iloc[selected_instance_idxs].copy(deep=True).reset_index(drop=True)
        else:
            labeled_instances_df = labeled_pool_df  # all there is, close enough
        if len(gen_pool_df) < examples_at_each_step:
            done_generating = True
        enriched_train_df = pd.concat([curr_training_df, labeled_instances_df], ignore_index=True)
        return enriched_train_df, final_scoring_fun(enriched_train_df), done_generating

    table_headers = ['#added examples']
    data = [range(0, total_new_sents + examples_at_each_step, examples_at_each_step)]
    for pool_name, prepare_pool_fun in prepare_pools_funcs:
        start_time = time.time()
        print "starting {0} - {1}".format(pool_name, cn.data_name)
        curr_training_df = balanced_train_df.copy(deep=True)
        res_list = [final_scoring_fun(curr_training_df)]
        for i in range(0, total_new_sents, examples_at_each_step):  # has to be serial
            print_progress(i, total=total_new_sents)
            sa = time.time()
            curr_training_df, curr_add_res, done = do_one_AL_cycle(prepare_pool_fun, curr_training_df)
            if done:
                break
            res_list.append(curr_add_res)
            print "AL cycle took {0:.2f} s".format(time.time() - sa)
        print "{0} run time: {1:.2f} minutes - {2}".format(pool_name, (time.time() - start_time) / 60.0, cn.data_name)
        table_headers.append(pool_name)
        data.append(res_list)

    return experiment_name, table_headers, data, plot_compare_pool_generation_methods_proper_al
def parmap(f, X, nprocs=multiprocessing.cpu_count(), force_parallel=False, chunk_size=1):
    from ResearchNLP import Constants as cn
    from ResearchNLP.util_files import function_cache
    if len(X) == 0:
        return []  # like map
    # nprocs = min(nprocs, cn.max_procs)
    if nprocs != multiprocessing.cpu_count() and len(X) < nprocs * chunk_size:
        chunk_size = 1  # use chunk_size = 1 if there are enough procs for a batch size of 1
    nprocs = max(1, min(nprocs, len(X) / chunk_size))  # at least 1
    if len(X) < nprocs:
        if cn.verbose and nprocs != multiprocessing.cpu_count():
            print "parmap: too many procs"
        nprocs = len(X)  # too many procs
    if nprocs == 1 or (cn.serial_parmap and not force_parallel):  # we want it serial (maybe for profiling)
        return map(f, X)

    def _spawn_fun(input, func):
        import random, numpy
        from ResearchNLP import Constants as cn2
        from ResearchNLP.util_files import function_cache as function_cache2
        random.seed(1554 + i)
        numpy.random.seed(42 + i)  # set random seeds
        try:
            res = func(input)
            res_dict = dict()
            res_dict["res"] = res
            res_dict["functions_dict"] = function_cache2.caches_dicts
            res_dict["experiment_purpose"] = cn2.experiment_purpose
            res_dict["curr_params_list"] = cn2.curr_experiment_params_list
            return res_dict
        except:
            import traceback
            traceback.print_exc()
            raise  # re-raise exception

    # if chunk_size == 1:
    #     chunk_size = math.ceil(float(len(X)) / nprocs)  # all procs work on an equal chunk
    try:  # note: this try-catch can hide bugs
        global proc_count
        old_proc_count = proc_count
        proc_count = nprocs
        p = Pool(nprocs)
        p.restart(force=True)
        retval_par = p.map(_spawn_fun, X, [f] * len(X), chunk_size=chunk_size)  # can throw if current proc is daemon
        p.terminate()
        for res_dict in retval_par:  # add all experiment params we missed
            curr_params_list = res_dict["curr_params_list"]
            for param in curr_params_list:
                cn.add_experiment_param(param)
        cn.experiment_purpose = retval_par[0]["experiment_purpose"]  # use the "experiment_purpose" from the fork
        function_cache.merge_cache_dicts_from_parallel_runs(map(lambda a: a["functions_dict"], retval_par))  # merge all
        retval = map(lambda res_dict: res_dict["res"], retval_par)  # make it like the original map
        proc_count = old_proc_count
        global i
        i += 1
    except AssertionError as e:
        if e.message == "daemonic processes are not allowed to have children":
            retval = map(f, X)  # can't have a pool inside a pool
        else:
            print "error message is: " + str(e.message)
            raise  # re-raise original exception
    return retval
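# Hedged usage sketch (illustrative only): parmap behaves like the builtin map but fans the
# work out over a process pool, falling back to a serial map for tiny inputs or when
# cn.serial_parmap is set. The helper name and toy workload below are invented.
def _example_parmap():
    squares = parmap(lambda x: x * x, range(16), chunk_size=4)
    assert squares == [x * x for x in range(16)]
    return squares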
def load_GloVe_model(relpath='glove_models/glove.6B.100d.txt'):
    global k_base, kb_type
    kb_type = 'GloVe'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = GloveKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
def load_WordNet_model():
    global k_base, kb_type
    kb_type = 'WordNet'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = WordNetKB(cn.distance_measure)
def load_dep_word2vec_model(relpath='word2vec_models/deps_trimmed.words'):
    global k_base, kb_type
    kb_type = 'dep_w2v'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = Word2VecKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
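# Hedged dispatch sketch (illustrative only, not part of the original module): the loaders
# above share the same shape (set kb_type, save the previous model, record the experiment
# param, build k_base), so selecting one by name could look like this hypothetical helper.
def _load_kb_by_name(name):
    loaders = {
        'w2v': load_word2vec_model,
        'GloVe': load_GloVe_model,
        'WordNet': load_WordNet_model,
        'dep_w2v': load_dep_word2vec_model,
    }
    loaders[name]()  # raises KeyError for an unknown knowledge-base name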