def load_word2vec_model(relpath='word2vec_models/GoogleNews-vectors-negative300_trimmed.bin'):
    global k_base, kb_type
    kb_type = 'w2v'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = Word2VecKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
def curr_pool_synthesis_from_sents(base_sents, base_training_df, col_names,
                                   total_new_sents=None, choose_bulk_method=choose_best_by_uncertainty):
    # type: (list, DataFrame, ColumnNames, int, callable) -> DataFrame
    """
    Generates new examples based on $base_sents using a generic algorithm
    :param base_sents: list of base sentences, from which we generate new ones
    :param base_training_df: DataFrame containing all labeled sentences
    :param col_names: contains the names of the columns in the output DataFrame
    :param total_new_sents: the number of sentences we want to synthesize
    :param choose_bulk_method: a method for choosing sentences to be sent for generation
    :return: an unlabeled DataFrame with the new sentences generated
    """
    print "start curr pool search map"
    total_new_sents = pool_size(total_new_sents, base_sents)
    wanted_new_sents = int(total_new_sents * 4)
    choose_amount = wanted_new_sents / 8 + 1
    cn.add_experiment_param("choose_amount_" + str(choose_amount))
    if "choose_amount" not in cn.experiment_purpose:
        cn.experiment_purpose += "curr_pool choose_amount=" + str(choose_amount) + ", "
    from ResearchNLP.knowledge_bases import kb_helper
    kb_helper.k_base.load_knowledgebase()  # explicitly load to help processes share memory
    # print kb_helper.kb_type

    didnt_advance_count = 0
    sent_pool = set(base_sents)
    current_pool = list(base_sents)
    sent_pool_df = base_training_df.copy(deep=True)
    print "total new sentences: " + str(wanted_new_sents)
    while len(sent_pool) - len(base_sents) <= wanted_new_sents:  # gen quarter size
        all_new_tuples = synthesize_tree_depth1_bulk(current_pool, sent_pool_df, col_names)
        combined_df = add_sentences_and_histories_to_df(base_training_df, col_names, all_new_tuples)
        combined_df = prepare_df_columns(combined_df, col_names)
        chosen_idxs = choose_bulk_method(combined_df, col_names, choose_amount, sent_pool)
        if len(chosen_idxs) == 0:
            didnt_advance_count += 1
        for idx in chosen_idxs:
            sent_pool_df.loc[len(sent_pool_df)] = combined_df.loc[idx].copy(deep=True)  # add new example to sent pools
            new_sent = combined_df[col_names.text][idx]
            assert new_sent not in sent_pool, "the new sentence should not appear beforehand"
            current_pool.append(new_sent)
            sent_pool.add(new_sent)
            print_progress(len(sent_pool) - len(base_sents), total=wanted_new_sents)
            didnt_advance_count = 0
        sent_pool_df = prepare_df_columns(sent_pool_df, col_names)
        if didnt_advance_count >= 50:
            print "didn't advance, stopping synthesis"
            break

    # use the already filled sent_pool_df
    final_chosen_idxs = choose_bulk_method(sent_pool_df, col_names, total_new_sents, set())
    new_sents_df = sent_pool_df.iloc[final_chosen_idxs].reset_index(drop=True)
    print "\ngenerated", len(new_sents_df), "sentences"
    return new_sents_df
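# Usage sketch (hypothetical driver, mirroring the libact test below): synthesize an unlabeled pool
# from the positive sentences of a balanced split, then have the expert label it.
# total_new_sents=40 is an illustrative value, not fixed by this module.
base_training_df, validation_data_df = prepare_balanced_dataset()
pos_sents = pandas_util.get_all_positive_sentences(base_training_df, cn.col_names.text,
                                                   cn.col_names.tag, cn.pos_tags)
generated_pool_df = curr_pool_synthesis_from_sents(pos_sents, base_training_df, cn.col_names,
                                                   total_new_sents=40)
labeled_pool_df = label_df_with_expert(generated_pool_df, cn.col_names)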
def test_libact_first_try_results_are_the_same(self):
    """ Test that the first libact example works the same way as the original example taken from github.
        Very long test! """
    # self.skipTest(reason="too long")
    cn.Inner_PredictionModel = SvmModel
    cn.Feature_Extractor = AvgGloveExtractor
    cn.load_codementor_sentiment_analysis_parameters()
    kb_helper.load_WordNet_model()
    quota = 5  # ask labeler to label 5 samples (tops)
    base_training_df, validation_data_df = prepare_balanced_dataset()
    pos_sents = pandas_util.get_all_positive_sentences(base_training_df, cn.col_names.text,
                                                       cn.col_names.tag, cn.pos_tags)

    # prepare all data
    generated_pool_df = sg.generate_sents_using_random_synthesis(pos_sents, base_training_df, cn.col_names)
    labeled_pool_df = label_df_with_expert(generated_pool_df, cn.col_names)
    enriched_train_df = pd.concat([base_training_df, generated_pool_df], ignore_index=True)
    ideal_df = pd.concat([base_training_df, labeled_pool_df], ignore_index=True)
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)  # build the feature extractor
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    jobs = []
    # first job
    p = multiprocessing.Process(target=self.libact_first_try_first_run,
                                args=(enriched_train_df, extractor, lbr, quota, validation_data_df, return_dict))
    jobs.append(p)
    p.start()
    # second job
    p = multiprocessing.Process(target=self.libact_first_try_second_run,
                                args=(enriched_train_df, extractor, ideal_df, lbr, quota,
                                      validation_data_df, return_dict))
    jobs.append(p)
    p.start()

    for proc in jobs:
        proc.join()
    self.assertTrue(np.array_equal(return_dict[1], return_dict[2]))
def get_sentence_representation(sent):
    import ResearchNLP.Constants as cn
    cn.add_experiment_param('glove300')
    reduced_sent = all_in_vocab(sent).split()
    if len(reduced_sent) == 0:
        # print "reduced sent: " + str(sent)
        return [0.0] * len(glove_model.obj.word_vectors[0])  # zeros representation
    global total_diff, diff_count
    total_diff += len(sent.split()) - len(reduced_sent)
    diff_count += 1
    return sum(map(lambda word: glove_model.obj.word_vectors[glove_model.obj.dictionary[word]].__array__(),
                   reduced_sent)) / len(reduced_sent)
def _choose_best_by_heuristic_fun(sent_df, col_names, count, ss_type, sent_pool):
    # type: (DataFrame, ColumnNames, int, SynState, set) -> list
    cn.add_experiment_param(ss_type.__name__)
    unlabeled_idxs = pd.np.where(sent_df[col_names.tag].isnull())[0]
    idx_text_col = list(sent_df[col_names.text][unlabeled_idxs].iteritems())
    filtered_tpls = filter(lambda (idx, s): s not in sent_pool, idx_text_col)
    filtered_idxs = map(lambda (idx, s): idx, filtered_tpls)
    assert len(filtered_idxs) >= count, "Not enough unlabeled instances to choose from (after filtering)"
    score_idx_list = calculate_heuristic_bulk(sent_df, col_names, ss_type, filtered_idxs)
    return _choose_by_heuristic_score_diverse_origins(sent_df, col_names, count, score_idx_list)
def kfold_gain(train_set, dev_set, state_df, col_names):
    def depth1_gain(labeled_state_df):
        ex_added_list, res_list = scores_per_add_default(labeled_state_df, train_set, dev_set)
        f1_list = ExprScores.list_to_f1(res_list)
        return f1_list[1] - f1_list[0]  # difference in f1 score. NOT NORMALIZED, but it's supposed to be OK

    state_df.loc[0, col_names.tag] = 0
    change0 = depth1_gain(state_df)
    state_df.loc[0, col_names.tag] = 1
    change1 = depth1_gain(state_df)
    cn.add_experiment_param("5_spits_with_prob_kfold_gain")
    # p0 / p1: label-probability weights, assumed to be defined in the enclosing module
    return p0 * change0 + p1 * change1
def classify_df(self, unlabeled_df):
    # type: (pd.DataFrame) -> pd.DataFrame
    # if cn.balance_dataset:
    #     self.all_data_df = pandas_util.imbalance_dataset(cn.data_df, cn.tag_col, 0.5,
    #                                                      cn.pos_tags, cn.neg_tags)

    # build the feature extractor
    combined_df = pd.concat([self.all_data_df, unlabeled_df])
    extractor_cls = cn.Expert_PredictionModel.get_FeatureExtractor_cls()
    if extractor_cls is None:
        extractor_cls = cn.Expert_FeatureExtractor
    extractor = extractor_cls(combined_df, self.col_names)

    # extract all the features
    X_all = extractor.transform(combined_df, self.col_names)  # use pre-extracted features
    X_unlabeled = X_all[len(self.all_data_df[self.col_names.text]):]
    X_train = X_all[:len(self.all_data_df[self.col_names.text])]
    y_train = self.all_data_df[self.col_names.tag].tolist()

    model = cn.Expert_PredictionModel(X_train, y_train)  # expert model trains on all data (makes it an expert)
    y_pred = model.train_model_and_predict(X_unlabeled)
    # print y_pred.tolist()
    labeled_df = unlabeled_df.copy(deep=True)
    labeled_df[self.col_names.tag] = map(float, y_pred)  # add predictions to the df (simulates a human label)
    labeled_df = prepare_df_columns(labeled_df, self.col_names)
    return labeled_df  # return the now tagged df
def __init__(self, state_idx, sent_df, col_names, prev_state=None):
    # type: (int, DataFrame, ColumnNames, SynStateFurthestFromPos) -> None
    super(SynStateFurthestFeatureSp, self).__init__(state_idx, sent_df, col_names, prev_state)
    relevant_idxs = list(self.sent_df[self.sent_df[col_names.tag].notnull()].index)
    # relevant_idxs.remove(state_idx)
    # relevant_idxs = np.array(relevant_idxs)
    if self.col_names.feature_repr in self.sent_df.columns:
        self.sent_repr = self.sent_df[self.col_names.feature_repr][self.state_idx]
        self.min_dist = np.min(map(lambda (i, r): LA.norm(r[col_names.feature_repr] - self.sent_repr, 1) ** 2,
                                   self.sent_df.iloc[relevant_idxs].iterrows()), axis=0)
    else:
        extractor = cn.Feature_Extractor(sent_df, col_names)  # this is actually faster than DataFrame.append()
        X_all = extractor.transform(sent_df, col_names)
        self.sent_repr = X_all[self.state_idx]
        self.min_dist = np.min(map(lambda feat: LA.norm(feat - self.sent_repr, 1) ** 2, X_all[relevant_idxs]))
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple
    gen_pool_df = labeled_pool_df.copy(deep=True)
    gen_pool_df[cn.col_names.tag] = [np.NaN] * len(gen_pool_df)  # clear all tags
    enriched_train_df = pd.concat([base_training_df, gen_pool_df], ignore_index=True)
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)  # build the feature extractor
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    ideal_df = pd.concat([base_training_df, labeled_pool_df], ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))
    scoring_fun = lambda ds: run_classifier(ds.extract_labeled_dataframe(), validation_data_df)
    ex_added_list, res_list = run_active_learning(trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))  # label all df
    return ex_added_list, res_list
def test_heuristic_test_data_gain_works(self):
    cn.load_codementor_sentiment_analysis_parameters()
    kb_helper.load_WordNet_model()
    base_train_df, pool_df, validation_data_df = prepare_pool_based_dataset()
    all_sents = list(base_train_df[cn.col_names.text])
    tns = 2
    pool_name, prep_pools = ("orig pool",
                             lambda *_: (pool_df.iloc[:tns + 5], cn.labeled_pool_df.iloc[:tns + 5]))
    train_with_pool_df = pd.concat([base_train_df, pool_df], ignore_index=True)
    generated_pool_df, labeled_pool_df = prep_pools(all_sents, train_with_pool_df, cn.col_names, tns)
    cn.experiment_purpose += pool_name + " "
    trn_ds, lbr, extractor = prepare_trn_ds(base_train_df, generated_pool_df, labeled_pool_df)
    final_scoring_fun = partial(lambda en_df: run_classifier(en_df, validation_data_df).acc)

    table_headers = ['#added examples']
    data = [range(0, tns + 1)]
    compared_heuristics = [("test-data-gain", lambda: SynStateTestDataGain),
                           ("random", lambda: "random")]
    for (heuristic_name, prepare_usage) in compared_heuristics:
        ss_type = prepare_usage()
        table_headers.append(heuristic_name)
        print heuristic_name
        _, heur_scores = insert_in_AL_fashion(trn_ds, final_scoring_fun, lbr, ss_type,
                                              labeled_pool_df, quota=tns)
        data.append(heur_scores)

    print data[1]
    print data[2]
    self.assertEqual(data[1][0], data[2][0], "starts are same")
    self.assertGreater(data[1][-1], data[2][-1], "test-data-gain should be better than random")
def __init__(self, sent_df, col_names, init_text_state=None):
    # type: (pd.DataFrame, ColumnNames, str) -> None
    super(BestInstanceProblem, self).__init__()
    cn.inst_count += 1
    self.sent_pool_df = sent_df
    self.col_names = col_names
    cn.add_experiment_param(cn.ss_type.__name__)
    if init_text_state is not None:
        init_row = sent_df[sent_df[col_names.text] == init_text_state]
        assert len(init_row) != 0, "init_text_state not in sent_df"
        # initial_state is used in BestInstanceProblem
        self.initial_state = cn.ss_type(init_row.index[0], self.sent_pool_df, col_names)
    self.init_states = None
def prepare_trn_ds(balanced_train_df, generated_pool_df, labeled_pool_df):
    enriched_train_df = pd.concat([balanced_train_df, generated_pool_df], ignore_index=True)
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)  # build the feature extractor
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
    ideal_df = pd.concat([balanced_train_df, labeled_pool_df], ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))
    return trn_ds, lbr, extractor
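# Usage sketch (hypothetical, mirroring test_heuristic_test_data_gain_works above): the dataset and
# ideal labeler returned by prepare_trn_ds feed straight into insert_in_AL_fashion. quota=10 and the
# SynStateTestDataGain heuristic are illustrative choices, not fixed by this module.
trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, generated_pool_df, labeled_pool_df)
final_scoring_fun = partial(lambda en_df: run_classifier(en_df, validation_data_df).acc)
query_num, scores = insert_in_AL_fashion(trn_ds, final_scoring_fun, lbr,
                                         SynStateTestDataGain, labeled_pool_df, quota=10)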
def prepare_classifier(train_data_df, validation_data_df, col_names):
    # type: (pd.DataFrame, pd.DataFrame, ColumnNames) -> (PredictionModel, np.ndarray, np.ndarray)
    # build the feature extractor
    extractor = cn.Feature_Extractor(train_data_df, col_names)  # this is actually faster than DataFrame.append()

    # extract all the features
    combined_df = pd.concat([train_data_df, validation_data_df])
    X_all = extractor.transform(combined_df, col_names)  # save time by using fit_transform
    X_train = X_all[:len(train_data_df[col_names.text])]
    y_train = train_data_df[col_names.tag].tolist()
    X_test = X_all[len(train_data_df[col_names.text]):]
    y_test = validation_data_df[col_names.tag]

    model = cn.Inner_PredictionModel(X_train, y_train)  # inner model trains on the training split only
    return model, X_test, np.array(y_test, dtype=int)
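# Usage sketch (hypothetical): evaluating the classifier prepared above. train_model_and_predict is
# assumed to be the shared PredictionModel API only because classify_df above calls it; the accuracy
# line is illustrative, not part of this module.
train_data_df, validation_data_df = prepare_balanced_dataset()
model, X_test, y_test = prepare_classifier(train_data_df, validation_data_df, cn.col_names)
y_pred = model.train_model_and_predict(X_test)
print "validation accuracy:", np.mean(np.array(y_pred, dtype=int) == y_test)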
def generate_random_state(self):
    # type: () -> SynState
    if self.init_states is None:
        positive_df = pandas_util.all_positive_rows_df(self.sent_pool_df, self.col_names.tag, cn.pos_tags)
        self.init_states = map(lambda (idx, text_state): cn.ss_type(idx, self.sent_pool_df, self.col_names),
                               positive_df[self.col_names.text].iteritems())
    return random.sample(self.init_states, 1)[0]  # not used in hill climbing
def compare_generation_methods_pools():
    experiment_name = 'small_train_compare_generation_methods_pools'
    print experiment_name
    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset(print_expert_acc=False)
    all_sents = list(balanced_train_df[cn.col_names.text])
    tns = 40

    prepare_pools_funcs = list()
    # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg"))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 2))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 0, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 10, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 2))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 0))
    # prepare_pools_funcs.append(curr_pool_gen_template("test-data-gain"))
    # prepare_pools_funcs.append(generate_pool_using_random_synthesis())
    # prepare_pools_funcs.append(prepare_orig_examples_pools())
    # prepare_pools_funcs.append(generate_pool_lecun_augmentation())
    prepare_pools_funcs.append(generate_sents_using_lstm_generator())

    final_scoring_fun = partial(lambda en_df: run_classifier(en_df, validation_data_df).acc)
    insertion_order_heuristic = find_heuristic("uncertainty lc LogReg")()
    # insertion_order_heuristic = find_heuristic("test-data-gain")()
    cn.add_experiment_param(insertion_order_heuristic.__name__)
    cn.experiment_purpose += "insertion order using " + insertion_order_heuristic.__name__ + " "
    print cn.experiment_purpose

    table_headers = ['#added examples']
    data = [[0]]
    for pool_name, prepare_pool_fun in prepare_pools_funcs:
        init_score = final_scoring_fun(balanced_train_df)
        print pool_name
        gen_pool_df, labeled_pool_df = prepare_pool_fun(all_sents, balanced_train_df, cn.col_names, tns)
        trn_ds, lbr, extractor = prepare_trn_ds(balanced_train_df, gen_pool_df, labeled_pool_df)
        print pool_name
        query_num, pool_insr_scores = insert_in_AL_fashion(trn_ds, final_scoring_fun, lbr,
                                                           insertion_order_heuristic, labeled_pool_df, quota=tns)
        # query_num, pool_insr_scores = insert_in_batch_AL(trn_ds, final_scoring_fun, lbr, insertion_order_heuristic,
        #                                                  labeled_pool_df, quota=tns, batch_num=5)
        pool_insr_scores[0] = init_score
        data[0] = query_num if len(data[0]) < len(query_num) else data[0]
        table_headers.append(pool_name)
        data.append(pool_insr_scores)
    return experiment_name, table_headers, data, plot_compare_generation_methods
def compare_pool_generation_methods_proper_al():
    experiment_name = 'compare_pool_generation_methods_proper_AL'
    print experiment_name
    # prepare the different splits of $data_df
    balanced_train_df, validation_data_df = prepare_balanced_dataset()
    all_sents = list(balanced_train_df[cn.col_names.text])

    prepare_pools_funcs = list()
    # prepare_pools_funcs.append(curr_pool_gen_template("uncertainty lc LogReg"))
    # prepare_pools_funcs.append(curr_pool_gen_template("random-score"))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5))
    # prepare_pools_funcs.append(local_search_gen_template("uncertainty lc LogReg", 5, use_enhanced=True))
    # prepare_pools_funcs.append(local_search_gen_template("random-score", 5))
    # prepare_pools_funcs.append(generate_pool_using_random_synthesis())
    # prepare_pools_funcs.append(prepare_orig_examples_pools())
    # prepare_pools_funcs.append(generate_pool_lecun_augmentation())
    prepare_pools_funcs.append(generate_sents_using_lstm_generator())

    final_scoring_fun = partial(lambda en_df: run_classifier(en_df, validation_data_df).acc)
    cn.add_experiment_param("tns_" + str(total_new_sents))
    cn.add_experiment_param("pool_size" + str(pool_size_each_step))
    cn.add_experiment_param("batch_size" + str(examples_at_each_step))
    print cn.experiment_purpose

    def do_one_AL_cycle(pool_gen_fun, curr_training_df):
        done_generating = False
        sent_pool = set(curr_training_df[cn.col_names.text])
        gen_pool_df, labeled_pool_df = pool_gen_fun(list(sent_pool), curr_training_df,
                                                    cn.col_names, pool_size_each_step)
        if len(gen_pool_df) > examples_at_each_step:
            selected_instance_idxs = select_from_pool_uncertainty(gen_pool_df, balanced_train_df, cn.col_names,
                                                                  sent_pool, examples_at_each_step)
            labeled_instances_df = labeled_pool_df.iloc[selected_instance_idxs].copy(deep=True).reset_index(drop=True)
        else:
            labeled_instances_df = labeled_pool_df  # all there is, close enough
        if len(gen_pool_df) < examples_at_each_step:
            done_generating = True
        enriched_train_df = pd.concat([curr_training_df, labeled_instances_df], ignore_index=True)
        return enriched_train_df, final_scoring_fun(enriched_train_df), done_generating

    table_headers = ['#added examples']
    data = [range(0, total_new_sents + examples_at_each_step, examples_at_each_step)]
    for pool_name, prepare_pool_fun in prepare_pools_funcs:
        start_time = time.time()
        print "starting {0} - {1}".format(pool_name, cn.data_name)
        curr_training_df = balanced_train_df.copy(deep=True)
        res_list = [final_scoring_fun(curr_training_df)]
        for i in range(0, total_new_sents, examples_at_each_step):  # has to be serial
            print_progress(i, total=total_new_sents)
            sa = time.time()
            curr_training_df, curr_add_res, done = do_one_AL_cycle(prepare_pool_fun, curr_training_df)
            if done:
                break
            res_list.append(curr_add_res)
            print "AL cycle took {0:.2f} s".format(time.time() - sa)
        print "{0} run time: {1:.2f} minutes - {2}".format(pool_name, (time.time() - start_time) / 60.0, cn.data_name)
        table_headers.append(pool_name)
        data.append(res_list)
    return experiment_name, table_headers, data, plot_compare_pool_generation_methods_proper_al
def parmap(f, X, nprocs=multiprocessing.cpu_count(), force_parallel=False, chunk_size=1):
    from ResearchNLP import Constants as cn
    from ResearchNLP.util_files import function_cache
    if len(X) == 0:
        return []  # like map
    # nprocs = min(nprocs, cn.max_procs)
    if nprocs != multiprocessing.cpu_count() and len(X) < nprocs * chunk_size:
        chunk_size = 1  # use chunk_size = 1 if there are enough procs for a batch size of 1
    nprocs = max(1, min(nprocs, len(X) / chunk_size))  # at least 1
    if len(X) < nprocs:
        if cn.verbose and nprocs != multiprocessing.cpu_count():
            print "parmap: too many procs"
        nprocs = len(X)  # too many procs
    if nprocs == 1 or (cn.serial_parmap and not force_parallel):  # we want it serial (maybe for profiling)
        return map(f, X)

    def _spawn_fun(input, func):
        import random, numpy
        from ResearchNLP import Constants as cn2
        from ResearchNLP.util_files import function_cache as function_cache2
        random.seed(1554 + i)
        numpy.random.seed(42 + i)  # set random seeds
        try:
            res = func(input)
            res_dict = dict()
            res_dict["res"] = res
            res_dict["functions_dict"] = function_cache2.caches_dicts
            res_dict["experiment_purpose"] = cn2.experiment_purpose
            res_dict["curr_params_list"] = cn2.curr_experiment_params_list
            return res_dict
        except:
            import traceback
            traceback.print_exc()
            raise  # re-raise exception

    # if chunk_size == 1:
    #     chunk_size = math.ceil(float(len(X)) / nprocs)  # all procs work on an equal chunk
    try:  # try-catch hides bugs
        global proc_count
        old_proc_count = proc_count
        proc_count = nprocs
        p = Pool(nprocs)
        p.restart(force=True)
        retval_par = p.map(_spawn_fun, X, [f] * len(X), chunk_size=chunk_size)  # can throw if current proc is daemon
        p.terminate()
        for res_dict in retval_par:  # add all experiment params we missed
            curr_params_list = res_dict["curr_params_list"]
            for param in curr_params_list:
                cn.add_experiment_param(param)
        cn.experiment_purpose = retval_par[0]["experiment_purpose"]  # use the "experiment_purpose" from the fork
        function_cache.merge_cache_dicts_from_parallel_runs(map(lambda a: a["functions_dict"], retval_par))  # merge all
        retval = map(lambda res_dict: res_dict["res"], retval_par)  # make it like the original map
        proc_count = old_proc_count
        global i
        i += 1
    except AssertionError as e:
        if e.message == "daemonic processes are not allowed to have children":
            retval = map(f, X)  # can't have pool inside pool
        else:
            print "error message is: " + str(e.message)
            raise  # re-raise orig exception
    return retval
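# Usage sketch (illustrative only): parmap is meant as a drop-in replacement for the builtin map.
# _square and the argument list are made up for the example; execution falls back to a plain serial
# map when cn.serial_parmap is set or when only one process is warranted.
def _square(x):
    return x * x

assert parmap(_square, []) == []                      # empty input short-circuits, like map
squares = parmap(_square, range(100), chunk_size=10)  # may run on a process pool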
def setUpClass(cls):
    cn.load_codementor_sentiment_analysis_parameters()
    ce.load_machine_expert(cn.col_names, cn.relevant_tags, cn.data_df)
def setUpClass(cls):
    cn.load_codementor_sentiment_analysis_parameters(d_measure=10)
    cn.data_df = pretokenize_df(cn.data_df, cn.col_names)
def load_GloVe_model(relpath='glove_models/glove.6B.100d.txt'):
    global k_base, kb_type
    kb_type = 'GloVe'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = GloveKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
def load_WordNet_model():
    global k_base, kb_type
    kb_type = 'WordNet'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = WordNetKB(cn.distance_measure)
def load_dep_word2vec_model(relpath='word2vec_models/deps_trimmed.words'):
    global k_base, kb_type
    kb_type = 'dep_w2v'
    save_previous_model()
    cn.add_experiment_param(kb_type)
    k_base = Word2VecKB(cn.distance_measure, knowledge_bases_foldpath, relpath)
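# Usage sketch (hypothetical driver): the loaders above are interchangeable ways to populate
# kb_helper.k_base before synthesis; pick one per experiment. The explicit load_knowledgebase()
# call mirrors what curr_pool_synthesis_from_sents does so that worker processes share memory.
from ResearchNLP.knowledge_bases import kb_helper

kb_helper.load_WordNet_model()        # or load_GloVe_model() / load_word2vec_model() / load_dep_word2vec_model()
kb_helper.k_base.load_knowledgebase()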
def build_feature_extractor(sent_df, col_names):
    # type: (pd.DataFrame, ColumnNames) -> FeatureExtractor
    """ Builds and returns a feature extractor using sent_df """
    return cn.Feature_Extractor(sent_df, col_names)  # build the feature extractor