def curr_pool_synthesis_from_sents(base_sents, base_training_df, col_names, total_new_sents=None,
                                   choose_bulk_method=choose_best_by_uncertainty):
    # type: (list, DataFrame, ColumnNames, int, callable) -> DataFrame
    """
    generates new examples based on $base_sents by repeatedly expanding a growing pool of sentences

    Every round, the whole current pool is handed to synthesize_tree_depth1_bulk, the candidates
    it returns are put in a DataFrame next to $base_training_df, and $choose_bulk_method picks
    a batch of them to join the pool. The rounds stop once more than 4 * total_new_sents
    sentences were added, or after 50 consecutive rounds in which nothing was chosen.
    The returned rows are then picked by $choose_bulk_method out of the accumulated pool DataFrame.

    :param base_sents: list of base sents, from which we generate new ones
    :param base_training_df: DataFrame containing all labeled sentences
    :param col_names: contains the names of the columns in the output DataFrame
    :param total_new_sents: indicates the number of sentences we want to synthesize
        (resolved through pool_size(total_new_sents, base_sents) before it is used)
    :param choose_bulk_method: a method for choosing sentences to be sent for generation,
        called as choose_bulk_method(df, col_names, amount, sent_pool) and returning the chosen indices
    :return: an unlabeled DataFrame with the new sentences generated
    """
    print("start curr pool search map")
    total_new_sents = pool_size(total_new_sents, base_sents)
    wanted_new_sents = int(total_new_sents * 4)
    # explicit floor division, the amount must stay an int regardless of the division semantics
    choose_amount = wanted_new_sents // 8 + 1
    cn.add_experiment_param("choose_amount_" + str(choose_amount))
    if "choose_amount" not in cn.experiment_purpose:
        cn.experiment_purpose += "curr_pool choose_amount=" + str(choose_amount) + ", "

    from ResearchNLP.knowledge_bases import kb_helper
    kb_helper.k_base.load_knowledgebase()  # explicitly load to help processes share memory

    didnt_advance_count = 0
    sent_pool = set(base_sents)
    # measure progress against the de-duplicated pool size; len(base_sents) over-counts
    # when $base_sents holds duplicates, which made the loop generate more than wanted
    orig_sent_pool_len = len(sent_pool)
    current_pool = list(base_sents)
    sent_pool_df = base_training_df.copy(deep=True)
    print("total new sentences: " + str(wanted_new_sents))
    while len(sent_pool) - orig_sent_pool_len <= wanted_new_sents:
        all_new_tuples = synthesize_tree_depth1_bulk(current_pool, sent_pool_df, col_names)
        combined_df = add_sentences_and_histories_to_df(base_training_df, col_names, all_new_tuples)
        combined_df = prepare_df_columns(combined_df, col_names)
        chosen_idxs = choose_bulk_method(combined_df, col_names, choose_amount, sent_pool)
        if len(chosen_idxs) == 0:
            didnt_advance_count += 1  # a round without any chosen sentence

        for idx in chosen_idxs:
            # append the chosen row at the end of the pool DataFrame
            sent_pool_df.loc[len(sent_pool_df)] = combined_df.loc[idx].copy(deep=True)
            # add new example to sent pools
            new_sent = combined_df[col_names.text][idx]
            assert new_sent not in sent_pool, "the new sentence should not appear beforehand"
            current_pool.append(new_sent)
            sent_pool.add(new_sent)
            print_progress(len(sent_pool) - orig_sent_pool_len, total=wanted_new_sents)
            didnt_advance_count = 0  # progress was made, restart the stall counter
        sent_pool_df = prepare_df_columns(sent_pool_df, col_names)

        if didnt_advance_count >= 50:
            print("didn't advance, stopping synthesis")
            break

    # use the already filled sent_pool_df
    final_chosen_idxs = choose_bulk_method(sent_pool_df, col_names, total_new_sents, set())
    new_sents_df = sent_pool_df.iloc[final_chosen_idxs].reset_index(drop=True)
    print("\ngenerated " + str(len(new_sents_df)) + " sentences")
    return new_sents_df
def classify_df(self, unlabeled_df):
    # type: (pd.DataFrame) -> pd.DataFrame
    """
    Tags $unlabeled_df using the predictions of the expert model (simulates a human labeler).

    The expert model is trained on all of self.all_data_df and predicts a tag for every
    row of $unlabeled_df. The input DataFrame itself is left untouched.

    :param unlabeled_df: DataFrame with the sentences we want tagged
    :return: a deep copy of $unlabeled_df with the predictions (as floats) in the tag column,
        passed through prepare_df_columns
    """
    # build the feature extractor over the labeled and the unlabeled sentences together
    all_sents_df = pd.concat([self.all_data_df, unlabeled_df])
    fe_cls = cn.Expert_PredictionModel.get_FeatureExtractor_cls()
    if fe_cls is None:
        fe_cls = cn.Expert_FeatureExtractor  # fall back to the configured expert extractor
    features = fe_cls(all_sents_df, self.col_names).transform(all_sents_df, self.col_names)

    # the labeled rows come first in the concatenation, the unlabeled ones follow them
    train_size = len(self.all_data_df[self.col_names.text])
    unlabeled_features = features[train_size:]
    train_features = features[:train_size]
    train_tags = self.all_data_df[self.col_names.tag].tolist()

    # expert model trains on all data (makes it an expert)
    expert_model = cn.Expert_PredictionModel(train_features, train_tags)
    predictions = expert_model.train_model_and_predict(unlabeled_features)

    tagged_df = unlabeled_df.copy(deep=True)
    tagged_df[self.col_names.tag] = [float(pred) for pred in predictions]
    return prepare_df_columns(tagged_df, self.col_names)  # return the now tagged df
def classify_df(self, unlabeled_df):
    # type: (DataFrame) -> DataFrame
    """
    Tags $unlabeled_df by handing it to a human through the human_expert_gui package.

    :param unlabeled_df: DataFrame with the sentences we want tagged
    :return: the DataFrame returned by the gui, with its tag column converted to floats
        and passed through prepare_df_columns
    """
    names = self.col_names
    tagged_df = human_expert_gui.classify_by_expert(
        unlabeled_df, names.text, names.tag, self.positive_text, self.negative_text)
    # make sure the tags the human gave are stored as floats
    tagged_df[names.tag] = [float(tag) for tag in tagged_df[names.tag]]
    return prepare_df_columns(tagged_df, names)
def _build_new_sents_df(sent_pool, col_names, base_sents, do_difference=True):
    # type: (set, ColumnNames, list, bool) -> pd.DataFrame
    """
    Creates a fresh DataFrame whose text column holds the sentences of $sent_pool.

    :param sent_pool: set of sentences to put in the DataFrame
    :param col_names: contains the names of the columns in the output DataFrame
    :param base_sents: sentences to leave out of the result (only used when $do_difference is True)
    :param do_difference: when True, sentences that appear in $base_sents are removed first
    :return: the new DataFrame, passed through prepare_df_columns
    """
    # only new sentences, unless the caller asked to keep everything
    fresh_sents = sent_pool.difference(set(base_sents)) if do_difference else sent_pool

    # start from an empty frame with all the columns and fill in the text column
    frame = pd.DataFrame(columns=list(col_names))
    frame[col_names.text] = list(fresh_sents)
    return prepare_df_columns(frame, col_names)
def __init__(self, df, col_names, feature_extractor, features=None):
    # type: (DataFrame, ColumnNames, FeatureExtractor, np.ndarray) -> None
    """
    Builds the dataset out of a DataFrame of sentences and their (possibly missing) tags.

    :param df: DataFrame holding the sentences and their tags, a NaN tag is passed
        to the superclass as None
    :param col_names: contains the names of the columns in $df
    :param feature_extractor: used for extracting the features of $df when $features is None,
        may be None when $features is given
    :param features: pre-extracted features for the rows of $df (optional)
    """
    assert features is not None or feature_extractor is not None  # features=None -> extractor!=None

    # prepare data for the superclass
    self.col_names = col_names
    # keep working with the prepared frame; its result used to be assigned to self.df and
    # then immediately overwritten by a copy of the raw $df, so the preparation was lost
    df = prepare_df_columns(df, col_names)
    self.df = df.copy(deep=True)  # private copy, label updates never touch the caller's frame
    self.sents = df[col_names.text]  # used in IdealLabeler
    if features is None:
        features = feature_extractor.transform(df, col_names)
    # NaN tags are handed to the superclass as None
    tags = [None if math.isnan(tag) else tag for tag in df[col_names.tag].astype(float)]

    def update_dataframe(entry_id, lb):
        # write a label given to entry $entry_id into our DataFrame copy as well
        self.df.loc[entry_id, col_names.tag] = lb

    super(TextDataset, self).__init__(features, tags)
    # presumably the superclass calls the callback with (entry_id, label) on every update
    # - TODO confirm against the superclass
    self.on_update(update_dataframe)
def _generic_synthesis_from_sents(
        base_sents, base_training_df, col_names, syn_alg, total_new_sents,
        sents_choice_for_generation=choose_random_sents_from_df, batch_size=1, start_with_orig=False):
    # type: (list, DataFrame, ColumnNames, SynthesisAlgorithm, int, callable, int, bool) -> DataFrame
    """
    generates new examples based on $base_sents using a generic algorithm

    Every round, $sents_choice_for_generation picks sentences, $syn_alg generates candidates
    from them, and every candidate that is not in the pool yet is added to it. The rounds stop
    once more than $total_new_sents sentences were added, or once 50 consecutive attempts
    (replicated candidates or rounds without any candidate) brought nothing new.

    :param base_sents: list of base sents, from which we generate new ones
    :param base_training_df: DataFrame containing all labeled sentences
    :param col_names: contains the names of the columns in the output DataFrame
    :param syn_alg: a synthesis algorithm which makes the generation
    :param total_new_sents: indicates the number of sentences we want to synthesize
    :param sents_choice_for_generation: chooses the sentences to generate from,
        called as sents_choice_for_generation(df, col_names, batch_size)
    :param batch_size: amount of sentences to choose for generation in every round
    :param start_with_orig: when True the sentences are always chosen from $base_training_df,
        otherwise they are chosen from the growing pool (which includes generated sentences)
    :return: an unlabeled DataFrame with the new sentences generated
    """
    from ResearchNLP.knowledge_bases import kb_helper
    kb_helper.k_base.load_knowledgebase()  # explicitly load to help processes share memory

    didnt_advance_count = 0
    from ResearchNLP.text_synthesis.heuristic_functions import choose_best_for_expansion
    choose_best_for_expansion.counter = 0  # reset the counter kept on the heuristic function
    replicate_count = 0
    sent_pool = set(base_sents).union(set(base_training_df[col_names.text]))
    orig_sent_pool_len = len(sent_pool)
    sent_pool_df = base_training_df.copy(deep=True)
    print("total new sentences: " + str(total_new_sents))
    while len(sent_pool) - orig_sent_pool_len <= total_new_sents:
        source_df = base_training_df if start_with_orig else sent_pool_df
        chosen_sents = sents_choice_for_generation(source_df, col_names, batch_size)
        new_sent_tuples = syn_alg.run_alg_parallel(sent_pool_df, col_names, chosen_sents)

        got_candidates = False  # a flag instead of len(), works for any iterable of tuples
        for new_sent, sent_history in new_sent_tuples:
            got_candidates = True
            if new_sent not in sent_pool:
                didnt_advance_count = 0  # progress was made, restart the stall counter
                print_progress(len(sent_pool) - orig_sent_pool_len, total=total_new_sents)
                # trailing comma (Python 2 print statement): no newline, stay on the progress line
                print(str(replicate_count) + " replicated sents"),
                sent_pool.add(new_sent)
                sent_pool_df = add_sentences_and_histories_to_df(
                    sent_pool_df, col_names, [(new_sent, sent_history)])
            else:
                didnt_advance_count += 1
                replicate_count += 1
        if not got_candidates:
            # a round without candidates is a stall too; without counting it the loop
            # could spin forever when the algorithm has nothing left to generate
            didnt_advance_count += 1
        sent_pool_df = prepare_df_columns(sent_pool_df, col_names)

        if didnt_advance_count >= 50:
            print("didn't advance, stopping synthesis")
            break

    # use the already filled sent_pool_df, the generated rows follow the training rows
    first_new_row = len(base_training_df)
    new_sents_df = sent_pool_df.iloc[first_new_row:first_new_row + total_new_sents] \
        .reset_index(drop=True)
    print("generated " + str(len(new_sents_df)) + " sentences")
    return new_sents_df