def curr_pool_synthesis_from_sents(base_sents, base_training_df, col_names, total_new_sents=None,
                                   choose_bulk_method=choose_best_by_uncertainty):
    # type: (list, DataFrame, ColumnNames, int, callable) -> DataFrame
    """
    generates new examples based on $base_sents using a generic algorithm
    :param base_sents: list of base sents, from which we generate new ones
    :param base_training_df: DataFrame containing all labeled sentences
    :param col_names: contains the names of the columns in the output DataFrame
    :param total_new_sents: indicates the number of sentences we want to synthesize
    :param choose_bulk_method: a method for choosing sentences to be sent for generation
    :return: an unlabeled DataFrame with the new sentences generated
    """
    print "start curr pool search map"
    total_new_sents = pool_size(total_new_sents, base_sents)
    wanted_new_sents = int(total_new_sents * 4)
    choose_amount = wanted_new_sents / 8 + 1
    cn.add_experiment_param("choose_amount_"+str(choose_amount))
    if "choose_amount" not in cn.experiment_purpose:
        cn.experiment_purpose += "curr_pool choose_amount="+str(choose_amount)+", "

    from ResearchNLP.knowledge_bases import kb_helper
    kb_helper.k_base.load_knowledgebase()  # explicitly load to help processes share memory
    # print kb_helper.kb_type

    didnt_advance_count = 0
    sent_pool = set(base_sents)
    current_pool = list(base_sents)
    sent_pool_df = base_training_df.copy(deep=True)
    print "total new sentences: " + str(wanted_new_sents)
    while len(sent_pool) - len(base_sents) <= wanted_new_sents:  # generate 4x the requested amount; a quarter is kept at the end
        all_new_tuples = synthesize_tree_depth1_bulk(current_pool, sent_pool_df, col_names)
        combined_df = add_sentences_and_histories_to_df(base_training_df, col_names, all_new_tuples)
        combined_df = prepare_df_columns(combined_df, col_names)
        chosen_idxs = choose_bulk_method(combined_df, col_names, choose_amount, sent_pool)
        if len(chosen_idxs) == 0:
            didnt_advance_count += 1

        for idx in chosen_idxs:
            sent_pool_df.loc[len(sent_pool_df)] = combined_df.loc[idx].copy(deep=True)
            # add new example to sent pools
            new_sent = combined_df[col_names.text][idx]
            assert new_sent not in sent_pool, "the new sentence should not already be in the pool"
            current_pool.append(new_sent)
            sent_pool.add(new_sent)
            print_progress(len(sent_pool) - len(base_sents), total=wanted_new_sents)
            didnt_advance_count = 0

        sent_pool_df = prepare_df_columns(sent_pool_df, col_names)

        if didnt_advance_count >= 50:
            print "didn't advance, stopping synthesis"
            break

    # use the already filled sent_pool_df
    final_chosen_idxs = choose_bulk_method(sent_pool_df, col_names, total_new_sents, set())
    new_sents_df = sent_pool_df.iloc[final_chosen_idxs].reset_index(drop=True)
    print "\ngenerated", len(new_sents_df), "sentences"
    return new_sents_df
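

# Hedged usage sketch (added for illustration; the DataFrame contents below and the
# assumption that col_names exposes .text and .tag attributes are examples only,
# not the project's actual data):
#
#   import pandas as pd
#   base_training_df = pd.DataFrame({col_names.text: ["a good movie", "a dull plot"],
#                                    col_names.tag:  [1.0, 0.0]})
#   base_sents = base_training_df[col_names.text].tolist()
#   new_sents_df = curr_pool_synthesis_from_sents(base_sents, base_training_df,
#                                                 col_names, total_new_sents=20)
#   # new_sents_df now holds up to 20 unlabeled synthesized sentences
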
    def classify_df(self, unlabeled_df):
        # type: (pd.DataFrame) -> pd.DataFrame
        # if cn.balance_dataset:
        #     self.all_data_df = pandas_util.imbalance_dataset(cn.data_df, cn.tag_col, 0.5,
        #                                                      cn.pos_tags, cn.neg_tags)

        # build the feature extractor
        combined_df = pd.concat([self.all_data_df, unlabeled_df])
        extractor_cls = cn.Expert_PredictionModel.get_FeatureExtractor_cls()
        if extractor_cls is None:
            extractor_cls = cn.Expert_FeatureExtractor

        extractor = extractor_cls(combined_df, self.col_names)
        X_all = extractor.transform(combined_df, self.col_names)  # use pre-extracted features

        # split the combined features back into unlabeled and training parts
        X_unlabeled = X_all[len(self.all_data_df[self.col_names.text]):]
        X_train = X_all[:len(self.all_data_df[self.col_names.text])]
        y_train = self.all_data_df[self.col_names.tag].tolist()

        # the expert model trains on all labeled data (which makes it an "expert")
        model = cn.Expert_PredictionModel(X_train, y_train)

        y_pred = model.train_model_and_predict(X_unlabeled)
        # print y_pred.tolist()

        labeled_df = unlabeled_df.copy(deep=True)
        # add predictions to the df (simulates a human label)
        labeled_df[self.col_names.tag] = map(float, y_pred)
        labeled_df = prepare_df_columns(labeled_df, self.col_names)

        return labeled_df  # return the now tagged df
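
# Hedged usage sketch (illustrative only; the ExpertSimulator name and its
# constructor are assumptions, only the classify_df signature above is relied on):
#
#   expert = ExpertSimulator(all_data_df, col_names)
#   labeled_df = expert.classify_df(new_sents_df)
#   # labeled_df mimics a human-labeled copy of new_sents_df: the model trained on
#   # all labeled data fills col_names.tag with its (float) predictions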
Example #3
    def classify_df(self, unlabeled_df):
        # type: (DataFrame) -> DataFrame

        # use the human gui package for classifying with a human
        labeled_df = human_expert_gui.classify_by_expert(
            unlabeled_df, self.col_names.text, self.col_names.tag,
            self.positive_text, self.negative_text)
        labeled_df[self.col_names.tag] = map(float,
                                             labeled_df[self.col_names.tag])
        labeled_df = prepare_df_columns(labeled_df, self.col_names)
        # labeled_df.to_pickle('tagged_gen_sents.pkl')  # save to file
        return labeled_df
def _build_new_sents_df(sent_pool, col_names, base_sents, do_difference=True):
    # type: (set, ColumnNames, list, bool) -> pd.DataFrame
    """
        Builds a new DataFrame holding all the new sents from $sent_pool (excluding sents from $base_sents)
    """
    if do_difference: sent_pool = sent_pool.difference(set(base_sents))  # only new sentences

    # put all new unlabeled generated sentences in a new DataFrame
    new_sent_df = pd.DataFrame(columns=list(col_names))
    new_sent_df[col_names.text] = list(sent_pool)  # fill the text column with the new sentences
    new_sent_df = prepare_df_columns(new_sent_df, col_names)
    return new_sent_df
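
# Hedged usage sketch (illustrative; the sentences below are made up and col_names
# is assumed to expose a .text attribute):
#
#   pool = {"a good movie", "a fine film", "a dull plot"}
#   base = ["a good movie"]
#   df = _build_new_sents_df(pool, col_names, base)
#   # df[col_names.text] contains only the two sentences not in base, and the
#   # remaining columns are initialized by prepare_df_columns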
Example #5
    def __init__(self, df, col_names, feature_extractor, features=None):
        # type: (DataFrame, ColumnNames, FeatureExtractor, np.ndarray) -> None
        assert features is not None or feature_extractor is not None  # features=None -> extractor!=None
        # prepare data for the superclass
        self.col_names = col_names
        self.df = prepare_df_columns(df, col_names).copy(deep=True)  # keep a prepared private copy
        self.sents = df[col_names.text]  # used in IdealLabeler
        if features is None:
            features = feature_extractor.transform(df, col_names)
        tags = map(lambda tag: tag if not math.isnan(tag) else None,
                   df[col_names.tag].astype(float))

        def update_dataframe(entry_id, lb):
            self.df.loc[entry_id, col_names.tag] = lb

        super(TextDataset, self).__init__(features, tags)
        self.on_update(update_dataframe)
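
# Hedged usage sketch (illustrative; constructing the extractor as
# cn.Expert_FeatureExtractor(df, col_names) mirrors the call seen in classify_df
# above, everything else is an assumption):
#
#   extractor = cn.Expert_FeatureExtractor(labeled_df, col_names)
#   dataset = TextDataset(labeled_df, col_names, extractor)
#   # labels applied through the dataset are written back into dataset.df,
#   # because update_dataframe is registered via on_update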
Example #6
def _generic_synthesis_from_sents(
        base_sents,
        base_training_df,
        col_names,
        syn_alg,
        total_new_sents,
        sents_choice_for_generation=choose_random_sents_from_df,
        batch_size=1,
        start_with_orig=False):
    # type: (list, DataFrame, ColumnNames, SynthesisAlgorithm, int, callable, int, bool) -> DataFrame
    """
    generates new examples based on $base_sents using a generic algorithm
    :param base_sents: list of base sents, from which we generate new ones
    :param base_training_df: DataFrame containing all labeled sentences
    :param col_names: contains the names of the columns in the output DataFrame
    :param syn_alg: a synthesis algorithm which makes the generation
    :param total_new_sents: indicates the number of sentences we want to synthesize
    :param sents_choice_for_generation: a method for choosing which sentences are sent for generation
    :param batch_size: how many sentences are chosen for generation in each iteration
    :param start_with_orig: if True, generation seeds are always chosen from $base_training_df
    :return: an unlabeled DataFrame with the new sentences generated
    """

    from ResearchNLP.knowledge_bases import kb_helper
    kb_helper.k_base.load_knowledgebase()  # explicitly load to help processes share memory
    # print kb_helper.kb_type

    didnt_advance_count = 0
    from ResearchNLP.text_synthesis.heuristic_functions import choose_best_for_expansion
    choose_best_for_expansion.counter = 0
    replicate_count = 0
    sent_pool = set(base_sents).union(set(base_training_df[col_names.text]))
    orig_sent_pool_len = len(sent_pool)
    sent_pool_df = base_training_df.copy(deep=True)
    print "total new sentences: " + str(total_new_sents)
    while len(sent_pool) - orig_sent_pool_len <= total_new_sents:  # loop until enough new sentences are collected
        if start_with_orig:
            chosen_sents = sents_choice_for_generation(base_training_df,
                                                       col_names, batch_size)
        else:
            chosen_sents = sents_choice_for_generation(sent_pool_df, col_names,
                                                       batch_size)
        new_sent_tuples = syn_alg.run_alg_parallel(sent_pool_df, col_names,
                                                   chosen_sents)

        for new_sent, sent_history in new_sent_tuples:
            if new_sent not in sent_pool:
                didnt_advance_count = 0
                print_progress(len(sent_pool) - orig_sent_pool_len,
                               total=total_new_sents)
                print str(replicate_count) + " replicated sents",
                sent_pool.add(new_sent)
                sent_pool_df = add_sentences_and_histories_to_df(
                    sent_pool_df, col_names, [(new_sent, sent_history)])
            else:
                didnt_advance_count += 1
                replicate_count += 1

        sent_pool_df = prepare_df_columns(sent_pool_df, col_names)

        if didnt_advance_count >= 50:
            print "didn't advance, stopping synthesis"
            break

    # use the already filled sent_pool_df
    new_sents_df = sent_pool_df.iloc[len(base_training_df):(len(base_training_df)+total_new_sents)] \
        .reset_index(drop=True)
    print "generated", len(new_sents_df), "sentences"
    return new_sents_df
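
# Hedged usage sketch (illustrative; the concrete SynthesisAlgorithm below is
# hypothetical, only the run_alg_parallel(df, col_names, sents) call used in the
# loop above is relied on):
#
#   syn_alg = SomeSynthesisAlgorithm()  # hypothetical concrete algorithm
#   new_sents_df = _generic_synthesis_from_sents(base_sents, base_training_df,
#                                                col_names, syn_alg,
#                                                total_new_sents=50, batch_size=5)
#   # seed sentences are sampled repeatedly, synthesized variants that are not yet
#   # in the pool are appended, and the first 50 new rows are returned unlabeled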