def __init__(self, state_idx, sent_df, col_names, prev_state=None):
        # type: (int, DataFrame, ColumnNames, SynStateFurthestFromPos) -> None
        super(SynStateFurthestFeatureSp,
              self).__init__(state_idx, sent_df, col_names, prev_state)

        # indices of the rows that already carry a tag
        relevant_idxs = list(
            self.sent_df[self.sent_df[col_names.tag].notnull()].index)
        if self.col_names.feature_repr in self.sent_df.columns:
            # a cached feature representation exists -- reuse it
            self.sent_repr = self.sent_df[self.col_names.feature_repr][
                self.state_idx]
            self.min_dist = min(
                LA.norm(row[col_names.feature_repr] - self.sent_repr, 1) ** 2
                for _, row in self.sent_df.iloc[relevant_idxs].iterrows())
        else:
            # no cached representation: extract features for the whole frame
            extractor = cn.Feature_Extractor(sent_df, col_names)
            X_all = extractor.transform(sent_df, col_names)
            self.sent_repr = X_all[self.state_idx]

            self.min_dist = min(
                LA.norm(feat - self.sent_repr, 1) ** 2
                for feat in X_all[relevant_idxs])
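The min_dist above is the smallest squared L1 distance between the current state's representation and every already-tagged row. A minimal self-contained sketch of that computation on toy data, assuming np is numpy and LA is numpy.linalg as in the snippet:

import numpy as np
from numpy import linalg as LA

# hypothetical toy data: one feature row per sentence
X_all = np.array([[0.0, 1.0], [2.0, 2.0], [5.0, 0.5]])
sent_repr = X_all[0]       # representation of the current state
relevant_idxs = [1, 2]     # rows that already carry a tag

# squared L1 distance to each tagged row; keep the smallest
min_dist = min(LA.norm(feat - sent_repr, 1) ** 2
               for feat in X_all[relevant_idxs])
print(min_dist)  # 9.0: row 1 gives |2-0| + |2-1| = 3, squared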
Example 2
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple

    gen_pool_df = labeled_pool_df.copy(deep=True)
    gen_pool_df[cn.col_names.tag] = np.nan  # clear all tags (the scalar broadcasts)
    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)

    extractor = cn.Feature_Extractor(
        enriched_train_df, cn.col_names)  # build the feature extractor

    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    def scoring_fun(ds):
        # score by training on the labeled part and evaluating on the validation set
        return run_classifier(ds.extract_labeled_dataframe(),
                              validation_data_df)
    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))  # label all df

    return ex_added_list, res_list
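run_active_learning is project code, but it presumably wraps libact's standard query loop: ask the strategy for the most informative sample, label it, update the dataset, and re-score. A hedged sketch of such a loop (active_learning_loop is a hypothetical name; make_query, label, and update are the standard libact calls on the objects built above):

def active_learning_loop(trn_ds, qs, lbr, scoring_fun, quota):
    # sketch only -- the project's run_active_learning may differ in details
    scores = []
    for _ in range(quota):
        ask_id = qs.make_query()                 # id of the most uncertain sample
        lb = lbr.label(trn_ds.data[ask_id][0])   # ask the (ideal) labeler for its tag
        trn_ds.update(ask_id, lb)                # fold the new label into the dataset
        scores.append(scoring_fun(trn_ds))       # re-score after each added label
    return scores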
Example 3
def prepare_trn_ds(balanced_train_df, generated_pool_df, labeled_pool_df):
    enriched_train_df = pd.concat([balanced_train_df, generated_pool_df], ignore_index=True)

    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)  # build the feature extractor
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

    ideal_df = pd.concat([balanced_train_df, labeled_pool_df], ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))
    return trn_ds, lbr, extractor
Example 4
    def test_libact_first_try_results_are_the_same(self):
        """
        test that the first libact example work the same way as the original example taken from github\

        very long test !
        """

        # self.skipTest(reason="too long")

        cn.Inner_PredictionModel = SvmModel
        cn.Feature_Extractor = AvgGloveExtractor
        cn.load_codementor_sentiment_analysis_parameters()
        kb_helper.load_WordNet_model()

        quota = 5  # ask labeler to label 5 samples (tops)
        base_training_df, validation_data_df = prepare_balanced_dataset()
        pos_sents = pandas_util.get_all_positive_sentences(
            base_training_df, cn.col_names.text, cn.col_names.tag, cn.pos_tags)

        # prepare all data
        generated_pool_df = sg.generate_sents_using_random_synthesis(
            pos_sents, base_training_df, cn.col_names)
        labeled_pool_df = label_df_with_expert(generated_pool_df, cn.col_names)

        enriched_train_df = pd.concat([base_training_df, generated_pool_df],
                                      ignore_index=True)
        ideal_df = pd.concat([base_training_df, labeled_pool_df],
                             ignore_index=True)

        extractor = cn.Feature_Extractor(
            enriched_train_df, cn.col_names)  # build the feature extractor

        lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        jobs = []
        # first job
        p = multiprocessing.Process(target=self.libact_first_try_first_run,
                                    args=(enriched_train_df, extractor, lbr,
                                          quota, validation_data_df,
                                          return_dict))
        jobs.append(p)
        p.start()
        # second job
        p = multiprocessing.Process(target=self.libact_first_try_second_run,
                                    args=(enriched_train_df, extractor,
                                          ideal_df, lbr, quota,
                                          validation_data_df, return_dict))
        jobs.append(p)
        p.start()

        for proc in jobs:
            proc.join()

        self.assertTrue(np.array_equal(return_dict[1], return_dict[2]))
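The test runs the two variants in parallel processes and compares their results through a shared dict. A minimal sketch of that multiprocessing.Manager pattern, with a toy worker standing in for the libact_first_try_*_run helpers:

import multiprocessing

def worker(run_id, return_dict):
    # each process reports its result under its own key,
    # like return_dict[1] / return_dict[2] in the test above
    return_dict[run_id] = run_id * 10

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    jobs = [multiprocessing.Process(target=worker, args=(i, return_dict))
            for i in (1, 2)]
    for p in jobs:
        p.start()
    for p in jobs:
        p.join()
    print(dict(return_dict))  # {1: 10, 2: 20}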
Example 5
def prepare_classifier(train_data_df, validation_data_df, col_names):
    # type: (pd.DataFrame, pd.DataFrame, ColumnNames) -> (PredictionModel, np.ndarray, np.ndarray)

    # build the feature extractor
    extractor = cn.Feature_Extractor(train_data_df, col_names)

    # extract features for train and validation in a single pass
    # (pd.concat is faster than repeated DataFrame.append calls)
    combined_df = pd.concat([train_data_df, validation_data_df])
    X_all = extractor.transform(combined_df, col_names)

    X_train = X_all[:len(train_data_df)]
    y_train = train_data_df[col_names.tag].tolist()
    X_test = X_all[len(train_data_df):]
    y_test = validation_data_df[col_names.tag]

    # the expert model trains on all the labeled data, which is what makes it an expert
    model = cn.Inner_PredictionModel(X_train, y_train)

    return model, X_test, np.array(y_test, dtype=int)
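The fit-once/slice pattern above (transform train and validation together, then slice by the training length) can be reproduced with any vectorizer. A sketch with scikit-learn's CountVectorizer standing in for the project's Feature_Extractor:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

train_df = pd.DataFrame({'text': ['good movie', 'bad plot'], 'tag': [1, 0]})
valid_df = pd.DataFrame({'text': ['good plot'], 'tag': [1]})

# vectorize both frames in one pass, then slice by the training length
combined = pd.concat([train_df, valid_df])
X_all = CountVectorizer().fit_transform(combined['text'])
X_train, X_test = X_all[:len(train_df)], X_all[len(train_df):]
y_train = train_df['tag'].tolist()
y_test = np.array(valid_df['tag'], dtype=int)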
Example 6
def build_feature_extractor(sent_df, col_names):
    # type: (pd.DataFrame, ColumnNames) -> FeatureExtractor
    """
    Builds and returns a feature extractor using sent_df.
    """
    return cn.Feature_Extractor(sent_df, col_names)