Exemple #1
0
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple
    """
    Runs active learning over the whole pool and scores the classifier along the way.

    The pool is copied with its tags cleared and appended to the base training data;
    an uncertainty-sampling query strategy then picks pool examples, and an ideal
    labeler (built from the still-tagged pool) supplies their tags.

    :param labeled_pool_df: pool of examples that already carry their true tags
    :param base_training_df: initial training examples
    :param validation_data_df: data passed to `run_classifier` for every scoring step
    :return: the `(ex_added_list, res_list)` pair produced by `run_active_learning`
    """
    gen_pool_df = labeled_pool_df.copy(deep=True)
    # Clear all tags. `np.nan` is the canonical spelling; the `np.NaN` alias was
    # removed in NumPy 2.0. A scalar is broadcast to every row of the column.
    gen_pool_df[cn.col_names.tag] = np.nan
    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)

    extractor = cn.Feature_Extractor(
        enriched_train_df, cn.col_names)  # build the feature extractor

    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    # The labeler answers queries from the same rows, but with the pool's tags intact.
    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    def scoring_fun(ds):
        """Scores a classifier trained on the currently labeled part of `ds`."""
        return run_classifier(ds.extract_labeled_dataframe(),
                              validation_data_df)

    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))  # label all df

    return ex_added_list, res_list
Exemple #2
0
def prepare_trn_ds(balanced_train_df, generated_pool_df, labeled_pool_df):
    """
    Builds the training dataset, its ideal labeler and the feature extractor they share.

    :param balanced_train_df: base training examples, placed first in both datasets
    :param generated_pool_df: pool appended to the base data to form the training dataset
    :param labeled_pool_df: pool appended to the base data to form the labeler's dataset
    :return: `(trn_ds, lbr, extractor)`
    """
    def stack_on_train(pool_df):
        # Base rows come first; the index is renumbered for the combined frame.
        return pd.concat([balanced_train_df, pool_df], ignore_index=True)

    train_plus_generated = stack_on_train(generated_pool_df)

    # The extractor is built on the training frame only and reused for the labeler.
    feature_extractor = cn.Feature_Extractor(train_plus_generated, cn.col_names)
    training_dataset = TextDataset(train_plus_generated, cn.col_names, feature_extractor)

    train_plus_labeled = stack_on_train(labeled_pool_df)
    oracle_dataset = TextDataset(train_plus_labeled, cn.col_names, feature_extractor)
    return training_dataset, IdealTextLabeler(oracle_dataset), feature_extractor
Exemple #3
0
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Builds and returns an ActiveLearningByLearning QueryStrategy
         using a feature extractor and a base_df

     The strategy combines UncertaintySampling, QUIRE and HintSVM over a single
     TextDataset whose features are precomputed from `sent_df`.

     :param sent_df: dataframe the features and the dataset are built from
     :param col_names: column names descriptor passed to the extractor and dataset
     :return: the configured ActiveLearningByLearning instance
     """
     def new_svm():
         # Each consumer gets its own model instance so no state is shared between them.
         return SVM(C=100,
                    gamma=3.1,
                    kernel='rbf',
                    decision_function_shape='ovr')

     init_extractor = SynStateALHeuristic.build_feature_extractor(
         sent_df, col_names)
     combined_features = init_extractor.transform(sent_df, col_names)
     # Features are passed in precomputed, so no extractor is handed to the dataset.
     trn_ds = TextDataset(sent_df,
                          col_names,
                          None,
                          features=combined_features)
     return ActiveLearningByLearning(
         trn_ds,
         query_strategies=[
             UncertaintySampling(trn_ds, model=new_svm()),
             QUIRE(trn_ds),
             # libact's HintSVM documents its weights as `Cl` / `Ch`; the lowercase
             # spellings `cl` / `ch` are not recognised and leave the defaults in place.
             HintSVM(trn_ds, Cl=1.0, Ch=1.0),
         ],
         T=1000,
         uniform_sampler=True,
         model=new_svm())
Exemple #4
0
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Creates an UncertaintySampling QueryStrategy (method 'lc') backed by
         a LogisticRegression model, over features extracted from sent_df
     """
     feature_extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
     # Features are computed up front, so the dataset receives no extractor.
     dataset = TextDataset(sent_df,
                           col_names,
                           None,
                           features=feature_extractor.transform(sent_df, col_names))
     return UncertaintySampling(dataset, method='lc', model=LogisticRegression())
Exemple #5
0
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Creates a HintSVM QueryStrategy (Cl=0.01, p=0.8)
         over features extracted from sent_df
     """
     feature_extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
     # Features are computed up front, so the dataset receives no extractor.
     dataset = TextDataset(sent_df,
                           col_names,
                           None,
                           features=feature_extractor.transform(sent_df, col_names))
     return HintSVM(dataset, Cl=0.01, p=0.8)
 def build_query_strategy(sent_df, col_names):
     # type: (DataFrame, ColumnNames) -> QueryStrategy
     """
     Creates a VarianceReduction QueryStrategy (sigma=0.1) backed by
         a LogisticRegression model, over features extracted from sent_df
     """
     feature_extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
     # Features are computed up front, so the dataset receives no extractor.
     dataset = TextDataset(sent_df,
                           col_names,
                           None,
                           features=feature_extractor.transform(sent_df, col_names))
     return VarianceReduction(dataset, model=LogisticRegression(), sigma=0.1)
    def heuristic_score_fun(inst_idx, ss_type):
        """
        Scores the instance at `inst_idx` with the heuristic selected by `ss_type`.

        Helper objects that are costly to build (query strategies, the expert-labeled
        dataframe) are created on first use and cached in the enclosing scope's
        `shared_variables` mapping, so later calls reuse them.

        :param inst_idx: index of the instance to score
        :param ss_type: either the string "Random" or a synthesis-state class
        :return: `get_score(inst_idx)` of a cached RandomSampling strategy for "Random",
            otherwise `get_state_score()` of a freshly built `ss_type` state
        """
        if ss_type == "Random":
            if "qs2" not in shared_variables:
                extractor = SynStateALHeuristic.build_feature_extractor(enriched_train_df, col_names)
                shared_variables["qs2"] = RandomSampling(
                    TextDataset(enriched_train_df, col_names, extractor))
            return shared_variables["qs2"].get_score(inst_idx)

        class Object(object):
            pass
        PS_type = type(ss_type.__name__, (object,), dict(orig_state=Object()))  # python hack for naming a type

        def prepare_prev_state(ss_type, prev_state=None):
            """Returns a previous-state stub whose builders hand back the cached helpers."""
            if prev_state is None:
                prev_state = PS_type()

            if issubclass(ss_type, SynStateALHeuristic):
                qs_key = str(ss_type) + "qs"  # one cached query strategy per state class
                if qs_key not in shared_variables:
                    shared_variables[qs_key] = ss_type.build_query_strategy(enriched_train_df, col_names)
                qs = shared_variables[qs_key]
                prev_state.build_next_states_qs = lambda _: qs
            elif ss_type == SynStateTestDataGain:
                if "en_labeled_train_df" not in shared_variables:
                    shared_variables["en_labeled_train_df"] = SynStateTestDataGain. \
                        label_dataframe_with_expert(enriched_train_df, col_names, labeled_df)
                enriched_labeled_train_df = shared_variables["en_labeled_train_df"]
                prev_state.build_next_states_labeled_df = lambda _: enriched_labeled_train_df
            elif ss_type == SynStateRandom:
                pass  # return prev_state as it is
            return prev_state

        ss_prev_state = prepare_prev_state(ss_type)
        ss = ss_type(inst_idx, enriched_train_df, col_names, ss_prev_state)
        return ss.get_state_score()