def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple
    """Run uncertainty-sampling active learning over a pool and score it.

    A deep copy of the pool has its tag column blanked and is appended to
    the base training data; that combined frame is what the query strategy
    sees. A second frame, built from the base data plus the pool with its
    tags intact, backs the labeler that is handed to ``run_active_learning``.

    :param labeled_pool_df: pool of tagged examples; it is copied, never
        modified in place.
    :param base_training_df: training examples placed in front of the pool.
    :param validation_data_df: frame passed to ``run_classifier`` together
        with the currently labeled rows whenever a score is requested.
    :return: the ``(ex_added_list, res_list)`` pair returned by
        ``run_active_learning``.
    """
    gen_pool_df = labeled_pool_df.copy(deep=True)
    # Clear all tags. ``np.nan`` is used because the ``np.NaN`` alias was
    # removed in NumPy 2.0.
    gen_pool_df[cn.col_names.tag] = [np.nan] * len(gen_pool_df)

    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)
    # The same extractor is shared by the training dataset and the labeler's
    # dataset.
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    def scoring_fun(ds):
        # Score the rows labeled so far against the validation data.
        return run_classifier(ds.extract_labeled_dataframe(),
                              validation_data_df)

    # The quota is the size of the whole enriched frame ("label all df").
    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))
    return ex_added_list, res_list
def prepare_trn_ds(balanced_train_df, generated_pool_df, labeled_pool_df):
    """Build the training dataset, its labeler and the shared extractor.

    The training dataset is made from ``balanced_train_df`` followed by
    ``generated_pool_df``; the labeler wraps a dataset made from
    ``balanced_train_df`` followed by ``labeled_pool_df``. Both datasets use
    the feature extractor constructed from the first of those two frames.

    :return: ``(trn_ds, lbr, extractor)``
    """
    def stack_on_base(pool_df):
        # Base rows first, pool rows after, renumbered from zero.
        return pd.concat([balanced_train_df, pool_df], ignore_index=True)

    train_frame = stack_on_base(generated_pool_df)
    feature_extractor = cn.Feature_Extractor(train_frame, cn.col_names)
    dataset = TextDataset(train_frame, cn.col_names, feature_extractor)

    labeler_dataset = TextDataset(
        stack_on_base(labeled_pool_df), cn.col_names, feature_extractor)
    labeler = IdealTextLabeler(labeler_dataset)

    return dataset, labeler, feature_extractor
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build an ``ActiveLearningByLearning`` strategy over ``sent_df``.

    Features are computed once with the extractor returned by
    ``SynStateALHeuristic.build_feature_extractor`` and passed to the
    dataset directly (the dataset's extractor argument is ``None``). The
    returned strategy combines ``UncertaintySampling``, ``QUIRE`` and
    ``HintSVM`` sub-strategies, all operating on that one dataset.

    :param sent_df: frame the features and the dataset are built from.
    :param col_names: column-name descriptor forwarded to the extractor and
        the dataset.
    :return: the configured ``ActiveLearningByLearning`` instance.
    """
    def new_svm():
        # A separate, identically configured model for each consumer.
        return SVM(C=100, gamma=3.1, kernel='rbf',
                   decision_function_shape='ovr')

    init_extractor = SynStateALHeuristic.build_feature_extractor(
        sent_df, col_names)
    combined_features = init_extractor.transform(sent_df, col_names)
    trn_ds = TextDataset(sent_df, col_names, None, features=combined_features)

    return ActiveLearningByLearning(
        trn_ds,
        query_strategies=[
            UncertaintySampling(trn_ds, model=new_svm()),
            QUIRE(trn_ds),
            # libact's HintSVM takes its error weights as capitalised
            # ``Cl``/``Ch``; the lowercase spellings are not recognised, so
            # the 1.0 weights never took effect.
            HintSVM(trn_ds, Cl=1.0, Ch=1.0),
        ],
        T=1000,
        uniform_sampler=True,
        model=new_svm())
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build an ``UncertaintySampling`` strategy (``method='lc'``).

    Features are precomputed with the extractor returned by
    ``SynStateALHeuristic.build_feature_extractor`` and supplied to the
    dataset directly; the dataset's extractor argument is ``None``. The
    strategy's model is a fresh ``LogisticRegression``.
    """
    extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
    features = extractor.transform(sent_df, col_names)
    dataset = TextDataset(sent_df, col_names, None, features=features)
    return UncertaintySampling(
        dataset,
        method='lc',
        model=LogisticRegression(),
    )
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build a ``HintSVM`` strategy with ``Cl=0.01`` and ``p=0.8``.

    Features are precomputed with the extractor returned by
    ``SynStateALHeuristic.build_feature_extractor`` and supplied to the
    dataset directly; the dataset's extractor argument is ``None``.
    """
    extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
    features = extractor.transform(sent_df, col_names)
    dataset = TextDataset(sent_df, col_names, None, features=features)
    return HintSVM(dataset, Cl=0.01, p=0.8)
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build a ``VarianceReduction`` strategy with ``sigma=0.1``.

    Features are precomputed with the extractor returned by
    ``SynStateALHeuristic.build_feature_extractor`` and supplied to the
    dataset directly; the dataset's extractor argument is ``None``. The
    strategy's model is a fresh ``LogisticRegression``.
    """
    extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
    features = extractor.transform(sent_df, col_names)
    dataset = TextDataset(sent_df, col_names, None, features=features)
    return VarianceReduction(
        dataset,
        model=LogisticRegression(),
        sigma=0.1,
    )
def heuristic_score_fun(inst_idx, ss_type):
    """Return the score of instance ``inst_idx`` under heuristic ``ss_type``.

    ``ss_type`` is either the string ``"Random"`` or a state class. Objects
    that are costly to build (query strategies, the expert-labeled frame)
    are created on first use and kept in ``shared_variables``.

    ``shared_variables``, ``enriched_train_df``, ``col_names`` and
    ``labeled_df`` are free names resolved from the surrounding scope.
    """
    def cached(key, build):
        # Build and store on first request; always hand back what the
        # shared mapping holds under that key.
        if key not in shared_variables:
            shared_variables[key] = build()
        return shared_variables[key]

    if ss_type == "Random":
        def build_random_sampler():
            extractor = SynStateALHeuristic.build_feature_extractor(
                enriched_train_df, col_names)
            return RandomSampling(
                TextDataset(enriched_train_df, col_names, extractor))

        return cached("qs2", build_random_sampler).get_score(inst_idx)

    class Object(object):
        pass

    # python hack for naming a type: the previous-state class carries the
    # name of ``ss_type`` and a class-level ``orig_state`` placeholder.
    prev_state_cls = type(
        ss_type.__name__, (object,), {"orig_state": Object()})
    prev_state = prev_state_cls()

    if issubclass(ss_type, SynStateALHeuristic):
        qs = cached(
            str(ss_type) + "qs",
            lambda: ss_type.build_query_strategy(enriched_train_df, col_names))
        prev_state.build_next_states_qs = lambda _: qs
    elif ss_type == SynStateTestDataGain:
        enriched_labeled_train_df = cached(
            "en_labeled_train_df",
            lambda: SynStateTestDataGain.label_dataframe_with_expert(
                enriched_train_df, col_names, labeled_df))
        prev_state.build_next_states_labeled_df = (
            lambda _: enriched_labeled_train_df)
    elif ss_type == SynStateRandom:
        pass  # prev_state is used as it is

    state = ss_type(inst_idx, enriched_train_df, col_names, prev_state)
    return state.get_state_score()