def __init__(self, state_idx, sent_df, col_names, prev_state=None):
    # type: (int, DataFrame, ColumnNames, SynStateFurthestFromPos) -> None
    """Initialize the state and cache, in self.min_dist, the minimal
    squared L1 distance between this sentence's feature representation
    and the representations of all already-tagged sentences.

    :param state_idx: row index of this state's sentence in sent_df
    :param sent_df: DataFrame with all sentences (tagged and untagged)
    :param col_names: column-name mapping for sent_df
    :param prev_state: previous state in the chain, or None
    """
    super(SynStateFurthestFeatureSp, self).__init__(state_idx, sent_df,
                                                    col_names, prev_state)
    # indexes of every sentence that already has a (non-null) tag
    relevant_idxs = list(
        self.sent_df[self.sent_df[col_names.tag].notnull()].index)
    # relevant_idxs.remove(state_idx)
    # relevant_idxs = np.array(relevant_idxs)
    if self.col_names.feature_repr in self.sent_df.columns:
        # representations were pre-computed and stored in the DataFrame
        self.sent_repr = self.sent_df[self.col_names.feature_repr][
            self.state_idx]
        # squared L1 distance to each tagged row; np.min picks the
        # closest one (Python 2 tuple-parameter lambda over iterrows())
        self.min_dist = np.min(map(
            lambda (i, r): LA.norm(
                r[col_names.feature_repr] - self.sent_repr, 1)**2,
            self.sent_df.iloc[relevant_idxs].iterrows()), axis=0)
    else:
        # no cached representations: build an extractor and transform
        # the whole frame in one pass
        extractor = cn.Feature_Extractor(
            sent_df,
            col_names)  # this is actually faster than DataFrame.append()
        X_all = extractor.transform(sent_df, col_names)
        self.sent_repr = X_all[self.state_idx]
        self.min_dist = np.min(
            map(lambda feat: LA.norm(feat - self.sent_repr, 1)**2,
                X_all[relevant_idxs]))
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple
    """Score an active-learning run that labels the whole generated pool.

    A tag-stripped copy of labeled_pool_df is appended to the base training
    set; an ideal labeler (which knows the true tags) answers the
    uncertainty-sampling queries until every example is labeled.
    Returns (examples_added_list, score_list).
    """
    # work on an untagged deep copy so the caller's pool is untouched
    untagged_pool_df = labeled_pool_df.copy(deep=True)
    untagged_pool_df[cn.col_names.tag] = [np.NaN] * len(untagged_pool_df)

    enriched_train_df = pd.concat([base_training_df, untagged_pool_df],
                                  ignore_index=True)
    # build the feature extractor once over the enriched set
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)
    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    # the ideal labeler holds the ground-truth tags for the pool
    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    def scoring_fun(ds):
        # evaluate the currently-labeled subset against the validation set
        return run_classifier(ds.extract_labeled_dataframe(),
                              validation_data_df)

    # quota = full dataset size, i.e. keep querying until all are labeled
    ex_added_list, res_list = run_active_learning(trn_ds, scoring_fun, lbr,
                                                  qs, len(enriched_train_df))
    return ex_added_list, res_list
def prepare_trn_ds(balanced_train_df, generated_pool_df, labeled_pool_df):
    """Assemble the pieces needed for an active-learning run.

    Returns (training TextDataset over train+generated pool,
             ideal labeler built from train+labeled pool,
             the shared feature extractor).
    """
    train_df = pd.concat([balanced_train_df, generated_pool_df],
                         ignore_index=True)
    # one extractor, fit on the enriched training frame, shared by both
    # datasets so their feature spaces match
    extractor = cn.Feature_Extractor(train_df, cn.col_names)
    truth_df = pd.concat([balanced_train_df, labeled_pool_df],
                         ignore_index=True)
    labeler = IdealTextLabeler(TextDataset(truth_df, cn.col_names, extractor))
    training_ds = TextDataset(train_df, cn.col_names, extractor)
    return training_ds, labeler, extractor
def test_libact_first_try_results_are_the_same(self): """ test that the first libact example work the same way as the original example taken from github\ very long test ! """ # self.skipTest(reason="too long") cn.Inner_PredictionModel = SvmModel cn.Feature_Extractor = AvgGloveExtractor cn.load_codementor_sentiment_analysis_parameters() kb_helper.load_WordNet_model() quota = 5 # ask labeler to label 5 samples (tops) base_training_df, validation_data_df = prepare_balanced_dataset() pos_sents = pandas_util.get_all_positive_sentences( base_training_df, cn.col_names.text, cn.col_names.tag, cn.pos_tags) # prepare all data generated_pool_df = sg.generate_sents_using_random_synthesis( pos_sents, base_training_df, cn.col_names) labeled_pool_df = label_df_with_expert(generated_pool_df, cn.col_names) enriched_train_df = pd.concat([base_training_df, generated_pool_df], ignore_index=True) ideal_df = pd.concat([base_training_df, labeled_pool_df], ignore_index=True) extractor = cn.Feature_Extractor( enriched_train_df, cn.col_names) # build the feature extractor lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor)) manager = multiprocessing.Manager() return_dict = manager.dict() jobs = [] # first job p = multiprocessing.Process(target=self.libact_first_try_first_run, args=(enriched_train_df, extractor, lbr, quota, validation_data_df, return_dict)) jobs.append(p) p.start() # second job p = multiprocessing.Process(target=self.libact_first_try_second_run, args=(enriched_train_df, extractor, ideal_df, lbr, quota, validation_data_df, return_dict)) jobs.append(p) p.start() for proc in jobs: proc.join() self.assertTrue(np.array_equal(return_dict[1], return_dict[2]))
def prepare_classifier(train_data_df, validation_data_df, col_names):
    # type: (pd.DataFrame, pd.DataFrame, ColumnNames) -> (PredictionModel, np.ndarray, np.ndarray)
    """Train the inner prediction model on the training frame.

    Returns (trained model, validation feature matrix, validation labels
    as an int ndarray).
    """
    # build the feature extractor on the training data only
    extractor = cn.Feature_Extractor(train_data_df, col_names)

    # transform train+validation in a single pass — cheaper than two
    # separate transform calls
    combined_df = pd.concat([train_data_df, validation_data_df])
    X_all = extractor.transform(combined_df, col_names)

    # split the combined matrix back into train / validation parts
    n_train = len(train_data_df[col_names.text])
    X_train = X_all[:n_train]
    X_test = X_all[n_train:]

    y_train = train_data_df[col_names.tag].tolist()
    model = cn.Inner_PredictionModel(X_train, y_train)

    y_test = np.array(validation_data_df[col_names.tag], dtype=int)
    return model, X_test, y_test
def build_feature_extractor(sent_df, col_names):
    # type: (pd.DataFrame, ColumnNames) -> FeatureExtractor
    """Build and return a feature extractor fitted on sent_df.

    :param sent_df: DataFrame of sentences to fit the extractor on
    :param col_names: column-name mapping for sent_df
    :return: the extractor only (the previous annotation wrongly claimed
             a (FeatureExtractor, DataFrame) tuple was returned)
    """
    return cn.Feature_Extractor(sent_df, col_names)