def test_libact_first_try_results_are_the_same(self):
    """Verify that the first libact example works the same way as the
    original example taken from GitHub.

    Runs the two variants in separate processes and compares the score
    arrays they deposit in a shared dict. Very long test!
    """
    # self.skipTest(reason="too long")
    cn.Inner_PredictionModel = SvmModel
    cn.Feature_Extractor = AvgGloveExtractor
    cn.load_codementor_sentiment_analysis_parameters()
    kb_helper.load_WordNet_model()
    quota = 5  # ask labeler to label 5 samples (tops)
    base_training_df, validation_data_df = prepare_balanced_dataset()
    pos_sents = pandas_util.get_all_positive_sentences(
        base_training_df, cn.col_names.text, cn.col_names.tag, cn.pos_tags)
    # prepare all data
    generated_pool_df = sg.generate_sents_using_random_synthesis(
        pos_sents, base_training_df, cn.col_names)
    labeled_pool_df = label_df_with_expert(generated_pool_df, cn.col_names)
    enriched_train_df = pd.concat([base_training_df, generated_pool_df],
                                  ignore_index=True)
    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    # build the feature extractor
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))
    manager = multiprocessing.Manager()
    return_dict = manager.dict()  # shared between the two child processes
    jobs = []
    # first job
    p = multiprocessing.Process(
        target=self.libact_first_try_first_run,
        args=(enriched_train_df, extractor, lbr, quota,
              validation_data_df, return_dict))
    jobs.append(p)
    p.start()
    # second job
    p = multiprocessing.Process(
        target=self.libact_first_try_second_run,
        args=(enriched_train_df, extractor, ideal_df, lbr, quota,
              validation_data_df, return_dict))
    jobs.append(p)
    p.start()
    for proc in jobs:
        proc.join()
    # both runs must have produced identical score arrays
    self.assertTrue(np.array_equal(return_dict[1], return_dict[2]))
def test_heuristic_test_data_gain_works(self):
    """Check that the test-data-gain heuristic beats random selection.

    Inserts generated examples in active-learning fashion with each
    heuristic and asserts both start from the same score while
    test-data-gain ends higher.
    """
    cn.load_codementor_sentiment_analysis_parameters()
    kb_helper.load_WordNet_model()
    base_train_df, pool_df, validation_data_df = prepare_pool_based_dataset()
    all_sents = list(base_train_df[cn.col_names.text])
    tns = 2  # number of examples each heuristic gets to add
    pool_name, prep_pools = (
        "orig pool",
        lambda *_: (pool_df.iloc[:tns + 5],
                    cn.labeled_pool_df.iloc[:tns + 5]))
    train_with_pool_df = pd.concat([base_train_df, pool_df],
                                   ignore_index=True)
    generated_pool_df, labeled_pool_df = prep_pools(
        all_sents, train_with_pool_df, cn.col_names, tns)
    cn.experiment_purpose += pool_name + " "
    trn_ds, lbr, extractor = prepare_trn_ds(base_train_df, generated_pool_df,
                                            labeled_pool_df)
    # accuracy on the held-out validation data is the scoring function
    final_scoring_fun = partial(
        lambda en_df: run_classifier(en_df, validation_data_df).acc)
    table_headers = ['#added examples']
    data = [range(0, tns + 1)]
    compared_heuristics = [("test-data-gain", lambda: SynStateTestDataGain),
                           ("random", lambda: "random")]
    for (heuristic_name, prepare_usage) in compared_heuristics:
        ss_type = prepare_usage()
        table_headers.append(heuristic_name)
        print(heuristic_name)
        _, heur_scores = insert_in_AL_fashion(trn_ds, final_scoring_fun, lbr,
                                              ss_type, labeled_pool_df,
                                              quota=tns)
        data.append(heur_scores)
    print(data[1])
    print(data[2])
    self.assertEqual(data[1][0], data[2][0], "starts are same")
    self.assertGreater(data[1][-1], data[2][-1],
                       "test-data-gain should be better than random")
def setUpClass(cls):
    """Load experiment parameters (distance measure 10) and pre-tokenize
    the shared data frame once for the whole test class."""
    cn.load_codementor_sentiment_analysis_parameters(d_measure=10)
    cn.data_df = pretokenize_df(cn.data_df, cn.col_names)
def setUpClass(cls):
    """Load experiment parameters and the machine expert once for the
    whole test class."""
    cn.load_codementor_sentiment_analysis_parameters()
    ce.load_machine_expert(cn.col_names, cn.relevant_tags, cn.data_df)