def output_word_probs(input_folder, fv_name, word_list_path, fout_path=None):
    '''
    Output a file with rows as follows:

        word | index | p(y=1|word) | p(y=-1|word)
    '''
    ###
    # we need access to the learners.
    ###
    sys.path.append("../modeling/curious_snake")
    import dataset
    import learners.base_nb_learner as base_nb_learner # naive bayes
    import pickle

    path_to_src = os.path.join(input_folder, fv_name)
    source_dataset = dataset.build_dataset_from_file(path_to_src)
    word_list = eval(open(word_list_path).readline())

    ###
    # construct a naive bayes learner over the source dataset.
    nb_learner_src = base_nb_learner.BaseNBLearner([source_dataset])
    nb_learner_src.label_all_data() # legal, because this is the source data
    nb_learner_src.rebuild_models(True)

    # map each word to its class-conditional probabilities
    cond_probs = nb_learner_src.models[0].conditional_probs
    out_d = {}
    for f_j in cond_probs.keys():
        word = word_list[f_j]
        out_d[word] = cond_probs[f_j]

    '''
    out_str = ["word\tsource_index\tp(y=1|x)\tp(y=-1|x)"]
    for f_j in cond_probs:
        if f_j % 100 == 0:
            print "on feature %s" % f_j
        word = source_word_list[f_j]
        if word in target_word_list:
            tgt_index = target_word_list.index(word)
            target_word_list.pop(word)
            p_1, p_0 = cond_probs[f_j][1], cond_probs[f_j][-1]
            out_str.append("%s\t%s\t%s\t%s\t%s" % (word, f_j, tgt_index, p_1, p_0))
    '''

    if fout_path is None:
        fout_path = os.path.join(input_folder, "%s_probs_d" % fv_name)
    fout = open(fout_path, 'w')
    pickle.dump(out_d, fout)
    fout.close()
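# Usage sketch (the folder and file names below are hypothetical, borrowed from the
# pooling_adapt sample call elsewhere in this file): export the per-word conditional
# probabilities, then reload the pickled dict. For a word w, word_probs[w][1] is
# p(y=1|w) and word_probs[w][-1] is p(y=-1|w), assuming the 1/-1 labels used above.
import pickle

output_word_probs("_transfer/prostate", "prostate_ab_mh_ti_shared_representation",
                  "_transfer/prostate/shared_word_list.txt")
word_probs = pickle.load(
    open("_transfer/prostate/prostate_ab_mh_ti_shared_representation_probs_d"))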
########################################
#
#  run hold out experiments on sleep apnea
#
########################################
import os
import dataset

#data_paths = [os.path.join("data", "sleep_apnea", "r0_denish", s) for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]
#feature_sets = [os.path.join("data", "sleep_apnea", s) for s in ["sleep_concepts"]]
test_data_paths = [os.path.join("data", "sleep_apnea", "r0", s) for s in
                        ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]
test_datasets = [dataset.build_dataset_from_file(f, ignore_unlabeled_instances=True)
                        for f in test_data_paths]
#feature_sets = [os.path.join("data", "sleep_apnea", s) for s in ["sleep_concepts"]]

import curious_snake
# todo: make learner setup function parametric so you can pass it in here
#curious_snake.retro_diversity(feature_sets)
data_paths = [os.path.join("data", "sleep_apnea", "r0_denish", s) for s in
                        ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]

curious_snake.run_cv_experiments_with_test_data(data_paths, test_data_paths,
                        os.path.join("output", "retro_cv_no_undersample"),
                        test_datasets=test_datasets, num_runs=10, hold_out_p=.10)

'''
for d in ["r0"] + ["r%s_denish" % (x+1) for x in range(23)]:
    print d
    data_paths = [os.path.join("data", "sleep_apnea", "%s" % d, s) for s in
                        ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]
    curious_snake.run_cv_experiments_with_test_data(data_paths, test_data_paths,
                        os.path.join("output", "retro_cv_no_undersample_%s" % d),
                        test_datasets=test_datasets, num_runs=10, hold_out_p=.10)
'''
def pooling_adapt(input_folder, src_fv_name, tgt_fv_name, fout_path,
                            shared_word_list_name="shared_word_list.txt", N=200):
    '''
    Note that we assume instances are here represented in a *shared* space
    (comprising the words in shared_word_list_name).

    Sample call:

        pooling_adapt("_transfer/prostate",
                      "prostate_as_ab_mh_ti_shared_representation",
                      "prostate_ab_mh_ti_shared_representation",
                      "_transfer/prostate/pooling_adapt",
                      "shared_word_list.txt")
    '''
    ###
    # we need access to the learners.
    ###
    sys.path.append("../modeling/curious_snake")
    import dataset
    import learners.base_nb_learner as base_nb_learner # naive bayes

    path_to_src = os.path.join(input_folder, src_fv_name)
    source_dataset = dataset.build_dataset_from_file(path_to_src)
    path_to_tgt = os.path.join(input_folder, tgt_fv_name)
    target_dataset = dataset.build_dataset_from_file(path_to_tgt)

    ###
    # make sure that the ids are unique
    s1 = set(source_dataset.instances.keys())
    s2 = set(target_dataset.instances.keys())
    if len(s1.intersection(s2)) > 0:
        raise Exception, "source and target instance ids must be unique"

    # the start index for pseudo instances
    start_index = max(max(source_dataset.instances.keys()),
                      max(target_dataset.instances.keys())) + 1

    shared_word_list = \
        eval(open(os.path.join(input_folder, shared_word_list_name)).readline())

    ###
    # construct a naive bayes learner over the source dataset.
    nb_learner_src = base_nb_learner.BaseNBLearner([source_dataset])
    nb_learner_src.label_all_data() # legal, because this is the source data
    nb_learner_src.rebuild_models(True)

    ####
    # construct a learner on the *target*. we're going to cheat for now
    # on the features -- this is our ORACLE
    nb_learner_test = base_nb_learner.BaseNBLearner([target_dataset])
    nb_learner_test.label_all_data() # CHEATING
    nb_learner_test.rebuild_models(True)

    best_features = _top_k_features(nb_learner_src.models[0].conditional_probs, 100)
    best_feature_words = [shared_word_list[f[0]] for f in best_features]

    # map features to their deltas in the target task
    conditional_probs = nb_learner_test.models[0].conditional_probs
    test_features_to_deltas = {}
    for f_j in conditional_probs:
        test_features_to_deltas[f_j] = conditional_probs[f_j][1] - conditional_probs[f_j][-1]

    # what are the deltas in the test set for the features that were
    # the 'best' in the source set?
    best_f_deltas_in_test = [test_features_to_deltas[f_star[0]] for f_star in best_features]

    # generate our pseudo instance string
    psuedo_instance_str = _psuedo_positives(best_features, N, start_index)

    ###
    # assemble a string containing
    #   <the source dataset>
    #   # target dataset starts here
    #   <the target dataset>
    #   # pseudo-instances start here
    #   <pseudo-instances>
    out_str = open(path_to_src).readlines()
    out_str.append("\n# target dataset starts here! \n")
    out_str.extend(open(path_to_tgt).readlines())
    out_str.append("\n# pseudo instances start here! \n")
    out_str.append("\n".join(psuedo_instance_str))

    fout = open(fout_path, 'w')
    fout.write("".join(out_str))
    fout.close()
    print "ok -- file written to %s" % fout_path
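# _top_k_features and _psuedo_positives are helpers referenced above but not shown in
# this file. A plausible sketch of _top_k_features only, under the assumption (consistent
# with the target-learner loop above) that conditional_probs maps a feature index to a
# {1: p(y=1|f), -1: p(y=-1|f)} dict: rank features by the gap between the two
# class-conditional probabilities and keep the k largest, returning (index, delta) pairs
# so that callers can use f[0] as the feature index.
def _top_k_features(conditional_probs, k):
    # (feature index, delta) pairs, largest positive delta first
    deltas = [(f_j, probs[1] - probs[-1]) for f_j, probs in conditional_probs.items()]
    deltas.sort(key=lambda pair: pair[1], reverse=True)
    return deltas[:k]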
def setUp(self):
    self.path_to_data = os.path.join("..", "data", "data.txt")
    print "reading in data..."
    self.data = dataset.build_dataset_from_file(self.path_to_data)
    print "success"
def run_experiments_hold_out(data_paths, outpath, hold_out_p=.25, datasets_for_eval=None,
                             upto=None, step_size=25, initial_size=2, batch_size=5,
                             pick_balanced_initial_set=True, num_runs=10,
                             report_results_after_runs=True):
    '''
    This method demonstrates how to use the active learning framework, and is also a functional
    routine for comparing learners. Basically, a number of runs will be performed, the active
    learning methods will be evaluated at each step, and results will be reported. The results
    for each run will be dumped to a text file, which can then be combined (e.g., averaged)
    elsewhere, or you can use the results_reporter module to aggregate and plot the output.

    @parameters
    --
    data_paths -- this is either a list (pointing to multiple feature spaces for the same
                  instances) or a string pointing to a single data file (this will be the
                  typical case), e.g., data_paths = "mydata.txt". curious_snake uses a
                  sparse-formatted, weka-like format, documented elsewhere.
    outpath -- this is a directory under which all of the results will be dumped.
    hold_out_p -- the hold out percentage, i.e., how much of your data will be used for
                  evaluation. you can ignore this if you're providing your own dataset(s)
                  for evaluation (i.e., datasets_for_eval is not None).
    datasets_for_eval -- use this if you have datasets you want to use for testing -- i.e.,
                  to specify your hold out set independent of the data in data_paths.
    upto -- active learning will stop when upto examples have been labeled. if this is None,
            upto will default to the total unlabeled pool available.
    initial_size -- the size of the 'bootstrap' set to use prior to starting active learning
                    (for the initial models).
    batch_size -- the number of examples to be labeled at each iteration in active learning
                  -- optimally, 1.
    step_size -- results will be reported every time another step_size examples have been labeled.
    pick_balanced_initial_set -- if True, the initial train dataset will be built over an equal
                  number (initial_size/2) of both classes.
    num_runs -- this many runs will be performed.
    report_results_after_runs -- if True, the results_reporter module will be used to generate output.
    '''
    for run in range(num_runs):
        print "\n********\non run %s" % run
        print data_paths
        num_labels_so_far = initial_size # set to initial size for first iteration

        if not os.path.isdir(outpath):
            os.mkdir(outpath)

        # if a string (pointing to a single dataset) is passed in, box it in a list
        data_paths = box_if_string(data_paths)
        datasets = [dataset.build_dataset_from_file(f) for f in data_paths]
        total_num_examples = len(datasets[0].instances)

        test_datasets = []
        if datasets_for_eval is not None:
            # if a test set datafile is specified, use it.
            datasets_for_eval = box_if_string(datasets_for_eval)
            test_datasets = [dataset.build_dataset_from_file(f) for f in datasets_for_eval]
            if upto is None:
                upto = total_num_examples
        else:
            # otherwise, we copy the first (even if there are multiple datasets, it won't
            # matter, as we're just using the labels) and pick random examples
            hold_out_size = int(hold_out_p * total_num_examples)
            test_instance_ids = random.sample(datasets[0].instances, hold_out_size)
            # now remove them from the dataset(s)
            for d in datasets:
                cur_test_dataset = dataset.Dataset(dict(zip(test_instance_ids,
                                                    d.remove_instances(test_instance_ids))))
                test_datasets.append(cur_test_dataset)

            # if no upper bound was passed in, use the whole pool U
            if upto is None:
                upto = total_num_examples - hold_out_size

            print "using %s out of %s instances for test set" % (hold_out_size, total_num_examples)
            print "U has cardinality: %s" % datasets[0].size()

        #
        # Set up the learners, add to list. Here is where you would instantiate new learners.
        #
        learners = [random_learner.RandomLearner([d.copy() for d in datasets]),
                    simple_learner.SimpleLearner([d.copy() for d in datasets])]#,
                    #pal_learner.PALLearner([d.copy() for d in datasets])]
        #learners = [random_nb_learner.RandomNBLearner([d.copy() for d in datasets]),
        #            uncertainty_nb_learner.UncertaintyNBLearner([d.copy() for d in datasets])]

        output_files = [open("%s//%s_%s.txt" % (outpath, learner.name, run), 'w')
                            for learner in learners]

        # we arbitrarily pick the initial ids from the first learner; this doesn't matter,
        # as we just use the instance ids
        initial_f = learners[0].get_random_unlabeled_ids
        init_size = num_labels_so_far
        if pick_balanced_initial_set:
            initial_f = learners[0].pick_balanced_initial_training_set
            init_size = int(num_labels_so_far / 2.0) # equal number from both classes

        # Again, you could call *.initial_f on any learner -- it just returns the ids to
        # label initially. these will be the same for all learners.
        init_ids = initial_f(init_size)

        # label instances and build initial models
        for learner in learners:
            learner.label_instances_in_all_datasets(init_ids)
            learner.rebuild_models()

        # report initial results, to console and file.
        report_results(learners, test_datasets, num_labels_so_far, output_files)

        first_iter = True
        while num_labels_so_far <= upto - step_size:
            #
            # the main active learning loop
            #
            cur_step_size = step_size
            cur_batch_size = batch_size
            if first_iter:
                # here we account for the initial labeled dataset size. for example, suppose
                # the step_size is set to 25 (we want to report results every 25 labels),
                # but the initial size was 2; then we want to label 23 on the first iteration
                # so that we report results when 25 total labels have been provided
                cur_step_size = step_size - num_labels_so_far if num_labels_so_far <= step_size \
                                    else step_size - (num_labels_so_far - step_size)
                # in general, step_size is assumed to be a multiple of batch_size; for the
                # first iteration, when we're catching up to the step_size (as outlined above),
                # we set the batch_size to 1 to make sure this condition holds.
                cur_batch_size = 1
                first_iter = False

            for learner in learners:
                learner.active_learn(cur_step_size, batch_size=cur_batch_size)

            num_labels_so_far += cur_step_size
            print "\n***labeled %s examples out of %s so far***" % (num_labels_so_far, upto)

            report_results(learners, test_datasets, num_labels_so_far, output_files)

        # close files
        for output_file in output_files:
            output_file.close()

    # post-experimental reporting
    if report_results_after_runs:
        results_reporter.post_runs_report(outpath, [l.name for l in learners], num_runs)
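# Minimal usage sketch (the data file name is the placeholder from the docstring above
# and the output directory is hypothetical; assumes the module-level imports used by
# run_experiments_hold_out): ten runs over a single feature space, holding out 25% of
# the instances for evaluation and reporting every 25 labels.
run_experiments_hold_out("mydata.txt", os.path.join("output", "demo_hold_out"),
                         hold_out_p=.25, num_runs=10, step_size=25)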
def setUp(self):
    self.path_to_data = "data.txt"
    print "reading in data..."
    self.data = dataset.build_dataset_from_file(self.path_to_data)
    print "success"
########################################
#
#  make prospective predictions on sleep apnea
#
########################################
import os

#feature_sets = [os.path.join("data", "sleep_apnea", s) for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords", "sleep_concepts"]]
data_paths = [os.path.join("data", "sleep_apnea", s) for s in
                    ["titles_post_r7", "abstracts_post_r7", "keywords_post_r7"]]

import curious_snake
import dataset
# todo: make learner setup function parametric so you can pass it in here
datasets = [dataset.build_dataset_from_file(f) for f in data_paths]
curious_snake.prospective(None, os.path.join("output", "sleepies6"), "predictions_all",
                          datasets=datasets, beta=1)
#curious_snake.retro_diversity(feature_sets)
def run_experiments_hold_out(data_paths, outpath, hold_out_p=.25, datasets_for_eval=None,
                             upto=None, step_size=25, initial_size=2, batch_size=5,
                             pick_balanced_initial_set=True, num_runs=10,
                             report_results_after_runs=True):
    '''
    This method demonstrates how to use the active learning framework, and is also a functional
    routine for comparing learners. Basically, a number of runs will be performed, the active
    learning methods will be evaluated at each step, and results will be reported. The results
    for each run will be dumped to a text file, which can then be combined (e.g., averaged)
    elsewhere, or you can use the results_reporter module to aggregate and plot the output.

    @parameters
    --
    data_paths -- this is either a list (pointing to multiple feature spaces for the same
                  instances) or a string pointing to a single data file (this will be the
                  typical case), e.g., data_paths = "mydata.txt". curious_snake uses a
                  sparse-formatted, weka-like format, documented elsewhere.
    outpath -- this is a directory under which all of the results will be dumped.
    hold_out_p -- the hold out percentage, i.e., how much of your data will be used for
                  evaluation. you can ignore this if you're providing your own dataset(s)
                  for evaluation (i.e., datasets_for_eval is not None).
    datasets_for_eval -- use this if you have datasets you want to use for testing -- i.e.,
                  to specify your hold out set independent of the data in data_paths.
    upto -- active learning will stop when upto examples have been labeled. if this is None,
            upto will default to the total unlabeled pool available.
    initial_size -- the size of the 'bootstrap' set to use prior to starting active learning
                    (for the initial models).
    batch_size -- the number of examples to be labeled at each iteration in active learning
                  -- optimally, 1.
    step_size -- results will be reported every time another step_size examples have been labeled.
    pick_balanced_initial_set -- if True, the initial train dataset will be built over an equal
                  number (initial_size/2) of both classes.
    num_runs -- this many runs will be performed.
    report_results_after_runs -- if True, the results_reporter module will be used to generate output.
    '''
    for run in range(num_runs):
        print "\n********\non run %s" % run
        print data_paths
        num_labels_so_far = initial_size # set to initial size for first iteration

        if not os.path.isdir(outpath):
            os.mkdir(outpath)

        # if a string (pointing to a single dataset) is passed in, box it in a list
        data_paths = box_if_string(data_paths)
        datasets = [dataset.build_dataset_from_file(f) for f in data_paths]
        total_num_examples = len(datasets[0].instances)

        test_datasets = []
        if datasets_for_eval is not None:
            # if a test set is specified, use it.
            datasets_for_eval = box_if_string(datasets_for_eval)
            test_datasets = [dataset.build_dataset_from_file(f) for f in datasets_for_eval]
            if upto is None:
                upto = total_num_examples
        else:
            # otherwise, we copy the first (even if there are multiple datasets, it won't
            # matter, as we're just using the labels) and pick random examples
            hold_out_size = int(hold_out_p * total_num_examples)
            test_instances = random.sample(datasets[0].instances, hold_out_size)
            test_instance_ids = [inst.id for inst in test_instances]
            # now remove them from the dataset(s)
            for d in datasets:
                cur_test_dataset = dataset.dataset(d.remove_instances(test_instance_ids))
                test_datasets.append(cur_test_dataset)

            # if no upper bound was passed in, use the whole pool U
            if upto is None:
                upto = total_num_examples - hold_out_size

            print "using %s out of %s instances for test set" % (hold_out_size, total_num_examples)
            print "U has cardinality: %s" % datasets[0].size()

        #
        # Here is where learners can be added for comparison
        #
        learners = [random_learner.RandomLearner([d.copy() for d in datasets]),
                    simple_learner.SimpleLearner([d.copy() for d in datasets]),
                    nb_learner.NBLearner([d.copy() for d in datasets])]

        output_files = [open("%s//%s_%s.txt" % (outpath, learner.name, run), 'w')
                            for learner in learners]

        # we arbitrarily pick the initial ids from the first learner; this doesn't matter,
        # as we just use the instance ids
        initial_f = learners[0].get_random_unlabeled_ids
        init_size = num_labels_so_far
        if pick_balanced_initial_set:
            initial_f = learners[0].pick_balanced_initial_training_set
            init_size = int(num_labels_so_far / 2.0) # equal number from both classes

        # Again, you could call *.initial_f on any learner -- it just returns the ids to
        # label initially. these will be the same for all learners.
        init_ids = initial_f(init_size)

        # label instances and build initial models
        for learner in learners:
            learner.label_instances_in_all_datasets(init_ids)
            learner.rebuild_models()

        # report initial results, to console and file.
        report_results(learners, test_datasets, num_labels_so_far, output_files)

        first_iter = True
        while num_labels_so_far <= upto - step_size:
            #
            # the main active learning loop
            #
            cur_step_size = step_size
            cur_batch_size = batch_size
            if first_iter:
                # here we account for the initial labeled dataset size. for example, suppose
                # the step_size is set to 25 (we want to report results every 25 labels),
                # but the initial size was 2; then we want to label 23 on the first iteration
                # so that we report results when 25 total labels have been provided
                cur_step_size = step_size - num_labels_so_far if num_labels_so_far <= step_size \
                                    else step_size - (num_labels_so_far - step_size)
                # in general, step_size is assumed to be a multiple of batch_size; for the
                # first iteration, when we're catching up to the step_size (as outlined above),
                # we set the batch_size to 1 to make sure this condition holds.
                cur_batch_size = 1
                first_iter = False

            for learner in learners:
                learner.active_learn(cur_step_size, num_to_label_at_each_iteration=cur_batch_size)

            num_labels_so_far += cur_step_size
            print "\n***labeled %s examples out of %s so far***" % (num_labels_so_far, upto)

            report_results(learners, test_datasets, num_labels_so_far, output_files)

        # close files
        for output_file in output_files:
            output_file.close()

    # post-experimental reporting
    if report_results_after_runs:
        results_reporter.post_runs_report(outpath, [l.name for l in learners], num_runs)