Example #1
def output_word_probs(input_folder, fv_name,
                      word_list_path,
                      fout_path=None):
    '''
    output a file with rows as follows:

        word | index | p(y=1|word) | p(y=-1|word)
    '''
    ###
    # we need access to the learners.
    ###
    import os, sys  # needed below; may already be imported at module level
    sys.path.append("../modeling/curious_snake")
    import dataset
    import learners.base_nb_learner as base_nb_learner # naive bayes
    import pickle

    path_to_src = os.path.join(input_folder, fv_name)
    source_dataset = dataset.build_dataset_from_file(path_to_src)
    
    word_list = eval(open(word_list_path).readline()) # first line is assumed to hold a Python list literal of words


    ###
    # construct a naive bayes learner over the source
    # dataset.
    nb_learner_src = base_nb_learner.BaseNBLearner([source_dataset])
    nb_learner_src.label_all_data() # legal, because this is the source data
    nb_learner_src.rebuild_models(True)

    cond_probs = nb_learner_src.models[0].conditional_probs
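    # conditional_probs appears to map each feature index f_j to a dict keyed by
    # class label (1 and -1); the exact probability semantics come from
    # BaseNBLearner (an assumption, inferred from how it is indexed elsewhere).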
    out_d = {}
    for f_j in cond_probs:
        word = word_list[f_j]
        out_d[word] = cond_probs[f_j]

    '''
    out_str = ["word\tsource_index\tp(y=1|x)\tp(y=-1|x)"]
    for f_j in cond_probs:
        if f_j % 100 == 0:
            print "on feature %s" % f_j

        word = source_word_list[f_j]
        if word in target_word_list:
            tgt_index = target_word_list.index(word)
            target_word_list.pop(word)
            p_1, p_0 = cond_probs[f_j][1], cond_probs[f_j][-1]
            out_str.append("%s\t%s\t%s\t%s\t%s" % (word, f_j, tgt_index, p_1, p_0))
    '''
    if fout_path is None:
        fout_path = os.path.join(input_folder, "%s_probs_d" % fv_name)

    fout = open(fout_path, 'w')
    pickle.dump(out_d, fout)
    fout.close()
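
# sample call (a sketch -- the folder and file names follow the pooling_adapt
# example below and may not match your layout; the output is a pickled
# {word: conditional probs} dict at <input_folder>/<fv_name>_probs_d):
#
#     output_word_probs("_transfer/prostate",
#                       "prostate_as_ab_mh_ti_shared_representation",
#                       "_transfer/prostate/shared_word_list.txt")
#
#     import pickle
#     probs_d = pickle.load(
#         open("_transfer/prostate/prostate_as_ab_mh_ti_shared_representation_probs_d"))
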
######################################## 
#
# run hold out experiments on sleep apnea
#
########################################

import os
import dataset
#data_paths = [os.path.join("data", "sleep_apnea", "r0_denish",s) for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]
#feature_sets = [os.path.join("data", "sleep_apnea",s) for s in ["sleep_concepts"]]
test_data_paths =  [os.path.join("data", "sleep_apnea", "r0",s) for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]
test_datasets = [dataset.build_dataset_from_file(f, ignore_unlabeled_instances=True) for f in test_data_paths]
#feature_sets = [os.path.join("data", "sleep_apnea",s) for s in ["sleep_concepts"]]
import curious_snake
# todo: make learner setup function parametric so you can pass it in here
#curious_snake.retro_diversity(feature_sets)

data_paths = [os.path.join("data", "sleep_apnea", "%s" % "r0_denish",s) for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]
curious_snake.run_cv_experiments_with_test_data(data_paths, test_data_paths, 
                                                    os.path.join("output", "retro_cv_no_undersample"), 
                                                    test_datasets=test_datasets,
                                                    num_runs=10, hold_out_p=.10)
'''
for d in ["r0"] + ["r%s_denish" % (x+1) for x in range(23)]:
    print d
    data_paths = [os.path.join("data", "sleep_apnea", "%s" % d,s) for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]
    curious_snake.run_cv_experiments_with_test_data(data_paths, test_data_paths, 
                                                        os.path.join("output", "retro_cv_no_undersample_%s" % d), 
                                                        test_datasets=test_datasets,
                                                        num_runs=10, hold_out_p=.10)
                                                    
Example #3
def pooling_adapt(input_folder, src_fv_name, tgt_fv_name, fout_path,
                  shared_word_list_name="shared_word_list.txt",
                  N=200):
    '''
    note that we assume instances are here represented in
    a *shared* space (comprising the words in shared_word_list_name) 

    sample call

        pooling_adapt("_transfer/prostate", 
                      "prostate_as_ab_mh_ti_shared_representation", 
                      "prostate_ab_mh_ti_shared_representation", 
                      "_transfer/prostate/pooling_adapt", 
                      "shared_word_list.txt")
    '''

    ###
    # we need access to the learners.
    ###
    import os, sys  # needed below; may already be imported at module level
    sys.path.append("../modeling/curious_snake")
    import dataset
    import learners.base_nb_learner as base_nb_learner # naive bayes

    path_to_src = os.path.join(input_folder, src_fv_name)
    source_dataset = dataset.build_dataset_from_file(path_to_src)
    
    path_to_tgt = os.path.join(input_folder, tgt_fv_name)
    target_dataset = dataset.build_dataset_from_file(path_to_tgt)

    ###
    # make sure that the ids are unique
    s1 = set(source_dataset.instances.keys())
    s2 = set(target_dataset.instances.keys())
    if len(s1.intersection(s2)) > 0:
        raise Exception("instance ids are not unique across the source and target datasets")
    
    # the start index for pseudo-instances
    start_index = max(max(source_dataset.instances.keys()), \
                      max(target_dataset.instances.keys())) + 1
                    
    shared_word_list = \
            eval(open(os.path.join(input_folder, shared_word_list_name)).readline())
    ###
    # construct a naive bayes learner over the source
    # dataset.
    nb_learner_src = base_nb_learner.BaseNBLearner([source_dataset])
    nb_learner_src.label_all_data() # legal, because this is the source data
    nb_learner_src.rebuild_models(True)

    ####
    # construct a learner on the *target*
    # we're going to cheat for now on the 
    # features -- this is our ORACLE
    nb_learner_test = base_nb_learner.BaseNBLearner([target_dataset])
    nb_learner_test.label_all_data() # CHEATING
    nb_learner_test.rebuild_models(True)

    best_features = _top_k_features(nb_learner_src.models[0].conditional_probs, 100)
    best_feature_words = [shared_word_list[f[0]] for f in best_features]


    # map each feature to its delta (conditional prob for y=1 minus y=-1) on the target task
    conditional_probs = nb_learner_test.models[0].conditional_probs
    test_features_to_deltas = {}
    for f_j in conditional_probs:
        test_features_to_deltas[f_j] = conditional_probs[f_j][1]-conditional_probs[f_j][-1]
    
    # what are the deltas in the test set for the features that were the 'best'
    # in the source set?
    best_f_deltas_in_test = [test_features_to_deltas[f_star[0]] for f_star in best_features]
    
    # generate our pseudo-instance strings
    psuedo_instance_str = _psuedo_positives(best_features, N, start_index)

    ###
    # assemble a string containing
    #       <the source dataset>
    #       # target dataset starts here
    #       <the target dataset>
    #       # psuedo-instances start here   
    #       <psuedo-instances>
    out_str = open(path_to_src).readlines()
    out_str.append("\n# target dataset starts here! \n")
    out_str.extend(open(path_to_tgt).readlines())
    out_str.append("\n# psuedo instances start here! \n")
    out_str.append("\n".join(psuedo_instance_str))
    fout = open(fout_path, 'w')
    # pdb.set_trace()  # debugging breakpoint, disabled
    fout.write("".join(out_str))
    fout.close()
    print "ok -- file written to %s" % fout_path
Example #4
########################################
#
# run hold out experiments on sleep apnea
#
########################################

import os
import dataset
#data_paths = [os.path.join("data", "sleep_apnea", "r0_denish",s) for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords"]]#, "sleep_concepts"]]
#feature_sets = [os.path.join("data", "sleep_apnea",s) for s in ["sleep_concepts"]]
test_data_paths = [
    os.path.join("data", "sleep_apnea", "r0", s)
    for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords"]
]  #, "sleep_concepts"]]
test_datasets = [
    dataset.build_dataset_from_file(f, ignore_unlabeled_instances=True)
    for f in test_data_paths
]
#feature_sets = [os.path.join("data", "sleep_apnea",s) for s in ["sleep_concepts"]]
import curious_snake
# todo: make learner setup function parametric so you can pass it in here
#curious_snake.retro_diversity(feature_sets)

data_paths = [
    os.path.join("data", "sleep_apnea", "%s" % "r0_denish", s)
    for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords"]
]  #, "sleep_concepts"]]
curious_snake.run_cv_experiments_with_test_data(data_paths,
                                                test_data_paths,
                                                os.path.join(
                                                    "output",
                                                    "retro_cv_no_undersample"),
                                                test_datasets=test_datasets,
                                                num_runs=10,
                                                hold_out_p=.10)
Example #5
    def setUp(self):
        self.path_to_data = os.path.join("..", "data", "data.txt")
        print "reading in data..."
        self.data = dataset.build_dataset_from_file(self.path_to_data)
        print "success"
Example #6
def run_experiments_hold_out(data_paths,
                             outpath,
                             hold_out_p=.25,
                             datasets_for_eval=None,
                             upto=None,
                             step_size=25,
                             initial_size=2,
                             batch_size=5,
                             pick_balanced_initial_set=True,
                             num_runs=10,
                             report_results_after_runs=True):
    '''
    This method demonstrates how to use the active learning framework, and is also a functional routine for comparing learners.
    A number of runs will be performed, the active learning methods will be evaluated at each step, and results will be reported.
    The results for each run are dumped to text files, which can then be combined (e.g., averaged) elsewhere, or you can use the
    results_reporter module to aggregate and plot the output.

    @parameters
    --
    data_paths -- either a list (pointing to multiple feature spaces for the same instances) or a string pointing to a single
                  data file (the typical case), e.g., data_paths = "mydata.txt". curious_snake uses a sparse, weka-like format,
                  documented elsewhere.
    outpath -- a directory under which all of the results will be dumped.
    hold_out_p -- the hold-out percentage, i.e., how much of your data will be used for evaluation. you can ignore this if
                  you're providing your own dataset(s) for evaluation (i.e., datasets_for_eval is not None).
    datasets_for_eval -- use this if you have datasets you want to use for testing -- i.e., to specify your hold-out set
                  independently of the data in data_paths.
    upto -- active learning will stop when upto examples have been labeled. if this is None, upto defaults to the total
                  unlabeled pool available.
    initial_size -- the size of the 'bootstrap' set to use prior to starting active learning (for the initial models).
    batch_size -- the number of examples to be labeled at each iteration of active learning -- optimally, 1.
    step_size -- results will be reported every time another step_size examples have been labeled.
    pick_balanced_initial_set -- if True, the initial training set will be built with an equal number (initial_size/2) of
                  examples from each class.
    num_runs -- this many runs will be performed.
    report_results_after_runs -- if True, the results_reporter module will be used to generate output.
    '''
    for run in range(num_runs):
        print "\n********\non run %s" % run

        print data_paths
        num_labels_so_far = initial_size  # set to initial size for first iteration

        if not os.path.isdir(outpath):
            os.mkdir(outpath)

        # if a string (pointing to a single dataset) is passed in, box it in a list
        data_paths = box_if_string(data_paths)
        datasets = [dataset.build_dataset_from_file(f) for f in data_paths]
        total_num_examples = len(datasets[0].instances)

        test_datasets = []
        if datasets_for_eval is not None:
            # if a test set datafile is specified, use it.
            datasets_for_eval = box_if_string(datasets_for_eval)
            test_datasets = [
                dataset.build_dataset_from_file(f) for f in datasets_for_eval
            ]
            if upto is None:
                upto = total_num_examples
        else:
            # otherwise, we copy the first (even if there are multiple datasets, it won't matter,
            # as we're just using the labels) and pick random examples
            hold_out_size = int(hold_out_p * total_num_examples)

            test_instance_ids = random.sample(datasets[0].instances,
                                              hold_out_size)
            # now remove them from the dataset(s)
            for d in datasets:
                cur_test_dataset = dataset.Dataset(
                    dict(
                        zip(test_instance_ids,
                            d.remove_instances(test_instance_ids))))
                test_datasets.append(cur_test_dataset)

            # if no upper bound was passed in, use the whole pool U
            if upto is None:
                upto = total_num_examples - hold_out_size

        print "using %s out of %s instances for test set" % (
            hold_out_size, total_num_examples)
        print "U has cardinality: %s" % datasets[0].size()

        #
        # Set up the learners, add to list. Here is where you would instantiate new learners.
        #
        learners = [
            random_learner.RandomLearner([d.copy() for d in datasets]),
            simple_learner.SimpleLearner([d.copy() for d in datasets])
        ]  #,
        #pal_learner.PALLearner([d.copy() for d in datasets])]

        #learners = [random_nb_learner.RandomNBLearner([d.copy() for d in datasets]),
        #            uncertainty_nb_learner.UncertaintyNBLearner([d.copy() for d in datasets])]

        output_files = [
            open("%s//%s_%s.txt" % (outpath, learner.name, run), 'w')
            for learner in learners
        ]

        # we arbitrarily pick the initial ids from the first learner; this doesn't matter, as we just use the instance ids
        initial_f = learners[0].get_random_unlabeled_ids
        init_size = num_labels_so_far
        if pick_balanced_initial_set:
            initial_f = learners[0].pick_balanced_initial_training_set
            init_size = int(num_labels_so_far /
                            2.0)  # equal number from both classes

        # Again, you could call *.initial_f on any learner -- it just returns the ids to label initially. these will
        # be the same for all learners.
        init_ids = initial_f(init_size)

        # label instances and build initial models
        for learner in learners:
            learner.label_instances_in_all_datasets(init_ids)
            learner.rebuild_models()

        # report initial results, to console and file.
        report_results(learners, test_datasets, num_labels_so_far,
                       output_files)

        first_iter = True
        while num_labels_so_far <= upto - step_size:
            #
            # the main active learning loop
            #
            cur_step_size = step_size
            cur_batch_size = batch_size
            if first_iter:
                # here we account for the initial labeled dataset size. for example, suppose
                # the step_size is set to 25 (we want to report results every 25 labels),
                # but the initial size was 2; then we want to label 23 on the first iteration
                # so that we report results when 25 total labels have been provided
                cur_step_size = step_size - num_labels_so_far if num_labels_so_far <= step_size \
                                else step_size - (num_labels_so_far - step_size)
                # in general, step_size is assumed to be a multiple of batch_size; for the first iteration,
                # when we're catching up to the step_size (as outlined above), we set the
                # batch_size to 1 to make sure this condition holds.
                cur_batch_size = 1
                first_iter = False

            for learner in learners:
                learner.active_learn(cur_step_size, batch_size=cur_batch_size)

            num_labels_so_far += cur_step_size
            print "\n***labeled %s examples out of %s so far***" % (
                num_labels_so_far, upto)

            report_results(learners, test_datasets, num_labels_so_far,
                           output_files)

        # close files
        for output_file in output_files:
            output_file.close()

    # post-experimental reporting
    if report_results_after_runs:
        results_reporter.post_runs_report(outpath, [l.name for l in learners],
                                          num_runs)
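
# sample call (a sketch -- "mydata.txt" is the placeholder file name from the
# docstring above and the output directory is hypothetical):
#
#     run_experiments_hold_out("mydata.txt",
#                              os.path.join("output", "hold_out_example"),
#                              hold_out_p=.25,
#                              step_size=25,
#                              initial_size=2,
#                              batch_size=5,
#                              num_runs=10)
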
Example #7
    def setUp(self):
        self.path_to_data = "data.txt"
        print "reading in data..."
        self.data = dataset.build_dataset_from_file(self.path_to_data)
        print "success"
########################################
#
# run hold out experiments on sleep apnea
#
########################################

import os
#feature_sets = [os.path.join("data", "sleep_apnea",s) for s in ["sleep_titles", "sleep_abstracts", "sleep_keywords", "sleep_concepts"]]
data_paths = [
    os.path.join("data", "sleep_apnea", s)
    for s in ["titles_post_r7", "abstracts_post_r7", "keywords_post_r7"]
]
import curious_snake
import dataset
# todo: make learner setup function parametric so you can pass it in here
datasets = [dataset.build_dataset_from_file(f) for f in data_paths]
curious_snake.prospective(None,
                          os.path.join("output", "sleepies6"),
                          "predictions_all",
                          datasets=datasets,
                          beta=1)
#curious_snake.retro_diversity(feature_sets)
Example #10
    def setUp(self):
        self.path_to_data = os.path.join("..", "data", "data.txt")
        print "reading in data..."
        self.data = dataset.build_dataset_from_file(self.path_to_data)
        print "success"
Example #11
def run_experiments_hold_out(data_paths, outpath, hold_out_p=.25, datasets_for_eval=None, upto=None, step_size=25,
                             initial_size=2, batch_size=5, pick_balanced_initial_set=True,
                             num_runs=10, report_results_after_runs=True):
    '''
    This method demonstrates how to use the active learning framework, and is also a functional routine for comparing learners.
    A number of runs will be performed, the active learning methods will be evaluated at each step, and results will be reported.
    The results for each run are dumped to text files, which can then be combined (e.g., averaged) elsewhere, or you can use the
    results_reporter module to aggregate and plot the output.

    @parameters
    --
    data_paths -- either a list (pointing to multiple feature spaces for the same instances) or a string pointing to a single
                  data file (the typical case), e.g., data_paths = "mydata.txt". curious_snake uses a sparse, weka-like format,
                  documented elsewhere.
    outpath -- a directory under which all of the results will be dumped.
    hold_out_p -- the hold-out percentage, i.e., how much of your data will be used for evaluation. you can ignore this if
                  you're providing your own dataset(s) for evaluation (i.e., datasets_for_eval is not None).
    datasets_for_eval -- use this if you have datasets you want to use for testing -- i.e., to specify your hold-out set
                  independently of the data in data_paths.
    upto -- active learning will stop when upto examples have been labeled. if this is None, upto defaults to the total
                  unlabeled pool available.
    initial_size -- the size of the 'bootstrap' set to use prior to starting active learning (for the initial models).
    batch_size -- the number of examples to be labeled at each iteration of active learning -- optimally, 1.
    step_size -- results will be reported every time another step_size examples have been labeled.
    pick_balanced_initial_set -- if True, the initial training set will be built with an equal number (initial_size/2) of
                  examples from each class.
    num_runs -- this many runs will be performed.
    report_results_after_runs -- if True, the results_reporter module will be used to generate output.
    '''
    for run in range(num_runs):
        print "\n********\non run %s" % run
 
        print data_paths
        num_labels_so_far = initial_size # set to initial size for first iteration

        if not os.path.isdir(outpath):
            os.mkdir(outpath)
        
        # if a string (pointing to a single dataset) is passed in, box it in a list
        data_paths = box_if_string(data_paths)
        datasets = [dataset.build_dataset_from_file(f) for f in data_paths]
        total_num_examples = len(datasets[0].instances)
        
        test_datasets = []
        if datasets_for_eval is not None:
            # if a test set is specified, use it.
            datasets_for_eval = box_if_string(datasets_for_eval)
            test_datasets = [dataset.build_dataset_from_file(f) for f in datasets_for_eval]
            if upto is None:
                upto = total_num_examples
        else:
            # otherwise, we copy the first (even if there are multiple datasets, it won't matter, as we're just using 
            # the labels) and pick random examples
            hold_out_size = int(hold_out_p * total_num_examples)
            test_instances = random.sample(datasets[0].instances, hold_out_size)
            test_instance_ids = [inst.id for inst in test_instances]
            # now remove them from the dataset(s)
            for d in datasets:
                cur_test_dataset = dataset.dataset(d.remove_instances(test_instance_ids))                    
                test_datasets.append(cur_test_dataset)
            
            # if no upper bound was passed in, use the whole pool U
            if upto is None:
                upto = total_num_examples - hold_out_size
                
        print "using %s out of %s instances for test set" % (hold_out_size, total_num_examples)
        print "U has cardinality: %s" % datasets[0].size()
        
        
        #
        # Here is where learners can be added for comparison
        #
        learners = [random_learner.RandomLearner([d.copy() for d in datasets]), 
                    simple_learner.SimpleLearner([d.copy() for d in datasets]),
                    nb_learner.NBLearner([d.copy() for d in datasets])]
                
        output_files = [open("%s//%s_%s.txt" % (outpath, learner.name, run), 'w') for learner in learners]

        # we arbitrarily pick the initial ids from the first learner; this doesn't matter, as we just use the instance ids
        initial_f = learners[0].get_random_unlabeled_ids 
        init_size = num_labels_so_far
        if pick_balanced_initial_set:
            initial_f = learners[0].pick_balanced_initial_training_set
            init_size = int(num_labels_so_far/2.0) # equal number from both classes
            
        # Again, you could call *.initial_f on any learner -- it just returns the ids to label initially. these will
        # be the same for all learners.
        init_ids = initial_f(init_size)
        
        # label instances and build initial models
        for learner in learners:
            learner.label_instances_in_all_datasets(init_ids)
            learner.rebuild_models()
            
        # report initial results, to console and file.
        report_results(learners, test_datasets, num_labels_so_far, output_files)
              
        first_iter = True
        while num_labels_so_far <= upto - step_size:
            #
            # the main active learning loop
            #
            cur_step_size = step_size
            cur_batch_size = batch_size
            if first_iter:
                # here we account for the initial labeled dataset size. for example, suppose
                # the step_size is set to 25 (we want to report results every 25 labels), 
                # but the initial size was 2; then we want to label 23 on the first iteration
                # so that we report results when 25 total labels have been provided
                cur_step_size = step_size - num_labels_so_far if num_labels_so_far <= step_size \
                                else step_size - (num_labels_so_far - step_size)
                # in general, step_size is assumed to be a multiple of batch_size; for the first iteration, 
                # when we're catching up to the step_size (as outlined above), we set the
                # batch_size to 1 to make sure this condition holds.
                cur_batch_size = 1 
                first_iter = False
            
            for learner in learners:
                learner.active_learn(cur_step_size, num_to_label_at_each_iteration = cur_batch_size)
                            
            num_labels_so_far += cur_step_size
            print "\n***labeled %s examples out of %s so far***" % (num_labels_so_far, upto)
            
            report_results(learners, test_datasets, num_labels_so_far, output_files)

        # close files
        for output_file in output_files:
            output_file.close()
    
    # post-experimental reporting
    if report_results_after_runs:
        results_reporter.post_runs_report(outpath, [l.name for l in learners], num_runs)