Example #1
def filter_entries(options):
   
    ids_to_include_dict = {} 
    ids_to_exclude_dict = {}
    inclusion_active = False
    exclusion_active = False
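    #Build ID lookup dicts from the inclusion/exclusion files; at least one
    #of the two lists of filter files must be non-empty (asserted below).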
    if (len(options.files_with_lines_to_include) > 0):
        inclusion_active = True
        read_ids_into_dict(options.files_with_lines_to_include, ids_to_include_dict, options.files_with_lines_to_include_cols, options.title_present_filter_files)
    if (len(options.files_with_liens_to_exclude) > 0):
        exclusion_active = True
        read_ids_into_dict(options.files_with_liens_to_exclude, ids_to_exclude_dict, options.files_with_liens_to_exclude_cols, options.title_present_filter_files)
    assert (inclusion_active or exclusion_active)    

    for file_with_lines_to_filter in options.files_with_lines_to_filter:
        file_name_parts = util.get_file_name_parts(file_with_lines_to_filter) 
        output_dir = options.output_dir
        if output_dir is None:
            output_dir = file_name_parts.directory
        output_file_name = output_dir+"/"+file_name_parts.get_transformed_file_path(lambda x: options.output_prefix+x)
        output_file_handle = fp.get_file_handle(output_file_name, 'w')
        
        print "Filtering",file_with_lines_to_filter
        def action(inp_arr,line_number):
            the_id = extract_id(inp_arr, options.files_with_lines_to_filter_cols)
            passes = False
            include = the_id in ids_to_include_dict
            exclude = the_id in ids_to_exclude_dict
            if (exclusion_active==False):
                assert inclusion_active == True
                passes = include
            elif (inclusion_active==False):
                assert exclusion_active == True
                passes = (exclude == False)
            else:
                assert inclusion_active and exclusion_active
                #assumption: exclude_has_precedence is not defined in this
                #snippet; it is presumably meant to be read off the options
                #object like the other flags
                if (options.exclude_has_precedence):
                    #include if on the inclusion list UNLESS it also appears
                    #on the exclusion list
                    passes = False if exclude else include
                else:
                    #exclude if on the exclusion list UNLESS it also appears
                    #on the inclusion list
                    passes = True if include else (exclude == False)
            if passes:
                output_file_handle.write("\t".join(inp_arr)+"\n")
        
        file_handle = fp.get_file_handle(file_with_lines_to_filter)
        if (options.title_present_orig):
            output_file_handle.write(file_handle.readline())
        fp.perform_action_on_each_line_of_file(
            file_handle
            , transformation=fp.default_tab_seppd
            , action=action
            , progress_update=options.progress_update
        )
Example #2
def process_labels_with_labels_action(labels_objects, labels_action,
                                      set_label_names_action):
    print("Reading in labels")
    for labels_object in labels_objects:
        LabelsKeys.check_for_unsupported_keys_and_fill_in_defaults(
            labels_object)
        output_mode = labels_object[LabelsKeys.keys.output_mode_name]

        labels_file_name = labels_object[LabelsKeys.keys.file_name]
        file_with_subset_of_label_names =\
            labels_object[LabelsKeys.keys.file_with_subset_of_label_names]
        content_type = get_content_type_from_name(
            labels_object[LabelsKeys.keys.content_type])
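        #If a file listing a subset of label names was provided, restrict the
        #titled mapping to just those columns; otherwise use all columns.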
        subset_of_columns_to_use_options=\
          (None if file_with_subset_of_label_names is None
            else fp.SubsetOfColumnsToUseOptions(
                    column_names=fp.read_rows_into_arr(
                                    fp.get_file_handle(
                                    file_with_subset_of_label_names))))
        core_titled_mapping_action = fp.get_core_titled_mapping_action(
            subset_of_columns_to_use_options=subset_of_columns_to_use_options,
            content_type=content_type,
            content_start_index=1,
            key_columns=labels_object[LabelsKeys.keys.key_columns])

        def action(inp, line_number):
            if (line_number == 1):
                #If this is the first row, then pick out the list
                #of names relevant in the title
                label_names = core_titled_mapping_action(inp, line_number)
                set_label_names_action(output_mode=output_mode,
                                       label_names=label_names)
            else:
                #otherwise, pick out the labels
                the_id, labels = core_titled_mapping_action(inp, line_number)
                labels_action(output_mode=output_mode,
                              the_id=the_id,
                              labels=labels)

        fp.perform_action_on_each_line_of_file(
            file_handle=fp.get_file_handle(labels_file_name),
            action=action,
            transformation=fp.default_tab_seppd,
            progress_update=labels_object[LabelsKeys.keys.progress_update])
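
#A minimal usage sketch; the callback names and my_labels_config are
#hypothetical, only the keyword arguments mirror the calls made above:
label_names_by_mode = {}
labels_by_mode = {}

def example_set_label_names_action(output_mode, label_names):
    label_names_by_mode[output_mode] = label_names

def example_labels_action(output_mode, the_id, labels):
    labels_by_mode.setdefault(output_mode, {})[the_id] = labels

#process_labels_with_labels_action(
#    labels_objects=my_labels_config,
#    labels_action=example_labels_action,
#    set_label_names_action=example_set_label_names_action)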
Example #3
def read_ids_into_dict(files, the_dict, id_cols, title_present):
    for aFile in files:
        file_handle = fp.get_file_handle(aFile)
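        #Every line contributes one ID; the_dict is effectively used as a set
        #(the stored value 1 is just a placeholder).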
        def action(inp_arr, line_number):
            the_id = extract_id(inp_arr,id_cols)
            the_dict[the_id] = 1
        fp.perform_action_on_each_line_of_file(
            file_handle
            , transformation = fp.default_tab_seppd
            , action = action
            , ignore_input_title = title_present
        )
Example #4
def fasta_iterator(features_opts):
    KeysObj = FeaturesFormatOptions_Fasta
    KeysObj.check_for_unsupported_keys_and_fill_in_defaults(features_opts)
    file_names = features_opts[KeysObj.keys.file_names]
    progress_update = features_opts[KeysObj.keys.progress_update]
    for file_name in file_names:
        print("on file", file_name)
        fasta_iterator = fp.FastaIterator(
            file_handle=fp.get_file_handle(file_name),
            progress_update=progress_update)
        for seq_id, seq in fasta_iterator:
            yield seq_id, av_util.seq_to_2d_image(seq)
Example #5
def columns_iterator(features_opts):
    KeysObj = FeaturesFormatOptions_Columns
    KeysObj.check_for_unsupported_keys_and_fill_in_defaults(features_opts)
    file_names = features_opts[KeysObj.keys.file_names]
    progress_update = features_opts[KeysObj.keys.progress_update]
    for file_name in file_names:
        print("on file", file_name)
        for line_number, line in enumerate(fp.get_file_handle(file_name)):
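            #Skip the title row (line_number 0); every other row is
            #tab-separated as "<id>\t<value1>\t<value2>...".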
            if (line_number > 0):
                inp = fp.default_tab_seppd(line)
                inp_id = inp[0]
                inp_vals = [float(x) for x in inp[1:]]
                yield inp_id, np.array(inp_vals)
Example #6
    def get_generator(self, loop_infinitely):
        #read bed_source into memory
        bed_fh = fp.get_file_handle(self.bed_source)
        data = []
        print("Reading bed file " + self.bed_source + " into memory")

        for a_row in bed_fh:
            a_row = a_row.rstrip().split("\t")
            data.append(
                Interval(chrom=a_row[0],
                         start=int(a_row[1]),
                         stop=int(a_row[2]),
                         labels=[self.labels_dtype(x) for x in a_row[3:]]))
        print("Finished reading bed file into memory; got " + str(len(data)) +
              "rows")
        if (self.num_to_load_for_eval > len(data)):
            print("num_to_load_for_eval is " + str(self.num_to_load_for_eval) +
                  " but length of data is " + str(len(data)) + "; adjusting")
            self.num_to_load_for_eval = len(data)
        random_obj = np.random.RandomState(self.random_seed)
        if (self.randomize_after_pass):
            data = shuffle_array(arr=data, random_obj=random_obj)

        #fasta extraction
        import pyfasta
        f = pyfasta.Fasta(self.fasta_data_source)

        idx = 0
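        #Walk over the intervals one at a time, pull the corresponding
        #sequence out of the fasta, and one-hot encode it before yielding.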
        while (idx < len(data)):

            to_extract = data[idx:idx + 1]
            if (idx % 1000 == 0):
                print(to_extract)
            to_yield = f[
                to_extract[0].chrom][to_extract[0].start:to_extract[0].stop]
            to_yield = np.array([one_hot_encode[x] for x in to_yield])
            yield (to_yield, to_extract[0].labels,
                   (to_extract[0].chrom, to_extract[0].start,
                    to_extract[0].stop))

            idx += 1
            if (idx == len(data)):
                if (loop_infinitely):
                    if (self.randomize_after_pass):
                        data = shuffle_array(arr=data, random_obj=random_obj)
                    idx = 0
                else:
                    #end the generator; on Python 3.7+ raising StopIteration
                    #inside a generator is turned into a RuntimeError (PEP 479)
                    return
Example #7
    def get_generator(self, loop_infinitely):
        #read bed_source into memory
        bed_fh = fp.get_file_handle(self.bed_source)
        data = []
        print("Reading bed file " + self.bed_source + " into memory")
        for a_row in bed_fh:
            a_row = a_row.rstrip().split("\t")
            data.append(
                Interval(chrom=a_row[0],
                         start=int(a_row[1]),
                         stop=int(a_row[2]),
                         labels=[self.labels_dtype(x) for x in a_row[3:]]))
        print("Finished reading bed file into memory; got " + str(len(data)) +
              "rows")
        if (self.num_to_load_for_eval > len(data)):
            print("num_to_load_for_eval is " + str(self.num_to_load_for_eval) +
                  " but length of data is " + str(len(data)) + "; adjusting")
            self.num_to_load_for_eval = len(data)
        random_obj = np.random.RandomState(self.random_seed)
        if (self.randomize_after_pass):
            data = shuffle_array(arr=data, random_obj=random_obj)

        #Set up the genomelake extractors
        import genomelake
        import genomelake.extractors
        extractor = genomelake.extractors.ArrayExtractor(
            datafile=self.genomelake_data_source)
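        #ArrayExtractor pulls numpy arrays for a list of Intervals out of the
        #genomelake-preprocessed data source.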

        idx = 0
        while (idx < len(data)):

            to_extract = data[idx:idx + 1]
            to_yield = extractor(to_extract)[0]
            yield (to_yield, to_extract[0].labels,
                   (to_extract[0].chrom, to_extract[0].start,
                    to_extract[0].stop))

            idx += 1
            if (idx == len(data)):
                if (loop_infinitely):
                    if (self.randomize_after_pass):
                        data = shuffle_array(arr=data, random_obj=random_obj)
                    idx = 0
                else:
                    #end the generator; on Python 3.7+ raising StopIteration
                    #inside a generator is turned into a RuntimeError (PEP 479)
                    return
Example #8
def fasta_iterator(features_opts):
    KeysObj = FeaturesFormatOptions_Fasta
    KeysObj.check_for_unsupported_keys_and_fill_in_defaults(features_opts)
    file_names = features_opts[KeysObj.keys.file_names]
    progress_update = features_opts[KeysObj.keys.progress_update]
    for file_name in file_names:
        print("on file", file_name)
        fasta_iterator = fp.FastaIterator(
            file_handle=fp.get_file_handle(file_name),
            progress_update=progress_update)
        for seq_id, seq in fasta_iterator:
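            #one_hot_format selects between a flat one-hot encoding and a
            #theano-style 2d "image" row encoding of the sequence.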
            one_hot_format = features_opts[KeysObj.keys.one_hot_format]
            if (one_hot_format == OneHotFormats._1d):
                yield seq_id, av_util.seq_to_one_hot(seq)
            elif (one_hot_format == OneHotFormats.theano_one_hot_row):
                yield seq_id, av_util.theano_seq_to_2d_image(seq)
            else:
                raise RuntimeError("Unsupported one_hot_format: " +
                                   one_hot_format +
                                   "; supported formats are: " +
                                   str(OneHotFormats.vals))
Example #9
def get_id_to_split_names(split_object):
    """
        return:
        id_to_split_names
        split_to_ids
    """
    SplitKeys.check_for_unsupported_keys_and_fill_in_defaults(split_object)
    opts = split_object[SplitKeys.keys.opts]
    SplitOptsKeys.check_for_unsupported_keys_and_fill_in_defaults(opts)
    split_name_to_split_files = split_object[
        SplitKeys.keys.split_name_to_split_files]
    id_to_split_names = {}
    split_to_ids = OrderedDict()
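    #Each split file lists the ids in that split; an id can occur in several
    #splits, hence the list of split names per id.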
    for split_name in split_name_to_split_files:
        ids_in_split = fp.read_col_into_arr(
            fp.get_file_handle(split_name_to_split_files[split_name]), **opts)
        split_to_ids[split_name] = ids_in_split
        for the_id in ids_in_split:
            if the_id not in id_to_split_names:
                id_to_split_names[the_id] = []
            id_to_split_names[the_id].append(split_name)
    return id_to_split_names, split_to_ids
Example #10
    def __init__(self,
                 file_with_fasta,
                 fasta_col,
                 randomize_after_pass,
                 random_seed,
                 loop_infinitely,
                 label_columns,
                 labels_dtype,
                 title_present,
                 pre_onehot=False):
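        #Reads the whole file into memory as (sequence, labels) pairs; with
        #pre_onehot the sequence is one-hot encoded up front as a boolean
        #array to save space.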
        #read bed_source into memory
        bed_fh = fp.get_file_handle(file_with_fasta)
        data = []
        print("Reading file " + file_with_fasta + " into memory")
        for (idx, a_row) in enumerate(bed_fh):
            if (hasattr(a_row, 'decode')):
                a_row = a_row.decode("utf-8")
            if (title_present == False or idx > 0):
                a_row = a_row.rstrip().split("\t")
                if (pre_onehot):
                    #the > 0 is to have x be boolean, to save space.
                    x = (np.array(
                        [one_hot_encode[x] for x in a_row[fasta_col]]) > 0)
                else:
                    x = a_row[fasta_col]
                #use a distinct loop variable so the sequence x is not shadowed
                y = [labels_dtype(a_row[col]) for col in label_columns]
                data.append((x, y))
        print("Finished reading file into memory; got " + str(len(data)) +
              " rows")
        random_obj = np.random.RandomState(random_seed)
        if (randomize_after_pass):
            data = shuffle_array(arr=data, random_obj=random_obj)
        self.data = data
        self.randomize_after_pass = randomize_after_pass
        self.random_obj = random_obj
        self.loop_infinitely = loop_infinitely
        self.pre_onehot = pre_onehot
Example #11
    def __call__(self, performance_history, model_wrapper, training_metadata,
                 message, model_creator_info, model_trainer_info,
                 other_data_loaders_info, **kwargs):
        if (training_metadata['total_epochs_trained_for'] > 0):
            #acquire lock on db file
            db_lock = fp.FileLockAsDir(self.db_path)

            #read the contents if file exists, otherwise init as you will
            if (util.file_exists(self.db_path)):
                db_contents = yaml.load(fp.get_file_handle(self.db_path))
            else:
                db_contents = OrderedDict([
                    ('metadata',
                     OrderedDict([('total_records', 0),
                                  ('best_valid_key_metric', None),
                                  ('best_saved_files_config', None)])),
                    ('records', [])
                ])

            #partition into metadata and records
            metadata = db_contents['metadata']
            records = db_contents['records']

            #arrange the fields in the records in the right order
            new_records = []
            for record in records:
                new_record = OrderedDict()
                for key in self.record_keys.get_keys():
                    if key in record:
                        new_record[key] = record[key]
                    else:
                        new_record[key] = None
                #put in any leftover keys that are not in our
                #current set of keys
                for key in record:
                    if key not in new_record:
                        new_record[key] = record[key]
                new_records.append(new_record)
            records = new_records

            new_record_num = metadata['total_records'] + 1
            model_wrapper.prefix_to_last_saved_files(
                prefix="record_" + str(new_record_num),
                new_directory=self.new_save_dir)

            #update the metadata
            metadata['total_records'] = new_record_num
            previous_best_valid_key_metric = metadata['best_valid_key_metric']
            current_best_valid_perf_info =\
                performance_history.get_best_valid_epoch_perf_info()
            current_best_valid_key_metric = current_best_valid_perf_info\
                                            .valid_key_metric
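            #Multiplying by -1 when larger_is_better lets a single ">"
            #comparison decide whether the new best validation metric improves
            #on the previous one.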
            if ((previous_best_valid_key_metric is None)
                    or (((-1 if self.larger_is_better else 1) *
                         previous_best_valid_key_metric) >
                        ((-1 if self.larger_is_better else 1) *
                         current_best_valid_key_metric))):
                metadata[
                    'best_valid_key_metric'] = current_best_valid_key_metric
                metadata['best_saved_files_config'] =\
                    model_wrapper.get_last_saved_files_config()

            #create a new entry for the db
            entry = OrderedDict()
            entry[self.record_keys.k.record_number] = new_record_num
            entry[self.record_keys.k.message] = message
            entry[self.record_keys.k.best_valid_key_metric] =\
                current_best_valid_key_metric
            entry[self.record_keys.k.best_valid_perf_info] =\
                current_best_valid_perf_info.get_jsonable_object()
            entry[self.record_keys.k.key_metric_history] =\
                [('train','valid')]+\
                list(zip(performance_history.get_train_key_metric_history(),
                         performance_history.get_valid_key_metric_history()))
            entry[self.record_keys.k.all_valid_metrics_history] =\
                performance_history.get_all_metrics_valid_history()
            entry[self.record_keys.k.saved_files_config] =\
                model_wrapper.get_last_saved_files_config()
            entry[self.record_keys.k.model_creator_info] = model_creator_info
            entry[self.record_keys.k.other_data_loaders_info] =\
                other_data_loaders_info
            entry[self.record_keys.k.model_trainer_info] = model_trainer_info
            entry[self.record_keys.k.training_metadata] = training_metadata

            #append a new entry to the records
            records.append(entry)
            #sort the records by perf
            records = sorted(records,
                             key=lambda x: ((-1 if self.larger_is_better else 1
                                             ) * x['best_valid_key_metric']))

            #open BackupForWriteFileHandle, write the json, close
            file_handle = fp.BackupForWriteFileHandle(self.db_path)
            file_handle.write(
                util.format_as_json(
                    OrderedDict([('metadata', metadata),
                                 ('records', records)])))
            file_handle.close()

            #release the lock on the db file
            db_lock.release()
Example #12
def get_pyfasta_generator(bed_source,
                          fasta_data_source,
                          append_chrom_number,
                          labels_dtype,
                          randomize_after_pass,
                          stratification_settings,
                          random_seed,
                          loop_infinitely,
                          labels_subset=None):
    #read bed_source into memory
    bed_fh = fp.get_file_handle(bed_source)
    data = []
    print("Reading bed file " + bed_source + " into memory")
    for a_row in bed_fh:
        a_row = (a_row.decode("utf-8")
                 if hasattr(a_row, 'decode') else a_row).rstrip().split("\t")
        data.append(
            Interval(chrom=a_row[0],
                     start=int(a_row[1]),
                     stop=int(a_row[2]),
                     labels=[
                         labels_dtype(x)
                         for x in (a_row[3:] if labels_subset is None else
                                   [a_row[3 + y] for y in labels_subset])
                     ]))
    print("Finished reading bed file into memory; got " + str(len(data)) +
          "rows")
    random_obj = np.random.RandomState(random_seed)

    if (stratification_settings is not None):
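        #Sort the intervals by the chosen label column and cut them into
        #num_splits bins; get_stratified_shuffle (defined elsewhere) then
        #shuffles using those bins on each pass.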
        stratification_type = stratification_settings["type"]
        stratification_column = stratification_settings["column"]
        num_splits = stratification_settings['num_splits']
        bin_sizes = int(np.ceil(len(data) / num_splits))
        if (stratification_type == "continuous"):
            sorted_data = sorted(data,
                                 key=lambda x: x.labels[stratification_column])
            stratifications = [
                sorted_data[i * bin_sizes:min(len(data), (i + 1) * bin_sizes)]
                for i in range(num_splits)
            ]
        else:
            raise RuntimeError("Unrecognized stratification type",
                               stratification_type)

    if (randomize_after_pass):
        if (stratification_settings is not None):
            data = get_stratified_shuffle(stratifications=stratifications,
                                          random_obj=random_obj)
        else:
            data = shuffle_array(arr=data, random_obj=random_obj)

    #fasta extraction
    import pyfasta
    f = pyfasta.Fasta(fasta_data_source)

    idx = 0
    while (idx < len(data)):
        to_extract = data[idx:idx + 1]
        chrom = to_extract[0].chrom
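        #append_chrom_number tacks the bare chromosome number onto the key
        #(e.g. "chr1" -> "chr1 1"), presumably to match fasta records named
        #that way.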
        if (append_chrom_number == True):
            chrom = chrom + " " + chrom[3:]
        to_yield_str = f[chrom][to_extract[0].start:to_extract[0].stop]
        to_yield = np.array([one_hot_encode[x] for x in to_yield_str])
        to_yield_labels = to_extract[0].labels
        yield (to_yield, to_yield_labels,
               (to_extract[0].chrom, to_extract[0].start,
                to_extract[0].stop), to_yield_str)

        idx += 1
        if (idx == len(data)):
            if (loop_infinitely):
                if (randomize_after_pass):
                    if (stratification_settings is not None):
                        data = get_stratified_shuffle(
                            stratifications=stratifications,
                            random_obj=random_obj)
                    else:
                        data = shuffle_array(arr=data, random_obj=random_obj)
                idx = 0
            else:
                return