def filter_entries(options):
    ids_to_include_dict = {}
    ids_to_exclude_dict = {}
    inclusion_active = False
    exclusion_active = False
    if len(options.files_with_lines_to_include) > 0:
        inclusion_active = True
        read_ids_into_dict(options.files_with_lines_to_include,
                           ids_to_include_dict,
                           options.files_with_lines_to_include_cols,
                           options.title_present_filter_files)
    if len(options.files_with_lines_to_exclude) > 0:
        exclusion_active = True
        read_ids_into_dict(options.files_with_lines_to_exclude,
                           ids_to_exclude_dict,
                           options.files_with_lines_to_exclude_cols,
                           options.title_present_filter_files)
    assert inclusion_active or exclusion_active

    for file_with_lines_to_filter in options.files_with_lines_to_filter:
        file_name_parts = util.get_file_name_parts(file_with_lines_to_filter)
        output_dir = options.output_dir
        if output_dir is None:
            output_dir = file_name_parts.directory
        output_file_name = output_dir + "/" + file_name_parts\
            .get_transformed_file_path(lambda x: options.output_prefix + x)
        output_file_handle = fp.get_file_handle(output_file_name, 'w')
        print("Filtering", file_with_lines_to_filter)

        def action(inp_arr, line_number):
            the_id = extract_id(inp_arr,
                                options.files_with_lines_to_filter_cols)
            include = the_id in ids_to_include_dict
            exclude = the_id in ids_to_exclude_dict
            if not exclusion_active:
                assert inclusion_active
                passes = include
            elif not inclusion_active:
                assert exclusion_active
                passes = not exclude
            else:
                assert inclusion_active and exclusion_active
                #exclude_has_precedence is assumed to be supplied via options
                if options.exclude_has_precedence:
                    #include if on the inclusion list UNLESS it also
                    #appears on the exclusion list
                    passes = include and not exclude
                else:
                    #exclude if on the exclusion list UNLESS it also
                    #appears on the inclusion list
                    passes = include or not exclude
            if passes:
                output_file_handle.write("\t".join(inp_arr) + "\n")

        file_handle = fp.get_file_handle(file_with_lines_to_filter)
        if options.title_present_orig:
            output_file_handle.write(file_handle.readline())
        fp.perform_action_on_each_line_of_file(
            file_handle,
            transformation=fp.default_tab_seppd,
            action=action,
            progress_update=options.progress_update)

def process_labels_with_labels_action(labels_objects, labels_action,
                                      set_label_names_action):
    print("Reading in labels")
    for labels_object in labels_objects:
        LabelsKeys.check_for_unsupported_keys_and_fill_in_defaults(
            labels_object)
        output_mode = labels_object[LabelsKeys.keys.output_mode_name]
        labels_file_name = labels_object[LabelsKeys.keys.file_name]
        file_with_subset_of_label_names =\
            labels_object[LabelsKeys.keys.file_with_subset_of_label_names]
        content_type = get_content_type_from_name(
            labels_object[LabelsKeys.keys.content_type])
        subset_of_columns_to_use_options =\
            (None if file_with_subset_of_label_names is None
             else fp.SubsetOfColumnsToUseOptions(
                column_names=fp.read_rows_into_arr(
                    fp.get_file_handle(file_with_subset_of_label_names))))
        core_titled_mapping_action = fp.get_core_titled_mapping_action(
            subset_of_columns_to_use_options=subset_of_columns_to_use_options,
            content_type=content_type,
            content_start_index=1,
            key_columns=labels_object[LabelsKeys.keys.key_columns])

        def action(inp, line_number):
            if line_number == 1:
                #if this is the first (title) row, pick out the list of
                #relevant label names from the title
                label_names = core_titled_mapping_action(inp, line_number)
                set_label_names_action(output_mode=output_mode,
                                       label_names=label_names)
            else:
                #otherwise, pick out the id and the labels
                the_id, labels = core_titled_mapping_action(inp, line_number)
                labels_action(output_mode=output_mode,
                              the_id=the_id,
                              labels=labels)

        fp.perform_action_on_each_line_of_file(
            file_handle=fp.get_file_handle(labels_file_name),
            action=action,
            transformation=fp.default_tab_seppd,
            progress_update=labels_object[LabelsKeys.keys.progress_update])

def read_ids_into_dict(files, the_dict, id_cols, title_present):
    for a_file in files:
        file_handle = fp.get_file_handle(a_file)

        def action(inp_arr, line_number):
            the_id = extract_id(inp_arr, id_cols)
            the_dict[the_id] = 1

        fp.perform_action_on_each_line_of_file(
            file_handle,
            transformation=fp.default_tab_seppd,
            action=action,
            ignore_input_title=title_present)

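#A minimal usage sketch for read_ids_into_dict. The file name and the
#id_cols value below are hypothetical; id_cols is assumed to be whatever
#column specification extract_id expects (e.g. a list of column indices).
def _example_read_ids_into_dict():
    ids_seen = {}
    read_ids_into_dict(files=["ids_to_keep.txt"],
                       the_dict=ids_seen,
                       id_cols=[0],
                       title_present=True)
    #ids_seen now maps each id found in the specified column(s) to 1
    return ids_seen
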
def fasta_iterator(features_opts):
    KeysObj = FeaturesFormatOptions_Fasta
    KeysObj.check_for_unsupported_keys_and_fill_in_defaults(features_opts)
    file_names = features_opts[KeysObj.keys.file_names]
    progress_update = features_opts[KeysObj.keys.progress_update]
    for file_name in file_names:
        print("on file", file_name)
        fasta_iter = fp.FastaIterator(
            file_handle=fp.get_file_handle(file_name),
            progress_update=progress_update)
        for seq_id, seq in fasta_iter:
            yield seq_id, av_util.seq_to_2d_image(seq)

def columns_iterator(features_opts):
    KeysObj = FeaturesFormatOptions_Columns
    KeysObj.check_for_unsupported_keys_and_fill_in_defaults(features_opts)
    file_names = features_opts[KeysObj.keys.file_names]
    progress_update = features_opts[KeysObj.keys.progress_update]
    for file_name in file_names:
        print("on file", file_name)
        for line_number, line in enumerate(fp.get_file_handle(file_name)):
            if line_number > 0:
                inp = fp.default_tab_seppd(line)
                inp_id = inp[0]
                inp_vals = [float(x) for x in inp[1:]]
                yield inp_id, np.array(inp_vals)

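#A minimal usage sketch for columns_iterator. The file name is hypothetical,
#and the key strings in features_opts are whatever
#FeaturesFormatOptions_Columns defines (referenced here via its attributes);
#defaults for any other keys are assumed to be filled in by
#check_for_unsupported_keys_and_fill_in_defaults.
def _example_columns_iterator():
    features_opts = {
        FeaturesFormatOptions_Columns.keys.file_names: ["features.tsv"],
        FeaturesFormatOptions_Columns.keys.progress_update: 10000}
    for inp_id, vals in columns_iterator(features_opts):
        print(inp_id, vals.shape)
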
def get_generator(self, loop_infinitely):
    #read bed_source into memory
    bed_fh = fp.get_file_handle(self.bed_source)
    data = []
    print("Reading bed file " + self.bed_source + " into memory")
    for a_row in bed_fh:
        a_row = a_row.rstrip().split("\t")
        data.append(
            Interval(chrom=a_row[0],
                     start=int(a_row[1]),
                     stop=int(a_row[2]),
                     labels=[self.labels_dtype(x) for x in a_row[3:]]))
    print("Finished reading bed file into memory; got "
          + str(len(data)) + " rows")
    if self.num_to_load_for_eval > len(data):
        print("num_to_load_for_eval is " + str(self.num_to_load_for_eval)
              + " but length of data is " + str(len(data)) + "; adjusting")
        self.num_to_load_for_eval = len(data)
    random_obj = np.random.RandomState(self.random_seed)
    if self.randomize_after_pass:
        data = shuffle_array(arr=data, random_obj=random_obj)

    #fasta extraction
    import pyfasta
    f = pyfasta.Fasta(self.fasta_data_source)

    idx = 0
    while idx < len(data):
        to_extract = data[idx:idx + 1]
        if idx % 1000 == 0:
            print(to_extract)
        to_yield = f[to_extract[0].chrom][to_extract[0].start:
                                          to_extract[0].stop]
        to_yield = np.array([one_hot_encode[x] for x in to_yield])
        yield (to_yield,
               to_extract[0].labels,
               (to_extract[0].chrom, to_extract[0].start,
                to_extract[0].stop))
        idx += 1
        if idx == len(data):
            if loop_infinitely:
                if self.randomize_after_pass:
                    data = shuffle_array(arr=data, random_obj=random_obj)
                idx = 0
            else:
                #under PEP 479, raising StopIteration inside a generator
                #becomes a RuntimeError; return to end the iteration instead
                return

def get_generator(self, loop_infinitely):
    #read bed_source into memory
    bed_fh = fp.get_file_handle(self.bed_source)
    data = []
    print("Reading bed file " + self.bed_source + " into memory")
    for a_row in bed_fh:
        a_row = a_row.rstrip().split("\t")
        data.append(
            Interval(chrom=a_row[0],
                     start=int(a_row[1]),
                     stop=int(a_row[2]),
                     labels=[self.labels_dtype(x) for x in a_row[3:]]))
    print("Finished reading bed file into memory; got "
          + str(len(data)) + " rows")
    if self.num_to_load_for_eval > len(data):
        print("num_to_load_for_eval is " + str(self.num_to_load_for_eval)
              + " but length of data is " + str(len(data)) + "; adjusting")
        self.num_to_load_for_eval = len(data)
    random_obj = np.random.RandomState(self.random_seed)
    if self.randomize_after_pass:
        data = shuffle_array(arr=data, random_obj=random_obj)

    #set up the genomelake extractor
    import genomelake
    import genomelake.extractors
    extractor = genomelake.extractors.ArrayExtractor(
        datafile=self.genomelake_data_source)

    idx = 0
    while idx < len(data):
        to_extract = data[idx:idx + 1]
        to_yield = extractor(to_extract)[0]
        yield (to_yield,
               to_extract[0].labels,
               (to_extract[0].chrom, to_extract[0].start,
                to_extract[0].stop))
        idx += 1
        if idx == len(data):
            if loop_infinitely:
                if self.randomize_after_pass:
                    data = shuffle_array(arr=data, random_obj=random_obj)
                idx = 0
            else:
                #as above, return instead of raising StopIteration (PEP 479)
                return

def fasta_iterator(features_opts):
    KeysObj = FeaturesFormatOptions_Fasta
    KeysObj.check_for_unsupported_keys_and_fill_in_defaults(features_opts)
    file_names = features_opts[KeysObj.keys.file_names]
    progress_update = features_opts[KeysObj.keys.progress_update]
    one_hot_format = features_opts[KeysObj.keys.one_hot_format]
    for file_name in file_names:
        print("on file", file_name)
        fasta_iter = fp.FastaIterator(
            file_handle=fp.get_file_handle(file_name),
            progress_update=progress_update)
        for seq_id, seq in fasta_iter:
            if one_hot_format == OneHotFormats._1d:
                yield seq_id, av_util.seq_to_one_hot(seq)
            elif one_hot_format == OneHotFormats.theano_one_hot_row:
                yield seq_id, av_util.theano_seq_to_2d_image(seq)
            else:
                raise RuntimeError("Unsupported one_hot_format: "
                                   + one_hot_format
                                   + "; supported formats are: "
                                   + str(OneHotFormats.vals))

def get_id_to_split_names(split_object):
    """
    Returns:
        id_to_split_names: dict mapping each id to the list of split names
            it appears in
        split_to_ids: OrderedDict mapping each split name to the ids read
            from that split's file
    """
    SplitKeys.check_for_unsupported_keys_and_fill_in_defaults(split_object)
    opts = split_object[SplitKeys.keys.opts]
    SplitOptsKeys.check_for_unsupported_keys_and_fill_in_defaults(opts)
    split_name_to_split_files = split_object[
        SplitKeys.keys.split_name_to_split_files]
    id_to_split_names = {}
    split_to_ids = OrderedDict()
    for split_name in split_name_to_split_files:
        ids_in_split = fp.read_col_into_arr(
            fp.get_file_handle(split_name_to_split_files[split_name]),
            **opts)
        split_to_ids[split_name] = ids_in_split
        for the_id in ids_in_split:
            if the_id not in id_to_split_names:
                id_to_split_names[the_id] = []
            id_to_split_names[the_id].append(split_name)
    return id_to_split_names, split_to_ids

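#A minimal usage sketch for get_id_to_split_names. The split file paths are
#hypothetical; the key strings in split_object are whatever SplitKeys defines
#(referenced here via its attributes), and an empty opts dict is assumed to
#be acceptable to SplitOptsKeys' default-filling.
def _example_get_id_to_split_names():
    split_object = {
        SplitKeys.keys.split_name_to_split_files: {
            "train": "splits/train_ids.txt",
            "valid": "splits/valid_ids.txt"},
        SplitKeys.keys.opts: {}}
    id_to_split_names, split_to_ids = get_id_to_split_names(split_object)
    return id_to_split_names, split_to_ids
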
def __init__(self, file_with_fasta, fasta_col, randomize_after_pass,
             random_seed, loop_infinitely, label_columns, labels_dtype,
             title_present, pre_onehot=False):
    #read file_with_fasta into memory
    fasta_fh = fp.get_file_handle(file_with_fasta)
    data = []
    print("Reading file " + file_with_fasta + " into memory")
    for (idx, a_row) in enumerate(fasta_fh):
        if hasattr(a_row, 'decode'):
            a_row = a_row.decode("utf-8")
        if title_present == False or idx > 0:
            a_row = a_row.rstrip().split("\t")
            if pre_onehot:
                #the > 0 is so that x is boolean, to save space
                x = (np.array(
                    [one_hot_encode[c] for c in a_row[fasta_col]]) > 0)
            else:
                x = a_row[fasta_col]
            y = [labels_dtype(a_row[col]) for col in label_columns]
            data.append((x, y))
    print("Finished reading file into memory; got "
          + str(len(data)) + " rows")
    random_obj = np.random.RandomState(random_seed)
    if randomize_after_pass:
        data = shuffle_array(arr=data, random_obj=random_obj)
    self.data = data
    self.randomize_after_pass = randomize_after_pass
    self.random_obj = random_obj
    self.loop_infinitely = loop_infinitely
    self.pre_onehot = pre_onehot

def __call__(self, performance_history, model_wrapper, training_metadata,
             message, model_creator_info, model_trainer_info,
             other_data_loaders_info, **kwargs):
    if training_metadata['total_epochs_trained_for'] > 0:
        #acquire lock on the db file
        db_lock = fp.FileLockAsDir(self.db_path)

        #read the contents if the file exists, otherwise initialize
        if util.file_exists(self.db_path):
            db_contents = yaml.load(fp.get_file_handle(self.db_path))
        else:
            db_contents = OrderedDict([
                ('metadata', OrderedDict([
                    ('total_records', 0),
                    ('best_valid_key_metric', None),
                    ('best_saved_files_config', None)])),
                ('records', [])])

        #partition into metadata and records
        metadata = db_contents['metadata']
        records = db_contents['records']

        #arrange the fields in the records in the right order
        new_records = []
        for record in records:
            new_record = OrderedDict()
            for key in self.record_keys.get_keys():
                if key in record:
                    new_record[key] = record[key]
                else:
                    new_record[key] = None
            #put in any leftover keys that are not in our
            #current set of keys
            for key in record:
                if key not in new_record:
                    new_record[key] = record[key]
            new_records.append(new_record)
        records = new_records

        new_record_num = metadata['total_records'] + 1
        model_wrapper.prefix_to_last_saved_files(
            prefix="record_" + str(new_record_num),
            new_directory=self.new_save_dir)

        #update the metadata
        metadata['total_records'] = new_record_num
        previous_best_valid_key_metric = metadata['best_valid_key_metric']
        current_best_valid_perf_info =\
            performance_history.get_best_valid_epoch_perf_info()
        current_best_valid_key_metric = current_best_valid_perf_info\
            .valid_key_metric
        if ((previous_best_valid_key_metric is None) or
            (((-1 if self.larger_is_better else 1)
              * previous_best_valid_key_metric) >
             ((-1 if self.larger_is_better else 1)
              * current_best_valid_key_metric))):
            metadata['best_valid_key_metric'] = current_best_valid_key_metric
            metadata['best_saved_files_config'] =\
                model_wrapper.get_last_saved_files_config()

        #create a new entry for the db
        entry = OrderedDict()
        entry[self.record_keys.k.record_number] = new_record_num
        entry[self.record_keys.k.message] = message
        entry[self.record_keys.k.best_valid_key_metric] =\
            current_best_valid_key_metric
        entry[self.record_keys.k.best_valid_perf_info] =\
            current_best_valid_perf_info.get_jsonable_object()
        #list(...) so that the zip is a concrete, serializable list in python 3
        entry[self.record_keys.k.key_metric_history] =\
            [('train', 'valid')] +\
            list(zip(performance_history.get_train_key_metric_history(),
                     performance_history.get_valid_key_metric_history()))
        entry[self.record_keys.k.all_valid_metrics_history] =\
            performance_history.get_all_metrics_valid_history()
        entry[self.record_keys.k.saved_files_config] =\
            model_wrapper.get_last_saved_files_config()
        entry[self.record_keys.k.model_creator_info] = model_creator_info
        entry[self.record_keys.k.other_data_loaders_info] =\
            other_data_loaders_info
        entry[self.record_keys.k.model_trainer_info] = model_trainer_info
        entry[self.record_keys.k.training_metadata] = training_metadata

        #append the new entry to the records
        records.append(entry)
        #sort the records by performance
        records = sorted(records,
                         key=lambda x: ((-1 if self.larger_is_better else 1)
                                        * x['best_valid_key_metric']))

        #open a BackupForWriteFileHandle, write the json, close
        file_handle = fp.BackupForWriteFileHandle(self.db_path)
        file_handle.write(
            util.format_as_json(
                OrderedDict([('metadata', metadata),
                             ('records', records)])))
        file_handle.close()

        #release the lock on the db file
        db_lock.release()

def get_pyfasta_generator(bed_source, fasta_data_source, append_chrom_number,
                          labels_dtype, randomize_after_pass,
                          stratification_settings, random_seed,
                          loop_infinitely, labels_subset=None):
    #read bed_source into memory
    bed_fh = fp.get_file_handle(bed_source)
    data = []
    print("Reading bed file " + bed_source + " into memory")
    for a_row in bed_fh:
        a_row = (a_row.decode("utf-8") if hasattr(a_row, 'decode')
                 else a_row).rstrip().split("\t")
        data.append(
            Interval(chrom=a_row[0],
                     start=int(a_row[1]),
                     stop=int(a_row[2]),
                     labels=[labels_dtype(x) for x in
                             (a_row[3:] if labels_subset is None
                              else [a_row[3 + y] for y in labels_subset])]))
    print("Finished reading bed file into memory; got "
          + str(len(data)) + " rows")
    random_obj = np.random.RandomState(random_seed)
    if stratification_settings is not None:
        stratification_type = stratification_settings["type"]
        stratification_column = stratification_settings["column"]
        num_splits = stratification_settings['num_splits']
        bin_size = int(np.ceil(len(data) / num_splits))
        if stratification_type == "continuous":
            sorted_data = sorted(
                data, key=lambda x: x.labels[stratification_column])
            stratifications = [
                sorted_data[i * bin_size:min(len(data), (i + 1) * bin_size)]
                for i in range(num_splits)]
        else:
            raise RuntimeError("Unrecognized stratification type",
                               stratification_type)
    if randomize_after_pass:
        if stratification_settings is not None:
            data = get_stratified_shuffle(stratifications=stratifications,
                                          random_obj=random_obj)
        else:
            data = shuffle_array(arr=data, random_obj=random_obj)

    #fasta extraction
    import pyfasta
    f = pyfasta.Fasta(fasta_data_source)

    idx = 0
    while idx < len(data):
        to_extract = data[idx:idx + 1]
        chrom = to_extract[0].chrom
        if append_chrom_number == True:
            chrom = chrom + " " + chrom[3:]
        to_yield_str = f[chrom][to_extract[0].start:to_extract[0].stop]
        to_yield = np.array([one_hot_encode[x] for x in to_yield_str])
        to_yield_labels = to_extract[0].labels
        yield (to_yield,
               to_yield_labels,
               (to_extract[0].chrom, to_extract[0].start,
                to_extract[0].stop),
               to_yield_str)
        idx += 1
        if idx == len(data):
            if loop_infinitely:
                if randomize_after_pass:
                    if stratification_settings is not None:
                        data = get_stratified_shuffle(
                            stratifications=stratifications,
                            random_obj=random_obj)
                    else:
                        data = shuffle_array(arr=data, random_obj=random_obj)
                idx = 0
            else:
                return

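#A minimal usage sketch for get_pyfasta_generator. The bed and fasta paths
#below are hypothetical; the remaining arguments simply mirror the signature
#and yield structure of the generator above.
def _example_get_pyfasta_generator():
    gen = get_pyfasta_generator(bed_source="regions.bed.gz",
                                fasta_data_source="genome.fa",
                                append_chrom_number=False,
                                labels_dtype=int,
                                randomize_after_pass=True,
                                stratification_settings=None,
                                random_seed=1234,
                                loop_infinitely=False)
    #each item is (one-hot sequence, labels,
    #              (chrom, start, stop), raw sequence string)
    onehot_seq, labels, coords, seq_str = next(gen)
    return onehot_seq, labels, coords, seq_str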