def extract_fast5(input_file_path, bin_h, mode='DNA'):
    """Extract the signal and label from a single fast5 file.

    The raw signal is cut into segments that end on event boundaries and
    span at most ``FLAGS.length`` samples. Every segment is zero-padded to
    ``FLAGS.length`` and appended, together with its label sequence
    (padded with -1), to the module-level buffers ``event``,
    ``event_length``, ``label`` and ``label_length``. Events following the
    last emitted segment are not written out by this function.

    When more than ``FLAGS.batch`` segments are buffered, the first
    ``FLAGS.batch`` of them are packed with ``format_string`` and written
    to ``bin_h``; any surplus stays in the buffers for a later call.

    Args:
        input_file_path: path of a fast5 file.
        bin_h: handle of the binary file; only its ``write`` method is used.
        mode: The signal type dealt with. Default to 'DNA'. When it is
            'rna' (compared case-insensitively) the signal is reversed
            before segmentation.

    Returns:
        True if one batch was written to ``bin_h`` during this call,
        False otherwise (including when the file could not be read, in
        which case the path is recorded in ``fail_list``).
    """
    try:
        (raw_data, raw_label, raw_start, raw_length) = labelop.get_label_raw(
            input_file_path, FLAGS.basecall_group, FLAGS.basecall_subgroup)
    except Exception:
        # Any read/parse problem marks the file as failed; unlike a bare
        # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        fail_list.append(input_file_path)
        return False
    if len(raw_start) == 0:
        # Nothing to segment; treat as a failed file instead of crashing
        # on raw_start[0] below.
        fail_list.append(input_file_path)
        return False

    if isinstance(mode, str) and mode.lower() == 'rna':
        raw_data = raw_data[::-1]

    if FLAGS.normalization == 'mean':
        # NOTE(review): this branch centres on the median although the
        # option is called 'mean'; kept unchanged so that produced data
        # stay comparable - confirm whether np.mean was intended.
        raw_data = (raw_data - np.median(raw_data)) / float(np.std(raw_data))
    elif FLAGS.normalization == 'median':
        raw_data = (raw_data - np.median(raw_data)) / float(robust.mad(raw_data))

    max_len = FLAGS.length
    pre_start = raw_start[0]
    pre_index = 0
    for index, start in enumerate(raw_start):
        if start - pre_start <= max_len:
            # Still fits into the current segment, keep extending it.
            continue
        if index - 1 == pre_index:
            # If a single segment is longer than the maximum signal
            # length, skip it.
            pre_start = start
            pre_index = index
            continue
        # Close the segment at the previous event boundary.
        seg_end = raw_start[index - 1]
        event.append(np.pad(raw_data[pre_start:seg_end],
                            (0, max_len + pre_start - seg_end),
                            mode='constant'))
        event_length.append(int(seg_end - pre_start))
        label_ind = raw_label['base'][pre_index:(index - 1)]
        temp_label = [DNA_BASE[x.decode('UTF-8')] for x in label_ind]
        label.append(np.pad(temp_label,
                            (0, max_len - index + 1 + pre_index),
                            mode='constant', constant_values=-1))
        label_length.append(index - 1 - pre_index)
        pre_index = index - 1
        pre_start = seg_end
        if start - pre_start > max_len:
            # Skip a single event segment longer than the required
            # signal length.
            pre_index = index
            pre_start = start

    success_list.append(input_file_path)

    # At most one batch is flushed per call; the return value tells the
    # caller that a complete batch has just been written to bin_h.
    if len(event) > FLAGS.batch:
        for index in range(FLAGS.batch):
            record = ([event_length[index]] + event[index].tolist()
                      + [label_length[index]] + label[index].tolist())
            bin_h.write(struct.pack(format_string, *record))
        del event[:FLAGS.batch]
        del event_length[:FLAGS.batch]
        del label[:FLAGS.batch]
        del label_length[:FLAGS.batch]
        return True
    return False
def extract_file(input_file, output_file):
    """Dump the raw signal and its labels of one fast5 file as text files.

    Two files are created: ``output_file + '.signal'`` holding the signal
    values separated by single spaces, and ``output_file + '.label'``
    holding one ``"<start> <end> <base>"`` line per labelled event, where
    ``end`` is ``start`` plus the event length.

    NOTE(review): a second ``extract_file`` taking a single argument is
    defined further down; if both live in the same module the later one
    shadows this definition - confirm which one callers expect.

    Args:
        input_file: path of the fast5 file to read.
        output_file: path prefix of the two text files to write.

    Returns:
        True when both files were written, False when the signal/label
        could not be read from ``input_file``.
    """
    try:
        (raw_data, raw_label, raw_start, raw_length) = labelop.get_label_raw(
            input_file, FLAGS.basecall_group, FLAGS.basecall_subgroup)
    except Exception:
        # Unreadable or unlabelled input is reported through the return
        # value; KeyboardInterrupt/SystemExit are no longer swallowed.
        return False

    # The context manager closes both handles even if a write raises.
    with open(output_file + '.signal', 'w') as f_signal, \
            open(output_file + '.label', 'w') as f_label:
        f_signal.write(" ".join(str(val) for val in raw_data))
        for index, start in enumerate(raw_start):
            base = raw_label['base'][index]
            if isinstance(base, bytes):
                # str() of a bytes value yields "b'A'" on Python 3, which
                # %c rejects; decode to the plain character first.
                base = base.decode('UTF-8')
            f_label.write("%d %d %c\n" % (start, start + raw_length[index], str(base)))
    return True
def extract_file(input_file):
    """Load the raw signal and the per-event labels of one fast5 file.

    Args:
        input_file: path of the fast5 file to read.

    Returns:
        A pair ``(ok, (signal, labels))``. On success ``ok`` is True,
        ``signal`` is the raw signal (reversed when ``FLAGS.mode`` is
        'rna') and ``labels`` is a numpy array of dtype 'S8' with one
        ``[start, end, base]`` row per event, ``end`` being ``start`` plus
        the event length. On failure the error is printed and
        ``(False, (None, None))`` is returned.
    """
    try:
        signal, annotation, starts, lengths = labelop.get_label_raw(
            input_file, FLAGS.basecall_group, FLAGS.basecall_subgroup)
    except Exception as err:
        print(str(err))
        return False, (None, None)

    # NOTE(review): if the 'base' entries are bytes, str() on Python 3
    # gives text like "b'A'" rather than "A" - confirm what consumers of
    # the label array expect.
    rows = [
        [begin, begin + lengths[pos], str(annotation['base'][pos])]
        for pos, begin in enumerate(starts)
    ]

    if FLAGS.mode == 'rna':
        signal = signal[::-1]

    label_table = np.array(rows, dtype='S8')
    return True, (signal, label_table)