def iterate_file_read_pairs(filepaths, read_ids, limit=None, verbose=0):
    """Iterate over file and read id pairs.

    Pairs whose file does not exist, or whose read id is not listed by the
    file, are skipped. An exception raised while handling a pair is reported
    on stderr and that pair is skipped.

    Args:
        filepaths (list of str): list of filepaths
        read_ids (list of str): list of read ids
        limit (int, optional): Maximum number of tuples to produce
        verbose (int, optional): Output level of debug verbosity

    Yields:
        tuple(str, str): containing filepath and read id
    """
    produced = 0
    for path, rid in zip(filepaths, read_ids):
        if not os.path.exists(path):
            sys.stderr.write(
                'File {} does not exist, skipping\n'.format(path))
            continue
        try:
            with get_fast5_file(path, 'r') as handle:
                is_present = rid in handle.get_read_ids()
            if not is_present:
                continue
            if verbose > 0:
                print("Reading", rid, "from", path)
            yield path, rid
            produced += 1
            if limit is not None and produced >= limit:
                return  # ends iterator
        except Exception as err:
            message = (
                "Warning: An exception occured in fast5utils (skipped "
                "this read):\n{}\n")
            sys.stderr.write(message.format(str(err)))
    return
def get_raw_data(filename):
    """
    Yield a `Read` for every read stored in the fast5 file `filename`.
    """
    with get_fast5_file(filename, 'r') as fast5:
        for record in fast5.get_reads():
            yield Read(record, filename)
def check_file_type(myfile):
    """Convert the fast5 file at ``os.path.join(root, name)`` to single-read
    fast5 files when it is a multi-read file.

    The file is opened only to classify it and is closed again before the
    conversion starts, so no handle is left open by this function.
    """
    # NOTE(review): `myfile` is never used; the path is built from the names
    # `root` and `name`, and the output goes to `directory`, all of which must
    # be defined outside this function -- confirm this is intended.
    fast5_path = os.path.join(root, name)
    with fast5_interface.get_fast5_file(fast5_path) as fobj:
        is_multi_read = fast5_interface.check_file_type(fobj) == "multi-read"
    if is_multi_read:
        # convert file to single fast5
        print("converting fast5 file****")
        multi_to_single_fast5.convert_multi_to_single(
            fast5_path, directory, "single")
def call_file(filename):
    """Basecall every read in `filename` and collect per-read metadata.

    Returns a list of tuples ``(read_id, run_id, read_number,
    channel_number, start_time, basecall, qual)``, or an empty list when an
    ``OSError`` is raised while the file is being processed.
    """
    results = []
    try:
        with get_fast5_file(filename, mode="r") as f5:
            # single-read/multi-read
            is_multi = check_file_type(f5) == 'multi-read'
            for read in f5.get_reads():
                read_id = read.read_id
                run_id = read.run_id.decode('utf-8')
                # The read number and start time live in different places
                # depending on the file layout.
                if is_multi:
                    read_number = read.handle['Raw'].attrs['read_number']
                    start_time = read.handle['Raw'].attrs['start_time']
                else:
                    read_number = read.status.read_info[0].read_number
                    start_time = read.status.read_info[0].start_time

                channel_key = read.global_key + 'channel_id'
                channel_number = read.handle[channel_key].attrs[
                    'channel_number'].decode('utf-8')
                sampling_rate = read.handle[channel_key].attrs['sampling_rate']
                tracking_key = read.global_key + 'tracking_id'
                exp_start_time = read.handle[tracking_key].attrs[
                    'exp_start_time'].decode('utf-8')

                # start_time is divided by the sampling rate before being
                # added to the experiment start time.
                start_time = add_time_seconds(
                    exp_start_time, start_time / sampling_rate)

                signal = rescale_signal(read.get_raw_data())
                basecall, qual = caller.call_raw_signal(signal)
                results.append((read_id, run_id, read_number, channel_number,
                                start_time, basecall, qual))
    except OSError:
        return []
    return results
def get_raw_data_for_read(info):
    """
    Load a single read, given as a ``(filename, read_id)`` pair, and wrap it
    in a `Read`.
    """
    fast5_path, wanted_id = info
    with get_fast5_file(fast5_path, 'r') as fast5:
        record = fast5.get_read(wanted_id)
        return Read(record, fast5_path)
def get_remapping(sig_fn, dacs, scale_params, ref_seq, stride, read_id,
                  r_to_q_poss, rl_cumsum, r_ref_pos, ref_out_info):
    """Build a taiyaki remapping dictionary for one read.

    Channel info and read attributes are read from the fast5 file `sig_fn`,
    a taiyaki ``Signal`` is assembled around `dacs`, and a block-to-reference
    path is filled from `r_to_q_poss` and `rl_cumsum`.

    Args:
        sig_fn: fast5 file containing `read_id`.
        dacs: signal array handed to ``tai_signal.Signal``; its first
            dimension divided by `stride` sizes the path.
        scale_params: indexable; item 0 feeds the shift and item 1 the scale.
        ref_seq: reference sequence passed to the taiyaki mapping.
        stride: divisor of the signal length used to size the path.
        read_id: id of the read to load.
        r_to_q_poss: query position for each reference position; the last
            entry is skipped.
        rl_cumsum: indexable mapping a query position (offset by
            ``r_ref_pos.q_trim_start``) to a path index.
        r_ref_pos: object providing ``q_trim_start``.
        ref_out_info: object providing ``alphabet``.

    Returns:
        tuple: the read dictionary from the remapping and
        ``prepare_mapping_funcs.RemapResult.SUCCESS``.

    Raises:
        mh.MegaError: if the integer reference cannot be added.
    """
    # Read everything needed from the file inside a context manager so the
    # handle is released on every call. dict(...) consumes the items while
    # the file is still open.
    # NOTE(review): assumes the copied values stay usable after the file is
    # closed -- confirm against fast5utils.
    with fast5_interface.get_fast5_file(sig_fn, 'r') as fast5_fp:
        read = fast5_fp.get_read(read_id)
        channel_info = dict(fast5utils.get_channel_info(read).items())
        read_attrs = dict(fast5utils.get_read_attributes(read).items())

    rd_factor = channel_info['range'] / channel_info['digitisation']
    shift_from_pA = (scale_params[0] + channel_info['offset']) * rd_factor
    scale_from_pA = scale_params[1] * rd_factor

    # prepare taiyaki signal object
    sig = tai_signal.Signal(dacs=dacs)
    sig.channel_info = channel_info
    sig.read_attributes = read_attrs
    sig.offset = channel_info['offset']
    sig.range = channel_info['range']
    sig.digitisation = channel_info['digitisation']

    path = np.full((dacs.shape[0] // stride) + 1, -1)
    # skip last value since this is where the two seqs end
    for ref_pos, q_pos in enumerate(r_to_q_poss[:-1]):
        path_idx = rl_cumsum[q_pos + r_ref_pos.q_trim_start]
        # if the query position maps to the end of the mapping skip it
        if path_idx >= path.shape[0]:
            continue
        path[path_idx] = ref_pos
    remapping = tai_mapping.Mapping.from_remapping_path(
        sig, path, ref_seq, stride)
    try:
        remapping.add_integer_reference(ref_out_info.alphabet)
    except Exception:
        raise mh.MegaError('Invalid reference sequence encountered')

    return (remapping.get_read_dictionary(
        shift_from_pA, scale_from_pA, read_id),
        prepare_mapping_funcs.RemapResult.SUCCESS)
def hdf_to_sam_worker(fname):
    """Extract basecall and modified-base data from a `.fast5` file.

    :param fname: `.fast5` file containing read data.

    :returns: list with one ``(Read, (mA, mC))`` tuple per read, where `mA`
        and `mC` are the formatted ``MA:B:C`` / ``MC:B:C`` tag strings.
    """
    logger = medaka.common.get_named_logger('ModExtract')
    logger.info("Processing {}.".format(fname))

    results = list()
    with get_fast5_file(fname, mode="r") as f5:
        read_ids = list(f5.get_read_ids())
        logger.debug("Found {} reads for {}.".format(len(read_ids), fname))
        for read_id in read_ids:
            f5_read = f5.get_read(read_id)
            latest = f5_read.get_latest_analysis(BASECALLANALYSIS)

            # get modified base data
            mod_table = f5_read.get_analysis_dataset(latest, MODBASEPATH)
            mod_table = mod_table.view(dtype=MODTYPE)
            mA = 'MA:B:C,{}'.format(format_uint8_list(mod_table['6mA']))
            mC = 'MC:B:C,{}'.format(format_uint8_list(mod_table['5mC']))

            # get basecalling data; only sequence and qualities are used
            fastq = f5_read.get_analysis_dataset(latest, FASTQPATH)
            _, sequence, _, qstring = fastq.splitlines()

            # put everything together
            results.append((Read(read_id, sequence, qstring), (mA, mC)))
    return results
def load_file(self, filepath):
    """Return ``(raw_data, read_id)`` for the first read of `filepath`.

    Asserts beforehand that the file lists exactly one read id.
    """
    with get_fast5_file(filepath, mode="r") as fast5:
        n_reads = len(fast5.get_read_ids())
        assert n_reads == 1, f'File {filepath} contains multiple reads.'
        for record in fast5.get_reads():
            return record.get_raw_data(), record.read_id
def fast5s_to_fastq(dir_):
    """Write the template basecalls of all ``.fast5`` files in `dir_` to a
    single FASTQ file inside `dir_`.

    The output file is named after the directory (``<dir_>/<dirname>.fastq``).

    Args:
        dir_ (str): directory containing ``.fast5`` files; a trailing path
            separator is allowed.

    Returns:
        str: status message ``'<fastq path> done: Done in <seconds>'``.
    """
    print(dir_)
    start = time.time()

    # normpath drops a trailing separator; without it basename() would be
    # empty and the output would be written to a hidden file named ".fastq".
    dir_name = os.path.basename(os.path.normpath(dir_))
    fastq_fn = os.path.join(dir_, dir_name + '.fastq')
    fast5s = [os.path.join(dir_, x)
              for x in os.listdir(dir_) if x.endswith('.fast5')]

    # One (name, sequence, quality) record per fast5 file.
    records = []
    for fast5_fn in fast5s:
        with get_fast5_file(fast5_fn, mode='r') as f5:
            with Basecall1DTools(f5) as basecall:
                name, seq, qual = basecall.get_called_sequence('template')
                records.append((name, seq, qual))

    with open(fastq_fn, mode='w') as fastq_fh:
        for name, seq, qual in records:
            print('@%s' % name, file=fastq_fh)
            print(seq, file=fastq_fh)
            print('+', file=fastq_fh)
            print(qual, file=fastq_fh)

    stop = time.time()
    string = '%s done' % fastq_fn + ': Done in {:.2f}'.format(stop - start)
    print(string)
    return string
def index_fast5(self,
                files: [Path] = None,
                tags: list = None,
                store_signal: bool = False):
    """ Main access method to index Fast5 files into MongoDB

    For every file, one `Read` document is built per read in the file and
    the documents of that file are inserted in a single call.

    :param files: paths of the Fast5 files to index; ``None`` (the default)
        is treated as "no files".
    :param tags: tags stored on every inserted read; ``None`` is passed
        through to `Read` unchanged and logged as an empty tag list.
    :param store_signal: when true, the unscaled signal of each read is
        stored; otherwise an empty list is stored.
    """
    for file in files or []:
        self.logger.info(f"Index signal data [ {self.db_name} ]: {file}")
        fast5_path = str(file.absolute())
        reads = []
        with get_fast5_file(str(file), mode="r") as f5:
            for f5_read in f5.get_reads():
                if store_signal:
                    signal_data = f5_read.get_signal(
                        start=None, end=None, scale=False)
                else:
                    signal_data = list()
                reads.append(
                    Read(fast5=fast5_path,
                         uuid=str(uuid.uuid4()),
                         tags=tags,
                         read_id=f5_read.read_id,
                         signal_data=signal_data))

        Read.objects.insert(reads)
        # Guard the join: with the default tags=None the log call would
        # otherwise raise after the reads had already been inserted.
        tag_text = ", ".join(tags) if tags else ""
        self.logger.info(
            f'Inserted {len(reads)} signal reads with tags [ {tag_text} ]'
        )
def yield_fast5_reads(input_path, recursive, follow_symlinks=True, read_ids=None):
    """
    Iterate over reads in fast5 files and yield read_ids and fast5 read objects.
    If read_ids is non-empty, reads which are not in it are skipped.
    An empty set/list (or None) returns all reads.

    Each fast5 file is opened here and is not explicitly closed by this
    function.

    :param input_path: forwarded to yield_fast5_files
    :param recursive: bool, forwarded to yield_fast5_files
    :param follow_symlinks: bool, forwarded to yield_fast5_files
    :param read_ids: set or list or None
    :raise TypeError: read_ids must be of type set or list or None
    :return: yielded tuple (read_id, fast5_read_object)
    """
    if read_ids is not None and not isinstance(read_ids, (list, set)):
        raise TypeError('read_ids must be of type set or list or none')

    # A non-empty list is converted once so intersection() is available.
    wanted = read_ids
    if wanted and isinstance(wanted, list):
        wanted = set(wanted)

    fast5_paths = yield_fast5_files(input_path=input_path,
                                    recursive=recursive,
                                    follow_symlinks=follow_symlinks)
    for fast5_path in fast5_paths:
        fast5_file = get_fast5_file(fast5_path)
        available = fast5_file.get_read_ids()
        selected_reads = wanted.intersection(available) if wanted else available
        for read_id in selected_reads:
            yield read_id, fast5_file.get_read(read_id)
def test_correct_type(self):
    """get_fast5_file returns a Fast5File for a single-read file and a
    MultiFast5File for a multi-read file, and check_file_type agrees."""
    # Single-read file: exactly one read id, equal to the one Fast5File reports.
    single_path = os.path.join(test_data, "single_reads", "read0.fast5")
    expected_id = Fast5File(single_path).get_read_id()
    with get_fast5_file(single_path) as single_f5:
        self.assertEqual(type(single_f5), Fast5File)
        self.assertEqual(check_file_type(single_f5), SINGLE_READ)
        self.assertEqual(len(single_f5.get_read_ids()), 1)
        self.assertEqual(expected_id, single_f5.get_read_ids()[0])
        self.get_raw(single_f5)

    # Multi-read file: at least one read id.
    multi_path = os.path.join(test_data, "multi_read", "batch_0.fast5")
    with get_fast5_file(multi_path) as multi_f5:
        self.assertEqual(type(multi_f5), MultiFast5File)
        self.assertEqual(check_file_type(multi_f5), MULTI_READ)
        self.assertTrue(len(multi_f5.get_read_ids()) >= 1)
        self.get_raw(multi_f5)
def main():
    """Command-line entry point: write the FASTQ of every read in the given
    Fast5 file(s) to the output stream.

    For each read, the FASTQ is taken from the ``BaseCalled_template/Fastq``
    dataset of the latest ``Basecall_1D`` analysis.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description=desc, epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--version', action='version', version='0.10a')
    parser.add_argument("-v", "--verbose", default=False,
                        action="store_true", help="verbose")
    parser.add_argument("-i", "--fast5", nargs="+",
                        help="input Fast5 file(s)")
    parser.add_argument("-o", "--out", default=sys.stdout,
                        type=argparse.FileType("w"),
                        help="output stream [stdout]")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    # o.fast5 is None when -i is not given; treat that as "no input files"
    # instead of failing on iteration over None.
    for fn in o.fast5 or ():
        # Close each input file once its reads have been written.
        with get_fast5_file(fn, mode="r") as f5file:
            for read_id in f5file.get_read_ids():
                read = f5file.get_read(read_id)
                bcgrp = read.get_latest_analysis("Basecall_1D")  # Basecall_1D_000
                fastq = read.get_analysis_dataset(
                    bcgrp, "BaseCalled_template/Fastq")
                o.out.write(fastq)
def one_read_shift_scale(read_tuple):
    """Compute the shift and scale of one read's signal with `med_mad`.

    Args:
        read_tuple (tuple): ``(read_filename, read_id)`` pair.

    Returns:
        tuple: ``(read_id, shift, scale)``. Shift and scale are NaN when the
        signal is empty. ``(None, None, None)`` is returned when the signal
        cannot be obtained; the error is reported on stderr.
    """
    read_filename, read_id = read_tuple

    try:
        with fast5_interface.get_fast5_file(read_filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = Signal(read)

    except Exception as e:
        sys.stderr.write(
            'Unable to obtain signal for {} from {}.\n{}\n'.format(
                read_id, read_filename, repr(e)))
        return (None, None, None)

    else:
        signal = sig.current

        if len(signal) > 0:
            shift, scale = med_mad(signal)
        else:
            # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
            shift, scale = np.nan, np.nan
            # note - if signal trimmed by ub, it could be of length zero by
            # this point for short reads
            # These are taken out later in the existing code, in the new
            # code we'll take out ub trimming

        return (read_id, shift, scale)
def print_all_raw_data(fast5_filepath='test/mapped_reads.hdf5'):
    """Print the read id and raw data of every read in a fast5 file.

    Args:
        fast5_filepath (str, optional): file to read; this can be a single-
            or multi-read file. Defaults to the path that was previously
            hard-coded, so calls without arguments behave as before.
    """
    with get_fast5_file(fast5_filepath, mode='r') as f5:
        for read_id in f5.get_read_ids():
            read = f5.get_read(read_id)
            raw_data = read.get_raw_data()
            print(read_id, raw_data)
def resquiggle_reads(multifast5_fn, aligner, ref, seq_samp_type, std_ref,
                     rsqgl_params, outlier_thresh=OUTLIER_THRESH,
                     max_scaling_iters=3, max_per_ref=0,
                     valid_bases=set(list('ACGT'))):
    """Resquiggle the reads of a multi-fast5 file, one alignment at a time.

    Yields one ``(result, message)`` tuple per processed alignment:
    ``(rsqgl_res, "")`` on success, with the alignment and the fast5 read
    attached as ``rsqgl_res.a`` and ``rsqgl_res.read``, or ``(None, reason)``
    when the alignment is skipped or resquiggling raises. Alignments beyond
    `max_per_ref` successes for their reference are dropped without a yield.

    The fasta index and the fast5 file are opened here and are not
    explicitly closed by this function.
    """
    # number of successfully resquiggled reads per reference
    ref2c = {}
    # process reads from multi fast5
    faidx = pysam.FastaFile(ref)
    ref2len = dict(zip(faidx.references, faidx.lengths))
    f5file = get_fast5_file(multifast5_fn, mode="r")
    for aln in aligner:
        # process only given number of reads per reference
        if max_per_ref:
            contig = aln.reference_name
            if contig not in ref2c:
                ref2c[contig] = 0
            elif ref2c[contig] >= max_per_ref:
                continue
        # skip reads without alignment or secondary/qcfails
        if aln.is_unmapped or aln.is_secondary or aln.is_qcfail:
            if aln.is_unmapped:
                reason = "No alignment"
            else:
                reason = "Secondary alignment"
            yield None, reason
            continue
        # get alignment data
        map_results = map_read(aln, faidx, seq_samp_type, std_ref, ref2len)
        # make sure only ACGT chars in reference!
        unexpected = set(map_results.genome_seq).difference(valid_bases)
        if unexpected:
            # instead maybe just replace by random char?
            yield None, "Non-ACGT sequence"
            continue
        # extract data from FAST5
        read = f5file.get_read(aln.qname)
        all_raw_signal = read.get_raw_data(scale=False)
        map_results = map_results._replace(raw_signal=all_raw_signal)
        try:
            # this causes sometimes TomboError: Read event to sequence
            # alignment extends beyond bandwidth
            map_results = adjust_map_res(
                map_results, seq_samp_type, rsqgl_params)
            rsqgl_res = resquiggle.resquiggle_read(
                map_results, std_ref, rsqgl_params, outlier_thresh)
            # repeat while the normalisation parameters keep changing,
            # up to max_scaling_iters rounds in total
            n_iters = 1
            while (n_iters < max_scaling_iters
                   and rsqgl_res.norm_params_changed):
                rescaled = map_results._replace(
                    scale_values=rsqgl_res.scale_values)
                rsqgl_res = resquiggle.resquiggle_read(
                    rescaled, std_ref, rsqgl_params, outlier_thresh)
                n_iters += 1
        except Exception as inst:
            yield None, str(inst)
            continue
        rsqgl_res = adjust_rsqgl_res(rsqgl_res, all_raw_signal, seq_samp_type)
        # add alignment and read as those are needed later
        rsqgl_res.a = aln
        rsqgl_res.read = read
        # update ref counter
        if ref2c:
            ref2c[contig] += 1
        yield rsqgl_res, ""
def reads_from_bins(bins, bins_dir):
    """Collect the read ids stored in ``<bins_dir>/<bin>.fast5`` for every
    bin name in `bins`; a single string is treated as one bin name."""
    bin_names = [bins] if isinstance(bins, str) else bins
    collected = []
    for bin_name in bin_names:
        fast5_path = os.path.join(bins_dir, bin_name + '.fast5')
        with get_fast5_file(fast5_path, mode='r') as f5:
            collected.extend(f5.get_read_ids())
    return collected
def open(self, force_open=False):
    """Open ``self.file`` for reading and store the handle in ``self.io``.

    Nothing happens when the object is already marked open, unless
    `force_open` is true. Before opening, ``self.close()`` is attempted and
    ordinary errors from it are ignored.
    """
    if force_open or (not self._is_open):
        try:
            self.close()
        except Exception:
            # Best-effort close; unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
        self.io = get_fast5_file(self.file, mode="r")
        self._is_open = True
def get_read_ids(filename):
    """
    Map the id of every read in the fast5 file `filename` to the basename
    of that file.
    """
    mapping = {}
    with get_fast5_file(filename, 'r') as f5:
        for read in f5.get_reads():
            mapping[read.read_id] = basename(filename)
    return mapping
def get_raw_data(filename, read_ids=None, skip=False):
    """
    Yield a `Read` for each selected read of the fast5 file `filename`.

    With `read_ids` left as None every read is selected. Otherwise a read is
    selected when its membership in `read_ids` differs from `skip`: the
    listed reads when `skip` is false, all other reads when it is true.
    """
    with get_fast5_file(filename, 'r') as f5_fh:
        for rid in f5_fh.get_read_ids():
            if read_ids is not None and not ((rid in read_ids) ^ skip):
                continue
            yield Read(f5_fh.get_read(rid), filename)
def get_raw_data(infile, fileNM, data_test, data_name, cutoff, window=3000):
    """Append a fixed-length slice of each sufficiently long read to
    `data_test`, and the matching read id to `data_name`.

    For every read in ``os.path.join(infile, fileNM)`` whose scaled raw data
    has at least ``cutoff + window`` samples, the slice
    ``raw_data[cutoff:cutoff + window]`` is kept. Shorter reads are skipped.

    Args:
        infile: directory containing the fast5 file.
        fileNM: name of the fast5 file inside `infile`.
        data_test (list): extended in place with the signal slices.
        data_name (list): extended in place with the read ids.
        cutoff (int): number of leading samples to skip.
        window (int, optional): number of samples to keep per read. Defaults
            to 3000, the value that was previously hard-coded.

    Returns:
        tuple: the same `data_test` and `data_name` objects.
    """
    fast5_filepath = os.path.join(infile, fileNM)
    end = cutoff + window
    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for read in f5.get_reads():
            raw_data = read.get_raw_data(scale=True)
            if len(raw_data) >= end:
                data_test.append(raw_data[cutoff:end])
                data_name.append(read.read_id)
    return data_test, data_name
def get_fast5_files(self, root_path):
    """Open ``<root_path>/<entry>/batch0.fast5`` for every entry listed in
    `root_path` and return a list of ``(opened_file, entry)`` tuples.

    The files are returned open; nothing here closes them.
    """
    batch_name = "batch0.fast5"
    return [
        (get_fast5_file("{}/{}/{}".format(root_path, entry, batch_name),
                        mode="r"),
         entry)
        for entry in os.listdir(root_path)
    ]
def get_read_ids(filename, read_ids=None, skip=False):
    """
    Return ``(filename, read_id)`` tuples for the reads in `filename`.

    With `read_ids` left as None all reads are returned. Otherwise a read is
    kept when its membership in `read_ids` differs from `skip`.
    """
    with get_fast5_file(filename, 'r') as f5_fh:
        pairs = [(filename, rid) for rid in f5_fh.get_read_ids()]
    if read_ids is None:
        return pairs
    return [pair for pair in pairs if (pair[1] in read_ids) ^ skip]
def call_file(filename):
    """Basecall every read in `filename`.

    Returns a list of ``(read_id, call)`` tuples, where `call` is what
    ``caller.call_raw_signal`` returns for the rescaled raw signal.
    """
    with get_fast5_file(filename, mode="r") as f5:
        return [
            (read.get_read_id(),
             caller.call_raw_signal(rescale_signal(read.get_raw_data())))
            for read in f5.get_reads()
        ]
def get_raw_data(fast5_filepath):
    """
    Yield ``(read_id, signal)`` for every read in the fast5 file, where
    `signal` is the scaled raw data after `preprocess`.
    """
    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for rid in f5.get_read_ids():
            scaled = f5.get_read(rid).get_raw_data(scale=True)
            yield rid, preprocess(scaled)
def get_meta_data(filename, read_ids=None, skip=False):
    """
    Return a list of `Read` objects built with ``meta=True`` for the
    selected reads of `filename`.

    With `read_ids` left as None every read is selected. Otherwise a read is
    selected when its membership in `read_ids` differs from `skip`.
    """
    with get_fast5_file(filename, 'r') as f5_fh:
        return [
            Read(f5_fh.get_read(rid), filename, meta=True)
            for rid in f5_fh.get_read_ids()
            if read_ids is None or (rid in read_ids) ^ skip
        ]
def get_signal_length(fast5_filepath):
    """
    :param fast5_filepath: can be a single- or multi-read file
    :return: raw signal length of the first read returned by the file
    """
    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for read in f5.get_reads():
            # Only the first read is inspected.
            signal = read.get_raw_data()
            return len(signal)
def get_signal(fast5_fn, read_id, scale=True):
    """ Get raw signal from read.

    With `scale` true the signal is returned as ``(signal - med) / mad``
    using the values from ``mh.med_mad``; otherwise it is returned as read.
    """
    with get_fast5_file(fast5_fn, 'r') as fast5_fp:
        raw_sig = fast5_fp.get_read(read_id).get_raw_data()
    if not scale:
        return raw_sig
    med, mad = mh.med_mad(raw_sig)
    return (raw_sig - med) / mad
def update_fast5(self, fast5_filepath, mod_index=3, verbose=False):
    """Update (i.e. add or change) the methylation data for reads in the
    given fast5 file.

    mod_index gives the index of the modification call table to store in
    the database. Default is mC modification.
    Indices: A,mA,C,mC,G,T = 0,1,2,3,4,5

    For each read, the rows of the ModBaseProbs table at positions where the
    basecalled sequence has the unmodified base for `mod_index` are taken,
    and column `mod_index` of those rows is stored with ``self.put``. Reads
    without a ModBaseProbs table or without a Fastq dataset are skipped.

    With `verbose` true, progress over the reads is shown with tqdm.
    """
    from ont_fast5_api.fast5_interface import get_fast5_file
    import numpy as np
    import logging as log

    if verbose:
        from tqdm import tqdm
    else:
        def tqdm(x):
            return x

    log.info("Processing file {}".format(fast5_filepath))

    UNMODIFIED_BASES = [b"A", b"A", b"C", b"C", b"G", b"T"]
    assert mod_index >= 0 and mod_index < len(
        UNMODIFIED_BASES), "mod_index must be in the range 0-5."
    BASE = UNMODIFIED_BASES[mod_index]

    log.info("Looking for modification {} of base {}.".format(
        mod_index, BASE))

    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for read_id in tqdm(f5.get_read_ids()):
            read = f5.get_read(read_id)
            latest_basecall = read.get_latest_analysis('Basecall_1D')

            mod_base_table = read.get_analysis_dataset(
                latest_basecall, 'BaseCalled_template/ModBaseProbs')
            if mod_base_table is None:
                log.info("No ModBaseProbs for {}".format(read_id))
                continue

            fastq = read.get_analysis_dataset(
                latest_basecall, 'BaseCalled_template/Fastq')
            if fastq is None:
                log.info("No Fastq for {}".format(read_id))
                continue

            # splitlines() accepts a record with or without a trailing
            # newline; only the sequence line is needed.
            _, seq, _, _ = fastq.splitlines()
            # One-byte view of the sequence for a vectorised comparison with
            # BASE; frombuffer replaces the deprecated binary-mode
            # np.fromstring.
            bases = np.frombuffer(seq.encode(), dtype="S1")
            mod_likelihoods = mod_base_table[bases == BASE, mod_index]

            self.put(read_id, mod_likelihoods)
def guppy_fast5_extraction(filename):
    """Return one ``(read_id, fastq, mod_base_table)`` tuple per read in
    `filename`, taken from the latest ``Basecall_1D`` analysis of the read."""
    fastq_path = 'BaseCalled_template/Fastq'
    mod_probs_path = 'BaseCalled_template/ModBaseProbs'

    extracted = []
    with get_fast5_file(filename, mode='r') as fast5:
        for rid in fast5.get_read_ids():
            record = fast5.get_read(rid)
            analysis = record.get_latest_analysis('Basecall_1D')
            fastq = record.get_analysis_dataset(analysis, fastq_path)
            mod_base_table = record.get_analysis_dataset(
                analysis, mod_probs_path)
            extracted.append((rid, fastq, mod_base_table))
    return extracted