Ejemplo n.º 1
0
def iterate_file_read_pairs(filepaths, read_ids, limit=None, verbose=0):
    """Yield (filepath, read id) pairs whose read is present in the file.

    The two input lists are walked in lockstep. A pair is skipped when the
    file does not exist, when the read id is not listed in the file, or when
    any exception is raised while handling the pair (a warning is written to
    stderr in the first and last case).

    Args:
        filepaths (list of str): list of filepaths
        read_ids (list of str): list of read ids
        limit (int, optional): Maximum number of tuples to produce. The
            limit is checked after each yield.
        verbose (int, optional): Output level of debug verbosity

    Yields:
        tuple(str, str): containing filepath and read id
    """
    emitted = 0
    for path, rid in zip(filepaths, read_ids):
        if not os.path.exists(path):
            sys.stderr.write(
                'File {} does not exist, skipping\n'.format(path))
            continue
        try:
            # Only membership is needed, so the file is closed again before
            # the pair is handed to the consumer.
            with get_fast5_file(path, 'r') as handle:
                present = rid in handle.get_read_ids()
            if present:
                if verbose > 0:
                    print("Reading", rid, "from", path)
                yield path, rid
                emitted += 1
                if limit is not None and emitted >= limit:
                    return  # stop the iterator once the limit is reached
        except Exception as err:
            sys.stderr.write(
                "Warning: An exception occured in fast5utils (skipped "
                "this read):\n{}\n".format(str(err)))
Ejemplo n.º 2
0
def get_raw_data(filename):
    """
    Lazily yield one ``Read`` per read stored in the fast5 file `filename`.

    The file stays open while the generator is being consumed.
    """
    with get_fast5_file(filename, 'r') as handle:
        yield from (Read(record, filename) for record in handle.get_reads())
Ejemplo n.º 3
0
def check_file_type(myfile):
    """Convert the fast5 file at ``os.path.join(root, name)`` to single-read
    files when it is a multi-read file.

    Args:
        myfile: currently unused.

    NOTE(review): ``myfile`` is ignored and the path is built from the free
    names ``root`` and ``name`` (and the output goes to ``directory``), which
    must be defined by the enclosing module. Presumably ``myfile`` was meant
    to be the path -- confirm against the caller before changing it.
    """
    source_path = os.path.join(root, name)
    # The handle is only needed to detect the file type; close it before the
    # converter opens the same file so that it is not leaked.
    with fast5_interface.get_fast5_file(source_path) as fobj:
        is_multi_read = fast5_interface.check_file_type(fobj) == "multi-read"
    if is_multi_read:
        # convert file to single fast5
        print("converting fast5 file****")
        multi_to_single_fast5.convert_multi_to_single(source_path,
                                                      directory, "single")
Ejemplo n.º 4
0
def call_file(filename):
    """Basecall every read in `filename` and collect per-read metadata.

    Returns a list with one tuple per read:
    (read_id, run_id, read_number, channel_number, start_time, basecall,
    qual). An empty list is returned when an OSError is raised while the
    file is being processed.
    """
    records = []
    try:
        with get_fast5_file(filename, mode="r") as f5:
            # single-read and multi-read files keep read_number/start_time
            # in different places
            is_multi = check_file_type(f5) == 'multi-read'
            for read in f5.get_reads():
                read_id = read.read_id
                run_id = read.run_id.decode('utf-8')
                if is_multi:
                    raw_attrs = read.handle['Raw'].attrs
                    read_number = raw_attrs['read_number']
                    raw_start = raw_attrs['start_time']
                else:
                    info = read.status.read_info[0]
                    read_number = info.read_number
                    raw_start = info.start_time

                channel_attrs = read.handle[read.global_key +
                                            'channel_id'].attrs
                channel_number = channel_attrs['channel_number'].decode(
                    'utf-8')
                sampling_rate = channel_attrs['sampling_rate']
                tracking_attrs = read.handle[read.global_key +
                                             'tracking_id'].attrs
                exp_start_time = tracking_attrs['exp_start_time'].decode(
                    'utf-8')

                # raw_start / sampling_rate is passed as the number of
                # seconds to add to the experiment start time
                start_time = add_time_seconds(exp_start_time,
                                              raw_start / sampling_rate)

                signal = rescale_signal(read.get_raw_data())
                basecall, qual = caller.call_raw_signal(signal)
                records.append((read_id, run_id, read_number, channel_number,
                                start_time, basecall, qual))
    except OSError:
        return []
    return records
Ejemplo n.º 5
0
def get_raw_data_for_read(info):
    """
    Build a ``Read`` for a single read of a fast5 file.

    `info` is a (filename, read_id) pair; the file is opened read-only and
    closed again before returning.
    """
    path, rid = info
    with get_fast5_file(path, 'r') as handle:
        record = handle.get_read(rid)
        return Read(record, path)
Ejemplo n.º 6
0
def get_remapping(sig_fn, dacs, scale_params, ref_seq, stride, read_id,
                  r_to_q_poss, rl_cumsum, r_ref_pos, ref_out_info):
    """Build a taiyaki remapping for one read and return its read dictionary.

    Channel info and read attributes are loaded from the fast5 file
    `sig_fn`, attached to a taiyaki signal built from `dacs`, and a
    block-to-reference path is filled in from `r_to_q_poss` / `rl_cumsum`.

    Returns:
        tuple: (read dictionary from the remapping,
        ``prepare_mapping_funcs.RemapResult.SUCCESS``).

    Raises:
        mh.MegaError: if adding the integer reference to the remapping fails.
    """
    # Copy the metadata into plain dicts while the file is open so the
    # handle is closed instead of being leaked.
    with fast5_interface.get_fast5_file(sig_fn, 'r') as fast5_fp:
        read = fast5_fp.get_read(read_id)
        channel_info = dict(fast5utils.get_channel_info(read).items())
        read_attrs = dict(fast5utils.get_read_attributes(read).items())

    rd_factor = channel_info['range'] / channel_info['digitisation']
    shift_from_pA = (scale_params[0] + channel_info['offset']) * rd_factor
    scale_from_pA = scale_params[1] * rd_factor

    # prepare taiyaki signal object
    sig = tai_signal.Signal(dacs=dacs)
    sig.channel_info = channel_info
    sig.read_attributes = read_attrs
    sig.offset = channel_info['offset']
    sig.range = channel_info['range']
    sig.digitisation = channel_info['digitisation']

    # -1 marks path entries with no reference position assigned
    path = np.full((dacs.shape[0] // stride) + 1, -1)
    # skip last value since this is where the two seqs end
    for ref_pos, q_pos in enumerate(r_to_q_poss[:-1]):
        path_pos = rl_cumsum[q_pos + r_ref_pos.q_trim_start]
        # if the query position maps to the end of the mapping skip it
        if path_pos >= path.shape[0]:
            continue
        path[path_pos] = ref_pos
    remapping = tai_mapping.Mapping.from_remapping_path(
        sig, path, ref_seq, stride)
    try:
        remapping.add_integer_reference(ref_out_info.alphabet)
    except Exception as err:
        # keep the original failure attached for debugging
        raise mh.MegaError('Invalid reference sequence encountered') from err

    return (remapping.get_read_dictionary(shift_from_pA, scale_from_pA,
                                          read_id),
            prepare_mapping_funcs.RemapResult.SUCCESS)
Ejemplo n.º 7
0
def hdf_to_sam_worker(fname):
    """Extract basecall and methylation data from a `.fast5` file.

    :param fname: `.fast5` file containing read data.

    :returns: list with one ``(Read, (mA_tag, mC_tag))`` entry per read,
        where the tags are the ``MA:B:C,...`` / ``MC:B:C,...`` strings built
        from the '6mA' and '5mC' fields of the modified-base table.
    """
    logger = medaka.common.get_named_logger('ModExtract')
    logger.info("Processing {}.".format(fname))
    results = []
    with get_fast5_file(fname, mode="r") as f5:
        read_ids = list(f5.get_read_ids())
        logger.debug("Found {} reads for {}.".format(len(read_ids), fname))
        for read_id in read_ids:
            f5_read = f5.get_read(read_id)
            analysis = f5_read.get_latest_analysis(BASECALLANALYSIS)
            # modified base data, reinterpreted with the MODTYPE dtype
            probs = f5_read.get_analysis_dataset(analysis, MODBASEPATH)
            probs = probs.view(dtype=MODTYPE)
            tags = (
                'MA:B:C,{}'.format(format_uint8_list(probs['6mA'])),
                'MC:B:C,{}'.format(format_uint8_list(probs['5mC'])),
            )
            # basecalling data: the record must split into exactly 4 lines
            fastq = f5_read.get_analysis_dataset(analysis, FASTQPATH)
            _header, sequence, _, qstring = fastq.splitlines()
            results.append((Read(read_id, sequence, qstring), tags))
    return results
 def load_file(self, filepath):
     """Return ``(raw_data, read_id)`` for the read in `filepath`.

     An assertion checks that the file lists exactly one read id.
     """
     with get_fast5_file(filepath, mode="r") as handle:
         n_reads = len(handle.get_read_ids())
         assert n_reads == 1, f'File {filepath} contains multiple reads.'
         # return from inside the loop: only the first read is used
         for record in handle.get_reads():
             return record.get_raw_data(), record.read_id
def fast5s_to_fastq(dir_):
    """Write the template basecalls of every ``.fast5`` file in `dir_` to a
    FASTQ file named ``<basename of dir_>.fastq`` inside `dir_`.

    All records are collected first and the FASTQ file is written only after
    every fast5 file has been read.

    Args:
        dir_: directory containing the ``.fast5`` files. A trailing path
            separator is allowed.

    Returns:
        str: a status line of the form ``"<fastq path> done: Done in <s>"``,
        which is also printed.
    """
    print(dir_)
    start = time.time()
    # normpath drops a trailing separator; without it basename() would be
    # empty and the output would land in a hidden ".fastq" file.
    dir_name = os.path.basename(os.path.normpath(dir_))
    fastq_fn = os.path.join(dir_, dir_name + '.fastq')
    fast5s = [os.path.join(dir_, x) for x in os.listdir(dir_)
              if x.endswith('.fast5')]

    records = []
    for fast5_fn in fast5s:
        with get_fast5_file(fast5_fn, mode='r') as f5:
            with Basecall1DTools(f5) as basecall:
                name, seq, qual = basecall.get_called_sequence('template')
                records.append((name, seq, qual))

    with open(fastq_fn, mode='w') as fastq_fh:
        for name, seq, qual in records:
            fastq_fh.write('@%s\n%s\n+\n%s\n' % (name, seq, qual))

    stop = time.time()
    string = '%s done' % fastq_fn + ': Done in {:.2f}'.format(stop - start)
    print(string)
    return string
Ejemplo n.º 10
0
    def index_fast5(self,
                    files: [Path] = None,
                    tags: list = None,
                    store_signal: bool = False):
        """ Main access method to index Fast5 files into MongoDB

        For each file, one ``Read`` document is built per read and the
        documents of that file are inserted together.

        :param files: paths of the Fast5 files to index; ``None`` (the
            default) is treated as no files.
        :param tags: tags stored on every created document; ``None`` is
            passed through to the documents unchanged.
        :param store_signal: when true the unscaled signal is stored on each
            document, otherwise an empty list is stored.
        """

        for file in files or []:
            self.logger.info(f"Index signal data [ {self.db_name} ]: {file}")

            fast5_path = str(file.absolute())
            reads = []
            with get_fast5_file(str(file), mode="r") as f5:
                for f5_read in f5.get_reads():
                    if store_signal:
                        signal = f5_read.get_signal(
                            start=None, end=None, scale=False)
                    else:
                        signal = list()
                    reads.append(
                        Read(fast5=fast5_path,
                             uuid=str(uuid.uuid4()),
                             tags=tags,
                             read_id=f5_read.read_id,
                             signal_data=signal))

            Read.objects.insert(reads)
            # tags may be None (the default); join an empty list in that case
            self.logger.info(
                f'Inserted {len(reads)} signal reads with tags [ {", ".join(tags or [])} ]'
            )
Ejemplo n.º 11
0
def yield_fast5_reads(input_path, recursive, follow_symlinks=True, read_ids=None):
    """
    Walk the fast5 files found under input_path and yield their reads.
    When read_ids is a non-empty set/list only reads whose id is in it are
    yielded; None or an empty set/list yields every read.

    :param input_path: passed on to yield_fast5_files
    :param recursive: bool
    :param follow_symlinks: bool
    :param read_ids: set, list or None
    :raise TypeError: read_ids is neither a set, a list nor None
    :return: yielded tuple (read_id, fast5_read_object)
    """
    if read_ids is not None and not isinstance(read_ids, (list, set)):
        raise TypeError('read_ids must be of type set or list or none')

    # a set is needed for the intersection below
    if isinstance(read_ids, list) and read_ids:
        read_ids = set(read_ids)

    fast5_paths = yield_fast5_files(input_path=input_path,
                                    recursive=recursive,
                                    follow_symlinks=follow_symlinks)
    for fast5_path in fast5_paths:
        handle = get_fast5_file(fast5_path)
        available = handle.get_read_ids()
        wanted = read_ids.intersection(available) if read_ids else available
        for rid in wanted:
            yield rid, handle.get_read(rid)
Ejemplo n.º 12
0
    def test_correct_type(self):
        """get_fast5_file returns the handle class matching the file layout."""
        # single-read file: expect a Fast5File holding exactly the known id
        single_path = os.path.join(test_data, "single_reads", "read0.fast5")
        expected_id = Fast5File(single_path).get_read_id()
        with get_fast5_file(single_path) as single_f5:
            self.assertEqual(type(single_f5), Fast5File)
            self.assertEqual(check_file_type(single_f5), SINGLE_READ)
            self.assertEqual(len(single_f5.get_read_ids()), 1)
            self.assertEqual(expected_id, single_f5.get_read_ids()[0])
            self.get_raw(single_f5)

        # multi-read file: expect a MultiFast5File with at least one read
        multi_path = os.path.join(test_data, "multi_read", "batch_0.fast5")
        with get_fast5_file(multi_path) as multi_f5:
            self.assertEqual(type(multi_f5), MultiFast5File)
            self.assertEqual(check_file_type(multi_f5), MULTI_READ)
            self.assertTrue(len(multi_f5.get_read_ids()) >= 1)
            self.get_raw(multi_f5)
Ejemplo n.º 13
0
def main():
    """Command-line entry point: write the FASTQ record stored under the
    latest ``Basecall_1D`` analysis of every read in the given Fast5 files
    to the output stream.

    ``desc`` and ``epilog`` are module-level names used for the help text.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description=desc, epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--version', action='version', version='0.10a')
    parser.add_argument("-v",
                        "--verbose",
                        default=False,
                        action="store_true",
                        help="verbose")
    # required: without it a missing -i surfaced as a TypeError when the
    # None default was iterated below
    parser.add_argument("-i",
                        "--fast5",
                        nargs="+",
                        required=True,
                        help="input Fast5 file(s)")
    parser.add_argument("-o",
                        "--out",
                        default=sys.stdout,
                        type=argparse.FileType("w"),
                        help="output stream [stdout]")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    for fn in o.fast5:
        # close each input as soon as its reads have been written
        with get_fast5_file(fn, mode="r") as f5file:
            for read_id in f5file.get_read_ids():
                read = f5file.get_read(read_id)
                # resolves to the newest group, e.g. Basecall_1D_000
                bcgrp = read.get_latest_analysis("Basecall_1D")
                fastq = read.get_analysis_dataset(
                    bcgrp, "BaseCalled_template/Fastq")
                o.out.write(fastq)
Ejemplo n.º 14
0
def one_read_shift_scale(read_tuple):
    """Compute the shift and scale of one read's signal with ``med_mad``.

    Args:
        read_tuple: ``(read_filename, read_id)`` pair.

    Returns:
        tuple: ``(read_id, shift, scale)``. ``shift`` and ``scale`` are NaN
        when the signal is empty. ``(None, None, None)`` is returned, and a
        message written to stderr, when the signal cannot be loaded.
    """
    read_filename, read_id = read_tuple

    try:
        with fast5_interface.get_fast5_file(read_filename, 'r') as f5file:
            read = f5file.get_read(read_id)
            sig = Signal(read)

    except Exception as e:
        sys.stderr.write(
            'Unable to obtain signal for {} from {}.\n{}\n'.format(
                read_id, read_filename, repr(e)))
        return (None, None, None)

    else:
        signal = sig.current

        if len(signal) > 0:
            shift, scale = med_mad(signal)
        else:
            # np.nan rather than np.NaN: the capitalised alias was removed
            # in NumPy 2.0.
            shift, scale = np.nan, np.nan
            # note - if signal trimmed by ub, it could be of length zero by this point for short reads
            # These are taken out later in the existing code, in the new code we'll take out ub trimming

        return (read_id, shift, scale)
Ejemplo n.º 15
0
def print_all_raw_data(fast5_filepath='test/mapped_reads.hdf5'):
    """Print the read id and raw data of every read in a fast5 file.

    Args:
        fast5_filepath: path of the file to dump; this can be a single- or
            multi-read file. Defaults to the path that used to be hard-coded
            so that calls without arguments behave as before.
    """
    with get_fast5_file(fast5_filepath, mode='r') as f5:
        for read_id in f5.get_read_ids():
            read = f5.get_read(read_id)
            raw_data = read.get_raw_data()
            print(read_id, raw_data)
Ejemplo n.º 16
0
def resquiggle_reads(multifast5_fn,
                     aligner,
                     ref,
                     seq_samp_type,
                     std_ref,
                     rsqgl_params,
                     outlier_thresh=OUTLIER_THRESH,
                     max_scaling_iters=3,
                     max_per_ref=0,
                     valid_bases=set(list('ACGT'))):
    """Resquiggle the reads of a fast5 file, one alignment at a time.

    This is a generator. For every alignment taken from `aligner` it yields
    a 2-tuple:

    * ``(rsqgl_res, "")`` on success, where ``rsqgl_res`` additionally
      carries the alignment as ``.a`` and the fast5 read as ``.read``;
    * ``(None, message)`` when the alignment is skipped or resquiggling
      raised, with ``message`` describing why.

    Alignments to a contig that already reached `max_per_ref` successful
    results are skipped without yielding anything.

    :param multifast5_fn: path of the fast5 file holding the raw signal
    :param aligner: iterable of alignment records (``reference_name``,
        ``qname``, ``is_unmapped``, ``is_secondary``, ``is_qcfail`` are read)
    :param ref: reference FASTA path, opened with ``pysam.FastaFile``
    :param seq_samp_type: passed through to the mapping/adjust helpers
    :param std_ref: passed through to ``map_read`` and ``resquiggle_read``
    :param rsqgl_params: passed through to the resquiggle helpers
    :param outlier_thresh: passed through to ``resquiggle_read``
    :param max_scaling_iters: upper bound on resquiggle rounds per read
    :param max_per_ref: if non-zero, maximum number of successful reads per
        reference; 0 disables the limit
    :param valid_bases: bases allowed in the mapped reference sequence

    NOTE(review): ``faidx`` and ``f5file`` are opened here and never closed
    in this function.
    NOTE(review): the ``valid_bases`` default is a single set created at
    definition time; it is only read here, never mutated.
    """
    # per-contig count of successfully resquiggled reads (only used when
    # max_per_ref is set)
    ref2c = {}
    # process reads from multi fast5
    faidx = pysam.FastaFile(ref)
    ref2len = {r: l
               for r, l in zip(faidx.references, faidx.lengths)}  # reference name -> length
    f5file = get_fast5_file(multifast5_fn, mode="r")
    for a in aligner:
        # process only given number of reads per reference
        if max_per_ref:
            contig = a.reference_name  # alternative: map_results.genome_loc.Chrom
            if contig in ref2c:
                if ref2c[contig] >= max_per_ref: continue
            else: ref2c[contig] = 0
        # skip reads without alignment or secondary/qcfails
        # (qcfail alignments are also reported as "Secondary alignment")
        if a.is_unmapped or a.is_secondary or a.is_qcfail:
            yield None, "No alignment" if a.is_unmapped else "Secondary alignment"
            continue
        # get alignment data
        map_results = map_read(a, faidx, seq_samp_type, std_ref, ref2len)
        # make sure only ACGT chars in reference!
        if set(map_results.genome_seq).difference(valid_bases):
            yield None, "Non-ACGT sequence"  # instead maybe just replace by random char?
            continue
        # extract data from FAST5; the alignment's query name is used as the read id
        read = f5file.get_read(a.qname)
        all_raw_signal = read.get_raw_data(scale=False)
        map_results = map_results._replace(raw_signal=all_raw_signal)
        try:
            # this causes sometimes TomboError: Read event to sequence alignment extends beyond bandwidth
            map_results = adjust_map_res(map_results, seq_samp_type,
                                         rsqgl_params)
            rsqgl_res = resquiggle.resquiggle_read(map_results, std_ref,
                                                   rsqgl_params,
                                                   outlier_thresh)
            # repeat with the updated scale values while the normalisation
            # parameters keep changing, up to max_scaling_iters rounds
            n_iters = 1
            while n_iters < max_scaling_iters and rsqgl_res.norm_params_changed:
                rsqgl_res = resquiggle.resquiggle_read(
                    map_results._replace(scale_values=rsqgl_res.scale_values),
                    std_ref, rsqgl_params, outlier_thresh)
                n_iters += 1
        except Exception as inst:
            # report the failure to the consumer and move on to the next read
            yield None, str(inst)
            continue
        rsqgl_res = adjust_rsqgl_res(rsqgl_res, all_raw_signal, seq_samp_type)
        # add alignment and read as those are needed later
        rsqgl_res.a, rsqgl_res.read = a, read
        # update ref counter (ref2c stays empty when max_per_ref is 0)
        if ref2c: ref2c[contig] += 1
        yield rsqgl_res, ""
Ejemplo n.º 17
0
def reads_from_bins(bins, bins_dir):
    """Collect the read ids stored in ``<bins_dir>/<bin>.fast5`` for each bin.

    `bins` may be a single bin name (str) or an iterable of bin names. The
    ids are returned as one flat list, in the order the bins were given.
    """
    bin_names = [bins] if isinstance(bins, str) else bins
    collected = []
    for bin_name in bin_names:
        fast5_path = os.path.join(bins_dir, bin_name + '.fast5')
        with get_fast5_file(fast5_path, mode='r') as f5:
            collected.extend(f5.get_read_ids())
    return collected
Ejemplo n.º 18
0
 def open(self, force_open=False):
     """Open ``self.file`` read-only as ``self.io`` and mark it as open.

     Nothing is reopened when the object is already open, unless
     `force_open` is true. Before (re)opening, ``self.close()`` is
     attempted and any ordinary error it raises is ignored.
     """
     if force_open or (not self._is_open):
         try:
             self.close()
         except Exception:
             # Best-effort close of a previous handle. Narrowed from a bare
             # ``except`` so KeyboardInterrupt/SystemExit still propagate.
             pass
         self.io = get_fast5_file(self.file, mode="r")
     self._is_open = True
Ejemplo n.º 19
0
def get_read_ids(filename):
    """
    Map every read id found in `filename` to the file's basename.
    """
    mapping = {}
    with get_fast5_file(filename, 'r') as f5:
        for read in f5.get_reads():
            mapping[read.read_id] = basename(filename)
    return mapping
Ejemplo n.º 20
0
def get_raw_data(filename, read_ids=None, skip=False):
    """
    Yield a ``Read`` for each selected read of the fast5 file `filename`.

    With `read_ids` set to None every read is selected. Otherwise a read is
    selected when it is in `read_ids`, or, with `skip` true, when it is not.
    """
    with get_fast5_file(filename, 'r') as f5_fh:
        for rid in f5_fh.get_read_ids():
            # XOR with skip inverts the membership test
            if read_ids is not None and not ((rid in read_ids) ^ skip):
                continue
            yield Read(f5_fh.get_read(rid), filename)
Ejemplo n.º 21
0
def get_raw_data(infile, fileNM, data_test, data_name, cutoff):
	"""Append a 3000-sample window of each long-enough read to `data_test`.

	For every read in ``os.path.join(infile, fileNM)`` whose scaled raw
	signal has at least ``cutoff + 3000`` samples, the slice
	``[cutoff:cutoff + 3000]`` is appended to `data_test` and the read id to
	`data_name`. Both lists are modified in place and also returned.
	"""
	with get_fast5_file(os.path.join(infile, fileNM), mode="r") as f5:
		for read in f5.get_reads():
			signal = read.get_raw_data(scale=True)
			window_end = cutoff + 3000
			# reads too short to fill the window are skipped
			if len(signal) < window_end:
				continue
			data_test.append(signal[cutoff:window_end])
			data_name.append(read.read_id)
	return data_test, data_name
Ejemplo n.º 22
0
 def get_fast5_files(self, root_path):
     """Open ``<root_path>/<entry>/batch0.fast5`` for every entry of
     `root_path`.

     Returns a list of ``(open fast5 file, entry name)`` tuples; the files
     are returned open, so closing them is left to the caller.
     """
     return [
         (get_fast5_file("{}/{}/{}".format(root_path, entry, "batch0.fast5"),
                         mode="r"),
          entry)
         for entry in os.listdir(root_path)
     ]
Ejemplo n.º 23
0
def get_read_ids(filename, read_ids=None, skip=False):
    """
    List ``(filename, read_id)`` pairs for the reads in the file `filename`.

    With `read_ids` set to None every read is listed. Otherwise a read is
    listed when it is in `read_ids`, or, with `skip` true, when it is not.
    """
    with get_fast5_file(filename, 'r') as f5_fh:
        available = f5_fh.get_read_ids()
        return [
            (filename, rid)
            for rid in available
            if read_ids is None or (rid in read_ids) ^ skip
        ]
Ejemplo n.º 24
0
def call_file(filename):
    """Basecall every read of `filename`.

    Returns a list of ``(read_id, call_result)`` tuples, where
    ``call_result`` is whatever ``caller.call_raw_signal`` returns for the
    rescaled raw signal of the read.
    """
    with get_fast5_file(filename, mode="r") as f5:
        return [
            (read.get_read_id(),
             caller.call_raw_signal(rescale_signal(read.get_raw_data())))
            for read in f5.get_reads()
        ]
Ejemplo n.º 25
0
def get_raw_data(fast5_filepath):
    """
    Yield ``(read_id, signal)`` for every read in the fast5 file, where
    ``signal`` is the scaled raw data after ``preprocess``.
    """
    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for rid in f5.get_read_ids():
            signal = f5.get_read(rid).get_raw_data(scale=True)
            yield rid, preprocess(signal)
Ejemplo n.º 26
0
def get_meta_data(filename, read_ids=None, skip=False):
    """
    Return a list of ``Read`` objects built with ``meta=True`` for the
    selected reads of `filename`.

    With `read_ids` set to None every read is selected. Otherwise a read is
    selected when it is in `read_ids`, or, with `skip` true, when it is not.
    """
    with get_fast5_file(filename, 'r') as f5_fh:
        return [
            Read(f5_fh.get_read(rid), filename, meta=True)
            for rid in f5_fh.get_read_ids()
            if read_ids is None or (rid in read_ids) ^ skip
        ]
Ejemplo n.º 27
0
def get_signal_length(fast5_filepath):
    """
    Return the raw signal length of the last read in the file.

    :param fast5_filepath: can be a single- or multi-read file
    :return: raw signal length of the last read yielded by ``get_reads()``;
        0 when the file contains no reads (this case used to fail with an
        UnboundLocalError)
    """
    length = 0
    with get_fast5_file(fast5_filepath, mode="r") as f5:
        for read in f5.get_reads():
            # later reads overwrite earlier ones, so the last read wins
            length = len(read.get_raw_data())

    return length
Ejemplo n.º 28
0
def get_signal(fast5_fn, read_id, scale=True):
    """ Load the raw signal of `read_id` from `fast5_fn`.

    With `scale` true the signal is returned as ``(signal - med) / mad``,
    using the two values returned by ``mh.med_mad``; otherwise it is
    returned unchanged.
    """
    with get_fast5_file(fast5_fn, 'r') as fast5_fp:
        raw_sig = fast5_fp.get_read(read_id).get_raw_data()

    if not scale:
        return raw_sig

    med, mad = mh.med_mad(raw_sig)
    return (raw_sig - med) / mad
Ejemplo n.º 29
0
    def update_fast5(self, fast5_filepath, mod_index=3, verbose=False):
        """Update (i.e. add or change) the methylation data for reads in the given fast5 file.

        For every read that has both a ModBaseProbs table and a Fastq record
        under its latest Basecall_1D analysis, the table rows at positions
        where the called base equals the unmodified base for ``mod_index``
        are selected, column ``mod_index`` is taken, and the result is
        stored with ``self.put(read_id, ...)``. Reads missing either dataset
        are logged and skipped.

        Args:
            fast5_filepath: path of the fast5 file to read.
            mod_index: index of the modification call table to store in the
                database. Default is mC modification.
                Indices: A,mA,C,mC,G,T = 0,1,2,3,4,5
            verbose: when true, show a tqdm progress bar over the reads.

        Raises:
            AssertionError: if ``mod_index`` is outside the range 0-5.
        """

        from ont_fast5_api.fast5_interface import get_fast5_file
        import numpy as np
        import logging as log
        if verbose:
            from tqdm import tqdm
        else:

            # no-op stand-in so the loop below reads the same either way
            def tqdm(x):
                return x

        log.info("Processing file {}".format(fast5_filepath))

        # unmodified base that each table column refers to
        UNMODIFIED_BASES = [b"A", b"A", b"C", b"C", b"G", b"T"]
        assert mod_index >= 0 and mod_index < len(
            UNMODIFIED_BASES), "mod_index must be in the range 0-5."

        BASE = UNMODIFIED_BASES[mod_index]

        log.info("Looking for modification {} of base {}.".format(
            mod_index, BASE))

        with get_fast5_file(fast5_filepath, mode="r") as f5:
            for read_id in tqdm(f5.get_read_ids()):
                read = f5.get_read(read_id)

                latest_basecall = read.get_latest_analysis('Basecall_1D')

                mod_base_table = read.get_analysis_dataset(
                    latest_basecall, 'BaseCalled_template/ModBaseProbs')
                if mod_base_table is None:
                    log.info("No ModBaseProbs for {}".format(read_id))
                    continue

                fastq = read.get_analysis_dataset(latest_basecall,
                                                  'BaseCalled_template/Fastq')
                if fastq is None:
                    log.info("No Fastq for {}".format(read_id))
                    continue

                # the record must split into exactly five parts
                _title, seq, _plus, _qvals, _tail = fastq.split("\n")

                # One byte per called base. np.frombuffer replaces the
                # deprecated binary mode of np.fromstring.
                called_bases = np.frombuffer(seq.encode("ascii"), dtype="|S1")
                mod_likelihoods = mod_base_table[called_bases == BASE,
                                                 mod_index]

                self.put(read_id, mod_likelihoods)
def guppy_fast5_extraction(filename):
    """Collect the basecall outputs of every read in `filename`.

    Returns a list of ``(read_id, fastq, mod_base_table)`` tuples, both
    datasets being read from the latest ``Basecall_1D`` analysis of the read.
    """
    fastq_path = 'BaseCalled_template/Fastq'
    mod_probs_path = 'BaseCalled_template/ModBaseProbs'
    entries = []
    with get_fast5_file(filename, mode='r') as handle:
        for read_id in handle.get_read_ids():
            record = handle.get_read(read_id)
            basecall_group = record.get_latest_analysis('Basecall_1D')
            entries.append((
                read_id,
                record.get_analysis_dataset(basecall_group, fastq_path),
                record.get_analysis_dataset(basecall_group, mod_probs_path),
            ))
    return entries