Beispiel #1
0
def concat_bam(in_fns, out_fn):
    """Concat input bam files to an output bam file.

    Note that each input bam has ONLY one reference sequence.  Every record
    read from the i-th input file gets its ``tid`` overwritten with ``i``
    before it is written to the output.

        in_fns --- input bam file names, read in the given order
        out_fn --- output bam file name

    The output writer and every input file are closed even if reading or
    writing raises, so no file handle is leaked on error.
    """
    # construct sam header
    # NOTE(review): presumably the merged header lists one reference per
    # input file, in the order of in_fns, so that tid == index is valid --
    # confirm against concat_bam_header.
    h = concat_bam_header(in_fns)
    o = BamWriter(out_fn, header=h)
    try:
        for index, in_fn in enumerate(in_fns):
            s = pysam.Samfile(in_fn, 'rb')
            try:
                for r in s:
                    r.tid = index  # Overwrite tid !!!
                    o.write(r)
            finally:
                # release the input handle even if a read/write failed
                s.close()
    finally:
        # always close the output handle, even on error
        o.close()
Beispiel #2
0
def concat_bam(in_fns, out_fn):
    """Concat input bam files to an output bam file.

    Note that each input bam has ONLY one reference sequence.  Every record
    read from the i-th input file gets its ``tid`` overwritten with ``i``
    before it is written to the output.

        in_fns --- input bam file names, read in the given order
        out_fn --- output bam file name

    The output writer and every input file are closed even if reading or
    writing raises, so no file handle is leaked on error.
    """
    # construct sam header
    # NOTE(review): presumably the merged header lists one reference per
    # input file, in the order of in_fns, so that tid == index is valid --
    # confirm against concat_bam_header.
    h = concat_bam_header(in_fns)
    o = BamWriter(out_fn, header=h)
    try:
        for index, in_fn in enumerate(in_fns):
            s = Samfile(in_fn, 'rb')
            try:
                for r in s:
                    r.tid = index  # Overwrite tid !!!
                    o.write(r)
            finally:
                # release the input handle even if a read/write failed
                s.close()
    finally:
        # always close the output handle, even on error
        o.close()
Beispiel #3
0
def trim_subreads_and_write(reader,
                            in_seqids,
                            out_file,
                            trim_len,
                            min_len,
                            ignore_keyerror=False,
                            bam=False):
    """ Extract (dump) raw subreads of every zmws from in_seqids from reader
    to out_file, trimming trim_len bases from both ends of each subread.

    Each zmw is processed at most once, even if several ids in in_seqids
    share the same <movie>/<holeNumber> prefix.

        reader --- provides random access to raw subreads in input file.
                   type = MetaSubreadFastaReader, when input files are in FASTA,
                   and reads are in format <movie>/<holeNumber>/<subread or CCS>.
                   type = BamCollection, when input files are in BAM.
        in_seqids --- zmw ids to dump; only the first two '/'-separated
                      fields (<movie>/<holeNumber>) of each id are used
        out_file --- a FASTA file when input files are in FASTA; a BAM file when
                     input files are in BAM.
        trim_len --- trim the first and last n bases of every subread
                     (applied to both FASTA and BAM input)
        min_len --- minimum length a subread must have after trimming in order
                    to be written, i.e., only subreads with
                    len >= 2 * trim_len + min_len are written
        ignore_keyerror --- if True, log a warning and skip zmws which are not
                            in reader; otherwise raise ValueError
        bam --- True if reader is a BamCollection and out_file is BAM
        return movies seen (set of the first '/'-separated field of each zmw)

    Raises ValueError if a FASTA subread name is not of the form
    <movie>/<holeNumber>/<start>_<end>, or if a zmw is missing from reader
    and ignore_keyerror is False.
    """
    movies = set()
    zmw_seen = set()

    if bam:
        assert isinstance(reader, BamCollection)
        f = BamWriter(out_file, reader.header)
    else:
        assert isinstance(reader, MetaSubreadFastaReader)
        f = FastaWriter(out_file)

    try:
        for seqid in in_seqids:
            # Keep only <movie>/<holeNumber>.  split/join never raise
            # ValueError, so ids with fewer fields are passed through as-is
            # and are reported by the reader lookup below if unknown.
            zmw = '/'.join(seqid.split('/')[0:2])

            if zmw in zmw_seen:
                continue

            movies.add(zmw.split('/')[0])
            zmw_seen.add(zmw)
            try:
                if bam:
                    for rec in reader[zmw].subreads:
                        if len(rec) >= 2 * trim_len + min_len:
                            f.write(
                                rec.Clip(rec.readStart + trim_len,
                                         rec.readEnd - trim_len))
                else:
                    for rec in reader[zmw]:
                        if len(rec) >= 2 * trim_len + min_len:
                            # Only the name parsing is guarded, so that a
                            # ValueError raised by the writer is not
                            # mis-reported as a malformed subread name.
                            try:
                                m, hn, s_e = rec.name.split('/')
                                s, e = [int(x) for x in s_e.split('_')]
                            except ValueError:
                                raise ValueError(
                                    "%s is not a valid pacbio subread." %
                                    rec.name)
                            new_id = "%s/%s/%d_%d" % (m, hn, s + trim_len,
                                                      e - trim_len)
                            # Use an explicit end index: [trim_len:-trim_len]
                            # would be [0:0] (empty) when trim_len == 0.
                            end = len(rec.sequence) - trim_len
                            f.writeRecord(new_id, rec.sequence[trim_len:end])
            except KeyError:
                if ignore_keyerror:
                    logging.warning(
                        "Ignoring {zmw} because the input FASTA/BAM ".format(
                            zmw=zmw) + " does not contain it.")
                else:
                    raise ValueError("{0} doesn't exist. Abort!".format(zmw))
    finally:
        # always release the output handle, even when an error aborts the dump
        f.close()

    return movies
Beispiel #4
0
def trim_subreads_and_write(reader, in_seqids, out_file, trim_len, min_len,
                            ignore_keyerror=False, bam=False):
    """ Extract (dump) raw subreads of every zmws from in_seqids from reader
    to out_file, trimming trim_len bases from both ends of each subread.

    Each zmw is processed at most once, even if several ids in in_seqids
    share the same <movie>/<holeNumber> prefix.

        reader --- provides random access to raw subreads in input file.
                   type = MetaSubreadFastaReader, when input files are in FASTA,
                   and reads are in format <movie>/<holeNumber>/<subread or CCS>.
                   type = BamCollection, when input files are in BAM.
        in_seqids --- zmw ids to dump; only the first two '/'-separated
                      fields (<movie>/<holeNumber>) of each id are used
        out_file --- a FASTA file when input files are in FASTA; a BAM file when
                     input files are in BAM.
        trim_len --- trim the first and last n bases of every subread
                     (applied to both FASTA and BAM input)
        min_len --- minimum length a subread must have after trimming in order
                    to be written, i.e., only subreads with
                    len >= 2 * trim_len + min_len are written
        ignore_keyerror --- if True, log a warning and skip zmws which are not
                            in reader; otherwise raise ValueError
        bam --- True if reader is a BamCollection and out_file is BAM
        return movies seen (set of the first '/'-separated field of each zmw)

    Raises ValueError if a FASTA subread name is not of the form
    <movie>/<holeNumber>/<start>_<end>, or if a zmw is missing from reader
    and ignore_keyerror is False.
    """
    movies = set()
    zmw_seen = set()

    if bam:
        assert isinstance(reader, BamCollection)
        f = BamWriter(out_file, reader.header)
    else:
        assert isinstance(reader, MetaSubreadFastaReader)
        f = FastaWriter(out_file)

    try:
        for seqid in in_seqids:
            # Keep only <movie>/<holeNumber>.  split/join never raise
            # ValueError, so ids with fewer fields are passed through as-is
            # and are reported by the reader lookup below if unknown.
            zmw = '/'.join(seqid.split('/')[0:2])

            if zmw in zmw_seen:
                continue

            movies.add(zmw.split('/')[0])
            zmw_seen.add(zmw)
            try:
                if bam:
                    for rec in reader[zmw].subreads:
                        if len(rec) >= 2*trim_len + min_len:
                            f.write(rec.Clip(rec.readStart+trim_len,
                                             rec.readEnd-trim_len))
                else:
                    for rec in reader[zmw]:
                        if len(rec) >= 2*trim_len + min_len:
                            # Only the name parsing is guarded, so that a
                            # ValueError raised by the writer is not
                            # mis-reported as a malformed subread name.
                            try:
                                m, hn, s_e = rec.name.split('/')
                                s, e = [int(x) for x in s_e.split('_')]
                            except ValueError:
                                raise ValueError("%s is not a valid pacbio subread." % rec.name)
                            new_id = "%s/%s/%d_%d" % (m, hn, s+trim_len, e-trim_len)
                            # Use an explicit end index: [trim_len:-trim_len]
                            # would be [0:0] (empty) when trim_len == 0.
                            end = len(rec.sequence) - trim_len
                            f.writeRecord(new_id, rec.sequence[trim_len:end])
            except KeyError:
                if ignore_keyerror:
                    logging.warning("Ignoring {zmw} because the input FASTA/BAM ".
                                    format(zmw=zmw) + " does not contain it.")
                else:
                    raise ValueError("{0} doesn't exist. Abort!".format(zmw))
    finally:
        # always release the output handle, even when an error aborts the dump
        f.close()

    return movies