Ejemplo n.º 1
0
def iter_gmap_sam_for_fusion(gmap_sam_filename, fusion_candidates,
                             transfrag_len_dict):
    """
    Iterate through a sorted GMAP SAM file, keeping only fusion candidates.

    Continuously yield a group of overlapping records, as returned by
    ``sep_by_strand`` -- {'+': [r1, r2, ...], '-': [r3, r4....]}

    :param gmap_sam_filename: SAM file to read; it must be sorted by subject
        ID and start, otherwise the process is aborted via ``sys.exit(-1)``
    :param fusion_candidates: container of query IDs to keep (tested with ``in``)
    :param transfrag_len_dict: handed to the SAM reader as ``query_len_dict``
    """
    reader = BioReaders.GMAPSAMReader(gmap_sam_filename,
                                      True,
                                      query_len_dict=transfrag_len_dict)
    records = []  # fusion-candidate records of the group being built
    max_end = None  # furthest sEnd seen in the current group
    for r in reader:
        if records:
            if r.sID == records[-1].sID and r.sStart < records[-1].sStart:
                print("ERROR: SAM file is NOT sorted. ABORT!", file=sys.stderr)
                sys.exit(-1)
            # Compare against the furthest end of the whole group, not just the
            # end of the last record: a short record nested inside a longer one
            # must not cut the group short.
            if r.sID != records[0].sID or r.sStart > max_end:
                yield sep_by_strand(records)
                records = []
        # Non-candidates can still close a group above but are never stored.
        if r.qID in fusion_candidates:
            max_end = r.sEnd if not records else max(max_end, r.sEnd)
            records.append(r)

    if records:
        yield sep_by_strand(records)
Ejemplo n.º 2
0
def find_fusion_candidates(sam_filename,
                           query_len_dict,
                           min_locus_coverage=.05,
                           min_locus_coverage_bp=1,
                           min_total_coverage=.99,
                           min_dist_between_loci=10000,
                           min_identity=.95):
    """
    Return list of fusion candidates qIDs
    (1) must map to 2 or more loci
    (2) every alignment has identity >= min_identity (default 95%)
    (3) minimum coverage for each loci is min_locus_coverage (default 5%)
        AND minimum coverage in bp is >= min_locus_coverage_bp (default 1 bp)
    (4) total query coverage is >= min_total_coverage (default 99%)
    (5) every pair of loci is either on different subjects or at least
        min_dist_between_loci (default 10kb) apart

    :param sam_filename: GMAP SAM file to read
    :param query_len_dict: handed to the SAM reader as ``query_len_dict``
    """
    TmpRec = namedtuple(
        'TmpRec',
        ['qCov', 'qLen', 'qStart', 'qEnd', 'sStart', 'sEnd', 'iden', 'chrom'])

    def total_coverage(tmprecs):
        # Merge the query intervals and add up the length of the merged regions.
        tree = ClusterTree(0, 0)
        for rec in tmprecs:
            tree.insert(rec.qStart, rec.qEnd, -1)
        return sum(reg[1] - reg[0] for reg in tree.getregions())

    def far_enough(a, b):
        # Coordinates on different subjects are not comparable; such loci are
        # always distinct. On the same subject, measure the gap between them.
        if a.chrom != b.chrom:
            return True
        gap = max(a.sStart, b.sStart) - min(a.sEnd, b.sEnd)
        return gap >= min_dist_between_loci

    d = defaultdict(list)
    reader = BioReaders.GMAPSAMReader(sam_filename,
                                      True,
                                      query_len_dict=query_len_dict)
    for r in reader:
        if r.sID == '*':
            continue
        if r.flag.strand == '+':
            q_start, q_end = r.qStart, r.qEnd
        else:
            # flip the query interval for non-'+' alignments
            q_start, q_end = r.qLen - r.qEnd, r.qLen - r.qStart
        d[r.qID].append(
            TmpRec(qCov=r.qCoverage,
                   qLen=r.qLen,
                   qStart=q_start,
                   qEnd=q_end,
                   sStart=r.sStart,
                   sEnd=r.sEnd,
                   iden=r.identity,
                   chrom=r.sID))

    fusion_candidates = []
    for k, data in d.items():
        if len(data) > 1 and \
            all(a.iden >= min_identity for a in data) and \
            all(a.qCov >= min_locus_coverage for a in data) and \
            all(a.qCov * a.qLen >= min_locus_coverage_bp for a in data) and \
            total_coverage(data) * 1. / data[0].qLen >= min_total_coverage and \
            all(far_enough(a, b)
                for a, b in itertools.combinations(data, 2)):
            fusion_candidates.append(k)
    return fusion_candidates
Ejemplo n.º 3
0
    def iter_gmap_sam(self, gmap_sam_filename, ignored_fout):
        """
        Walk a SORTED GMAP SAM file and yield groups of overlapping alignments.

        Alignments come from ``self.get_quality_alignments``. Consecutive ones
        on the same subject whose start does not pass the furthest end seen so
        far form one group. Each group is yielded as a dict keyed by strand
        ('+' / '-'); every value is a list of clusters (lists of records)
        built with a ClusterTree on that strand alone.

        Prints a message and returns without yielding when there is no
        alignment at all; exits the process when the file is not sorted.
        """
        def sep_by_clustertree(alns):
            # Cluster the records by subject interval and return them per cluster.
            clusters = ClusterTree(0, 0)
            for idx, aln in enumerate(alns):
                clusters.insert(aln.sStart, aln.sEnd, idx)
            return [[alns[j] for j in members]
                    for _start, _end, members in clusters.getregions()]

        def sep_by_strand(alns):
            """
            Split by strand, then re-cluster inside each strand: grouping was
            done with both strands together, so separate genes may have been
            merged.
            """
            by_strand = {'+': [], '-': []}
            for aln in alns:
                by_strand[aln.flag.strand].append(aln)
            return {
                strand: sep_by_clustertree(group)
                for strand, group in by_strand.items()
            }

        reader = BioReaders.GMAPSAMReader(
            gmap_sam_filename, True, query_len_dict=self.transfrag_len_dict)
        alignments = self.get_quality_alignments(reader, ignored_fout)

        # the first acceptable alignment seeds the first group
        exhausted = object()
        first = next(alignments, exhausted)
        if first is exhausted:
            print("No valid records from {0}!".format(gmap_sam_filename),
                  file=sys.stderr)
            return

        current = [first]
        furthest_end = first.sEnd
        for aln in alignments:
            same_subject = aln.sID == current[0].sID
            if same_subject and aln.sStart < current[-1].sStart:
                print("SAM file is NOT sorted. ABORT!", file=sys.stderr)
                sys.exit(-1)
            if same_subject and aln.sStart <= furthest_end:
                # still overlapping the current group: extend it
                current.append(aln)
                furthest_end = max(furthest_end, aln.sEnd)
                continue
            # new subject or a gap: emit the finished group and start over
            yield sep_by_strand(current)
            current = [aln]
            furthest_end = aln.sEnd
        yield sep_by_strand(current)
Ejemplo n.º 4
0
def summarize_GMAP_sam(input_fa_or_fq, input_sam):
    """
    Write a tab-delimited per-alignment summary of a GMAP SAM file.

    The summary goes to ``<input_sam>.summary.txt`` with one header line and
    one row per record whose subject ID is not '*'. The ``unique`` column is
    'N' when the query ID occurs more than once in the SAM file, else 'Y'.

    :param input_fa_or_fq: sequence file used to look up query lengths; its
        format is determined by ``type_fa_or_fq``
    :param input_sam: GMAP SAM file to summarize (read twice)
    """
    with open(input_fa_or_fq) as seq_handle:
        d = {
            r.id: len(r.seq)
            for r in SeqIO.parse(seq_handle, type_fa_or_fq(input_fa_or_fq))
        }

    # First pass: count records per query to find multi-mapped queries.
    map_count = defaultdict(int)
    for r in BioReaders.GMAPSAMReader(input_sam, True):
        map_count[r.qID] += 1
    # A set keeps the per-record membership test below O(1).
    multi = {qid for qid, count in map_count.items() if count > 1}

    summary_filename = input_sam + '.summary.txt'
    with open(summary_filename, 'w') as f:
        f.write(
            "id\tqLength\tqCoverage\tidentity\tnum_nonmatch\tnum_ins\tnum_del\tunique\n"
        )
        for r in BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d):
            if r.sID == '*':
                continue
            uni = 'N' if r.qID in multi else 'Y'
            # One tab between every field so that rows line up with the
            # 8-column header.
            f.write("{0}\t{1}\t{2:.4f}\t{3:.4f}\t{4}\t{5}\t{6}\t{7}\n".format(
                r.qID, d[r.qID], r.qCoverage, r.identity, r.num_nonmatches,
                r.num_ins, r.num_del, uni))

    print("Output written to: {0}".format(summary_filename), file=sys.stderr)
Ejemplo n.º 5
0
    def iter_gmap_sam(self, gmap_sam_filename, ignored_fout):
        """
        Iterate over a SORTED GMAP SAM file.
        Yield collections of records that overlap by at least 1 base, as a
        dict {'+': [records...], '-': [records...]}.

        Records that are unmapped (subject ID '*'), or whose coverage /
        identity is below ``self.min_aln_coverage`` / ``self.min_aln_identity``
        are skipped and reported to ``ignored_fout``.

        Writes a message to stderr and yields nothing when no record passes
        the filters; exits the process when the file is not sorted.
        """
        def sep_by_strand(records):
            output = {'+': [], '-': []}
            for r in records:
                output[r.flag.strand].append(r)
            return output

        reader = BioReaders.GMAPSAMReader(gmap_sam_filename,
                                          True,
                                          query_len_dict=self.transfrag_len_dict)
        records = []  # holds the current set of records that overlap in coordinates
        max_end = None  # furthest sEnd among ``records``
        for r in reader:
            # The sortedness check needs an accepted record to compare against.
            if (records and r.sID == records[0].sID
                    and r.sStart < records[-1].sStart):
                sys.stderr.write("SAM file is NOT sorted. ABORT!\n")
                sys.exit(-1)
            if r.sID == '*':
                ignored_fout.write("{0}\tUnmapped.\n".format(r.qID))
            elif r.qCoverage < self.min_aln_coverage:
                ignored_fout.write("{0}\tCoverage {1:.3f} too low.\n".format(
                    r.qID, r.qCoverage))
            elif r.identity < self.min_aln_identity:
                ignored_fout.write("{0}\tIdentity {1:.3f} too low.\n".format(
                    r.qID, r.identity))
            elif not records:
                # first acceptable record seeds the first group
                records = [r]
                max_end = r.sEnd
            elif r.sID != records[0].sID or r.sStart > max_end:
                yield sep_by_strand(records)
                records = [r]
                max_end = r.sEnd
            else:
                records.append(r)
                max_end = max(max_end, r.sEnd)

        if not records:
            # Nothing passed the filters (or the file had no records at all).
            sys.stderr.write(
                "No valid records from {0}!\n".format(gmap_sam_filename))
            return
        yield sep_by_strand(records)
Ejemplo n.º 6
0
def iter_sorted_gmap_record(sam_filename,
                            umi_bc_dict,
                            out_dir,
                            f_out,
                            use_BC=False):
    """
    :param sam_filename: sorted SAM file of tagged FLNC mapped to genome
    :param umi_bc_dict: dict of ccs_id --> dict of UMI/BC/info
    :param out_dir: output directory
    :param f_out: DictWriter object for writing out ccs_id --> group assignment
    :param use_BC: is single cell so also use the "BC" field in addition to "UMI" field
    :return: map_seqid_to_group which is dict of seqid --> group name
             (empty when the file has no mapped record)

    A group is FLNCs that have the same (mapped locus, UMI)
    group name is currently a string of the directory we will create later, which is
       <out_dir>/<loci_index>/<UMI>-<BC>/flnc_tagged.bam

    Records with subject ID '*' are skipped. Overlapping records on the same
    subject form one locus; every locus, including the last one in the file,
    is handed to ``sep_by_UMI``. Exits the process if the file is not sorted.
    """
    map_seqid_to_group = {}  # seqid (FLNC CCS id) --> group name
    reader = BioReaders.GMAPSAMReader(sam_filename, True)

    def process_locus(locus_records, locus_end, index):
        # Report the locus, then let sep_by_UMI split it; it returns the
        # loci index to use for the next locus.
        print("processing {0}:{1}...{2} records".format(
            locus_records[0].sID, locus_end, len(locus_records)),
              file=sys.stdout)
        return sep_by_UMI(locus_records, umi_bc_dict, out_dir, f_out,
                          map_seqid_to_group, index, use_BC)

    records = []  # mapped records of the locus being built
    max_end = None  # furthest sEnd among ``records``
    loci_index = 1
    for r in reader:
        if r.sID == '*':
            continue
        if not records:
            # first acceptably mapped read
            records = [r]
            max_end = r.sEnd
            continue
        if r.sID == records[0].sID and r.sStart < records[-1].sStart:
            print("SAM file is NOT sorted. ABORT!", file=sys.stderr)
            sys.exit(-1)
        if r.sID != records[0].sID or r.sStart > max_end:
            loci_index = process_locus(records, max_end, loci_index)
            records = [r]
            max_end = r.sEnd
        else:
            records.append(r)
            max_end = max(max_end, r.sEnd)

    # The loop only flushes a locus when the next one starts, so the last
    # locus has to be flushed here.
    if records:
        process_locus(records, max_end, loci_index)

    return map_seqid_to_group
Ejemplo n.º 7
0
def pick_rep(fa_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             fusion_candidate_ranges,
             is_fq=False):
    """
    For each group, select the representative record
    Always pick the longest one! (ties go to the member listed last)

    :param fa_fq_filename: FASTA (or FASTQ if ``is_fq``) file with the member sequences
    :param sam_filename: GMAP SAM file holding the alignments of the members
    :param gff_filename: output GFF file; one transcript line plus its exon
        lines is written per stored alignment of each representative
    :param group_filename: tab-delimited file of ``<pb_id>\\t<comma-separated member ids>``
    :param output_filename: output FASTA/FASTQ file of representative sequences
    :param fusion_candidate_ranges: dict of member id --> list of fusion
        ranges; used with ``get_isoform_index`` to place each alignment
    :param is_fq: read/write FASTQ (with qualities) instead of FASTA
    """
    fd = LazyFastqReader(fa_fq_filename) if is_fq else LazyFastaReader(
        fa_fq_filename)

    rep_info = {}  # pb_id --> (best_id, best_seq, best_qual)
    id_to_rep = {}  # best_id --> pb_id
    with open(group_filename) as group_handle:
        for line in group_handle:
            pb_id, members = line.strip().split('\t')
            print("Picking representative sequence for", pb_id,
                  file=sys.stdout)
            best_id = None
            best_seq = None
            best_qual = None
            max_len = 0
            for x in members.split(','):
                rec = fd[x]  # fetch once per member
                if len(rec.seq) >= max_len:
                    best_id = x
                    best_seq = rec.seq
                    best_qual = rec.letter_annotations[
                        'phred_quality'] if is_fq else None
                    max_len = len(rec.seq)
            rep_info[pb_id] = (best_id, best_seq, best_qual)
            id_to_rep[best_id] = pb_id

    coords = {}  # representative id --> list of "chr:start-end(strand)" per fusion part
    record_storage = {}  # pb_id --> list of SAM records, indexed by fusion part
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID not in id_to_rep:
            continue
        pb_id = id_to_rep[r.qID]
        ranges = fusion_candidate_ranges[r.qID]
        isoform_index = get_isoform_index(ranges, r.sID, r.sStart, r.sEnd)
        if r.qID not in coords:
            coords[r.qID] = [None] * len(ranges)
            record_storage[pb_id] = [None] * len(ranges)
        coords[r.qID][isoform_index] = "{0}:{1}-{2}({3})".format(
            r.sID, r.sStart, r.sEnd, r.flag.strand)
        record_storage[pb_id][isoform_index] = r

    gff_line = ("{chr}\tPacBio\t{feature}\t{s}\t{e}\t.\t{strand}\t.\t"
                "gene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n")
    with open(gff_filename, 'w') as f_gff:
        for pb_id, records in record_storage.items():
            # NOTE(review): a slot left as None (no alignment seen for that
            # fusion part) fails below on attribute access -- confirm the
            # SAM always covers every range.
            for i, r in enumerate(records):
                isoform_index = i + 1
                f_gff.write(
                    gff_line.format(chr=r.sID,
                                    feature='transcript',
                                    s=r.segments[0].start + 1,
                                    e=r.segments[-1].end,
                                    pi=pb_id,
                                    j=isoform_index,
                                    strand=r.flag.strand))
                for s in r.segments:
                    f_gff.write(
                        gff_line.format(chr=r.sID,
                                        feature='exon',
                                        s=s.start + 1,
                                        e=s.end,
                                        pi=pb_id,
                                        j=isoform_index,
                                        strand=r.flag.strand))

    # Context manager guarantees the representative sequences are flushed
    # and the handle is closed.
    with open(output_filename, 'w') as fout:
        for pb_id, (best_id, best_seq, best_qual) in rep_info.items():
            _id_ = "{0}|{1}|{2}".format(pb_id, "+".join(coords[best_id]),
                                        best_id)
            if is_fq:
                SeqIO.write(
                    SeqRecord(best_seq,
                              id=_id_,
                              letter_annotations={'phred_quality': best_qual}),
                    fout, 'fastq')
            else:
                SeqIO.write(SeqRecord(best_seq, id=_id_), fout, 'fasta')
Ejemplo n.º 8
0
def find_fusion_candidates(sam_filename,
                           query_len_dict,
                           min_locus_coverage=.05,
                           min_locus_coverage_bp=1,
                           min_total_coverage=.99,
                           min_dist_between_loci=10000,
                           min_identity=0.95):
    """
    Return dict of
       fusion candidate qID --> list (in query order) of the fusion ranges (ex: (chr3,100,200), (chr1,500,1000))
    (1) must map to 2 or more loci
    (2) every alignment has identity >= min_identity (default 95%)
    (3) minimum coverage for each loci is min_locus_coverage (default 5%)
        AND minimum coverage in bp is >= min_locus_coverage_bp (default 1 bp)
    (4) total query coverage is >= min_total_coverage (default 99%)
    (5) every pair of loci is either on different subjects or at least
        min_dist_between_loci (default 10kb) apart

    :param sam_filename: GMAP SAM file to read
    :param query_len_dict: handed to the SAM reader as ``query_len_dict``
    """
    TmpRec = namedtuple(
        'TmpRec',
        ['qCov', 'qLen', 'qStart', 'qEnd', 'sStart', 'sEnd', 'iden', 'chrom'])

    def total_coverage(tmprecs):
        # Merge the query intervals and add up the length of the merged regions.
        tree = ClusterTree(0, 0)
        for rec in tmprecs:
            tree.insert(rec.qStart, rec.qEnd, -1)
        return sum(reg[1] - reg[0] for reg in tree.getregions())

    def far_enough(a, b):
        # Coordinates on different subjects are not comparable; such loci are
        # always distinct. On the same subject, measure the gap between them.
        if a.chrom != b.chrom:
            return True
        gap = max(a.sStart, b.sStart) - min(a.sEnd, b.sEnd)
        return gap >= min_dist_between_loci

    d = defaultdict(list)
    reader = BioReaders.GMAPSAMReader(sam_filename,
                                      True,
                                      query_len_dict=query_len_dict)
    for r in reader:
        if r.sID == '*':
            continue
        if r.flag.strand == '+':
            q_start, q_end = r.qStart, r.qEnd
        else:
            # flip the query interval for non-'+' alignments
            q_start, q_end = r.qLen - r.qEnd, r.qLen - r.qStart
        d[r.qID].append(
            TmpRec(qCov=r.qCoverage,
                   qLen=r.qLen,
                   qStart=q_start,
                   qEnd=q_end,
                   sStart=r.sStart,
                   sEnd=r.sEnd,
                   iden=r.identity,
                   chrom=r.sID))

    fusion_candidates = {}
    for k, data in d.items():
        if len(data) > 1 and \
            all(a.iden >= min_identity for a in data) and \
            all(a.qCov >= min_locus_coverage for a in data) and \
            all(a.qCov * a.qLen >= min_locus_coverage_bp for a in data) and \
            total_coverage(data) * 1. / data[0].qLen >= min_total_coverage and \
            all(far_enough(a, b)
                for a, b in itertools.combinations(data, 2)):
            # report the loci in the order they appear along the query
            data.sort(key=lambda x: x.qStart)
            fusion_candidates[k] = [(a.chrom, a.sStart, a.sEnd) for a in data]
    return fusion_candidates
Ejemplo n.º 9
0
def pick_rep(fa_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             is_fq=False,
             pick_least_err_instead=False):
    """
    For each group, select the representative record

    If is FASTA file (is_fq False) -- then always pick the longest one
    If is FASTQ file (is_fq True) -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one

    The expected number of base errors of a read is the sum over its bases of
    10**(-Q/10), Q being the Phred quality of the base.

    :param fa_fq_filename: FASTA (or FASTQ if ``is_fq``) file with the member sequences
    :param sam_filename: GMAP SAM file holding the alignments of the members
    :param gff_filename: output GFF file; one transcript line plus its exon
        lines is written per alignment of each representative, numbered in
        the order the alignments appear in the SAM file
    :param group_filename: tab-delimited file of ``<pb_id>\\t<comma-separated member ids>``
    :param output_filename: output FASTA/FASTQ file of representative sequences
    """
    fd = LazyFastqReader(fa_fq_filename) if is_fq else LazyFastaReader(
        fa_fq_filename)
    use_err = is_fq and pick_least_err_instead

    rep_info = {}  # pb_id --> (best_id, best_seq, best_qual)
    id_to_rep = {}  # best_id --> pb_id
    with open(group_filename) as group_handle:
        for line in group_handle:
            pb_id, members = line.strip().split('\t')
            sys.stderr.write(
                "Picking representative sequence for {0}\n".format(pb_id))
            best_id = None
            best_seq = None
            best_qual = None
            best_err = 9999999
            err = 9999999
            max_len = 0
            for x in members.split(','):
                rec = fd[x]  # fetch once per member
                if use_err:
                    # Phred: error probability of a base is 10**(-Q/10)
                    err = sum(10**-(q / 10.)
                              for q in rec.letter_annotations['phred_quality'])
                    is_better = err < best_err
                else:
                    is_better = len(rec.seq) >= max_len
                if is_better:
                    best_id = x
                    best_seq = rec.seq
                    if is_fq:
                        best_qual = rec.letter_annotations['phred_quality']
                        best_err = err
                    max_len = len(rec.seq)
            rep_info[pb_id] = (best_id, best_seq, best_qual)
            id_to_rep[best_id] = pb_id

    coords = {}  # representative id --> "chr:start-end(strand)" parts joined by '+'
    record_storage = {}  # pb_id --> SAM records of the representative, in file order
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID not in id_to_rep:
            continue
        pb_id = id_to_rep[r.qID]
        locus = "{0}:{1}-{2}({3})".format(r.sID, r.sStart, r.sEnd,
                                          r.flag.strand)
        if r.qID not in coords:
            # this is the .1 portion
            coords[r.qID] = locus
            record_storage[pb_id] = [r]
        else:
            # this is the .2 portion, or even .3, .4....! handle fusions with > 2 loci correctly
            coords[r.qID] += "+" + locus
            record_storage[pb_id].append(r)

    gff_line = ("{chr}\tPacBio\t{feature}\t{s}\t{e}\t.\t{strand}\t.\t"
                "gene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n")
    with open(gff_filename, 'w') as f_gff:
        for pb_id, records in record_storage.items():
            for i, r in enumerate(records):
                isoform_index = i + 1
                f_gff.write(
                    gff_line.format(chr=r.sID,
                                    feature='transcript',
                                    s=r.segments[0].start + 1,
                                    e=r.segments[-1].end,
                                    pi=pb_id,
                                    j=isoform_index,
                                    strand=r.flag.strand))
                for s in r.segments:
                    f_gff.write(
                        gff_line.format(chr=r.sID,
                                        feature='exon',
                                        s=s.start + 1,
                                        e=s.end,
                                        pi=pb_id,
                                        j=isoform_index,
                                        strand=r.flag.strand))

    # Context manager guarantees the representative sequences are flushed
    # and the handle is closed.
    with open(output_filename, 'w') as fout:
        for pb_id, (best_id, best_seq, best_qual) in rep_info.items():
            _id_ = "{0}|{1}|{2}".format(pb_id, coords[best_id], best_id)
            if is_fq:
                SeqIO.write(
                    SeqRecord(best_seq,
                              id=_id_,
                              letter_annotations={'phred_quality': best_qual}),
                    fout, 'fastq')
            else:
                SeqIO.write(SeqRecord(best_seq, id=_id_), fout, 'fasta')
Ejemplo n.º 10
0
    def iter_gmap_sam(self, gmap_sam_filename, ignored_fout):
        """
        Iterate over a SORTED GMAP SAM file.
        Yield collections of records that overlap by at least 1 base, as a
        dict keyed by strand ('+' / '-') whose values are lists of clusters
        (lists of records) built with a ClusterTree on that strand alone.

        Records that are unmapped (subject ID '*'), or whose coverage /
        identity is below ``self.min_aln_coverage`` / ``self.min_aln_identity``
        are skipped and reported to ``ignored_fout``.

        Writes a message to stderr and yields nothing when no record passes
        the filters; exits the process when the file is not sorted.
        """
        def sep_by_clustertree(records):
            tree = ClusterTree(0, 0)
            for i, r in enumerate(records):
                tree.insert(r.sStart, r.sEnd, i)
            return [[records[i] for i in indices]
                    for _s, _e, indices in tree.getregions()]

        def sep_by_strand(records):
            """
            Note! Must further separate again within each strand. Because of initially processing
            the strands together, could've collapsed some genes.
            """
            output = {'+': [], '-': []}
            for r in records:
                output[r.flag.strand].append(r)
            # re-cluster each strand on its own using ClusterTree
            output['+'] = sep_by_clustertree(output['+'])
            output['-'] = sep_by_clustertree(output['-'])
            return output

        reader = BioReaders.GMAPSAMReader(gmap_sam_filename,
                                          True,
                                          query_len_dict=self.transfrag_len_dict)
        records = []  # holds the current set of records that overlap in coordinates
        max_end = None  # furthest sEnd among ``records``
        for r in reader:
            # The sortedness check needs an accepted record to compare against.
            if (records and r.sID == records[0].sID
                    and r.sStart < records[-1].sStart):
                sys.stderr.write("SAM file is NOT sorted. ABORT!\n")
                sys.exit(-1)
            if r.sID == '*':
                ignored_fout.write("{0}\tUnmapped.\n".format(r.qID))
            elif r.qCoverage < self.min_aln_coverage:
                ignored_fout.write("{0}\tCoverage {1:.3f} too low.\n".format(
                    r.qID, r.qCoverage))
            elif r.identity < self.min_aln_identity:
                ignored_fout.write("{0}\tIdentity {1:.3f} too low.\n".format(
                    r.qID, r.identity))
            elif not records:
                # first acceptable record seeds the first group
                records = [r]
                max_end = r.sEnd
            elif r.sID != records[0].sID or r.sStart > max_end:
                yield sep_by_strand(records)
                records = [r]
                max_end = r.sEnd
            else:
                records.append(r)
                max_end = max(max_end, r.sEnd)

        if not records:
            # Nothing passed the filters (or the file had no records at all).
            sys.stderr.write(
                "No valid records from {0}!\n".format(gmap_sam_filename))
            return
        yield sep_by_strand(records)