def iter_gmap_sam_for_fusion(gmap_sam_filename, fusion_candidates, transfrag_len_dict):
    """
    Iterate through a sorted GMAP SAM file.
    Continuously yield groups of overlapping records as
    {'+': [r1, r2, ...], '-': [r3, r4, ...]}
    """
    records = []
    reader = BioReaders.GMAPSAMReader(gmap_sam_filename, True,
                                      query_len_dict=transfrag_len_dict)
    # skip ahead to the first record that is a fusion candidate
    for r in reader:
        if r.qID in fusion_candidates:
            records = [r]
            break
    for r in reader:
        if len(records) >= 1 and (r.sID == records[-1].sID and r.sStart < records[-1].sStart):
            print("ERROR: SAM file is NOT sorted. ABORT!", file=sys.stderr)
            sys.exit(-1)
        if len(records) >= 1 and (r.sID != records[0].sID or r.sStart > records[-1].sEnd):
            yield sep_by_strand(records)
            records = []
        if r.qID in fusion_candidates:
            records.append(r)
    if len(records) > 0:
        yield sep_by_strand(records)
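# NOTE: `sep_by_strand` is assumed to be a module-level helper that is not
# defined in this file; a minimal sketch consistent with the docstring above:
def sep_by_strand(records):
    """Split a list of GMAP SAM records into a dict keyed by mapped strand."""
    output = {'+': [], '-': []}
    for r in records:
        output[r.flag.strand].append(r)
    return output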
def find_fusion_candidates(sam_filename, query_len_dict, min_locus_coverage=.05,
                           min_locus_coverage_bp=1, min_total_coverage=.99,
                           min_dist_between_loci=10000):
    """
    Return a list of fusion candidate qIDs. With the default parameters, a candidate
    (1) must map to 2 or more loci
    (2) has a minimum coverage of 5% of the query AND at least 1 bp at each locus
    (3) has a total (combined) query coverage of >= 99%
    (4) maps to loci that are at least 10 kb apart
    """
    TmpRec = namedtuple('TmpRec', ['qCov', 'qLen', 'qStart', 'qEnd', 'sStart', 'sEnd', 'iden'])

    def total_coverage(tmprecs):
        # merge the query intervals and sum the merged lengths
        tree = ClusterTree(0, 0)
        for r in tmprecs:
            tree.insert(r.qStart, r.qEnd, -1)
        return sum(reg[1] - reg[0] for reg in tree.getregions())

    d = defaultdict(lambda: [])
    reader = BioReaders.GMAPSAMReader(sam_filename, True, query_len_dict=query_len_dict)
    for r in reader:
        if r.sID == '*':
            continue
        if r.flag.strand == '+':
            d[r.qID].append(TmpRec(qCov=r.qCoverage, qLen=r.qLen,
                                   qStart=r.qStart, qEnd=r.qEnd,
                                   sStart=r.sStart, sEnd=r.sEnd, iden=r.identity))
        else:
            # flip query coordinates so they are always in transcript orientation
            d[r.qID].append(TmpRec(qCov=r.qCoverage, qLen=r.qLen,
                                   qStart=r.qLen - r.qEnd, qEnd=r.qLen - r.qStart,
                                   sStart=r.sStart, sEnd=r.sEnd, iden=r.identity))

    fusion_candidates = []
    for k, data in d.items():
        if len(data) > 1 and \
           all(a.iden >= .95 for a in data) and \
           all(a.qCov >= min_locus_coverage for a in data) and \
           all(a.qCov * a.qLen >= min_locus_coverage_bp for a in data) and \
           total_coverage(data) * 1. / data[0].qLen >= min_total_coverage and \
           all(max(a.sStart, b.sStart) - min(a.sEnd, b.sEnd) >= min_dist_between_loci
               for a, b in itertools.combinations(data, 2)):
            fusion_candidates.append(k)
    return fusion_candidates
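# A minimal end-to-end usage sketch; the file name 'mapped.sorted.sam' and the
# dict `transfrag_len_dict` (qID -> query length) are hypothetical:
candidates = find_fusion_candidates('mapped.sorted.sam', transfrag_len_dict)
for strand_groups in iter_gmap_sam_for_fusion('mapped.sorted.sam', candidates,
                                              transfrag_len_dict):
    for r in strand_groups['+'] + strand_groups['-']:
        print(r.qID, r.sID, r.sStart, r.sEnd)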
def iter_gmap_sam(self, gmap_sam_filename, ignored_fout):
    """
    Iterate over a SORTED GMAP SAM file.
    Return a collection of records that overlap by at least 1 base.
    """

    def sep_by_clustertree(records):
        tree = ClusterTree(0, 0)
        for i, r in enumerate(records):
            tree.insert(r.sStart, r.sEnd, i)
        result = []
        for s, e, indices in tree.getregions():
            result.append([records[i] for i in indices])
        return result

    def sep_by_strand(records):
        """
        Note! Must further separate within each strand: because the strands
        were initially processed together, some genes could have been collapsed.
        """
        output = {'+': [], '-': []}
        for r in records:
            output[r.flag.strand].append(r)
        # process each strand separately using ClusterTree
        output['+'] = sep_by_clustertree(output['+'])
        output['-'] = sep_by_clustertree(output['-'])
        return output

    gmap_sam_reader = BioReaders.GMAPSAMReader(
        gmap_sam_filename, True, query_len_dict=self.transfrag_len_dict)
    quality_alignments = self.get_quality_alignments(gmap_sam_reader, ignored_fout)

    # find the first acceptably mapped read
    try:
        records = [next(quality_alignments)]
        max_end = records[0].sEnd
    except StopIteration:
        print("No valid records from {0}!".format(gmap_sam_filename), file=sys.stderr)
        return

    # go through the remaining alignments and group overlapping ones by subject ID
    for r in quality_alignments:
        if r.sID == records[0].sID and r.sStart < records[-1].sStart:
            print("SAM file is NOT sorted. ABORT!", file=sys.stderr)
            sys.exit(-1)
        if r.sID != records[0].sID or r.sStart > max_end:
            yield sep_by_strand(records)
            records = [r]
            max_end = r.sEnd
        else:
            records.append(r)
            max_end = max(max_end, r.sEnd)
    yield sep_by_strand(records)
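# In this version each strand key maps to a *list of lists* (one sublist per
# ClusterTree region). A consumption sketch; `collapser`, `ignore_log`, and
# `process_locus` are hypothetical:
for strand_groups in collapser.iter_gmap_sam('mapped.sorted.sam', ignore_log):
    for strand in ('+', '-'):
        for locus_records in strand_groups[strand]:  # records sharing one locus
            process_locus(locus_records)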
def summarize_GMAP_sam(input_fa_or_fq, input_sam):
    d = dict((r.id, len(r.seq)) for r in SeqIO.parse(
        open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq)))

    # count how many times each query appears, to flag multi-mappers
    map_count = defaultdict(lambda: 0)
    for r in BioReaders.GMAPSAMReader(input_sam, True):
        map_count[r.qID] += 1
    multi = [x for x in map_count if map_count[x] > 1]

    f = open(input_sam + '.summary.txt', 'w')
    f.write("id\tqLength\tqCoverage\tidentity\tnum_nonmatch\tnum_ins\tnum_del\tunique\n")
    for r in BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d):
        if r.sID == '*':
            continue
        uni = 'N' if r.qID in multi else 'Y'
        # single tab between identity and num_nonmatch so the fields line up
        # with the 8-column header above
        f.write("{0}\t{1}\t{2:.4f}\t{3:.4f}\t{4}\t{5}\t{6}\t{7}\n".format(
            r.qID, d[r.qID], r.qCoverage, r.identity,
            r.num_nonmatches, r.num_ins, r.num_del, uni))
    f.close()
    print("Output written to: {0}".format(f.name), file=sys.stderr)
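# `type_fa_or_fq` is an external helper not defined in this file; a minimal
# sketch of what it is assumed to do (return a Bio.SeqIO format name based on
# the file extension):
def type_fa_or_fq(filename):
    return 'fastq' if filename.lower().endswith(('.fq', '.fastq')) else 'fasta'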
def iter_gmap_sam(self, gmap_sam_filename, ignored_fout):
    """
    Iterate over a SORTED GMAP SAM file.
    Return a collection of records that overlap by at least 1 base.
    """

    def sep_by_strand(records):
        output = {'+': [], '-': []}
        for r in records:
            output[r.flag.strand].append(r)
        return output

    records = None  # holds the current set of records that overlap in coordinates
    reader = BioReaders.GMAPSAMReader(gmap_sam_filename, True,
                                      query_len_dict=self.transfrag_len_dict)
    # find the first acceptably mapped read
    for r in reader:
        if r.sID == '*':
            ignored_fout.write("{0}\tUnmapped.\n".format(r.qID))
        elif r.qCoverage < self.min_aln_coverage:
            ignored_fout.write("{0}\tCoverage {1:.3f} too low.\n".format(
                r.qID, r.qCoverage))
        elif r.identity < self.min_aln_identity:
            ignored_fout.write("{0}\tIdentity {1:.3f} too low.\n".format(
                r.qID, r.identity))
        else:
            records = [r]
            break
    if records is None:
        print("No valid records from {0}!".format(gmap_sam_filename),
              file=sys.stderr)
        return

    for r in reader:
        if r.sID == '*':  # guard: unmapped records have no coverage/identity
            ignored_fout.write("{0}\tUnmapped.\n".format(r.qID))
            continue
        if r.sID == records[0].sID and r.sStart < records[-1].sStart:
            print("SAM file is NOT sorted. ABORT!", file=sys.stderr)
            sys.exit(-1)
        if r.qCoverage < self.min_aln_coverage:
            ignored_fout.write("{0}\tCoverage {1:.3f} too low.\n".format(
                r.qID, r.qCoverage))
        elif r.identity < self.min_aln_identity:
            ignored_fout.write("{0}\tIdentity {1:.3f} too low.\n".format(
                r.qID, r.identity))
        elif r.sID != records[0].sID or r.sStart > max(x.sEnd for x in records):
            yield sep_by_strand(records)
            records = [r]
        else:
            records.append(r)
    yield sep_by_strand(records)
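# Usage sketch (names hypothetical): `ignored_fout` receives a two-column TSV
# of qID and the reason the alignment was filtered. Note this variant yields
# flat per-strand lists, unlike the ClusterTree variant elsewhere in this file.
with open('ignored.txt', 'w') as ignore_log:
    for strand_dict in collapser.iter_gmap_sam('mapped.sorted.sam', ignore_log):
        plus_records = strand_dict['+']  # all '+'-strand records in this window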
def iter_sorted_gmap_record(sam_filename, umi_bc_dict, out_dir, f_out, use_BC=False):
    """
    :param sam_filename: sorted SAM file of tagged FLNC mapped to genome
    :param umi_bc_dict: dict of ccs_id --> dict of UMI/BC/info
    :param out_dir: output directory
    :param f_out: DictWriter object for writing out ccs_id --> group assignment
    :param use_BC: single-cell data, so also use the "BC" field in addition to the "UMI" field
    :return: map_seqid_to_group, a dict of seqid --> group name

    A group is the set of FLNC reads that share the same (mapped locus, UMI).
    The group name is currently the directory we will create later, i.e.
    <out_dir>/<loci_index>/<UMI>-<BC>/flnc_tagged.bam
    """
    map_seqid_to_group = {}  # seqid (FLNC CCS id) --> group name
    reader = BioReaders.GMAPSAMReader(sam_filename, True)

    # find the first mapped read
    records = None
    for r in reader:
        if r.sID != '*':
            records = [r]
            break
    if records is None:  # no mapped reads at all
        return map_seqid_to_group
    max_end = records[0].sEnd
    loci_index = 1

    for r in reader:
        if r.sID == '*':
            continue
        if r.sID == records[0].sID and r.sStart < records[-1].sStart:
            print("SAM file is NOT sorted. ABORT!", file=sys.stderr)
            sys.exit(-1)
        if r.sID != records[0].sID or r.sStart > max_end:
            print("processing {0}:{1}...{2} records".format(
                r.sID, max_end, len(records)), file=sys.stdout)
            loci_index = sep_by_UMI(records, umi_bc_dict, out_dir, f_out,
                                    map_seqid_to_group, loci_index, use_BC)
            records = [r]
            max_end = r.sEnd
        else:
            records.append(r)
            max_end = max(max_end, r.sEnd)
    # don't forget the final batch of overlapping records
    sep_by_UMI(records, umi_bc_dict, out_dir, f_out, map_seqid_to_group,
               loci_index, use_BC)
    return map_seqid_to_group
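# A minimal invocation sketch; the file names and the DictWriter field names
# are assumptions, not the repo's actual CLI:
import csv
with open('flnc.group_assignment.csv', 'w') as h:
    writer = csv.DictWriter(h, fieldnames=['id', 'group'])
    writer.writeheader()
    groups = iter_sorted_gmap_record('flnc.tagged.sorted.sam', umi_bc_dict,
                                     'output_dir', writer, use_BC=False)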
def pick_rep(fa_fq_filename, sam_filename, gff_filename, group_filename,
             output_filename, fusion_candidate_ranges, is_fq=False):
    """
    For each group, select the representative record.
    Always pick the longest one!
    """
    fd = LazyFastqReader(fa_fq_filename) if is_fq else LazyFastaReader(fa_fq_filename)
    fout = open(output_filename, 'w')

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print("Picking representative sequence for", pb_id, file=sys.stdout)
        best_id = None
        best_seq = None
        best_qual = None
        max_len = 0
        for x in members.split(','):
            if len(fd[x].seq) >= max_len:
                best_id = x
                best_seq = fd[x].seq
                best_qual = fd[x].letter_annotations['phred_quality'] if is_fq else None
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    record_storage = {}  # temporary storage for the .1 record, written together with the later records
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            # record the coordinates; records are written out once all loci are seen
            isoform_index = get_isoform_index(fusion_candidate_ranges[r.qID],
                                              r.sID, r.sStart, r.sEnd)
            if r.qID not in coords:
                coords[r.qID] = [None] * len(fusion_candidate_ranges[r.qID])
                record_storage[pb_id] = [None] * len(fusion_candidate_ranges[r.qID])
            coords[r.qID][isoform_index] = "{0}:{1}-{2}({3})".format(
                r.sID, r.sStart, r.sEnd, r.flag.strand)
            record_storage[pb_id][isoform_index] = r

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\t"
                        "gene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                            chr=r.sID, s=r.segments[0].start + 1,
                            e=r.segments[-1].end, pi=pb_id, j=isoform_index,
                            strand=r.flag.strand))
            for s in r.segments:
                f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\t"
                            "gene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                                chr=r.sID, s=s.start + 1, e=s.end, pi=pb_id,
                                j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, "+".join(coords[best_id]), best_id)
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(
                SeqRecord(_seq_, id=_id_,
                          letter_annotations={'phred_quality': best_qual}),
                fout, 'fastq')
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, 'fasta')
    fout.close()
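# `get_isoform_index` is not defined in this file; a minimal sketch of what it
# is assumed to do (return the position of the fusion range overlapping this
# alignment locus):
def get_isoform_index(ranges, sID, sStart, sEnd):
    """Find which (chrom, start, end) fusion range this alignment falls in."""
    for i, (chrom, start, end) in enumerate(ranges):
        if chrom == sID and max(start, sStart) < min(end, sEnd):  # overlap test
            return i
    raise ValueError("Alignment {0}:{1}-{2} matches no expected fusion locus!".format(
        sID, sStart, sEnd))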
def find_fusion_candidates(sam_filename, query_len_dict, min_locus_coverage=.05,
                           min_locus_coverage_bp=1, min_total_coverage=.99,
                           min_dist_between_loci=10000, min_identity=0.95):
    """
    Return a dict of fusion candidate qID --> list (in order) of the fusion ranges,
    ex: [(chr3,100,200), (chr1,500,1000)]. With the default parameters, a candidate
    (1) must map to 2 or more loci
    (2) has a minimum coverage of 5% of the query AND at least 1 bp at each locus
    (3) has a total (combined) query coverage of >= 99%
    (4) maps to loci that are at least 10 kb apart
    """
    TmpRec = namedtuple('TmpRec', ['qCov', 'qLen', 'qStart', 'qEnd',
                                   'sStart', 'sEnd', 'iden', 'chrom'])

    def total_coverage(tmprecs):
        # merge the query intervals and sum the merged lengths
        tree = ClusterTree(0, 0)
        for r in tmprecs:
            tree.insert(r.qStart, r.qEnd, -1)
        return sum(reg[1] - reg[0] for reg in tree.getregions())

    d = defaultdict(lambda: [])
    reader = BioReaders.GMAPSAMReader(sam_filename, True, query_len_dict=query_len_dict)
    for r in reader:
        if r.sID == '*':
            continue
        if r.flag.strand == '+':
            d[r.qID].append(TmpRec(qCov=r.qCoverage, qLen=r.qLen,
                                   qStart=r.qStart, qEnd=r.qEnd,
                                   sStart=r.sStart, sEnd=r.sEnd,
                                   iden=r.identity, chrom=r.sID))
        else:
            # flip query coordinates so they are always in transcript orientation
            d[r.qID].append(TmpRec(qCov=r.qCoverage, qLen=r.qLen,
                                   qStart=r.qLen - r.qEnd, qEnd=r.qLen - r.qStart,
                                   sStart=r.sStart, sEnd=r.sEnd,
                                   iden=r.identity, chrom=r.sID))

    fusion_candidates = {}
    for k, data in d.items():
        if len(data) > 1 and \
           all(a.iden >= min_identity for a in data) and \
           all(a.qCov >= min_locus_coverage for a in data) and \
           all(a.qCov * a.qLen >= min_locus_coverage_bp for a in data) and \
           total_coverage(data) * 1. / data[0].qLen >= min_total_coverage and \
           all(max(a.sStart, b.sStart) - min(a.sEnd, b.sEnd) >= min_dist_between_loci
               for a, b in itertools.combinations(data, 2)):
            data.sort(key=lambda x: x.qStart)
            fusion_candidates[k] = [(a.chrom, a.sStart, a.sEnd) for a in data]
    return fusion_candidates
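# A worked example of criterion (3), assuming bx-python's ClusterTree (the
# interval library this module appears to use): query intervals [0, 600) and
# [550, 1000) of a 1000 bp read merge into a single region, so the total
# coverage is 1000/1000 = 100% and the criterion passes.
from bx.intervals.cluster import ClusterTree
tree = ClusterTree(0, 0)
tree.insert(0, 600, -1)
tree.insert(550, 1000, -1)
print(sum(e - s for s, e, _ in tree.getregions()))  # 1000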
def pick_rep(fa_fq_filename, sam_filename, gff_filename, group_filename,
             output_filename, is_fq=False, pick_least_err_instead=False):
    """
    For each group, select the representative record.
    If it is a FASTA file (is_fq False) -- always pick the longest one.
    If it is a FASTQ file (is_fq True) --
        if pick_least_err_instead is True, pick the one with the least number
        of expected base errors; else, pick the longest one.
    """
    fd = LazyFastqReader(fa_fq_filename) if is_fq else LazyFastaReader(fa_fq_filename)
    fout = open(output_filename, 'w')

    rep_info = {}
    id_to_rep = {}
    for line in open(group_filename):
        pb_id, members = line.strip().split('\t')
        print("Picking representative sequence for", pb_id, file=sys.stderr)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0
        for x in members.split(','):
            if is_fq and pick_least_err_instead:
                # expected number of base errors: a Phred score Q corresponds
                # to an error probability of 10^(-Q/10)
                err = sum(10**-(i / 10.)
                          for i in fd[x].letter_annotations['phred_quality'])
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].seq) >= max_len):
                best_id = x
                best_seq = fd[x].seq
                if is_fq:
                    best_qual = fd[x].letter_annotations['phred_quality']
                    best_err = err
                max_len = len(fd[x].seq)
        rep_info[pb_id] = (best_id, best_seq, best_qual)
        id_to_rep[best_id] = pb_id

    f_gff = open(gff_filename, 'w')
    coords = {}
    record_storage = {}  # temporary storage for the .1 record, written together with the later records
    for r in BioReaders.GMAPSAMReader(sam_filename, True):
        if r.qID in id_to_rep:
            pb_id = id_to_rep[r.qID]
            best_id, best_seq, best_qual = rep_info[pb_id]
            # build up the coordinate string for this fusion
            if r.qID not in coords:
                # this is the .1 portion
                coords[r.qID] = "{0}:{1}-{2}({3})".format(
                    r.sID, r.sStart, r.sEnd, r.flag.strand)
                record_storage[pb_id] = [r]
            else:
                # this is the .2 portion, or even .3, .4, ...! handles fusions with > 2 loci correctly
                coords[r.qID] += "+{0}:{1}-{2}({3})".format(
                    r.sID, r.sStart, r.sEnd, r.flag.strand)
                record_storage[pb_id].append(r)

    for pb_id, records in record_storage.items():
        for i, r in enumerate(records):
            isoform_index = i + 1
            f_gff.write("{chr}\tPacBio\ttranscript\t{s}\t{e}\t.\t{strand}\t.\t"
                        "gene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                            chr=r.sID, s=r.segments[0].start + 1,
                            e=r.segments[-1].end, pi=pb_id, j=isoform_index,
                            strand=r.flag.strand))
            for s in r.segments:
                f_gff.write("{chr}\tPacBio\texon\t{s}\t{e}\t.\t{strand}\t.\t"
                            "gene_id \"{pi}\"; transcript_id \"{pi}.{j}\";\n".format(
                                chr=r.sID, s=s.start + 1, e=s.end, pi=pb_id,
                                j=isoform_index, strand=r.flag.strand))
    f_gff.close()

    for pb_id in rep_info:
        best_id, best_seq, best_qual = rep_info[pb_id]
        _id_ = "{0}|{1}|{2}".format(pb_id, coords[best_id], best_id)
        _seq_ = best_seq
        if is_fq:
            SeqIO.write(
                SeqRecord(_seq_, id=_id_,
                          letter_annotations={'phred_quality': best_qual}),
                fout, 'fastq')
        else:
            SeqIO.write(SeqRecord(_seq_, id=_id_), fout, 'fasta')
    fout.close()
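# Worked example of the expected-error formula above: Q=30 corresponds to an
# error probability of 10^(-30/10) = 0.001 and Q=20 to 0.01, so a read with
# qualities [30, 20, 20] has 0.001 + 0.01 + 0.01 = 0.021 expected base errors.
quals = [30, 20, 20]
expected_err = sum(10**-(q / 10.) for q in quals)  # 0.021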
def iter_gmap_sam(self, gmap_sam_filename, ignored_fout):
    """
    Iterate over a SORTED GMAP SAM file.
    Return a collection of records that overlap by at least 1 base.
    """

    def sep_by_clustertree(records):
        tree = ClusterTree(0, 0)
        for i, r in enumerate(records):
            tree.insert(r.sStart, r.sEnd, i)
        result = []
        for s, e, indices in tree.getregions():
            result.append([records[i] for i in indices])
        return result

    def sep_by_strand(records):
        """
        Note! Must further separate within each strand: because the strands
        were initially processed together, some genes could have been collapsed.
        """
        output = {'+': [], '-': []}
        for r in records:
            output[r.flag.strand].append(r)
        # process each strand separately using ClusterTree
        output['+'] = sep_by_clustertree(output['+'])
        output['-'] = sep_by_clustertree(output['-'])
        return output

    records = None  # holds the current set of records that overlap in coordinates
    reader = BioReaders.GMAPSAMReader(gmap_sam_filename, True,
                                      query_len_dict=self.transfrag_len_dict)
    # find the first acceptably mapped read
    for r in reader:
        if r.sID == '*':
            ignored_fout.write("{0}\tUnmapped.\n".format(r.qID))
        elif r.qCoverage < self.min_aln_coverage:
            ignored_fout.write("{0}\tCoverage {1:.3f} too low.\n".format(
                r.qID, r.qCoverage))
        elif r.identity < self.min_aln_identity:
            ignored_fout.write("{0}\tIdentity {1:.3f} too low.\n".format(
                r.qID, r.identity))
        else:
            records = [r]
            break
    if records is None:
        print("No valid records from {0}!".format(gmap_sam_filename),
              file=sys.stderr)
        return

    for r in reader:
        if r.sID == '*':  # guard: unmapped records have no coverage/identity
            ignored_fout.write("{0}\tUnmapped.\n".format(r.qID))
            continue
        if r.sID == records[0].sID and r.sStart < records[-1].sStart:
            print("SAM file is NOT sorted. ABORT!", file=sys.stderr)
            sys.exit(-1)
        if r.qCoverage < self.min_aln_coverage:
            ignored_fout.write("{0}\tCoverage {1:.3f} too low.\n".format(
                r.qID, r.qCoverage))
        elif r.identity < self.min_aln_identity:
            ignored_fout.write("{0}\tIdentity {1:.3f} too low.\n".format(
                r.qID, r.identity))
        elif r.sID != records[0].sID or r.sStart > max(x.sEnd for x in records):
            yield sep_by_strand(records)
            records = [r]
        else:
            records.append(r)
    yield sep_by_strand(records)
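# A small demonstration of how ClusterTree drives the per-strand regrouping in
# sep_by_clustertree, assuming bx-python's ClusterTree: the ids of inserted
# intervals come back grouped per merged region.
from bx.intervals.cluster import ClusterTree
tree = ClusterTree(0, 0)
for i, (s, e) in enumerate([(100, 200), (150, 300), (1000, 1200)]):
    tree.insert(s, e, i)
for start, end, indices in tree.getregions():
    print(start, end, indices)  # (100, 300, [0, 1]) then (1000, 1200, [2])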