def main():
    from argparse import ArgumentParser
    parser = ArgumentParser("Convert SAM to GFF3 format using BCBio GFF")
    parser.add_argument("sam_filename")
    parser.add_argument("-i", "--input_fasta", default=None,
                        help="(Optional) input fasta. If given, coverage will be calculated.")
    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        print >> sys.stderr, "Only accepts files ending in .sam. Abort!"
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff3 = prefix + '.gff3'

    q_dict = None
    if args.input_fasta is not None:
        q_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(open(args.input_fasta), 'fasta'))

    with open(output_gff3, 'w') as f:
        recs = [convert_sam_rec_to_gff3_rec(r0)
                for r0 in GMAPSAMReader(args.sam_filename, True, query_len_dict=q_dict)]
        BCBio_GFF.write(filter(lambda x: x is not None, recs), f)

    print >> sys.stderr, "Output written to {0}.".format(output_gff3)
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser("Convert SAM to collapsed GFF format")
    parser.add_argument("sam_filename")
    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        print >> sys.stderr, "Only accepts files ending in .sam. Abort!"
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff = prefix + '.collapsed.gff'

    with open(output_gff, 'w') as f:
        reader = GMAPSAMReader(args.sam_filename, True)
        for r in reader:
            if r.sID == '*':
                continue
            r.strand = r.flag.strand
            r.seqid = r.qID
            r.chr = r.sID
            r.ref_exons = r.segments
            r.start = r.sStart
            r.end = r.sEnd
            r.cds_exons = None
            write_collapseGFF_format(f, r)

    print >> sys.stderr, "Output written to {0}.".format(output_gff)
def convert_sam_to_gff3(sam_filename, output_gff3, source, q_dict=None):
    qid_index_dict = Counter()
    with open(output_gff3, 'w') as f:
        recs = [convert_sam_rec_to_gff3_rec(r0, source, qid_index_dict)
                for r0 in GMAPSAMReader(sam_filename, True, query_len_dict=q_dict)]
        BCBio_GFF.write(filter(lambda x: x is not None, recs), f)
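# A minimal usage sketch for convert_sam_to_gff3 (not part of the original code).
# The file names and the "minimap2" source label are placeholders. q_dict is
# optional: when given, GMAPSAMReader can report per-query coverage from the
# sequence lengths, as noted in the CLI help above.
from Bio import SeqIO

q_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(open("transcripts.fasta"), 'fasta'))
convert_sam_to_gff3("transcripts.sam", "transcripts.gff3", source="minimap2", q_dict=q_dict)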
def regroup_sam_to_gff(pooled_sam, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_sam: SAM file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs in (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(lambda: set())  # pbid --> set of tissues it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.iteritems():
            if k == 'id':
                continue
            if int(v) > 0:
                in_tissue[r['id']].add(k)
    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.itervalues():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None:
            handles_fafq[g] = open("{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq), 'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = fafq_dict.keys()
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]

    reader = GMAPSAMReader(pooled_sam, True)
    for r in reader:
        if r.sID == '*':
            print >> sys.stderr, "Ignore {0} because unmapped.".format(r.qID)
            continue
        m = rex_pbid.match(r.qID)
        if m is not None:
            pbid = m.group(1)
        else:
            pbid = r.qID

        # convert SAM record to GFF record type
        r.seqid = pbid
        r.chr = r.sID
        r.start, r.end = r.sStart, r.sEnd
        r.strand = r.flag.strand
        r.ref_exons = r.segments
        r.cds_exons = None

        groups_to_write_in = set()
        if pbid not in in_tissue:
            print >> sys.stderr, "WARNING: {0} does not belong to any group indicated by outgroup_dict".format(pbid)
            continue  # skip it; otherwise the lookup below would raise KeyError
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
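# A minimal usage sketch for regroup_sam_to_gff (not part of the original code).
# Paths and barcode names are placeholders. out_group_dict maps each barcode
# column of the demux count file to its output group, following the docstring's
# {'EM1':'EM', 'EM2':'EM'} example: any pbid with a positive count in EM1 or EM2
# is written to regrouped_EM_only.gff (and .fastq, since in_fafq is given).
out_group_dict = {'EM1': 'EM', 'EM2': 'EM', 'END': 'END', 'R': 'R'}
regroup_sam_to_gff("pooled.sam", "demux_counts.csv", "regrouped",
                   out_group_dict, in_fafq="pooled.fastq")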
def phase_variant(self, sam_filename, input_fa_or_fq, output_prefix, partial_ok=False):
    """
    :param sam_filename: CCS SAM filename. Can be unsorted.
    :param input_fa_or_fq: Input CCS fasta/fastq filename.
    :param output_prefix: Output prefix. Writes to xxx.log.
    :param partial_ok: default False. If True, (CCS) reads don't need to cover all SNP positions.

    For each alignment:
    1. discard it if it did not map to the expected strand
    2. discard it if it did not cover the full range of variants (unless <partial_ok> is True)
    3. discard it if it has non-called bases (outliers) at variant positions
    """
    f_log = open(output_prefix + '.log', 'w')
    seq_dict = SeqIO.to_dict(SeqIO.parse(open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq)))

    for r in GMAPSAMReader(sam_filename, True,
                           query_len_dict=dict((k, len(seq_dict[k].seq)) for k in seq_dict)):
        if r.sID == '*':
            f_log.write("Ignore {0} because: unmapped.\n".format(r.qID))
            continue
        if r.flag.strand != self.vc.expected_strand:
            f_log.write("Ignore {0} because: strand is {1}.\n".format(r.qID, r.flag.strand))
            continue  # ignore
        if not partial_ok and (r.sStart > self.min_var_pos or r.sEnd < self.max_var_pos):
            f_log.write("Ignore {0} because: aln too short, from {1}-{2}.\n".format(
                r.qID, r.sStart + 1, r.sEnd))
            continue
        i, msg = self.match_haplotype(r, str(seq_dict[r.qID].seq).upper(), partial_ok)
        if i is None:  # read is rejected for the reason listed in <msg>
            f_log.write("Ignore {0} because: {1}.\n".format(r.qID, msg))
            continue
        else:
            f_log.write("{0} phased: haplotype {1}={2}\n".format(r.qID, i, self.haplotypes[i]))
            print "{0} has haplotype {1}:{2}".format(r.qID, i, self.haplotypes[i])
            self.seq_hap_info[r.qID] = i
def process_sam_to_wig(sam_filename, output_wig, cov_threshold=200, meta_info=None):
    cov = np.zeros(REF_LENGTH)
    reader = GMAPSAMReader(sam_filename, True)
    f_sam = open(sam_filename[:sam_filename.rfind('.')] + '.metainfo.sam', 'w')
    f_sam.write(reader.header)

    bad_count = 0
    for r in reader:
        tags = ''
        if len(r.segments) > 1:
            # multi-segment (spliced) alignment: accumulate its exon coverage and tag it
            for e in r.segments:
                cov[e.start:e.end] += 1
            bad_count += 1
            tags += "\tsg:i:{0}".format(len(r.segments))  # sg: number of segments
        if meta_info is not None:
            seqid = r.qID.split('|')[0]
            if seqid in meta_info:
                tags += "\tst:A:{0}".format(meta_info[seqid]['Sequencing technology'][0])  # st: sequencing technology
            else:
                print("WARNING: Could not find {0} in metadata. Skipping.".format(seqid))
        f_sam.write(r.record_line + tags + '\n')
    f_sam.close()

    # zero out positions below the coverage threshold
    for i in range(len(cov)):
        if cov[i] < cov_threshold:
            cov[i] = 0

    f = open(output_wig, 'w')
    f.write("variableStep chrom=NC_045512v2 start=1\n")
    for i in range(len(cov)):
        f.write("{0} {1}\n".format(i + 1, cov[i]))
    f.close()
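# A minimal usage sketch for process_sam_to_wig (not part of the original code).
# Paths and the metadata record are placeholders; meta_info is assumed to map a
# sequence ID (the part of the query name before the first '|') to a record with
# a 'Sequencing technology' field, since that is how the function indexes it.
# Only the first character of that field ends up in the st:A: tag, because the
# SAM optional-field type 'A' holds a single character.
meta_info = {'SRR000001': {'Sequencing technology': 'OXFORD_NANOPORE'}}
process_sam_to_wig("covid_aln.sam", "covid_aln.wig",
                   cov_threshold=200, meta_info=meta_info)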
def sqanti_filter_lite(args):
    fafq_type = 'fasta'
    with open(args.isoforms) as h:
        if h.readline().startswith('@'):
            fafq_type = 'fastq'

    prefix = args.sqanti_class[:args.sqanti_class.rfind('.')]

    fcsv = open(prefix + '.filtered_lite_reasons.txt', 'w')
    fcsv.write("# classification: {0}\n".format(args.sqanti_class))
    fcsv.write("# isoform: {0}\n".format(args.isoforms))
    fcsv.write("# intrapriming cutoff: {0}\n".format(args.intrapriming))
    fcsv.write("# min_cov cutoff: {0}\n".format(args.min_cov))
    fcsv.write("filtered_isoform,reason\n")

    fout = open(prefix + '.filtered_lite.' + fafq_type, 'w')

    seqids_to_keep = []
    total_count = 0
    for r in DictReader(open(args.sqanti_class), delimiter='\t'):
        total_count += 1
        filter_flag, filter_msg = False, ""
        percA = float(r['perc_A_downstream_TTS']) / 100
        assert 0 <= percA <= 1
        min_cov = float(r['min_cov']) if r['min_cov'] != 'NA' else None
        num_exon = int(r['exons'])
        is_RTS = r['RTS_stage'] == 'TRUE'
        is_canonical = r['all_canonical'] == 'canonical'
        cat = CATEGORY_DICT[r['structural_category']]

        if cat in ['FSM', 'ISM', 'NIC']:
            if percA >= args.intrapriming and r['polyA_motif'] == 'NA':
                filter_flag, filter_msg = True, "IntraPriming"
        else:
            if percA >= args.intrapriming and r['polyA_motif'] == 'NA':
                filter_flag, filter_msg = True, "IntraPriming"
            elif is_RTS:
                filter_flag, filter_msg = True, "RTSwitching"
            elif (not is_canonical) and (min_cov is None or min_cov < args.min_cov):
                filter_flag, filter_msg = True, "LowCoverage/Non-Canonical"

        if not filter_flag:
            seqids_to_keep.append(r['isoform'])
        else:
            fcsv.write("{0},{1}\n".format(r['isoform'], filter_msg))

    print >> sys.stdout, "{0} isoforms read from {1}. {2} to be kept.".format(
        total_count, args.sqanti_class, len(seqids_to_keep))

    for r in SeqIO.parse(open(args.isoforms), fafq_type):
        if r.id in seqids_to_keep:
            SeqIO.write(r, fout, fafq_type)
    fout.close()
    print >> sys.stdout, "Output written to: {0}".format(fout.name)

    # write out a new .classification.txt, .junctions.txt
    outputClassPath = prefix + '.filtered_lite_classification.txt'
    with open(outputClassPath, 'w') as f:
        reader = DictReader(open(args.sqanti_class), delimiter='\t')
        writer = DictWriter(f, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for r in reader:
            if r['isoform'] in seqids_to_keep:
                writer.writerow(r)
    print >> sys.stdout, "Output written to: {0}".format(f.name)

    outputJuncPath = prefix + '.filtered_lite_junctions.txt'
    with open(outputJuncPath, 'w') as f:
        reader = DictReader(open(args.sqanti_class.replace('_classification', '_junctions')), delimiter='\t')
        writer = DictWriter(f, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for r in reader:
            if r['isoform'] in seqids_to_keep:
                writer.writerow(r)
    print >> sys.stdout, "Output written to: {0}".format(f.name)

    outputSam = prefix + '.filtered_lite.sam'
    with open(outputSam, 'w') as f:
        reader = GMAPSAMReader(args.sam_file, True)
        f.write(reader.header)
        for r in reader:
            if r.qID in seqids_to_keep:
                f.write(r.record_line + '\n')
    print >> sys.stdout, "Output written to: {0}".format(f.name)

    print >> sys.stderr, "**** Generating SQANTI report...."
    cmd = RSCRIPTPATH + " {d}/{f} {c} {j}".format(
        d=utilitiesPath, f=RSCRIPT_REPORT, c=outputClassPath, j=outputJuncPath)
    if subprocess.check_call(cmd, shell=True) != 0:
        print >> sys.stderr, "ERROR running command: {0}".format(cmd)
        sys.exit(-1)
def evaluate_alignment_sam(input_fa_or_fq, sam_filename, genome_d, output_prefix, junction_info=None):
    h1 = open(output_prefix + '.alignment_report.txt', 'w')
    h2 = open(output_prefix + '.junction_report.txt', 'w')
    w1 = DictWriter(h1, fieldnames=fieldnames_report1)
    w2 = DictWriter(h2, fieldnames=fieldnames_report2)
    w1.writeheader()
    w2.writeheader()
    # fieldnames_report1 = ['seqid', 'coverage', 'identity', 'num_sub', 'num_ins', 'num_del', 'num_exons']
    # fieldnames_report2 = ['seqid', 'donor_pos', 'donor_seq', 'donor_dist', 'acceptor_pos', 'acceptor_seq', 'acceptor_dist']

    query_len_dict = dict((r.id, len(r.seq))
                          for r in SeqIO.parse(open(input_fa_or_fq), type_fa_or_fq(input_fa_or_fq)))

    for r in GMAPSAMReader(sam_filename, True, query_len_dict=query_len_dict):
        if r.sID == '*':  # unaligned
            rec1 = {'seqid': r.qID, 'coverage': 'NA', 'identity': 'NA',
                    'num_sub': 'NA', 'num_ins': 'NA', 'num_del': 'NA', 'num_exons': 'NA'}
            w1.writerow(rec1)
            continue
        rec1 = {'seqid': r.qID,
                'coverage': r.qCoverage,
                'identity': r.identity,
                'num_sub': r.num_nonmatches - r.num_del - r.num_ins,
                'num_ins': r.num_ins,
                'num_del': r.num_del,
                'num_exons': len(r.segments)}
        w1.writerow(rec1)

        for i in xrange(0, len(r.segments) - 1):
            rec2 = {'seqid': r.qID}
            seq1, seq2 = get_donor_acceptor(genome_d, r.sID, r.flag.strand,
                                            r.segments[i].end - 1, r.segments[i + 1].start)
            if r.flag.strand == '+':
                rec2['donor_pos'] = "{0}:+:{1}".format(r.sID, r.segments[i].end - 1)
                rec2['acceptor_pos'] = "{0}:+:{1}".format(r.sID, r.segments[i + 1].start)
            else:
                rec2['donor_pos'] = "{0}:-:{1}".format(r.sID, r.segments[i + 1].start)
                rec2['acceptor_pos'] = "{0}:-:{1}".format(r.sID, r.segments[i].end - 1)
            rec2['donor_seq'] = seq1
            rec2['acceptor_seq'] = seq2
            if junction_info is not None:
                rec2['donor_dist'], rec2['acceptor_dist'] = get_closest_junction_dist(
                    junction_info, r.sID, r.flag.strand,
                    r.segments[i].end - 1, r.segments[i + 1].start)
            else:
                rec2['donor_dist'] = 'NA'
                rec2['acceptor_dist'] = 'NA'
            w2.writerow(rec2)
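# A minimal usage sketch for evaluate_alignment_sam (not part of the original code).
# File names are placeholders; genome_d is assumed to be a dict of reference name
# -> SeqRecord (e.g. from SeqIO.to_dict), which get_donor_acceptor presumably
# indexes by r.sID to pull the donor/acceptor dinucleotides.
from Bio import SeqIO

genome_d = SeqIO.to_dict(SeqIO.parse(open("genome.fasta"), 'fasta'))
evaluate_alignment_sam("isoforms.fastq", "isoforms.sam", genome_d,
                       "isoforms_eval", junction_info=None)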
def sqanti_filter_lite(args):
    fafq_type = 'fasta'
    with open(args.isoforms) as h:
        if h.readline().startswith('@'):
            fafq_type = 'fastq'

    prefix = args.sqanti_class[:args.sqanti_class.rfind('.')]

    fcsv = open(prefix + '.filtered_lite_reasons.txt', 'w')
    fcsv.write("# classification: {0}\n".format(args.sqanti_class))
    fcsv.write("# isoform: {0}\n".format(args.isoforms))
    fcsv.write("# intrapriming cutoff: {0}\n".format(args.intrapriming))
    fcsv.write("# min_cov cutoff: {0}\n".format(args.min_cov))
    fcsv.write("filtered_isoform,reason\n")

    fout = open(prefix + '.filtered_lite.' + fafq_type, 'w')

    seqids_to_keep = set()
    total_count = 0
    for r in DictReader(open(args.sqanti_class), delimiter='\t'):
        total_count += 1
        filter_flag, filter_msg = False, ""
        percA = float(r['perc_A_downstream_TTS']) / 100
        assert 0 <= percA <= 1
        runA = 0
        while runA < len(r['seq_A_downstream_TTS']):
            if r['seq_A_downstream_TTS'][runA] != 'A':
                break
            runA += 1
        min_cov = float(r['min_cov']) if r['min_cov'] != 'NA' else None
        num_exon = int(r['exons'])
        is_RTS = r['RTS_stage'] == 'TRUE'
        is_canonical = r['all_canonical'] == 'canonical'
        is_monoexonic = (num_exon == 1)
        cat = CATEGORY_DICT[r['structural_category']]

        potential_intrapriming = ((percA >= args.intrapriming or runA >= args.runAlength) and
                                  r['polyA_motif'] == 'NA' and
                                  (r['diff_to_gene_TSS'] == 'NA' or
                                   abs(int(r['diff_to_gene_TTS'])) > args.max_dist_to_known_end))

        if cat in ['FSM']:
            if potential_intrapriming:
                filter_flag, filter_msg = True, "IntraPriming"
            elif args.filter_mono_exonic and is_monoexonic:
                filter_flag, filter_msg = True, "Mono-Exonic"
        else:
            if potential_intrapriming:
                filter_flag, filter_msg = True, "IntraPriming"
            elif args.filter_mono_exonic and is_monoexonic:
                filter_flag, filter_msg = True, "Mono-Exonic"
            elif is_RTS:
                filter_flag, filter_msg = True, "RTSwitching"
            elif (not is_canonical) and (min_cov is None or min_cov < args.min_cov):
                filter_flag, filter_msg = True, "LowCoverage/Non-Canonical"

        if not filter_flag:
            seqids_to_keep.add(r['isoform'])
        else:
            fcsv.write("{0},{1}\n".format(r['isoform'], filter_msg))

    print("{0} isoforms read from {1}. {2} to be kept.".format(
        total_count, args.sqanti_class, len(seqids_to_keep)), file=sys.stdout)

    if not args.skipFaFq:
        for r in SeqIO.parse(open(args.isoforms), fafq_type):
            if r.id in seqids_to_keep:
                SeqIO.write(r, fout, fafq_type)
        fout.close()
        print("Output written to: {0}".format(fout.name), file=sys.stdout)

    # write out a new .classification.txt, .junctions.txt
    outputClassPath = prefix + '.filtered_lite_classification.txt'
    with open(outputClassPath, 'w') as f:
        reader = DictReader(open(args.sqanti_class), delimiter='\t')
        writer = DictWriter(f, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for r in reader:
            if r['isoform'] in seqids_to_keep:
                writer.writerow(r)
    print("Output written to: {0}".format(f.name), file=sys.stdout)

    if not args.skipJunction:
        outputJuncPath = prefix + '.filtered_lite_junctions.txt'
        with open(outputJuncPath, 'w') as f:
            reader = DictReader(open(args.sqanti_class.replace('_classification', '_junctions')), delimiter='\t')
            writer = DictWriter(f, reader.fieldnames, delimiter='\t')
            writer.writeheader()
            for r in reader:
                if r['isoform'] in seqids_to_keep:
                    writer.writerow(r)
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    if not args.skipGTF:
        outputGTF = prefix + '.filtered_lite.gtf'
        with open(outputGTF, 'w') as f:
            for r in collapseGFFReader(args.gtf_file):
                if r.seqid in seqids_to_keep:
                    write_collapseGFF_format(f, r)
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    if args.sam is not None:
        outputSam = prefix + '.filtered_lite.sam'
        with open(outputSam, 'w') as f:
            reader = GMAPSAMReader(args.sam, True)
            f.write(reader.header)
            for r in reader:
                if r.qID in seqids_to_keep:
                    f.write(r.record_line + '\n')
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    if args.faa is not None:
        outputFAA = prefix + '.filtered_lite.faa'
        with open(outputFAA, 'w') as f:
            for r in SeqIO.parse(open(args.faa), 'fasta'):
                if r.id in seqids_to_keep:
                    f.write(">{0}\n{1}\n".format(r.description, r.seq))
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    print("**** Generating SQANTI3 report....", file=sys.stderr)
    cmd = RSCRIPTPATH + " {d}/{f} {c} {j} {p} {d}".format(
        d=utilitiesPath, f=RSCRIPT_REPORT, c=outputClassPath, j=outputJuncPath, p="mock")
    if subprocess.check_call(cmd, shell=True) != 0:
        print("ERROR running command: {0}".format(cmd), file=sys.stderr)
        sys.exit(-1)
def minimap2_against_ref2(sam_filename, query_len_dict, ref_len_dict, is_FL, sID_starts_with_c,
                          ece_penalty=1, ece_min_len=20, same_strand_only=True,
                          max_missed_start=200, max_missed_end=50,
                          full_missed_start=50, full_missed_end=30):
    """
    Exclusion criteria:
    (1) self hit
    (2) opposite strand hit (reads should already be in the same orientation;
        can be overridden by setting <same_strand_only> to False)
    """
    for r in GMAPSAMReader(sam_filename, True,
                           query_len_dict=query_len_dict, ref_len_dict=ref_len_dict):
        missed_q = r.qStart + r.qLen - r.qEnd
        missed_t = r.sStart + r.sLen - r.sEnd

        if sID_starts_with_c:
            # because all consensus sequences should start with c<cluster_index>
            assert r.sID.startswith('c')
            if r.sID.find('/') > 0:
                r.sID = r.sID.split('/')[0]
            if r.sID.endswith('_ref'):  # probably c<cid>_ref
                cID = int(r.sID[1:-4])
            else:
                cID = int(r.sID[1:])
        else:
            cID = r.sID

        # self hit, useless!
        # opposite strand not allowed!
        if cID == r.qID or (r.flag.strand == '-' and same_strand_only):
            yield HitItem(qID=r.qID, cID=cID)
            continue

        # regardless of whether the read is full-length (is_FL),
        # the query MUST be mapped fully (based on full_missed_start/end)
        if r.qStart > full_missed_start or (r.qLen - r.qEnd) > full_missed_end:
            yield HitItem(qID=r.qID, cID=cID)
            continue  # reject; don't fall through and yield a second HitItem below

        # full-length case: allow up to max_missed_start bp of 5' not aligned
        # and max_missed_end bp of 3' not aligned
        # non-full-length case: not really tested...don't use
        if is_FL and not alignment_missed_start_end_less_than_threshold(
                r, max_missed_start, max_missed_end, full_missed_start, full_missed_end):
            yield HitItem(qID=r.qID, cID=cID)
        else:
            ece_arr = eval_sam_alignment(r)
            if alignment_has_large_nonmatch(ece_arr, ece_penalty, ece_min_len):
                yield HitItem(qID=r.qID, cID=cID)
            else:
                yield HitItem(qID=r.qID, cID=cID,
                              qStart=r.qStart, qEnd=r.qEnd,
                              missed_q=missed_q * 1. / r.qLen,
                              missed_t=missed_t * 1. / r.sLen,
                              fakecigar=r.cigar,
                              ece_arr=ece_arr)
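# A minimal consumption sketch for the generator above (not part of the original
# code). File names and length dicts are placeholders. It assumes, as the code
# above suggests, that rejected alignments are yielded as HitItems carrying only
# qID/cID, with the remaining fields (e.g. ece_arr) left at a default of None.
query_lens = {'read/1/ccs': 1520}
ref_lens = {'c0': 1498}
for hit in minimap2_against_ref2("reads_vs_consensus.sam", query_lens, ref_lens,
                                 is_FL=True, sID_starts_with_c=True):
    if hit.ece_arr is None:
        continue  # self hit, wrong strand, or alignment failed the quality checks
    print("{0} -> cluster {1}: missed_q={2:.3f}, missed_t={3:.3f}".format(
        hit.qID, hit.cID, hit.missed_q, hit.missed_t))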