def main():
    global sam_input_list, preprocess_list, output_list, leftout_list
    file = open(sam_input_file, "r")  # open SAM file
    in_sam = Reader(file)             # convert to a Reader object
    for read in in_sam:
        preprocess_list.append([read.qname, read.seq])
    print(f"Input sam file contains {len(preprocess_list)} reads\n")
    print("Leftout reads (missing either start_seg or end_seg):")
    for i in preprocess_list:
        if (start_seg not in i[1]) or (end_seg not in i[1]):
            print(i)
            leftout_list.append(f">{i[0]}\n")
            leftout_list.append(f"{i[1]}\n")
        else:
            # now define the range of the segment we need to extract
            output_list.append(f">{i[0]}\n")
            start_index = i[1].find(start_seg)
            end_index = i[1].find(end_seg) + len(end_seg)
            output_list.append(f"{i[1][start_index:end_index]}\n")
    print(f"total {len(leftout_list) // 2} leftout reads")
    print(f"total {len(output_list) // 2} output reads")
    write(output_list, fas_output_file)
    write(leftout_list, working_dir + "leftout_noSpike.fasta")
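# main() above relies on module-level names defined elsewhere in its script.
# A minimal sketch of what it assumes follows; every path, anchor sequence,
# and the write() helper below are illustrative assumptions, not the original
# configuration.
from simplesam import Reader

working_dir = "./"                                    # assumed
sam_input_file = working_dir + "input.sam"            # assumed input SAM path
fas_output_file = working_dir + "output_spike.fasta"  # assumed output path
start_seg = "ACGTACGT"                                 # assumed 5' anchor sequence
end_seg = "TGCATGCA"                                   # assumed 3' anchor sequence
preprocess_list, output_list, leftout_list = [], [], []

def write(lines, path):
    # assumed helper: dump a list of FASTA lines to `path`
    with open(path, "w") as fh:
        fh.writelines(lines)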
def change_chr(long_sam, dict_chr_split, wd, threads, verbose, type_reads):
    if "long" in type_reads:
        outfile = os.path.join(wd, 'long_reads_mapped')
    if "short" in type_reads:
        outfile = os.path.join(wd, 'short_reads_mapped')
    out_file = open(outfile, 'w')
    in_file = open(long_sam, "r")
    in_sam = Reader(in_file)
    header = in_sam.header
    sq = header['@SQ']
    dict_chr = {}
    for c in sq:
        single_elm = c.split(":")
        if single_elm[1] in dict_chr_split:
            single_elm[1] = dict_chr_split[single_elm[1]]
        change_chr = ":".join(single_elm)
        dict_chr[change_chr] = sq[c]
    header_new = OrderedDict({
        '@CO': header['@CO'],
        '@HD': header['@HD'],
        '@PG': header['@PG'],
        '@SQ': OrderedDict(dict_chr)
    })
    out_sam = Writer(out_file, header_new)
    for line in in_sam:
        if line.rname in dict_chr_split:
            name = dict_chr_split[line.rname]
            line.rname = name
        out_sam.write(line)
    out_sam.close()
    bam_final = sam_to_sorted_bam(outfile, threads, wd, verbose)
    return bam_final
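# A minimal sketch of how change_chr() might be invoked. The mapping and paths
# are hypothetical, and sam_to_sorted_bam() is assumed to be defined elsewhere
# in the same module, so this is left as a commented example rather than
# runnable code:
#
#   dict_chr_split = {"scaffold_1": "chr1", "scaffold_2": "chr2"}
#   sorted_bam = change_chr("long_reads.sam", dict_chr_split, "/tmp/wd",
#                           threads=4, verbose=True, type_reads="long")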
def sam2table(filename, inputdir, outputdir):
    autosomeXY = list(range(1, 23))
    autosomeXY.append('X')
    autosomeXY.append('Y')
    autosomeXY = ['chr' + str(x) for x in autosomeXY]
    readsname = os.path.splitext(filename)[0]
    samfile = open(inputdir + filename, 'r')
    tablefile = open(outputdir + readsname + '.txt', 'w')
    errorfile = open(outputdir + readsname + '_error.sam', 'w')
    sam = Reader(samfile)
    error = Writer(errorfile)
    tablefile.write('\t'.join([
        'ID', 'FILENAME', 'READNAME', 'CHR', 'POS', 'INS_STRAND', 'RE', 'R1',
        'R2', 'TLEN', 'CIGAR_R1', 'CIGAR_R2', 'MDFLAG_R1', 'MDFLAG_R2',
        'BARCODE', 'BARCODE_Q'
    ]) + '\n')
    id_count = 0
    bar = tnrange(int(count_samlines(inputdir + filename) / 2), desc=readsname)
    for i in bar:
        r1 = next(sam)
        r2 = next(sam)
        if r1.rname == r2.rname and r1.rname in autosomeXY:
            if ((r1.flag in [99, 83]) and (r2.flag in [147, 163]) and
                    (r1.qname.split('__abq:')[0] == r2.qname.split('__abq:')[0])):
                id_count += 1
                bc = r2.qname.split('__abq:')[2]
                bc_q = r2.qname.split('__abq:')[3]
                re_seq = r1.qname.split('__abq:')[1]
                if r1.reverse:  # strand refers to the insertion, not the read
                    strand = '+'
                    pos = r2.pos + abs(r1.tlen) - 1
                else:
                    strand = '-'
                    pos = r1.pos
                mdflag_r1 = r1._tags[1]
                mdflag_r2 = r2._tags[1]  # MD tag of R2
                tablefile.write('\t'.join([
                    str(id_count), readsname, r1.qname.split('__abq:')[0],
                    r1.rname, str(pos), strand, re_seq, r1.seq, r2.seq,
                    str(abs(r1.tlen)), r1.cigar, r2.cigar, mdflag_r1,
                    mdflag_r2, bc, bc_q
                ]) + '\n')
            else:
                error.write(r1)
                error.write(r2)
        else:
            error.write(r1)
            error.write(r2)
    samfile.close()
    tablefile.close()
    errorfile.close()
    return 0
def bamtofastq(bam, verbose):
    # despite its name, this helper writes mapped reads to FASTA, not FASTQ
    fasta = bam + ".fasta"
    in_file = open(bam, 'r')
    in_sam = Reader(in_file)
    with open(fasta, "w") as output_handle:
        for line in in_sam:
            if line.mapped:
                record = SeqRecord(Seq(str(line.seq)), name=str(line.qname))
                SeqIO.write(record, output_handle, "fasta")
    return fasta
def soft_clip(long_sam):
    in_file = open(long_sam, "r")
    in_sam = Reader(in_file)
    soft_clip_file = "test.fasta"
    with open(soft_clip_file, "w") as fh:
        for line in in_sam:
            if "S" in line.cigars[0][1]:
                if line.flag == 0 or line.flag == 16:
                    fh.write(line.rname + "\n")
                    fh.write(line.seq + "\n")
    return soft_clip_file
def main():
    if len(sys.argv) != 2:
        print("Usage: python3 add_cb_ub_tags.py BAM")
        sys.exit(1)
    in_file = open(sys.argv[1], 'r')
    sample = sys.argv[1].split(".bam", 1)[0]
    print("Sample: " + sample)
    out_file = sample + "_withtags.sam"
    in_sam = Reader(in_file)
    x = next(in_sam)
    print(x.tags)
    barcode_tag = 'CB'
    umi_tag = 'UB'
    with Reader(open(sys.argv[1])) as in_bam:
        with Writer(open(out_file, 'w'), in_bam.header) as out_sam:
            for read in in_bam:
                # print(read.qname)
                # read[umi_tag] = read.qname.split(":")[2]      # add the UMI tag
                # read[barcode_tag] = read.qname.split(":")[1]  # add the barcode tag
                read[umi_tag] = "dummy_umi"   # add the UMI tag
                read[barcode_tag] = sample    # add the barcode tag
                out_sam.write(read)
def change_chr_to_seq(short_reads, dict_ref_name, wd, threads, verbose):
    sys.stdout.write('###CHANGING CHROMOSOME NAMES IN BAM###\n')
    sam_link = os.path.join(wd, short_reads.split("/")[-1])
    if not os.path.exists(sam_link):
        os.link(short_reads, sam_link)
    outfile = sam_link + ".changed.sorted.sam"
    dict_invert_seq = {}
    for key in dict_ref_name:
        dict_invert_seq[dict_ref_name[key]] = key
    out_file = open(outfile, 'w')
    in_file = open(sam_link, "r")
    in_sam = Reader(in_file)
    header = in_sam.header
    sq = header['@SQ']
    dict_chr = {}
    for c in sq:
        single_elm = c.split(":")
        if single_elm[1] in dict_invert_seq:
            single_elm[1] = dict_invert_seq[single_elm[1]]
        change_chr = ":".join(single_elm)
        dict_chr[change_chr] = sq[c]
    header_new = OrderedDict({
        '@CO': header['@CO'],
        '@HD': header['@HD'],
        '@PG': header['@PG'],
        '@SQ': OrderedDict(dict_chr)
    })
    out_sam = Writer(out_file, header_new)
    for line in in_sam:
        if line.rname in dict_invert_seq:
            name = dict_invert_seq[line.rname]
            line.rname = name
        out_sam.write(line)
    out_sam.close()
    bam_final = sam_to_sorted_bam(outfile, threads, wd, verbose)
    sys.stdout.write('###DONE CHANGING CHROMOSOME NAMES IN BAM###\n')
    return bam_final


#if __name__ == '__main__':
#    dict_ref_name = {"seq1": "scaffold_3"}
#    change_chr_to_seq(*sys.argv[1:], dict_ref_name)
def run(arguments):
    """ read FASTQ or SAM and tabulate basic metrics
    arguments is a dictionary so that we can call this as a function """
    arguments['input'] = argparse.FileType('r')(arguments['input'])
    arguments['text'] = argparse.FileType('w')(arguments['text'])
    args = Bunch(arguments)  # convert back to an argparse namespace
    time_start = time.time()
    if args.input.name != '<stdin>':
        bsize = os.path.getsize(args.input.name)
    est_counter = int()
    sample_lengths = list()
    sample_binsizes = list()
    act_nlines = int()
    name, ext = os.path.splitext(args.input.name)
    if (args.leftlimit > 0) and (args.rightlimit > 0):
        if args.rightlimit < args.leftlimit:
            sys.exit("Left limit must be less than right limit.\n")
    if args.type:
        ext = '.' + args.type
    if ext not in ['.fq', '.fastq', '.sam', '.bam', '.gz'] and args.input.name != '<stdin>':
        sys.exit("Input file must end in either .sam, .bam, .fastq, or .fastq.gz\n")
    if args.name:
        sample_name = args.name
    else:
        sample_name = args.input.name

    # estimate the number of lines in args.input if we can
    if ext in ['.fastq', '.fq']:
        with FastqReader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry, len=mean_len, est=est_nlines))
    elif ext == '.sam':
        with Reader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry, len=mean_len, est=est_nlines))
    elif ext == '.bam':
        est_nlines = sum(bam_read_count(args.input.name))
        if not args.quiet:
            sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif ext == '.gz':
        if args.binsize:
            n = args.binsize
            est_nlines = None
            if not args.quiet:
                sys.stderr.write("Reading from gzipped file, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        else:
            sys.stderr.write("Gzipped file detected. Reading file to determine bin size (-s).\n")
            p1 = Popen(shlex.split('gzip -dc %s' % args.input.name), stdout=PIPE)
            p2 = Popen(shlex.split('wc -l'), stdin=p1.stdout, stdout=PIPE)
            est_nlines, _ = p2.communicate()
            est_nlines = int(est_nlines) // 4
            if not args.quiet:
                sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif name == '<stdin>':
        if args.binsize:
            n = args.binsize
        else:
            n = 1
        if not args.quiet:
            sys.stderr.write("Reading from <stdin>, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        est_nlines = None
    if est_nlines == 0:
        sys.exit("The input file appears empty. Please check the file for data.")
    elif est_nlines is not None:
        # set up factor for sampling bin size
        if args.binsize:
            n = args.binsize
        else:
            nf = math.floor(est_nlines / args.nreads)
            if nf >= 1:
                n = int(nf)
            else:
                n = 1
        if not args.quiet:
            sys.stderr.write("Bin size (-s) set to {binsize:n}.\n".format(binsize=n))

    if ext in ['.sam', '.bam']:
        infile = Reader(args.input)
    else:
        infile = FastqReader(args.input, ext=ext)

    read_len = defaultdict(int)
    cycle_nuc = defaultdict(lambda: defaultdict(int))
    cycle_qual = defaultdict(lambda: defaultdict(int))
    cycle_gc = defaultdict(int)
    cycle_kmers = defaultdict(lambda: defaultdict(int))
    cycle_mismatch = {
        'C': defaultdict(lambda: defaultdict(int)),
        'G': defaultdict(lambda: defaultdict(int)),
        'A': defaultdict(lambda: defaultdict(int)),
        'T': defaultdict(lambda: defaultdict(int))
    }

    if args.count_duplicates:
        try:
            from pybloom import ScalableBloomFilter
            bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        except ImportError:
            sys.exit("--count-duplicates option requires 'pybloom' package.\n")

    duplicates = 0
    percent_complete = 10
    reads = infile.subsample(n)

    for read in reads:
        if isinstance(read, Sam):
            if args.aligned_only and not read.mapped:
                continue
            elif args.unaligned_only and read.mapped:
                continue
            if read.reverse:
                seq = read.seq[::-1]
                qual = read.qual[::-1]
            else:
                seq = read.seq
                qual = read.qual
        else:
            seq = read.seq
            qual = read.qual

        # Set up limits
        if (args.leftlimit == 1) and (args.rightlimit < 0):
            pass
        elif (args.leftlimit >= 1) and (args.rightlimit > 0):
            try:
                seq = seq[args.leftlimit - 1:args.rightlimit]
                qual = qual[args.leftlimit - 1:args.rightlimit]
            except IndexError:
                act_nlines += n
                continue
        elif (args.leftlimit > 1) and (args.rightlimit < 0):
            try:
                seq = seq[args.leftlimit - 1:]
                qual = qual[args.leftlimit - 1:]
            except IndexError:
                act_nlines += n
                continue
        if len(seq) == 0:
            act_nlines += n
            continue
        cycle_gc[gc(seq)] += 1

        if args.count_duplicates:
            if seq in bloom_filter:
                duplicates += 1
            else:
                bloom_filter.add(seq)

        for i, (s, q) in enumerate(zip(seq, qual)):
            cycle_nuc[args.leftlimit + i][s] += 1
            cycle_qual[args.leftlimit + i][q] += 1
        read_len[len(qual)] += 1

        for i, kmer in enumerate(window(seq, n=args.kmer)):
            cycle_kmers[args.leftlimit + i][kmer] += 1

        if isinstance(read, Sam) and read.mapped:
            try:
                ref = read.parse_md()
                for i, (s, r) in enumerate(zip(seq, ref)):
                    if s != r:
                        try:
                            cycle_mismatch[r][args.leftlimit + i][s] += 1
                        except KeyError:
                            pass
            except KeyError:
                pass

        if est_nlines is not None:
            if (act_nlines / est_nlines) * 100 >= percent_complete:
                sys.stderr.write("Approximately {0:n}% complete at read {1:,} in {2}\n".format(percent_complete, act_nlines, time.strftime('%H:%M:%S', time.gmtime(time.time() - time_start))))
                percent_complete += 10
        act_nlines += n

    positions = [k for k in sorted(cycle_qual.keys())]
    depths = [read_len[k] for k in sorted(read_len.keys())]
    basecalls = [cycle_nuc[k].keys() for k in sorted(cycle_nuc.keys())]
    bases = set(list(itertools.chain.from_iterable(basecalls)))
    # nbasecalls = ['\t'.join([str(cycle_nuc[p].get(k, 0)) for k in bases]) for p in sorted(cycle_nuc.keys())]
    map(padbases(bases), cycle_nuc.values())
    quantile_values = [0.05, 0.25, 0.5, 0.75, 0.95]
    quantiles = []
    # replace ASCII quality with integer
    for _, v in sorted(cycle_qual.items()):
        for q in tuple(v.keys()):  # py3 keys are iterator, so build a tuple to avoid recursion
            v[ord(str(q)) - 33] = v.pop(q)
        line = [percentile(v, p) for p in quantile_values]
        quantiles.append(line)

    # build kmer set of known adapter sequences
    adapter_kmers = set()
    for adapter in all_adapter_sequences:
        for kmer in window(adapter, n=args.kmer):
            adapter_kmers.add(kmer)

    # test for nonuniform kmer profiles and calculate obs/exp
    observed_expected = dict()
    all_kmers = [cycle_kmers[k].keys() for k in sorted(cycle_kmers.keys())]
    kmers = set(list(itertools.chain.from_iterable(all_kmers)))
    bad_kmers = []
    sequenced_bases = sum((l * n for l, n in read_len.items()))
    priors = tuple(map(float, args.base_probs.split(',')))
    for kmer in kmers:
        kmer_counts = [(i, cycle_kmers[i][kmer]) for i in sorted(cycle_kmers.keys())]
        expected_fraction = reduce(mul, (p ** kmer.count(b) for b, p in zip(('A', 'T', 'C', 'G', 'N'), priors)), 1)
        expected = expected_fraction * sequenced_bases
        observed_expected[kmer] = sum((n for _, n in kmer_counts)) / expected
        slope, _, _, p_value, _ = stats.linregress(*zip(*kmer_counts))
        if abs(slope) > 2 and p_value < 0.05:
            bad_kmers.append((kmer, slope, p_value))
    bad_kmers = sorted(bad_kmers, key=lambda x: x[2])[:10]

    pos_gc = []
    for i in positions:
        try:
            pg = sum([cycle_nuc[i]['C'], cycle_nuc[i]['G']]) / sum([cycle_nuc[i]['C'], cycle_nuc[i]['G'], cycle_nuc[i]['A'], cycle_nuc[i]['T']]) * 100
        except ZeroDivisionError:
            pg = 0  # https://github.com/mdshw5/fastqp/issues/26
        pos_gc.append(pg)

    # see http://vita.had.co.nz/papers/tidy-data.pdf
    args.text.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='reads', pos='None', value=act_nlines))
    for cycle, count in read_len.items():
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_len', pos=cycle, value=count))
    for i, position in enumerate(positions):
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q05', pos=position, value=quantiles[i][0]))
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q25', pos=position, value=quantiles[i][1]))
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q50', pos=position, value=quantiles[i][2]))
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q75', pos=position, value=quantiles[i][3]))
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q95', pos=position, value=quantiles[i][4]))
    for base in bases:
        for position in positions:
            args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column=base, pos=position, value=cycle_nuc[position][base]))
    for position in positions:
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='pos_gc', pos=position, value=pos_gc[position - 1]))
    for i in range(101):
        args.text.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_gc', pos=i, value=cycle_gc[i]))
    for kmer, obs_exp in sorted(observed_expected.items(), key=lambda x: x[1]):
        args.text.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column=kmer, pos='None', value=obs_exp))
    if args.count_duplicates:
        args.text.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='duplicate', pos='None', value=duplicates / act_nlines))

    from zipfile import ZipFile
    with ZipFile(args.output + '.zip', mode='w') as zip_archive:
        fig_kw = {'figsize': (8, 6)}
        qualplot(positions, quantiles, zip_archive, fig_kw)
        median_qual = qualdist(cycle_qual.values(), zip_archive, fig_kw)
        qualmap(cycle_qual, zip_archive, fig_kw)
        depthplot(read_len, zip_archive, fig_kw)
        gcplot(positions, pos_gc, zip_archive, fig_kw)
        gcdist(cycle_gc, zip_archive, fig_kw)
        nucplot(positions, bases, cycle_nuc, zip_archive, fig_kw)
        kmerplot(positions, cycle_kmers, zip_archive, [fields[0] for fields in bad_kmers], fig_kw)
        adaptermerplot(positions, cycle_kmers, adapter_kmers, zip_archive, fig_kw)
        if isinstance(infile, Reader):
            mismatchplot(positions, cycle_mismatch, zip_archive, fig_kw)

    time_finish = time.time()
    elapsed = time_finish - time_start
    if not args.quiet:
        sys.stderr.write("There were {counts:,} reads in the file. Analysis finished in {sec}.\n".format(counts=act_nlines, sec=time.strftime('%H:%M:%S', time.gmtime(elapsed))))
        if len(bad_kmers) > 0:
            for kmer in bad_kmers:
                sys.stderr.write("KmerWarning: kmer %s has a non-uniform profile (slope = %s, p = %s).\n" % (kmer))
        if median_qual < args.median_qual:
            sys.stderr.write("QualityWarning: median base quality score is %s.\n" % median_qual)
import sys
import simplesam
from simplesam import Reader, Writer

BAMFILE = sys.argv[1]
SAMFILE = sys.argv[2]

in_file = open(BAMFILE, 'r')
in_sam = Reader(in_file)
x = next(in_sam)
x.tags

with Reader(open(BAMFILE)) as in_bam:
    with Writer(open(SAMFILE, 'w'), in_bam.header) as out_sam:
        for read in in_bam:
            read["UB"] = read.qname.split(":")[2]  # add the UMI tag
            read["CB"] = read.qname.split(":")[1]  # add the barcode tag
            out_sam.write(read)
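# Hypothetical invocation of the script above; file names are placeholders.
# The read names are expected to carry the cell barcode and UMI as the second
# and third ':'-separated fields of the query name.
#
#   python add_tags.py input.bam output_withtags.sam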
def run(args):
    """ read FASTQ or SAM and tabulate basic metrics """
    time_start = time.time()
    if args.input.name != '<stdin>':
        bsize = os.path.getsize(args.input.name)
    est_counter = int()
    sample_lengths = list()
    sample_binsizes = list()
    act_nlines = int()
    name, ext = os.path.splitext(args.input.name)
    if (args.leftlimit > 0) and (args.rightlimit > 0):
        if args.rightlimit < args.leftlimit:
            sys.exit("Left limit must be less than right limit.\n")
    if args.type:
        ext = '.' + args.type
    if ext not in ['.fq', '.fastq', '.sam', '.bam', '.gz'] and args.input.name != '<stdin>':
        sys.exit("Input file must end in either .sam, .bam, .fastq, or .fastq.gz\n")
    if args.name:
        sample_name = args.name
    else:
        sample_name = args.input.name

    # estimate the number of lines in args.input if we can
    if ext in ['.fastq', '.fq']:
        with FastqReader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry, len=mean_len, est=est_nlines))
    elif ext == '.sam':
        with Reader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length we estimate {est:,} reads in input file.\n".format(bytes=mean_bentry, len=mean_len, est=est_nlines))
    elif ext == '.bam':
        est_nlines = sum(bam_read_count(args.input.name))
        if not args.quiet:
            sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif ext == '.gz':
        if args.binsize:
            n = args.binsize
            est_nlines = None
            if not args.quiet:
                sys.stderr.write("Reading from gzipped file, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        else:
            sys.stderr.write("Gzipped file detected. Reading file to determine bin size (-s).\n")
            p1 = Popen(shlex.split('gzip -dc %s' % args.input.name), stdout=PIPE)
            p2 = Popen(shlex.split('wc -l'), stdin=p1.stdout, stdout=PIPE)
            est_nlines, _ = p2.communicate()
            est_nlines = int(est_nlines) // 4
            if not args.quiet:
                sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif name == '<stdin>':
        if args.binsize:
            n = args.binsize
        else:
            n = 1
        if not args.quiet:
            sys.stderr.write("Reading from <stdin>, bin size (-s) set to {binsize:n}.\n".format(binsize=n))
        est_nlines = None
    if est_nlines is not None:
        # set up factor for sampling bin size
        if args.binsize:
            n = args.binsize
        else:
            nf = math.floor(est_nlines / args.nreads)
            if nf >= 1:
                n = int(nf)
            else:
                n = 1
        if not args.quiet:
            sys.stderr.write("Bin size (-s) set to {binsize:n}.\n".format(binsize=n))

    if ext in ['.sam', '.bam']:
        infile = Reader(args.input)
    else:
        infile = FastqReader(args.input, ext=ext)

    read_len = defaultdict(int)
    cycle_nuc = defaultdict(lambda: defaultdict(int))
    cycle_qual = defaultdict(lambda: defaultdict(int))
    cycle_gc = defaultdict(int)
    cycle_kmers = defaultdict(lambda: defaultdict(int))
    cycle_mismatch = {'C': defaultdict(lambda: defaultdict(int)),
                      'G': defaultdict(lambda: defaultdict(int)),
                      'A': defaultdict(lambda: defaultdict(int)),
                      'T': defaultdict(lambda: defaultdict(int))}

    if args.count_duplicates:
        try:
            from pybloom import ScalableBloomFilter
            bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        except ImportError:
            sys.exit("--count-duplicates option requires 'pybloom' package.\n")

    duplicates = 0
    percent_complete = 10
    reads = infile.subsample(n)

    for read in reads:
        if isinstance(read, Sam):
            if args.aligned_only and not read.mapped:
                continue
            elif args.unaligned_only and read.mapped:
                continue
            if read.reverse:
                seq = read.seq[::-1]
                qual = read.qual[::-1]
            else:
                seq = read.seq
                qual = read.qual
        else:
            seq = read.seq
            qual = read.qual

        # Set up limits
        if (args.leftlimit == 1) and (args.rightlimit < 0):
            pass
        elif (args.leftlimit >= 1) and (args.rightlimit > 0):
            try:
                seq = seq[args.leftlimit - 1:args.rightlimit]
                qual = qual[args.leftlimit - 1:args.rightlimit]
            except IndexError:
                act_nlines += n
                continue
        elif (args.leftlimit > 1) and (args.rightlimit < 0):
            try:
                seq = seq[args.leftlimit - 1:]
                qual = qual[args.leftlimit - 1:]
            except IndexError:
                act_nlines += n
                continue
        if len(seq) == 0:
            act_nlines += n
            continue
        cycle_gc[gc(seq)] += 1

        if args.count_duplicates:
            if seq in bloom_filter:
                duplicates += 1
            else:
                bloom_filter.add(seq)

        for i, (s, q) in enumerate(zip(seq, qual)):
            cycle_nuc[args.leftlimit + i][s] += 1
            cycle_qual[args.leftlimit + i][q] += 1
        read_len[len(qual)] += 1

        for i, kmer in enumerate(window(seq, n=args.kmer)):
            cycle_kmers[args.leftlimit + i][kmer] += 1

        if isinstance(read, Sam) and read.mapped:
            try:
                ref = read.parse_md()
                for i, (s, r) in enumerate(zip(seq, ref)):
                    if s != r:
                        try:
                            cycle_mismatch[r][args.leftlimit + i][s] += 1
                        except KeyError:
                            pass
            except KeyError:
                pass

        if est_nlines is not None:
            if (act_nlines / est_nlines) * 100 >= percent_complete:
                sys.stderr.write("Approximately {0:n}% complete at read {1:,} in {2}\n".format(percent_complete, act_nlines, time.strftime('%H:%M:%S', time.gmtime(time.time() - time_start))))
                percent_complete += 10
        act_nlines += n

    positions = [k for k in sorted(cycle_qual.keys())]
    depths = [read_len[k] for k in sorted(read_len.keys())]
    basecalls = [cycle_nuc[k].keys() for k in sorted(cycle_nuc.keys())]
    bases = set(list(itertools.chain.from_iterable(basecalls)))
    # nbasecalls = ['\t'.join([str(cycle_nuc[p].get(k, 0)) for k in bases]) for p in sorted(cycle_nuc.keys())]
    map(padbases(bases), cycle_nuc.values())
    quantile_values = [0.05, 0.25, 0.5, 0.75, 0.95]
    quantiles = []
    ## replace ASCII quality with integer
    for _, v in sorted(cycle_qual.items()):
        for q in tuple(v.keys()):  ## py3 keys are iterator, so build a tuple to avoid recursion
            v[ord(str(q)) - 33] = v.pop(q)
        line = [percentile(v, p) for p in quantile_values]
        quantiles.append(line)

    # build kmer set of known adapter sequences
    adapter_kmers = set()
    for adapter in all_adapter_sequences:
        for kmer in window(adapter, n=args.kmer):
            adapter_kmers.add(kmer)

    # test for nonuniform kmer profiles and calculate obs/exp
    observed_expected = dict()
    all_kmers = [cycle_kmers[k].keys() for k in sorted(cycle_kmers.keys())]
    kmers = set(list(itertools.chain.from_iterable(all_kmers)))
    bad_kmers = []
    sequenced_bases = sum((l * n for l, n in read_len.items()))
    priors = tuple(map(float, args.base_probs.split(',')))
    for kmer in kmers:
        kmer_counts = [(i, cycle_kmers[i][kmer]) for i in sorted(cycle_kmers.keys())]
        expected_fraction = reduce(mul, (p ** kmer.count(b) for b, p in zip(('A', 'T', 'C', 'G', 'N'), priors)), 1)
        expected = expected_fraction * sequenced_bases
        observed_expected[kmer] = sum((n for _, n in kmer_counts)) / expected
        slope, _, _, p_value, _ = stats.linregress(*zip(*kmer_counts))
        if abs(slope) > 2 and p_value < 0.05:
            bad_kmers.append((kmer, slope, p_value))
    bad_kmers = sorted(bad_kmers, key=lambda x: x[2])[:10]
    pos_gc = [sum([cycle_nuc[i]['C'], cycle_nuc[i]['G']]) / sum([cycle_nuc[i]['C'], cycle_nuc[i]['G'], cycle_nuc[i]['A'], cycle_nuc[i]['T']]) * 100 for i in positions]

    # see http://vita.had.co.nz/papers/tidy-data.pdf
    sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='reads', pos='None', value=act_nlines))
    for cycle, count in read_len.items():
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_len', pos=cycle, value=count))
    for i, position in enumerate(positions):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q05', pos=position, value=quantiles[i][0]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q25', pos=position, value=quantiles[i][1]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q50', pos=position, value=quantiles[i][2]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q75', pos=position, value=quantiles[i][3]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='q95', pos=position, value=quantiles[i][4]))
    for base in bases:
        for position in positions:
            sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column=base, pos=position, value=cycle_nuc[position][base]))
    for position in positions:
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='cycle_gc', pos=position, value=cycle_gc[position]))
    for i in range(101):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(row=sample_name, column='read_gc', pos=i, value=cycle_gc[i]))
    for kmer, obs_exp in sorted(observed_expected.items(), key=lambda x: x[1]):
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column=kmer, pos='None', value=obs_exp))
    if args.count_duplicates:
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(row=sample_name, column='duplicate', pos='None', value=duplicates / act_nlines))

    from zipfile import ZipFile
    with ZipFile(args.output + '.zip', mode='w') as zip_archive:
        fig_kw = {'figsize': (8, 6)}
        qualplot(positions, quantiles, zip_archive, fig_kw)
        median_qual = qualdist(cycle_qual.values(), zip_archive, fig_kw)
        qualmap(cycle_qual, zip_archive, fig_kw)
        depthplot(read_len, zip_archive, fig_kw)
        gcplot(positions, pos_gc, zip_archive, fig_kw)
        gcdist(cycle_gc, zip_archive, fig_kw)
        nucplot(positions, bases, cycle_nuc, zip_archive, fig_kw)
        kmerplot(positions, cycle_kmers, zip_archive, [fields[0] for fields in bad_kmers], fig_kw)
        adaptermerplot(positions, cycle_kmers, adapter_kmers, zip_archive, fig_kw)
        if isinstance(infile, Reader):
            mismatchplot(positions, cycle_mismatch, zip_archive, fig_kw)

    time_finish = time.time()
    elapsed = time_finish - time_start
    if not args.quiet:
        sys.stderr.write("There were {counts:,} reads in the file. Analysis finished in {sec}.\n".format(counts=act_nlines, sec=time.strftime('%H:%M:%S', time.gmtime(elapsed))))
        if len(bad_kmers) > 0:
            for kmer in bad_kmers:
                sys.stderr.write("KmerWarning: kmer %s has a non-uniform profile (slope = %s, p = %s).\n" % (kmer))
        if median_qual < args.median_qual:
            sys.stderr.write("QualityWarning: median base quality score is %s.\n" % median_qual)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--trusted-cutoff', type=int, default=5)
    parser.add_argument("ht", type=str, help="Counting bloom filter for the reads")
    parser.add_argument("bam_file", type=str, help="bam read mapping file")
    parser.add_argument("--json", action='store_true', help="output JSON")
    args = parser.parse_args()

    ht = khmer.load_countgraph(args.ht)
    samfile = Reader(open(args.bam_file, 'r'))

    k = ht.ksize()
    seq_cnt = 0
    dropped_seqs = 0
    base_cnt = {}
    state_cnts = {}
    trans_cnts = {}
    total_bases = 0.0

    for rec in samfile:
        seq = rec.seq
        cigar = rec.cigar
        seq_cnt += 1
        if 'N' in seq:
            dropped_seqs += 1
            continue
        states = extract_cigar(rec.cigars)
        kmer = seq[:k]
        state = states[k] + trusted_str(ht.count(kmer), args.trusted_cutoff)
        state_cnts[state] = state_cnts.get(state, 0) + 1
        base_cnt[kmer[-1]] = base_cnt.get(kmer[-1], 0) + 1
        for i in range(1, len(seq) - k - 1):
            total_bases += 1
            kmer = seq[i:i + k]
            cov = ht.get(kmer)
            last_state = state
            state = states[i] + trusted_str(cov, args.trusted_cutoff)
            trans = last_state + '-' + state
            trans_cnts[trans] = trans_cnts.get(trans, 0) + 1
            state_cnts[state] = state_cnts.get(state, 0) + 1
            base_cnt[kmer[-1]] = base_cnt.get(kmer[-1], 0) + 1

    if not args.json:
        print("kmer size=", k)
        print("seq count=", seq_cnt, "dropped seqs=", dropped_seqs)
        print("base counts=", base_cnt)
        print("state counts=", state_cnts)
        print("trans counts=", trans_cnts)

    if not args.json:
        trans_probs = collections.defaultdict(float)
        for trans in sorted(trans_cnts.keys()):
            start_state = trans.split('-')[0]
            trans_probs[trans] = trans_cnts[trans] / float(state_cnts[start_state])
            print('{0}\t{1:0.7f}'.format(trans, trans_probs[trans]))

        print('static double trans_default[] = { log2{0:0.7f}, log2{1:0.7f}, '
              'log2{2:0.7f}, log2{3:0.7f}, log2{4:0.7f}, '
              'log2{5:0.7f},'.format(trans_probs['M_t-M_t'], trans_probs['M_t-Ir_t'],
                                     trans_probs['M_t-Ig_t'], trans_probs['M_t-M_u'],
                                     trans_probs['M_t-Ir_u'], trans_probs['M_t-Ig_u']))
        print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f},'.format(
            trans_probs['Ir_t-M_t'], trans_probs['Ir_t-Ir_t'],
            trans_probs['Ir_t-M_u'], trans_probs['Ir_t-Ir_u']))
        print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f},'.format(
            trans_probs['Ig_t-M_t'], trans_probs['Ig_t-Ig_t'],
            trans_probs['Ig_t-M_u'], trans_probs['Ig_t-Ig_u']))
        print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f}, '
              'log2{4:0.7f}, log2{5:0.7f},'.format(
                  trans_probs['M_u-M_t'], trans_probs['M_u-Ir_t'],
                  trans_probs['M_u-Ig_t'], trans_probs['M_u-M_u'],
                  trans_probs['M_u-Ir_u'], trans_probs['M_u-Ig_u']))
        print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f},'.format(
            trans_probs['Ir_u-M_t'], trans_probs['Ir_u-Ir_t'],
            trans_probs['Ir_u-M_u'], trans_probs['Ir_u-Ir_u']))
        print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f},'.format(
            trans_probs['Ig_u-M_t'], trans_probs['Ig_u-Ig_t'],
            trans_probs['Ig_u-M_u'], trans_probs['Ig_u-Ig_u']))
        print('};')
    else:
        params = {
            'scoring_matrix': [-0.06642736173897607, -4.643856189774724,
                               -7.965784284662087, -9.965784284662087],
            'transition_probabilities': (
                (log(trans_cnts['M_t-M_t'] / float(state_cnts['M_t']), 2),
                 log(trans_cnts['M_t-Ir_t'] / float(state_cnts['M_t']), 2),
                 log(trans_cnts['M_t-Ig_t'] / float(state_cnts['M_t']), 2),
                 log(trans_cnts['M_t-M_u'] / float(state_cnts['M_t']), 2),
                 log(trans_cnts['M_t-Ir_u'] / float(state_cnts['M_t']), 2),
                 log(trans_cnts['M_t-Ig_u'] / float(state_cnts['M_t']), 2)),
                (log(trans_cnts['Ir_t-M_t'] / float(state_cnts['Ir_t']), 2),
                 log(trans_cnts['Ir_t-Ir_t'] / float(state_cnts['Ir_t']), 2),
                 log(trans_cnts['Ir_t-M_u'] / float(state_cnts['Ir_t']), 2),
                 log(trans_cnts['Ir_t-Ir_u'] / float(state_cnts['Ir_t']), 2)),
                (log(trans_cnts['Ig_t-M_t'] / float(state_cnts['Ig_t']), 2),
                 log(trans_cnts['Ig_t-Ig_t'] / float(state_cnts['Ig_t']), 2),
                 log(trans_cnts['Ig_t-M_u'] / float(state_cnts['Ig_t']), 2),
                 log(trans_cnts['Ig_t-Ig_u'] / float(state_cnts['Ig_t']), 2)),
                (log(trans_cnts['M_u-M_t'] / float(state_cnts['M_u']), 2),
                 log(trans_cnts['M_u-Ir_t'] / float(state_cnts['M_u']), 2),
                 log(trans_cnts['M_u-Ig_t'] / float(state_cnts['M_u']), 2),
                 log(trans_cnts['M_u-M_u'] / float(state_cnts['M_u']), 2),
                 log(trans_cnts['M_u-Ir_u'] / float(state_cnts['M_u']), 2),
                 log(trans_cnts['M_u-Ig_u'] / float(state_cnts['M_u']), 2)),
                (log(trans_cnts['Ir_u-M_t'] / float(state_cnts['Ir_u']), 2),
                 log(trans_cnts['Ir_u-Ir_t'] / float(state_cnts['Ir_u']), 2),
                 log(trans_cnts['Ir_u-M_u'] / float(state_cnts['Ir_u']), 2),
                 log(trans_cnts['Ir_u-Ir_u'] / float(state_cnts['Ir_u']), 2)),
                (log(trans_cnts['Ig_u-M_t'] / float(state_cnts['Ig_u']), 2),
                 log(trans_cnts['Ig_u-Ig_t'] / float(state_cnts['Ig_u']), 2),
                 log(trans_cnts['Ig_u-M_u'] / float(state_cnts['Ig_u']), 2),
                 log(trans_cnts['Ig_u-Ig_u'] / float(state_cnts['Ig_u']), 2)))
        }
        print(json.dumps(params, sort_keys=True, indent=4, separators=(',', ': ')))
def main(ext_args=None):
    from transcoorder import __version__
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('gtf', type=str, help='GTF file containing transcripts')
    parser.add_argument('bam', type=argparse.FileType('r'),
                        help="SAM or BAM files aligned to transcriptome")
    parser.add_argument('fasta', type=Fasta,
                        help="FASTA format assembly corresponding to GTF")
    parser.add_argument('-o', '--out', type=argparse.FileType('w'), default='-',
                        help="output file for genomic SAM (default: stdout)")
    parser.add_argument('-t', '--tag-name', type=str, default='ZT',
                        help="SAM tag name for storing transcript identifier. default: %(default)s")
    parser.add_argument('--debug', action="store_true", help="enable debugging")
    parser.add_argument('--version', action="version", version=__version__,
                        help="display version number")
    # print help usage if no arguments are supplied
    if len(sys.argv) == 1 and not ext_args:
        parser.print_help()
        sys.exit(1)
    elif ext_args:
        args = parser.parse_args(ext_args)
    else:
        args = parser.parse_args()

    try:
        db = FeatureDB(args.gtf + '.db')
    except ValueError:
        sys.stderr.write("building sqlite database for %s..." % args.gtf)
        db = create_db(args.gtf, args.gtf + '.db',
                       disable_infer_transcripts=True,
                       disable_infer_genes=True)

    header = build_sam_header_from_fasta(args.fasta)
    with Reader(args.bam) as bamfile, Writer(args.out, header) as outfile:
        try:
            read_count = len(bamfile)
        except NotImplementedError:
            read_count = None
        with tqdm(total=read_count, unit='read') as pbar:
            for read in bamfile:
                features = cache_gtf_features(db, read.rname)
                if features is None:
                    if args.debug:
                        sys.stderr.write("%s not found in %s\n" % (read.rname, args.gtf))
                else:
                    transcript, genome_offset, transcript_coords = features
                    read = transcript_sam_to_genomic_sam(read, transcript, genome_offset, transcript_coords)
                    if read is not None:
                        outfile.write(read)
                pbar.update(1)
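# Hypothetical programmatic invocation of main() above via ext_args; the file
# names are placeholders, and the GTF, BAM, and FASTA must describe the same
# transcriptome assembly:
#
#   main(ext_args=['transcripts.gtf', 'aligned.bam', 'assembly.fa', '-o', 'genomic.sam'])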
from simplesam import Reader, Writer
import inspect
import sys, os, fileinput, string

in_file = open(sys.argv[1], 'r')
in_sam = Reader(in_file)
out_file = open('full_ecoli_mapped_q10_truth.txt', 'w')
# out_sam = Writer(out_file)

x = next(in_sam)
try:
    while x.qname != '':
        #if(x.reverse):
        #    out_file.write("+" + " ")
        #else:
        #    out_file.write("-" + " ")
        out_file.write(x.rname + " ")
        out_file.write(x.qname + " ")
        out_file.write(str(x.pos) + " ")
        out_file.write(str(x.pos + len(x.seq)) + "\n")
        #print str(type(x))
        x = next(in_sam)
except StopIteration:
    print("Long read alignment ground truth generated")

in_file.close()
out_file.close()
def analyzeReads(myIntermedFileName, myRefSeq, myRefCGindices):
    locusUMIreadsDict = {}  # locus : UMI : all reads for that UMI
    readDict = {}           # fragName : totalReads
    # initialize readDict
    for frag in myRefSeq:
        readDict[frag] = 0
    infile = open(myIntermedFileName, 'r')
    samFile = Reader(infile)
    for forwardRead in samFile:
        # make sure reads map to a reference seq and are paired
        if forwardRead.mapped and forwardRead.paired:
            reverseRead = next(samFile)
            # UMI...
            if forwardRead.rname == reverseRead.rname:
                # ADDED 11/20/19: make sure the forward and reverse read map to the same fragment
                # make sure read name and UMI are the same
                if forwardRead.qname != reverseRead.qname:
                    print("READS NOT PAIRED, out of sync")
                    print("forwardRead: ", forwardRead.qname, "reverseRead: ", reverseRead.qname)
                    print("EXITING...")
                    sys.exit()
                fUMI = forwardRead.qname.split("+")[1]
                rUMI = reverseRead.qname.split("+")[1]
                # if "N" not in fUMI and "N" not in rUMI:
                #     ADDED 11/21/19 - prevents analysis of reads that have "N"s in the UMIs
                forwardMethylation = Read(forwardRead, myRefSeq, myRefCGindices)
                reverseMethylation = Read(reverseRead, myRefSeq, myRefCGindices)
                myLocus = forwardMethylation.locus
                # keep track of the total number of reads
                readDict[myLocus] += 1
                locusCGindices = myRefCGindices[myLocus]
                myUMI = forwardMethylation.umi
                consensusIndexString = ""
                consensusMethString = ""
                consensusMethIndices = []
                consensusUnmethIndices = []
                for index in locusCGindices:
                    if index >= forwardMethylation.startCoord and index < reverseMethylation.startCoord and index in forwardMethylation.methIndices:
                        consensusIndexString += (str(index) + "Z")
                        consensusMethString += "Z"
                        consensusMethIndices.append(index)
                    elif index > forwardMethylation.endCoord and index <= reverseMethylation.endCoord and index in reverseMethylation.methIndices:
                        consensusIndexString += (str(index) + "Z")
                        consensusMethString += "Z"
                        consensusMethIndices.append(index)
                    elif (index >= forwardMethylation.startCoord and index >= reverseMethylation.startCoord
                          and index <= forwardMethylation.endCoord and index <= reverseMethylation.endCoord
                          and index in forwardMethylation.methIndices and index in reverseMethylation.methIndices):
                        # index >= forwardMethylation.startCoord was in here twice;
                        # the second occurrence was changed to index >= reverseMethylation.startCoord
                        consensusIndexString += (str(index) + "Z")
                        consensusMethString += "Z"
                        consensusMethIndices.append(index)
                    else:
                        consensusIndexString += (str(index) + "z")
                        consensusMethString += "z"
                        consensusUnmethIndices.append(index)
                if myLocus in locusUMIreadsDict:
                    if myUMI in locusUMIreadsDict[myLocus]:
                        locusUMIreadsDict[myLocus][myUMI].append(consensusMethString)
                    else:
                        locusUMIreadsDict[myLocus][myUMI] = [consensusMethString]
                else:
                    readList = [consensusMethString]
                    myUMIdict = {myUMI: readList}
                    locusUMIreadsDict[myLocus] = myUMIdict
    myResults = ReadsChunkResults(myIntermedFileName, readDict, locusUMIreadsDict)
    return myResults
      file=tsv)
tsv.flush()

# Check for missing genes
missed = gene_ids.difference(processed)
if len(missed) > 0:
    print(f"No FASTA sources for the following IDs: {', '.join(missed)}\n" +
          'These genes are absent from genes FASTA, but may be used for ' +
          'read mapping if data for them is available.',
          file=stderr)

# TODO: load SAM data for Illumina reads

# Loading PacBio reads
print('Loading PacBio hits...', file=stderr)
pb_reader = Reader(open(args.p))
mapped_hits = {x: [] for x in regions_of_interest}
read_counts = {x: 0 for x in gene_ids}
mapped_hit_names = {x: [] for x in gene_ids}
for hit in pb_reader:
    if hit.rname in features_by_source:
        hit_coord = (hit.pos, hit.pos + len(hit))
        for gene in features_by_source[hit.rname]:
            if overlap((gene.start, gene.end), hit_coord):
                mapped_hits[hit.rname].append(hit_coord)
                read_counts[gene.get_id_prefix()] += 1
                mapped_hit_names[gene.get_id_prefix()].append(hit.qname)
if args.hit_ids:
    for gene in mapped_hit_names:
        if len(mapped_hit_names[gene]) > 0:
            for read_id in mapped_hit_names[gene]: