def main(argv=None):
    """Summarise a delimited table read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Without ``--method`` a summary (one row per value returned by
    ``compute_table_summary``) is written to stdout.  Each requested
    ``row-describe``/``column-describe`` method writes the output of
    ``pandas.DataFrame.describe`` in long format to its own output file.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter to separate columns [%default]")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=["row-describe", "column-describe"],
                      help="additional methods to apply [%default]")

    parser.set_defaults(
        delimiter="\t",
        methods=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.methods:
        options.methods = ["summary"]

    # the separator must be passed by keyword: recent pandas versions
    # accept only the file argument of read_csv positionally
    table = pandas.read_csv(options.stdin, sep=options.delimiter)

    options.stdout.write("metric\tcount\tpercent\tinfo\n")

    for method in options.methods:
        # output section names use underscores instead of dashes
        label = re.sub("-", "_", method)

        if method == "summary":
            for category, count, denominator, info in \
                    compute_table_summary(table):
                percent = iotools.pretty_percent(count, denominator, na="")
                options.stdout.write(
                    "\t".join(map(str, (category, count, percent, info))) +
                    "\n")
            continue

        if method == "column-describe":
            df = table.describe().T.stack()
        elif method == "row-describe":
            df = table.T.describe().stack()
        else:
            continue

        with E.open_output_file(label) as outf:
            outf.write("label\tcategory\tvalue\n")
            # the header is written by hand above; suppress the header
            # row that Series.to_csv would otherwise add
            df.to_csv(outf, sep="\t", header=False)

    E.stop()
def printValues(contig, max_size, window_size, values, options):
    """Write per-window feature values for one contig to an output file.

    One row is written per entry of *values*.  Rows are labelled with a
    running position that starts at 0 and grows by *window_size*; the
    relative position is that position divided by *max_size*.  For every
    feature the absolute value and the value relative to the feature's
    maximum over all rows are written.

    Arguments
    ---------
    contig : string
        Section name passed to ``E.open_output_file``.
    max_size : number
        Divisor for the relative position column.
    window_size : int
        Increment of the position between consecutive rows.
    values : list
        One indexable row per window with one number per entry of
        ``options.features``.
    options : object
        Must provide ``features`` (column names) and ``value_format``
        (%-format applied to the relative columns).
    """
    features = options.features
    value_format = options.value_format

    # Per-feature maxima used for scaling.  An empty *values* yields a
    # header-only file instead of failing inside max().
    if values:
        maxima = [float(max(row[idx] for row in values))
                  for idx in range(len(features))]
    else:
        maxima = []

    # the context manager closes the file even if a row fails to format
    with E.open_output_file(contig, "w") as outfile:
        outfile.write("abs_pos\trel_pos")
        for feature in features:
            outfile.write("\tabs_%s\trel_%s" % (feature, feature))
        outfile.write("\n")

        position = 0
        for row in values:
            outfile.write("%i\t" % position)
            outfile.write(value_format % (float(position) / max_size))
            for idx, maximum in enumerate(maxima):
                # a feature whose maximum is 0 has no meaningful relative
                # value - report 0 rather than dividing by zero
                relative = row[idx] / maximum if maximum else 0.0
                outfile.write("\t%i\t%s" %
                              (row[idx], value_format % relative))
            outfile.write("\n")
            position += window_size
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.

    Every contig reported by ``fasta.getContigSizes`` is represented as a
    list holding one code per base, initialised to *default_code*.
    *iterator* yields lists of gtf entries (one list per transcript).
    Transcripts whose source is in ``MAP_ENSEMBL`` are painted with the
    mapped code; ``protein_coding`` transcripts are painted with UTR, CDS
    and intron codes via ``addSegments``/``addCDS``/``addIntrons`` and
    their CDS junctions are written to the ``junctions`` output file.
    Other sources are left untouched.

    Finally the per-code counts are written to the ``counts`` output file
    and the annotation of every contig is written to ``options.stdout``
    in fasta format.

    *options* must provide ``report_step``, ``max_frameshift_length`` and
    ``stdout``.
    """
    contig_sizes = fasta.getContigSizes(with_synonyms=False)

    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))

    # one list entry per base (lists are used for py3 compatibility)
    annotations = {}
    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # splice junctions are written while the transcripts are processed;
    # the context manager closes the file even if a transcript fails
    with E.open_output_file("junctions") as outfile_junctions:
        outfile_junctions.write(
            "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")

        for gtfs in iterator:
            counter.input += 1

            if counter.input % options.report_step == 0:
                E.info("iteration %i" % counter.input)

            try:
                contig = fasta.getToken(gtfs[0].contig)
            except KeyError:
                E.warn("contig %s not found - annotation ignored" %
                       gtfs[0].contig)
                counter.skipped_contig += 1
                continue

            lcontig = fasta.getLength(contig)

            # make sure that exons are sorted by coordinate
            gtfs.sort(key=lambda x: x.start)

            is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
            source = gtfs[0].source
            track = annotations[contig]

            # process non-coding data
            if source in MAP_ENSEMBL:
                code = MAP_ENSEMBL[source]
                intervals = [(x.start, x.end) for x in gtfs]
                addSegments(track, intervals, is_positive, code)
                continue

            if source != "protein_coding":
                continue

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds_entries = [x for x in gtfs if x.feature == "CDS"]
            cds = [(x.start, x.end) for x in cds_entries]

            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            cds_start, cds_end = cds[0][0], cds[-1][1]
            UTR5 = [x for x in exons if x[1] < cds_start]
            UTR3 = [x for x in exons if x[0] >= cds_end]
            if not is_positive:
                # coordinates are forward-strand based, so the UTRs swap
                # roles for transcripts on the reverse strand
                UTR5, UTR3 = UTR3, UTR5

            addSegments(track, UTR5, is_positive, "u")
            addIntrons(track, UTR5, is_positive,
                       options.max_frameshift_length)
            addSegments(track, UTR3, is_positive, "v")
            addIntrons(track, UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame; a copy is passed so that the
            # junction output below always sees the sorted entries
            addCDS(track, list(cds_entries), is_positive)

            # add introns between CDS
            addIntrons(track, cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions.  Coordinates are corrected for
            # 1-past end coordinates to point between residues within
            # CDS; on the negative strand they are counted from the end
            # of the contig and the CDS are visited in reverse order.
            if is_positive:
                out_positive = "+"
                boundaries = [(c.start, c.end - 1, c) for c in cds_entries]
            else:
                out_positive = "-"
                boundaries = [(lcontig - c.end, lcontig - c.start - 1, c)
                              for c in reversed(cds_entries)]

            previous_end = boundaries[0][1]
            for junction_start, junction_end, c in boundaries[1:]:
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (
                    contig,
                    out_positive,
                    previous_end,
                    junction_start,
                    c.frame,
                    c.gene_id,
                    c.transcript_id,
                ))
                previous_end = junction_end

    E.info("finished reading genes: %s" % str(counter))

    E.info("started counting")
    with E.open_output_file("counts") as outfile:
        outputCounts(outfile, annotations)

    E.info("started output")
    for k in sorted(annotations.keys()):
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Compares the alignments in an input BAM file against those in a
    reference BAM file read by read.  Per-alignment comparisons go to
    the ``mapped`` output file, reads without a usable alignment in the
    input go to the ``missing`` output file and the counters are written
    to the ``summary`` output file.

    Raises ValueError if a BAM file name is missing and OSError if a
    given file does not exist.
    """
    # resolve the default at call time, not at import time
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-bam", dest="input_bam_file", type="string",
        help="input bam file")

    parser.add_option(
        "-f", "--reference-bam", dest="reference_bam_file", type="string",
        help="reference BAM file [%default]")

    parser.add_option(
        "-q", "--query-name-regex", dest="query_name_regex", type="string",
        help="regular expression to apply on query name. "
        "Potentially required to match samtools sort order and should "
        "evaluate to an integer [%default]")

    parser.set_defaults(
        input_bam_file=None,
        reference_bam_file=None,
        query_name_regex=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 2:
        options.input_bam_file = args[0]
        options.reference_bam_file = args[1]

    if options.input_bam_file is None:
        raise ValueError("please supply a BAM file as input")

    if options.reference_bam_file is None:
        raise ValueError("please supply a BAM file as reference")

    # update paths to absolute
    options.input_bam_file = os.path.abspath(options.input_bam_file)
    options.reference_bam_file = os.path.abspath(options.reference_bam_file)

    if not os.path.exists(options.input_bam_file):
        raise OSError("input bam file {} does not exist".format(
            options.input_bam_file))

    if not os.path.exists(options.reference_bam_file):
        raise OSError("reference bam file {} does not exist".format(
            options.reference_bam_file))

    # optional conversion of the query name into an integer sort key
    qname_fn = None
    if options.query_name_regex:
        rx = re.compile(options.query_name_regex)

        def extract_query(x):
            return int(rx.search(x).groups()[0])

        qname_fn = extract_query

    counter = E.Counter()

    # all four files are closed when the block is left, also on errors
    with pysam.AlignmentFile(options.input_bam_file) as bam_in, \
            pysam.AlignmentFile(options.reference_bam_file) as ref_in, \
            E.open_output_file("mapped") as outf_mapped, \
            E.open_output_file("missing") as outf_missing:

        outf_mapped.write("\t".join(
            ["read",
             "length",
             "status",
             "overlap",
             "comp_contig",
             "comp_start",
             "comp_end",
             "ref_contig",
             "ref_start",
             "ref_end",
             "shared_misaligned",
             "shared_aligned",
             "shared_insertion",
             "shared_deletion",
             "comp_aligned",
             "comp_insertion",
             "comp_deletion",
             "ref_aligned",
             "ref_insertion",
             "ref_deletion"]) + "\n")

        outf_missing.write("\t".join(
            ["read", "length", "status", "aligned",
             "insertion", "deletion"]) + "\n")

        def write_missing(read, status):
            # row in the "missing" table describing the reference read
            pairs = set(read.get_aligned_pairs())
            outf_missing.write("\t".join(
                map(str, (read.query_name,
                          read.query_length,
                          status) + count_pairs(pairs))) + "\n")

        read_pairs = iterate_read_pairs(
            bam_in.fetch(until_eof=True),
            ref_in.fetch(until_eof=True),
            qname_fn=qname_fn)

        for reads_cmp, read_ref in group_pairs(read_pairs):

            if len(reads_cmp) == 0:
                counter.missing += 1
                write_missing(read_ref, "missing")
                continue

            if len(reads_cmp) > 1:
                # multiple matches
                counter.multi_mapping += 1
                prefix = "multi_"
            else:
                counter.unique_mapping += 1
                prefix = "unique_"

            is_mapped = False
            for read_cmp in reads_cmp:

                counter.paired += 1

                if read_cmp.is_unmapped:
                    counter.unmapped += 1
                    write_missing(read_ref, "unmapped")
                    continue

                overlap = max(
                    0,
                    min(read_cmp.reference_end, read_ref.reference_end) -
                    max(read_cmp.reference_start, read_ref.reference_start))

                pairs_cmp = set(read_cmp.get_aligned_pairs())
                pairs_ref = set(read_ref.get_aligned_pairs())
                shared_cmp = pairs_cmp.intersection(pairs_ref)
                unique_cmp = pairs_cmp.difference(pairs_ref)
                # pairs present only in the compared alignment in which
                # both members are set
                misaligned = len([x for x, y in unique_cmp
                                  if x is not None and y is not None])

                if read_cmp.reference_name != read_ref.reference_name or \
                        overlap == 0:
                    status = "mismapped"
                else:
                    counter.overlap += 1
                    status = "mapped"
                    is_mapped = True

                outf_mapped.write("\t".join(
                    map(str, (read_cmp.query_name,
                              read_cmp.query_length,
                              prefix + status,
                              overlap,
                              read_cmp.reference_name,
                              read_cmp.reference_start,
                              read_cmp.reference_end,
                              read_ref.reference_name,
                              read_ref.reference_start,
                              read_ref.reference_end,
                              misaligned) +
                        count_pairs(shared_cmp) +
                        count_pairs(pairs_cmp) +
                        count_pairs(pairs_ref))) + "\n")

            # per-read status: mapped if any of its alignments overlapped
            # the reference alignment on the same contig
            if is_mapped:
                status = "mapped"
            else:
                status = "mismapped"
            counter[prefix + status] += 1

    with E.open_output_file("summary") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a BAM file (first positional argument or stdin), collects
    counts with ``bam2stats_count`` and writes a table with the columns
    ``category``, ``counts``, ``percent`` and ``of`` to stdout.
    Distributions of NM, NH and mapping quality and, if available,
    per-read summaries are written to separate output files.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--mask-bed-file", "--mask-gff-file", dest="filename_bed",
        type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f", "--ignore-masked-reads", dest="ignore_masked_reads",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i", "--num-reads", dest="input_reads", type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d", "--output-details", dest="output_details", action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option(
        "--output-readmap", dest="output_readmap", action="store_true",
        help="output map between read name and "
        "md5/base64 encoded short name[%default]")

    parser.add_option(
        "--add-alignment-details", dest="add_alignment_details",
        action="store_true",
        help="add alignment details to per-read details. Implies --output-details "
        "[%default]")

    parser.add_option(
        "-q", "--fastq-file", dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.add_option(
        "--basic-counts", dest="detailed_count", action="store_false",
        help="perform basic counting and do not compute per read stats. "
        "This is more memory efficient and faster stats computation, "
        "but only a summary counts table is output [%default]")

    parser.set_defaults(
        filename_bed=None,
        ignore_masked_reads=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        detailed_count=True,
        output_details=False,
        output_readmap=False,
        add_alignment_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.filename_bed:
        bed_mask = GTF.readAndIndex(
            GTF.iterator(iotools.open_file(options.filename_bed)))
    else:
        bed_mask = None

    if options.add_alignment_details:
        options.output_details = True

    is_stdin = True
    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
        if args[0] != "-":
            is_stdin = False
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")
        if options.stdin != "-":
            is_stdin = False

    if options.output_details:
        outfile_details = E.open_output_file("details", "w")
    else:
        outfile_details = None

    if options.output_readmap:
        outfile_readmap = E.open_output_file("readmap", "w")
    else:
        outfile_readmap = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all, mapq, mapq_all, max_hi, details_df) = \
        bam2stats_count(pysam_in,
                        bed_mask=bed_mask,
                        ignore_masked_reads=options.ignore_masked_reads,
                        is_stdin=is_stdin,
                        filename_fastq=options.filename_fastq,
                        outfile_details=outfile_details,
                        add_alignment_details=options.add_alignment_details,
                        outfile_readmap=outfile_readmap,
                        detailed_count=options.detailed_count)

    # nothing below writes to the per-read files, so release them here
    for handle in (outfile_details, outfile_readmap):
        if handle is not None:
            handle.close()

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected"
               % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        # one table row: counts plus percentage of *denominator*
        percent = iotools.pretty_percent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text,
                                         numerator,
                                         percent,
                                         base))

    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs,
           "alignments_total",
           counter.alignments_input,
           counter.alignments_input,
           "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.stop()
        return

    _write(outs,
           "alignments_mapped",
           nalignments_mapped,
           counter.alignments_input,
           'alignments_total')
    _write(outs,
           "alignments_unmapped",
           nalignments_unmapped,
           counter.alignments_input,
           'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.stop()
        return

    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs,
               'alignments_' + flag,
               counts,
               nalignments_mapped,
               'alignments_mapped')

    if options.filename_bed:
        _write(outs,
               "alignments_masked",
               counter.alignments_masked,
               nalignments_mapped,
               'alignments_mapped')
        _write(outs,
               "alignments_notmasked",
               counter.alignments_notmasked,
               nalignments_mapped,
               'alignments_mapped')

    _write(outs,
           "alignments_filtered",
           counter.alignments_filtered,
           nalignments_mapped,
           "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs,
               "alignments_duplicates",
               counter.alignments_duplicates,
               counter.alignments_filtered,
               normby)
        # unique = filtered - duplicates (the counter name was
        # previously misspelt here)
        _write(outs,
               "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered,
               normby)

    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq or not is_stdin:
        nreads_total = counter.total_read
        _write(outs,
               "reads_total",
               counter.total_read,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_unmapped",
               counter.total_read_is_unmapped,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_mapped",
               counter.total_read_is_mapped,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_missing",
               counter.total_read_is_missing,
               nreads_total,
               'reads_total')
        _write(outs,
               "reads_mapped_unique",
               counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped,
               'reads_mapped')
        _write(outs,
               "reads_multimapping",
               counter.total_read_is_mmap,
               counter.total_read_is_mapped,
               'reads_mapped')
        _write(outs,
               "reads_mapped_supplementary",
               counter.total_read_has_supplementary,
               counter.total_read_is_mapped,
               'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(nalignments_mapped,
                                                         nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped

        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

    pysam_in.close()

    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            pair_rows = (
                ("pairs_total", counter.total_pairs),
                ("pairs_mapped", pairs_mapped),
                ("pairs_unmapped", counter.total_pair_is_unmapped),
                ("pairs_proper_unique", counter.total_pair_is_proper_uniq),
                ("pairs_incomplete_unique",
                 counter.total_pair_is_incomplete_uniq),
                ("pairs_incomplete_multimapping",
                 counter.total_pair_is_incomplete_mmap),
                ("pairs_proper_duplicate",
                 counter.total_pair_is_proper_duplicate),
                ("pairs_proper_multimapping",
                 counter.total_pair_is_proper_mmap),
                ("pairs_not_proper_unique",
                 counter.total_pair_not_proper_uniq),
                ("pairs_other", counter.total_pair_is_other),
            )
            for text, value in pair_rows:
                outs.write("%s\t%i\t%5.2f\tpairs_total\n" %
                           (text, value,
                            100.0 * value / counter.total_pairs))

            for mate in ("read1", "read2"):
                mate_total = getattr(counter, "total_%s" % mate)
                mate_mapped = getattr(counter, "total_%s_is_mapped" % mate)
                _write(outs,
                       mate + "_total",
                       mate_total,
                       mate_total,
                       mate + '_total')
                _write(outs,
                       mate + "_unmapped",
                       getattr(counter, "total_%s_is_unmapped" % mate),
                       mate_total,
                       mate + '_total')
                _write(outs,
                       mate + "_mapped",
                       mate_mapped,
                       mate_total,
                       mate + '_total')
                _write(outs,
                       mate + "_mapped_unique",
                       getattr(counter, "total_%s_is_mapped_uniq" % mate),
                       mate_mapped,
                       mate + '_mapped')
                # NOTE(review): this row is labelled "reads_multimapping"
                # for both mates (not read1_/read2_multimapping); kept
                # unchanged for downstream parsers - confirm intent.
                _write(outs,
                       "reads_multimapping",
                       getattr(counter, "total_%s_is_mmap" % mate),
                       mate_mapped,
                       mate + '_mapped')
                # NOTE(review): the percentage is computed relative to
                # the mapped count although the base column says
                # "<mate>_total"; kept unchanged - confirm intent.
                _write(outs,
                       mate + "_missing",
                       getattr(counter, "total_%s_is_missing" % mate),
                       mate_mapped,
                       mate + '_total')
        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs,
                   "pairs_total",
                   pairs_total,
                   pairs_total,
                   "pairs_total")
            _write(outs,
                   "pairs_mapped",
                   pairs_mapped,
                   pairs_total,
                   "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    outs.write("error_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.error_counts, counter.error_rate * 100.0))
    outs.write("insertion_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.insertion_counts, counter.insertion_rate * 100.0))
    outs.write("deletion_rate\t%i\t%5.2f\tmatches+deletions\n" %
               (counter.deletion_counts, counter.deletion_rate * 100.0))
    outs.write("mismatch_rate\t%i\t%5.2f\tmatches\n" %
               (counter.mismatch_counts, counter.mismatch_rate * 100.0))
    outs.write("match_rate\t%i\t%5.2f\tmatches+insertions\n" %
               (counter.match_counts, counter.match_rate * 100.0))

    if options.force_output or len(nm_filtered) > 0:
        with E.open_output_file("nm", "w") as outfile:
            outfile.write("NM\talignments\n")
            if len(nm_filtered) > 0:
                for x in range(0, max(nm_filtered.keys()) + 1):
                    outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
            else:
                outfile.write("0\t%i\n" % (counter.filtered))

    if options.force_output or len(nh_all) > 1:
        with E.open_output_file("nh_all", "w") as outfile:
            outfile.write("NH\treads\n")
            if len(nh_all) > 0:
                writeNH(outfile, nh_all, max_hi)
            else:
                # assume all are unique if NH flag not set
                outfile.write("1\t%i\n" % (counter.mapped_reads))

    if options.force_output or len(nh_filtered) > 1:
        with E.open_output_file("nh", "w") as outfile:
            outfile.write("NH\treads\n")
            if len(nh_filtered) > 0:
                writeNH(outfile, nh_filtered, max_hi)
            else:
                # assume all are unique if NH flag not set
                outfile.write("1\t%i\n" % (counter.filtered))

    if options.force_output or len(mapq_all) > 1:
        with E.open_output_file("mapq", "w") as outfile:
            outfile.write("mapq\tall_reads\tfiltered_reads\n")
            # with forced output there may be nothing to report;
            # max() of an empty sequence would fail
            if len(mapq_all) > 0:
                for x in range(0, max(mapq_all.keys()) + 1):
                    outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))

    if details_df is not None:
        with E.open_output_file("summaries", "w") as outf:
            details_df.describe().transpose().to_csv(
                outf, sep="\t", index_label="metric")

        bins = numpy.arange(0, 1.01, 0.01)
        # DataFrame.from_items is no longer available in pandas; a dict
        # keeps the column order of details_df
        histogram_df = pandas.DataFrame({
            x: numpy.histogram(details_df[x].dropna(), bins=bins)[0]
            for x in details_df.columns})

        histogram_df.index = numpy.arange(0, 1.0, 0.01)

        # drop bins that are empty in every column
        row_sums = histogram_df.sum(axis=1)
        histogram_df = histogram_df[row_sums != 0]

        with E.open_output_file("histogram", "w") as outf:
            histogram_df.to_csv(outf, sep="\t", index_label="bin")

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads intervals (gff, gtf or bed) from stdin and collects interval
    sizes, distances between consecutive non-overlapping intervals on
    the same contig and the extent of overlaps.  Depending on
    ``--method`` these are written as histograms, summary statistics,
    raw values or a list of overlapping pairs.

    Raises ValueError if no method was requested.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")

    parser.add_option("--no-empty-bins", dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")

    parser.add_option("--with-empty-bins", dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")

    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")

    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")

    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")

    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")

    parser.add_option("--output-section", dest="output_section",
                      type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    # argv has to be handed on, otherwise an explicitly supplied
    # argument list is ignored
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")

    if not options.output_filename_pattern:
        options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        outfile_overlaps = E.open_output_file("overlaps")
    else:
        outfile_overlaps = None

    last = None
    ninput, noverlaps = 0, 0
    try:
        for this in gffs:
            ninput += 1
            values_within.append(this.end - this.start)

            if last and last.contig == this.contig:
                if this.start < last.end:
                    noverlaps += 1
                    if outfile_overlaps:
                        outfile_overlaps.write(
                            "%s\t%s\n" % (str(last), str(this)))
                    values_overlaps.append(
                        min(this.end, last.end) -
                        max(last.start, this.start))
                    # keep the interval reaching furthest to the right
                    # as reference for the next comparison
                    if this.end > last.end:
                        last = this
                    continue
                else:
                    values_between.append(this.start - last.end)
                    values_overlaps.append(0)

            last = this
    finally:
        # flush the list of overlapping pairs; it was never closed before
        if outfile_overlaps is not None:
            outfile_overlaps.close()

    if "hist" in options.methods:
        histogram_options = dict(
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_within = Histogram.Calculate(values_within, **histogram_options)
        h_between = Histogram.Calculate(values_between, **histogram_options)

        with E.open_output_file("hist") as outfile:
            if "all" == options.output_section:
                outfile.write("residues\tsize\tdistance\n")
                combined_histogram = Histogram.Combine(
                    [h_within, h_between],
                    missing_value=options.missing_value)
                Histogram.Write(outfile, combined_histogram,
                                nonull=options.nonull)
            elif options.output_section == "size":
                outfile.write("residues\tsize\n")
                Histogram.Write(outfile, h_within, nonull=options.nonull)
            elif options.output_section == "distance":
                outfile.write("residues\tdistance\n")
                Histogram.Write(outfile, h_between, nonull=options.nonull)

    if "stats" in options.methods:
        with E.open_output_file("stats") as outfile:
            outfile.write("data\t%s\n" % Stats.Summary().getHeader())
            if options.output_section in ("size", "all"):
                outfile.write("size\t%s\n" %
                              str(Stats.Summary(values_within)))
            if options.output_section in ("distance", "all"):
                outfile.write("distance\t%s\n" %
                              str(Stats.Summary(values_between)))

    if "values" in options.methods:
        # NOTE(review): the "overlaps" section name is also used for the
        # list of overlapping pairs written by the "overlaps" method; if
        # both methods are requested this presumably overwrites that
        # file - confirm.
        for section, title, values in (
                ("distances", "distance", values_between),
                ("sizes", "size", values_within),
                ("overlaps", "overlap", values_overlaps)):
            with E.open_output_file(section) as outfile:
                outfile.write("%s\n%s\n" %
                              (title, "\n".join(map(str, values))))

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput,
            len(values_between),
            len(values_within),
            noverlaps))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a BAM file from stdin and compares each mapped read against
    the exons in ``--exons-file``.  Reads with a skipped region (cigar
    operation 3) are treated as spliced, and both flanking segments are
    compared with the overlapping exons.  A histogram of
    over-/underruns is written to the ``overrun`` output file and
    summary counts to stdout.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-e", "--exons-file", "--gtf-file",
                        dest="filename_exons", type=str, metavar="gtf",
                        help="gtf formatted file with non-overlapping exon "
                        "locations (required). ")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, add_output_options=True,
                              unknowns=True)

    exons = GTF.readAndIndex(
        GTF.iterator(iotools.open_file(args.filename_exons)))

    pysam_in = pysam.AlignmentFile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    nspliced_exact = 0
    nspliced_inexact = 0
    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_nooverlap = 0
    ninput = 0
    nunmapped = 0

    # Histograms of overrun sizes.  The spliced histogram stores
    # underruns below and overruns above the index overrun_offset.
    overrun_offset = args.read_length + 10
    nspliced_overrun = [0] * 2 * overrun_offset
    nunspliced_overrun = [0] * overrun_offset

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''return splicesite over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''
        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r

    def _record_spliced(overrun):
        # Values outside the histogram are counted in the outermost
        # bins instead of raising an IndexError.
        idx = min(len(nspliced_overrun) - 1,
                  max(0, overrun_offset + overrun))
        nspliced_overrun[idx] += 1

    for read in pysam_in:
        ninput += 1
        if read.is_unmapped:
            nunmapped += 1
            continue

        # check for BAM_CREF_SKIP code in cigar string
        cigar = read.cigar
        is_spliced = 3 in [x[0] for x in cigar]

        contig = pysam_in.getrname(read.tid)
        start = read.pos
        end = read.aend

        if is_spliced:
            # count both ends
            nspliced += 1

            # only reads with exactly three cigar operations are analysed
            if len(cigar) != 3:
                nspliced_ignored += 1
                continue

            start5, end5 = start, start + cigar[0][1]
            start3, end3 = end - cigar[2][1], end
            try:
                overlap3 = list(exons.get(contig, start3, end3))
                overlap5 = list(exons.get(contig, start5, end5))
            except KeyError:
                overlap3 = overlap5 = []

            ovl3 = len(overlap3)
            ovl5 = len(overlap5)
            o3 = o5 = None
            if not ovl3 and not ovl5:
                nspliced_nooverlap += 1
            elif ovl3 and not ovl5:
                nspliced_halfoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
            elif ovl5 and not ovl3:
                nspliced_halfoverlap += 1
                o5 = _splice_overrun(start5, end5, overlap5)
            else:
                # both overlap
                nspliced_bothoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
                o5 = _splice_overrun(start5, end5, overlap5)

            for overrun in (o3, o5):
                if overrun is None:
                    continue
                if overrun == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                _record_spliced(overrun)
        else:
            nunspliced += 1
            try:
                overlap = list(exons.get(contig, start, end))
            except KeyError:
                overlap = []

            if len(overlap) == 0:
                nunspliced_nooverlap += 1
            else:
                nunspliced_overlap += 1
                # multiple overlap - merge exons (usually: small introns)
                exon_start = min([x[0] for x in overlap])
                exon_end = max([x[1] for x in overlap])
                ostart = max(0, exon_start - start)
                oend = max(0, end - exon_end)
                overrun = ostart + oend
                # overruns beyond the histogram go into the last bin
                nunspliced_overrun[
                    min(overrun, len(nunspliced_overrun) - 1)] += 1

    pysam_in.close()

    # output histograms
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()

    with E.open_output_file("overrun") as outfile:
        outfile.write(
            "bases\tunspliced_overrun_counts\tspliced_overrun_counts\tspliced_underrun_counts\n"
        )
        for x, v in enumerate(zip(nunspliced_overrun,
                                  _nspliced_overrun,
                                  _nspliced_underrun)):
            outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = args.stdout
    outfile.write("category\tcounts\n")
    for k, v in sorted(c.items()):
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.stop()
def writeMatricesForSortOrder(features_per_interval,
                              bins,
                              foreground_track,
                              control_tracks,
                              shifted,
                              sort_order):
    '''output one or more matrices for a sort order.

    The foreground matrix is always written. One matrix is written per
    control track and one for the shifted counts if *shifted* is set.
    If there is anything besides the foreground, a combined
    "side-by-side" matrix holding all of them is written as well.

    The files are obtained through ``E.open_output_file`` and named:

    * ``matrix_<track>_<sortorder>.gz`` (foreground and controls)
    * ``matrix_shift_<sortorder>.gz`` (shifted counts)
    * ``matrix_sidebyside_<sortorder>.gz`` (combined matrix)

    Arguments
    ---------
    features_per_interval : list
        One entry per interval. Each entry provides ``interval``,
        ``foreground.counts``, ``controls[i].counts`` and, if
        *shifted* is set, ``shifted.counts``. Must not be empty as
        the first entry is inspected to decide how rows are named.
    bins : iterable of int
        Bin positions, used as column headers.
    foreground_track : string
        Name of the foreground track, used in the file name.
    control_tracks : list of string
        Names of the control tracks, in the same order as the
        ``controls`` of each entry in *features_per_interval*.
    shifted : bool
        If true, shifted counts are output as well.
    sort_order : string
        Name of the sort order. Dashes are replaced by underscores
        in file names.
    '''

    # Rows are labelled by interval name if available, otherwise by
    # their 1-based position.
    if "name" in features_per_interval[0].interval:
        names = [x.interval.name for x in features_per_interval]
    else:
        names = [str(x) for x in range(1, len(features_per_interval) + 1)]

    bins = ["%i" % x for x in bins]
    sort_order = re.sub("-", "_", sort_order)

    def _write(filename, matrix, col_headers):
        # Close every output file as soon as it has been written
        # instead of leaving the open handle to the garbage collector.
        with E.open_output_file(filename) as outf:
            iotools.write_matrix(outf,
                                 matrix,
                                 row_headers=names,
                                 col_headers=col_headers,
                                 row_header="name")

    # write foreground
    _write("matrix_%s_%s.gz" % (foreground_track, sort_order),
           [x.foreground.counts for x in features_per_interval],
           bins)

    # write controls
    for idx, track in enumerate(control_tracks):
        _write("matrix_%s_%s.gz" % (track, sort_order),
               [x.controls[idx].counts for x in features_per_interval],
               bins)

    # write shifted matrix
    if shifted:
        _write("matrix_shift_%s.gz" % (sort_order),
               [x.shifted.counts for x in features_per_interval],
               bins)

    # output a combined matrix
    if len(control_tracks) > 0 or shifted:
        rows = []
        for row in features_per_interval:
            parts = [row.foreground.counts]
            parts.extend(
                row.controls[x].counts for x in range(len(control_tracks)))
            if shifted:
                parts.append(row.shifted.counts)
            rows.append(numpy.concatenate(parts))

        nsections = 1 + len(control_tracks)
        if shifted:
            nsections += 1

        # make column names unique and make sure they can be sorted
        # lexicographically
        all_bins = []
        for section in range(nsections):
            all_bins.extend(["%i:%s" % (section, b) for b in bins])

        _write("matrix_sidebyside_%s.gz" % (sort_order), rows, all_bins)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Exactly two fastq files are expected as positional arguments.

    With method ``reconcile``, reads whose identifier is present in
    both files are written to the output files ``1`` and ``2``. If
    ``--unpaired`` is given, reads without a partner are written to
    ``unpaired.fastq.gz`` instead of being dropped.

    With method ``filter-by-sequence``, the first sequence of
    ``--input-filename-fasta`` is used as query for
    ``fastqtools.filter_by_sequence`` and a one-row summary table is
    written to stdout.

    Raises
    ------
    ValueError
        If the number of fastq files is not two, or if method
        ``filter-by-sequence`` is used without a query sequence.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m", "--method", dest="method", type=str,
        choices=('reconcile', 'filter-by-sequence'),
        help="method to apply.")

    # The read suffixes were previously written as "\1" and "\2" in a
    # non-raw string, which put control characters into the help text.
    parser.add_argument(
        "-c", "--chop-identifier", dest="chop", action="store_true",
        help="whether or not to trim last character of the "
        "sequence name. For example sometimes ids in the first "
        "file in the pair will end with /1 and the second "
        "with /2. If --chop-identifier is not specified "
        "then the results will be wrong.")

    parser.add_argument(
        "-u", "--unpaired", dest="unpaired", action="store_true",
        help="whether or not to write out unpaired reads "
        "to a separate file")

    parser.add_argument(
        "--id-pattern-1", dest="id_pattern_1",
        help="If specified will use the first group from the "
        "pattern to determine the ID for the first read")

    parser.add_argument(
        "--id-pattern-2", dest="id_pattern_2",
        help="As above but for read 2")

    parser.add_argument(
        "--input-filename-fasta", dest="input_filename_fasta", type=str,
        help="input filename of FASTA formatted sequence "
        "for method 'filter-by-sequence'.")

    parser.add_argument(
        "--filtering-kmer-size", dest="filtering_kmer_size", type=int,
        help="kmer size for method 'filter-by-sequence'.")

    parser.add_argument(
        "--filtering-min-kmer-matches", dest="filtering_min_kmer_matches",
        type=int,
        help="minimum number of matches 'filter-by-sequence'.")

    parser.set_defaults(
        method="reconcile",
        chop=False,
        unpaired=False,
        input_filename_fasta=None,
        filtering_kmer_size=10,
        filtering_min_kmer_matches=20)

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = unknown
    counter = E.Counter()

    if args.id_pattern_1:
        id1_getter = PatternGetter(args.id_pattern_1)
    else:
        id1_getter = plain_getter

    if args.id_pattern_2:
        id2_getter = PatternGetter(args.id_pattern_2)
    else:
        id2_getter = plain_getter

    if args.method == "reconcile":

        # IMS: switching to no store second set of read names and only use
        # lazily. Since generators don't have a size must keep track
        id_lengths = {fn1: 0, fn2: 0}

        def iterate_records(infile):
            '''yield fastq records in infile as lists of four lines.

            Iteration stops at the first record with an empty header
            line, which includes the end of the file.
            '''
            aread = infile.readline
            while True:
                record = [aread().rstrip("\r\n") for _ in range(4)]
                if not record[0]:
                    break
                yield record

        def get_id(record, id_getter):
            '''return the, optionally chopped, identifier of a record.'''
            r = id_getter(record[0].split()[0])
            # decide if to chop read number off
            if args.chop:
                return r[:-1]
            return r

        def getIds(infile, key, id_getter=plain_getter):
            '''yield ids in infile, counting reads in id_lengths[key].

            The count is kept under an explicit key so that it does
            not depend on the name attribute of the file handle.
            '''
            for record in iterate_records(infile):
                id_lengths[key] += 1
                yield get_id(record, id_getter)

        def write(outfile, infile, take,
                  unpaired_file=None, id_getter=plain_getter):
            '''filter fastq files with ids in take.

            Records not in take go to unpaired_file if one is given
            and are dropped otherwise.
            '''
            for record in iterate_records(infile):
                if get_id(record, id_getter) in take:
                    outfile.write("\n".join(record) + "\n")
                elif unpaired_file is not None:
                    unpaired_file.write("\n".join(record) + "\n")

        E.info("reading first in pair")
        with iotools.open_file(fn1) as inf1:
            ids1 = set(getIds(inf1, fn1, id1_getter))

        E.info("reading second in pair")
        with iotools.open_file(fn2) as inf2:
            # IMS: No longer keep as a set, but lazily evaluate into
            # intersection leads to large memory saving for large inf2,
            # particularly if inf1 is small. The generator has to be
            # consumed while the file is still open.
            take = ids1.intersection(getIds(inf2, fn2, id2_getter))

        E.info("first pair: %i reads, second pair: %i reads, "
               "shared: %i reads" %
               (id_lengths[fn1], id_lengths[fn2], len(take)))

        if args.unpaired:
            unpaired_file = E.open_output_file("unpaired.fastq.gz", "w")
        else:
            unpaired_file = None

        try:
            for suffix, filename, id_getter, label in (
                    ("1", fn1, id1_getter, "first"),
                    ("2", fn2, id2_getter, "second")):
                with E.open_output_file(suffix, "w") as outf, \
                        iotools.open_file(filename) as inf:
                    E.info("writing %s in pair" % label)
                    write(outf, inf, take, unpaired_file, id_getter)
        finally:
            # make sure the unpaired output is closed even if writing
            # one of the pairs fails.
            if unpaired_file is not None:
                unpaired_file.close()

        counter.output = len(take)

    elif args.method == "filter-by-sequence":

        if args.input_filename_fasta is None:
            raise ValueError(
                "method 'filter-by-sequence' requires --input-filename-fasta")

        # only the first sequence in the file is used as query
        query_sequence = None
        with pysam.FastxFile(args.input_filename_fasta) as inf:
            for record in inf:
                query_sequence = record.sequence
                break

        if query_sequence is None:
            raise ValueError("no sequence found in {}".format(
                args.input_filename_fasta))

        with pysam.FastxFile(fn1, persist=False) as inf1, \
                pysam.FastxFile(fn2, persist=False) as inf2, \
                E.open_output_file("matched.fastq.1.gz", "w") as outf_matched1, \
                E.open_output_file("matched.fastq.2.gz", "w") as outf_matched2, \
                E.open_output_file("unmatched.fastq.1.gz", "w") as outf_unmatched1, \
                E.open_output_file("unmatched.fastq.2.gz", "w") as outf_unmatched2:
            counter = fastqtools.filter_by_sequence(
                query_sequence,
                inf1, inf2,
                outf_matched1, outf_matched2,
                outf_unmatched1, outf_unmatched2,
                kmer_size=args.filtering_kmer_size,
                min_kmer_matches=args.filtering_min_kmer_matches)

        # report 0.0 instead of failing with a division by zero if the
        # input files were empty.
        if counter.input:
            percent_matched = 100.0 * counter.matched / counter.input
        else:
            percent_matched = 0.0

        args.stdout.write(
            "\t".join(("input", "matched", "unmatched",
                       "percent_matched")) + "\n")
        args.stdout.write(
            "\t".join(map(str, (counter.input,
                                counter.matched,
                                counter.unmatched,
                                percent_matched))) + "\n")

    E.info(str(counter))
    E.stop()
def main(argv=sys.argv):
    """script main.

    Compare intervals in bed format read from stdin (the test set)
    against the intervals in ``--reference-bed-file`` (the truth set).

    A test interval overlapping at least one reference interval is a
    true positive (tp), otherwise it is a false positive (fp). A
    reference interval not overlapped by any test interval is a false
    negative (fn). Test intervals on contigs absent from the reference
    are ignored. If ``output_sets`` is set, the intervals of each
    class are written to the output files ``tp``, ``fp`` and ``fn``.

    A table with counts per category (first comma-separated field of
    the test interval name) and size bin is written to stdout. Truth
    and fn counts are not split by category.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-b", "--reference-bed-file", dest="reference_bed_file",
        type="string",
        help="reference bed file "
        "[%default]")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("lvc-comparison", ),
        help="methods to apply [%default]")

    parser.set_defaults(
        method="lvc-comparison",
        reference_fasta_file=None,
        input_bed_file=None,
        size_bins=(1000, 10000, 100000),
        output_sets=True,
        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # one interval tree per contig
    reference_set = collections.defaultdict(quicksect.IntervalTree)

    E.info("reading reference bed file from {}".format(
        options.reference_bed_file))

    with iotools.open_file(options.reference_bed_file) as inf:
        for record in pysam.tabix_iterator(inf, pysam.asBed()):
            reference_set[record.contig].add(record.start, record.end)

    E.info("read reference intervals on {} contigs: {}".format(
        len(reference_set), ",".join(reference_set)))

    # created up-front so that the final log message never fails
    c = E.Counter()

    output_tp = output_fp = output_fn = None
    try:
        if options.output_sets:
            output_tp = E.open_output_file("tp")
            output_fp = E.open_output_file("fp")
            output_fn = E.open_output_file("fn")

        if options.method == "lvc-comparison":
            found = set()
            names = set()
            nsize_bins = len(options.size_bins)

            # one extra bin for intervals at or above the largest size
            counts = {}
            for size_idx in range(nsize_bins + 1):
                counts[size_idx] = {
                    x: collections.defaultdict(int)
                    for x in ("tp", "fn", "fp", "test", "truth")}

            for record in pysam.tabix_iterator(options.stdin, pysam.asBed()):
                if record.contig not in reference_set:
                    c.ignored_no_contig += 1
                    continue

                c.test += 1
                matches = reference_set[record.contig].search(
                    record.start, record.end)
                size_idx = get_size_bin(record.end - record.start,
                                        options.size_bins)

                if len(matches) == 0:
                    c.fp += 1
                    status = "fp"
                    if output_fp is not None:
                        output_fp.write(str(record) + "\n")
                else:
                    c.tp += 1
                    status = "tp"
                    if output_tp is not None:
                        output_tp.write(str(record) + "\n")
                    # todo: overlap criteria

                    # remember reference intervals that have been found
                    for match in matches:
                        found.add((record.contig, match.start, match.end))

                name = record.name.split(",")[0]
                names.add(name)
                counts[size_idx]["test"][name] += 1
                counts[size_idx][status][name] += 1

            outf = options.stdout

            # second pass over the reference to collect false negatives
            with iotools.open_file(options.reference_bed_file) as inf:
                for record in pysam.tabix_iterator(inf, pysam.asBed()):
                    c.truth += 1
                    size_idx = get_size_bin(record.end - record.start,
                                            options.size_bins)
                    counts[size_idx]["truth"]["all"] += 1

                    key = (record.contig, record.start, record.end)
                    if key not in found:
                        c.fn += 1
                        counts[size_idx]["fn"]["all"] += 1
                        # the fn file used to be opened but never
                        # written to.
                        if output_fn is not None:
                            output_fn.write(str(record) + "\n")

            outf.write("\t".join(
                ("category", "size", "test", "tp", "fp",
                 "truth", "fn")) + "\n")

            for name in sorted(names):
                for size_idx in range(nsize_bins + 1):
                    if size_idx == nsize_bins:
                        size_bin = ">={}".format(options.size_bins[-1])
                    else:
                        size_bin = "<{}".format(options.size_bins[size_idx])
                    outf.write("\t".join(map(str, (
                        name,
                        size_bin,
                        counts[size_idx]["test"][name],
                        counts[size_idx]["tp"][name],
                        counts[size_idx]["fp"][name],
                        counts[size_idx]["truth"]["all"],
                        counts[size_idx]["fn"]["all"],
                    ))) + "\n")
    finally:
        # close the per-class output files, also on failure
        for handle in (output_tp, output_fp, output_fn):
            if handle is not None:
                handle.close()

    E.info(str(c))
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For the region given in ``--input-bed-file``, reads overlapping the
    region are collected from a BAM file (stdin, or the single
    positional argument), aligned to each other with mafft, trimmed to
    the sequence between the anchors flanking the region and
    summarised per alignment column. The consensus is aligned against
    the barcode given in ``--barcode-fasta-file``.

    Outputs are the files ``unaligned_<region>.fasta``,
    ``aligned_<region>.fasta``, ``aligned_truncated_<region>.fasta``
    and ``pileup`` plus a one-row summary table on stdout.

    Raises
    ------
    ValueError
        If more than one BAM file is given as argument.
    OSError
        If the reference fasta file or the input bed file are not
        given or do not exist.
    NotImplementedError
        If the bed file contains more than one region.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--input-bed-file", dest="input_bed_file", type="string",
        help="input file with intervals. Tab-delimited file of intervals "
        "in bed format to restrict analysis to. [%default]")

    parser.add_option(
        "-m", "--merge-intervals", dest="merge_intervals",
        action="store_true",
        help="merge intervals in bed file. Useful if you have a site bed-file "
        "[%default]")

    parser.add_option(
        "-f", "--reference-fasta-file", dest="reference_fasta_file",
        help="reference genomic sequence in fasta format. "
        "[%default]")

    parser.add_option(
        "-c", "--barcode-fasta-file", dest="barcode_fasta_file",
        help="barcode sequence in fasta format. Variable positions "
        "should be marked by N "
        "[%default]")

    parser.set_defaults(
        reference_fasta_file=None,
        barcode_fasta_file=None,
        merge_intervals=False,
        input_bed_file=None,
        anchor=5,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    elif args:
        if len(args) > 1:
            raise ValueError("multiple bam files provided in arguments")
        bamfile = args[0]
    else:
        bamfile = "-"

    if options.barcode_fasta_file:
        # only the first sequence in the file is used
        with pysam.FastxFile(options.barcode_fasta_file) as inf:
            barcode_sequence = next(inf).sequence
    else:
        # NOTE(review): the barcode is passed to global_align() below;
        # confirm that helper copes with None or make the option
        # mandatory.
        barcode_sequence = None

    # os.path.exists(None) raises a TypeError, so check for missing
    # options explicitly to report the intended error.
    if (options.reference_fasta_file is None or
            not os.path.exists(options.reference_fasta_file)):
        raise OSError("reference fasta file {} does not exist".format(
            options.reference_fasta_file))

    if (options.input_bed_file is None or
            not os.path.exists(options.input_bed_file)):
        raise OSError("input bed file {} does not exist".format(
            options.input_bed_file))

    bed_in = pysam.TabixFile(options.input_bed_file)
    pysam_in = pysam.AlignmentFile(bamfile)
    anchor = options.anchor

    try:
        for region_idx, vals in enumerate(
                iterate_bed(bed_in, options.merge_intervals)):

            if region_idx > 0:
                raise NotImplementedError(
                    "output for multiple regions not yet implemented")

            contig, region_start, region_end = vals
            upstream_anchors, downstream_anchors = [], []
            counter = E.Counter()

            # reference coordinates of the anchors flanking the region
            upstream_anchor_start = region_start - anchor
            upstream_anchor_end = region_start
            downstream_anchor_start = region_end
            downstream_anchor_end = region_end + anchor

            unaligned_fn = E.get_output_file(
                "unaligned_{}.fasta".format(region_idx))
            with iotools.open_file(unaligned_fn, "w") as outf:
                for read in pysam_in.fetch(contig, region_start, region_end):
                    counter.overlapping_reads += 1
                    try:
                        pairs = read.get_aligned_pairs(with_seq=True)
                    except ValueError:
                        counter.no_md_tag += 1
                        continue

                    # pairs are indexed as x[0], x[1], x[2]; only pairs
                    # with a read position (x[0]) are used.
                    # presumably (read pos, reference pos, reference
                    # base) - see pysam documentation.
                    map_ref2read_pos = dict(
                        (x[1], x[0]) for x in pairs if x[0] is not None)
                    map_ref2ref_base = dict(
                        (x[1], x[2]) for x in pairs if x[0] is not None)

                    upstream_anchor = "".join(
                        map_ref2ref_base.get(x, "")
                        for x in range(upstream_anchor_start,
                                       upstream_anchor_end))
                    downstream_anchor = "".join(
                        map_ref2ref_base.get(x, "")
                        for x in range(downstream_anchor_start,
                                       downstream_anchor_end))

                    # check if at least one anchor is aligned. Only
                    # upper-case reference bases are counted as matches.
                    upstream_matches = sum(
                        x.isupper() for x in upstream_anchor)
                    downstream_matches = sum(
                        x.isupper() for x in downstream_anchor)

                    if upstream_matches < anchor and \
                            downstream_matches < anchor:
                        counter.no_anchor += 1
                        continue

                    seq = read.query_alignment_sequence

                    # collect full length anchors
                    if upstream_anchor_start in map_ref2read_pos and \
                            upstream_anchor_end in map_ref2read_pos:
                        upstream_anchors.append(
                            seq[map_ref2read_pos[upstream_anchor_start]:
                                map_ref2read_pos[upstream_anchor_end]])
                    if downstream_anchor_start in map_ref2read_pos and \
                            downstream_anchor_end in map_ref2read_pos:
                        downstream_anchors.append(
                            seq[map_ref2read_pos[downstream_anchor_start]:
                                map_ref2read_pos[downstream_anchor_end]])

                    # get region to align. If no position of an anchor
                    # is covered, fall back to the respective end of
                    # the read.
                    read_start = min(
                        map_ref2read_pos.get(x, len(seq))
                        for x in range(upstream_anchor_start,
                                       upstream_anchor_end))
                    if read_start == len(seq):
                        read_start = 0
                    read_end = max(
                        map_ref2read_pos.get(x, 0) + 1
                        for x in range(downstream_anchor_start,
                                       downstream_anchor_end))
                    if read_end == 1:
                        read_end = len(seq)

                    counter.collected_reads += 1
                    outf.write(">{}/{}-{}\n{}\n".format(
                        read.query_name, read_start, read_end,
                        seq[read_start:read_end]))

            counter.downstream_anchors = len(downstream_anchors)
            counter.upstream_anchors = len(upstream_anchors)

            E.info(counter)

            if counter.overlapping_reads == 0:
                E.warn("no sequences overlapping region")
                continue

            if counter.downstream_anchors == 0 or \
                    counter.upstream_anchors == 0:
                E.warn("at least one anchor undefined")
                continue

            if counter.collected_reads == 1:
                E.warn("only single sequence, multiple aligment skipped")
                with iotools.open_file(unaligned_fn) as inf:
                    stdout = inf.read()
            else:
                # G-INS-i -> global alignment algorithm
                E.info("starting mafft multiple alignment")
                stdout = E.run(
                    "mafft --globalpair --maxiterate 100 --quiet "
                    "--op 2 --ep 0.5 {}".format(unaligned_fn),
                    return_stdout=True)

            aligned_fn = E.get_output_file(
                "aligned_{}.fasta".format(region_idx))
            with iotools.open_file(aligned_fn, "w") as outf:
                outf.write(stdout)

            # the alignment is read as alternating header and sequence
            # lines.
            mali = stdout.splitlines()
            identifiers = mali[0::2]
            sequences = [x.upper() for x in mali[1::2]]
            consensus = get_consensus(sequences)

            E.info("after alignment: consensus={}".format(consensus))

            # gap filtering -> remove highly gappy columns
            consensus = get_consensus(sequences, min_gap_proportion=0.9)

            # NOTE(review): no anchor trimming has happened at this
            # point, the label of this message looks wrong.
            E.info("after anchor trimming: consensus={}".format(consensus))

            take = [idx for idx, x in enumerate(consensus) if x != "-"]
            sequences = ["".join([s[x] for x in take]) for s in sequences]
            consensus = get_consensus(sequences, min_gap_proportion=0.9)

            E.info("after gap filtering: consensus={}".format(consensus))

            # get anchor consensus and chop it off
            consensus = get_consensus(sequences, ignore_gaps=True)
            upstream_anchor = get_anchor_consensus(upstream_anchors)
            downstream_anchor = get_anchor_consensus(downstream_anchors)

            # from here on anchor coordinates are alignment columns
            upstream_anchor_start = consensus.find(upstream_anchor)
            downstream_anchor_start = consensus.rfind(downstream_anchor)

            E.info(
                "anchor consensus (no gaps)={}, upstream={}, downstream={}, upstream_idx={}, downstream_idx={}"
                .format(consensus, upstream_anchor, downstream_anchor,
                        upstream_anchor_start, downstream_anchor_start))

            if upstream_anchor_start < 0 or downstream_anchor_start < 0:
                E.warn("can't locate anchor, no output produced")
                continue

            upstream_anchor_end = upstream_anchor_start + len(upstream_anchor)
            if upstream_anchor_end >= downstream_anchor_start:
                E.warn("anchor not in correct order, no output produced")
                continue

            sequences = [x[upstream_anchor_end:downstream_anchor_start]
                         for x in sequences]
            consensus = get_consensus(sequences)

            E.info("after anchor trimming: consensus={}".format(consensus))

            truncated_fn = E.get_output_file(
                "aligned_truncated_{}.fasta".format(region_idx))
            with iotools.open_file(truncated_fn, "w") as outf:
                outf.write("\n".join(
                    "{}\n{}\n".format(x, y)
                    for x, y in zip(identifiers, sequences)))

            # one row per alignment column with counts per character
            positions = list(zip(*sequences))
            bases = ["A", "C", "G", "T"]
            df = pandas.DataFrame(
                [collections.Counter(x) for x in positions]).fillna(0)
            for missing_base in [x for x in bases if x not in df.columns]:
                df[missing_base] = 0
            df["gapped_depth"] = df.sum(axis=1)
            df["depth"] = df[bases].sum(axis=1)
            df["consensus"] = df[bases].idxmax(axis=1)
            # The consensus is the base with the highest count, so its
            # count is the row-wise maximum over the base columns. This
            # replaces DataFrame.lookup(), which has been removed from
            # pandas.
            df["consensus_counts"] = df[bases].max(axis=1)
            df["consensus_support"] = df.consensus_counts / df.depth
            df["offconsensus_counts"] = df.depth - df.consensus_counts
            df.loc[df.consensus_counts == 0, "consensus"] = "N"

            df["region_id"] = region_idx

            # replace "gap" consensus positions with + character
            alignment = global_align(re.sub("-", "+", consensus),
                                     barcode_sequence)
            E.info("alignment: consensus {}".format(alignment[0]))
            E.info("alignment: barcode   {}".format(alignment[1]))

            # classify each consensus position by the barcode character
            # it is aligned to. Positions where the consensus has a gap
            # are recorded as deleted barcode bases and get no row.
            barcode_idx = 0
            deleted_barcode_bases = []
            rows = []
            for c, b in zip(*alignment):
                if c == "-":
                    deleted_barcode_bases.append(barcode_idx)
                    barcode_idx += 1
                elif b == "N":
                    rows.append((barcode_idx, "variable"))
                    barcode_idx += 1
                elif b == "-":
                    rows.append(("", "insertion"))
                elif b == c:
                    rows.append((barcode_idx, "fixed-match"))
                    barcode_idx += 1
                else:
                    rows.append((barcode_idx, "fixed-mismatch"))
                    barcode_idx += 1

            alignment_df = pandas.DataFrame.from_records(
                rows, columns=["barcode_pos", "barcode_class"])

            assert len(alignment_df) == len(df)
            df = pandas.concat([df, alignment_df], axis=1)
            with E.open_output_file("pileup") as outf:
                df.to_csv(outf, sep="\t", index=True, index_label="position")

            observed_barcode_sequence = "".join(
                df[df.barcode_class == "variable"].consensus)
            headers = df.consensus_support.describe().index

            # insertions are excluded from the summary statistics
            eval_df = df.loc[df.barcode_class.isin(
                ("variable", "fixed-match", "fixed-mismatch"))]
            median_consensus_depth = eval_df.consensus_counts.median()
            # zero stuff out if depth is low
            if median_consensus_depth <= 2:
                deleted_barcode_bases = []

            outf = options.stdout
            # modules to recover partial bar-codes
            outf.write("\t".join(
                map(str,
                    ["barcode",
                     "ndeleted_barcode_bases",
                     "deleted_barcode_bases"] +
                    ["support_{}".format(x) for x in headers] +
                    ["counts_{}".format(x) for x in headers] +
                    ["offcounts_{}".format(x) for x in headers])) + "\n")

            outf.write("\t".join(
                map(str,
                    [observed_barcode_sequence,
                     len(deleted_barcode_bases),
                     ",".join(map(str, deleted_barcode_bases))] +
                    eval_df.consensus_support.describe().tolist() +
                    eval_df.consensus_counts.describe().tolist() +
                    eval_df.offconsensus_counts.describe().tolist())) + "\n")
    finally:
        # release the input handles, also if processing fails
        pysam_in.close()
        bed_in.close()

    E.stop()