def low_memory_bam(bam_fn, sample, out_handle, args):
    if args.genomic:
        raise ValueError(
            "low-memory option is not compatible with genomic coordinates.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    lines = []
    current = None
    for line in handle:
        if not current or current == line.query_name:
            lines.append(line)
            current = line.query_name
        else:
            reads = _read_lines(lines, precursors, handle, args)
            ann = annotate(reads, args.matures, args.precursors, quiet=True)
            gff_lines = body.create(ann, args.database, sample, args,
                                    quiet=True)
            body.write_body_on_handle(gff_lines, out_handle)
            current = line.query_name
            lines = []
            lines.append(line)
    reads = _read_lines(lines, precursors, handle, args)
    ann = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_lines = body.create(ann, args.database, sample, args, quiet=True)
    body.write_body_on_handle(gff_lines, out_handle)
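

# Hypothetical usage sketch (not part of mirtop itself): low_memory_bam() above
# expects an argparse-style namespace already loaded with the hairpin/mature
# annotation and an output handle to which the caller has written the GFF
# header. Attribute names mirror what the function reads; file paths are
# placeholders and the namespace is not exhaustive (body.create() consumes
# further options in the real tool).
def _example_low_memory_bam():
    import argparse

    from mirtop.mirna import fasta, mapper

    args = argparse.Namespace()
    args.genomic = False                # low-memory mode rejects genomic coordinates
    args.sps = "hsa"
    args.hairpin = "hairpin.fa"         # placeholder path
    args.gtf = "hsa.gff3"               # placeholder path
    args.precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.matures = mapper.read_gtf_to_precursor(args.gtf)
    # call form follows reader() below; older versions take args.gtf instead
    args.database = mapper.guess_database(args)
    with open("sample.gff", "w") as out_handle:
        low_memory_bam("sample.bam", "sample", out_handle, args)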
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation
    """
    if args.low_memory:
        read.reader(args)
        return None
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "read name is different across samples\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        elif args.format == "gff":
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        if args.format not in ["isomirsea", "srnabench", "manatee", "optimir"]:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, header.make_tools([args.format]))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    if args.low_memory:
        return None
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
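

# Hypothetical sketch (not mirtop's CLI): reader() above is normally fed the
# namespace produced by mirtop's argument parser. Only the attributes that
# reader() touches directly in the code shown are set here; the values are
# placeholders and downstream helpers (body.create(), _write()) consume
# further options.
def _example_reader_call():
    import argparse

    args = argparse.Namespace()
    args.files = ["sample.bam"]         # placeholder input file
    args.format = "BAM"
    args.out = "out_dir"
    args.out_format = "gff"
    args.sps = "hsa"
    args.hairpin = "hairpin.fa"         # placeholder path
    args.gtf = "hsa.gff3"               # placeholder path
    args.low_memory = False
    args.keep_name = False
    reader(args)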
def low_memory_genomic_bam(bam_fn, sample, out_handle, args):
    logger.info("Reading BAM file in low memory mode.")
    logger.warning("This is under development and variants can be inexact.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    database = guess_database(args)
    bed_fn = os.path.join(args.out, os.path.basename(bam_fn) + ".bed")
    logger.info("Making bed file.")
    _bed(bam_fn, bed_fn)
    logger.info("Intersecting bed file.")
    intersect_fn = intersect(bed_fn, args.gtf)
    logger.info("Loading database.")
    # TODO this'll return conn_reads and conn_counts
    conn = _read_lifted_bam_alpha(intersect_fn, bam_fn, args)
    rows = sql.select_all_reads(conn)
    lines = []
    current = None
    logger.info("Analyzing database.")
    for row in rows:
        if not current or current == row[0]:
            lines.append(row)
            current = row[0]
        else:
            # TODO counts of sequence = conn_counts.query UID
            # it could be counts only same location UID+chrom+start,
            # or counts all UID
            reads = _read_lifted_lines(lines, precursors, database)
            ann = annotate(reads, args.matures, args.precursors, quiet=True)
            gff_lines = body.create(ann, args.database, sample, args,
                                    quiet=True)
            body.write_body_on_handle(gff_lines, out_handle)
            current = row[0]
            lines = []
            lines.append(row)
    reads = _read_lifted_lines(lines, precursors, database)
    ann = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_lines = body.create(ann, args.database, sample, args, quiet=True)
    body.write_body_on_handle(gff_lines, out_handle)
    conn.close()
    logger.info("Done")
def read_file_low_memory(fn, sample, args, out_handle):
    precursors = args.precursors
    reads = defaultdict(hits)
    col_fix = 0
    with open(fn) as handle:
        header = handle.readline()
        if header.find("freq") < 0:
            col_fix = 1
        for line in handle:
            reads = _read_line(line, col_fix, precursors)
            ann = annotate(reads, args.matures, args.precursors, quiet=True)
            gff_lines = body.create(ann, args.database, sample, args,
                                    quiet=True)
            body.write_body_on_handle(gff_lines, out_handle)
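

# Hypothetical usage sketch: read_file_low_memory() above streams one seqbuster
# *.mirna table and appends GFF body lines to an already open handle, so the
# caller is expected to have written the header and loaded the annotation into
# args beforehand. Paths are placeholders; the guess_database() call form
# follows the newer reader() shown earlier (older versions take args.gtf).
def _example_seqbuster_low_memory():
    import argparse

    from mirtop.mirna import fasta, mapper

    args = argparse.Namespace()
    args.sps = "hsa"
    args.hairpin = "hairpin.fa"         # placeholder path
    args.gtf = "hsa.gff3"               # placeholder path
    args.precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.matures = mapper.read_gtf_to_precursor(args.gtf)
    args.database = mapper.guess_database(args)
    with open("sample.gff", "w") as out_handle:
        read_file_low_memory("sample.mirna", "sample", args, out_handle)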
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation
    """
    samples = []
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "gff":
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        if args.format not in ["isomirsea", "srnabench"]:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, "")
        _write(out_dts[fn], h, fn_out)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged, header.create(samples, database, ""), fn_merged_out)
def annotate(fn, read_file, load=False, create=True):
    import argparse
    args = argparse.Namespace()
    args.hairpin = "data/examples/annotate/hairpin.fa"
    args.sps = "hsa"
    args.gtf = "data/examples/annotate/hsa.gff3"
    args.add_extra = True
    args.out_format = "gtf"
    from mirtop.mirna import fasta, mapper
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = precursors
    args.matures = matures
    args.database = mapper.guess_database(args.gtf)
    from mirtop.mirna import annotate
    from mirtop.gff import body
    if not load:
        reads = read_file(fn, args)
    else:
        reads = read_file
    if create:
        ann = annotate.annotate(reads, matures, precursors)
        body = body.create(ann, "miRBase21", "Example", args)
    # note: when create is False, the imported body module itself is returned
    return body
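

# Hypothetical usage of the test helper above (not taken from the mirtop test
# suite itself): any importer's read_file can be passed in; the module path and
# the input file below are assumptions.
def _example_annotate_seqbuster():
    from mirtop.importer import seqbuster

    # Parse a seqbuster *.mirna table with the example annotation and return
    # the GFF body dict produced by body.create().
    return annotate("reads.mirna", seqbuster.read_file)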
def _analyze_line(line, precursors, database, sample, sep, args):
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    if str(line).find(get_primary_transcript(guess_database(args))) < 0:
        # only working with mirbase
        return None
    logger.debug(("READ::line name:{0}").format(line))
    if sequence and sequence.find("N") > -1:
        return None
    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    Filter = "Pass"
    reads = dict()
    if not start:
        return None
    if strand == "+":
        start = int(start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                         chrom, len(sequence), len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom], start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))
    idu = make_id(sequence)
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors, args.matures)
        line = "%s Changes %s;" % (line, extra)
    line = paste_columns(feature(line), sep=sep)
    return {'chrom': chrom,
            'start': start,
            'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, line]}