def low_memory_bam(bam_fn, sample, out_handle, args):
    """Annotate a BAM file in low-memory mode, streaming GFF output.

    Alignments are read from a name-sorted BAM; consecutive records that
    share a query name are batched, annotated and written to ``out_handle``
    immediately instead of being accumulated in memory.

    Args:
        bam_fn: path to the SAM/BAM file with the alignments.
        sample: sample name used in the GFF output.
        out_handle: open file handle where GFF body lines are written.
        args: namespace with ``genomic``, ``precursors``, ``matures``
            and ``database`` attributes.

    Raises:
        ValueError: if ``args.genomic`` is set (not supported here).
    """
    if args.genomic:
        raise ValueError(
            "low-memory option is not compatible with genomic coordinates.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)

    def _flush(batch):
        # Annotate one query-name batch and append its GFF lines to output.
        reads = _read_lines(batch, precursors, handle, args)
        ann = annotate(reads, args.matures, args.precursors, quiet=True)
        gff_lines = body.create(ann, args.database, sample, args, quiet=True)
        body.write_body_on_handle(gff_lines, out_handle)

    try:
        lines = []
        current = None
        for line in handle:
            if not current or current == line.query_name:
                lines.append(line)
                current = line.query_name
            else:
                _flush(lines)
                current = line.query_name
                lines = [line]
        # BUGFIX: only flush the trailing batch when it is non-empty; the
        # original ran the annotation chain with zero reads for empty BAMs.
        if lines:
            _flush(lines)
    finally:
        # BUGFIX: close the pysam handle (it was previously leaked).
        handle.close()
def reader(args):
    """Realign hits from several formats to miRBase and write GFF output.

    Dispatches on ``args.format`` to the matching parser, annotates the
    reads against precursors/matures, writes one GFF per sample and a
    merged GFF for all samples.

    Args:
        args: namespace with ``files``, ``format``, ``out``, ``out_format``,
            ``hairpin``, ``sps``, ``gtf``, ``keep_name`` and ``low_memory``.

    Returns:
        None. Output is written to files under ``args.out``.
    """
    if args.low_memory:
        # Low-memory path streams output itself; nothing more to do here.
        read.reader(args)
        return None
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        elif args.format == "gff":
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        if args.format not in ["isomirsea", "srnabench", "manatee", 'optimir']:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        # BUGFIX: make_tools takes a list of formats — the per-sample call
        # previously passed a bare string, unlike the merged call below.
        h = header.create([sample], database,
                          header.make_tools([args.format]))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    # (the original repeated the args.low_memory check here, but it is
    # unreachable: the function already returned at the top in that case)
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
def low_memory_genomic_bam(bam_fn, sample, out_handle, args):
    """Annotate a genomic BAM in low-memory mode, streaming GFF output.

    The BAM is converted/sorted, turned into a bed file, intersected with
    the GTF and loaded into a small database; rows are then annotated one
    query-name group at a time so the file never lives fully in memory.
    """
    logger.info("Reading BAM file in low memory mode.")
    logger.warning("This is under development and variants can be unexact.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    database = guess_database(args)
    bed_fn = os.path.join(args.out, os.path.basename(bam_fn) + ".bed")
    logger.info("Making bed file.")
    _bed(bam_fn, bed_fn)
    logger.info("Intersecting bed file.")
    intersect_fn = intersect(bed_fn, args.gtf)
    logger.info("Loading database.")
    # TODO this'll return conn_reads and conn_counts
    conn = _read_lifted_bam_alpha(intersect_fn, bam_fn, args)
    rows = sql.select_all_reads(conn)

    def _emit(batch):
        # Annotate one batch of rows sharing a read id and write it out.
        # TODO counts of sequence = conn_counts.query UID
        # it could be counts only same location UID+chrom+start,
        # or counts all UID
        lifted = _read_lifted_lines(batch, precursors, database)
        ann = annotate(lifted, args.matures, args.precursors, quiet=True)
        gff_lines = body.create(ann, args.database, sample, args, quiet=True)
        body.write_body_on_handle(gff_lines, out_handle)

    logger.info("Analyzing database.")
    batch = []
    read_id = None
    for row in rows:
        if not read_id or read_id == row[0]:
            batch.append(row)
            read_id = row[0]
        else:
            _emit(batch)
            read_id = row[0]
            batch = [row]
    _emit(batch)
    conn.close()
    logger.info("Done")
def read_file_low_memory(fn, sample, args, out_handle):
    """Read a tabular isomiR file line by line and write GFF directly.

    Each line is parsed, annotated and appended to ``out_handle``
    immediately, so memory use stays constant regardless of file size.

    Args:
        fn: path to the tabular input file.
        sample: sample name used in the GFF output.
        args: namespace with ``precursors``, ``matures`` and ``database``.
        out_handle: open handle receiving the GFF body lines.
    """
    precursors = args.precursors
    # BUGFIX: removed dead ``reads = defaultdict(hits)`` — it was
    # overwritten on every loop iteration and never read.
    col_fix = 0
    with open(fn) as handle:
        header = handle.readline()
        # Files without a "freq" column shift every index by one.
        if header.find("freq") < 0:
            col_fix = 1
        for line in handle:
            reads = _read_line(line, col_fix, precursors)
            ann = annotate(reads, args.matures, args.precursors, quiet=True)
            gff_lines = body.create(ann, args.database, sample, args,
                                    quiet=True)
            body.write_body_on_handle(gff_lines, out_handle)
def create_iso(name, mir, seq, numsim, exp):
    """Simulate isomiR reads for one precursor and write them to FASTQ.

    For every mature miRNA annotated on precursor ``name``, generate up to
    ``numsim`` random isomiR variants (skipping duplicate sequences),
    optionally with a negative-binomial simulated expression, write them to
    the FASTQ outputs and return the GFF created from the simulated reads.

    Args:
        name: precursor name (key into ``mir``).
        mir: dict mapping precursor -> mature miRNA -> coordinate info.
        seq: precursor sequence.
        numsim: number of simulated variants per mature miRNA.
        exp: if truthy, simulate expression counts; otherwise count = 1.

    Returns:
        GFF lines produced by ``body.create`` for the simulated reads.
    """
    # BUGFIX: removed unused locals ``data`` and ``mirSeq``.
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for rand in range(int(numsim)):
            # expression: 1 by default, negative-binomial draw if requested
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            randSeq, iso.start, iso.t5, iso.t3, iso.subs, iso.add = variation(
                info, seq)
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            reads[query_name] = realign.hits()
            reads[query_name].set_sequence(randSeq)
            reads[query_name].counts = e
            reads[query_name].set_precursor(name, iso)
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    # NOTE(review): full_fq and clean_fq look like module-level output
    # paths — confirm they are defined at import time.
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
def reader(args):
    """Realign hits to miRBase for better accuracy and annotation.

    Dispatches on ``args.format`` to the matching parser, annotates the
    reads, writes one GFF per sample and a merged GFF for all samples.
    """
    samples = []
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    for file_name in args.files:
        if args.format != "gff":
            sample = op.splitext(op.basename(file_name))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        # GFF input is already annotated: just collect samples and body.
        if args.format == "gff":
            samples.extend(header.read_samples(file_name))
            out_dts[file_name] = body.read(file_name, args)
            continue
        if args.format == "BAM":
            reads = _read_bam(file_name, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(file_name, args)
        elif args.format == "srnabench":
            out_dts[file_name] = srnabench.read_file(file_name, args)
        elif args.format == "prost":
            reads = prost.read_file(file_name, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[file_name] = isomirsea.read_file(file_name, args)
        # Formats above that fill out_dts directly skip the annotation step.
        if args.format not in ["isomirsea", "srnabench"]:
            ann = annotate(reads, matures, precursors)
            out_dts[file_name] = body.create(ann, database, sample, args)
        h = header.create([sample], database, "")
        _write(out_dts[file_name], h, fn_out)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged, header.create(samples, database, ""), fn_merged_out)
def test_collapse(self):
    """testing GFF function"""
    from mirtop.libs import logger
    from mirtop.mirna import mapper, fasta
    from mirtop.gff import body, header
    logger.initialize_logger("test", True, True)
    logger = logger.getLogger(__name__)
    precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                      "hsa")
    # depend on https://github.com/miRTop/mirtop/issues/6
    matures = mapper.read_gtf_to_precursor(
        "data/examples/annotate/hsa.gff3")
    # matures = mirtop.mirna.read_mature(
    #     "data/examples/annotate/mirnas.gff", "hsa")
    from mirtop.bam import bam
    bam_fn = "data/aligments/collapsing-isomirs.sam"
    reads = bam.read_bam(bam_fn, precursors)
    ann = bam.annotate(reads, matures, precursors)
    fn = bam_fn + ".gff"
    h = header.create(bam_fn, ["example"], "miRBase21")
    # NOTE(review): ``header`` below passes the module itself, not ``h`` —
    # confirm against body.create's expected signature.
    gff = body.create(ann, "miRBase21", "example", fn, header)
    # BUGFIX: Python 2 ``print gff`` statement is a SyntaxError on
    # Python 3; converted to the print() function.
    print(gff)
    return True
def create_iso(name, mir, seq, numsim, exp):
    """Simulate isomiR reads for one precursor and write them to FASTQ.

    For every mature miRNA annotated on precursor ``name``, generate up to
    ``numsim`` random isomiR variants (skipping duplicate sequences),
    optionally with a negative-binomial simulated expression, write them to
    the FASTQ outputs and return the GFF created from the simulated reads.

    Args:
        name: precursor name (key into ``mir``).
        mir: dict mapping precursor -> mature miRNA -> coordinate info.
        seq: precursor sequence.
        numsim: number of simulated variants per mature miRNA.
        exp: if truthy, simulate expression counts; otherwise count = 1.

    Returns:
        GFF lines produced by ``body.create`` for the simulated reads.
    """
    # BUGFIX: removed unused locals ``data`` and ``mirSeq``.
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for rand in range(int(numsim)):
            # expression: 1 by default, negative-binomial draw if requested
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            randSeq, iso.start, iso.t5, iso.t3, iso.subs, iso.add = \
                variation(info, seq)
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            reads[query_name] = realign.hits()
            reads[query_name].set_sequence(randSeq)
            reads[query_name].counts = e
            reads[query_name].set_precursor(name, iso)
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
def annotate(fn, read_file, load=False, create=True):
    """Annotate a file (or preloaded reads) against the example hsa data.

    Args:
        fn: input file path passed to ``read_file``.
        read_file: reader callable, or — when ``load`` is True — the
            already-loaded reads themselves.
        load: when True, treat ``read_file`` as the reads, not a callable.
        create: when True, build and return the GFF body.

    Returns:
        The GFF body when ``create`` is True, otherwise None.
    """
    import argparse
    args = argparse.Namespace()
    args.hairpin = "data/examples/annotate/hairpin.fa"
    args.sps = "hsa"
    args.gtf = "data/examples/annotate/hsa.gff3"
    args.add_extra = True
    args.out_format = "gtf"
    from mirtop.mirna import fasta, mapper
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = precursors
    args.matures = matures
    args.database = mapper.guess_database(args.gtf)
    # BUGFIX: alias the module import so it no longer collides with this
    # function's own name, and avoid rebinding the ``body`` module below.
    from mirtop.mirna import annotate as annotate_mod
    from mirtop.gff import body
    if not load:
        reads = read_file(fn, args)
    else:
        reads = read_file
    if create:
        ann = annotate_mod.annotate(reads, matures, precursors)
        gff = body.create(ann, "miRBase21", "Example", args)
        return gff
    return None
def reader(args):
    """Realign hits to miRBase to get better accuracy and annotation.

    Legacy variant: reads each input file, annotates it and stores the
    created GFF body per file in ``out_dts``.
    """
    database = mapper.guess_database(args.gtf)
    # hairpin, mirna = download_mirbase(args)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    # check numbers of miRNA and precursors read
    # print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".gff")
        if args.format == "BAM":
            reads = _read_bam(fn, precursors)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, precursors)
            # NOTE(review): ``custom`` is never used afterwards — confirm
            # whether seqbuster.header() has a needed side effect.
            custom = seqbuster.header()
        elif args.format == "srnabench":
            # BUGFIX: was ``srnabench.read_gile`` — a typo; the sibling
            # reader uses ``srnabench.read_file``.
            reads = srnabench.read_file(fn, precursors)
        h = header.create([sample], database, "")
        ann = annotate(reads, matures, precursors)
        out_dts[fn] = body.create(ann, database, sample, fn_out, h)
def annotate(fn, precursors, matures):
    """Annotate reads from a BAM file and return the created GFF body.

    Args:
        fn: path to the BAM file.
        precursors: dict of precursor sequences.
        matures: dict of mature miRNA coordinates per precursor.

    Returns:
        The GFF body produced by ``body.create``.
    """
    from mirtop.bam import bam
    from mirtop.gff import body
    reads = bam.read_bam(fn, precursors)
    ann = bam.annotate(reads, matures, precursors)
    gff = body.create(ann, "miRBase21", "example", fn + ".gff3", "#")
    # BUGFIX: the computed GFF was built but never returned to the caller.
    return gff
def _analyze_line(line, precursors, database, sample, sep, args):
    """Convert one intersected BED row into an annotated GFF record.

    ``line`` is a split row from the bed/GTF intersection. Returns a dict
    with chrom/start/name/mirna and the formatted GFF line, or None when
    the row cannot be annotated (non-miRBase transcript, Ns in the
    sequence, missing start, or no GFF line produced for that position).
    """
    # Column offsets into the intersected row for precursor coordinates
    # and the attribute field carrying the precursor name.
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    if str(line).find(get_primary_transcript(guess_database(args))) < 0:
        # only working with mirbase
        return None
    logger.debug(("READ::line name:{0}").format(line))
    if sequence and sequence.find("N") > -1:
        # Skip reads containing ambiguous bases.
        return None
    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    # NOTE(review): ``Filter`` is assigned but never used below — confirm.
    Filter = "Pass"
    reads = dict()
    if not start:
        return None
    if strand == "+":
        # Translate genomic start into precursor-relative coordinates.
        start = int(start) - int(line[start_idx]) + 1
    else:
        # Minus strand: offset is measured from the precursor end.
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(sequence):
        # Read extends past the precursor; logged but not skipped here.
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                         chrom, len(sequence), len(precursors[chrom])))
    # tune() refines substitutions/additions/CIGAR against the precursor.
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom], start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))
    idu = make_id(sequence)
    # Build a single-read container and run the normal annotation chain.
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    # ``line`` is rebound here to the rendered GFF text for this position.
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors, args.matures)
        line = "%s Changes %s;" % (line, extra)
    line = paste_columns(feature(line), sep=sep)
    return {'chrom': chrom,
            'start': start,
            'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, line]}