Example #1
0
def low_memory_bam(bam_fn, sample, out_handle, args):
    """Annotate a BAM/SAM file group by group and stream GFF lines out.

    Consecutive alignments that share the same ``query_name`` are collected
    into one group; each group is parsed, annotated and written to
    ``out_handle`` before the next group is read.

    Args:
        bam_fn: path to the alignment file; it is passed through
            ``_sam_to_bam`` and ``_bam_sort`` before being opened.
        sample: sample name forwarded to ``body.create``.
        out_handle: handle forwarded to ``body.write_body_on_handle``.
        args: options object; ``genomic``, ``precursors``, ``matures`` and
            ``database`` are read here.

    Raises:
        ValueError: if ``args.genomic`` is set.
    """
    if args.genomic:
        raise ValueError(
            "low-memory option is not compatible with genomic coordinates.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)

    def _flush(batch):
        # Parse, annotate and write one group of alignments.
        reads = _read_lines(batch, precursors, handle, args)
        ann = annotate(reads, args.matures, args.precursors, quiet=True)
        gff_lines = body.create(ann, args.database, sample, args, quiet=True)
        body.write_body_on_handle(gff_lines, out_handle)

    try:
        lines = []
        current = None
        for line in handle:
            # A different query name closes the group collected so far.
            if current and current != line.query_name:
                _flush(lines)
                lines = []
            lines.append(line)
            current = line.query_name
        # The last group is never followed by a name change; flush it here.
        _flush(lines)
    finally:
        # Release the alignment file even if annotation fails midway.
        handle.close()
Example #2
0
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation

    When ``args.low_memory`` is set the work is delegated to ``read.reader``
    and nothing else is done. Otherwise every file in ``args.files`` is read
    according to ``args.format``, written to its own output file (except for
    the "gff" format, which is only loaded), and finally all samples are
    merged into ``mirtop.<out_format>`` inside ``args.out``.

    Side effects: sets ``args.database``, ``args.precursors`` and
    ``args.matures``.

    Returns:
        None
    """
    if args.low_memory:
        read.reader(args)
        return None
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        elif args.format == "gff":
            # GFF inputs carry their own sample names and are only merged,
            # never re-written per sample.
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        # These importers already return the per-file body; the others
        # return reads that still need annotation.
        if args.format not in ["isomirsea", "srnabench", "manatee", 'optimir']:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        # NOTE(review): make_tools is given a list here to match the merged
        # header call below; the original passed the bare string -- confirm
        # against header.make_tools.
        h = header.create([sample], database,
                          header.make_tools([args.format]))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
Example #3
0
def low_memory_genomic_bam(bam_fn, sample, out_handle, args):
    """Annotate a genomic BAM file through an intermediate read database.

    The alignment file is converted/sorted, turned into a BED file next to
    ``args.out``, intersected with ``args.gtf`` and loaded through
    ``_read_lifted_bam_alpha``. Rows returned by ``sql.select_all_reads`` are
    grouped by their first column; each group is annotated and written to
    ``out_handle`` before the next one is processed.

    Args:
        bam_fn: path to the alignment file.
        sample: sample name forwarded to ``body.create``.
        out_handle: handle forwarded to ``body.write_body_on_handle``.
        args: options object; ``precursors``, ``matures``, ``database``,
            ``out`` and ``gtf`` are read here.
    """
    logger.info("Reading BAM file in low memory mode.")
    logger.warning("This is under development and variants can be unexact.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    database = guess_database(args)
    bed_fn = os.path.join(args.out, os.path.basename(bam_fn) + ".bed")
    logger.info("Making bed file.")
    _bed(bam_fn, bed_fn)
    logger.info("Intersecting bed file.")
    intersect_fn = intersect(bed_fn, args.gtf)
    logger.info("Loading database.")
    # TODO this'll return conn_reads and conn_counts
    conn = _read_lifted_bam_alpha(intersect_fn, bam_fn, args)

    def _flush(batch):
        # Annotate and write one group of rows sharing the same first column.
        # TODO counts of sequence = conn_counts.query UID
        # it could be counts only same location UID+chrom+start, or counts all UID
        reads = _read_lifted_lines(batch, precursors, database)
        ann = annotate(reads, args.matures, args.precursors, quiet=True)
        gff_lines = body.create(ann, args.database, sample, args, quiet=True)
        body.write_body_on_handle(gff_lines, out_handle)

    try:
        rows = sql.select_all_reads(conn)
        lines = []
        current = None
        logger.info("Analyzing database.")
        for row in rows:
            # A change in the first column closes the group collected so far.
            if current and current != row[0]:
                _flush(lines)
                lines = []
            lines.append(row)
            current = row[0]
        # Flush the trailing group that no key change terminates.
        _flush(lines)
    finally:
        # Close the connection even when annotation raises.
        conn.close()
    logger.info("Done")
Example #4
0
def read_file_low_memory(fn, sample, args, out_handle):
    """Annotate a file one line at a time and stream GFF lines out.

    The first line of ``fn`` is treated as a header and is not annotated.
    When that header does not contain the text "freq", a column offset of 1
    is passed to ``_read_line``; otherwise the offset is 0.

    Args:
        fn: path of the file to read.
        sample: sample name forwarded to ``body.create``.
        args: options object; ``precursors``, ``matures`` and ``database``
            are read here.
        out_handle: handle forwarded to ``body.write_body_on_handle``.
    """
    precursors = args.precursors
    with open(fn) as handle:
        first_line = handle.readline()
        col_fix = 0 if "freq" in first_line else 1
        for line in handle:
            # Each line is parsed, annotated and written independently so
            # nothing accumulates in memory.
            reads = _read_line(line, col_fix, precursors)
            ann = annotate(reads, args.matures, args.precursors, quiet=True)
            gff_lines = body.create(ann,
                                    args.database,
                                    sample,
                                    args,
                                    quiet=True)
            body.write_body_on_handle(gff_lines, out_handle)
Example #5
0
def reader(args):
    """
    Read every input file, annotate it and write per-sample and merged output.

    Sets ``args.database``, ``args.precursors`` and ``args.matures`` as a
    side effect. Files in "gff" format are only loaded for the final merge;
    every other format is also written to ``<sample>.<out_format>`` inside
    ``args.out``. The merged result goes to ``mirtop.<out_format>``.
    """
    sample_names = []
    db_name = mapper.guess_database(args.gtf)
    args.database = db_name
    hairpins = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = hairpins
    mature_map = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = mature_map
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    per_file = {}
    for path in args.files:
        fmt = args.format
        if fmt == "gff":
            # Already a GFF body: keep it for the merge, nothing to write.
            sample_names.extend(header.read_samples(path))
            per_file[path] = body.read(path, args)
            continue

        name = op.splitext(op.basename(path))[0]
        sample_names.append(name)
        target = op.join(args.out, name + ".%s" % args.out_format)

        if fmt == "srnabench":
            per_file[path] = srnabench.read_file(path, args)
        elif fmt == "isomirsea":
            per_file[path] = isomirsea.read_file(path, args)
        else:
            # Remaining importers return reads that still need annotation.
            if fmt == "BAM":
                reads = _read_bam(path, args)
            elif fmt == "seqbuster":
                reads = seqbuster.read_file(path, args)
            elif fmt == "prost":
                reads = prost.read_file(path, hairpins, db_name, args.gtf)
            annotated = annotate(reads, mature_map, hairpins)
            per_file[path] = body.create(annotated, db_name, name, args)

        sample_header = header.create([name], db_name, "")
        _write(per_file[path], sample_header, target)

    # merge all reads for all samples into one dict
    combined = merge.merge(per_file, sample_names)
    merged_target = op.join(args.out, "mirtop.%s" % args.out_format)
    merged_header = header.create(sample_names, db_name, "")
    _write(combined, merged_header, merged_target)
Example #6
0
def annotate(fn, read_file, load=False, create=True):
    """Run an importer on example data and optionally build the GFF body.

    The reference files under ``data/examples/annotate`` (hairpin FASTA and
    hsa GFF3) are loaded into a fresh ``argparse.Namespace`` that is handed
    to the importer and to ``body.create``.

    Args:
        fn: path passed to ``read_file`` when ``load`` is false.
        read_file: callable invoked as ``read_file(fn, args)``; when ``load``
            is true it is taken as the already-loaded reads instead.
        load: treat ``read_file`` as the reads rather than calling it.
        create: annotate the reads and build the body with ``body.create``.

    Returns:
        The value returned by ``body.create`` when ``create`` is true;
        otherwise the reads. (Previously the imported ``body`` module was
        returned in that case, because the local name was never rebound.)
    """
    import argparse
    from mirtop.gff import body as gff_body
    from mirtop.mirna import annotate as mirna_annotate
    from mirtop.mirna import fasta, mapper

    args = argparse.Namespace()
    args.hairpin = "data/examples/annotate/hairpin.fa"
    args.sps = "hsa"
    args.gtf = "data/examples/annotate/hsa.gff3"
    args.add_extra = True
    args.out_format = "gtf"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = precursors
    args.matures = matures
    args.database = mapper.guess_database(args.gtf)
    reads = read_file if load else read_file(fn, args)
    if not create:
        return reads
    ann = mirna_annotate.annotate(reads, matures, precursors)
    return gff_body.create(ann, "miRBase21", "Example", args)
Example #7
0
def annotate(fn, read_file, load=False, create=True):
    """Run an importer on example data and optionally build the GFF body.

    The reference files under ``data/examples/annotate`` (hairpin FASTA and
    hsa GFF3) are loaded into a fresh ``argparse.Namespace`` that is handed
    to the importer and to ``body.create``.

    Args:
        fn: path passed to ``read_file`` when ``load`` is false.
        read_file: callable invoked as ``read_file(fn, args)``; when ``load``
            is true it is taken as the already-loaded reads instead.
        load: treat ``read_file`` as the reads rather than calling it.
        create: annotate the reads and build the body with ``body.create``.

    Returns:
        The value returned by ``body.create`` when ``create`` is true;
        otherwise the reads. (Previously the imported ``body`` module was
        returned in that case, because the local name was never rebound.)
    """
    import argparse
    from mirtop.gff import body as gff_body
    from mirtop.mirna import annotate as mirna_annotate
    from mirtop.mirna import fasta, mapper

    args = argparse.Namespace()
    args.hairpin = "data/examples/annotate/hairpin.fa"
    args.sps = "hsa"
    args.gtf = "data/examples/annotate/hsa.gff3"
    args.add_extra = True
    args.out_format = "gtf"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = precursors
    args.matures = matures
    args.database = mapper.guess_database(args.gtf)
    reads = read_file if load else read_file(fn, args)
    if not create:
        return reads
    ann = mirna_annotate.annotate(reads, matures, precursors)
    return gff_body.create(ann, "miRBase21", "Example", args)
Example #8
0
def _analyze_line(line, precursors, database, sample, sep, args):
    """Annotate one intersected read record and format it as a GFF column set.

    Args:
        line: indexable record; positions 1, 2, 3, 4, 5, 6, 10, 11 and 15 are
            read here (start, end, query name, sequence, strand, counts,
            the two coordinates used to make the start relative, and the
            attribute text holding ``Name=``).
        precursors: mapping from precursor name to its sequence.
        database: not used in this function; ``args.database`` is what gets
            passed to ``body.create``.
        sample: sample name forwarded to ``body.create``.
        sep: separator forwarded to ``paste_columns``.
        args: options object; ``matures``, ``precursors``, ``database`` and
            ``add_extra`` are read here.

    Returns:
        ``None`` when the record is skipped (primary-transcript text absent
        from the record, sequence containing "N", falsy start, or no GFF
        entry at the computed start); otherwise a dict with keys ``chrom``,
        ``start``, ``name``, ``mirna`` and ``line``.
    """
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    # only working with mirbase
    if get_primary_transcript(guess_database(args)) not in str(line):
        return None

    logger.debug(("READ::line name:{0}").format(line))
    if sequence and "N" in sequence:
        return None

    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    reads = dict()
    if not start:
        return None
    # Convert the record coordinates into a position relative to the
    # feature bounds stored at start_idx / end_idx.
    if strand == "+":
        start = int(start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" % (iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                                             chrom,
                                             len(sequence),
                                             len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom],
        start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(sequence)
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    # Kept in its own name so the input record ``line`` is not rebound.
    gff_text = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % gff_text)
    if args.add_extra:
        extra = variant_with_nt(gff_text, args.precursors,
                                args.matures)
        gff_text = "%s Changes %s;" % (gff_text, extra)

    gff_text = paste_columns(feature(gff_text), sep=sep)
    return {'chrom': chrom,
            'start': start,
            'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, gff_text]}