Example #1
def _read_lifted_bam_alpha(bed_fn, bam_fn, args):
    database = guess_database(args)
    conn = sql.create_connection()
    key = "name" if args.keep_name else "sequence"
    sql.create_reads_table(conn, key)
    # TODO create counts table sequence and autoincrement or from read
    cur = conn.cursor()
    counts = 0
    seen = set()
    for line in bed_fn:
        fields = _parse_intersect(line, database, bed=True)
        # TODO add sequence to count table args.quant on/off name=UID or name=UID+chrom+pos
        if fields:
            hit = ".".join(fields[:3])
            if hit not in seen:
                counts += 1
                sql.insert_row_in_reads_table(cur, fields)
                seen.add(hit)
        # if counts == 1000:
        #     counts = 0
    del seen
    logger.info("Read %s lines that intersected with miRNAs." % counts)
    conn.commit()
    # TODO this'll return conn_reads and conn_counts
    return conn
Example #2
def convert(args):
    samples = []
    database = mapper.guess_database(args.gtf)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    for fn in args.files:
        read_file(fn, precursors, matures)
Example #3
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation
    """
    if args.low_memory:
        read.reader(args)
        return None
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        elif args.format == "gff":
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        if args.format not in ["isomirsea", "srnabench", "manatee", "optimir"]:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, header.make_tools([args.format]))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    if args.low_memory:
        return None
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
Example #4
 def test_database(self):
     from mirtop.mirna import mapper
     args = argparse.Namespace()
     args.gtf = "data/examples/annotate/hsa.gff3"
     args.database = None
     db = mapper.guess_database(args)
     print("Database is %s" % db)
     if db != "miRBasev21":
         raise ValueError("%s not eq to miRBasev21" % db)
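A side note on call signatures: the snippets on this page mix two styles of guess_database. Older helpers pass the GFF3 path directly, while Example #4 above builds an argparse Namespace and passes that. A minimal, hedged sketch of both styles, using only paths and values that appear in the tests here:

import argparse
from mirtop.mirna import mapper

# Older style: pass the GFF3 path directly (as in the test_database snippets
# near the end of this page).
db = mapper.guess_database("data/examples/annotate/hsa.gff3")

# Newer style: pass the argparse Namespace (as in Example #4 above), which
# sets args.gtf and args.database = None before the call.
args = argparse.Namespace()
args.gtf = "data/examples/annotate/hsa.gff3"
args.database = None
db = mapper.guess_database(args)
# The tests above expect "miRBasev21" for this GFF3 file.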
Example #5
def _read_lifted_bam(handle, reads, args, clean):
    indels_skip = 0
    precursors = args.precursors
    database = guess_database(args)
    for line in handle:
        rows = _parse_intersect(line, database, bed=True)
        reads = _analyze_lifted_line(rows, reads, precursors, database)
    logger.info("Hits: %s" % len(reads))
    logger.info("Hits with indels %s" % indels_skip)
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
Example #6
def low_memory_genomic_bam(bam_fn, sample, out_handle, args):
    logger.info("Reading BAM file in low memory mode.")
    logger.warning("This is under development and variants can be unexact.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    database = guess_database(args)
    bed_fn = os.path.join(args.out, os.path.basename(bam_fn) + ".bed")
    logger.info("Making bed file.")
    _bed(bam_fn, bed_fn)
    logger.info("Intersecting bed file.")
    intersect_fn = intersect(bed_fn, args.gtf)
    logger.info("Loading database.")
    # TODO this'll return conn_reads and conn_counts
    conn = _read_lifted_bam_alpha(intersect_fn, bam_fn, args)
    rows = sql.select_all_reads(conn)
    lines = []
    current = None
    logger.info("Analyzing database.")
    for row in rows:
        if not current or current == row[0]:
            lines.append(row)
            current = row[0]
        else:
            # TODO counts of sequence = conn_counts.query UID
            # it could be counts only same location UID+chrom+start, or counts all UID
            reads = _read_lifted_lines(lines, precursors, database)
            ann = annotate(reads, args.matures, args.precursors, quiet=True)
            gff_lines = body.create(ann,
                                    args.database,
                                    sample,
                                    args,
                                    quiet=True)
            body.write_body_on_handle(gff_lines, out_handle)
            current = row[0]
            lines = [row]
    reads = _read_lifted_lines(lines, precursors, database)
    ann = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_lines = body.create(ann, args.database, sample, args, quiet=True)
    body.write_body_on_handle(gff_lines, out_handle)
    conn.close()
    logger.info("Done")
Example #7
def read_bam(bam_fn, args, clean=True):
    """
    Read BAM file and perform realignment of hits.

    Args:
        *bam_fn*: a BAM file with alignments to the precursors.

        *args*: run arguments; precursor sequences are read from
            *args.precursors*, a dict mapping precursor names to sequences
            as returned by mirtop.mirna.fasta.read_precursor().

        *clean*: use mirtop.filter.clean_hits() to remove lower-score hits.

    Returns:
        *reads (dict)*:
            keys are read_id and values are *mirtop.realign.hits*

    """
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    reads = defaultdict(hits)
    if args.genomic:
        logger.warning(
            "This is under development and variants can be unexact.")
        bed_fn = os.path.join(args.out, os.path.basename(bam_fn) + ".bed")
        logger.info("Making bed file.")
        _bed(bam_fn, bed_fn)
        logger.info("Intersecting bed file.")
        intersect_fn = intersect(bed_fn, args.gtf)
        # logger.info("Analyzing hits.")
        # reads = _read_lifted_bam(intersect_fn, reads, args, clean)
        logger.info("Loading database.")
        conn = _read_lifted_bam_alpha(intersect_fn, bam_fn, args)
        rows = sql.select_all_reads(conn)
        logger.info("Analyzing database.")
        precursors = args.precursors
        database = guess_database(args)
        reads = _read_lifted_lines(rows, precursors, database)
        conn.close()
    else:
        reads = _read_original_bam(bam_fn, reads, args, clean)
    logger.info("Done.")
    return reads
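For orientation, here is a hypothetical way to drive read_bam() as defined above. The Namespace attributes mirror what the snippet reads from args, the data/examples paths are copied from the test helpers on this page, and "example.bam" plus the output directory are placeholders:

import argparse
from mirtop.mirna import fasta, mapper

args = argparse.Namespace()
args.hairpin = "data/examples/annotate/hairpin.fa"
args.sps = "hsa"
args.gtf = "data/examples/annotate/hsa.gff3"
args.genomic = False               # realign against precursors, not genome-lifted hits
args.out = "out_dir"               # output directory (placeholder)
args.precursors = fasta.read_precursor(args.hairpin, args.sps)
args.matures = mapper.read_gtf_to_precursor(args.gtf)
args.database = mapper.guess_database(args.gtf)   # some versions take `args` instead

reads = read_bam("example.bam", args, clean=True)  # read_bam() from the snippet above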
Example #8
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation
    """
    samples = []
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "gff":
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        if args.format not in ["isomirsea", "srnabench"]:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        h = header.create([sample], database, "")
        _write(out_dts[fn], h, fn_out)
    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged, header.create(samples, database, ""), fn_merged_out)
Example #9
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation
    """
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format != "gff":
            sample = op.splitext(op.basename(fn))[0]
            samples.append(sample)
            fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        h = header.create([sample], args.database, "")
        out_handle = open(fn_out, 'w')
        print(h, file=out_handle)
        if args.format == "BAM":
            if args.genomic:
                low_memory_genomic_bam(fn, sample, out_handle, args)
            else:
                low_memory_bam(fn, sample, out_handle, args)
        elif args.format == "seqbuster":
            seqbuster.read_file_low_memory(fn, sample, args, out_handle)
        else:
            raise ValueError("%s not supported for low memory" % args.format)
        out_handle.close()
Example #10
def annotate(fn, read_file, load=False, create=True):
    import argparse
    args = argparse.Namespace()
    args.hairpin = "data/examples/annotate/hairpin.fa"
    args.sps = "hsa"
    args.gtf = "data/examples/annotate/hsa.gff3"
    args.add_extra = True
    args.out_format = "gtf"
    from mirtop.mirna import fasta, mapper
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = precursors
    args.matures = matures
    args.database = mapper.guess_database(args.gtf)
    from mirtop.mirna import annotate
    from mirtop.gff import body
    if not load:
        reads = read_file(fn, args)
    else:
        reads = read_file
    if create:
        ann = annotate.annotate(reads, matures, precursors)
        body = body.create(ann, "miRBase21", "Example", args)
    return body
Example #11
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation
    """
    database = mapper.guess_database(args.gtf)
    # hairpin, mirna = download_mirbase(args)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    # check numbers of miRNA and precursors read
    # print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".gff")
        if args.format == "BAM":
            reads = _read_bam(fn, precursors)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, precursors)
            custom = seqbuster.header()
        elif args.format == "srnabench":
        reads = srnabench.read_file(fn, precursors)
        h = header.create([sample], database, "")
        ann = annotate(reads, matures, precursors)
        out_dts[fn] = body.create(ann, database, sample, fn_out, h)
Example #12
 def test_database(self):
     from mirtop.mirna import mapper
     db = mapper.guess_database("data/examples/annotate/hsa.gff3")
     print "Database is %s" % db
     if db != "miRBasev21":
         raise ValueError("%s not eq to miRBasev21" % db)
Example #13
def _analyze_line(line, precursors, database, sample, sep, args):
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    # only working with miRBase primary transcripts
    if str(line).find(get_primary_transcript(guess_database(args))) < 0:
        return None

    logger.debug(("READ::line name:{0}").format(line))
    if sequence and sequence.find("N") > -1:
        return None

    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    Filter = "Pass"
    reads = dict()
    if not start:
        return None
    if strand == "+":
        start = int(start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" % (iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                                             chrom,
                                             len(sequence),
                                             len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom],
        start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(sequence)
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors,
                                args.matures)
        line = "%s Changes %s;" % (line, extra)

    line = paste_columns(feature(line), sep=sep)
    return {'chrom': chrom,
            'start': start,
            'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, line]}
Example #14
 def test_database(self):
     from mirtop.mirna import mapper
     db = mapper.guess_database("data/examples/annotate/hsa.gff3")
     print("Database is %s" % db)
     if db != "miRBasev21":
         raise ValueError("%s not eq to miRBasev21" % db)
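One caveat when reading these snippets: each is extracted from its module, so the imports it relies on are not shown. A best-guess reconstruction of the common ones follows; anything not shown verbatim in the snippets above should be treated as an assumption.

import os
import os.path as op                     # used above as op.join / op.splitext / op.basename
from collections import defaultdict      # reads = defaultdict(hits)

from mirtop.mirna import fasta, mapper, annotate   # shown verbatim in the test helpers
from mirtop.mirna.mapper import guess_database     # assumed; the mapper module is confirmed above
from mirtop.gff import body                        # shown verbatim in the test helpers
# header, merge, sql, filter, seqbuster, srnabench, prost, isomirsea, manatee,
# optimir, hits, isomir, make_id, feature, paste_columns, variant_with_nt, etc.
# also come from mirtop modules whose exact import paths are not visible here.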