Ejemplo n.º 1
0
def low_memory_bam(bam_fn, sample, out_handle, args):
    """Annotate a SAM/BAM file group by group and stream the GFF body.

    Consecutive alignments sharing the same ``query_name`` are collected,
    annotated and written to ``out_handle`` before the next group is read,
    so only one group of alignments is held in ``lines`` at a time.

    Args:
        bam_fn: path to the SAM/BAM file; it is passed through
            ``_sam_to_bam`` and ``_bam_sort`` before being opened.
        sample: sample name handed to ``body.create``.
        out_handle: handle receiving the GFF body lines.
        args: options namespace; reads ``genomic``, ``precursors``,
            ``matures`` and ``database``.

    Raises:
        ValueError: if ``args.genomic`` is set.
    """
    if args.genomic:
        raise ValueError(
            "low-memory option is not compatible with genomic coordinates.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)

    def _flush(group):
        """Annotate one group of alignments and write its GFF lines."""
        if not group:
            # Nothing was read (e.g. empty input): there is nothing to write.
            return
        reads = _read_lines(group, precursors, handle, args)
        ann = annotate(reads, args.matures, args.precursors, quiet=True)
        gff_lines = body.create(ann, args.database, sample, args, quiet=True)
        body.write_body_on_handle(gff_lines, out_handle)

    lines = []
    current = None
    try:
        for line in handle:
            # A change of query name closes the current group.
            if current and current != line.query_name:
                _flush(lines)
                lines = []
            lines.append(line)
            current = line.query_name
        # The last group is only complete once the file is exhausted.
        _flush(lines)
    finally:
        # _read_lines needs the open handle, so close only after all groups
        # were processed (or on error).
        handle.close()
Ejemplo n.º 2
0
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    Every file in ``args.files`` is read with the importer selected by
    ``args.format``. For all formats except ``gff`` a per-sample file is
    written into ``args.out``; afterwards all samples are merged and written
    as ``mirtop.<out_format>``.

    When ``args.low_memory`` is set, the work is delegated to
    ``read.reader`` and nothing else happens here.

    Args:
        args: options namespace. Reads ``low_memory``, ``hairpin``, ``sps``,
            ``gtf``, ``keep_name``, ``files``, ``format``, ``out`` and
            ``out_format``; ``database``, ``precursors`` and ``matures`` are
            stored back on it.

    Returns:
        None.
    """
    if args.low_memory:
        read.reader(args)
        return None
    samples = []
    database = mapper.guess_database(args)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = dict()
    if args.keep_name and len(args.files) > 1:
        logger.warning("--keep-name when running multiple samples\n"
                       "can generate wrong results if the\n"
                       "name read is different across sample\n"
                       "for the same sequence.")
    # Importers in this group already return GFF-like data, so their output
    # skips the annotate/body.create step below.
    preformatted = ("isomirsea", "srnabench", "manatee", "optimir")
    for fn in args.files:
        fn = op.normpath(fn)
        if args.format == "gff":
            # GFF inputs carry their own sample names and are only merged;
            # no per-sample file is written for them.
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue
        sample = op.splitext(op.basename(fn))[0]
        samples.append(sample)
        fn_out = op.join(args.out, sample + ".%s" % args.out_format)
        if args.format == "BAM":
            reads = _read_bam(fn, args)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, args)
        elif args.format == "srnabench":
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "prost":
            reads = prost.read_file(fn, precursors, database, args.gtf)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        elif args.format == "manatee":
            out_dts[fn] = manatee.read_file(fn, database, args)
        elif args.format == "optimir":
            out_dts[fn] = optimir.read_file(fn, args)
        if args.format not in preformatted:
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)
        # NOTE(review): make_tools receives a bare string here but a
        # one-element list for the merged header below -- confirm which form
        # it expects.
        h = header.create([sample], database, header.make_tools(args.format))
        _write(out_dts[fn], h, fn_out, args)
    # merge all reads for all samples into one dict
    # (the low-memory case already returned at the top of the function)
    merged = merge.merge(out_dts, samples)
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged,
           header.create(samples, database, header.make_tools([args.format])),
           fn_merged_out, args)
Ejemplo n.º 3
0
def low_memory_genomic_bam(bam_fn, sample, out_handle, args):
    """Annotate a genomic SAM/BAM file group by group via a lifted database.

    The alignments are converted to BED, intersected with ``args.gtf`` and
    loaded through ``_read_lifted_bam_alpha``. Rows returned by
    ``sql.select_all_reads`` are grouped by consecutive equal first column,
    and each group is annotated and written to ``out_handle`` before the next
    one is collected.

    Args:
        bam_fn: path to the SAM/BAM file; it is passed through
            ``_sam_to_bam`` and ``_bam_sort`` first.
        sample: sample name handed to ``body.create``.
        out_handle: handle receiving the GFF body lines.
        args: options namespace; reads ``precursors``, ``matures``,
            ``database``, ``out`` and ``gtf``.
    """
    logger.info("Reading BAM file in low memory mode.")
    logger.warning("This is under development and variants can be unexact.")
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    database = guess_database(args)
    bed_fn = os.path.join(args.out, os.path.basename(bam_fn) + ".bed")
    logger.info("Making bed file.")
    _bed(bam_fn, bed_fn)
    logger.info("Intersecting bed file.")
    intersect_fn = intersect(bed_fn, args.gtf)
    logger.info("Loading database.")
    # TODO this'll return conn_reads and conn_counts
    conn = _read_lifted_bam_alpha(intersect_fn, bam_fn, args)

    def _flush(group):
        """Annotate one group of rows and write its GFF lines."""
        if not group:
            # No rows at all: nothing to annotate or write.
            return
        # TODO counts of sequence = conn_counts.query UID
        # it could be counts only same location UID+chrom+start,
        # or counts all UID
        reads = _read_lifted_lines(group, precursors, database)
        ann = annotate(reads, args.matures, args.precursors, quiet=True)
        gff_lines = body.create(ann, args.database, sample, args, quiet=True)
        body.write_body_on_handle(gff_lines, out_handle)

    try:
        rows = sql.select_all_reads(conn)
        lines = []
        current = None
        logger.info("Analyzing database.")
        for row in rows:
            # row[0] is the grouping key; a change closes the current group.
            if current and current != row[0]:
                _flush(lines)
                lines = []
            lines.append(row)
            current = row[0]
        _flush(lines)
    finally:
        # Release the connection even when annotation fails midway.
        conn.close()
    logger.info("Done")
Ejemplo n.º 4
0
def read_file_low_memory(fn, sample, args, out_handle):
    """Annotate an input table line by line and stream the GFF body.

    The first line of ``fn`` is treated as a header. Every following line is
    parsed with ``_read_line``, annotated and written to ``out_handle``
    immediately, so reads are never accumulated across lines.

    Args:
        fn: path to the input file.
        sample: sample name handed to ``body.create``.
        args: options namespace; reads ``precursors``, ``matures`` and
            ``database``.
        out_handle: handle receiving the GFF body lines.
    """
    precursors = args.precursors
    with open(fn) as handle:
        header = handle.readline()
        # A header without a "freq" column shifts the column offset handed
        # to _read_line by one.
        col_fix = 0 if "freq" in header else 1
        for line in handle:
            reads = _read_line(line, col_fix, precursors)
            ann = annotate(reads, args.matures, args.precursors, quiet=True)
            gff_lines = body.create(ann,
                                    args.database,
                                    sample,
                                    args,
                                    quiet=True)
            body.write_body_on_handle(gff_lines, out_handle)
Ejemplo n.º 5
0
def create_iso(name, mir, seq, numsim, exp):
    """Simulate isomiRs for each miRNA of one precursor and build a GFF body.

    For every miRNA under ``mir[name]``, ``numsim`` variants are drawn with
    ``variation``. A variant whose sequence was already produced is skipped.
    The simulated reads are also written as FASTQ through ``write_fastq`` and
    ``write_collapse_fastq``.

    NOTE: the output paths ``full_fq`` and ``clean_fq`` are not parameters;
    they must exist at module level when this function is called.

    Args:
        name: precursor name, used as key into ``mir``.
        mir: mapping precursor -> miRNA name -> ``info``; ``info`` is handed
            to ``variation`` unchanged.
        seq: precursor sequence.
        numsim: number of simulation attempts per miRNA (cast with ``int``).
        exp: when truthy, draw each read count from a negative binomial
            distribution; otherwise every read gets a count of 1.

    Returns:
        The result of ``body.create`` for the simulated reads.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # expression
            counts = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                counts = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            randSeq, iso.start, iso.t5, iso.t3, iso.subs, iso.add = variation(
                info, seq)
            if randSeq in seen:
                # Keep a single read per distinct simulated sequence.
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            hit = realign.hits()
            hit.set_sequence(randSeq)
            hit.counts = counts
            hit.set_precursor(name, iso)
            reads[query_name] = hit
            full_read.extend(create_read(randSeq, counts))
            clean_read.append([randSeq, counts])
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
Ejemplo n.º 6
0
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    Each file in ``args.files`` is imported according to ``args.format``.
    Non-GFF inputs produce one output file per sample in ``args.out``; all
    inputs are then merged and written as ``mirtop.<out_format>``.

    Args:
        args: options namespace. Reads ``gtf``, ``hairpin``, ``sps``,
            ``files``, ``format``, ``out`` and ``out_format``; ``database``,
            ``precursors`` and ``matures`` are stored back on it.
    """
    samples = []
    database = mapper.guess_database(args.gtf)
    args.database = database
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    args.precursors = precursors
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.matures = matures
    # TODO check numbers of miRNA and precursors read
    # TODO print message if numbers mismatch
    out_dts = {}
    for fn in args.files:
        # GFF inputs are only collected for the merge step.
        if args.format == "gff":
            samples.extend(header.read_samples(fn))
            out_dts[fn] = body.read(fn, args)
            continue

        sample = op.splitext(op.basename(fn))[0]
        samples.append(sample)
        suffix = ".%s" % args.out_format
        fn_out = op.join(args.out, sample + suffix)

        if args.format == "srnabench":
            # Importer output is stored as-is, without annotation.
            out_dts[fn] = srnabench.read_file(fn, args)
        elif args.format == "isomirsea":
            out_dts[fn] = isomirsea.read_file(fn, args)
        else:
            if args.format == "BAM":
                reads = _read_bam(fn, args)
            elif args.format == "seqbuster":
                reads = seqbuster.read_file(fn, args)
            elif args.format == "prost":
                reads = prost.read_file(fn, precursors, database, args.gtf)
            ann = annotate(reads, matures, precursors)
            out_dts[fn] = body.create(ann, database, sample, args)

        sample_header = header.create([sample], database, "")
        _write(out_dts[fn], sample_header, fn_out)

    # merge all reads for all samples into one dict
    merged = merge.merge(out_dts, samples)
    merged_header = header.create(samples, database, "")
    fn_merged_out = op.join(args.out, "mirtop.%s" % args.out_format)
    _write(merged, merged_header, fn_merged_out)
Ejemplo n.º 7
0
 def test_collapse(self):
     """Build a GFF body from the collapsing-isomirs SAM example.

     Reads the example precursors and matures, annotates the alignments in
     ``data/aligments/collapsing-isomirs.sam`` and prints the resulting GFF.
     """
     from mirtop.libs import logger
     from mirtop.mirna import mapper, fasta
     from mirtop.gff import body, header
     logger.initialize_logger("test", True, True)
     logger = logger.getLogger(__name__)
     precursors = fasta.read_precursor("data/examples/annotate/hairpin.fa",
                                       "hsa")
     # depend on https://github.com/miRTop/mirtop/issues/6
     matures = mapper.read_gtf_to_precursor(
         "data/examples/annotate/hsa.gff3")
     # matures = mirtop.mirna.read_mature("data/examples/annotate/mirnas.gff", "hsa")
     from mirtop.bam import bam
     bam_fn = "data/aligments/collapsing-isomirs.sam"
     reads = bam.read_bam(bam_fn, precursors)
     ann = bam.annotate(reads, matures, precursors)
     fn = bam_fn + ".gff"
     h = header.create(bam_fn, ["example"], "miRBase21")
     # Pass the header built above, not the `header` module itself.
     gff = body.create(ann, "miRBase21", "example", fn, h)
     # Call form of print is valid on both Python 2 and Python 3.
     print(gff)
     return True
Ejemplo n.º 8
0
def create_iso(name, mir, seq, numsim, exp):
    """Simulate isomiRs for each miRNA of one precursor and build a GFF body.

    ``numsim`` variants are drawn with ``variation`` for every miRNA listed
    under ``mir[name]``; repeated sequences are skipped. The simulated reads
    are written as FASTQ with ``write_fastq`` / ``write_collapse_fastq`` and
    turned into a GFF body.

    NOTE: ``full_fq`` and ``clean_fq`` (the FASTQ destinations) are read from
    module scope, not from the arguments.

    Args:
        name: precursor name, used as key into ``mir``.
        mir: mapping precursor -> miRNA name -> ``info``; ``info`` is passed
            to ``variation`` as-is.
        seq: precursor sequence.
        numsim: number of simulation attempts per miRNA (cast with ``int``).
        exp: when truthy, sample each read count from a negative binomial
            distribution; otherwise the count is 1.

    Returns:
        The result of ``body.create`` for the simulated reads.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # expression
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            randSeq, iso.start, iso.t5, iso.t3, iso.subs, iso.add = variation(
                info, seq)
            if randSeq in seen:
                # Only the first occurrence of a simulated sequence is kept.
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            reads[query_name] = realign.hits()
            reads[query_name].set_sequence(randSeq)
            reads[query_name].counts = e
            reads[query_name].set_precursor(name, iso)
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
Ejemplo n.º 9
0
def annotate(fn, read_file, load=False, create=True):
    """Run an importer on the example data and optionally build the GFF body.

    Args:
        fn: input path passed to ``read_file`` (unused when ``load`` is true).
        read_file: importer callable invoked as ``read_file(fn, args)``; when
            ``load`` is true it is taken as the already-loaded reads instead.
        load: use ``read_file`` directly as the reads.
        create: annotate the reads and build the body with ``body.create``.

    Returns:
        The result of ``body.create`` when ``create`` is true; otherwise the
        reads as loaded (previously the ``body`` module object leaked out in
        that case).
    """
    import argparse
    from mirtop.mirna import fasta, mapper
    from mirtop.mirna import annotate as annotate_module
    from mirtop.gff import body

    args = argparse.Namespace()
    args.hairpin = "data/examples/annotate/hairpin.fa"
    args.sps = "hsa"
    args.gtf = "data/examples/annotate/hsa.gff3"
    args.add_extra = True
    args.out_format = "gtf"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = precursors
    args.matures = matures
    args.database = mapper.guess_database(args.gtf)
    if not load:
        reads = read_file(fn, args)
    else:
        reads = read_file
    if not create:
        return reads
    ann = annotate_module.annotate(reads, matures, precursors)
    # Distinct local name so the imported `body` module is not shadowed.
    gff_body = body.create(ann, "miRBase21", "Example", args)
    return gff_body
Ejemplo n.º 10
0
def reader(args):
    """
    Realign BAM hits to miRBase to get better accuracy and annotation.

    Each file in ``args.files`` is read with the importer selected by
    ``args.format`` (``BAM``, ``seqbuster`` or ``srnabench``), annotated and
    passed to ``body.create`` together with its output path and header.

    Args:
        args: options namespace; reads ``gtf``, ``hairpin``, ``sps``,
            ``files``, ``format`` and ``out``.
    """
    database = mapper.guess_database(args.gtf)
    # hairpin, mirna = download_mirbase(args)
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    # check numbers of miRNA and precursors read
    # print message if numbers mismatch
    out_dts = dict()
    for fn in args.files:
        sample = op.splitext(op.basename(fn))[0]
        fn_out = op.join(args.out, sample + ".gff")
        if args.format == "BAM":
            reads = _read_bam(fn, precursors)
        elif args.format == "seqbuster":
            reads = seqbuster.read_file(fn, precursors)
            # NOTE(review): `custom` is never used; header.create below gets
            # an empty string -- confirm whether it was meant to be passed.
            custom = seqbuster.header()
        elif args.format == "srnabench":
            # Was `read_gile`, a typo that raised AttributeError.
            reads = srnabench.read_file(fn, precursors)
        h = header.create([sample], database, "")
        ann = annotate(reads, matures, precursors)
        out_dts[fn] = body.create(ann, database, sample, fn_out, h)
Ejemplo n.º 11
0
def annotate(fn, read_file, load=False, create=True):
    """Run an importer on the example data and optionally build the GFF body.

    Args:
        fn: input path passed to ``read_file`` (unused when ``load`` is true).
        read_file: importer callable invoked as ``read_file(fn, args)``; when
            ``load`` is true it is taken as the already-loaded reads instead.
        load: use ``read_file`` directly as the reads.
        create: annotate the reads and build the body with ``body.create``.

    Returns:
        The result of ``body.create`` when ``create`` is true; otherwise the
        reads as loaded (previously the ``body`` module object leaked out in
        that case).
    """
    import argparse
    from mirtop.mirna import fasta, mapper
    from mirtop.mirna import annotate as annotate_module
    from mirtop.gff import body

    args = argparse.Namespace()
    args.hairpin = "data/examples/annotate/hairpin.fa"
    args.sps = "hsa"
    args.gtf = "data/examples/annotate/hsa.gff3"
    args.add_extra = True
    args.out_format = "gtf"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    args.precursors = precursors
    args.matures = matures
    args.database = mapper.guess_database(args.gtf)
    if not load:
        reads = read_file(fn, args)
    else:
        reads = read_file
    if not create:
        return reads
    ann = annotate_module.annotate(reads, matures, precursors)
    # Distinct local name so the imported `body` module is not shadowed.
    gff_body = body.create(ann, "miRBase21", "Example", args)
    return gff_body
Ejemplo n.º 12
0
 def annotate(fn, precursors, matures):
     """Read a BAM/SAM file, annotate its reads and build the GFF body.

     Args:
         fn: alignment file path; the GFF path handed to ``body.create`` is
             ``fn + ".gff3"``.
         precursors: precursor data passed to ``bam.read_bam`` and
             ``bam.annotate``.
         matures: mature miRNA data passed to ``bam.annotate``.

     Returns:
         The result of ``body.create`` (it used to be computed and dropped).
     """
     from mirtop.bam import bam
     from mirtop.gff import body
     reads = bam.read_bam(fn, precursors)
     ann = bam.annotate(reads, matures, precursors)
     gff = body.create(ann, "miRBase21", "example", fn + ".gff3", "#")
     return gff
Ejemplo n.º 13
0
def _analyze_line(line, precursors, database, sample, sep, args):
    """Turn one intersected row into an annotated GFF record.

    The row is indexed by position: ``line[1]``/``line[2]`` are used as read
    start/end, ``line[3]`` as query name, ``line[4]`` as sequence,
    ``line[5]`` as strand, ``line[6]`` as counts, ``line[10]``/``line[11]``
    as the reference coordinates the read position is made relative to, and
    ``line[15]`` as an attribute string whose ``Name=`` value names the
    precursor. (Presumably BED-intersect columns -- confirm against the
    code producing these rows.)

    Args:
        line: indexable row as described above.
        precursors: mapping precursor name -> sequence.
        database: unused here; ``args.database`` is what reaches
            ``body.create``.
        sample: sample name handed to ``body.create``.
        sep: separator handed to ``paste_columns``.
        args: options namespace; reads ``matures``, ``precursors``,
            ``database`` and ``add_extra``.

    Returns:
        ``None`` when the row is skipped (primary-transcript tag missing,
        sequence containing ``N``, empty start, or no GFF line at the
        computed start); otherwise a dict with keys ``chrom``, ``start``,
        ``name``, ``mirna`` and ``line``.
    """
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    # only working with mirbase
    if get_primary_transcript(guess_database(args)) not in str(line):
        return None

    logger.debug(("READ::line name:{0}").format(line))
    if sequence and "N" in sequence:
        return None

    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    reads = dict()
    if not start:
        return None
    # Convert the read position into an offset relative to the reference
    # feature given by line[10]/line[11], depending on strand.
    if strand == "+":
        start = int(start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" % (iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(sequence):
        # Only reported; processing continues with the overhanging read.
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                                             chrom,
                                             len(sequence),
                                             len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom],
        start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(sequence)
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors,
                                args.matures)
        line = "%s Changes %s;" % (line, extra)

    line = paste_columns(feature(line), sep=sep)
    return {'chrom': chrom,
            'start': start,
            'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, line]}