Exemple #1
0
def _analyze_lifted_line(line, reads, precursors, database):
    """Register one lifted alignment record as a hit on its precursor.

    Args:
        line: indexable record; items 0-3 are read as the query name,
            the read sequence, the precursor name and the start position.
        reads: mapping of read name to hit objects. It must create the
            entry on first access (e.g. ``defaultdict(hits)``).
        precursors: mapping of precursor name to precursor sequence.
        database: not used by this function.

    Returns:
        The *reads* mapping that was passed in, updated in place when the
        record is accepted.
    """
    query_name, sequence = line[0], line[1]
    logger.debug("READ::line name:{0}".format(line))

    # Reads carrying ambiguous bases are ignored.
    if sequence and "N" in sequence:
        return reads

    first_time = query_name not in reads
    hit = reads[query_name]
    if first_time:
        hit.set_sequence(sequence)
        hit.counts = _get_freq(query_name)
        hit.sequence = sequence

    chrom, start = line[2], line[3]
    read_len = len(hit.sequence)

    iso = isomir()
    iso.align = line
    iso.set_pos(start, read_len)
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))

    precursor = precursors[chrom]
    # The read must fit inside the precursor sequence.
    if start + read_len > len(precursor):
        logger.debug(
            "READ::%s start + %s sequence size are bigger than"
            " size precursor %s" %
            (chrom, read_len, len(precursor)))
        return reads

    tuned = filter.tune(hit.sequence, precursor, start, None)
    iso.subs, iso.add, iso.cigar = tuned
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    hit.set_precursor(chrom, iso)
    return reads
Exemple #2
0
def read_bam(bam_fn, args, clean=True):
    """
    Read bam file and perform realignment of hits

    Args:
        *bam_fn*: a BAM file with alignments to the precursor

        *args*: object whose ``precursors`` attribute is a dict with keys
            being precursor names and values being sequences.
            Come from mirtop.mirna.fasta.read_precursor().

        *clean*: Use mirtop.filter.clean_hits() to remove lower score hits.

    Returns:
        *reads (dict)*:
             keys are read_id and values are *mirtop.realign.hits*

    """
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    try:
        for line in handle:
            if line.reference_id < 0:
                logger.debug("Sequence not mapped: %s" % line.reference_id)
                continue
            query_name = line.query_name
            # A record without sequence cannot initialise a new read: the
            # length of the stored sequence is required further down.
            if query_name not in reads and line.query_sequence is None:
                continue
            if line.query_sequence and line.query_sequence.find("N") > -1:
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(line.query_sequence)
                reads[query_name].counts = _get_freq(query_name)
            # Reverse-strand alignments are registered as reads but never
            # produce a precursor hit.
            if line.is_reverse:
                logger.debug("Sequence is reverse: %s" % line.query_name)
                continue
            chrom = handle.getrname(line.reference_id)
            cigar = line.cigartuples
            iso = isomir()
            iso.align = line
            iso.set_pos(line.reference_start, len(reads[query_name].sequence))
            logger.debug("READ::From BAM start %s end %s" %
                         (iso.start, iso.end))
            # Skip alignments that run past the end of the precursor.
            if len(precursors[chrom]) < line.reference_start + len(
                    reads[query_name].sequence):
                continue
            iso.subs, iso.add, iso.cigar = filter.tune(
                reads[query_name].sequence, precursors[chrom],
                line.reference_start, cigar)
            logger.debug("READ::After tune start %s end %s" %
                         (iso.start, iso.end))
            # Only hits with fewer than two substitutions are kept.
            if len(iso.subs) < 2:
                reads[query_name].set_precursor(chrom, iso)
    finally:
        # Release the file handle even if a record fails to parse.
        handle.close()
    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
Exemple #3
0
def read_bam(bam_fn, args, clean=True):
    """
    Read bam file and perform realignment of hits

    Args:
        *bam_fn*: a BAM file with alignments to the precursor

        *args*: object whose ``precursors`` attribute is a dict with keys
            being precursor names and values being sequences.
            Come from mirtop.mirna.fasta.read_precursor().

        *clean*: Use mirtop.filter.clean_hits() to remove lower score hits.

    Returns:
        *reads (dict)*:
             keys are read_id and values are *mirtop.realign.hits*

    """
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    try:
        for line in handle:
            if line.reference_id < 0:
                logger.debug("Sequence not mapped: %s" % line.reference_id)
                continue
            query_name = line.query_name
            # Without a sequence a new read cannot be initialised; its
            # length is needed to position the hit below.
            if query_name not in reads and line.query_sequence is None:
                continue
            if line.query_sequence and line.query_sequence.find("N") > -1:
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(line.query_sequence)
                reads[query_name].counts = _get_freq(query_name)
            # Reverse-strand alignments never produce a precursor hit.
            if line.is_reverse:
                logger.debug("Sequence is reverse: %s" % line.query_name)
                continue
            chrom = handle.getrname(line.reference_id)
            cigar = line.cigartuples
            iso = isomir()
            iso.align = line
            iso.set_pos(line.reference_start, len(reads[query_name].sequence))
            logger.debug("READ::From BAM start %s end %s" %
                         (iso.start, iso.end))
            # Skip alignments that run past the end of the precursor.
            if len(precursors[chrom]) < line.reference_start + len(
                    reads[query_name].sequence):
                continue
            iso.subs, iso.add, iso.cigar = filter.tune(
                reads[query_name].sequence, precursors[chrom],
                line.reference_start, cigar)
            logger.debug("READ::After tune start %s end %s" %
                         (iso.start, iso.end))
            # Only hits with fewer than two substitutions are kept.
            if len(iso.subs) < 2:
                reads[query_name].set_precursor(chrom, iso)
    finally:
        # Always release the alignment file, also on errors.
        handle.close()
    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
Exemple #4
0
def read_file(fn, args):
    """
    Read a seqbuster file and collect the hits of every read.

    Args:
        *fn(str)*: file name with seqbuster output information. The first
            line is skipped; the remaining lines are tab separated.

        *args*: object whose ``precursors`` attribute maps precursor
            names to sequences.

    Returns:
        *reads*: dictionary where keys are read_id and values are *mirtop.realign.hits*

    """
    precursors = args.precursors
    reads = defaultdict(hits)
    with open(fn) as handle:
        next(handle, None)  # drop the first line of the file
        for line in handle:
            cols = line.strip().split("\t")
            query_sequence, query_name = cols[0], cols[1]
            reference_start = int(cols[4]) - 1  # file positions are 1-based
            seqbuster_iso = ":".join(cols[6:10])
            if query_sequence is None and query_name not in reads:
                continue
            # Reads with ambiguous bases are ignored.
            if query_sequence and "N" in query_sequence:
                continue
            first_time = query_name not in reads
            hit = reads[query_name]
            if first_time:
                hit.set_sequence(query_sequence)
                hit.counts = _get_freq(query_name)
            chrom = cols[13]
            logger.debug(
                "\nSEQBUSTER::NEW::query: {query_sequence}\n"
                "  precursor {chrom}\n"
                "  name:  {query_name}\n"
                "  start: {reference_start}\n"
                "  iso: {seqbuster_iso}".format(
                    query_sequence=query_sequence,
                    chrom=chrom,
                    query_name=query_name,
                    reference_start=reference_start,
                    seqbuster_iso=seqbuster_iso))
            read_len = len(hit.sequence)
            iso = isomir()
            iso.align = line
            iso.set_pos(reference_start, read_len)
            logger.debug("SEQBUSTER:: start %s end %s" % (iso.start, iso.end))
            precursor = precursors[chrom]
            # The read has to fit inside the precursor.
            if reference_start + read_len > len(precursor):
                continue
            tuned = filter.tune(hit.sequence, precursor, reference_start, None)
            iso.subs, iso.add, iso.cigar = tuned
            logger.debug("SEQBUSTER::After tune start %s end %s" %
                         (iso.start, iso.end))
            # Keep only hits with fewer than two substitutions.
            if len(iso.subs) < 2:
                hit.set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    return reads
Exemple #5
0
def read_file(fn, precursors):
    """
    Read srnabench file and perform realignment of hits.

    Args:
        *fn(str)*: tab separated file. Column 0 is used both as read name
            and read sequence, column 1 as the read count, column 3 must
            contain ``mature`` and column 4 holds ``$`` separated hits.

        *precursors*: dict with precursor names as keys and sequences
            as values.

    Returns:
        *reads*: defaultdict where keys are read names and values are
            ``hits`` objects.
    """
    reads = defaultdict(hits)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and query_sequence is None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            if cols[3].find("mature") == -1:
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(query_sequence)
                # The count is already a number in this column, so it is
                # stored directly instead of being passed to the read-name
                # parser used by the other importers.
                reads[query_name].counts = int(cols[1])

            for hit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % hit)
                hit_info = hit.split("#")
                # Fourth '#' field is "<precursor>,<1-based start>,...".
                pos_info = hit_info[3].split(",")
                reference_start = int(pos_info[1]) - 1
                chrom = pos_info[0]
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {reference_start}\n"
                             "  hit: {hit}".format(**locals()))
                iso = isomir()
                iso.align = line
                iso.set_pos(reference_start, len(reads[query_name].sequence))
                logger.debug("SRNABENCH:: start %s end %s" %
                             (iso.start, iso.end))
                # Skip hits that run past the end of the precursor.
                if len(precursors[chrom]) < reference_start + len(
                        reads[query_name].sequence):
                    continue
                iso.subs, iso.add, iso.cigar = filter.tune(
                    reads[query_name].sequence, precursors[chrom],
                    reference_start, None)
                logger.debug("SRNABENCH::After tune start %s end %s" %
                             (iso.start, iso.end))
                # Keep only hits with fewer than three substitutions.
                if len(iso.subs) < 3:
                    reads[query_name].set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    return reads
Exemple #6
0
def _analyze_line(line, reads, precursors, handle, args):
    """Register one alignment record as a hit on its precursor.

    Args:
        line: alignment record exposing ``reference_id``, ``cigarstring``,
            ``query_name``, ``query_sequence``, ``is_reverse``,
            ``reference_start`` and ``cigartuples``.
        reads: mapping of read name to hit objects that creates entries on
            first access (e.g. ``defaultdict(hits)``).
        precursors: mapping of reference name to its sequence.
        handle: alignment file object; only ``getrname`` is used.
        args: options object; only ``genomic`` is read.

    Returns:
        The *reads* mapping that was passed in, updated in place when the
        record is accepted.
    """
    if line.reference_id < 0:
        logger.debug("READ::Sequence not mapped: %s" % line.reference_id)
        return reads
    if not line.cigarstring:
        logger.debug("READ::Sequence malformed: %s" % line)
        return reads

    query_name = line.query_name
    first_time = query_name not in reads
    if first_time and not line.query_sequence:
        return reads

    # Reverse-strand records are stored as their reverse complement.
    if line.is_reverse:
        sequence = reverse_complement(line.query_sequence)
    else:
        sequence = line.query_sequence
    logger.debug("READ::Read name:{0} and Read sequence:{1}".format(
        line.query_name, sequence))

    if line.query_sequence and "N" in line.query_sequence:
        return reads

    hit = reads[query_name]
    if first_time:
        hit.set_sequence(sequence)
        hit.counts = _get_freq(query_name)
        # TODO if args.quant set to 0
    # TODO if args.quant increase by 1

    # Reverse-strand hits are only accepted in genomic mode.
    if line.is_reverse and not args.genomic:
        logger.debug("READ::Sequence is reverse: %s" % line.query_name)
        return reads

    chrom = handle.getrname(line.reference_id)
    start = line.reference_start
    cigar = line.cigartuples

    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(hit.sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))

    # Length of the precursor without N characters, plus three extra
    # positions of tolerance.
    usable_len = len(precursors[chrom].replace("N", "")) + 3
    if start + len(hit.sequence) > usable_len:
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" %
                     (line.reference_id, len(hit.sequence),
                      len(precursors[chrom])))
        return reads

    tuned = filter.tune(hit.sequence, precursors[chrom], start, cigar)
    iso.subs, iso.add, iso.cigar = tuned
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))
    hit.set_precursor(chrom, iso)
    return reads
Exemple #7
0
def read_bam(bam_fn, precursors, clean=True):
    """
    Read bam file and perform realignment of hits.

    Args:
        *bam_fn*: SAM/BAM file name; it is passed through ``_sam_to_bam``
            and ``_bam_sort`` before being opened.

        *precursors*: dict with precursor names as keys and sequences
            as values.

        *clean*: when true the result is passed through
            ``filter.clean_hits`` before being returned.

    Returns:
        *reads*: mapping where keys are read names and values are ``hits``
            objects.
    """
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    try:
        for line in handle:
            if line.reference_id < 0:
                logger.debug("Sequence not mapped: %s" % line.reference_id)
                continue
            query_name = line.query_name
            # A record without sequence cannot initialise a new read.
            if query_name not in reads and line.query_sequence is None:
                continue
            if line.query_sequence and line.query_sequence.find("N") > -1:
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(line.query_sequence)
                reads[query_name].counts = _get_freq(query_name)
            # Reverse-strand alignments never produce a precursor hit.
            if line.is_reverse:
                logger.debug("Sequence is reverse: %s" % line.query_name)
                continue
            chrom = handle.getrname(line.reference_id)
            cigar = line.cigartuples
            iso = isomir()
            iso.align = line
            iso.set_pos(line.reference_start, len(reads[query_name].sequence))
            logger.debug("READ::From BAM start %s end %s" %
                         (iso.start, iso.end))
            # Skip alignments that run past the end of the precursor.
            if len(precursors[chrom]) < line.reference_start + len(
                    reads[query_name].sequence):
                continue
            iso.subs, iso.add, iso.cigar = filter.tune(
                reads[query_name].sequence, precursors[chrom],
                line.reference_start, cigar)
            logger.debug("READ::After tune start %s end %s" %
                         (iso.start, iso.end))
            # Only hits with fewer than two substitutions are kept.
            if len(iso.subs) < 2:
                reads[query_name].set_precursor(chrom, iso)
    finally:
        # Release the file handle even if a record fails to parse.
        handle.close()
    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
Exemple #8
0
def read_file(fn, precursors):
    """
    Read a seqbuster file and perform realignment of hits.

    Args:
        *fn(str)*: tab separated file; its first line is skipped.

        *precursors*: dict with precursor names as keys and sequences
            as values.

    Returns:
        *reads*: mapping where keys are read names and values are ``hits``
            objects.
    """
    reads = defaultdict(hits)
    with open(fn) as handle:
        next(handle, None)  # drop the first line of the file
        for line in handle:
            fields = line.strip().split("\t")
            query_sequence, query_name = fields[0], fields[1]
            reference_start = int(fields[4]) - 1  # positions are 1-based
            seqbuster_iso = ":".join(fields[6:10])
            if query_sequence is None and query_name not in reads:
                continue
            # Reads with ambiguous bases are ignored.
            if query_sequence and "N" in query_sequence:
                continue
            unseen = query_name not in reads
            record = reads[query_name]
            if unseen:
                record.set_sequence(query_sequence)
                record.counts = _get_freq(query_name)
            chrom = fields[13]
            logger.debug(
                "\nSEQBUSTER::NEW::query: {query_sequence}\n"
                "  precursor {chrom}\n"
                "  name:  {query_name}\n"
                "  start: {reference_start}\n"
                "  iso: {seqbuster_iso}".format(
                    query_sequence=query_sequence,
                    chrom=chrom,
                    query_name=query_name,
                    reference_start=reference_start,
                    seqbuster_iso=seqbuster_iso))
            seq_len = len(record.sequence)
            iso = isomir()
            iso.align = line
            iso.set_pos(reference_start, seq_len)
            logger.debug("SEQBUSTER:: start %s end %s" % (iso.start, iso.end))
            template = precursors[chrom]
            # The read has to fit inside the precursor.
            if reference_start + seq_len > len(template):
                continue
            tuned = filter.tune(record.sequence, template, reference_start,
                                None)
            iso.subs, iso.add, iso.cigar = tuned
            logger.debug("SEQBUSTER::After tune start %s end %s" %
                         (iso.start, iso.end))
            # Keep only hits with fewer than two substitutions.
            if len(iso.subs) < 2:
                record.set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    return reads
Exemple #9
0
def create_iso(name, mir, seq, numsim, exp):
    """Simulate isomiR reads for one precursor and return them as GFF.

    Args:
        name: precursor name; key of *mir* and precursor label of the hits.
        mir: nested mapping ``mir[name][mirna]`` giving the information of
            each miRNA that is handed to ``variation``.
        seq: precursor sequence.
        numsim: number of simulation attempts per miRNA (cast to ``int``).
            Duplicated sequences are skipped, so fewer reads may be created.
        exp: when true, a random count is drawn for each read from a
            negative binomial; otherwise every read has count 1.

    Returns:
        The value of ``body.create(reads, "miRBase21", "sim1")``.

    Note:
        FASTQ output is written to ``full_fq`` and ``clean_fq``; both names
        are not defined in this function and must exist in the enclosing
        scope.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # expression
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            randSeq, iso.start, iso.t5, iso.t3, iso.subs, iso.add = variation(
                info, seq)
            # Every simulated sequence is emitted only once.
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            reads[query_name] = realign.hits()
            reads[query_name].set_sequence(randSeq)
            reads[query_name].counts = e
            reads[query_name].set_precursor(name, iso)
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
Exemple #10
0
def create_iso(name, mir, seq, numsim, exp):
    """Simulate isomiR reads for one precursor and return them as GFF.

    Args:
        name: precursor name; key of *mir* and precursor label of the hits.
        mir: nested mapping ``mir[name][mirna]`` giving the information of
            each miRNA that is handed to ``variation``.
        seq: precursor sequence.
        numsim: number of simulation attempts per miRNA (cast to ``int``).
            Duplicated sequences are skipped, so fewer reads may be created.
        exp: when true, a random count is drawn for each read from a
            negative binomial; otherwise every read has count 1.

    Returns:
        The value of ``body.create(reads, "miRBase21", "sim1")``.

    Note:
        FASTQ output is written to ``full_fq`` and ``clean_fq``; both names
        are not defined in this function and must exist in the enclosing
        scope.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # expression
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            randSeq, iso.start, iso.t5, iso.t3, iso.subs, iso.add = variation(
                info, seq)
            # Every simulated sequence is emitted only once.
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            reads[query_name] = realign.hits()
            reads[query_name].set_sequence(randSeq)
            reads[query_name].counts = e
            reads[query_name].set_precursor(name, iso)
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
Exemple #11
0
def _read_line(line, col_fix, precursors):
    """Parse one seqbuster line into a single-read ``reads`` mapping.

    Args:
        line: tab separated text line. Column 0 is the read sequence and
            column 1 the read name.
        col_fix: offset subtracted from the column indexes of the start
            position (4), the isomiR fields (6 to 9) and the precursor
            name (13).
        precursors: dict with precursor names as keys and sequences
            as values.

    Returns:
        A ``defaultdict(hits)``. It is empty when the read contains ``N``;
        otherwise it holds the read, with a precursor hit attached only if
        the read fits in the precursor and has fewer than six
        substitutions.
    """
    reads = defaultdict(hits)
    cols = line.strip().split("\t")
    query_name = cols[1]
    query_sequence = cols[0]
    reference_start = int(cols[4 - col_fix]) - 1
    seqbuster_iso = ":".join(cols[6 - col_fix:10 - col_fix])
    if query_sequence and query_sequence.find("N") > -1:
        return reads
    # ``reads`` was created just above, so the read is always new here.
    reads[query_name].set_sequence(query_sequence)
    reads[query_name].counts = _get_freq(query_name)
    chrom = cols[13 - col_fix]
    logger.debug("\nSEQBUSTER::NEW::query: {query_sequence}\n"
                 "  precursor: {chrom}\n"
                 "  name:  {query_name}\n"
                 "  start: {reference_start}\n"
                 "  iso: {seqbuster_iso}".format(**locals()))
    iso = isomir()
    iso.align = line
    iso.set_pos(reference_start, len(reads[query_name].sequence))
    logger.debug("\nSEQBUSTER:: start %s end %s" % (iso.start, iso.end))
    if len(precursors[chrom]) < reference_start + len(
            reads[query_name].sequence):
        # The message needs a placeholder: without it the % operator
        # raises TypeError instead of logging the precursor length.
        logger.debug("\nSEQBUSTER::len precursor %s" % len(precursors[chrom]))
        return reads
    iso.subs, iso.add, iso.cigar = filter.tune(reads[query_name].sequence,
                                               precursors[chrom],
                                               reference_start, None)
    logger.debug("\nSEQBUSTER::After tune start %s end %s" %
                 (iso.start, iso.end))
    if len(iso.subs) < 6:
        logger.debug("\nSEQBUSTER::iso.subs %s - length %s" %
                     (iso.subs, len(iso.subs)))
        reads[query_name].set_precursor(chrom, iso)
    return reads
Exemple #12
0
def read_file(fn, hairpins, database, mirna_gtf):
    """
    Read PROST! file and collect the hits of every read on its precursors.

    Args:
        *fn(str)*: file name with PROST output information. The first
            line is skipped; the remaining lines are tab separated.

        *hairpins*: dict with precursor names as keys and sequences
            as values.

        *database(str)*: database name. Not used by this function.

        *mirna_gtf*: annotation passed to ``mapper.read_gtf_to_mirna`` and
            ``mapper.read_gtf_to_precursor``.

    Returns:
        *reads*: dictionary where keys are read_id and values are *mirtop.realign.hits*

    """
    reads = defaultdict(hits)
    # NOTE(review): the result of this call is never used in this
    # function; the call is kept in case it has side effects.
    genomics = mapper.read_gtf_to_mirna(mirna_gtf)
    matures = mapper.read_gtf_to_precursor(mirna_gtf)
    non_mirna = 0
    non_chromosome_mirna = 0
    outside_mirna = 0
    lines_read = 0
    ann, ann_type = _group_seqs_by_ann(fn)
    with open(fn) as handle:
        handle.readline()
        for line in handle:
            lines_read += 1
            cols = line.strip().split("\t")
            # The first column is used both as read name and sequence.
            query_name = cols[0]
            query_sequence = cols[0]
            if not ann[query_sequence]:
                non_mirna += 1
                continue
            miRNA = ann_type[ann[query_sequence]][1]
            preNames = ann_type[ann[query_sequence]][0]
            if query_name not in reads and query_sequence is None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            reads[query_name].set_sequence(query_sequence)
            # NOTE(review): the count is stored as the raw text of
            # column 9 - confirm whether callers expect a number.
            reads[query_name].counts = cols[9]
            for preName in preNames.split(","):
                if preName in reads[query_name].precursors:
                    continue
                if preName not in hairpins:
                    non_chromosome_mirna += 1
                    continue
                reference_start = _align_to_mature(
                    query_sequence, hairpins[preName],
                    matures[preName][miRNA])
                logger.debug("\nPROST!::NEW::query: {query_sequence}\n"
                             "  precursor {preName}\n"
                             "  name:  {query_name}\n"
                             "  reference_start: {reference_start}\n"
                             "  mirna: {miRNA}".format(**locals()))
                iso = isomir()
                iso.align = line
                iso.set_pos(reference_start, len(reads[query_name].sequence))
                logger.debug("PROST!:: start %s end %s" % (iso.start, iso.end))
                if len(hairpins[preName]) < reference_start + len(
                        reads[query_name].sequence):
                    # Count the skip so the summary below reports it.
                    outside_mirna += 1
                    continue
                iso.subs, iso.add, iso.cigar = filter.tune(
                    reads[query_name].sequence,
                    hairpins[preName],
                    reference_start, None)
                logger.debug("PROST!::After tune start %s end %s" % (
                    iso.start, iso.end))
                # Keep only hits with fewer than two substitutions.
                if len(iso.subs) < 2:
                    reads[query_name].set_precursor(preName, iso)
    logger.info("Lines loaded: %s" % lines_read)
    logger.info("Skipped lines because non miRNA in line: %s" % non_mirna)
    logger.info("Skipped lines because non chromosome in GTF:"
                " %s" % non_chromosome_mirna)
    logger.info("Skipped lines because outside precursor: %s" % outside_mirna)
    logger.info("Hits: %s" % len(reads))
    return reads
Exemple #13
0
def read_file(folder,  precursors):
    """
    Read srnabench output and perform realignment of hits.

    Args:
        *folder(str)*: directory containing ``reads.annotation`` and
            ``microRNAannotation.txt``.

        *precursors*: dict with precursor names as keys and sequences
            as values.

    Returns:
        *reads*: defaultdict where keys are read names and values are
            ``hits`` objects.
    """
    n_out = 0
    n_nonmature = 0
    n_ns = 0
    n_in = 0
    n_non_precursor = 0
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    reads = defaultdict(hits)
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            # The first column is used both as read name and sequence.
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and query_sequence is None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                n_nonmature += 1
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(query_sequence)
                reads[query_name].counts = int(cols[1])

            for hit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % hit)
                hit_info = hit.split("#")
                # Fourth '#' field is "<precursor>,<1-based start>,...".
                pos_info = hit_info[3].split(",")
                reference_start = int(pos_info[1]) - 1
                chrom = pos_info[0]
                iso = isomir()
                iso.align = line
                if (query_sequence, hit_info[1]) in source_iso:
                    iso.external = source_iso[(query_sequence, hit_info[1])]
                external = iso.external
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {reference_start}\n"
                             "  external: {external}\n"
                             "  hit: {hit}".format(**locals()))
                iso.set_pos(reference_start, len(reads[query_name].sequence))
                logger.debug("SRNABENCH:: start %s end %s" %
                             (iso.start, iso.end))
                # Hits running past the end of the precursor are dropped.
                if len(precursors[chrom]) < reference_start + len(
                        reads[query_name].sequence):
                    n_out += 1
                    continue
                iso.subs, iso.add, iso.cigar = filter.tune(
                    reads[query_name].sequence, precursors[chrom],
                    reference_start, None)
                logger.debug("SRNABENCH::After tune start %s end %s" %
                             (iso.start, iso.end))
                n_in += 1
                reads[query_name].set_precursor(chrom, iso)
            if len(reads[query_name].precursors) == 0:
                n_non_precursor += 1
    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_non_precursor)
    # n_out counts hits that extend beyond the precursor, not hits with
    # too many changes, so the message says exactly that.
    logger.info("Hit Filtered by extending beyond the precursor: %s" % n_out)
    logger.info("Hit Filtered by being non-mature: %s" % n_nonmature)
    logger.info("Reads Filtered by containing N: %s" % n_ns)
    return reads
Exemple #14
0
def _analyze_line(line, precursors, database, sample, sep, args):
    """Convert one intersected alignment record into a GFF-ready entry.

    Args:
        line: indexable record. Items 1 and 2 are read as start and end,
            3 as read name, 4 as sequence, 5 as strand, 6 as counts,
            10 and 11 as the coordinates the start is made relative to,
            and 15 as the attribute text holding ``Name=<precursor>``.
        precursors: mapping of precursor name to its sequence.
        database: not used by this function.
        sample: sample name passed to ``body.create`` and returned.
        sep: separator passed to ``paste_columns``.
        args: options object; ``matures``, ``precursors``, ``database``
            and ``add_extra`` are read from it.

    Returns:
        ``None`` when the record is discarded, otherwise a dict with the
        keys ``chrom``, ``start``, ``name``, ``mirna`` and ``line``.
    """
    start_col, end_col, attr_col = 10, 11, 15
    query_name = line[3]
    sequence = line[4]

    # only working with mirbase
    line_text = str(line)
    primary_tag = get_primary_transcript(guess_database(args))
    if primary_tag not in line_text:
        return None

    logger.debug("READ::line name:{0}".format(line))
    if sequence and "N" in sequence:
        return None

    chrom = line[attr_col].strip().split("Name=")[-1]
    start, end, strand = line[1], line[2], line[5]
    counts = float(line[6])
    if not start:
        return None

    # Make the start relative to the feature coordinates.
    if strand == "+":
        start = int(start) - int(line[start_col]) + 1
    else:
        start = int(line[end_col]) - int(end)

    seq_len = len(sequence)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, seq_len)
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))

    precursor = precursors[chrom]
    # NOTE(review): an overhanging read is only logged here; processing
    # continues afterwards - confirm this fall-through is intended.
    if start + seq_len > len(precursor):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" %
                     (chrom, seq_len, len(precursor)))

    tuned = filter.tune(sequence, precursor, start, None)
    iso.subs, iso.add, iso.cigar = tuned
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(sequence)
    hit = hits()
    hit.set_sequence(sequence)
    hit.counts = counts
    hit.sequence = sequence
    hit.set_precursor(chrom, iso)
    reads = {query_name: hit}
    reads = annotate(reads, args.matures, args.precursors, quiet=True)

    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors, args.matures)
        line = "%s Changes %s;" % (line, extra)

    line = paste_columns(feature(line), sep=sep)
    return {
        'chrom': chrom,
        'start': start,
        'name': query_name,
        'mirna': reads[query_name].precursors[chrom].mirna,
        'line': [idu, chrom, counts, sample, line],
    }