Example No. 1
def read_file(fn, args):
    """
    Read OptimiR file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with OptimiR output information.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    sample = read_samples(fn)
    reads = defaultdict(dict)
    logger.debug("OPTIMIR::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            gff = feature(line)
            fixed_line = line
            if gff.columns:
                if "Variant" not in gff.attributes:
                    gff.attributes["Variant"] = "NA"

                logger.debug("OPTIMIR::Chrom update from %s to %s" %
                             (gff.columns["chrom"], gff.attributes["Parent"]))
                gff.columns["chrom"] = gff.attributes["Parent"].split(",")[0]
                fixed_line = gff.paste_columns(sep=sep)
                if args.add_extra:
                    extra = variant_with_nt(fixed_line, args.precursors,
                                            args.matures)
                    fixed_line = "%s Changes %s;" % (fixed_line, extra)

                fixed_line = paste_columns(feature(fixed_line), sep=sep)
                counts = gff.attributes["Expression"].split(",")
                chrom = gff.columns["chrom"]
                start = gff.columns["start"]
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                reads[chrom][start].append([
                    gff.attributes["UID"], gff.columns["chrom"], counts,
                    sample, fixed_line
                ])
    return reads
Example No. 2
def read(fn, args):
    """Read GTF/GFF file and load into annotate, chrom counts, sample, line"""
    samples = read_samples(fn)
    lines = defaultdict(dict)
    sep = " " if args.out_format == "gtf" else "="
    corrupted_uid = 0
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            line = paste_columns(feature(line), sep=sep)
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            if 'UID' in attr and not read_id(attr['UID']):
                corrupted_uid += 1
                continue
            if 'UID' not in attr:
                msg = "UID not found."
                # Try to rebuild the UID from the Read or sequence attribute.
                if 'Read' in attr:
                    if not is_sequence(attr['Read']):
                        msg = msg + " Sequence not valid in Read attribute."
                    else:
                        attr['UID'] = make_id(attr['Read'])
                elif 'sequence' in attr:
                    if not is_sequence(attr['sequence']):
                        msg = msg + " Sequence not valid in sequence attribute."
                    else:
                        attr['UID'] = make_id(attr['sequence'])
                else:
                    msg = msg + " Sequence not found in Read or sequence attribute."
            if 'UID' not in attr:
                logger.warning("Line is not a valid GFF3 line: %s" %
                               line.strip())
                logger.warning(msg)
                continue

            if cols['start'] not in lines[cols['chrom']]:
                lines[cols['chrom']][cols['start']] = []
            uid = "%s-%s-%s" % (attr['UID'],
                                attr['Variant'],
                                attr['Name'])
            if args.keep_name:
                uid = "%s-%s" % (uid, attr['Read'])
            lines[cols['chrom']][cols['start']].append(
                [uid,
                 cols['chrom'],
                 attr['Expression'].strip().split(","),
                 samples,
                 line.strip()])
    logger.info("Lines skipped due to corrupted UID: %s" % corrupted_uid)
    return lines
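
The nested dict returned by read() above (and referenced by the parser-specific read_file() docstrings in these examples) is keyed by chromosome/precursor and then by start position. A minimal sketch of walking that structure, assuming each entry keeps the [uid, chrom, expression_counts, samples, gff_line] layout shown in the code:

def summarize_lines(lines):
    # lines[chrom][start] -> list of [uid, chrom, expression_counts, samples, line]
    for chrom in lines:
        for start in lines[chrom]:
            for uid, _, counts, _samples, _line in lines[chrom][start]:
                # counts holds one count string per sample (COLDATA order assumed)
                total = sum(int(c) for c in counts)
                print(chrom, start, uid, total)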
Example No. 3
def _convert_file(gff, args):
    sep = "\t"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    variant_header = sep.join(['mism', 'add', 't5', 't3'])

    gff_file = open(gff, 'r')
    out_file = os.path.join(args.out, "%s_rawData.tsv" % os.path.splitext(os.path.basename(gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:

        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['seq', 'mir',
                                   variant_header, samples])
                print(header, file=outh)
                break

        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                Read = read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if parent not in precursors:
                missing_parent += 1
                continue
            if mirna not in matures[parent]:
                missing_mirna += 1
                continue
            extra = variant_with_nt(mirna_line, precursors, matures)
            if extra == "Invalid":
                continue
            logger.debug("COUNTS::EXTRA:%s" % extra)
            cols_variants = sep.join(_expand(extra, True))
            summary = sep.join([Read,  mirna,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
Example No. 4
 def test_alignment(self):
     """testing alignments function"""
     from mirtop.bam import bam
     from mirtop.gff.classgff import feature
     fns = {
         "let7-last1D.sam": {
             56: "iso_add3p:1,iso_snv"
         },
         "let7-1D.sam": {
             5: "iso_snv,iso_3p:-5"
         },
         "let7-last7M1I.sam": {
             5: "iso_add3p:1,iso_snv_seed"
         },
         "let7-middle1D.sam": {
             5: "iso_snv_central_supp,iso_3p:-2"
         },
         "let7-perfect.sam": {
             5: "NA"
         },
         "let7-triming.sam": {
             5: "iso_3p:+2",
             4: "iso_5p:-1",
             6: "iso_5p:+1,iso_3p:-3"
         }
     }
     #import pdb; pdb.set_trace()
     for fn in fns:
         gff = annotate("data/aligments/%s" % fn, bam.read_bam)
         for pos in gff['hsa-let-7a-1']:
             f = feature(gff['hsa-let-7a-1'][pos][0][4])
             if not set(f.attributes['Variant'].split(",")) == set(
                     fns[fn][pos].split(",")):
                 raise ValueError("Error in %s" % fn)
Example No. 5
 def test_class(self):
     """Test class to read GFF line"""
     from mirtop.gff.classgff import feature
     gff = feature("hsa-let-7a-5p\tmiRBasev21\tisomiR\t4\t25\t0\t+\t.\t"
                   "Read hsa-let-7a-1_hsa-let-7a-5p_5:26_-1:-1_mut:"
                   "null_add:null_x861; UID bhJJ5WJL2;"
                   " Name hsa-let-7a-5p; Parent hsa-let-7a-1;"
                   " Variant iso_5p:+1,iso_3p:-1; Cigar 22M;"
                   " Expression 861; Filter Pass; Hits 1;")
     print(gff.columns)
     print(gff.attributes)
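
For orientation, a sketch of what the parsed object exposes; the key names are taken from the accessors used in the other examples here, and the value types are an assumption:

# gff.columns holds the leading GFF3 fields, e.g.
#   gff.columns["chrom"]  -> "hsa-let-7a-5p"
#   gff.columns["start"], gff.columns["end"], gff.columns["strand"], ...
# gff.attributes holds the key/value pairs from the attribute column, e.g.
#   gff.attributes["UID"]     -> "bhJJ5WJL2"
#   gff.attributes["Variant"] -> "iso_5p:+1,iso_3p:-1"
#   gff.attributes["Expression"], gff.attributes["Filter"], gff.attributes["Hits"], ...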
Example No. 6
def _process(fn, out_dir):
    if out_dir:
        out_fasta = os.path.join(
            out_dir, "%s.fasta" % os.path.splitext(os.path.basename(fn))[0])
    outh = sys.stdout if not out_dir else open(out_fasta, 'w')
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            attr = gff.attributes
            read = read_id(attr["UID"])
            print((">{0}\n{1}").format(attr["UID"], read), file=outh)
Example No. 7
def _read_file(fn, precursors, matures, out_dir):
    samples = read_samples(fn)
    for sample in samples:
        with open(os.path.join(out_dir, "%s.mirna" % sample), 'w') as outh:
            print("\t".join([
                "seq", "name", "freq", "mir", "start", "end", "mism", "add",
                "t5", "t3", "s5", "s3", "DB", "precursor", "ambiguity"
            ]),
                  file=outh)
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            read = read_id(attr["UID"])
            t5 = variant_to_5p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            t3 = variant_to_3p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            add = variant_to_add(read, attr["Variant"])
            mature_sequence = get_mature_sequence(
                precursors[attr["Parent"]],
                matures[attr["Parent"]][attr["Name"]])
            mm = align_from_variants(read, mature_sequence, attr["Variant"])
            if len(mm) > 1:
                continue
            elif len(mm) == 1:
                mm = "".join(list(map(str, mm[0])))
            else:
                mm = "0"
            hit = attr["Hits"] if "Hits" in attr else "1"
            logger.debug("exporter::isomir::decode %s" %
                         [attr["Variant"], t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            # print(cols)
            line = [
                read, attr["Read"], "0", attr["Name"], cols['source'],
                cols['type'], mm, add, t5, t3, "NA", "NA", "miRNA",
                attr["Parent"], hit
            ]
            for sample, counts in zip(samples, attr["Expression"].split(",")):
                with open(os.path.join(out_dir, "%s.mirna" % sample),
                          'a') as outh:
                    line[2] = counts
                    print("\t".join(line), file=outh)
Example No. 8
def lift_to_genome(line, mapper):
    """
    Function to get a class of type feature from classgff.py
      and map the precursors coordinates to the genomic coordinates
    
    Args:
        *line(str)*: string GFF line.
        *mapper(dict)*: dict with mirna-precursor-genomic coordinas from
          mirna.mapper.read_gtf_to_mirna function.

    Returns:
        *(line)*: string with GFF line with updated chr, star, end, strand
 
    """
    features = feature(line)
    chr, start, end, strand, id = mapper[features.attributes["Name"]][features.attributes["Parent"]]
    logger.debug("LIFT2GENOME:: %s of %s found in %s(%s) " % (features.attributes["Name"],
                                                            features.attributes["Parent"],
                                                            chr, strand))
    nstart = start
    nend = end
    variants = read_variant(features.attributes["Variant"])
    logger.debug("LIFT2GENOME:: variants %s " % (features.attributes["Variant"]))
    if 'iso_5p' in variants:
        if strand == "+":
            nstart = start + variants['iso_5p']
        else:
            nend = end - variants['iso_5p']
    if 'iso_3p' in variants:
        if strand == "+":
            nend = end + variants['iso_3p']
        else:
            nstart = start - variants['iso_3p']
    if 'iso_add3p' in variants:
        if strand == "+":
            nend = nend + variants['iso_add3p']
        else:
            nstart = nstart - variants['iso_add3p']
    logger.debug("LIFT2GENOME:: start %s to %s |  end %s to %s " % (start, nstart, end, nend))
    features.columns['chrom'] = chr
    features.columns['start'] = str(nstart)
    features.columns['end'] = str(nend)
    features.columns['strand'] = strand
    
    return features.paste_columns()
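
A minimal usage sketch, assuming it runs in the module where feature and read_variant are available; the mapper dict is hand-built with made-up coordinates in the shape this function indexes (mapper[Name][Parent] = (chrom, start, end, strand, id)), which is what the docstring attributes to mirna.mapper.read_gtf_to_mirna:

# Hypothetical mapper with made-up genomic coordinates, for illustration only.
toy_mapper = {
    "hsa-let-7a-5p": {
        "hsa-let-7a-1": ("chr9", 1000, 1021, "+", "MIMAT0000062")
    }
}
gff_line = ("hsa-let-7a-1\tmiRBasev21\tisomiR\t5\t26\t0\t+\t.\t"
            "Read TGAGGTAGTAGGTTGTATAGTT; UID bhJJ5WJL2; Name hsa-let-7a-5p;"
            " Parent hsa-let-7a-1; Variant iso_5p:+1,iso_3p:-1; Cigar 22M;"
            " Expression 861; Filter Pass; Hits 1;")
lifted = lift_to_genome(gff_line, toy_mapper)
# lifted is the same GFF line with chrom/strand replaced by the genomic values
# and start/end shifted by the iso_5p/iso_3p/iso_add3p offsets.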
Example No. 9
def create_line(read, name, database, args):
    sep = " " if args.out_format == "gtf" else "="

    if args.add_extra:
        precursors = args.precursors
        matures = args.matures

    for (ps, iso) in read.precursors.items():
        p = list(ps)[0]
        if not iso.mirna:
            continue
        chrom = p
        seq = read.sequence
        seq_name = seq if not args.keep_name else name
        if iso.get_score(len(seq)) < 1:
            continue
        if iso.subs:
            iso.subs = [] if "N" in iso.subs[0] else iso.subs
        idseq = read.idseq
        source = "ref_miRNA" if not iso.is_iso() else "isomiR"
        strand = iso.strand
        start, end = iso.start, iso.end
        score = iso.map_score
        mirName = iso.mirna
        preName = p
        Variant = iso.formatGFF()
        Cigar = iso.cigar
        counts = read.counts
        Filter = iso.filter
        annotation = "%s.%s.%s" % (chrom, idseq, seq_name)
        # This gets correctly formatted with paste_columns below
        attrb = ("Read {seq_name};UID {idseq};Name {mirName};"
                 "Parent {preName};"
                 "Variant {Variant};Cigar {Cigar};"
                 "Expression {counts};"
                 "Filter {Filter};").format(**locals())
        line = ("{chrom}\t{database}\t{source}\t{start}\t{end}"
                "\t{score}\t{strand}\t.\t{attrb}").format(**locals())
        logger.debug("GFF::%s" % line)
        if args.add_extra:
            extra = variant_with_nt(line, precursors, matures)
            line = "%s Changes %s;" % (line, extra)

        line = feature(line).paste_columns(sep)
        return line
Example No. 10
def read_reference(fn):
    """Read GFF into UID:Variant

    Args:
        *fn (str)*: GFF file.

    Returns:
        *srna (dict)*: dict mapping each UID to [simplified Variant, attributes],
            e.g. {'UID': ['iso_snp:-2,...', attr]}
    """
    srna = dict()
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            attr = gff.attributes
            srna[attr['UID']] = [_simplify(attr['Variant']), attr]
    return srna
Example No. 11
def to10to11(gff_line):
    gff_line = gff_line.replace("_snp", "_snv")
    gff_line = gff_line.replace("_add", "_add3p")
    features = feature(gff_line)
    if "iso_5p" in features.attributes["Variant"]:
        variants = features.attributes["Variant"].split(",")
        iso_5p = [v.split(":") for v in variants if v.startswith("iso_5p")]
        iso_5p = -1 * int(iso_5p[0][1])
        if iso_5p > 0:
            iso_5p = "+%s" % iso_5p
        variants = [
            "iso_5p:%s" % iso_5p if v.startswith("iso_5p") else v
            for v in variants
        ]
        features.attributes["Variant"] = ",".join(variants)
    features.attributes["UID"] = make_id(
        read_uid_10(features.attributes["UID"]))
    return features.paste_columns()
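
to10to11() above renames the GFF 1.0 terms (iso_snp, iso_add) to their 1.1 equivalents, flips the sign of the iso_5p offset and regenerates the UID. A standalone sketch of just the Variant-string conversion (plain Python, hypothetical helper name):

def convert_variant_10_to_11(variant):
    # Same renames as to10to11 above: _snp -> _snv, _add -> _add3p.
    variant = variant.replace("_snp", "_snv").replace("_add", "_add3p")
    out = []
    for v in variant.split(","):
        if v.startswith("iso_5p:"):
            # to10to11 above flips the sign of the 5p offset.
            value = -1 * int(v.split(":")[1])
            v = "iso_5p:%s" % ("+%s" % value if value > 0 else value)
        out.append(v)
    return ",".join(out)

# convert_variant_10_to_11("iso_5p:+2,iso_snp,iso_add:1")
#   -> "iso_5p:-2,iso_snv,iso_add3p:1"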
Example No. 12
def variant_with_nt(line, precursors, matures):
    """
    Return nucleotides changes for each variant type
    using Variant attribute, precursor sequences and
    mature position.
    """
    gff = feature(line)
    attr = gff.attributes
    read = read_id(attr["UID"])
    attr["Parent"] = attr["Parent"].split(",")[0]
    if attr["Parent"] not in matures:
        logger.warning("Parent miRNA not found in database %s" % attr["Parent"])
        return ""
    if attr["Name"] not in matures[attr["Parent"]]:
        logger.warning("miRNA not found in database %s" % attr["Name"])
        return ""

    logger.debug("GFF::BODY::precursors %s" % precursors[attr["Parent"]])
    logger.debug("GFF:BODY::mature %s" % matures[attr["Parent"]][attr["Name"]])

    t5 = variant_to_5p(precursors[attr["Parent"]],
                       matures[attr["Parent"]][attr["Name"]],
                       attr["Variant"])
    t3 = variant_to_3p(precursors[attr["Parent"]],
                       matures[attr["Parent"]][attr["Name"]],
                       attr["Variant"])
    add = variant_to_add(read,
                         attr["Variant"])
    mature_sequence = get_mature_sequence(
        precursors[attr["Parent"]],
        matures[attr["Parent"]][attr["Name"]],
        nt=8)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)
    mm = align_from_variants(read,
                             mature_sequence,
                             attr["Variant"])
    if mm == "Invalid":
        return mm
    if len(mm) > 0:
        mm = "".join(["".join([str(v) for v in m]) for m in mm])
    else:
        mm = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add3p:%s,iso_snv:%s" % (t5, t3, add, mm)
Example No. 13
def _compare_to_reference(fn, reference):
    same = 0
    diff = list()
    extra = list()
    miss = list()
    results = list()
    seen = 0
    seen_reference = set()
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            attr = gff.attributes
            if attr['UID'] in reference:
                mirna = "Y" if attr['Name'] == reference[
                    attr['UID']][1]['Name'] else attr['Name']
                accuracy = _accuracy(_simplify(attr['Variant']),
                                     reference[attr['UID']][0])
                results.append([attr['UID'], "D", mirna, accuracy])
                if _simplify(attr['Variant']) == reference[attr['UID']][0]:
                    same += 1
                else:
                    diff.append("%s | reference: %s" %
                                (line.strip(), reference[attr['UID']][1]))
                seen += 1
                seen_reference.add(attr['UID'])
            else:
                extra.append("%s | extra" % line.strip())
                results.append([
                    attr['UID'], "E", attr['Name'],
                    _accuracy(_simplify(attr['Variant']), "")
                ])
    for uid in reference:
        if uid not in seen_reference:
            results.append([uid, "M", "N", _accuracy("", reference[uid][0])])
            miss.append("| miss %s" % reference[uid][1])
    logger.info("Number of sequences found in reference: %s" % seen)
    logger.info("Number of sequences matches reference: %s" % same)
    logger.info("Number of sequences different than reference: %s" % len(diff))
    logger.info("Number of sequences extra sequences: %s" % len(extra))
    logger.info("Number of sequences missed sequences: %s" % len(miss))
    return results
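
Each entry appended to results above has the shape [UID, status, miRNA, accuracy], where status is "D" (found in the reference), "E" (extra) or "M" (missed). A short sketch for tallying them:

from collections import Counter

def tally_results(results):
    # results entries look like [UID, status, mirna, accuracy]
    status_counts = Counter(status for _, status, _, _ in results)
    return {"in_reference": status_counts["D"],
            "extra": status_counts["E"],
            "missed": status_counts["M"]}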
Example No. 14
def _calc_stats(fn):
    """
    Read files and parse into categories
    """
    samples = _get_samples(fn)
    lines = []
    seen = set()
    ok = re.compile('pass', re.IGNORECASE)
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            logger.debug("## STATS: attribute %s" % attr)
            if not ok.match(attr['Filter']):
                continue
            if "-".join([attr['UID'], attr['Variant'], attr['Name']]) in seen:
                continue
            seen.add("-".join([attr['UID'], attr['Variant'], attr['Name']]))
            lines.extend(_classify(cols['type'], attr, samples))
    df = _summary(lines)
    return df
Example No. 15
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            if query_name not in reads and query_sequence is None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = _genomic2transcript(map_mir[mirName], chrom,
                                                 start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            fields = {
                'seq_name': query_sequence,
                'idseq': idu,
                'name': mirName,
                'parent': preName,
                'variant': isoformat,
                'cigar': cigar,
                'counts': counts,
                'filter': Filter,
                'hits': hit,
                'chrom': chrom,
                'start': start,
                'end': end,
                'database': database,
                'source': source,
                'score': score,
                'strand': strand
            }
            # TODO: convert to genomic if args.out_genomic
            line = feature(fields).line
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(feature(line), sep=sep)
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
Example No. 16
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add3p', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([
            variant_header, 'iso_5p_nt', 'iso_3p_nt', 'iso_add3p_nt',
            'iso_snp_nt'
        ])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    gff_file = open(args.gff, 'r')
    out_file = op.join(args.out,
                       "%s.tsv" % op.splitext(op.basename(args.gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:

        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")
                                   [1].strip().split(","))
                header = sep.join([
                    'UID', 'Read', 'miRNA', 'Variant', variant_header, samples
                ])
                print(header, file=outh)
                break

        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join(
                [UID, Read, mirna, variant, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
Example No. 17
def _fix(line, expression):
    # Need to fix the Read attribute since it is not useful when there are
    # multiple samples in a line.
    gff = feature(line)
    attr = gff.attributes
    attr['Expression'] = expression
    return paste_columns(gff, guess_format(line))
Example No. 18
def create(reads, database, sample, args, quiet=False):
    """Read https://github.com/miRTop/mirtop/issues/9"""
    sep = " " if args.out_format == "gtf" else "="
    seen = set()
    lines = defaultdict(defaultdict)
    seen_ann = {}
    filter_precursor = 0
    filter_score = 0
    n_hits = 0
    n_reads = 0
    n_seen = 0
    if args.add_extra:
        precursors = args.precursors
        matures = args.matures
    for (r, read) in reads.items():
        hits = set(mature.mirna for mature in read.precursors.values()
                   if mature.mirna)
        hits = len(hits)
        if len(read.precursors) > 0:
            n_reads += 1
        for (ps, iso) in read.precursors.items():
            p = list(ps)[0]
            if not iso.mirna:
                filter_precursor += 1
                continue
            if (r, iso.mirna) not in seen:
                seen.add((r, iso.mirna))
                chrom = p
                seq = reads[r].sequence
                seq_name = seq if not args.keep_name else r
                if iso.get_score(len(seq)) < 1:
                    filter_score += 1
                    continue
                if iso.subs:
                    iso.subs = [] if "N" in iso.subs[0] else iso.subs
                idseq = reads[r].idseq
                source = "ref_miRNA" if not iso.is_iso() else "isomiR"
                strand = iso.strand
                start, end = iso.start, iso.end
                score = iso.map_score
                mirName = iso.mirna
                preName = p
                Variant = iso.formatGFF()
                Cigar = iso.cigar
                counts = read.counts
                Filter = iso.filter
                annotation = "%s.%s.%s" % (chrom, idseq, seq_name)
                # TODO: This needs to be moved to use the feature class
                # It needs a dict with all variables as keys
                fields = {'seq_name': seq_name, 'idseq': idseq,
                          'name': mirName, 'parent': preName,
                          'variant': Variant, 'cigar': Cigar,
                          'counts': counts, 'filter': Filter,
                          'hits': hits, 'chrom': chrom, 
                          'start': start, 'end': end,
                          'database': database, 'source': source,
                          'score': score, 'strand': strand}
                line = feature(fields).line
                logger.debug("GFF::%s" % line)
                if args.add_extra:
                    extra = variant_with_nt(line, precursors, matures)
                    line = "%s Changes %s;" % (line, extra)

                if annotation in seen_ann and seq.find("N") < 0 and (
                        seen_ann[annotation].split("\t")[0].find("N") < 0):
                    logger.warning(
                        "Same isomir %s from different sequence:"
                        " \n%s and \n%s" % (annotation, line,
                                            seen_ann[annotation]))
                seen_ann[annotation] = line
                logger.debug("GFF::external %s" % iso.external)
                if start not in lines[chrom]:
                    lines[chrom][start] = []
                lines[chrom][start].append([annotation, chrom,
                                            counts, sample, line])
                logger.debug("GFF::%s" % line)
                n_hits += 1
            else:
                n_seen += 1
    if not quiet:
        logger.info("GFF miRNAs: %s" % len(lines))
        logger.info("GFF hits %s by %s reads" % (n_hits, n_reads))
        logger.info("Filtered by being duplicated: %s" % n_seen)
        logger.info("Filtered by being outside miRNA positions:"
                    " %s" % filter_precursor)
        logger.info("Filtered by being low score: %s" % filter_score)
    return lines
Example No. 19
def read_file(folder, args):
    """
    Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *folder(str)*: folder name with sRNAbench output information.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                n_in += 1
                continue

            counts = int(cols[1])

            hits = len(
                set([mirna.split("#")[1] for mirna in cols[4].split("$")]))

            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    n_notindb += 1
                    continue
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {start}\n"
                             "  external: {isoformat}\n"
                             "  hit: {hits}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue

                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                # attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                #          " Parent {preName}; Variant {isoformat};"
                #          " Cigar {cigar}; Expression {counts};"
                #          " Filter {Filter}; Hits {hits};").format(**locals())
                # line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                #         "{score}\t{strand}\t.\t{attrb}").format(**locals())
                fields = {
                    'seq_name': query_sequence,
                    'idseq': idu,
                    'name': mirName,
                    'parent': preName,
                    'variant': isoformat,
                    'cigar': cigar,
                    'counts': counts,
                    'filter': Filter,
                    'hits': hits,
                    'chrom': chrom,
                    'start': start,
                    'end': end,
                    'database': database,
                    'source': source,
                    'score': score,
                    'strand': strand
                }
                # TODO: convert to genomic if args.out_genomic
                line = feature(fields).line
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(feature(line), sep=sep)
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append(
                        [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads
Example No. 20
def _analyze_line(line, precursors, database, sample, sep, args):
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    if str(line).find(get_primary_transcript(guess_database(args))) < 0: # only working with mirbase
        return None

    logger.debug(("READ::line name:{0}").format(line))
    if sequence and sequence.find("N") > -1:
        return None

    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    Filter = "Pass"
    reads = dict()
    if not start:
        return None
    if strand == "+":
        start = int(start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" % (iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                                             chrom,
                                             len(sequence),
                                             len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom],
        start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(sequence)
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors,
                                args.matures)
        line = "%s Changes %s;" % (line, extra)

    line = paste_columns(feature(line), sep=sep)
    return {'chrom': chrom,
            'start': start,
            'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, line]}