Exemple #1
0
def _convert_file(gff, args):
    """Export one GFF file as a tab-separated ``<name>_rawData.tsv`` table.

    The header row is written when the ``## COLDATA:`` line is found; every
    line after it is parsed as a feature and written as
    ``seq, mir, mism, add, t5, t3, <one column per sample>``.

    Lines are skipped (and counted) when the UID cannot be decoded, when the
    parent is not in the hairpin sequences, or when the miRNA is not listed
    for that parent. Lines whose variants are reported as ``"Invalid"`` by
    ``variant_with_nt`` are skipped without being counted.

    Args:
        gff: path to the input GFF file.
        args: parsed options; ``hairpin``, ``sps``, ``gtf`` and ``out`` are
            read here.
    """
    sep = "\t"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    variant_header = sep.join(['mism', 'add', 't5', 't3'])

    out_file = os.path.join(
        args.out,
        "%s_rawData.tsv" % os.path.splitext(os.path.basename(gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # Both handles live in one ``with`` so the input is closed even when a
    # line fails to parse half-way through the file.
    with open(gff, 'r') as gff_file, open(out_file, 'w') as outh:
        # Consume the header up to and including the sample declaration.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                print(sep.join(['seq', 'mir', variant_header, samples]),
                      file=outh)
                break

        # The file iterator is shared, so this loop resumes right after the
        # COLDATA line.
        for mirna_line in gff_file:
            attr = feature(mirna_line).attributes
            uid = attr["UID"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                # The exported sequence is the one encoded in the UID.
                read = read_id(uid)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            logger.debug("COUNTS::Read:%s", read)
            logger.debug("COUNTS::EXTRA:%s", variant)
            if parent not in precursors:
                missing_parent += 1
                continue
            if mirna not in matures[parent]:
                missing_mirna += 1
                continue
            extra = variant_with_nt(mirna_line, precursors, matures)
            if extra == "Invalid":
                continue
            logger.debug("COUNTS::EXTRA:%s", extra)
            cols_variants = sep.join(_expand(extra, True))
            summary = sep.join([read, mirna, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    logger.info("Missing Parents in hairpin file: %s", missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s", missing_mirna)
    logger.info("Non valid UID: %s", unvalid_uid)
    logger.info("Output file is at %s", out_file)
Exemple #2
0
def compare(args):
    """
    From a list of GFF files produce comparison with a reference set.

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_compare()*.
            First file will be considered the reference set.

    Returns:
        Nothing. Unless ``args.out`` is ``"tmp_mirtop"``, the comparison of
        every GFF file with the reference is written to
        ``<args.out>/summary.txt``.

    Raises:
        IOError: when one of the files to compare doesn't exist.
    """
    result = dict()
    reference = read_reference(args.files[0])
    for fn in args.files[1:]:
        if not os.path.exists(fn):
            raise IOError("%s doesn't exist" % fn)
        result[os.path.basename(fn)] = _compare_to_reference(fn, reference)
    if args.out == "tmp_mirtop":
        return
    fn_out = os.path.join(args.out, "summary.txt")
    with open(fn_out, 'w') as outh:
        for fn, lines in result.items():
            if not lines:
                # The header columns come from the first row; a file with
                # no rows has nothing to report (indexing it used to raise
                # IndexError).
                continue
            print("sample\tidu\tseq\ttag\tsame_mirna\t%s" %
                  "\t".join(lines[0][3].keys()),
                  file=outh)
            for line in lines:
                read = read_id(line[0])
                acc = "\t".join(line[3][v] for v in line[3])
                print("%s\t%s\t%s\t%s\t%s\t%s" %
                      (fn, line[0], read, line[1], line[2], acc),
                      file=outh)
Exemple #3
0
def variant_with_nt(line, precursors, matures):
    """
    Describe each variant type of a GFF line with its nucleotide changes.

    The Variant attribute is combined with the precursor sequence and the
    mature entry of the line's Parent/Name to produce the result.

    Returns the ``iso_5p/iso_3p/iso_add/iso_snp`` string, or whatever
    ``align_from_variants`` returned when it compares equal to "Invalid".
    """
    attr = read_gff_line(line)["attrb"]
    read = read_id(attr["UID"])

    precursor = precursors[attr["Parent"]]
    logger.debug("GFF::BODY::precursors %s" % precursor)
    mature = matures[attr["Parent"]][attr["Name"]]
    logger.debug("GFF:BODY::mature %s" % mature)
    variant = attr["Variant"]

    t5 = variant_to_5p(precursor, mature, variant)
    t3 = variant_to_3p(precursor, mature, variant)
    add = variant_to_add(read, variant)

    mature_sequence = get_mature_sequence(precursor, mature)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)

    mm = align_from_variants(read, mature_sequence, variant)
    if mm == "Invalid":
        return mm

    # Flatten every mismatch entry into one string; "0" means no mismatch.
    changes = "0"
    if len(mm) > 0:
        changes = "".join(str(item) for m in mm for item in m)
    return "iso_5p:%s,iso_3p:%s,iso_add:%s,iso_snp:%s" % (t5, t3, add, changes)
Exemple #4
0
def compare(args):
    """
    From a list of GFF files produce comparison with a reference set.

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_compare()*.
            First file will be considered the reference set.

    Returns:
        Nothing. The comparison of the GFF files with the reference is
        written to ``summary.txt`` inside ``args.out``, except when
        ``args.out`` equals ``"tmp_mirtop"``, in which case no file is
        written.

    Raises:
        IOError: if any of the non-reference files is missing.
    """
    result = dict()
    reference = read_reference(args.files[0])
    for fn in args.files[1:]:
        if not os.path.exists(fn):
            raise IOError("%s doesn't exist" % fn)
        result[os.path.basename(fn)] = _compare_to_reference(fn, reference)

    if args.out != "tmp_mirtop":
        fn_out = os.path.join(args.out, "summary.txt")
        with open(fn_out, 'w') as outh:
            for fn in result:
                rows = result[fn]
                # Header names are read from the first row, so a sample
                # without rows is skipped instead of failing on rows[0].
                if not rows:
                    continue
                print("sample\tidu\tseq\ttag\tsame_mirna\t%s" % "\t".join(rows[0][3].keys()), file=outh)
                for line in rows:
                    read = read_id(line[0])
                    acc = "\t".join([line[3][v] for v in line[3]])
                    print("%s\t%s\t%s\t%s\t%s\t%s" % (fn, line[0], read, line[1], line[2], acc), file=outh)
Exemple #5
0
def _process(fn, out_dir):
    """Write the reads of a GFF file as FASTA records.

    Every non-comment line yields a record whose header is the UID
    attribute and whose sequence is the value ``read_id`` decodes from it.

    Args:
        fn: path to the input GFF file.
        out_dir: output directory. When falsy the records are printed to
            standard output; otherwise they are written to
            ``<out_dir>/<basename of fn without extension>.fasta``.
    """
    if out_dir:
        out_fasta = os.path.join(
            out_dir, "%s.fasta" % os.path.splitext(os.path.basename(fn))[0])
        outh = open(out_fasta, 'w')
    else:
        outh = sys.stdout
    try:
        with open(fn) as inh:
            for line in inh:
                if line.startswith("#"):
                    continue
                attr = feature(line).attributes
                read = read_id(attr["UID"])
                print((">{0}\n{1}").format(attr["UID"], read), file=outh)
    finally:
        # Only close what was opened here; standard output is left alone.
        if out_dir:
            outh.close()
Exemple #6
0
def read(fn, args):
    """Read GTF/GFF file and load into annotate, chrom counts, sample, line

    Comment lines are ignored. Lines whose UID is present but cannot be
    decoded by ``read_id`` are skipped and counted. When the UID attribute
    is absent it is rebuilt with ``make_id`` from the ``Read`` attribute or,
    failing that, from the ``sequence`` attribute; lines for which no UID
    can be obtained are reported with a warning and skipped.

    Args:
        fn: path to the GTF/GFF file.
        args: parsed options; ``out_format`` selects the attribute
            separator and ``keep_name`` appends the Read attribute to the
            identifier.

    Returns:
        defaultdict: ``lines[chrom][start]`` is a list of
        ``[uid, chrom, expression values, samples, line]`` entries.
    """
    samples = read_samples(fn)
    lines = defaultdict(dict)
    sep = " " if args.out_format == "gtf" else "="
    corrupted_uid = 0
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            line = paste_columns(feature(line), sep=sep)
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            # Test membership first: indexing a missing UID would fail
            # before the recovery below gets a chance to run.
            if 'UID' in attr and attr['UID'] and not read_id(attr['UID']):
                corrupted_uid += 1
                continue
            if 'UID' not in attr:
                msg = "UID not found."
                # First choice: rebuild the UID from the Read attribute.
                if 'Read' not in attr:
                    msg = msg + " Read attribute not found."
                elif not is_sequence(attr['Read']):
                    msg = msg + " Sequence not valid in Read attribute."
                else:
                    attr['UID'] = make_id(attr['Read'])
                # Second choice: rebuild it from the sequence attribute.
                if 'UID' not in attr:
                    if 'sequence' not in attr:
                        msg = msg + " Sequence not found in sequence attribute."
                    elif not is_sequence(attr['sequence']):
                        msg = msg + " Sequence not valid in sequence attribute."
                    else:
                        attr['UID'] = make_id(attr['sequence'])
                if 'UID' not in attr:
                    logger.warning("Line is not a valid GFF3 line: %s" %
                                   line.strip())
                    logger.warning(msg)
                    # Without a UID no identifier can be built for the line.
                    continue

            uid = "%s-%s-%s" % (attr['UID'],
                                attr['Variant'],
                                attr['Name'])
            if args.keep_name:
                uid = "%s-%s" % (uid, attr['Read'])
            lines[cols['chrom']].setdefault(cols['start'], []).append(
                [uid,
                 cols['chrom'],
                 attr['Expression'].strip().split(","),
                 samples,
                 line.strip()])
    logger.info("Lines skipped due to corrupted UID: %s" % corrupted_uid)
    return lines
Exemple #7
0
def _read_file(fn, precursors, matures, out_dir):
    """Split a GFF file into one ``<sample>.mirna`` table per sample.

    Each sample file is first (re)created with the header row, then every
    non-comment GFF line is appended to each sample file with that sample's
    count in the ``freq`` column. Lines for which ``align_from_variants``
    returns more than one element are left out.
    """
    samples = read_samples(fn)
    header = "\t".join([
        "seq", "name", "freq", "mir", "start", "end", "mism", "add",
        "t5", "t3", "s5", "s3", "DB", "precursor", "ambiguity"
    ])
    for sample in samples:
        with open(os.path.join(out_dir, "%s.mirna" % sample), 'w') as outh:
            print(header, file=outh)

    with open(fn) as inh:
        for record in inh:
            if record.startswith("#"):
                continue
            gff = feature(record)
            cols = gff.columns
            attr = gff.attributes
            read = read_id(attr["UID"])

            precursor = precursors[attr["Parent"]]
            mature = matures[attr["Parent"]][attr["Name"]]
            variant = attr["Variant"]
            t5 = variant_to_5p(precursor, mature, variant)
            t3 = variant_to_3p(precursor, mature, variant)
            add = variant_to_add(read, variant)
            mature_sequence = get_mature_sequence(precursor, mature)

            mm = align_from_variants(read, mature_sequence, variant)
            n_changes = len(mm)
            if n_changes > 1:
                continue
            if n_changes == 1:
                mm = "".join(str(v) for v in mm[0])
            else:
                mm = "0"

            hit = "1"
            if "Hits" in attr:
                hit = attr["Hits"]
            logger.debug("exporter::isomir::decode %s" %
                         [variant, t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            before_freq = [read, attr["Read"]]
            after_freq = [
                attr["Name"], cols['source'], cols['type'], mm, add, t5, t3,
                "NA", "NA", "miRNA", attr["Parent"], hit
            ]
            per_sample = zip(samples, attr["Expression"].split(","))
            for sample, counts in per_sample:
                target = os.path.join(out_dir, "%s.mirna" % sample)
                with open(target, 'a') as outh:
                    print("\t".join(before_freq + [counts] + after_freq),
                          file=outh)
Exemple #8
0
def _read_file(fn, precursors, matures, out_dir):
    """Write one ``<sample>.mirna`` table per sample found in a GFF file.

    The sample files are created with a header row; afterwards each
    non-comment line of ``fn`` is appended to every sample file, using the
    matching value of the Expression attribute as ``freq``. Lines with more
    than one element returned by ``align_from_variants`` are skipped.
    """
    column_names = ["seq", "name", "freq", "mir", "start", "end",
                    "mism", "add", "t5", "t3", "s5", "s3", "DB",
                    "precursor", "ambiguity"]
    samples = read_samples(fn)
    for sample in samples:
        sample_path = os.path.join(out_dir, "%s.mirna" % sample)
        with open(sample_path, 'w') as outh:
            print("\t".join(column_names), file=outh)

    with open(fn) as inh:
        for raw in inh:
            if raw.startswith("#"):
                continue
            cols = raw.strip().split("\t")
            attr = read_attributes(raw)
            read = read_id(attr["UID"])

            hairpin = precursors[attr["Parent"]]
            mirna_entry = matures[attr["Parent"]][attr["Name"]]
            iso = attr["Variant"]
            t5 = variant_to_5p(hairpin, mirna_entry, iso)
            t3 = variant_to_3p(hairpin, mirna_entry, iso)
            add = variant_to_add(read, iso)
            mature_sequence = get_mature_sequence(hairpin, mirna_entry)
            mm = align_from_variants(read, mature_sequence, iso)

            if len(mm) > 1:
                continue
            if len(mm) == 1:
                mm = "".join([str(item) for item in mm[0]])
            else:
                mm = "0"

            if "Hits" in attr:
                hit = attr["Hits"]
            else:
                hit = "1"
            logger.debug("exporter::isomir::decode %s" % [iso,
                                                          t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            fields = [read, attr["Read"], "0", attr["Name"], cols[1], cols[2],
                      mm, add, t5, t3, "NA", "NA", "miRNA",  attr["Parent"], hit]
            counts_by_sample = zip(samples, attr["Expression"].split(","))
            for sample, counts in counts_by_sample:
                sample_path = os.path.join(out_dir, "%s.mirna" % sample)
                with open(sample_path, 'a') as outh:
                    # Only the freq column differs between samples.
                    fields[2] = counts
                    print("\t".join(fields), file=outh)
Exemple #9
0
def variant_with_nt(line, precursors, matures):
    """
    Return nucleotides changes for each variant type
    using Variant attribute, precursor sequences and
    mature position.

    Only the first entry of a comma-separated Parent attribute is used.

    Args:
        line: one GFF body line.
        precursors: mapping from parent name to precursor sequence.
        matures: mapping from parent name to a mapping of miRNA name to
            its mature entry.

    Returns:
        The ``iso_5p/iso_3p/iso_add3p/iso_snv`` description string; an
        empty string when the parent or the miRNA is unknown (a warning is
        logged); or the value returned by ``align_from_variants`` when it
        equals ``"Invalid"``.
    """
    attr = feature(line).attributes
    read = read_id(attr["UID"])
    parent = attr["Parent"].split(",")[0]
    name = attr["Name"]
    if parent not in matures:
        logger.warning("Parent miRNA not found in database %s" % parent)
        return ""
    if name not in matures[parent]:
        logger.warning("miRNA not found in database %s" % name)
        return ""
    if parent not in precursors:
        # Same policy as for an unknown mature entry: warn and return ""
        # rather than failing with KeyError on the lookup below.
        logger.warning("Parent miRNA not found in precursor sequences %s"
                       % parent)
        return ""

    precursor = precursors[parent]
    mature = matures[parent][name]
    variant = attr["Variant"]
    logger.debug("GFF::BODY::precursors %s" % precursor)
    logger.debug("GFF:BODY::mature %s" % mature)

    t5 = variant_to_5p(precursor, mature, variant)
    t3 = variant_to_3p(precursor, mature, variant)
    add = variant_to_add(read, variant)
    mature_sequence = get_mature_sequence(precursor, mature, nt=8)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)
    mm = align_from_variants(read, mature_sequence, variant)
    if mm == "Invalid":
        return mm
    if len(mm) > 0:
        # Concatenate the pieces of every reported change.
        mm = "".join(["".join([str(v) for v in m]) for m in mm])
    else:
        mm = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add3p:%s,iso_snv:%s" % (t5, t3, add, mm)
Exemple #10
0
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    The output is written to ``<args.out>/<basename of args.gff>.tsv``. The
    header row is emitted when the ``## COLDATA:`` line is found and every
    following line is exported. With ``args.add_extra`` the nucleotide-level
    variant columns are appended, and lines whose parent or miRNA is unknown
    (counted) or whose variants are reported as ``"Invalid"`` are skipped.

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add3p', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([
            variant_header, 'iso_5p_nt', 'iso_3p_nt', 'iso_add3p_nt',
            'iso_snp_nt'
        ])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    out_file = op.join(args.out,
                       "%s.tsv" % op.splitext(op.basename(args.gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # One ``with`` for both handles: the input is released even if a line
    # raises while being parsed.
    with open(args.gff, 'r') as gff_file, open(out_file, 'w') as outh:
        # Skip the header up to and including the sample declaration.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")
                                   [1].strip().split(","))
                header = sep.join([
                    'UID', 'Read', 'miRNA', 'Variant', variant_header, samples
                ])
                print(header, file=outh)
                break

        # Same iterator, so this loop continues after the COLDATA line.
        for mirna_line in gff_file:
            attr = feature(mirna_line).attributes
            uid = attr["UID"]
            read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                # Called only to validate the UID; the result is not used.
                read_id(uid)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s", read)
            logger.debug("COUNTS::EXTRA:%s", variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s", extra)
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join(
                [uid, read, mirna, variant, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    logger.info("Missing Parents in hairpin file: %s", missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s", missing_mirna)
    logger.info("Non valid UID: %s", unvalid_uid)
    logger.info("Output file is at %s", out_file)
Exemple #11
0
 def _convert(s, test, reverse=False):
     """Check that converting *s* gives *test*.

     *s* goes through ``read_id`` when *reverse* is true and through
     ``make_id`` otherwise; a ValueError is raised on any mismatch.
     """
     if reverse:
         code = read_id(s)
     else:
         code = make_id(s)
     if code == test:
         return
     raise ValueError("%s didn't result on %s but in %s" %
                      (s, test, code))
Exemple #12
0
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    The table is written to ``expression_counts.tsv`` inside ``args.out``.
    Its header row is emitted when the ``## COLDATA:`` line is found, and
    every line after it is exported. With ``args.add_extra`` the
    nucleotide-level variant columns are appended; lines with an unknown
    parent or miRNA are skipped and counted, and lines reported as
    ``"Invalid"`` by ``variant_with_nt`` are skipped.

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p',
                               'iso_add', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header,
                                   'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add_nt', 'iso_snp_nt'])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    out_file = op.join(args.out, "expression_counts.tsv")
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # The input handle is context-managed as well, so it cannot leak when
    # parsing a line raises.
    with open(args.gff, 'r') as gff_file, open(out_file, 'w') as outh:
        # Advance to the sample declaration and write the header from it.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                   variant_header, samples])
                print(header, file=outh)
                break

        # Iteration resumes on the line that follows COLDATA.
        for mirna_line in gff_file:
            attrb = read_gff_line(mirna_line)["attrb"]
            read = attrb["Read"]
            uid = attrb["UID"]
            mirna = attrb["Name"]
            parent = attrb["Parent"]
            variant = attrb["Variant"]
            try:
                # Validation only: the decoded value is discarded.
                read_id(uid)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attrb["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s", read)
            logger.debug("COUNTS::EXTRA:%s", variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s", extra)
                cols_variants = sep.join([cols_variants] + _expand(extra, True))
            summary = sep.join([uid, read, mirna, variant,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    logger.info("Missing Parents in hairpin file: %s", missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s", missing_mirna)
    logger.info("Non valid UID: %s", unvalid_uid)
    logger.info("Output file is at %s", out_file)
Exemple #13
0
 def _convert(s, test, reverse=False):
     """Raise ValueError unless the conversion of *s* equals *test*.

     ``read_id`` is applied when *reverse* is true, ``make_id`` otherwise.
     """
     converter = read_id if reverse else make_id
     code = converter(s)
     if not code != test:
         return None
     message = "%s didn't result on %s but in %s" % (s, test, code)
     raise ValueError(message)