Ejemplo n.º 1
0
def _check_line(line, num, num_samples):
    """Validate one GFF line against the minimum mirGFF3 requirements.

    Args:
        *line (str)*: raw GFF line to validate.
        *num (int)*: line number, used only in error messages.
        *num_samples (int)*: expected number of Expression values.

    Logs an error for every failing field; never raises.
    """
    fields = read_gff_line(line)

    # Check seqID
    if not fields['chrom']:
        logger.error('MISSING seqID in line %s' % (num))

    # Check source: must mention one of the known databases (substring,
    # case-insensitive match).
    source = (fields['source']).lower()
    valid_sources = ["mirBase", "mirgeneDB"]
    valid_source = any(s.lower() in source for s in valid_sources)
    if not valid_source:
        logger.error('INCORRECT SOURCE in line %s' % (num))

    # Check type: only these two feature types are allowed.
    # (renamed from "type" to avoid shadowing the builtin)
    line_type = fields['type']
    if line_type not in ["ref_miRNA", "isomiR"]:
        logger.error('INCORRECT TYPE in line %s' % (num))

    # Check start/end
    if not fields['start']:
        logger.error('MISSING START value in line %s' % (num))

    if not fields['end']:
        logger.error('MISSING END value in line %s' % (num))

    # Check strand
    if str(fields['strand']) not in ["+", "-"]:
        logger.error('INCORRECT STRAND in line %s' % (num))

    # Check attribute-variant against the variants accepted by the
    # current GFF specification version.
    variant = (fields['attrb']['Variant']).lower()
    valid_variants = version.GFFv[version.current]
    if not any(s.lower() in variant for s in valid_variants):
        logger.error('INCORRECT VARIANT type in line %s' % (num))

    # Check attribute-expression: expect one value per sample.
    # list() is required: in Python 3 filter() returns an iterator with
    # no len(), so the original len(filter(...)) raised TypeError.
    expression = fields['attrb']['Expression'].strip().split(",")
    expression = list(filter(None, expression))
    if len(expression) != num_samples:
        logger.error('INCORRECT number of EXPRESSION VALUES \
        in line %s' % (num))
Ejemplo n.º 2
0
def _check_line(line, num, num_samples):
    """Validate one GFF line against the minimum mirGFF3 requirements.

    Args:
        *line (str)*: raw GFF line to validate.
        *num (int)*: line number, used only in error messages.
        *num_samples (int)*: expected number of Expression values.

    Logs an error for every failing field; never raises.
    """
    fields = read_gff_line(line)

    # Check seqID
    if not fields['chrom']:
        logger.error('MISSING seqID in line %s' % (num))

    # Check source: must mention one of the known databases (substring,
    # case-insensitive match).
    source = (fields['source']).lower()
    valid_sources = ["mirBase", "mirgeneDB"]
    valid_source = any(s.lower() in source for s in valid_sources)
    if not valid_source:
        logger.error('INCORRECT SOURCE in line %s' % (num))

    # Check type: only these two feature types are allowed.
    # (renamed from "type" to avoid shadowing the builtin)
    line_type = fields['type']
    if line_type not in ["ref_miRNA", "isomiR"]:
        logger.error('INCORRECT TYPE in line %s' % (num))

    # Check start/end
    if not fields['start']:
        logger.error('MISSING START value in line %s' % (num))

    if not fields['end']:
        logger.error('MISSING END value in line %s' % (num))

    # Check strand
    if str(fields['strand']) not in ["+", "-"]:
        logger.error('INCORRECT STRAND in line %s' % (num))

    # Check attribute-variant against the variants accepted by the
    # current GFF specification version.
    variant = (fields['attrb']['Variant']).lower()
    valid_variants = version.GFFv[version.current]
    if not any(s.lower() in variant for s in valid_variants):
        logger.error('INCORRECT VARIANT type in line %s' % (num))

    # Check attribute-expression: expect one value per sample.
    # list() is required: in Python 3 filter() returns an iterator with
    # no len(), so the original len(filter(...)) raised TypeError.
    expression = fields['attrb']['Expression'].strip().split(",")
    expression = list(filter(None, expression))
    if len(expression) != num_samples:
        logger.error('INCORRECT number of EXPRESSION VALUES \
        in line %s' % (num))
Ejemplo n.º 3
0
def read_reference(fn):
    """Load a GFF file into a UID-indexed dictionary.

    Args:
        *fn (str)*: GFF file.

    Returns:
        *srna (dict)*: dict with >>> {'UID': 'iso_snp:-2,...'}
    """
    srna = {}
    with open(fn) as handle:
        for record in handle:
            # Header/comment lines carry no UID information.
            if record.startswith("#"):
                continue
            attributes = read_gff_line(record)['attrb']
            uid = attributes['UID']
            srna[uid] = [_simplify(attributes['Variant']), attributes]
    return srna
Ejemplo n.º 4
0
def read_reference(fn):
    """Build a UID -> [simplified variant, attributes] map from a GFF file.

    Args:
        *fn (str)*: GFF file.

    Returns:
        *srna (dict)*: dict with >>> {'UID': 'iso_snp:-2,...'}
    """
    srna = dict()
    with open(fn) as in_handle:
        # Generator skips header/comment lines before parsing.
        data_lines = (l for l in in_handle if not l.startswith("#"))
        for gff_line in data_lines:
            attr = read_gff_line(gff_line)['attrb']
            srna[attr['UID']] = [_simplify(attr['Variant']), attr]
    return srna
Ejemplo n.º 5
0
def _compare_to_reference(fn, reference):
    """Compare GFF entries in *fn* against *reference* and tally outcomes.

    Each entry is classified as detected ("D"), extra ("E") or, after the
    scan, missed ("M"); per-category counts are logged.
    """
    same = 0
    diff = []
    extra = []
    miss = []
    results = []
    seen = 0
    seen_reference = set()
    with open(fn) as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            attr = read_gff_line(raw)['attrb']
            uid = attr['UID']
            if uid in reference:
                ref_variant, ref_attr = reference[uid]
                # "Y" marks a matching miRNA name; otherwise keep ours.
                mirna = "Y" if attr['Name'] == ref_attr['Name'] else attr['Name']
                accuracy = _accuracy(_simplify(attr['Variant']), ref_variant)
                results.append([uid, "D", mirna, accuracy])
                if _simplify(attr['Variant']) == ref_variant:
                    same += 1
                else:
                    diff.append("%s | reference: %s" %
                                (raw.strip(), ref_attr))
                seen += 1
                seen_reference.add(uid)
            else:
                extra.append("%s | extra" % raw.strip())
                results.append([
                    uid, "E", attr['Name'],
                    _accuracy(_simplify(attr['Variant']), "")
                ])
    # Reference UIDs never encountered in the file are "missed".
    for uid in reference:
        if uid not in seen_reference:
            results.append([uid, "M", "N", _accuracy("", reference[uid][0])])
            miss.append("| miss %s" % reference[uid][1])
    logger.info("Number of sequences found in reference: %s" % seen)
    logger.info("Number of sequences matches reference: %s" % same)
    logger.info("Number of sequences different than reference: %s" % len(diff))
    logger.info("Number of sequences extra sequences: %s" % len(extra))
    logger.info("Number of sequences missed sequences: %s" % len(miss))
    return results
Ejemplo n.º 6
0
def _calc_stats(fn):
    """Read a GFF file and summarize its entries into categories.

    Only lines whose Filter attribute is "Pass" are counted, and each
    (UID, Variant, Name) combination is counted once.
    """
    samples = _get_samples(fn)
    lines = []
    seen = set()
    with open(fn) as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            cols = read_gff_line(raw)
            logger.debug("## STATS: attribute %s" % cols['attrb'])
            attr = cols['attrb']
            if attr['Filter'] != "Pass":
                continue
            key = "-".join([attr['UID'], attr['Variant'], attr['Name']])
            if key in seen:
                continue
            seen.add(key)
            lines.extend(_classify(cols['type'], attr, samples))
    return _summary(lines)
Ejemplo n.º 7
0
def _compare_to_reference(fn, reference):
    """Compare GFF entries in *fn* against *reference* and tally outcomes.

    Entries present in the reference are recorded as "D" (detected),
    unknown entries as "E" (extra), and reference UIDs never seen in the
    file as "M" (missed). Per-category counts are logged at the end.
    """
    same = 0
    diff = list()
    extra = list()
    miss = list()
    results = list()
    seen = 0
    seen_reference = set()
    with open(fn) as inh:
        for line in inh:
            # Skip header/comment lines.
            if line.startswith("#"):
                continue
            cols = read_gff_line(line)
            attr = cols['attrb']
            if attr['UID'] in reference:
                # "Y" marks a matching miRNA name; otherwise keep ours.
                mirna = "Y" if attr['Name'] == reference[attr['UID']][1]['Name'] else attr['Name']
                accuracy =  _accuracy(_simplify(attr['Variant']), reference[attr['UID']][0])
                results.append([attr['UID'], "D", mirna, accuracy])
                if _simplify(attr['Variant']) == reference[attr['UID']][0]:
                    same += 1
                else:
                    diff.append("%s | reference: %s" % (line.strip(), reference[attr['UID']][1]))
                seen += 1
                seen_reference.add(attr['UID'])
            else:
                extra.append("%s | extra" % line.strip())
                results.append([attr['UID'], "E", attr['Name'], _accuracy(_simplify(attr['Variant']), "")])
    # Reference UIDs never encountered in the file are "missed".
    for uid in reference:
        if uid not in seen_reference:
            results.append([uid, "M", "N", _accuracy("", reference[uid][0])])
            miss.append("| miss %s" %  reference[uid][1])
    logger.info("Number of sequences found in reference: %s" % seen)
    logger.info("Number of sequences matches reference: %s" % same)
    logger.info("Number of sequences different than reference: %s" % len(diff))
    logger.info("Number of sequences extra sequences: %s" % len(extra))
    logger.info("Number of sequences missed sequences: %s" % len(miss))
    return results
Ejemplo n.º 8
0
def _fix(line, expression):
    """Rewrite a GFF line's Expression attribute with *expression*.

    The Read attribute is not useful when multiple samples share one
    line, so the merged expression string is written back and the line
    re-assembled in its detected format.
    """
    parsed = read_gff_line(line)
    parsed['attrb']['Expression'] = expression
    return paste_columns(parsed, guess_format(line))
Ejemplo n.º 9
0
 def test_read_line(self):
     """Read GFF/GTF line"""
     # Smoke test: parse every line of the two-sample example GFF and
     # print the parsed structure. It fails only if read_gff_line raises.
     from mirtop.gff.body import read_gff_line
     with open("data/examples/gff/2samples.gff") as inh:
         for line in inh:
             print(read_gff_line(line))
Ejemplo n.º 10
0
def convert_gff_counts(args):
    """ Reads a GFF file to produces output file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p',
                               'iso_add', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header,
                                   'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add_nt', 'iso_snp_nt'])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    out_file = op.join(args.out, "expression_counts.tsv")
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    # Fix: the input file was previously opened without a context manager
    # and leaked if a malformed line raised mid-parse; both files are now
    # guaranteed to be closed.
    with open(args.gff, 'r') as gff_file, open(out_file, 'w') as outh:

        # First pass over the header: emit the TSV header once the
        # COLDATA line (sample names) is found.
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                   variant_header, samples])
                print(header, file=outh)
                break

        # Continue from the same file handle: remaining lines are records.
        for mirna_line in gff_file:
            mirna_values = read_gff_line(mirna_line)
            Read = mirna_values["attrb"]["Read"]
            UID = mirna_values["attrb"]["UID"]
            mirna = mirna_values["attrb"]["Name"]
            parent = mirna_values["attrb"]["Parent"]
            variant = mirna_values["attrb"]["Variant"]
            # Skip records whose UID cannot be decoded.
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(mirna_values["attrb"]["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] + _expand(extra, True))
            summary = sep.join([UID, Read,  mirna, variant,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
Ejemplo n.º 11
0
def _fix(line, expression):
    """Rewrite a GFF line's Expression attribute with *expression*."""
    # The Read attribute is not useful when multiple samples share one
    # line, so the merged expression string is written back and the line
    # re-assembled in its detected format.
    cols = read_gff_line(line)
    cols['attrb']['Expression'] = expression
    return paste_columns(cols, guess_format(line))
Ejemplo n.º 12
0
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *database(str)*: database name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            # Fix: use "is None" for the None check (the original used
            # "== None", a PEP 8 violation).
            if query_name not in reads and query_sequence is None:
                continue
            # Discard reads containing ambiguous bases.
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            # Translate genomic coordinates to precursor (transcript)
            # coordinates when a mapping exists.
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                     " Parent {preName}; Variant {isoformat};"
                     " Isocode {isotag}; Cigar {cigar}; Expression {counts};"
                     " Filter {Filter}; Hits {hit};").format(**locals())
            line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                    "{score}\t{strand}\t.\t{attrb}").format(**locals())
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(read_gff_line(line), sep=sep)
            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
Ejemplo n.º 13
0
 def test_read_line(self):
     """Read GFF/GTF line"""
     # Smoke test: parse every line of the two-sample example GFF and
     # print the parsed structure. It fails only if read_gff_line raises.
     from mirtop.gff.body import read_gff_line
     with open("data/examples/gff/2samples.gff") as inh:
         for line in inh:
             print(read_gff_line(line))
Ejemplo n.º 14
0
def read_file(folder, args):
    """
    Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *folder(str)*: output folder of sRNAbench containing
            reads.annotation and microRNAannotation.txt.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            # Discard reads containing ambiguous bases.
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                n_in += 1
                continue

            counts = int(cols[1])

            # Number of distinct miRNA names among the hits of this read.
            hit = len(set([mirna.split("#")[1] for mirna in cols[4].split("$")]))

            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                    # Fix: the original fell through here, and the
                    # matures[chrom] lookup on the next line raised
                    # KeyError when chrom was missing; skip the hit.
                    continue
                if mirName not in matures[chrom]:
                    # NOTE(review): the original counted this but kept
                    # processing the hit; behavior preserved — confirm
                    # whether these hits should be skipped too.
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                # "mv" (multiple variants) is not representable in GFF.
                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {start}\n"
                             "  external: {isoformat}\n"
                             "  hit: {hit}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                # Hit extends past the precursor sequence: drop it.
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue

                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(**locals())
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(**locals())
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(read_gff_line(line), sep=sep)
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append([idu, chrom, counts,
                                                sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads
Ejemplo n.º 15
0
def create_vcf(mirgff3, precursor, gtf, vcffile):
    """Convert a mirGFF3 file into a VCF v4.3 file.

    Args:
        'mirgff3(str)': File with mirGFF3 format that will be converted
        'precursor(str)': Fasta format sequences of all miRNA hairpins
        'gtf(str)': Genome coordinates
        'vcffile': name of the file to be saved
    Returns:
        Nothing is returned, instead, a VCF file is generated
    """
    # Open the input file with Python 2/3-compatible encoding handling
    # (six); exit the program if it cannot be read.
    try:
        gff3_file = open(mirgff3, "r", encoding="utf-8") if six.PY3 else open(
            mirgff3, "r")
    except IOError:
        # NOTE(review): end=mirgff3 appends the filename with no newline;
        # it looks like it was meant as a normal print argument — confirm.
        print("Can't read the file", end=mirgff3)
        sys.exit()
    with gff3_file:
        data = gff3_file.read()
        if six.PY2:
            data = data.decode("utf-8-sig").encode("utf-8")

    gff3_data = data.split("\n")
    vcf_file = open(vcffile, "w")

    ver = "v4.3"  # Current VCF version formatting
    vcf_file.write("##fileformat=VCF%s\n" % ver)
    date = datetime.datetime.now().strftime("%Y%m%d")
    vcf_file.write("##fileDate=%s\n" % date)
    # Default source from any '## source-ontology: ' line; overwritten
    # below while scanning the header block line by line.
    source = "\n".join(s for s in gff3_data
                       if "## source-ontology: " in s)[20:]
    line = 0
    sample_names = []
    # Walk the '##' header block to pick up source and sample names.
    while gff3_data[line][:2] == "##":
        if gff3_data[line][:19] == "## source-ontology:":
            source = gff3_data[line][20:]
        elif gff3_data[line][:11] == "## COLDATA:":
            sample_names = gff3_data[line][12:].split(",")
        line += 1
    # Emit the VCF meta-information header.
    vcf_file.write("##source=%s\n" % source)
    vcf_file.write(
        '##INFO=<ID=NS,Type=Integer,Description="Number of samples"\n')
    vcf_file.write("##FILTER=<ID=REJECT,Description='"
                   'Filter not passed'
                   "'>\n")
    vcf_file.write(
        '##FORMAT=<ID=TRC,Number=1,Type=Integer,Description="Total read count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=TSC,Number=1,Type=Integer,Description="Total SNP count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=TMC,Number=1,Type=Integer,Description="Total miRNA count">\n'
    )
    vcf_file.write(
        '##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">\n')
    header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
    # Adds Header
    for s in range(len(sample_names)):
        header = header + "\t" + sample_names[s]
    vcf_file.write(header)

    all_dict = dict(
    )  # initializing an empty dictionary where all info will be added
    key_list = [
    ]  # Initializing a list which will contain all the keys of the dictionary
    mirna_dict = dict(
    )  # initializing an empty dictionary where mirna info will be put
    n_SNP = 0
    n_noSNP = 0
    no_var = 0
    hairpins = read_precursor(precursor)
    gff3 = read_gtf_to_precursor(gtf)
    gtf_dic = read_gtf_to_mirna(gtf)
    # Main pass: parse every record line and accumulate variant entries.
    for line in range(0, len(gff3_data)):
        if not gff3_data[line]:
            continue
        # NOTE(review): this tests the SECOND character; a 1-character
        # line would raise IndexError, and a line starting '#x' would
        # not be skipped — confirm intended header detection.
        if gff3_data[line][1] == "#":
            continue
        else:  # Parsing the gff3 mirna lecture:
            gff_fields = read_gff_line(gff3_data[line])
            gtf_name = gff_fields['attrb']['Name']
            gtf_parent = gff_fields['attrb']['Parent']
            # Skip records whose precursor/miRNA is absent from the GTF.
            if gtf_parent not in gff3:
                continue
            if gtf_name not in gff3[gtf_parent]:
                continue
            parent_ini_pos = gff3[gtf_parent][gtf_name][0]
            parent_end_pos = gff3[gtf_parent][gtf_name][1]
            ref_seq = (hairpins[gtf_parent][parent_ini_pos:parent_end_pos + 1])
            # Genomic chromosome and position of this miRNA.
            vcf_chrom = gtf_dic[gtf_name][gtf_parent][0]
            vcf_pos = int(gff_fields['start']) + int(
                gtf_dic[gtf_name][gtf_parent][1])
            hairpin = hairpins[gtf_parent]
            variants = gff_fields['attrb']['Variant'].split(",")
            logger.debug("VCF::Variant::%s" % variants)
            #  Obtaining the iso_3p, iso_add3p and iso_5p values:

            var3p = [s for s in variants if 'iso_3p' in s]
            if len(var3p):
                var3p = int(var3p[0][7:])  # Position of iso_3p value
            else:
                var3p = 0

            var_add3p = [s for s in variants if 'iso_add3p' in s]
            if len(var_add3p):
                var_add3p = int(
                    var_add3p[0][10:])  # Position of iso_add3p value
            else:
                var_add3p = 0
            var3p = var3p + var_add3p
            logger.debug("VCF::VAR_3p::%s" % var3p)
            var5p = [s for s in variants if 'iso_5p' in s]
            if len(var5p):
                var5p = int(var5p[0][7:])  # Position of iso_5p value
            else:
                var5p = 0  #
            logger.debug("VCF::VAR_5p::%s" % var5p)
            cigar = gff_fields['attrb']["Cigar"]
            # Obtaining all the variants from the cigar:
            # NOTE(review): 'if 1:' is always true, so the 'else' branch
            # (no_var += 1) below is unreachable dead code — confirm.
            if 1:
                (key_pos, key_var, vcf_ref, vcf_alt) = cigar_2_key(
                    cigar, gff_fields['attrb']['Read'], ref_seq, vcf_pos,
                    var5p, var3p, parent_ini_pos, parent_end_pos, hairpin)

                # Adding the variants to a dictionary and calculating all the fields of a vcf file format:
                if len(key_var) > 0:
                    for s in range(len(key_var)):
                        key_dict = vcf_chrom + '-' + str(
                            key_pos[s]) + '-' + str(key_var[s])
                        raw_counts = gff_fields['attrb']['Expression']
                        raw_counts = [int(i) for i in raw_counts.split(',')]
                        nozero_counts = [
                            int(i > 0) for i in raw_counts
                        ]  # counts for every sample if expr != 0.
                        if gtf_name in mirna_dict:  # Adding expression values to same mirnas
                            mirna_dict[gtf_name]['Z'] = [
                                sum(x) for x in zip(mirna_dict[gtf_name]['Z'],
                                                    raw_counts)
                            ]
                        else:
                            mirna_dict[gtf_name] = {}
                            mirna_dict[gtf_name]["Z"] = raw_counts
                        if key_dict in all_dict:
                            # Already-seen SNP variant: accumulate counts.
                            if all_dict[key_dict]["Type"] in [
                                    "A", "C", "T", "G"
                            ]:
                                all_dict[key_dict]['X'] = [
                                    sum(x) for x in zip(
                                        all_dict[key_dict]['X'], nozero_counts)
                                ]
                                all_dict[key_dict]['Y'] = [
                                    sum(x) for x in zip(
                                        all_dict[key_dict]['Y'], raw_counts)
                                ]
                        else:
                            # First time this variant key is seen: record
                            # all VCF columns for it.
                            key_list.append(key_dict)
                            all_dict[key_dict] = {}
                            all_dict[key_dict]["Chrom"] = vcf_chrom
                            all_dict[key_dict]["Position"] = key_pos[s]
                            all_dict[key_dict]["mirna"] = gtf_name
                            all_dict[key_dict]["Type"] = key_var[s]
                            if key_var[s][0] in ["A", "C", "T", "G"]:
                                n_SNP += 1
                                all_dict[key_dict]["SNP"] = True
                                all_dict[key_dict]["ID"] = gff_fields['attrb'][
                                    'Name'] + '-SNP' + str(n_SNP)
                                all_dict[key_dict]['X'] = nozero_counts
                                all_dict[key_dict]['Y'] = raw_counts
                            else:
                                n_noSNP += 1
                                all_dict[key_dict]["SNP"] = False
                                all_dict[key_dict]["ID"] = gff_fields['attrb'][
                                    'Name'] + '-nonSNP' + str(n_noSNP)
                            all_dict[key_dict]["Ref"] = vcf_ref[s]
                            all_dict[key_dict]["Alt"] = vcf_alt[s]
                            all_dict[key_dict]["Qual"] = "."
                            all_dict[key_dict]["Filter"] = gff_fields['attrb'][
                                'Filter']
                            all_dict[key_dict]["Info"] = "NS=" + str(
                                len(sample_names))
            else:
                no_var += 1

    #  Writing the VCF file:
    for s in key_list:
        variant_line = (
            "\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %
            (all_dict[s]["Chrom"], all_dict[s]["Position"], all_dict[s]["ID"],
             all_dict[s]["Ref"], all_dict[s]["Alt"], all_dict[s]["Qual"],
             all_dict[s]["Filter"], all_dict[s]["Info"]))
        if all_dict[s]["Type"] in ["A", "T", "C", "G"]:
            # SNP entries carry per-sample TRC:TSC:TMC:GT columns.
            format_col = "TRC:TSC:TMC:GT"
            variant_line = variant_line + "\t" + format_col
            samples = ""
            for n in range(len(sample_names)):
                X = all_dict[s]["X"][n]
                Y = all_dict[s]["Y"][n]
                Z = mirna_dict[all_dict[s]["mirna"]]["Z"][n]
                # Calculating the genotype:
                if Y == 0:
                    GT = "0|0"
                elif Z == Y:
                    GT = "1|1"
                else:
                    GT = "1|0"
                samples = samples + "\t" + str(X) + ":" + str(Y) + ":" + str(
                    Z) + ":" + GT
            variant_line = variant_line + samples
        else:
            format_col = ""
            variant_line = variant_line + format_col
        vcf_file.write(variant_line)
    vcf_file.close()
Ejemplo n.º 16
0
def read_file(folder, args):
    """
    Read sRNAbench output and convert it to mirtop GFF format.

    Args:
        *folder(str)*: folder with sRNAbench output files;
            ``reads.annotation`` and ``microRNAannotation.txt``
            are read from it.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    # GTF attributes are space-separated; GFF uses '='.
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0        # hits falling outside the precursor sequence
    n_in = 0         # NOTE(review): incremented both for non-mature skipped
                     #   reads and for accepted hits — confirm intended meaning
    n_ns = 0         # reads containing N bases
    n_notassign = 0  # isomiRs with unsupported "mv" variant
    n_notindb = 0    # hits whose precursor/miRNA is not in the database
    reads = defaultdict(dict)
    seen = set()

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            # sRNAbench uses the read sequence itself as the identifier.
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            # Skip reads with ambiguous bases.
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            # Only hits annotated against a mature miRNA are kept.
            if cols[3].find("mature") == -1:
                n_in += 1
                continue

            counts = int(cols[1])

            # Number of distinct miRNAs this read hits; hits are
            # '$'-separated, fields inside a hit are '#'-separated.
            hit = len(
                set([mirna.split("#")[1] for mirna in cols[4].split("$")]))

            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1  # sRNAbench is 1-based
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                # FIX: these checks previously fell through, so
                # matures[chrom] / precursors[chrom] below raised KeyError
                # for precursors missing from the database.
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    n_notindb += 1
                    continue
                # Each (sequence, miRNA) pair is reported only once.
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                # "mv" (multi-variant) is not representable in GFF.
                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {start}\n"
                             "  external: {isoformat}\n"
                             "  hit: {hit}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                # Drop hits that extend past the precursor sequence.
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue

                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(**locals())
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(**locals())
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(read_gff_line(line), sep=sep)
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append(
                        [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads