def _convert_file(gff, args):
    sep = "\t"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    variant_header = sep.join(['mism', 'add', 't5', 't3'])

    gff_file = open(gff, 'r')
    out_file = os.path.join(args.out, "%s_rawData.tsv" %
                            os.path.splitext(os.path.basename(gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['seq', 'mir', variant_header, samples])
                print(header, file=outh)
                break

        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                Read = read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if parent not in precursors:
                missing_parent += 1
                continue
            if mirna not in matures[parent]:
                missing_mirna += 1
                continue
            extra = variant_with_nt(mirna_line, precursors, matures)
            if extra == "Invalid":
                continue
            logger.debug("COUNTS::EXTRA:%s" % extra)
            cols_variants = sep.join(_expand(extra, True))
            summary = sep.join([Read, mirna, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)

def compare(args):
    """
    From a list of GFF files produce comparison with a reference set.

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_compare()*.
            First file will be considered the reference set.

    Returns:
        *(out_file)*: comparison of the GFF files with the reference.
    """
    out = list()
    result = dict()
    reference = read_reference(args.files[0])
    for fn in args.files[1:]:
        if not os.path.exists(fn):
            raise IOError("%s doesn't exist" % fn)
        result[os.path.basename(fn)] = _compare_to_reference(fn, reference)
    if args.out != "tmp_mirtop":
        fn_out = os.path.join(args.out, "summary.txt")
        with open(fn_out, 'w') as outh:
            for fn in result:
                print("sample\tidu\tseq\ttag\tsame_mirna\t%s" %
                      "\t".join(result[fn][0][3].keys()), file=outh)
                for line in result[fn]:
                    read = read_id(line[0])
                    acc = "\t".join([line[3][v] for v in line[3]])
                    print("%s\t%s\t%s\t%s\t%s\t%s" % (
                        fn, line[0], read, line[1], line[2], acc), file=outh)

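# A minimal usage sketch for compare() above, kept as a separate helper so it
# does not run on import. File names and the output folder are hypothetical;
# the Namespace only carries the fields compare() reads (files, out). The first
# entry in `files` is treated as the reference set, and a summary.txt is
# written to `out` when it is not the default "tmp_mirtop" temporary folder.
def _example_compare():
    from argparse import Namespace
    args = Namespace(files=["reference.gff", "sample1.gff"], out="results")
    compare(args)  # writes results/summary.txt
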
def variant_with_nt(line, precursors, matures):
    """
    Return nucleotides changes for each variant type
    using Variant attribute, precursor sequences and
    mature position.
    """
    cols = read_gff_line(line)
    attr = cols["attrb"]
    read = read_id(attr["UID"])
    logger.debug("GFF::BODY::precursors %s" % precursors[attr["Parent"]])
    logger.debug("GFF:BODY::mature %s" % matures[attr["Parent"]][attr["Name"]])
    t5 = variant_to_5p(precursors[attr["Parent"]],
                       matures[attr["Parent"]][attr["Name"]],
                       attr["Variant"])
    t3 = variant_to_3p(precursors[attr["Parent"]],
                       matures[attr["Parent"]][attr["Name"]],
                       attr["Variant"])
    add = variant_to_add(read, attr["Variant"])
    mature_sequence = get_mature_sequence(
        precursors[attr["Parent"]], matures[attr["Parent"]][attr["Name"]])
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)
    mm = align_from_variants(read, mature_sequence, attr["Variant"])
    if mm == "Invalid":
        return mm
    if len(mm) > 0:
        mm = "".join(["".join(map(str, m)) for m in mm])
    else:
        mm = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add:%s,iso_snp:%s" % (t5, t3, add, mm)

def _process(fn, out_dir):
    if out_dir:
        out_fasta = os.path.join(
            out_dir, "%s.fasta" % os.path.splitext(os.path.basename(fn))[0])
    outh = sys.stdout if not out_dir else open(out_fasta, 'w')
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            attr = gff.attributes
            read = read_id(attr["UID"])
            print((">{0}\n{1}").format(attr["UID"], read), file=outh)

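# A minimal usage sketch for _process() above (hypothetical paths): it decodes
# each UID back into its read sequence and emits FASTA records, either to
# stdout or to <out_dir>/<basename>.fasta when an output directory is given.
def _example_process():
    _process("sample.gff", None)       # print FASTA records to stdout
    _process("sample.gff", "results")  # write results/sample.fasta
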
def read(fn, args):
    """Read GTF/GFF file and load into annotate, chrom counts, sample, line"""
    samples = read_samples(fn)
    lines = defaultdict(dict)
    sep = " " if args.out_format == "gtf" else "="
    corrupted_uid = 0
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            line = paste_columns(feature(line), sep=sep)
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            if attr.get('UID') and not read_id(attr['UID']):
                corrupted_uid += 1
                continue
            if 'UID' not in attr:
                msg = "UID not found."
                # Try to recover the UID from the Read or sequence attribute.
                if 'Read' in attr:
                    if not is_sequence(attr['Read']):
                        msg = msg + " Sequence not valid in Read attribute."
                    else:
                        attr['UID'] = make_id(attr['Read'])
                elif 'sequence' in attr:
                    if not is_sequence(attr['sequence']):
                        msg = msg + " Sequence not valid in sequence attribute."
                    else:
                        attr['UID'] = make_id(attr['sequence'])
                else:
                    msg = msg + " Sequence not found in sequence attribute."
            if 'UID' not in attr:
                logger.warning("Line is not a valid GFF3 line: %s" % line.strip())
                logger.warning(msg)
                continue
            if cols['start'] not in lines[cols['chrom']]:
                lines[cols['chrom']][cols['start']] = []
            uid = "%s-%s-%s" % (attr['UID'], attr['Variant'], attr['Name'])
            if args.keep_name:
                uid = "%s-%s" % (uid, attr['Read'])
            lines[cols['chrom']][cols['start']].append(
                [uid, cols['chrom'], attr['Expression'].strip().split(","),
                 samples, line.strip()])
    logger.info("Lines skipped due to corrupted UID: %s" % corrupted_uid)
    return lines

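# A sketch (hypothetical file name; `args` as parsed on the command line) of
# walking the structure returned by read(): a dict keyed by chromosome, then
# start position, where each entry is
# [uid, chrom, expression counts, samples, raw GFF line].
def _example_read(args):
    lines = read("sample.gff", args)
    for chrom in lines:
        for start in lines[chrom]:
            for uid, _, counts, samples, raw in lines[chrom][start]:
                print(uid, dict(zip(samples, counts)))
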
def _read_file(fn, precursors, matures, out_dir):
    samples = read_samples(fn)
    for sample in samples:
        with open(os.path.join(out_dir, "%s.mirna" % sample), 'w') as outh:
            print("\t".join(["seq", "name", "freq", "mir", "start", "end",
                             "mism", "add", "t5", "t3", "s5", "s3", "DB",
                             "precursor", "ambiguity"]), file=outh)
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            read = read_id(attr["UID"])
            t5 = variant_to_5p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            t3 = variant_to_3p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            add = variant_to_add(read, attr["Variant"])
            mature_sequence = get_mature_sequence(
                precursors[attr["Parent"]],
                matures[attr["Parent"]][attr["Name"]])
            mm = align_from_variants(read, mature_sequence, attr["Variant"])
            if len(mm) > 1:
                continue
            elif len(mm) == 1:
                mm = "".join(list(map(str, mm[0])))
            else:
                mm = "0"
            hit = attr["Hits"] if "Hits" in attr else "1"
            logger.debug("exporter::isomir::decode %s" %
                         [attr["Variant"], t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            # print(cols)
            line = [read, attr["Read"], "0", attr["Name"],
                    cols['source'], cols['type'],
                    mm, add, t5, t3,
                    "NA", "NA", "miRNA", attr["Parent"], hit]
            for sample, counts in zip(samples, attr["Expression"].split(",")):
                with open(os.path.join(out_dir, "%s.mirna" % sample),
                          'a') as outh:
                    line[2] = counts
                    print("\t".join(line), file=outh)

def _read_file(fn, precursors, matures, out_dir):
    samples = read_samples(fn)
    for sample in samples:
        with open(os.path.join(out_dir, "%s.mirna" % sample), 'w') as outh:
            print("\t".join(
                ["seq", "name", "freq", "mir", "start", "end",
                 "mism", "add", "t5", "t3", "s5", "s3", "DB",
                 "precursor", "ambiguity"]), file=outh)
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            cols = line.strip().split("\t")
            attr = read_attributes(line)
            read = read_id(attr["UID"])
            t5 = variant_to_5p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            t3 = variant_to_3p(precursors[attr["Parent"]],
                               matures[attr["Parent"]][attr["Name"]],
                               attr["Variant"])
            add = variant_to_add(read, attr["Variant"])
            mature_sequence = get_mature_sequence(
                precursors[attr["Parent"]],
                matures[attr["Parent"]][attr["Name"]])
            mm = align_from_variants(read, mature_sequence, attr["Variant"])
            if len(mm) > 1:
                continue
            elif len(mm) == 1:
                mm = "".join(map(str, mm[0]))
            else:
                mm = "0"
            hit = attr["Hits"] if "Hits" in attr else "1"
            logger.debug("exporter::isomir::decode %s" %
                         [attr["Variant"], t5, t3, add, mm])
            # Error if attr["Read"] doesn't exist
            line = [read, attr["Read"], "0", attr["Name"],
                    cols[1], cols[2],
                    mm, add, t5, t3,
                    "NA", "NA", "miRNA", attr["Parent"], hit]
            for sample, counts in zip(samples, attr["Expression"].split(",")):
                with open(os.path.join(out_dir, "%s.mirna" % sample),
                          'a') as outh:
                    line[2] = counts
                    print("\t".join(line), file=outh)

def variant_with_nt(line, precursors, matures):
    """
    Return nucleotides changes for each variant type
    using Variant attribute, precursor sequences and
    mature position.
    """
    gff = feature(line)
    attr = gff.attributes
    read = read_id(attr["UID"])
    attr["Parent"] = attr["Parent"].split(",")[0]
    if attr["Parent"] not in matures:
        logger.warning("Parent miRNA not found in database %s" %
                       attr["Parent"])
        return ""
    if attr["Name"] not in matures[attr["Parent"]]:
        logger.warning("miRNA not found in database %s" % attr["Name"])
        return ""
    logger.debug("GFF::BODY::precursors %s" % precursors[attr["Parent"]])
    logger.debug("GFF:BODY::mature %s" % matures[attr["Parent"]][attr["Name"]])
    t5 = variant_to_5p(precursors[attr["Parent"]],
                       matures[attr["Parent"]][attr["Name"]],
                       attr["Variant"])
    t3 = variant_to_3p(precursors[attr["Parent"]],
                       matures[attr["Parent"]][attr["Name"]],
                       attr["Variant"])
    add = variant_to_add(read, attr["Variant"])
    mature_sequence = get_mature_sequence(
        precursors[attr["Parent"]],
        matures[attr["Parent"]][attr["Name"]], nt=8)
    logger.debug("GFF::BODY::mature_sequence %s" % mature_sequence)
    mm = align_from_variants(read, mature_sequence, attr["Variant"])
    if mm == "Invalid":
        return mm
    if len(mm) > 0:
        mm = "".join(["".join([str(v) for v in m]) for m in mm])
    else:
        mm = "0"
    return "iso_5p:%s,iso_3p:%s,iso_add3p:%s,iso_snv:%s" % (t5, t3, add, mm)

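# A small hypothetical helper (not part of the module) showing how the
# "iso_5p:...,iso_3p:...,iso_add3p:...,iso_snv:..." string returned above could
# be split back into a dict; it only assumes the comma-separated "key:value"
# format used in the return statement.
def _parse_variant_nt(extra):
    if not extra or extra == "Invalid":
        return {}
    return dict(item.split(":", 1) for item in extra.split(","))
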
def convert_gff_counts(args):
    """ Reads a GFF file to produce an output file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add3p', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header,
                                   'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add3p_nt', 'iso_snp_nt'])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    gff_file = open(args.gff, 'r')
    out_file = op.join(args.out, "%s.tsv" %
                       op.splitext(op.basename(args.gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                   variant_header, samples])
                print(header, file=outh)
                break

        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join([UID, Read, mirna, variant,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)

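# A minimal invocation sketch for convert_gff_counts() above (hypothetical file
# names; the Namespace mirrors the fields the function reads: gff, out,
# add_extra, and, when add_extra is True, also hairpin, sps and gtf).
def _example_convert_gff_counts():
    from argparse import Namespace
    args = Namespace(gff="sample.gff", out="results", add_extra=True,
                     hairpin="hairpin.fa", sps="hsa", gtf="hsa.gff3")
    convert_gff_counts(args)  # writes results/sample.tsv
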
def _convert(s, test, reverse=False):
    code = read_id(s) if reverse else make_id(s)
    if code != test:
        raise ValueError("%s didn't result on %s but in %s" % (s, test, code))

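# A hypothetical round-trip check built on _convert() above: make_id() encodes
# a read sequence into a UID and read_id() decodes it back, so both directions
# should agree (the sequence shown is illustrative only).
def _example_convert_roundtrip():
    seq = "TGAGGTAGTAGGTTGTATAGTT"
    uid = make_id(seq)
    _convert(seq, uid)                # forward: make_id(seq) must equal uid
    _convert(uid, seq, reverse=True)  # reverse: read_id(uid) must recover seq
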
def convert_gff_counts(args):
    """ Reads a GFF file to produce an output file containing Expression counts

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header,
                                   'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add_nt', 'iso_snp_nt'])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    gff_file = open(args.gff, 'r')
    out_file = op.join(args.out, "expression_counts.tsv")
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:
        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                   variant_header, samples])
                print(header, file=outh)
                break

        for mirna_line in gff_file:
            mirna_values = read_gff_line(mirna_line)
            Read = mirna_values["attrb"]["Read"]
            UID = mirna_values["attrb"]["UID"]
            mirna = mirna_values["attrb"]["Name"]
            parent = mirna_values["attrb"]["Parent"]
            variant = mirna_values["attrb"]["Variant"]
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(
                mirna_values["attrb"]["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join([UID, Read, mirna, variant,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)