def _convert_file(gff, args):
    sep = "\t"
    precursors = fasta.read_precursor(args.hairpin, args.sps)
    matures = mapper.read_gtf_to_precursor(args.gtf)
    variant_header = sep.join(['mism', 'add', 't5', 't3'])

    gff_file = open(gff, 'r')
    out_file = os.path.join(
        args.out,
        "%s_rawData.tsv" % os.path.splitext(os.path.basename(gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:

        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['seq', 'mir', variant_header, samples])
                print(header, file=outh)
                break

        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                Read = read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if parent not in precursors:
                missing_parent += 1
                continue
            if mirna not in matures[parent]:
                missing_mirna += 1
                continue
            extra = variant_with_nt(mirna_line, precursors, matures)
            if extra == "Invalid":
                continue
            logger.debug("COUNTS::EXTRA:%s" % extra)
            cols_variants = sep.join(_expand(extra, True))
            summary = sep.join([Read, mirna, cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
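
# Hedged sketch: how the "## COLDATA:" header handling above turns the sample
# declaration into TSV columns. The header line below is a made-up example,
# not taken from any real run; only str.split/str.join are used.
def _demo_coldata_header():
    sep = "\t"
    samples_line = "## COLDATA: sampleA,sampleB,sampleC"
    samples = sep.join(
        samples_line.strip().split("COLDATA:")[1].strip().split(","))
    variant_header = sep.join(['mism', 'add', 't5', 't3'])
    header = sep.join(['seq', 'mir', variant_header, samples])
    # -> "seq\tmir\tmism\tadd\tt5\tt3\tsampleA\tsampleB\tsampleC"
    return header
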
def read_file(fn, args):
    """
    Read OptimiR file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with OptimiR output information.

        *database(str)*: database name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    sample = read_samples(fn)
    reads = defaultdict(dict)
    logger.debug("OPTIMIR::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            gff = feature(line)
            fixed_line = line
            if gff.columns:
                if "Variant" not in gff.attributes:
                    gff.attributes["Variant"] = "NA"
                logger.debug("OPTIMIR::Chrom update from %s to %s" % (
                    gff.columns["chrom"], gff.attributes["Parent"]))
                gff.columns["chrom"] = gff.attributes["Parent"].split(",")[0]
                fixed_line = gff.paste_columns(sep=sep)
                if args.add_extra:
                    extra = variant_with_nt(fixed_line, args.precursors,
                                            args.matures)
                    fixed_line = "%s Changes %s;" % (fixed_line, extra)

                fixed_line = paste_columns(feature(fixed_line), sep=sep)

                counts = gff.attributes["Expression"].split(",")
                chrom = gff.columns["chrom"]
                start = gff.columns["start"]
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                reads[chrom][start].append([
                    gff.attributes["UID"],
                    gff.columns["chrom"],
                    counts,
                    sample,
                    fixed_line])
    return reads
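
# Hedged sketch of the nested accumulator that read_file() returns:
# reads[chrom][start] is a list of [UID, chrom, counts, sample, gff_line]
# records. The values below are invented placeholders that only illustrate
# the shape of the structure.
from collections import defaultdict


def _demo_reads_structure():
    reads = defaultdict(dict)
    chrom, start = "hsa-let-7a-1", 5
    record = ["iso-22-HYPOTHETICAL",           # UID
              chrom,                           # precursor name
              ["10", "3"],                     # per-sample counts
              ["sampleA", "sampleB"],          # sample names
              "hsa-let-7a-1\tmiRBase\tisomiR\t5\t26\t.\t+\t.\t"
              "UID iso-22-HYPOTHETICAL; Name hsa-let-7a-5p;"]
    if start not in reads[chrom]:
        reads[chrom][start] = []
    reads[chrom][start].append(record)
    return reads
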
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *database(str)*: database name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            if query_name not in reads and query_sequence is None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            fields = {'seq_name': query_sequence,
                      'idseq': idu,
                      'name': mirName,
                      'parent': preName,
                      'variant': isoformat,
                      'cigar': cigar,
                      'counts': counts,
                      'filter': Filter,
                      'hits': hit,
                      'chrom': chrom,
                      'start': start,
                      'end': end,
                      'database': database,
                      'source': source,
                      'score': score,
                      'strand': strand}
            # TODO: convert to genomic if args.out_genomic
            line = feature(fields).line
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors,
                                        args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(feature(line), sep=sep)

            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
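
# Hedged sketch of the key/value attribute parsing that read_attributes() is
# used for above. This is a simplified stand-in written for illustration, not
# mirtop's implementation; the tag names (TS, TC, CI, ISO) mirror the keys
# accessed in read_file(), but the input string is invented.
def _demo_parse_attributes(attribute_field, sep="="):
    attr = {}
    for token in attribute_field.strip().strip(";").split(";"):
        if not token.strip():
            continue
        key, value = token.strip().split(sep, 1)
        attr[key] = value
    return attr

# _demo_parse_attributes("TS=UGAGGUAGUAGGUUGUAUAGUU;TC=12;ISO=NA;CI=22M")
# -> {'TS': 'UGAGGUAGUAGGUUGUAUAGUU', 'TC': '12', 'ISO': 'NA', 'CI': '22M'}
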
def convert_gff_counts(args):
    """
    Reads a GFF file to produce an output file containing expression counts.

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add3p', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header,
                                   'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add3p_nt', 'iso_snp_nt'])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    gff_file = open(args.gff, 'r')
    out_file = op.join(args.out,
                       "%s.tsv" % op.splitext(op.basename(args.gff))[0])
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:

        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                   variant_header, samples])
                print(header, file=outh)
                break

        for mirna_line in gff_file:
            gff = feature(mirna_line)
            attr = gff.attributes
            UID = attr["UID"]
            Read = attr["Read"]
            mirna = attr["Name"]
            parent = attr["Parent"]
            variant = attr["Variant"]
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(attr["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join([UID, Read, mirna, variant,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
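
# Hedged usage sketch for convert_gff_counts(): a hypothetical argparse
# Namespace carrying the attributes the function reads (gff, out, add_extra,
# and, when add_extra is set, hairpin, sps, gtf). All paths are placeholders;
# the call itself is left commented because it needs real files on disk.
from argparse import Namespace

demo_counts_args = Namespace(
    gff="annotation.gff",      # mirtop GFF with a "## COLDATA:" header
    out="out_dir",             # existing output directory
    add_extra=False,           # set True to add the *_nt columns
    hairpin="hairpin.fa",      # only read when add_extra is True
    sps="hsa",                 # species tag for the hairpin FASTA
    gtf="hsa.gff3")            # miRBase GFF3, only read when add_extra is True
# convert_gff_counts(demo_counts_args)
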
def read_file(folder, args):
    """
    Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *folder(str)*: folder with sRNAbench output files.

        *database(str)*: database name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                n_in += 1
                continue
            counts = int(cols[1])
            hits = len(set([mirna.split("#")[1]
                            for mirna in cols[4].split("$")]))
            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue
                seen.add((query_sequence, mirName))
                if (query_sequence, mirName) not in source_iso:
                    continue
                isoformat = source_iso[(query_sequence, mirName)]
                if isoformat == "mv":
                    n_notassign += 1
                    continue
                source = "isomiR" if isoformat != "NA" else "ref_miRNA"
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name: {query_name}\n"
                             "  start: {start}\n"
                             "  external: {isoformat}\n"
                             "  hit: {hits}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue
                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                # attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                #          " Parent {preName}; Variant {isoformat};"
                #          " Cigar {cigar}; Expression {counts};"
                #          " Filter {Filter}; Hits {hits};").format(**locals())
                # line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                #         "{score}\t{strand}\t.\t{attrb}").format(**locals())
                fields = {'seq_name': query_sequence,
                          'idseq': idu,
                          'name': mirName,
                          'parent': preName,
                          'variant': isoformat,
                          'cigar': cigar,
                          'counts': counts,
                          'filter': Filter,
                          'hits': hits,
                          'chrom': chrom,
                          'start': start,
                          'end': end,
                          'database': database,
                          'source': source,
                          'score': score,
                          'strand': strand}
                # TODO: convert to genomic if args.out_genomic
                line = feature(fields).line
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(feature(line), sep=sep)

                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append(
                        [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads
def _analyze_line(line, precursors, database, sample, sep, args):
    start_idx = 10
    end_idx = 11
    attr_idx = 15
    query_name = line[3]
    sequence = line[4]
    if str(line).find(get_primary_transcript(guess_database(args))) < 0:
        # only working with mirbase
        return None

    logger.debug(("READ::line name:{0}").format(line))
    if sequence and sequence.find("N") > -1:
        return None

    chrom = line[attr_idx].strip().split("Name=")[-1]
    start = line[1]
    end = line[2]
    strand = line[5]
    counts = float(line[6])
    Filter = "Pass"
    reads = dict()
    if not start:
        return None
    if strand == "+":
        start = int(start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(end)
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                         chrom, len(sequence), len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom], start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(sequence)
    reads[query_name] = hits()
    reads[query_name].set_sequence(sequence)
    reads[query_name].counts = counts
    reads[query_name].sequence = sequence
    reads[query_name].set_precursor(chrom, iso)
    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None
    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors, args.matures)
        line = "%s Changes %s;" % (line, extra)

    line = paste_columns(feature(line), sep=sep)

    return {'chrom': chrom,
            'start': start,
            'name': query_name,
            'mirna': reads[query_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, line]}
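
# Hedged sketch of the strand-dependent shift above that converts a genomic
# alignment into a precursor-relative start. The numbers are invented and only
# exercise the two branches; the column layout (precursor start/end in the
# extra columns) is an assumption about the intersect-style input that
# _analyze_line() expects.
def _demo_precursor_start(start, end, strand, precursor_start, precursor_end):
    if strand == "+":
        return int(start) - int(precursor_start) + 1
    return int(precursor_end) - int(end)

# _demo_precursor_start(105, 127, "+", precursor_start=100, precursor_end=170)
# -> 6
# _demo_precursor_start(105, 127, "-", precursor_start=100, precursor_end=170)
# -> 43
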
def convert_gff_counts(args):
    """
    Reads a GFF file to produce an output file containing expression counts.

    Args:
        *args(namedtuple)*: arguments parsed from command line with
            *mirtop.libs.parse.add_subparser_counts()*.

    Returns:
        *file (file)*: with columns like:
            UID miRNA Variant Sample1 Sample2 ... Sample N
    """
    sep = "\t"
    variant_header = sep.join(['iso_5p', 'iso_3p', 'iso_add', 'iso_snp'])
    if args.add_extra:
        precursors = fasta.read_precursor(args.hairpin, args.sps)
        matures = mapper.read_gtf_to_precursor(args.gtf)
        variant_header = sep.join([variant_header,
                                   'iso_5p_nt', 'iso_3p_nt',
                                   'iso_add_nt', 'iso_snp_nt'])

    logger.info("INFO Reading GFF file %s", args.gff)
    logger.info("INFO Writing TSV file to directory %s", args.out)

    gff_file = open(args.gff, 'r')
    out_file = op.join(args.out, "expression_counts.tsv")
    missing_parent = 0
    missing_mirna = 0
    unvalid_uid = 0
    with open(out_file, 'w') as outh:

        for samples_line in gff_file:
            if samples_line.startswith("## COLDATA:"):
                samples = sep.join(
                    samples_line.strip().split("COLDATA:")[1].strip().split(","))
                header = sep.join(['UID', 'Read', 'miRNA', 'Variant',
                                   variant_header, samples])
                print(header, file=outh)
                break

        for mirna_line in gff_file:
            mirna_values = read_gff_line(mirna_line)
            Read = mirna_values["attrb"]["Read"]
            UID = mirna_values["attrb"]["UID"]
            mirna = mirna_values["attrb"]["Name"]
            parent = mirna_values["attrb"]["Parent"]
            variant = mirna_values["attrb"]["Variant"]
            try:
                read_id(UID)
            except KeyError:
                unvalid_uid += 1
                continue

            expression = sep.join(
                mirna_values["attrb"]["Expression"].strip().split(","))
            cols_variants = sep.join(_expand(variant))
            logger.debug("COUNTS::Read:%s" % Read)
            logger.debug("COUNTS::EXTRA:%s" % variant)
            if args.add_extra:
                if parent not in precursors:
                    missing_parent += 1
                    continue
                if mirna not in matures[parent]:
                    missing_mirna += 1
                    continue
                extra = variant_with_nt(mirna_line, precursors, matures)
                if extra == "Invalid":
                    continue
                logger.debug("COUNTS::EXTRA:%s" % extra)
                cols_variants = sep.join([cols_variants] +
                                         _expand(extra, True))
            summary = sep.join([UID, Read, mirna, variant,
                                cols_variants, expression])
            logger.debug(summary)
            print(summary, file=outh)

    gff_file.close()
    logger.info("Missing Parents in hairpin file: %s" % missing_parent)
    logger.info("Missing MiRNAs in GFF file: %s" % missing_mirna)
    logger.info("Non valid UID: %s" % unvalid_uid)
    logger.info("Output file is at %s" % out_file)
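
# Hedged sketch of the output row written above: one tab-separated record per
# GFF feature, UID/Read/miRNA/Variant followed by the expanded variant columns
# and the per-sample expression counts. All values below are invented
# placeholders; the four variant values only stand in for whatever _expand()
# produces, in the variant_header order.
def _demo_counts_row():
    sep = "\t"
    UID, Read = "iso-22-HYPOTHETICAL", "TGAGGTAGTAGGTTGTATAGTT"
    mirna, variant = "hsa-let-7a-5p", "iso_3p:-1"
    cols_variants = sep.join(["0", "-1", "0", "0"])   # iso_5p, iso_3p, iso_add, iso_snp
    expression = sep.join("12,3".strip().split(","))  # one count per sample
    return sep.join([UID, Read, mirna, variant, cols_variants, expression])
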
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *database(str)*: database name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    database = args.database
    gtf = args.gtf
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            end = int(cols[4])
            isomirseq_iso = attr['ISO']
            if query_name not in reads and query_sequence is None:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            # logger.debug("SEQBUSTER:: cigar {cigar}".format(**locals()))
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name: {query_name}\n"
                         "  idu: {idu}\n"
                         "  start: {start}\n"
                         "  cigar: {cigar}\n"
                         "  iso: {isoformat}\n"
                         "  variant: {isoformat}".format(**locals()))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                     " Parent {preName}; Variant {isoformat};"
                     " Isocode {isotag}; Cigar {cigar}; Expression {counts};"
                     " Filter {Filter}; Hits {hit};").format(**locals())
            line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                    "{score}\t{strand}\t.\t{attrb}").format(**locals())
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors,
                                        args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(read_gff_line(line), sep=sep)

            if start not in reads[chrom]:
                reads[chrom][start] = []
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
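
# Hedged sketch of how the attribute separator differs between the two output
# flavours handled above ("gtf" -> 'Key value;', otherwise 'Key=value;').
# This is a simplified illustration, not mirtop's paste_columns(); the
# attribute values are invented.
def _demo_render_attributes(attributes, out_format):
    sep = " " if out_format == "gtf" else "="
    return "".join("%s%s%s; " % (key, sep, value)
                   for key, value in attributes).strip()

# pairs = [("UID", "iso-22-HYPOTHETICAL"), ("Name", "hsa-let-7a-5p")]
# _demo_render_attributes(pairs, "gtf")
# -> 'UID iso-22-HYPOTHETICAL; Name hsa-let-7a-5p;'
# _demo_render_attributes(pairs, "gff")
# -> 'UID=iso-22-HYPOTHETICAL; Name=hsa-let-7a-5p;'
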
def read_file(folder, args):
    """
    Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *folder(str)*: folder with sRNAbench output files.

        *database(str)*: database name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                n_in += 1
                continue
            counts = int(cols[1])
            hit = len(set([mirna.split("#")[1]
                           for mirna in cols[4].split("$")]))
            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue
                seen.add((query_sequence, mirName))
                if (query_sequence, mirName) not in source_iso:
                    continue
                isoformat = source_iso[(query_sequence, mirName)]
                if isoformat == "mv":
                    n_notassign += 1
                    continue
                source = "isomiR" if isoformat != "NA" else "ref_miRNA"
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name: {query_name}\n"
                             "  start: {start}\n"
                             "  external: {isoformat}\n"
                             "  hit: {hit}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue
                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(**locals())
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(**locals())
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(read_gff_line(line), sep=sep)

                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append(
                        [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads
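
# Hedged sketch of the (sequence, miRNA) de-duplication used above: a hit is
# only processed the first time a given read/miRNA pair is seen. The pairs
# below are invented.
def _demo_dedup_hits(pairs):
    seen = set()
    kept = []
    for query_sequence, mirName in pairs:
        if (query_sequence, mirName) in seen:
            continue
        seen.add((query_sequence, mirName))
        kept.append((query_sequence, mirName))
    return kept

# _demo_dedup_hits([("TGAGGTAG", "hsa-let-7a-5p"),
#                   ("TGAGGTAG", "hsa-let-7a-5p"),
#                   ("TGAGGTAG", "hsa-let-7c-5p")])
# -> [('TGAGGTAG', 'hsa-let-7a-5p'), ('TGAGGTAG', 'hsa-let-7c-5p')]
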