def _get_hits(fn):
    """Count how many lines of *fn* map to each sequence identifier.

    Every line is parsed with ``read_attributes(line, "=")``; the ``TS``
    attribute is taken as the sequence, with ``U`` replaced by ``T``.
    Sequences containing ``N`` are ignored.

    Args:
        fn: path of the file to scan.

    Returns:
        A ``Counter`` keyed by ``make_id(sequence)`` whose values are the
        number of lines seen for that identifier.
    """
    tally = Counter()
    with open(fn) as records:
        for record in records:
            sequence = read_attributes(record, "=")['TS'].replace("U", "T")
            # Skip sequences that contain an ambiguous base.
            if "N" in sequence:
                continue
            tally[make_id(sequence)] += 1
    return tally
def read_file(fn, precursors, matures):
    """Print the decoded variant fields of every record in *fn*.

    Lines starting with ``#`` are skipped. For each remaining line the
    attributes are parsed with ``read_attributes`` and the list
    ``[Variant, t5, t3, add]`` is printed to standard output.

    Args:
        fn: path of the file to read.
        precursors: mapping indexed by the record's ``Parent`` attribute.
        matures: nested mapping indexed by ``Parent`` and then ``Name``.

    Raises:
        KeyError: if a record lacks one of the attributes used here or its
            ``Parent``/``Name`` is absent from the supplied mappings
            (assuming plain dict-like containers).
    """
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            attr = read_attributes(line)
            precursor = precursors[attr["Parent"]]
            mature = matures[attr["Parent"]][attr["Name"]]
            variant = attr["Variant"]
            t5 = _get_5p(precursor, mature, variant)
            t3 = _get_3p(precursor, mature, variant)
            add = _get_add(attr["Read"], variant)
            # The original ``print[...]`` subscripted the print function,
            # which raises TypeError when print is a function. Calling it
            # with the list prints the same text the old print statement did.
            print([variant, t5, t3, add])
def _read_file(fn, precursors, matures, out_dir):
    """Write one ``<sample>.mirna`` table per sample found in *fn*.

    Each output file is first (re)created with a header row. Then every
    non-comment line of *fn* is decoded and appended, as one tab-separated
    row, to each sample's file, with the ``freq`` column set to that
    sample's entry of the comma-separated ``Expression`` attribute.
    Records for which ``align_from_variants`` reports more than one
    mismatch are skipped.

    Args:
        fn: path of the input file.
        precursors: mapping indexed by the ``Parent`` attribute.
        matures: nested mapping indexed by ``Parent`` and then ``Name``.
        out_dir: directory that receives the ``.mirna`` files.
    """
    header = ["seq", "name", "freq", "mir", "start", "end", "mism", "add",
              "t5", "t3", "s5", "s3", "DB", "precursor", "ambiguity"]

    def _sample_path(sample_name):
        # Location of the table that belongs to one sample.
        return os.path.join(out_dir, "%s.mirna" % sample_name)

    samples = read_samples(fn)
    for sample_name in samples:
        with open(_sample_path(sample_name), 'w') as table:
            print("\t".join(header), file=table)

    with open(fn) as records:
        for record in records:
            if record.startswith("#"):
                continue
            cols = record.strip().split("\t")
            attr = read_attributes(record)
            read = read_id(attr["UID"])
            precursor = precursors[attr["Parent"]]
            mature = matures[attr["Parent"]][attr["Name"]]
            variant = attr["Variant"]
            t5 = variant_to_5p(precursor, mature, variant)
            t3 = variant_to_3p(precursor, mature, variant)
            add = variant_to_add(read, variant)
            mature_sequence = get_mature_sequence(precursor, mature)
            mismatches = align_from_variants(read, mature_sequence, variant)
            n_mismatches = len(mismatches)
            if n_mismatches > 1:
                # Only records with at most one mismatch are exported.
                continue
            if n_mismatches == 1:
                mm = "".join(str(part) for part in mismatches[0])
            else:
                mm = "0"
            if "Hits" in attr:
                hit = attr["Hits"]
            else:
                hit = "1"
            decoded = [variant, t5, t3, add, mm]
            logger.debug("exporter::isomir::decode %s" % decoded)
            # attr["Read"] raises here when the attribute is missing.
            row = [read, attr["Read"], "0", attr["Name"], cols[1], cols[2],
                   mm, add, t5, t3, "NA", "NA", "miRNA", attr["Parent"], hit]
            expression = attr["Expression"].split(",")
            for sample_name, counts in zip(samples, expression):
                with open(_sample_path(sample_name), 'a') as table:
                    row[2] = counts
                    print("\t".join(row), file=table)
def _read_file(fn, precursors, matures, out_dir):
    """Export the records of *fn* as per-sample ``.mirna`` tables.

    For every sample returned by ``read_samples(fn)`` a file named
    ``<sample>.mirna`` is created in *out_dir* containing a header row.
    Each non-comment line of *fn* then contributes one tab-separated row
    to every sample's file; the third column carries that sample's value
    from the comma-separated ``Expression`` attribute. Lines whose
    alignment yields more than one mismatch are not exported.

    Args:
        fn: path of the input file.
        precursors: mapping indexed by the ``Parent`` attribute.
        matures: nested mapping indexed by ``Parent`` and then ``Name``.
        out_dir: directory in which the tables are written.
    """
    column_names = "\t".join([
        "seq", "name", "freq", "mir", "start", "end", "mism", "add",
        "t5", "t3", "s5", "s3", "DB", "precursor", "ambiguity",
    ])

    def _sample_path(name):
        # Output table for a single sample.
        return os.path.join(out_dir, "%s.mirna" % name)

    samples = read_samples(fn)
    for name in samples:
        with open(_sample_path(name), 'w') as out_handle:
            print(column_names, file=out_handle)

    with open(fn) as in_handle:
        for entry in in_handle:
            if entry.startswith("#"):
                continue
            cols = entry.strip().split("\t")
            attr = read_attributes(entry)
            read = read_id(attr["UID"])
            parent_seq = precursors[attr["Parent"]]
            mature_pos = matures[attr["Parent"]][attr["Name"]]
            variant = attr["Variant"]
            t5 = variant_to_5p(parent_seq, mature_pos, variant)
            t3 = variant_to_3p(parent_seq, mature_pos, variant)
            add = variant_to_add(read, variant)
            mature_sequence = get_mature_sequence(parent_seq, mature_pos)
            aligned = align_from_variants(read, mature_sequence, variant)
            if len(aligned) > 1:
                # More than one mismatch: the record is dropped.
                continue
            mm = "0"
            if len(aligned) == 1:
                mm = "".join(map(str, aligned[0]))
            hit = "1"
            if "Hits" in attr:
                hit = attr["Hits"]
            logger.debug("exporter::isomir::decode %s"
                         % [variant, t5, t3, add, mm])
            # attr["Read"] raises here when the attribute is missing.
            fields = [
                read, attr["Read"], "0", attr["Name"], cols[1], cols[2],
                mm, add, t5, t3, "NA", "NA", "miRNA", attr["Parent"], hit,
            ]
            per_sample = zip(samples, attr["Expression"].split(","))
            for name, counts in per_sample:
                with open(_sample_path(name), 'a') as out_handle:
                    fields[2] = counts
                    print("\t".join(fields), file=out_handle)
def _calc_stats(fn):
    """Classify the records of *fn* and return their summary.

    Comment lines (starting with ``#``) are ignored. Only the first record
    seen for each ``Variant``/``Name`` pair is classified; later records
    with the same pair are skipped.

    Args:
        fn: path of the file to read.

    Returns:
        Whatever ``_summary`` builds from the rows collected from
        ``_classify``.
    """
    samples = _get_samples(fn)
    classified = []
    visited = set()
    with open(fn) as records:
        for record in records:
            if record.startswith("#"):
                continue
            cols = record.strip().split("\t")
            logger.debug("## STATS: attribute %s" % cols[8])
            attr = read_attributes(record)
            key = "-".join([attr['Variant'], attr['Name']])
            if key not in visited:
                visited.add(key)
                classified.extend(_classify(cols[2], attr, samples))
    return _summary(classified)
def read_file(fn, args):
    """
    Read an isomiR-SEA file and convert it to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *args(namedtuple)*: command line arguments. This function reads
            its ``database``, ``gtf``, ``out_format`` and ``add_extra``
            attributes, plus ``precursors`` and ``matures`` when
            ``add_extra`` is set.

    Returns:
        *reads (nested dicts)*: keyed by chromosome and then start
            position; every value is a list of
            ``[idu, chrom, counts, sample, line]`` entries.
    """
    # Only the attribute access matters here: the name is rebound from
    # column 2 for every record below.
    database = args.database
    gtf = args.gtf
    sep = "=" if args.out_format != "gtf" else " "
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for record in handle:
            cols = record.strip().split("\t")
            attr = read_attributes(record, "=")
            query_name = attr['TS']
            sequence = query_name.replace("U", "T")
            start = int(cols[3])
            # Parsed from the file but recomputed from the sequence below.
            end = int(cols[4])
            iso_code = attr['ISO']
            # NOTE(review): if TS is a plain str this guard can never fire,
            # since str.replace does not return None -- confirm and drop.
            if sequence is None and query_name not in reads:
                continue
            # Skip sequences that contain an ambiguous base.
            if sequence and "N" in sequence:
                continue

            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(sequence)
            isoformat = cigar2variants(cigar, sequence, iso_code)
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         " precursor {chrom}\n"
                         " name: {query_name}\n"
                         " idu: {idu}\n"
                         " start: {start}\n"
                         " cigar: {cigar}\n"
                         " iso: {isoformat}\n"
                         " variant: {isoformat}".format(
                             query_sequence=sequence, chrom=chrom,
                             query_name=query_name, idu=idu, start=start,
                             cigar=cigar, isoformat=isoformat))
            if isoformat != "NA":
                source = "isomiR"
            else:
                source = "ref_miRNA"
            database = cols[1]
            mirna_name = attr['MIN'].split()[0]
            precursor_name = attr['PIN'].split()[0]
            filter_tag = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = _genomic2transcript(map_mir[mirna_name],
                                                 chrom, start)
            # Use the converted coordinates whenever a start was returned.
            if tstart:
                start, chrom = tstart, tchrom
            end = start + len(sequence)
            fields = {
                'seq_name': sequence,
                'idseq': idu,
                'name': mirna_name,
                'parent': precursor_name,
                'variant': isoformat,
                'cigar': cigar,
                'counts': counts,
                'filter': filter_tag,
                'hits': hits[idu],
                'chrom': chrom,
                'start': start,
                'end': end,
                'database': database,
                'source': source,
                'score': ".",
                'strand': "+",
            }
            # TODO: convert to genomic if args.out_genomic
            line = feature(fields).line
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(feature(line), sep=sep)
            bucket = reads[chrom].setdefault(start, [])
            # Every record is stored; only "Pass" records are counted.
            if filter_tag == "Pass":
                reads_in += 1
            bucket.append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
def read_file(fn, database, gtf):
    """
    Read an isomiR-SEA output file and build GFF-style records.

    Args:
        *fn(str)*: file name with isomiR-SEA output information. Each line
            is tab separated and its attributes are parsed with
            ``read_attributes(line, "=")``.

        *database(str)*: kept for interface compatibility. The value passed
            in is never read: the database written to each record comes
            from the second column of the line.

        *gtf(str)*: annotation file handed to
            ``mapper.read_gtf_to_mirna``.

    Returns:
        *reads (nested dicts)*: keyed by chromosome and then start
            position; every value is a list of
            ``[idu, chrom, counts, sample, gff_line]`` entries.
    """
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    # The sample is named after the input file. Deriving it from the
    # annotation path (as before) labelled every input with the same,
    # unrelated name.
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = query_name.replace("U", "T")
            start = int(cols[3])
            isotag = attr['ISO']
            # Skip sequences that contain an ambiguous base.
            if "N" in query_sequence:
                continue

            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, isotag)
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         " precursor {chrom}\n"
                         " name: {query_name}\n"
                         " idu: {idu}\n"
                         " start: {start}\n"
                         " cigar: {cigar}\n"
                         " iso: {isoformat}\n"
                         " variant: {isoformat}".format(
                             query_sequence=query_sequence, chrom=chrom,
                             query_name=query_name, idu=idu, start=start,
                             cigar=cigar, isoformat=isoformat))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            tchrom, tstart = genomic2transcript(map_mir[mirName],
                                                chrom, start)
            # Use the converted coordinates whenever a start was returned.
            if tstart:
                start, chrom = tstart, tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            attrb = (
                "Read {query_sequence}; UID {idu}; Name {mirName}; Parent {preName}; Variant {isoformat}; Isocode {isotag}; Cigar {cigar}; Expression {counts}; Filter {Filter}; Hits {hit};"
            ).format(query_sequence=query_sequence, idu=idu,
                     mirName=mirName, preName=preName,
                     isoformat=isoformat, isotag=isotag, cigar=cigar,
                     counts=counts, Filter=Filter, hit=hit)
            res = (
                "{chrom}\t{database}\t{source}\t{start}\t{end}\t{score}\t{strand}\t.\t{attrb}"
            ).format(chrom=chrom, database=database, source=source,
                     start=start, end=end, score=score, strand=strand,
                     attrb=attrb)
            # Every record is stored; only "Pass" records are counted.
            if Filter == "Pass":
                reads_in += 1
            reads[chrom].setdefault(start, []).append(
                [idu, chrom, counts, sample, res])

    logger.info("Hits: %s" % reads_in)
    return reads
def read_file(fn, args):
    """
    Read an isomiR-SEA file and convert it to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *args(namedtuple)*: command line arguments. This function reads
            its ``database``, ``gtf``, ``out_format`` and ``add_extra``
            attributes, plus ``precursors`` and ``matures`` when
            ``add_extra`` is set.

    Returns:
        *reads (nested dicts)*: keyed by chromosome and then start
            position; every value is a list of
            ``[idu, chrom, counts, sample, line]`` entries.
    """
    # Only the attribute access matters here: the name is rebound from
    # column 2 for every record below.
    database = args.database
    gtf = args.gtf
    sep = "=" if args.out_format != "gtf" else " "
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)

    attribute_template = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                          " Parent {preName}; Variant {isoformat};"
                          " Isocode {isotag}; Cigar {cigar}; Expression {counts};"
                          " Filter {Filter}; Hits {hit};")
    column_template = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                       "{score}\t{strand}\t.\t{attrb}")

    with open(fn) as handle:
        for record in handle:
            cols = record.strip().split("\t")
            attr = read_attributes(record, "=")
            query_name = attr['TS']
            sequence = query_name.replace("U", "T")
            start = int(cols[3])
            # Parsed from the file but recomputed from the sequence below.
            end = int(cols[4])
            iso_code = attr['ISO']
            # NOTE(review): if TS is a plain str this guard can never fire,
            # since str.replace does not return None -- confirm and drop.
            if sequence is None and query_name not in reads:
                continue
            # Skip sequences that contain an ambiguous base.
            if sequence and "N" in sequence:
                continue

            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(sequence)
            isoformat = cigar2variants(cigar, sequence, iso_code)
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         " precursor {chrom}\n"
                         " name: {query_name}\n"
                         " idu: {idu}\n"
                         " start: {start}\n"
                         " cigar: {cigar}\n"
                         " iso: {isoformat}\n"
                         " variant: {isoformat}".format(
                             query_sequence=sequence, chrom=chrom,
                             query_name=query_name, idu=idu, start=start,
                             cigar=cigar, isoformat=isoformat))
            if isoformat != "NA":
                source = "isomiR"
            else:
                source = "ref_miRNA"
            database = cols[1]
            mirna_name = attr['MIN'].split()[0]
            precursor_name = attr['PIN'].split()[0]
            filter_tag = attr['FILTER']
            isotag = attr['ISO']
            tchrom, tstart = _genomic2transcript(map_mir[mirna_name],
                                                 chrom, start)
            # Use the converted coordinates whenever a start was returned.
            if tstart:
                start, chrom = tstart, tchrom
            end = start + len(sequence)
            attrb = attribute_template.format(
                query_sequence=sequence, idu=idu, mirName=mirna_name,
                preName=precursor_name, isoformat=isoformat, isotag=isotag,
                cigar=cigar, counts=counts, Filter=filter_tag,
                hit=hits[idu])
            line = column_template.format(
                chrom=chrom, database=database, source=source, start=start,
                end=end, score=".", strand="+", attrb=attrb)
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(read_gff_line(line), sep=sep)
            bucket = reads[chrom].setdefault(start, [])
            # Every record is stored; only "Pass" records are counted.
            if filter_tag == "Pass":
                reads_in += 1
            bucket.append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads