def test_code_convert(self):
    """Check make_id(read_uid_10(...)) against two known input/output pairs.

    Raises:
        ValueError: if any converted identifier differs from the expected one.
    """
    from mirtop.mirna.realign import make_id
    from mirtop.gff.update import read_uid_10

    # (identifier given to read_uid_10, identifier expected back from make_id)
    expected_pairs = (
        ("@#%$", "iso-12-B1NY4"),
        ("@#%$@2", "iso-13-B1NYDX"),
    )
    for old_uid, new_uid in expected_pairs:
        if make_id(read_uid_10(old_uid)) != new_uid:
            raise ValueError("Update ID is not working.")
def read(fn, args):
    """Read GTF/GFF file and load into annotate, chrom counts, sample, line.

    Args:
        fn: path of the GFF/GTF file to read.
        args: object exposing ``out_format`` (``"gtf"`` selects a space as
            attribute separator, anything else ``"="``) and ``keep_name``
            (when true, the ``Read`` attribute is appended to the entry id).

    Returns:
        Nested mapping ``lines[chrom][start]`` holding a list of
        ``[uid, chrom, expression_values, samples, line]`` entries.
    """
    samples = read_samples(fn)
    lines = defaultdict(dict)
    sep = " " if args.out_format == "gtf" else "="
    corrupted_uid = 0
    with open(fn) as inh:
        for line in inh:
            if line.startswith("#"):
                continue
            line = paste_columns(feature(line), sep=sep)
            gff = feature(line)
            cols = gff.columns
            attr = gff.attributes
            # Test membership before indexing so a missing UID reaches the
            # recovery branch below instead of failing on the lookup.
            if 'UID' in attr and attr['UID'] and not read_id(attr['UID']):
                corrupted_uid += 1
                continue
            if 'UID' not in attr:
                msg = "UID not found."
                # First choice: rebuild the UID from the Read attribute.
                if 'Read' in attr:
                    if is_sequence(attr['Read']):
                        attr['UID'] = make_id(attr['Read'])
                    else:
                        msg = msg + " Sequence not valid in Read attribute."
                # Fallback: rebuild it from the sequence attribute.
                if 'UID' not in attr:
                    if 'sequence' not in attr:
                        msg = msg + " Sequence not found in sequence attribute."
                    elif is_sequence(attr['sequence']):
                        attr['UID'] = make_id(attr['sequence'])
                    else:
                        msg = msg + " Sequence not valid in sequence attribute."
                if 'UID' not in attr:
                    logger.warning("Line is not a valid GFF3 line: %s" % line.strip())
                    logger.warning(msg)
                    # Nothing usable to key this line on: skip it.
                    continue
            if cols['start'] not in lines[cols['chrom']]:
                lines[cols['chrom']][cols['start']] = []
            uid = "%s-%s-%s" % (attr['UID'], attr['Variant'], attr['Name'])
            if args.keep_name:
                uid = "%s-%s" % (uid, attr['Read'])
            lines[cols['chrom']][cols['start']].append(
                [uid, cols['chrom'],
                 attr['Expression'].strip().split(","),
                 samples, line.strip()])
    logger.info("Lines skipped due to corrupted UID: %s" % corrupted_uid)
    return lines
def _get_hits(fn):
    """Count how many lines of *fn* map to each identifier built by make_id.

    The ``TS`` attribute of every line is read with ``=`` as separator,
    ``U`` is replaced by ``T`` and sequences containing ``N`` are ignored.

    Returns:
        Counter keyed by the identifier returned by ``make_id``.
    """
    counts_by_id = Counter()
    with open(fn) as handle:
        for record in handle:
            tag_sequence = read_attributes(record, "=")['TS']
            dna = tag_sequence.replace("U", "T")
            if "N" in dna:
                continue
            counts_by_id[make_id(dna)] += 1
    return counts_by_id
def _get_hits(fn):
    """Tally the lines of *fn* per identifier produced by make_id.

    Each line contributes its ``TS`` attribute (parsed with ``=`` as the
    separator) with ``U`` turned into ``T``; sequences holding an ``N``
    are left out of the tally.

    Returns:
        Counter mapping identifier -> number of lines.
    """
    with open(fn) as handle:
        sequences = (
            read_attributes(row, "=")['TS'].replace("U", "T")
            for row in handle
        )
        # The generators are consumed here, while the file is still open.
        return Counter(make_id(seq) for seq in sequences if "N" not in seq)
def to10to11(gff_line):
    """Rewrite one GFF line: rename variant tags, flip iso_5p sign, redo UID.

    ``_snp`` becomes ``_snv`` and ``_add`` becomes ``_add3p`` across the
    whole line. When the ``Variant`` attribute mentions ``iso_5p``, the
    numeric value of that variant is negated (a ``+`` is prepended to
    positive results). The ``UID`` attribute is then replaced by
    ``make_id(read_uid_10(old_uid))``.

    Returns:
        Whatever ``paste_columns()`` of the parsed feature returns.
    """
    features = feature(
        gff_line.replace("_snp", "_snv").replace("_add", "_add3p"))

    if "iso_5p" in features.attributes["Variant"]:
        variants = features.attributes["Variant"].split(",")
        # Only the first iso_5p entry supplies the value to negate.
        first_5p = [v for v in variants if v.startswith("iso_5p")][0]
        shift = -int(first_5p.split(":")[1])
        label = "+%s" % shift if shift > 0 else "%s" % shift
        rewritten = []
        for variant in variants:
            if variant.startswith("iso_5p"):
                rewritten.append("iso_5p:%s" % label)
            else:
                rewritten.append(variant)
        features.attributes["Variant"] = ",".join(rewritten)

    old_uid = features.attributes["UID"]
    features.attributes["UID"] = make_id(read_uid_10(old_uid))
    return features.paste_columns()
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information.

        *args(namedtuple)*: arguments from command line. The attributes
            read here are ``gtf``, ``out_format``, ``add_extra``,
            ``precursors`` and ``matures``.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]``; gff_list has the
            format as defined in *mirtop.gff.body.read()*.
    """
    sep = " " if args.out_format == "gtf" else "="
    map_mir = mapper.read_gtf_to_mirna(args.gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            # str.replace never returns None, so test for emptiness instead
            # of comparing against None.
            if not query_sequence:
                continue
            if "N" in query_sequence:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, attr['ISO'])
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         " precursor {chrom}\n"
                         " name: {query_name}\n"
                         " idu: {idu}\n"
                         " start: {start}\n"
                         " cigar: {cigar}\n"
                         " iso: {isoformat}\n"
                         " variant: {isoformat}".format(
                             query_sequence=query_sequence, chrom=chrom,
                             query_name=query_name, idu=idu, start=start,
                             cigar=cigar, isoformat=isoformat))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            # The database column is taken from the input line itself.
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            # Keep the original coordinates when no converted start is given.
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            fields = {'seq_name': query_sequence, 'idseq': idu,
                      'name': mirName, 'parent': preName,
                      'variant': isoformat, 'cigar': cigar,
                      'counts': counts, 'filter': Filter,
                      'hits': hit, 'chrom': chrom,
                      'start': start, 'end': end,
                      'database': database, 'source': source,
                      'score': score, 'strand': strand}
            # TODO: convert to genomic if args.out_genomic
            line = feature(fields).line
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(feature(line), sep=sep)
            reads[chrom].setdefault(start, [])
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
def read_file(fn, precursors, database, mirna_gtf):
    """
    Parse a tab-separated file (first line skipped) into GFF-like lines.

    Args:
        *fn(str)*: input file name; its base name without extension is
            used as sample name.

        *precursors*: not used inside this function.

        *database(str)*: value written in the second output column.

        *mirna_gtf(str)*: file given to *mapper.read_gtf_to_mirna()*.

    Returns:
        *reads (nested dicts)*: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]``.
    """
    reads = defaultdict(dict)
    sample = os.path.splitext(os.path.basename(fn))[0]
    map_mir = mapper.read_gtf_to_mirna(mirna_gtf)
    non_mirna = 0
    non_chromosome_mirna = 0
    # NOTE(review): this counter is reported below but never incremented.
    outside_mirna = 0
    lines_read = 0
    with open(fn) as handle:
        handle.readline()  # skip the first line of the file
        for line in handle:
            lines_read += 1
            cols = line.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if len(cols) < 12:
                non_mirna += 1
                continue
            # First non-empty value among columns 11, 13 and 15.
            miRNA = cols[11] or cols[13] or cols[15]
            if not miRNA:
                continue
            if query_name not in reads and query_sequence is None:
                continue
            if "N" in query_sequence:
                continue
            # Only the first ';'-separated location is considered.
            for loc in cols[5].split(";")[:1]:
                if "-" not in loc:
                    non_chromosome_mirna += 1
                    continue
                loc_parts = loc.split(":")
                chrom = loc_parts[0]
                start, end = loc_parts[1].split("-")
                preName, reference_start = genomic2transcript(
                    map_mir[miRNA], chrom, int(start))
                # NOTE(review): chrom was already used above; presumably
                # the result of genomic2transcript was meant to be tested
                # here - confirm before changing.
                if not chrom:
                    non_chromosome_mirna += 1
                    continue
                logger.debug("\nPROST::NEW::query: {query_sequence}\n"
                             " precursor {chrom}\n"
                             " name: {query_name}\n"
                             " start: {start}\n"
                             " reference_start: {reference_start}\n"
                             " mirna: {miRNA}".format(
                                 query_sequence=query_sequence,
                                 chrom=chrom, query_name=query_name,
                                 start=start,
                                 reference_start=reference_start,
                                 miRNA=miRNA))
                Filter = "PASS"
                hit = "NA"
                isoformat = _make_variant(cols[19:])
                idu = make_id(query_sequence)
                strand = "."
                counts = cols[9]
                cigar = "NA"
                score = "."
                source = "ref_miRNA" if isoformat == "NA" else "isomiR"
                attrb = ("Read {query_sequence}; UID {idu}; Name {miRNA}; Parent {preName}; Variant {isoformat}; Cigar {cigar}; Expression {counts}; Filter {Filter}; Hits {hit};").format(
                    query_sequence=query_sequence, idu=idu, miRNA=miRNA,
                    preName=preName, isoformat=isoformat, cigar=cigar,
                    counts=counts, Filter=Filter, hit=hit)
                res = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t{score}\t{strand}\t.\t{attrb}").format(
                    chrom=chrom, database=database, source=source,
                    start=start, end=end, score=score, strand=strand,
                    attrb=attrb)
                reads[chrom].setdefault(start, []).append(
                    [idu, chrom, counts, sample, res])

    logger.info("Lines loaded: %s" % lines_read)
    logger.info("Skipped lines because non miRNA in line: %s" % non_mirna)
    logger.info("Skipped lines because non chromosome in GTF: %s" % non_chromosome_mirna)
    logger.info("Skipped lines because outside precursor: %s" % outside_mirna)
    logger.info("Hits: %s" % len(reads))
    return reads
def test_code(self):
    """Print the identifier make_id builds for three sample sequences."""
    from mirtop.mirna.realign import make_id

    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    for sequence in ("AAACCCTTTGGG", "AAACCCTTTGGGA", "AAACCCTTTGGGAT"):
        print(make_id(sequence))
def _convert(s, test, reverse=False):
    """Check that converting *s* gives *test*.

    Args:
        s: value to convert.
        test: expected conversion result.
        reverse: use ``read_id`` when true, ``make_id`` otherwise.

    Raises:
        ValueError: when the conversion result differs from *test*.
    """
    translate = read_id if reverse else make_id
    code = translate(s)
    if code == test:
        return
    raise ValueError("%s didn't result on %s but in %s" % (s, test, code))
def read_file(fn, database, gtf):
    """
    Read isomiR-SEA style file and build GFF-like lines for each entry.

    Args:
        *fn(str)*: input file name; its base name without extension is
            used as sample name.

        *database(str)*: kept for interface compatibility; the value
            written to the output is taken from the second column of
            each input line.

        *gtf(str)*: file given to *mapper.read_gtf_to_mirna()*.

    Returns:
        *reads (nested dicts)*: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]``; only entries whose
            ``FILTER`` attribute equals ``Pass`` are stored.
    """
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    # The sample is named after the data file, not after the GTF annotation.
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    with open(fn) as handle:
        for line in handle:
            cols = line.strip().split("\t")
            attr = read_attributes(line, "=")
            query_name = attr['TS']
            query_sequence = attr['TS'].replace("U", "T")
            start = int(cols[3])
            # str.replace never returns None, so test for emptiness instead
            # of comparing against None.
            if not query_sequence:
                continue
            if "N" in query_sequence:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isotag = attr['ISO']
            isoformat = cigar2variants(cigar, query_sequence, isotag)
            logger.debug("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                         " precursor {chrom}\n"
                         " name: {query_name}\n"
                         " idu: {idu}\n"
                         " start: {start}\n"
                         " cigar: {cigar}\n"
                         " iso: {isoformat}\n"
                         " variant: {isoformat}".format(
                             query_sequence=query_sequence, chrom=chrom,
                             query_name=query_name, idu=idu, start=start,
                             cigar=cigar, isoformat=isoformat))
            source = "isomiR" if isoformat != "NA" else "ref_miRNA"
            strand = "+"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            score = "."
            Filter = attr['FILTER']
            tchrom, tstart = genomic2transcript(map_mir[mirName],
                                                chrom, start)
            # Keep the original coordinates when no converted start is given.
            start = start if not tstart else tstart
            chrom = chrom if not tstart else tchrom
            end = start + len(query_sequence)
            hit = hits[idu]
            attrb = ("Read {query_sequence}; UID {idu}; Name {mirName}; Parent {preName}; Variant {isoformat}; Isocode {isotag}; Cigar {cigar}; Expression {counts}; Filter {Filter}; Hits {hit};").format(
                query_sequence=query_sequence, idu=idu, mirName=mirName,
                preName=preName, isoformat=isoformat, isotag=isotag,
                cigar=cigar, counts=counts, Filter=Filter, hit=hit)
            res = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t{score}\t{strand}\t.\t{attrb}").format(
                chrom=chrom, database=database, source=source, start=start,
                end=end, score=score, strand=strand, attrb=attrb)
            reads[chrom].setdefault(start, [])
            if Filter == "Pass":
                reads_in += 1
                reads[chrom][start].append([idu, chrom, counts, sample, res])

    logger.info("Hits: %s" % reads_in)
    return reads
def read_file(folder, args):
    """
    Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *folder(str)*: directory holding ``reads.annotation`` and
            ``microRNAannotation.txt``; its base name is the sample name.

        *args(namedtuple)*: arguments from command line. The attributes
            read here are ``out_format``, ``database``, ``precursors``,
            ``matures`` and ``add_extra``.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]``; gff_list has the
            format as defined in *mirtop.gff.body.read()*.
    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures
    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and "N" in query_sequence:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                # NOTE(review): n_in is also the counter of accepted hits
                # reported at the end - confirm this increment is intended.
                n_in += 1
                continue

            counts = int(cols[1])
            hit_entries = cols[4].split("$")
            # Number of distinct names among the '$'-separated hits.
            hits = len({mirna.split("#")[1] for mirna in hit_entries})

            for nhit in hit_entries:
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                    # Skip the hit: the lookups below index both mappings
                    # by chrom and cannot succeed without it.
                    continue
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             " precursor {chrom}\n"
                             " name: {query_name}\n"
                             " start: {start}\n"
                             " external: {isoformat}\n"
                             " hit: {hits}".format(
                                 query_sequence=query_sequence,
                                 chrom=chrom, query_name=query_name,
                                 start=start, isoformat=isoformat,
                                 hits=hits))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue

                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                fields = {'seq_name': query_sequence, 'idseq': idu,
                          'name': mirName, 'parent': preName,
                          'variant': isoformat, 'cigar': cigar,
                          'counts': counts, 'filter': Filter,
                          'hits': hits, 'chrom': chrom,
                          'start': start, 'end': end,
                          'database': database, 'source': source,
                          'score': score, 'strand': strand}
                # TODO: convert to genomic if args.out_genomic
                line = feature(fields).line
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(feature(line), sep=sep)
                reads[chrom].setdefault(start, [])
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append(
                        [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads
def _analyze_line(line, precursors, database, sample, sep, args):
    """Turn one already-split input row into a GFF line description.

    Args:
        line: indexable row; positions 1, 2, 3, 4, 5, 6, 10, 11 and 15
            are read (start, end, name, sequence, strand, counts, two
            offsets and an attribute string containing ``Name=``).
        precursors: mapping indexed by the name found after ``Name=``.
        database: not used inside this function.
        sample: sample name forwarded to ``body.create``.
        sep: attribute separator handed to ``paste_columns``.
        args: command line arguments (``matures``, ``precursors``,
            ``database`` and ``add_extra`` are read here).

    Returns:
        ``None`` when the row is skipped, otherwise a dict with the keys
        ``chrom``, ``start``, ``name``, ``mirna`` and ``line``.
    """
    start_idx, end_idx, attr_idx = 10, 11, 15
    query_name, sequence = line[3], line[4]

    # only working with mirbase
    primary_tag = get_primary_transcript(guess_database(args))
    if primary_tag not in str(line):
        return None

    logger.debug(("READ::line name:{0}").format(line))
    if sequence and "N" in sequence:
        return None

    chrom = line[attr_idx].strip().split("Name=")[-1]
    raw_start, raw_end, strand = line[1], line[2], line[5]
    counts = float(line[6])
    if not raw_start:
        return None

    # Position relative to the offsets stored in columns 10 / 11.
    if strand == "+":
        start = int(raw_start) - int(line[start_idx]) + 1
    else:
        start = int(line[end_idx]) - int(raw_end)

    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" % (
        iso.start, iso.end, chrom))
    if len(precursors[chrom]) < start + len(sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (
                         chrom, len(sequence), len(precursors[chrom])))
    iso.subs, iso.add, iso.cigar = filter.tune(
        sequence, precursors[chrom], start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(sequence)
    read_hit = hits()
    read_hit.set_sequence(sequence)
    read_hit.counts = counts
    read_hit.sequence = sequence
    read_hit.set_precursor(chrom, iso)
    reads = {query_name: read_hit}

    reads = annotate(reads, args.matures, args.precursors, quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None

    line = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % line)
    if args.add_extra:
        extra = variant_with_nt(line, args.precursors, args.matures)
        line = "%s Changes %s;" % (line, extra)

    line = paste_columns(feature(line), sep=sep)
    result = {
        'chrom': chrom,
        'start': start,
        'name': query_name,
        'mirna': reads[query_name].precursors[chrom].mirna,
        'line': [idu, chrom, counts, sample, line],
    }
    return result
def read_file(fn, args):
    """
    Read isomiR-SEA file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with isomiR-SEA output information; its
            base name without extension is the sample name.

        *args(namedtuple)*: arguments from command line. The attributes
            read here are ``database``, ``gtf``, ``out_format``,
            ``add_extra``, ``precursors`` and ``matures``.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]``; gff_list has the
            format as defined in *mirtop.gff.body.read()*.
    """
    database = args.database
    gtf = args.gtf
    sep = "=" if args.out_format != "gtf" else " "
    map_mir = mapper.read_gtf_to_mirna(gtf)
    reads = defaultdict(dict)
    reads_in = 0
    sample = os.path.splitext(os.path.basename(fn))[0]
    hits = _get_hits(fn)
    logger.debug("ISOMIRSEA::SAMPLE::%s" % sample)

    debug_template = ("\nISOMIRSEA::NEW::query: {query_sequence}\n"
                      " precursor {chrom}\n"
                      " name: {query_name}\n"
                      " idu: {idu}\n"
                      " start: {start}\n"
                      " cigar: {cigar}\n"
                      " iso: {isoformat}\n"
                      " variant: {isoformat}")
    attrb_template = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                      " Parent {preName}; Variant {isoformat};"
                      " Isocode {isotag}; Cigar {cigar}; Expression {counts};"
                      " Filter {Filter}; Hits {hit};")
    line_template = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                     "{score}\t{strand}\t.\t{attrb}")

    with open(fn) as handle:
        for record in handle:
            cols = record.strip().split("\t")
            attr = read_attributes(record, "=")
            query_name = attr['TS']
            query_sequence = query_name.replace("U", "T")
            start = int(cols[3])
            # This value is replaced below once the final start is known.
            end = int(cols[4])
            isotag = attr['ISO']
            if query_name not in reads and query_sequence is None:
                continue
            if "N" in query_sequence:
                continue
            counts = attr["TC"]
            chrom = cols[0]
            cigar = attr['CI'].replace("U", "T")
            idu = make_id(query_sequence)
            isoformat = cigar2variants(cigar, query_sequence, isotag)
            logger.debug(debug_template.format(
                query_sequence=query_sequence, chrom=chrom,
                query_name=query_name, idu=idu, start=start,
                cigar=cigar, isoformat=isoformat))
            source = "ref_miRNA" if isoformat == "NA" else "isomiR"
            database = cols[1]
            mirName = attr['MIN'].split()[0]
            preName = attr['PIN'].split()[0]
            Filter = attr['FILTER']
            tchrom, tstart = _genomic2transcript(map_mir[mirName],
                                                 chrom, start)
            # Converted coordinates are used only when a start came back.
            if tstart:
                start, chrom = tstart, tchrom
            end = start + len(query_sequence)
            attrb = attrb_template.format(
                query_sequence=query_sequence, idu=idu, mirName=mirName,
                preName=preName, isoformat=isoformat, isotag=isotag,
                cigar=cigar, counts=counts, Filter=Filter, hit=hits[idu])
            line = line_template.format(
                chrom=chrom, database=database, source=source, start=start,
                end=end, score=".", strand="+", attrb=attrb)
            if args.add_extra:
                extra = variant_with_nt(line, args.precursors, args.matures)
                line = "%s Changes %s;" % (line, extra)

            line = paste_columns(read_gff_line(line), sep=sep)
            bucket = reads[chrom].setdefault(start, [])
            if Filter == "Pass":
                reads_in += 1
                bucket.append([idu, chrom, counts, sample, line])

    logger.info("Hits: %s" % reads_in)
    return reads
def _convert(s, test, reverse=False):
    """Convert *s* and fail loudly when the result is not *test*.

    ``read_id`` is applied when *reverse* is true, ``make_id`` otherwise.

    Raises:
        ValueError: the converted value and *test* are different.
    """
    if reverse:
        obtained = read_id(s)
    else:
        obtained = make_id(s)
    if obtained != test:
        details = (s, test, obtained)
        raise ValueError("%s didn't result on %s but in %s" % details)
def read_file(folder, args):
    """
    Read sRNAbench file and convert to mirtop GFF format.

    Args:
        *folder(str)*: directory holding ``reads.annotation`` and
            ``microRNAannotation.txt``; its base name is the sample name.

        *args(namedtuple)*: arguments from command line. The attributes
            read here are ``out_format``, ``database``, ``precursors``,
            ``matures`` and ``add_extra``.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]``; gff_list has the
            format as defined in *mirtop.gff.body.read()*.
    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures
    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and "N" in query_sequence:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                # NOTE(review): n_in is also the counter of accepted hits
                # reported at the end - confirm this increment is intended.
                n_in += 1
                continue

            counts = int(cols[1])
            hit_entries = cols[4].split("$")
            # Number of distinct names among the '$'-separated hits.
            hit = len({mirna.split("#")[1] for mirna in hit_entries})

            for nhit in hit_entries:
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    n_notindb += 1
                    # Skip the hit: the lookups below index both mappings
                    # by chrom and cannot succeed without it.
                    continue
                if mirName not in matures[chrom]:
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             " precursor {chrom}\n"
                             " name: {query_name}\n"
                             " start: {start}\n"
                             " external: {isoformat}\n"
                             " hit: {hit}".format(
                                 query_sequence=query_sequence,
                                 chrom=chrom, query_name=query_name,
                                 start=start, isoformat=isoformat,
                                 hit=hit))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue

                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(
                             query_sequence=query_sequence, idu=idu,
                             mirName=mirName, preName=preName,
                             isoformat=isoformat, cigar=cigar,
                             counts=counts, Filter=Filter, hit=hit)
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(
                            chrom=chrom, database=database, source=source,
                            start=start, end=end, score=score,
                            strand=strand, attrb=attrb)
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(read_gff_line(line), sep=sep)
                reads[chrom].setdefault(start, [])
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append(
                        [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads