def test_cigar(self):
    """Check ``cigar_correction`` and ``make_cigar`` against fixed inputs.

    Raises:
        ValueError: if a corrected sequence or the generated CIGAR string
            differs from the expected literal.
    """
    from mirtop.mirna.realign import cigar_correction, make_cigar

    # NOTE(review): looks like pysam-style [operation, length] pairs
    # (14 + 1 + 5 = 20 = length of the first sequence) -- confirm against
    # cigar_correction.
    cigar = [[0, 14], [1, 1], [0, 5]]
    fixed = cigar_correction(cigar,
                             "AAAAGCTGGGTTGAGGAGGA",
                             "AAAAGCTGGGTTGAGAGGA")
    if not fixed[0] == "AAAAGCTGGGTTGAGGAGGA":
        raise ValueError("Sequence 1 is not right.")
    # The second sequence is one nt shorter and is expected to get a gap.
    if not fixed[1] == "AAAAGCTGGGTTGA-GAGGA":
        raise ValueError("Sequence 2 is not right.")

    expected = "MGMD3MI3M"
    observed = make_cigar("AAA-AAATAAA", "AGACAAA-AAA")
    if not observed == expected:
        # Report the value that is actually compared, so the message cannot
        # drift away from the check.
        raise ValueError("Cigar not eq to %s: %s" % (expected, observed))
def tune(seq, precursor, start, cigar):
    """
    Realign a read against its precursor and list the differences.

    Args:
        *seq (str)*: sequence of the read.

        *precursor (str)*: sequence of the precursor.

        *start (int)*: index on the precursor where the read is placed.

        *cigar*: when truthy it is handed to ``cigar_correction``;
            otherwise the read is aligned with ``align``.

    Returns:
        *tuple* with:
            subs (list): ``[position, read_nt, precursor_nt]`` entries.
            add: the read tail from the first mismatching position among
                the last three (``-`` removed) when the mismatch pattern of
                those three positions is an accepted one, else ``[]``.
            cigar (str): value of ``make_cigar`` on the aligned pair.
    """
    if cigar:
        seq, mature = cigar_correction(cigar, seq, precursor[start:])
    else:
        window = precursor[start:start + len(seq)]
        seq, mature, _score, _p, _size = align(seq, window)
        cigar = make_cigar(seq, mature)
    if seq.startswith("-"):
        seq = seq[1:]
    if seq.endswith("-"):
        seq = seq[:-1]
    logger.debug("TUNE:: %s %s %s" % (cigar, seq, mature))

    # Positions where the aligned read and the precursor disagree.
    mismatches = set()
    for idx in range(len(seq)):
        if seq[idx] != mature[idx]:
            mismatches.add(idx)

    # Everything before the last three positions is always a substitution.
    tail_start = len(seq) - 3
    subs = [[idx, seq[idx], mature[idx]]
            for idx in mismatches if idx < tail_start]

    # Mismatch flags (1/0) for the last three positions plus their indexes.
    tail_positions = range(tail_start, len(seq))
    tail_pattern = [1 if idx in mismatches else 0 for idx in tail_positions]
    tail_errors = [idx for idx in tail_positions if idx in mismatches]

    # Tail patterns that are read as added nucleotides; a mismatch only at
    # the third-from-last position ([1, 0, 0]) is not one of them.
    accepted_patterns = [[1, 1, 0], [1, 0, 1], [0, 1, 0],
                         [0, 1, 1], [0, 0, 1], [1, 1, 1]]
    add = []
    if tail_pattern in accepted_patterns:
        add = seq[tail_errors[0]:].replace("-", "")

    # Tail mismatches that did not produce an addition are substitutions.
    if not add and tail_errors:
        subs.extend([idx, seq[idx], mature[idx]] for idx in tail_errors)
    return subs, add, make_cigar(seq, mature)
def test_cigar(self):
    """Exercise the CIGAR helpers of ``mirtop.mirna.realign``.

    Covers ``cigar_correction``, ``make_cigar``, ``expand_cigar`` and
    ``cigar2snp``; any unexpected result raises ``ValueError``.
    """
    from mirtop.mirna.realign import (cigar_correction, make_cigar,
                                      cigar2snp, expand_cigar)

    cigar = [[0, 14], [1, 1], [0, 5]]
    read_seq = "AAAAGCTGGGTTGAGGAGGA"
    fixed = cigar_correction(cigar, read_seq, "AAAAGCTGGGTTGAGAGGA")
    if fixed[0] != read_seq:
        raise ValueError("Sequence 1 is not right.")
    if fixed[1] != "AAAAGCTGGGTTGA-GAGGA":
        raise ValueError("Sequence 2 is not right.")

    # CIGAR string built from two gapped sequences
    if make_cigar("AAA-AAATAAA", "AGACAAA-AAA") != "MAMD3MI3M":
        raise ValueError("Cigar not eq to MAMD3MI3M: %s" %
                         make_cigar("AAA-AAATAAA", "AGACAAA-AAA"))

    # run-length CIGAR expanded to one symbol per position
    if expand_cigar("3MA3M") != "MMMAMMM":
        raise ValueError("Cigar 3MA3M not eqaul to MMMAMMM but to %s" %
                         expand_cigar("3MA3M"))

    # first change reported for the CIGAR/sequence pair
    if cigar2snp("3MA3M", "AAATCCC")[0] != [3, "A", "T"]:
        raise ValueError("3MA3M not equal AAATCCC but %s" %
                         cigar2snp("3MA3M", "AAATCCC"))
def create_iso(name, mir, seq, numsim, exp):
    """Simulate isomiR reads for every miRNA listed under one precursor.

    Args:
        name: key of the precursor in ``mir``; also passed to
            ``set_precursor`` of each generated hit.
        mir: mapping ``precursor name -> {miRNA name -> info}``; ``info`` is
            forwarded to ``variation``.
        seq: sequence that the simulated reads are aligned against.
        numsim: number of simulation attempts per miRNA (converted with
            ``int``). Duplicated sequences are skipped, so fewer reads may
            be produced.
        exp: when truthy, the count of each read is drawn from a negative
            binomial with random parameters; otherwise it is 1.

    Returns:
        The value of ``body.create(reads, "miRBase21", "sim1")``.

    Side effects:
        Calls ``write_fastq(full_read, full_fq)`` and
        ``write_collapse_fastq(clean_read, clean_fq)``. ``full_fq`` and
        ``clean_fq`` are not parameters: they must exist in the enclosing
        (module) scope when this function runs.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # Expression is drawn before the duplicate check on purpose:
            # this keeps the sequence of random draws unchanged.
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            (randSeq, iso.start, iso.t5,
             iso.t3, iso.subs, iso.add) = variation(info, seq)
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            hit = realign.hits()
            hit.set_sequence(randSeq)
            hit.counts = e
            hit.set_precursor(name, iso)
            reads[query_name] = hit
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    # create real object used in code to generate GFF
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
def create_iso(name, mir, seq, numsim, exp):
    """Simulate isomiR reads for every miRNA listed under one precursor.

    Args:
        name: key of the precursor in ``mir``; also passed to
            ``set_precursor`` of each generated hit.
        mir: mapping ``precursor name -> {miRNA name -> info}``; ``info`` is
            forwarded to ``variation``.
        seq: sequence that the simulated reads are aligned against.
        numsim: number of simulation attempts per miRNA (converted with
            ``int``). Duplicated sequences are skipped, so fewer reads may
            be produced.
        exp: when truthy, the count of each read is drawn from a negative
            binomial with random parameters; otherwise it is 1.

    Returns:
        The value of ``body.create(reads, "miRBase21", "sim1")``.

    Side effects:
        Calls ``write_fastq(full_read, full_fq)`` and
        ``write_collapse_fastq(clean_read, clean_fq)``. ``full_fq`` and
        ``clean_fq`` are not parameters: they must exist in the enclosing
        (module) scope when this function runs.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # Expression is drawn before the duplicate check on purpose:
            # this keeps the sequence of random draws unchanged.
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            (randSeq, iso.start, iso.t5,
             iso.t3, iso.subs, iso.add) = variation(info, seq)
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            hit = realign.hits()
            hit.set_sequence(randSeq)
            hit.counts = e
            hit.set_precursor(name, iso)
            reads[query_name] = hit
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    # create real object used in code to generate GFF
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
def read_file(folder, args):
    """
    Read sRNAbench output and convert it to mirtop GFF lines.

    Args:
        *folder(str)*: directory holding the sRNAbench files
            ``reads.annotation`` and ``microRNAannotation.txt``. Its base
            name is used as sample name.

        *args(namedtuple)*: arguments from command line. The attributes
            read here are ``out_format``, ``database``, ``precursors``,
            ``matures`` and ``add_extra``.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]`` entries.
    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures
    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            # Tab-separated line: column 0 is the read sequence (also used
            # as its name), column 1 the counts, column 3 the annotation
            # type and column 4 the "$"-separated hits.
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                # NOTE(review): n_in is also the counter reported as
                # "hits" at the end -- confirm this increment is intended.
                n_in += 1
                continue
            counts = int(cols[1])
            # Number of distinct miRNA names among the hits of this read.
            hits = len({mirna.split("#")[1] for mirna in cols[4].split("$")})
            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                # A hit is "#"-separated: field 1 is the miRNA name and
                # field 3 is "precursor,start,...".
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    # Skip the hit: the lookups below index both mappings
                    # with chrom, and the same hit must not be counted
                    # twice as missing.
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    # NOTE(review): counted but still processed, as before;
                    # confirm whether this hit should be skipped as well.
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue
                seen.add((query_sequence, mirName))
                if (query_sequence, mirName) not in source_iso:
                    continue
                isoformat = source_iso[(query_sequence, mirName)]
                if isoformat == "mv":
                    n_notassign += 1
                    continue
                source = "isomiR" if isoformat != "NA" else "ref_miRNA"
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             " precursor {chrom}\n"
                             " name: {query_name}\n"
                             " start: {start}\n"
                             " external: {isoformat}\n"
                             " hit: {hits}".format(
                                 query_sequence=query_sequence,
                                 chrom=chrom,
                                 query_name=query_name,
                                 start=start,
                                 isoformat=isoformat,
                                 hits=hits))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                # The read would run past the end of the precursor.
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue
                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                fields = {
                    'seq_name': query_sequence,
                    'idseq': idu,
                    'name': mirName,
                    'parent': preName,
                    'variant': isoformat,
                    'cigar': cigar,
                    'counts': counts,
                    'filter': Filter,
                    'hits': hits,
                    'chrom': chrom,
                    'start': start,
                    'end': end,
                    'database': database,
                    'source': source,
                    'score': score,
                    'strand': strand
                }
                # TODO: convert to genomic if args.out_genomic
                line = feature(fields).line
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(feature(line), sep=sep)
                bucket = reads[chrom].setdefault(start, [])
                if Filter == "Pass":
                    n_in += 1
                    bucket.append([idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads
def tune(seq, precursor, start, cigar):
    """
    Realign the read on the precursor and split the differences into
    nucleotide substitutions and nucleotides added at the end of the read.

    Args:
        *seq (str)*: sequence of the read.

        *precursor (str)*: sequence of the precursor.

        *start (int)*: start position of the read on the precursor. A
            negative value shortens the aligned window by that amount and
            the window then starts at 0.

        *cigar*: when truthy it is handed to ``cigar_correction``;
            otherwise the read is aligned with ``align``.

    Returns:
        *tuple* with:
            subs (list): ``[position, read_nt, precursor_nt]`` entries.
            add (str): nucleotides taken as additions, joined in the order
                they were collected (walking from the last position
                backwards).
            cigar (str): value of ``make_cigar`` on the aligned pair.
    """
    span = len(seq)
    if start < 0:
        span += start
        start = 0
    if cigar:
        seq, mature = cigar_correction(cigar, seq, precursor[start:])
    else:
        window = precursor[start:start + span]
        seq, mature, _score, _p, _size = align(seq, window)
        cigar = make_cigar(seq, mature)
    if seq.startswith("-"):
        seq = seq[1:]
    if seq.endswith("-"):
        seq = seq[:-1]
    logger.debug("TUNE:: %s %s %s" % (cigar, seq, mature))

    # Positions where the aligned read and the precursor disagree.
    error = {pos for pos in range(len(seq)) if seq[pos] != mature[pos]}

    # Walk the last five positions from the end towards the start.
    add, add_position = [], []
    mismatch_seen = False
    for e in range(len(seq) - 1, len(seq) - 6, -1):
        is_error = e in error
        if is_error:
            mismatch_seen = True
        if mismatch_seen:
            # After the first mismatch every visited position is collected.
            add.append(seq[e])
            add_position.append(e)
        elif seq[e] in ["A", "T"]:
            # Matching A/T before any mismatch: keep provisionally.
            add.append(seq[e])
            add_position.append(e)
            continue
        if not is_error:
            # A matching position ends the walk. Drop the last collected
            # entry, and drop everything if no mismatch was ever seen.
            if add:
                add.pop()
                add_position.pop()
            if not mismatch_seen:
                add = []
                add_position = []
            break

    # Mismatches that were not taken as additions are substitutions.
    subs = [[e, seq[e], mature[e]] for e in error if e not in add_position]
    logger.debug("TUNE:: %s %s" % (subs, add))
    return subs, "".join(add), make_cigar(seq, mature)
def read_file(folder, args):
    """
    Read sRNAbench output and convert it to mirtop GFF lines.

    Args:
        *folder(str)*: directory holding the sRNAbench files
            ``reads.annotation`` and ``microRNAannotation.txt``. Its base
            name is used as sample name.

        *args(namedtuple)*: arguments from command line. The attributes
            read here are ``out_format``, ``database``, ``precursors``,
            ``matures`` and ``add_extra``.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads (nested dicts)*: ``reads[chrom][start]`` is a list of
            ``[idu, chrom, counts, sample, line]`` entries.
    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures
    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            # Tab-separated line: column 0 is the read sequence (also used
            # as its name), column 1 the counts, column 3 the annotation
            # type and column 4 the "$"-separated hits.
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if query_sequence and query_sequence.find("N") > -1:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                # NOTE(review): n_in is also the counter reported as
                # "hits" at the end -- confirm this increment is intended.
                n_in += 1
                continue
            counts = int(cols[1])
            # Number of distinct miRNA names among the hits of this read.
            hit = len({mirna.split("#")[1] for mirna in cols[4].split("$")})
            for nhit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % nhit)
                # A hit is "#"-separated: field 1 is the miRNA name and
                # field 3 is "precursor,start,...".
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    # Skip the hit: the lookups below index both mappings
                    # with chrom, and the same hit must not be counted
                    # twice as missing.
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    # NOTE(review): counted but still processed, as before;
                    # confirm whether this hit should be skipped as well.
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue
                seen.add((query_sequence, mirName))
                if (query_sequence, mirName) not in source_iso:
                    continue
                isoformat = source_iso[(query_sequence, mirName)]
                if isoformat == "mv":
                    n_notassign += 1
                    continue
                source = "isomiR" if isoformat != "NA" else "ref_miRNA"
                # The templates below are filled from locals(): the local
                # names they mention must not be renamed or removed.
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             " precursor {chrom}\n"
                             " name: {query_name}\n"
                             " start: {start}\n"
                             " external: {isoformat}\n"
                             " hit: {hit}".format(**locals()))
                logger.debug("SRNABENCH:: start %s end %s" % (start, end))
                # The read would run past the end of the precursor.
                if len(precursors[chrom]) < start + len(query_sequence):
                    n_out += 1
                    continue
                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(**locals())
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(**locals())
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(read_gff_line(line), sep=sep)
                bucket = reads[chrom].setdefault(start, [])
                if Filter == "Pass":
                    n_in += 1
                    bucket.append([idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads