def _analyze_lifted_line(line, reads, precursors, database):
    """
    Add the alignment described by one lifted record to *reads*.

    Args:
        *line*: indexable record; items 0 to 3 are used as read name,
            read sequence, precursor name and start position.

        *reads*: mapping from read name to hit object. Unknown names are
            indexed right after a ``not in`` test, so the mapping must
            create entries on demand (presumably ``defaultdict(hits)``).

        *precursors*: mapping from precursor name to precursor sequence.

        *database*: accepted but not used by this function.

    Returns:
        *reads*: the mapping that was passed in, updated in place.
    """
    name, seq = line[0], line[1]
    logger.debug(("READ::line name:{0}").format(line))
    if seq and seq.find("N") > -1:
        # Reads carrying an N are ignored.
        return reads
    if name not in reads:
        reads[name].set_sequence(seq)
        reads[name].counts = _get_freq(name)
        # NOTE(review): the original indentation was lost; this assignment
        # is assumed to belong to the first-time initialisation.
        reads[name].sequence = seq

    hit = reads[name]
    chrom, start = line[2], line[3]
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(hit.sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))

    precursor = precursors[chrom]
    if len(precursor) < start + len(hit.sequence):
        # The read would run past the end of the precursor: nothing stored.
        logger.debug(
            "READ::%s start + %s sequence size are bigger than"
            " size precursor %s" % (chrom, len(hit.sequence),
                                    len(precursor)))
        return reads

    iso.subs, iso.add, iso.cigar = filter.tune(hit.sequence, precursor,
                                               start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))
    hit.set_precursor(chrom, iso)
    return reads
def read_bam(bam_fn, args, clean=True):
    """
    Read bam file and perform realignment of hits

    Args:
        *bam_fn*: a BAM file with alignments to the precursor

        *args*: object whose ``precursors`` attribute is a dict with keys
            being precursor names and values being sequences. Come from
            mirtop.mirna.fasta.read_precursor().

        *clean*: Use mirtop.filter.clean_hits() to remove lower score hits.

    Returns:
        *reads (dict)*:
             keys are read_id and values are *mirtop.realign.hits*

    """
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    for line in handle:
        if line.reference_id < 0:
            logger.debug("Sequence not mapped: %s" % line.reference_id)
            continue
        query_name = line.query_name
        # A record without sequence cannot start a new read: the sequence
        # stored here is the one whose length is measured for every hit.
        if query_name not in reads and not line.query_sequence:
            continue
        if line.query_sequence and line.query_sequence.find("N") > -1:
            continue
        if query_name not in reads:
            reads[query_name].set_sequence(line.query_sequence)
            reads[query_name].counts = _get_freq(query_name)
        if line.is_reverse:
            logger.debug("Sequence is reverse: %s" % line.query_name)
            continue
        chrom = handle.getrname(line.reference_id)
        cigar = line.cigartuples
        sequence = reads[query_name].sequence
        iso = isomir()
        iso.align = line
        iso.set_pos(line.reference_start, len(sequence))
        logger.debug("READ::From BAM start %s end %s" % (iso.start, iso.end))
        # Skip hits that would extend past the end of the precursor.
        if len(precursors[chrom]) < line.reference_start + len(sequence):
            continue
        iso.subs, iso.add, iso.cigar = filter.tune(sequence,
                                                   precursors[chrom],
                                                   line.reference_start,
                                                   cigar)
        logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
        # Only hits with fewer than two substitutions are kept.
        if len(iso.subs) < 2:
            reads[query_name].set_precursor(chrom, iso)

    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
def read_bam(bam_fn, args, clean=True):
    """
    Read bam file and perform realignment of hits

    Args:
        *bam_fn*: a BAM file with alignments to the precursor

        *args*: object whose ``precursors`` attribute is a dict with keys
            being precursor names and values being sequences. Come from
            mirtop.mirna.fasta.read_precursor().

        *clean*: Use mirtop.filter.clean_hits() to remove lower score hits.

    Returns:
        *reads (dict)*:
             keys are read_id and values are *mirtop.realign.hits*

    """
    precursors = args.precursors
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    for line in handle:
        if line.reference_id < 0:
            logger.debug("Sequence not mapped: %s" % line.reference_id)
            continue
        query_name = line.query_name
        # Without this guard a first record lacking its sequence would be
        # stored as the read and ``len()`` below would fail on it.
        if query_name not in reads and not line.query_sequence:
            continue
        if line.query_sequence and line.query_sequence.find("N") > -1:
            continue
        if query_name not in reads:
            reads[query_name].set_sequence(line.query_sequence)
            reads[query_name].counts = _get_freq(query_name)
        if line.is_reverse:
            logger.debug("Sequence is reverse: %s" % line.query_name)
            continue
        chrom = handle.getrname(line.reference_id)
        cigar = line.cigartuples
        sequence = reads[query_name].sequence
        iso = isomir()
        iso.align = line
        iso.set_pos(line.reference_start, len(sequence))
        logger.debug("READ::From BAM start %s end %s" % (iso.start, iso.end))
        # Hits running past the end of the precursor are ignored.
        if len(precursors[chrom]) < line.reference_start + len(sequence):
            continue
        iso.subs, iso.add, iso.cigar = filter.tune(
            sequence, precursors[chrom], line.reference_start, cigar)
        logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
        # Keep the hit only when it has fewer than two substitutions.
        if len(iso.subs) < 2:
            reads[query_name].set_precursor(chrom, iso)

    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
def read_file(fn, args):
    """
    Read seqbuster file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with seqbuster output information.

        *args(namedtuple)*: arguments from command line; only
            ``args.precursors`` is read here.
            See *mirtop.libs.parse.add_subparser_gff()*.

    Returns:
        *reads*: dictionary where keys are read_id and
            values are *mirtop.realign.hits*

    """
    precursors = args.precursors
    reads = defaultdict(hits)
    with open(fn) as handle:
        handle.readline()  # first line is skipped (presumably the header)
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. at the end of the file) have no columns.
                continue
            cols = stripped.split("\t")
            query_name = cols[1]
            # First field of a stripped, non-blank line: never empty or None.
            query_sequence = cols[0]
            reference_start = int(cols[4]) - 1
            seqbuster_iso = ":".join(cols[6:10])
            if query_sequence.find("N") > -1:
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(query_sequence)
                reads[query_name].counts = _get_freq(query_name)
            chrom = cols[13]
            logger.debug("\nSEQBUSTER::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name:  {query_name}\n"
                         "  start: {reference_start}\n"
                         "  iso: {seqbuster_iso}".format(
                             query_sequence=query_sequence,
                             chrom=chrom,
                             query_name=query_name,
                             reference_start=reference_start,
                             seqbuster_iso=seqbuster_iso))
            sequence = reads[query_name].sequence
            iso = isomir()
            iso.align = line
            iso.set_pos(reference_start, len(sequence))
            logger.debug("SEQBUSTER:: start %s end %s" % (iso.start, iso.end))
            # Skip hits that would extend past the end of the precursor.
            if len(precursors[chrom]) < reference_start + len(sequence):
                continue
            iso.subs, iso.add, iso.cigar = filter.tune(
                sequence, precursors[chrom], reference_start, None)
            logger.debug("SEQBUSTER::After tune start %s end %s" %
                         (iso.start, iso.end))
            # Only hits with fewer than two substitutions are kept.
            if len(iso.subs) < 2:
                reads[query_name].set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    return reads
def read_file(fn, precursors):
    """
    Read srnabench file and perform realignment of hits.

    Args:
        *fn(str)*: tab-separated file; column 0 is the read sequence (also
            used as read name), column 1 the read count, column 3 the
            annotation type and column 4 the ``$``-separated list of hits.

        *precursors*: dict with precursor names as keys and sequences
            as values.

    Returns:
        *reads*: dictionary where keys are read sequences and values are
            hit objects holding the accepted precursor alignments.
    """
    reads = defaultdict(hits)
    with open(fn) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # Blank lines have no columns to parse.
                continue
            cols = stripped.split("\t")
            # The sequence doubles as identifier; as first field of a
            # stripped, non-blank line it is never empty or None.
            query_name = query_sequence = cols[0]
            if query_sequence.find("N") > -1:
                continue
            if cols[3].find("mature") == -1:
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(query_sequence)
                # The count is read straight from its column.
                # NOTE(review): it used to go through _get_freq(), which is
                # otherwise called with read names - confirm.
                reads[query_name].counts = int(cols[1])
            sequence = reads[query_name].sequence
            for hit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % hit)
                hit_info = hit.split("#")
                pos_info = hit_info[3].split(",")
                reference_start = int(pos_info[1]) - 1
                chrom = pos_info[0]
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {reference_start}\n"
                             "  hit: {hit}".format(
                                 query_sequence=query_sequence,
                                 chrom=chrom,
                                 query_name=query_name,
                                 reference_start=reference_start,
                                 hit=hit))
                iso = isomir()
                iso.align = line
                iso.set_pos(reference_start, len(sequence))
                logger.debug("SRNABENCH:: start %s end %s" %
                             (iso.start, iso.end))
                # Skip hits that would extend past the end of the precursor.
                if len(precursors[chrom]) < reference_start + len(sequence):
                    continue
                iso.subs, iso.add, iso.cigar = filter.tune(
                    sequence, precursors[chrom], reference_start, None)
                logger.debug("SRNABENCH::After tune start %s end %s" %
                             (iso.start, iso.end))
                # Only hits with fewer than three substitutions are kept.
                if len(iso.subs) < 3:
                    reads[query_name].set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    return reads
def _analyze_line(line, reads, precursors, handle, args):
    """
    Add the alignment held by one BAM record to *reads*.

    Args:
        *line*: alignment record yielded by the pysam handle.

        *reads*: mapping from read name to hit object; entries are created
            on first access (presumably ``defaultdict(hits)``).

        *precursors*: dict with precursor names as keys and sequences
            as values.

        *handle*: open alignment file, used to turn the reference id of
            the record into a reference name.

        *args*: options object; only ``args.genomic`` is read here.

    Returns:
        *reads*: the mapping that was passed in, updated in place.
    """
    if line.reference_id < 0:
        logger.debug("READ::Sequence not mapped: %s" % line.reference_id)
        return reads
    if not line.cigarstring:
        logger.debug("READ::Sequence malformed: %s" % line)
        return reads

    name = line.query_name
    raw_sequence = line.query_sequence
    if name not in reads and not raw_sequence:
        return reads
    # Reverse-strand records are turned back to the read orientation.
    if line.is_reverse:
        sequence = reverse_complement(raw_sequence)
    else:
        sequence = raw_sequence
    logger.debug(("READ::Read name:{0} and Read sequence:{1}").format(
        name, sequence))
    if raw_sequence and raw_sequence.find("N") > -1:
        return reads
    if name not in reads:
        reads[name].set_sequence(sequence)
        reads[name].counts = _get_freq(name)
        # TODO if args.quant set to 0
        # TODO if args.quant increase by 1
    # Reverse hits are only accepted when working on genomic coordinates.
    if line.is_reverse and not args.genomic:
        logger.debug("READ::Sequence is reverse: %s" % line.query_name)
        return reads

    hit = reads[name]
    chrom = handle.getrname(line.reference_id)
    start = line.reference_start
    cigar = line.cigartuples
    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(hit.sequence))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))

    # Length is measured without N characters and with 3 nt of slack.
    usable_length = len(precursors[chrom].replace("N", "")) + 3
    if usable_length < start + len(hit.sequence):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (line.reference_id,
                                             len(hit.sequence),
                                             len(precursors[chrom])))
        return reads

    iso.subs, iso.add, iso.cigar = filter.tune(hit.sequence,
                                               precursors[chrom],
                                               start, cigar)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))
    hit.set_precursor(chrom, iso)
    return reads
def read_bam(bam_fn, precursors, clean=True):
    """
    Load the alignments of a SAM/BAM file and realign every hit.

    Args:
        *bam_fn*: alignment file; it goes through ``_sam_to_bam`` and
            ``_bam_sort`` before being opened.

        *precursors*: dict with precursor names as keys and sequences
            as values.

        *clean*: when true the result is passed through
            ``filter.clean_hits`` before being returned.

    Returns:
        *reads*: mapping from read name to hit object.
    """
    bam_fn = _bam_sort(_sam_to_bam(bam_fn))
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    for aln in handle:
        if aln.reference_id < 0:
            logger.debug("Sequence not mapped: %s" % aln.reference_id)
            continue
        name = aln.query_name
        seq = aln.query_sequence
        # A record without sequence cannot introduce a new read.
        if seq is None and name not in reads:
            continue
        if seq and seq.find("N") > -1:
            continue
        if name not in reads:
            reads[name].set_sequence(seq)
            reads[name].counts = _get_freq(name)
        # The read is registered above even when only reverse hits exist.
        if aln.is_reverse:
            logger.debug("Sequence is reverse: %s" % aln.query_name)
            continue

        hit = reads[name]
        chrom = handle.getrname(aln.reference_id)
        start = aln.reference_start
        cigar = aln.cigartuples
        iso = isomir()
        iso.align = aln
        iso.set_pos(start, len(hit.sequence))
        logger.debug("READ::From BAM start %s end %s" % (iso.start, iso.end))
        precursor = precursors[chrom]
        # Hits running past the end of the precursor are ignored.
        if len(precursor) < start + len(hit.sequence):
            continue
        iso.subs, iso.add, iso.cigar = filter.tune(hit.sequence, precursor,
                                                   start, cigar)
        logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
        # Keep the hit only when it has fewer than two substitutions.
        if len(iso.subs) < 2:
            hit.set_precursor(chrom, iso)

    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
def read_file(fn, precursors):
    """
    Read seqbuster file and perform realignment of hits.

    Args:
        *fn(str)*: tab-separated seqbuster output. Column 0 is the read
            sequence, column 1 the read name, column 4 the 1-based start,
            columns 6-9 the isomiR description and column 13 the precursor.

        *precursors*: dict with precursor names as keys and sequences
            as values.

    Returns:
        *reads*: dictionary where keys are read names and values are hit
            objects holding the accepted precursor alignments.
    """
    reads = defaultdict(hits)
    with open(fn) as handle:
        handle.readline()  # first line is skipped (presumably the header)
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. at the end of the file) have no columns.
                continue
            cols = stripped.split("\t")
            query_name = cols[1]
            # First field of a stripped, non-blank line: never empty or None.
            query_sequence = cols[0]
            reference_start = int(cols[4]) - 1
            seqbuster_iso = ":".join(cols[6:10])
            if query_sequence.find("N") > -1:
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(query_sequence)
                reads[query_name].counts = _get_freq(query_name)
            chrom = cols[13]
            logger.debug("\nSEQBUSTER::NEW::query: {query_sequence}\n"
                         "  precursor {chrom}\n"
                         "  name:  {query_name}\n"
                         "  start: {reference_start}\n"
                         "  iso: {seqbuster_iso}".format(
                             query_sequence=query_sequence,
                             chrom=chrom,
                             query_name=query_name,
                             reference_start=reference_start,
                             seqbuster_iso=seqbuster_iso))
            sequence = reads[query_name].sequence
            iso = isomir()
            iso.align = line
            iso.set_pos(reference_start, len(sequence))
            logger.debug("SEQBUSTER:: start %s end %s" % (iso.start, iso.end))
            # Skip hits that would extend past the end of the precursor.
            if len(precursors[chrom]) < reference_start + len(sequence):
                continue
            iso.subs, iso.add, iso.cigar = filter.tune(
                sequence, precursors[chrom], reference_start, None)
            logger.debug("SEQBUSTER::After tune start %s end %s" %
                         (iso.start, iso.end))
            # Only hits with fewer than two substitutions are kept.
            if len(iso.subs) < 2:
                reads[query_name].set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    return reads
def create_iso(name, mir, seq, numsim, exp):
    """
    Simulate isomiRs for every miRNA of one precursor and build the GFF.

    Args:
        *name*: precursor name, key of *mir*.

        *mir*: nested mapping ``mir[name][mirna]`` giving the information
            handed to ``variation`` for each miRNA.

        *seq*: precursor sequence.

        *numsim*: number of simulation attempts per miRNA (cast to int).

        *exp*: when true the count of each isomiR is drawn from a negative
            binomial distribution, otherwise it is 1.

    Returns:
        *gff*: result of ``body.create`` for the simulated reads.

    The simulated reads are also written with ``write_fastq`` and
    ``write_collapse_fastq`` to ``full_fq`` and ``clean_fq``; both names
    are not defined here and must exist at module level.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # The expression is drawn before the duplicate check so the
            # random streams are consumed as they always were.
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            (randSeq, iso.start, iso.t5, iso.t3,
             iso.subs, iso.add) = variation(info, seq)
            # Every simulated sequence is reported only once.
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            hit = realign.hits()
            hit.set_sequence(randSeq)
            hit.counts = e
            hit.set_precursor(name, iso)
            reads[query_name] = hit
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    # create real object used in code to generate GFF
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
def create_iso(name, mir, seq, numsim, exp):
    """
    Simulate isomiRs for every miRNA of one precursor and build the GFF.

    Args:
        *name*: precursor name, key of *mir*.

        *mir*: nested mapping ``mir[name][mirna]`` giving the information
            handed to ``variation`` for each miRNA.

        *seq*: precursor sequence.

        *numsim*: number of simulation attempts per miRNA (cast to int).

        *exp*: when true the count of each isomiR is drawn from a negative
            binomial distribution, otherwise it is 1.

    Returns:
        *gff*: result of ``body.create`` for the simulated reads.

    The simulated reads are also written with ``write_fastq`` and
    ``write_collapse_fastq`` to ``full_fq`` and ``clean_fq``; both names
    are not defined here and must exist at module level.
    """
    reads = dict()
    full_read = list()
    clean_read = list()
    seen = set()
    for mirna in mir[name]:
        info = mir[name][mirna]
        for _ in range(int(numsim)):
            # The expression is drawn before the duplicate check so the
            # random streams are consumed as they always were.
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            (randSeq, iso.start, iso.t5, iso.t3,
             iso.subs, iso.add) = variation(info, seq)
            # Every simulated sequence is reported only once.
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            hit = realign.hits()
            hit.set_sequence(randSeq)
            hit.counts = e
            hit.set_precursor(name, iso)
            reads[query_name] = hit
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    # create real object used in code to generate GFF
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    gff = body.create(reads, "miRBase21", "sim1")
    return gff
def _read_line(line, col_fix, precursors):
    """
    Parse one seqbuster line into a single-read hits mapping.

    Args:
        *line(str)*: tab-separated seqbuster record. Column 0 is the read
            sequence and column 1 the read name.

        *col_fix(int)*: offset subtracted from the indexes of the start
            (4), isomiR description (6-9) and precursor (13) columns.

        *precursors*: dict with precursor names as keys and sequences
            as values.

    Returns:
        *reads*: fresh ``defaultdict(hits)``. It is empty when the read
            contains an N; otherwise it holds the read, with the precursor
            hit attached only if the read fits inside the precursor and
            has fewer than six substitutions.
    """
    reads = defaultdict(hits)
    cols = line.strip().split("\t")
    query_name = cols[1]
    query_sequence = cols[0]
    reference_start = int(cols[4 - col_fix]) - 1
    seqbuster_iso = ":".join(cols[6 - col_fix:10 - col_fix])
    if query_sequence and query_sequence.find("N") > -1:
        return reads
    # ``reads`` was created above, so the read is always new at this point.
    reads[query_name].set_sequence(query_sequence)
    reads[query_name].counts = _get_freq(query_name)
    chrom = cols[13 - col_fix]
    logger.debug("\nSEQBUSTER::NEW::query: {query_sequence}\n"
                 "  precursor: {chrom}\n"
                 "  name:  {query_name}\n"
                 "  start: {reference_start}\n"
                 "  iso: {seqbuster_iso}".format(
                     query_sequence=query_sequence,
                     chrom=chrom,
                     query_name=query_name,
                     reference_start=reference_start,
                     seqbuster_iso=seqbuster_iso))
    sequence = reads[query_name].sequence
    iso = isomir()
    iso.align = line
    iso.set_pos(reference_start, len(sequence))
    logger.debug("\nSEQBUSTER:: start %s end %s" % (iso.start, iso.end))
    if len(precursors[chrom]) < reference_start + len(sequence):
        # The placeholder was missing here, which made this branch raise
        # TypeError instead of logging and returning.
        logger.debug("\nSEQBUSTER::len precursor %s" % len(precursors[chrom]))
        return reads
    iso.subs, iso.add, iso.cigar = filter.tune(sequence,
                                               precursors[chrom],
                                               reference_start,
                                               None)
    logger.debug("\nSEQBUSTER::After tune start %s end %s" %
                 (iso.start, iso.end))
    # Only hits with fewer than six substitutions are kept.
    if len(iso.subs) < 6:
        logger.debug("\nSEQBUSTER::iso.subs %s - length %s" %
                     (iso.subs, len(iso.subs)))
        reads[query_name].set_precursor(chrom, iso)
    return reads
def read_file(fn, hairpins, database, mirna_gtf):
    """
    Read PROST! file and convert to mirtop GFF format.

    Args:
        *fn(str)*: file name with PROST output information.

        *hairpins*: dict with precursor names as keys and sequences
            as values.

        *database(str)*: database name; not used by this function.

        *mirna_gtf(str)*: annotation file given to
            ``mapper.read_gtf_to_mirna`` and
            ``mapper.read_gtf_to_precursor``.

    Returns:
        *reads*: dictionary where keys are read_id and
            values are *mirtop.realign.hits*

    """
    reads = defaultdict(hits)
    # NOTE(review): the result is never used; the call is kept in case it
    # is relied on for validating the GTF - confirm and drop otherwise.
    genomics = mapper.read_gtf_to_mirna(mirna_gtf)
    matures = mapper.read_gtf_to_precursor(mirna_gtf)
    non_mirna = 0
    non_chromosome_mirna = 0
    outside_mirna = 0
    lines_read = 0
    ann, ann_type = _group_seqs_by_ann(fn)
    with open(fn) as handle:
        handle.readline()  # first line is skipped (presumably the header)
        for line in handle:
            lines_read += 1
            cols = line.strip().split("\t")
            # The sequence doubles as read identifier.
            query_name = query_sequence = cols[0]
            if not ann[query_sequence]:
                non_mirna += 1
                continue
            miRNA = ann_type[ann[query_sequence]][1]
            preNames = ann_type[ann[query_sequence]][0]
            if query_sequence and query_sequence.find("N") > -1:
                continue
            reads[query_name].set_sequence(query_sequence)
            reads[query_name].counts = cols[9]
            sequence = reads[query_name].sequence
            for preName in preNames.split(","):
                # Each precursor is aligned once per read.
                if preName in reads[query_name].precursors:
                    continue
                if preName not in hairpins:
                    non_chromosome_mirna += 1
                    continue
                reference_start = _align_to_mature(query_sequence,
                                                   hairpins[preName],
                                                   matures[preName][miRNA])
                logger.debug("\nPROST!::NEW::query: {query_sequence}\n"
                             "  precursor {preName}\n"
                             "  name:  {query_name}\n"
                             "  reference_start: {reference_start}\n"
                             "  mirna: {miRNA}".format(
                                 query_sequence=query_sequence,
                                 preName=preName,
                                 query_name=query_name,
                                 reference_start=reference_start,
                                 miRNA=miRNA))
                iso = isomir()
                iso.align = line
                iso.set_pos(reference_start, len(sequence))
                logger.debug("PROST!:: start %s end %s" %
                             (iso.start, iso.end))
                if len(hairpins[preName]) < reference_start + len(sequence):
                    # Count the skip so the summary below reports it.
                    outside_mirna += 1
                    continue
                iso.subs, iso.add, iso.cigar = filter.tune(
                    sequence, hairpins[preName], reference_start, None)
                logger.debug("PROST!::After tune start %s end %s" %
                             (iso.start, iso.end))
                # Only hits with fewer than two substitutions are kept.
                if len(iso.subs) < 2:
                    reads[query_name].set_precursor(preName, iso)
    logger.info("Lines loaded: %s" % lines_read)
    logger.info("Skipped lines because non miRNA in line: %s" % non_mirna)
    logger.info("Skipped lines because non chromosome in GTF:"
                " %s" % non_chromosome_mirna)
    logger.info("Skipped lines because outside precursor: %s" % outside_mirna)
    logger.info("Hits: %s" % len(reads))
    return reads
def read_file(folder, precursors):
    """
    Read srnabench output folder and perform realignment of hits.

    Args:
        *folder(str)*: directory containing ``reads.annotation`` and
            ``microRNAannotation.txt``.

        *precursors*: dict with precursor names as keys and sequences
            as values.

    Returns:
        *reads*: dictionary where keys are read sequences and values are
            hit objects holding the precursor alignments.
    """
    n_out = 0
    n_nonmature = 0
    n_ns = 0
    n_in = 0
    n_non_precursor = 0
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    reads = defaultdict(hits)
    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # Blank lines have no columns to parse.
                continue
            cols = stripped.split("\t")
            # The sequence doubles as identifier; as first field of a
            # stripped, non-blank line it is never empty or None.
            query_name = query_sequence = cols[0]
            if query_sequence.find("N") > -1:
                n_ns += 1
                continue
            if cols[3].find("mature") == -1:
                n_nonmature += 1
                continue
            if query_name not in reads:
                reads[query_name].set_sequence(query_sequence)
                reads[query_name].counts = int(cols[1])
            sequence = reads[query_name].sequence
            for hit in cols[4].split("$"):
                logger.debug("SRNABENCH::line hit: %s" % hit)
                hit_info = hit.split("#")
                pos_info = hit_info[3].split(",")
                reference_start = int(pos_info[1]) - 1
                chrom = pos_info[0]
                iso = isomir()
                iso.align = line
                # Attach the annotation given by the tool when there is one.
                if (query_sequence, hit_info[1]) in source_iso:
                    iso.external = source_iso[(query_sequence, hit_info[1])]
                external = iso.external
                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {reference_start}\n"
                             "  external: {external}\n"
                             "  hit: {hit}".format(
                                 query_sequence=query_sequence,
                                 chrom=chrom,
                                 query_name=query_name,
                                 reference_start=reference_start,
                                 external=external,
                                 hit=hit))
                iso.set_pos(reference_start, len(sequence))
                logger.debug("SRNABENCH:: start %s end %s" %
                             (iso.start, iso.end))
                if len(precursors[chrom]) < reference_start + len(sequence):
                    n_out += 1
                    continue
                iso.subs, iso.add, iso.cigar = filter.tune(
                    sequence, precursors[chrom], reference_start, None)
                logger.debug("SRNABENCH::After tune start %s end %s" %
                             (iso.start, iso.end))
                n_in += 1
                reads[query_name].set_precursor(chrom, iso)
            if len(reads[query_name].precursors) == 0:
                n_non_precursor += 1

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_non_precursor)
    # n_out counts hits that run past the end of the precursor; no filter
    # on the number of changes is applied in this function.
    logger.info("Hit Filtered by being outside the precursor: %s" % n_out)
    logger.info("Hit Filtered by being non-mature: %s" % n_nonmature)
    logger.info("Reads Filtered by containing N: %s" % n_ns)
    return reads
def _analyze_line(line, precursors, database, sample, sep, args):
    """
    Turn one tabular alignment record into the data of a GFF row.

    Args:
        *line*: indexable record. Items used: 1 and 2 (start, end),
            3 (read name), 4 (sequence), 5 (strand), 6 (counts),
            10 and 11 (reference start/end) and 15 (attributes holding
            ``Name=<precursor>``).

        *precursors*: dict with precursor names as keys and sequences
            as values.

        *database*: not used here; ``args.database`` is used instead.

        *sample*: sample name passed on to ``body.create``.

        *sep*: separator given to ``paste_columns``.

        *args*: options object; ``matures``, ``precursors``, ``database``
            and ``add_extra`` are read.

    Returns:
        ``None`` when the record is skipped, otherwise a dict with keys
        ``chrom``, ``start``, ``name``, ``mirna`` and ``line``.
    """
    start_col, end_col, attr_col = 10, 11, 15
    read_name, seq = line[3], line[4]
    primary = get_primary_transcript(guess_database(args))
    if str(line).find(primary) < 0:
        # only working with mirbase
        return None
    logger.debug(("READ::line name:{0}").format(line))
    if seq and seq.find("N") > -1:
        return None

    chrom = line[attr_col].strip().split("Name=")[-1]
    start, end, strand = line[1], line[2], line[5]
    counts = float(line[6])
    if not start:
        return None
    # Make the start relative to the reference interval of the record.
    if strand == "+":
        start = int(start) - int(line[start_col]) + 1
    else:
        start = int(line[end_col]) - int(end)

    iso = isomir()
    iso.align = line
    iso.set_pos(start, len(seq))
    logger.debug("READ::From BAM start %s end %s at chrom %s" %
                 (iso.start, iso.end, chrom))
    precursor = precursors[chrom]
    # An overrun of the precursor is only reported; the hit is still tuned.
    if len(precursor) < start + len(seq):
        logger.debug("READ::%s start + %s sequence size are bigger than"
                     " size precursor %s" % (chrom, len(seq),
                                             len(precursor)))
    iso.subs, iso.add, iso.cigar = filter.tune(seq, precursor, start, None)
    logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
    logger.debug("READ::iso add %s iso subs %s" % (iso.add, iso.subs))

    idu = make_id(seq)
    hit = hits()
    hit.set_sequence(seq)
    hit.counts = counts
    hit.sequence = seq
    hit.set_precursor(chrom, iso)
    reads = annotate({read_name: hit}, args.matures, args.precursors,
                     quiet=True)
    gff_line = body.create(reads, args.database, sample, args, quiet=True)
    if start not in gff_line[chrom]:
        return None

    gff_row = gff_line[chrom][start][0][4]
    logger.debug("READ::line:%s" % gff_row)
    if args.add_extra:
        extra = variant_with_nt(gff_row, args.precursors, args.matures)
        gff_row = "%s Changes %s;" % (gff_row, extra)
    gff_row = paste_columns(feature(gff_row), sep=sep)
    return {'chrom': chrom,
            'start': start,
            'name': read_name,
            'mirna': reads[read_name].precursors[chrom].mirna,
            'line': [idu, chrom, counts, sample, gff_row]}