def run():
    """Command-line entry point: tabulate nucleotide mutations and plot them.

    Parses the command line, checks the input file and output directory via
    the ``functions`` helpers, lets ``main`` write the mutation table and then
    runs the R script ``plotNucleotideProbabilities.R`` (located next to this
    module) on it. The table is deleted afterwards when ``args.remove`` is
    set.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    r_script = os.path.join(here, 'plotNucleotideProbabilities.R')

    args = create_parser().parse_args()

    functions.checkExistence(args.inputfile)
    functions.checkPath(args.outdir)

    # Table and image share the same stem and differ only in the extension.
    stem = os.path.join(args.outdir, args.prefix + '_nuc_mutations')
    table_path = stem + '.table'
    image_path = stem + '.pdf'

    main(args.inputfile, table_path, args.coverage, args.verbose)

    r_invocation = ['R', '-q', '--slave', '-f %r' % r_script, '--args']
    r_arguments = ['%r' % path for path in (table_path, image_path)]
    r_arguments.append(args.limit)
    execute(r_invocation + r_arguments)

    if args.remove:
        os.remove(table_path)
def run():
    """Command-line entry point: build sense/anti-sense matrices and plot them.

    For each strand orientation ``main`` is called to collect the data, the
    result is written with ``saveMat`` and finally the R script
    ``plotHeatMapSmall.R`` (located next to this module) is run on both
    tables. The tables are deleted afterwards when ``args.remove`` is set.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    r_script = os.path.join(here, 'plotHeatMapSmall.R')

    args = create_parser().parse_args()
    for required_file in (args.parclip, args.gff):
        functions.checkExistence(required_file)

    out_prefix = '%s_mat_sm_up%s_do%s_min%s_max%s_xbins%s_ybins%s' % (
        args.prefix, args.upstream, args.downstream, args.min, args.max,
        args.xbins, args.ybins)

    def in_outputdir(suffix):
        # All outputs live in the output directory and share out_prefix.
        return os.path.join(args.outputdir, out_prefix + suffix)

    table_sense = in_outputdir('_sense.table')
    table_asense = in_outputdir('_asense.table')
    image_sense = in_outputdir('_sense.png')
    image_asense = in_outputdir('_asense.png')

    def collect(use_sense, message):
        # Identical call for both orientations except for flag and message.
        return main(args.parclip, args.gff, args.upstream, args.downstream,
                    use_sense, args.min, args.max, args.verbose, args.xbins,
                    args.ybins, message)

    sense = collect(True, 'Collecting data from sense strand')
    asense = collect(False, 'Collecting data from anti-sense strand')

    total = args.upstream + args.max + 1 + args.downstream
    for table_path, result in ((table_sense, sense), (table_asense, asense)):
        saveMat(table_path, result[0], args.upstream, args.downstream,
                result[1], args.xbins, total)

    start = args.upstream / total * args.xbins

    r_invocation = ['R', '-q', '--slave', '-f %r' % r_script, '--args']
    r_files = [
        '%r' % path
        for path in (table_sense, table_asense, image_sense, image_asense)
    ]
    r_numbers = [
        0.98,  # hard-coded qvalue
        start,
        args.ypx,
        args.xpx,
    ]
    execute(r_invocation + r_files + r_numbers)

    if args.remove:
        for table_path in (table_sense, table_asense):
            os.remove(table_path)
def cleanup(self, keep_intermed=False):
    """Remove the files registered as temporary or intermediate.

    Nothing is removed when ``self._keep_all`` is set. Otherwise every entry
    of ``self._tmp_files`` is removed, followed by every entry of
    ``self._intermed_files`` unless ``keep_intermed`` is true.

    Args:
        keep_intermed (:obj:`boolean`): if set to True, intermediate files
            are not removed
    """

    def discard(paths, log_message):
        # Log each path at debug level, then delete it with `rm -rf`.
        for path in paths:
            logger.debug(log_message, path)
            execute('rm -rf %s' % path, exit=False)

    if self._keep_all:
        return

    discard(self._tmp_files,
            '%s is marked as a temporary file, cleaning up')

    if keep_intermed:
        return

    discard(self._intermed_files,
            '%s is marked as an intermediate file, cleaning up')
def run():
    """Command-line entry point: count k-mers per position and plot them.

    Loads the PAR-CLIP sites, optionally removes sites via
    ``remove_gff_sites``, sorts them descending by ``args.key`` and keeps the
    slice ``args.start:args.stop``. The sequences of those sites are passed to
    ``getKmerOccurences`` which writes the table that the R script
    ``plotKmerPerPosition.R`` (located next to this module) is run on. The
    table is deleted afterwards when ``args.remove`` is set.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    r_script = os.path.join(here, 'plotKmerPerPosition.R')

    args = create_parser().parse_args()

    sites = ParclipSiteContainer.from_file(args.inputfile)
    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)
    sites.sort(by=args.key, ascending=False)

    with EfficientGenome(args.genome) as genome:
        selected_sites = sites[args.start:args.stop]
        seqs = selected_sites.get_all_sequences(genome, args.width)

    prefix = '%s_kmerPerPosition_kmer%s_start%s_stop%s_width%s_sort_%s' % (
        args.prefix, args.kmer, args.start, args.stop, args.width, args.key)
    table_path = os.path.join(args.outdir, prefix + '.table')
    pdf_path = os.path.join(args.outdir, prefix + '.pdf')

    # Sequence length passed on: args.width on each side plus one position.
    seq_len = 2 * args.width + 1
    getKmerOccurences(seqs, seq_len, table_path, kmer=(args.kmer - 1),
                      verbose=args.verbose)

    r_invocation = ['R', '-q', '--slave', '-f %s' % r_script, '--args']
    r_arguments = [table_path, pdf_path, args.width, 0, args.width + 1]
    execute(r_invocation + r_arguments)

    if args.remove:
        os.remove(table_path)
def run():
    """Command-line entry point: run XXmotif on selected sites and plot results.

    Steps, in order:

    1. Load the PAR-CLIP sites, optionally remove sites via
       ``remove_gff_sites``, sort them descending by ``args.key`` and keep
       the slice ``args.start:args.stop``.
    2. Write the selected sites to a FASTA file in ``args.outdir``.
    3. Run ``XXmotif`` on that file (with ``--negSet`` when given).
    4. Run the ``plotDistribution.R`` script expected in ``<outdir>/tmp``.
    5. Run ``weblogo.R`` when ``args.plotPWM`` is greater than zero.
    6. Remove ``<outdir>/tmp`` unless ``args.keep_tmp_files`` is set.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    args = create_parser().parse_args()

    file_prefix = '%s_xxmotif_start%s_stop%s_width%s_sort_%s' % (
        args.prefix, args.start, args.stop, args.width, args.key)

    sites = ParclipSiteContainer.from_file(args.inputfile)
    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)
    sites.sort(by=args.key, ascending=False)
    sites = sites[args.start:args.stop]

    fasta_path = os.path.join(args.outdir, file_prefix + '.fa')
    with EfficientGenome(args.genome) as genome:
        sites.save2Fasta(genome, fasta_path, width=args.width)

    xxmotif_cmd = [
        'XXmotif',
        args.outdir,
        fasta_path,
        '--zoops',
        '--merge-motif-threshold LOW',
        '--max-match-positions 10',
    ]
    if args.negSet:
        xxmotif_cmd += ['--negSet %s' % args.negSet]
    execute(xxmotif_cmd)

    r_base = ['R', '-q', '--slave']

    # This script is looked up inside the tmp directory of the output folder.
    tmp_dir = os.path.join(args.outdir, 'tmp')
    distribution_script = os.path.join(tmp_dir, 'plotDistribution.R')
    execute(r_base + [
        '-f %r' % distribution_script,
        '--args',
        '%r' % args.outdir,
    ])

    if args.plotPWM > 0:
        weblogo_script = os.path.join(here, '..', 'plots', 'weblogo.R')
        pwm_path = os.path.join(args.outdir, file_prefix + '.pwm')
        execute(r_base + [
            '-f %s' % weblogo_script,
            '--args',
            pwm_path,
            args.outdir,
            file_prefix,
            args.plotPWM,
        ])

    if not args.keep_tmp_files:
        shutil.rmtree(tmp_dir, ignore_errors=True)
def execute(self):
    """Run the queued commands one after another, yielding each result.

    This is a generator: a command is only passed to the module-level
    ``execute`` function when the caller asks for the next item.
    """
    queued = iter(self._cmds)
    for command in queued:
        outcome = execute(command)
        yield outcome
def main(parclip, outdir, prefix, genomepath, negset, gfffile, kmer, key,
         useQuantiles, verbose, args):
    """Compute k-mer log-odds for bins of ranked PAR-CLIP sites and plot them.

    The sites are loaded from ``parclip``, optionally filtered with
    ``gfffile`` and sorted descending by ``key``. ``getkmerLogs`` is then
    called for a series of site ranges (bins); the collected results are
    written with ``sortAndSave`` and plotted by the R script
    ``plotKmerLogOdds.R`` located next to this module.

    Args:
        parclip: path of the PAR-CLIP site table.
        outdir: directory the table and PDF are written to.
        prefix: file name prefix of the outputs.
        genomepath: passed to ``EfficientGenome``.
        negset: passed to ``loadNegTable``.
        gfffile: passed to ``remove_gff_sites``; skipped when ``None``.
        kmer: k-mer length; also used to pick the entry ``kmer - 1`` of the
            ``functions.makekmers`` result.
        key: column the sites are sorted by.
        useQuantiles: if true, bins are 1000-site windows centred on a fixed
            list of quantile indices (after an initial bin 0..1000);
            otherwise consecutive bins of 1000 sites are used up to 50000.
        verbose: if true, progress is reported via ``functions.showProgress``.
        args: parsed command-line namespace; only ``keep_tmp_files`` is read.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(script_dir, 'plotKmerLogOdds.R')

    pc = ParclipSiteContainer.from_file(parclip)
    if gfffile is not None:
        pc.remove_gff_sites(gfffile)
    pc.sort(by=key, ascending=False)

    kmers = functions.makekmers(kmer, list('ACGT'))[kmer - 1]
    negfreq = loadNegTable(negset)

    with EfficientGenome(genomepath) as genomeseq:
        allfreqs = []
        fileprefix = '%s_logodds_%smer_sort_%s' % (prefix, kmer, key)

        if useQuantiles:
            fileprefix = fileprefix + '_quantiles'
            allfreqs.append(
                getkmerLogs(pc, genomeseq, negfreq, kmers, 0, 1000, 15))
            quantiles = [
                0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.125, 0.15, 0.175, 0.2,
                0.225, 0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.45, 0.5,
                0.55, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9
            ]
            stop = 1000
            # Label of the bin added last, used in the overlap warning. The
            # first bin (sites 0..1000) does not belong to any quantile.
            prev_label = 'top'
            for count, q in enumerate(quantiles, start=1):
                if verbose:
                    functions.showProgress(
                        count, len(quantiles),
                        'Getting kmer log-odds from quantiles...')
                old_stop = stop
                center = functions.getQuantileIndex(len(pc), q)
                start = max(center - 500, 0)
                stop = center + 500
                if stop > len(pc) - 2:
                    break
                # NOTE(review): as before, the overlap is measured from the
                # centre of the new bin rather than from its start - confirm
                # that this is intended.
                if center < old_stop:
                    msg_pat = 'Bin %s and %s are overlapping by %s sites!'
                    # Report the previous and the current bin; the previous
                    # code printed the current quantile twice.
                    msg = msg_pat % (prev_label, q, old_stop - center)
                    print(msg, file=sys.stderr)
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                prev_label = q
        else:
            maxsize = 50000
            stepsize = 1000
            start = 0
            stop = 1000
            while True:
                if stop > len(pc) - 2 or stop > maxsize:
                    print()
                    print('STOP at: %s' % stop)
                    break
                if verbose:
                    functions.showProgress(
                        stop, maxsize, 'Getting kmer log-odds from bins...')
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                start = stop
                stop = stop + stepsize

    table_file = os.path.join(outdir, fileprefix + '.table')
    pdf_file = os.path.join(outdir, fileprefix + '.pdf')
    sortAndSave(allfreqs, table_file, kmers)

    cmd = [
        'R',
        '-q',
        '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % table_file,
        '%r' % pdf_file,
    ]
    execute(cmd)

    if not args.keep_tmp_files:
        os.remove(table_file)
def main():
    """Command-line entry point: bootstrap occupancy profiles around both ends.

    Reads the PAR-CLIP table and the GFF annotation, keeps the GFF records
    whose length lies within ``[args.min_ts_len, args.max_ts_len]`` and, for
    the sense and the anti-sense orientation, writes

    * a table with the profile averaged over all kept records and
    * a table with one resampled profile per bootstrap iteration.

    The four tables are then plotted by the R script
    ``plotCenterBothEnds_bs.R`` located next to this module and removed
    afterwards when ``args.cleanup`` is set.
    """
    parser = create_parser()
    args = parser.parse_args()

    # Seed numpy's global RNG; the bootstrap draws below use np.random.choice.
    np.random.seed(args.seed)

    pc = ParclipSiteContainer.from_file(args.pc_table)

    gff_records = []
    with open(args.gff_file) as gff_handle:
        # The name `parser` is reused here; the argument parser is no longer
        # needed at this point.
        parser = GFF3Parser(gff_handle)
        for rec in parser.parse():
            # Keep only records whose length (end - start + 1) is in range.
            if args.min_ts_len <= rec.end - rec.start + 1 <= args.max_ts_len:
                gff_records.append(rec)

    # Length of the vectors that are averaged below.
    # NOTE(review): presumably each of the two windows contributes
    # flank + gene_bp + 1 positions - confirm against get_occ_profile.
    cut_len = args.downstream_bp + args.upstream_bp + 2 * args.gene_bp + 2
    gene_bp = args.gene_bp
    upstream_bp = args.upstream_bp
    downstream_bp = args.downstream_bp

    def aggregate_data(gff_rec, sense=True):
        """Return the smoothed profiles around both ends of one GFF record.

        The profile is queried on the record's own strand when ``sense`` is
        true and on the opposite strand otherwise. For records that are not
        on the '+' strand the windows are swapped and reversed. Both windows
        are smoothed with a centred rolling mean of ``args.smooth_window``
        and concatenated (upstream window first).
        """

        def rev(strand):
            # Flip the strand symbol; anything other than '-' maps to '-'.
            return '+' if strand == '-' else '-'

        chrom = gff_rec.seqid
        start = gff_rec.start
        end = gff_rec.end
        smooth_window = args.smooth_window

        anno_strand = gff_rec.strand
        query_strand = anno_strand if sense else rev(anno_strand)
        if anno_strand == '+':
            values_upstream = pc.get_occ_profile(chrom, start - upstream_bp,
                                                 start + gene_bp,
                                                 query_strand)
            values_dostream = pc.get_occ_profile(chrom, end - gene_bp,
                                                 end + downstream_bp,
                                                 query_strand)
        else:
            # The "upstream" window is taken around `end` and the
            # "downstream" window around `start`; both are reversed.
            values_upstream = pc.get_occ_profile(chrom, end - gene_bp,
                                                 end + upstream_bp,
                                                 query_strand)[::-1]
            values_dostream = pc.get_occ_profile(chrom,
                                                 start - downstream_bp,
                                                 start + gene_bp,
                                                 query_strand)[::-1]

        # min_periods=0 lets the rolling mean produce values at the borders.
        upstr = pd.Series(values_upstream).rolling(window=smooth_window,
                                                   center=True,
                                                   min_periods=0).mean()
        dostr = pd.Series(values_dostream).rolling(window=smooth_window,
                                                   center=True,
                                                   min_periods=0).mean()
        return np.hstack((upstr, dostr))

    def write_out_data(sense, out_file, bs_file):
        """Write the average profile and the bootstrap profiles to disk.

        ``out_file`` receives one tab-separated line with the profile
        averaged over all kept GFF records; ``bs_file`` receives one
        tab-separated line per bootstrap iteration.
        """
        # NOTE(review): data_dict is rebound at module level; presumably
        # calc_profile (defined elsewhere) reads it inside the pool workers -
        # confirm before changing this.
        global data_dict
        data_dict = {}
        for i, gff_rec in enumerate(gff_records):
            data = aggregate_data(gff_rec, sense)
            data_dict[i] = data

        ids = np.array(list(data_dict.keys()))

        # actual smoothed curve
        with open(out_file, 'w') as out:
            avg_vec = np.zeros(cut_len)
            for ts_ind in ids:
                avg_vec += data_dict[ts_ind]
            avg_vec /= len(ids)
            print(*avg_vec, sep='\t', file=out)

        with open(bs_file, 'w') as out:
            with Pool(args.n_processes) as pool:
                jobs = []
                for bs in range(args.n_bs_iterations):
                    # Draw as many record ids as there are records, with
                    # replacement (np.random.choice default).
                    ts_ind = np.random.choice(ids, size=len(ids))
                    job = pool.apply_async(calc_profile,
                                           args=(ts_ind, cut_len))
                    jobs.append(job)

                # Results are collected in submission order while the pool
                # is still open.
                for job in jobs:
                    res_vec = job.get()
                    print(*res_vec, sep='\t', file=out)

    prefix_fmt = '%s_centerBoth_up%s_gene%s_do%s_min%s_max%s'
    fmt_args = (
        args.prefix,
        args.upstream_bp,
        args.gene_bp,
        args.downstream_bp,
        args.min_ts_len,
        args.max_ts_len,
    )
    fn_prefix = prefix_fmt % fmt_args
    sense_table = os.path.join(args.outputdir, fn_prefix + '_sense.table')
    sense_bs_table = os.path.join(args.outputdir,
                                  fn_prefix + '_sense_bs.table')
    asense_table = os.path.join(args.outputdir, fn_prefix + '_asense.table')
    asense_bs_table = os.path.join(args.outputdir,
                                   fn_prefix + '_asense_bs.table')

    write_out_data(True, sense_table, sense_bs_table)
    write_out_data(False, asense_table, asense_bs_table)

    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotCenterBothEnds_bs.R')

    # Fall back to the file prefix when no title was given.
    if not args.title:
        title = args.prefix
    else:
        title = args.title

    outfile_pdf = os.path.join(args.outputdir,
                               fn_prefix + '_sm%s.pdf' % args.smooth_window)
    # String arguments are passed in their repr() form (%r), i.e. quoted.
    plot_cmd = [
        'R',
        '-q',
        '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % sense_table,
        '%r' % sense_bs_table,
        '%r' % asense_table,
        '%r' % asense_bs_table,
        '%r' % outfile_pdf,
        '%r' % title,
        args.upstream_bp,
        args.downstream_bp,
        args.gene_bp,
        args.smooth_window,
        '%r' % args.labelCenterA,
        '%r' % args.labelBody,
        '%r' % args.labelCenterB,
    ]
    execute(plot_cmd)

    if args.cleanup:
        os.remove(sense_table)
        os.remove(sense_bs_table)
        os.remove(asense_table)
        os.remove(asense_bs_table)
def run():
    """Command-line entry point: collect and plot profiles around both ends.

    ``main`` is called once for the sense and once for the anti-sense
    orientation, each writing its own table. Both tables are then plotted by
    the R script ``plotCenterBothEnds.R`` located next to this module and
    removed afterwards when ``args.remove`` is set. With ``args.verbose`` the
    time spent collecting the data is printed.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    r_script = os.path.join(here, 'plotCenterBothEnds.R')

    args = create_parser().parse_args()

    # Fall back to the file prefix when no title was given.
    title = args.title or args.prefix

    fn_prefix = '%s_centerBoth_up%s_gene%s_do%s_min%s_max%s' % (
        args.prefix,
        args.upstream,
        args.gene,
        args.downstream,
        args.min,
        args.max,
    )

    def in_outputdir(suffix):
        # All outputs live in the output directory and share fn_prefix.
        return os.path.join(args.outputdir, fn_prefix + suffix)

    table_sense = in_outputdir('_sense.table')
    table_asense = in_outputdir('_asense.table')
    pdf_path = in_outputdir('_sm%s.pdf' % args.plotSmooth)

    passes = (
        (table_sense, True, 'Collecting PAR-CLIP sense data'),
        (table_asense, False, 'Collecting PAR-CLIP anti-sense data'),
    )
    started = datetime.datetime.now()
    for table_path, use_sense, message in passes:
        main(args.parclip, table_path, args.gff, args.downstream,
             args.upstream, args.gene, use_sense, args.min, args.max,
             args.verbose, message)
    if args.verbose:
        elapsed = datetime.datetime.now() - started
        print()
        print('time: %s seconds' % elapsed.seconds)

    r_invocation = ['R', '-q', '--slave', '-f %r' % r_script, '--args']
    r_files = [
        '%r' % value
        for value in (table_sense, table_asense, pdf_path, title)
    ]
    r_numbers = [args.upstream, args.downstream, args.gene, args.plotSmooth]
    r_labels = [
        '%r' % label
        for label in (args.labelCenterA, args.labelBody, args.labelCenterB)
    ]
    execute(r_invocation + r_files + r_numbers + r_labels)

    if args.remove:
        for table_path in (table_sense, table_asense):
            os.remove(table_path)