def main(parclipfile, gfffile, upstream, downstream, sense, minSize, maxSize,
         verbose, xbins, ybins, vstring=''):
    """Build a row-averaged value matrix over size-sorted annotations.

    The annotations in ``gfffile`` are loaded with ``gff.GFF``, filtered with
    ``filterSize(minSize, maxSize)`` and ordered with ``sort2size()``. For
    every remaining annotation the values returned by ``pc.getValues`` are
    written into a row padded with ``-1`` and reduced to ``xbins`` columns by
    ``functions.shrinkValues``. Consecutive groups of rows are then averaged
    column-wise.

    Args:
        parclipfile: Path handed to ``ParclipSiteContainer.from_file``.
        gfffile: Path handed to ``gff.GFF``.
        upstream: Forwarded to ``getValues``; also part of the row width.
        downstream: Added to the annotation length for ``getValues``; also
            part of the row width.
        sense: Forwarded unchanged to ``getValues``.
        minSize: Lower bound passed to ``filterSize``.
        maxSize: Upper bound passed to ``filterSize``; also part of the row
            width.
        verbose: If true, report progress through ``functions.showProgress``
            and finish with an empty line.
        xbins: Number of columns requested from ``functions.shrinkValues``.
        ybins: Requested number of row groups; clamped to the number of
            annotations (with a warning) when it is not smaller.
        vstring: Message forwarded to ``functions.showProgress``.

    Returns:
        A tuple ``(smat, sannosize)``: the list of averaged rows and the
        list of averaged annotation lengths, one entry per row group. Both
        lists are empty when no annotation survives the size filter.
    """
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    totalsize = upstream + maxSize + 1 + downstream
    anno.sort2size()
    pc = ParclipSiteContainer.from_file(parclipfile)
    n_entries = anno.size()

    mat = []
    annosize = []
    for g in range(n_entries):
        if verbose:
            functions.showProgress(g, n_entries - 1, vstring)
        length = anno.stop[g] - anno.start[g]
        # The two strands differ only in which coordinate anchors the query.
        anchor = anno.start[g] if anno.strand[g] == '+' else anno.stop[g]
        values = pc.getValues(anno.chr[g], anchor, anno.strand[g], sense,
                              upstream, length + downstream)
        row = [-1] * totalsize
        if values is not None:
            # Overwrite exactly len(values) slots. The previous slice end of
            # len(values) - 1 replaced n - 1 slots with n values and thereby
            # lengthened the row by one element.
            row[:len(values)] = values
        mat.append(functions.shrinkValues(row, xbins))
        annosize.append(length)

    if ybins >= n_entries:
        print('Warning: --ybins >= entries in ' + gfffile)
        ybins = n_entries

    smat = []
    sannosize = []
    # Without annotations there is nothing to average (and ybins is 0 here).
    if n_entries > 0:
        ystep = round(n_entries / ybins)
        ystart = 0
        ystop = ystep
        # NOTE(review): the strict '<' skips a group that ends exactly at the
        # last row; kept as-is because the intended bin count is unclear.
        while ystop < n_entries:
            group = range(ystart, ystop)
            count = len(group)
            smat.append([sum(mat[j][i] for j in group) / count
                         for i in range(xbins)])
            # Independent of the column, so computed once per group.
            sannosize.append(sum(annosize[j] for j in group) / count)
            ystart = ystop
            ystop += ystep

    if verbose:
        # Terminate the progress output; this used to sit after the return
        # statement and was never executed.
        print()
    return smat, sannosize
def run():
    """Command-line entry point: k-mer occurrence table plus R plot.

    Parses the arguments from ``create_parser()``, loads and optionally
    GFF-filters the sites, sorts them by ``args.key`` (descending), takes the
    ``args.start:args.stop`` slice and fetches its sequences. The table
    written by ``getKmerOccurences`` is passed to ``plotKmerPerPosition.R``
    and removed afterwards when ``args.remove`` is set.
    """
    args = create_parser().parse_args()
    script_dir = os.path.dirname(os.path.realpath(__file__))
    r_script = os.path.join(script_dir, 'plotKmerPerPosition.R')

    sites = ParclipSiteContainer.from_file(args.inputfile)
    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)
    sites.sort(by=args.key, ascending=False)

    with EfficientGenome(args.genome) as genome:
        sites = sites[args.start:args.stop]
        seqs = sites.get_all_sequences(genome, args.width)

    name_parts = (args.prefix, args.kmer, args.start, args.stop, args.width,
                  args.key)
    prefix = ('%s_kmerPerPosition_kmer%s_start%s_stop%s_width%s_sort_%s'
              % name_parts)
    out_base = os.path.join(args.outdir, prefix)
    outfile_table = out_base + '.table'
    outfile_pdf = out_base + '.pdf'

    # Sequence length passed on is 2 * width + 1.
    getKmerOccurences(seqs, 2 * args.width + 1, outfile_table,
                      kmer=(args.kmer - 1), verbose=args.verbose)

    r_call = ['R', '-q', '--slave', '-f %s' % r_script, '--args']
    r_call += [outfile_table, outfile_pdf, args.width, 0, args.width + 1]
    execute(r_call)

    if args.remove:
        os.remove(outfile_table)
def main(input_file, output_file, q):
    """Cap every site occupancy at the ``q``-quantile and save the result.

    Args:
        input_file: Path handed to ``ParclipSiteContainer.from_file``.
        output_file: Path handed to ``save2File`` of the new container.
        q: Quantile with ``0 <= q < 1``; any other value prints a message
            and exits with status 1.
    """
    q_is_valid = 0 <= q < 1
    if not q_is_valid:
        print('q must lie between 0 and 1 - got %s' % q)
        sys.exit(1)

    sites = ParclipSiteContainer.from_file(input_file)
    occupancies = [site.occupancy for site in sites]

    if occupancies:
        cap = functions.getQuantile(occupancies, q)
        capped = [
            site._replace(occupancy=cap) if site.occupancy > cap else site
            for site in sites
        ]
    else:
        # Empty input: skip the quantile computation altogether.
        capped = []

    ParclipSiteContainer(capped).save2File(output_file)
def run():
    """Command-line entry point: export site sequences and run XXmotif.

    Parses the arguments from ``create_parser()``, loads and optionally
    GFF-filters the sites, sorts them by ``args.key`` (descending), takes the
    ``args.start:args.stop`` slice and writes it as FASTA. ``XXmotif`` is
    then executed on that file, followed by the ``plotDistribution.R``
    script from the ``tmp`` directory and, when ``args.plotPWM > 0``,
    ``weblogo.R``. The ``tmp`` directory is removed unless
    ``args.keep_tmp_files`` is set.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    args = create_parser().parse_args()

    name_parts = (args.prefix, args.start, args.stop, args.width, args.key)
    file_prefix = '%s_xxmotif_start%s_stop%s_width%s_sort_%s' % name_parts

    sites = ParclipSiteContainer.from_file(args.inputfile)
    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)
    sites.sort(by=args.key, ascending=False)
    sites = sites[args.start:args.stop]

    fasta_path = os.path.join(args.outdir, file_prefix + '.fa')
    with EfficientGenome(args.genome) as genome:
        sites.save2Fasta(genome, fasta_path, width=args.width)

    xxmotif_call = ['XXmotif', args.outdir, fasta_path]
    xxmotif_call += [
        '--zoops',
        '--merge-motif-threshold LOW',
        '--max-match-positions 10',
    ]
    if args.negSet:
        xxmotif_call.append('--negSet %s' % args.negSet)
    execute(xxmotif_call)

    tmp_dir = os.path.join(args.outdir, 'tmp')
    distribution_script = os.path.join(tmp_dir, 'plotDistribution.R')
    execute([
        'R', '-q', '--slave',
        '-f %r' % distribution_script,
        '--args',
        '%r' % args.outdir,
    ])

    if args.plotPWM > 0:
        weblogo_script = os.path.join(script_dir, '..', 'plots', 'weblogo.R')
        pwm_path = os.path.join(args.outdir, file_prefix + '.pwm')
        execute([
            'R', '-q', '--slave',
            '-f %s' % weblogo_script,
            '--args',
            pwm_path,
            args.outdir,
            file_prefix,
            args.plotPWM,
        ])

    if not args.keep_tmp_files:
        shutil.rmtree(tmp_dir, ignore_errors=True)
def _get_container(self):
    """Return the container loaded from ``TABLE_DIR`` via ``from_file``."""
    return ParclipSiteContainer.from_file(TABLE_DIR)
def main(parclip, outdir, prefix, genomepath, negset, gfffile, kmer, key,
         useQuantiles, verbose, args):
    """Compute k-mer log-odds for bins of sorted sites and plot them.

    The sites are loaded from ``parclip``, optionally GFF-filtered and
    sorted by ``key`` in descending order. ``getkmerLogs`` is then called
    for a series of index ranges of the sorted container:

    * ``useQuantiles`` true: the range 0-1000 first, then one range of
      +/- 500 sites around the index that ``functions.getQuantileIndex``
      returns for each entry of a fixed quantile list. The series ends at
      the first range reaching past ``len(pc) - 2``.
    * otherwise: consecutive ranges of 1000 sites, ending once the range
      end exceeds ``len(pc) - 2`` or 50000.

    The results are written with ``sortAndSave`` and plotted by
    ``plotKmerLogOdds.R``. The table is deleted afterwards unless
    ``args.keep_tmp_files`` is set.

    Args:
        parclip: Path handed to ``ParclipSiteContainer.from_file``.
        outdir: Directory for the ``.table`` and ``.pdf`` output files.
        prefix: Leading part of the output file names.
        genomepath: Path handed to ``EfficientGenome``.
        negset: Argument handed to ``loadNegTable``.
        gfffile: GFF file for ``remove_gff_sites``, or ``None`` to skip
            that step.
        kmer: k-mer length; selects entry ``kmer - 1`` of the result of
            ``functions.makekmers``.
        key: Sort key for the sites; also part of the output file names.
        useQuantiles: Selects quantile-centred ranges instead of
            consecutive ones.
        verbose: If true, report progress via ``functions.showProgress``.
        args: Parsed arguments; only ``keep_tmp_files`` is read.
    """
    half_bin = 500
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotKmerLogOdds.R')

    pc = ParclipSiteContainer.from_file(parclip)
    if gfffile is not None:
        pc.remove_gff_sites(gfffile)
    pc.sort(by=key, ascending=False)
    kmers = functions.makekmers(kmer, list('ACGT'))[kmer - 1]
    negfreq = loadNegTable(negset)
    n_sites = len(pc)

    allfreqs = []
    fileprefix = '%s_logodds_%smer_sort_%s' % (prefix, kmer, key)
    with EfficientGenome(genomepath) as genomeseq:
        if useQuantiles:
            fileprefix = fileprefix + '_quantiles'
            allfreqs.append(
                getkmerLogs(pc, genomeseq, negfreq, kmers, 0, 1000, 15))
            quantiles = [
                0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.125, 0.15, 0.175, 0.2,
                0.225, 0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.45, 0.5,
                0.55, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9
            ]
            stop = 1000
            # Label of the bin appended last, used by the overlap warning.
            # The leading bin has no quantile, so it is named by its range.
            previous_bin = '0-1000'
            for count, q in enumerate(quantiles, start=1):
                if verbose:
                    functions.showProgress(
                        count, len(quantiles),
                        'Getting kmer log-odds from quantiles...')
                old_stop = stop
                center = functions.getQuantileIndex(n_sites, q)
                start = max(center - half_bin, 0)
                stop = center + half_bin
                if stop > n_sites - 2:
                    break
                # NOTE(review): the comparison uses the bin centre rather
                # than its start, as before; confirm that this is intended.
                if center < old_stop:
                    msg_pat = 'Bin %s and %s are overlapping by %s sites!'
                    # Name the previous and the current bin; the old message
                    # printed the current quantile twice.
                    msg = msg_pat % (previous_bin, q, old_stop - center)
                    print(msg, file=sys.stderr)
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                previous_bin = q
        else:
            maxsize = 50000
            stepsize = 1000
            start = 0
            stop = 1000
            while True:
                if stop > n_sites - 2 or stop > maxsize:
                    print()
                    print('STOP at: %s' % stop)
                    break
                if verbose:
                    functions.showProgress(
                        stop, maxsize, 'Getting kmer log-odds from bins...')
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                start = stop
                stop = stop + stepsize

    table_file = os.path.join(outdir, fileprefix + '.table')
    pdf_file = os.path.join(outdir, fileprefix + '.pdf')
    sortAndSave(allfreqs, table_file, kmers)
    cmd = [
        'R', '-q', '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % table_file,
        '%r' % pdf_file,
    ]
    execute(cmd)
    if not args.keep_tmp_files:
        os.remove(table_file)