def main(parclipA, parclipB, outfile, width, verbose):
    """Write a bin-by-bin Jaccard-index matrix for two PAR-CLIP site tables.

    Both tables are split into ten bins with ``getEntries`` using the
    quantile bounds listed in ``quantiles``.  For every (A bin, B bin) pair
    the B sites for which ``aq.exactSearch(..., width=width)[1]`` is truthy
    are counted as the intersection, and
    ``intersect / (|A bin| + |B bin| - intersect)`` is written to *outfile*.
    The file holds one row per A bin and one tab-terminated column per B
    bin, each value rounded to four decimals.

    Args:
        parclipA: path of the first site table (rows of the matrix).
        parclipB: path of the second site table (columns of the matrix).
        outfile: path of the text file to write.
        width: forwarded to ``exactSearch`` as ``width``.
        verbose: if true, report progress via ``functions.showProgress``.
    """
    quantiles = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    n_bins = len(quantiles) - 1
    total = n_bins * n_bins
    total_count = 0
    if verbose:
        functions.showProgress(total_count, total, 'Calculating Jaccard-Index')

    # The context manager closes the file even if a lookup raises.
    with open(outfile, 'w') as fc:
        # The B bins do not depend on the A bin, so build them once instead
        # of reloading and re-binning table B for every A bin.  Each bin
        # still gets a freshly loaded container, exactly as before.
        b_bins = []
        for q2 in range(n_bins):
            b = ParclipSiteContainer()
            b.loadFromFile(parclipB)
            b_bins.append(getEntries(b, quantiles[q2], quantiles[q2 + 1]))

        for q1 in range(n_bins):
            a = ParclipSiteContainer()
            a.loadFromFile(parclipA)
            aq = getEntries(a, quantiles[q1], quantiles[q1 + 1])
            for bq in b_bins:
                intersect = 0
                for j in range(bq.size()):
                    if aq.exactSearch(bq.chrs[j], bq.pos[j], bq.strand[j],
                                      width=width)[1]:
                        intersect += 1
                union = aq.size() + bq.size() - intersect
                # Two empty bins used to raise ZeroDivisionError; report 0.
                jaccard = intersect / union if union else 0.0
                fc.write(str(round(jaccard, 4)) + '\t')
                total_count += 1
                if verbose:
                    functions.showProgress(total_count, total,
                                           'Calculating Jaccard-Index')
            fc.write('\n')
    print('')
def main(parclipfile, gfffile, upstream, downstream, sense, minSize, maxSize,
         verbose, xbins, ybins, vstring=''):
    """Build a binned value matrix around GFF annotations.

    For every annotation (after ``filterSize`` and ``sort2size``) the values
    returned by ``pc.getValues`` are copied into a row pre-filled with
    ``-1`` and reduced with ``functions.shrinkValues(row, xbins)``.
    Consecutive rows are then averaged in groups of
    ``round(anno.size() / ybins)`` rows.

    Args:
        parclipfile: path of the PAR-CLIP site table.
        gfffile: path of the GFF annotation file.
        upstream, downstream: window extensions passed on to ``getValues``.
        sense: forwarded unchanged to ``getValues``.
        minSize, maxSize: bounds passed to ``anno.filterSize``.
        verbose: if true, report progress via ``functions.showProgress``.
        xbins: number of columns requested from ``shrinkValues``.
        ybins: requested number of row groups; clamped to the number of
            annotations (with a warning) when it is larger or equal.
        vstring: label shown next to the progress bar.

    Returns:
        ``(smat, sannosize)``: the averaged rows and, per averaged row, the
        mean of ``stop - start`` of the annotations that went into it.
        Both are empty when no annotation survives the size filter.
    """
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    totalsize = upstream + maxSize + 1 + downstream
    anno.sort2size()
    pc = ParclipSiteContainer.from_file(parclipfile)

    mat = []
    annosize = []
    for g in range(anno.size()):
        tmp = [-1] * totalsize
        if verbose:
            functions.showProgress(g, (anno.size() - 1), vstring)
        length = anno.stop[g] - anno.start[g]
        # Plus-strand annotations are anchored at start, all others at stop.
        anchor = anno.start[g] if anno.strand[g] == '+' else anno.stop[g]
        values = pc.getValues(anno.chr[g], anchor, anno.strand[g], sense,
                              upstream, length + downstream)
        if values is not None:
            # NOTE(review): the slice end is len(values) - 1, so the row
            # grows by one element on assignment - confirm this is intended.
            tmp[0:(len(values) - 1)] = values
        # A row is stored for every annotation so that mat and annosize stay
        # index-aligned with the annotation list used below.
        mat.append(functions.shrinkValues(tmp, xbins))
        annosize.append(length)
    # Terminate the progress line.  This statement used to sit after the
    # return and was never executed.
    if verbose:
        print()

    smat = []
    sannosize = []
    if ybins >= anno.size():
        print('Warning: --ybins >= entries in ' + gfffile)
        ybins = anno.size()
    if anno.size() == 0:
        # Nothing to bin; previously this divided by zero.
        return smat, sannosize
    ystep = round(anno.size() / ybins)
    ystart = 0
    ystop = ystep
    # NOTE(review): the strict '<' means rows from the last step boundary on
    # are never binned once ystop reaches anno.size() - confirm intent.
    while ystop < anno.size():
        rows = range(ystart, ystop)
        count = len(rows)
        smat.append([sum(mat[j][i] for j in rows) / count
                     for i in range(xbins)])
        # Computed once per row group instead of once per column.
        sannosize.append(sum(annosize[j] for j in rows) / count)
        ystart = ystop
        ystop += ystep
    return smat, sannosize
def main(inputfile, outputfile):
    """Rewrite the ``occ`` column of a site table as ``m / r``.

    Exits with status -1 (after printing a message) when *inputfile* is not
    an existing file.  Otherwise the table is loaded, every ``occ`` entry is
    replaced by the quotient of the matching ``m`` and ``r`` entries, and
    the result is saved to *outputfile*.
    """
    if not os.path.isfile(inputfile):
        print('Inputfile: '+inputfile+' does not exist')
        sys.exit(-1)

    table = ParclipSiteContainer()
    table.loadFromFile(inputfile)
    for idx in range(table.size()):
        numerator = table.m[idx]
        denominator = table.r[idx]
        table.occ[idx] = numerator / denominator
    table.save2File(outputfile)
def main(input_file, output_file, q):
    """Cap the occupancy of every site at the *q* quantile and save the table.

    *q* must satisfy ``0 <= q < 1``; otherwise a message is printed and the
    process exits with status 1.  Records whose occupancy exceeds
    ``functions.getQuantile(all occupancies, q)`` get that value instead;
    all other records are passed through unchanged.
    """
    if not (0 <= q < 1):
        print('q must lie between 0 and 1 - got %s' % q)
        sys.exit(1)

    sites = ParclipSiteContainer.from_file(input_file)

    occupancies = [site.occupancy for site in sites]
    # An empty table has no quantile; the cap is only looked up while
    # iterating over records, so leaving it unset is safe in that case.
    if occupancies:
        cap = functions.getQuantile(occupancies, q)

    capped = [
        site._replace(occupancy=cap) if site.occupancy > cap else site
        for site in sites
    ]
    ParclipSiteContainer(capped).save2File(output_file)
def getEntries(sites, q1, q2):
    """Return a new container holding the sites of one ``occ`` quantile bin.

    A site is kept when ``lower < occ <= upper``, where ``lower`` and
    ``upper`` are ``functions.getQuantile(sites.occ, q1)`` and
    ``functions.getQuantile(sites.occ, q2)``.  *sites* itself is only read.

    NOTE(review): the lower bound is exclusive, so sites whose ``occ``
    equals the ``q1`` quantile are left out - confirm this is intended for
    the lowest bin.

    Args:
        sites: container to select from.
        q1: quantile giving the exclusive lower bound.
        q2: quantile giving the inclusive upper bound.

    Returns:
        A ``ParclipSiteContainer`` with the selected sites, after
        ``getChromosomePositions()`` has been called on it.
    """
    lower = functions.getQuantile(sites.occ, q1)
    upper = functions.getQuantile(sites.occ, q2)
    pc = ParclipSiteContainer()
    for i in range(sites.size()):
        if lower < sites.occ[i] <= upper:
            pc.addSite(sites.chrs[i], sites.pos[i], sites.m[i], sites.r[i],
                       sites.result[i], sites.strand[i], sites.occ[i])
    pc.getChromosomePositions()
    return pc
def run():
    """Command-line entry point: tabulate and plot kmers per position.

    Loads the site table named on the command line, optionally removes
    sites via ``remove_gff_sites``, sorts by ``args.key`` in descending
    order, takes the ``[start:stop]`` slice and fetches its sequences.
    ``getKmerOccurences`` writes the table, which is then plotted with the
    R script ``plotKmerPerPosition.R`` located next to this file.  The
    table is deleted afterwards when ``args.remove`` is set.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    r_script = os.path.join(here, 'plotKmerPerPosition.R')

    args = create_parser().parse_args()

    sites = ParclipSiteContainer.from_file(args.inputfile)
    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)
    sites.sort(by=args.key, ascending=False)

    with EfficientGenome(args.genome) as genome:
        sites = sites[args.start:args.stop]
        seqs = sites.get_all_sequences(genome, args.width)

    prefix_fmt = '%s_kmerPerPosition_kmer%s_start%s_stop%s_width%s_sort_%s'
    name_parts = (args.prefix, args.kmer, args.start, args.stop, args.width,
                  args.key)
    prefix = prefix_fmt % name_parts
    outfile_table = os.path.join(args.outdir, prefix + '.table')
    outfile_pdf = os.path.join(args.outdir, prefix + '.pdf')

    # Sequences are passed with a length of 2 * width + 1.
    getKmerOccurences(seqs, 2 * args.width + 1, outfile_table,
                      kmer=(args.kmer - 1), verbose=args.verbose)

    plot_cmd = ['R', '-q', '--slave', '-f %s' % r_script, '--args']
    plot_cmd += [outfile_table, outfile_pdf, args.width, 0, args.width + 1]
    execute(plot_cmd)

    if args.remove:
        os.remove(outfile_table)
def main(parclipfile, outputfile, gfffile, downstream, upstream, gene, sense,
         minSize, maxSize, verbose, vstring=''):
    """Write one tab-separated row of values per annotation.

    For each annotation passing ``filterSize`` two ``getValues`` windows are
    requested: one with extents ``(upstream, gene)`` and one with extents
    ``(gene, downstream)``.  On the '+' strand the first is anchored at the
    annotation start and the second at its stop; otherwise the anchors are
    swapped.  A row is written only when both lookups return a result.
    """
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)

    pc = ParclipSiteContainer()
    pc.loadFromFile(parclipfile)

    with open(outputfile, 'w') as fc_out:
        for g in range(anno.size()):
            if verbose:
                functions.showProgress(g, (anno.size() - 1), vstring)

            on_plus = anno.strand[g] == '+'
            first_anchor = anno.start[g] if on_plus else anno.stop[g]
            second_anchor = anno.stop[g] if on_plus else anno.start[g]

            head = pc.getValues(anno.chr[g], first_anchor, anno.strand[g],
                                sense, upstream, gene)
            tail = pc.getValues(anno.chr[g], second_anchor, anno.strand[g],
                                sense, gene, downstream)
            if head is None or tail is None:
                continue
            print(*chain(head, tail), sep='\t', file=fc_out)
    if verbose:
        print()
def run():
    """Command-line entry point: run XXmotif on sequences around sites.

    The selected sites (optionally GFF-filtered, sorted by ``args.key`` in
    descending order, sliced to ``[start:stop]``) are saved as FASTA.
    XXmotif is run on that file, the ``plotDistribution.R`` script in the
    output ``tmp`` directory is executed, and - when ``args.plotPWM > 0`` -
    ``weblogo.R`` is run on the PWM file.  The ``tmp`` directory is removed
    unless ``args.keep_tmp_files`` is set.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    args = create_parser().parse_args()

    name_fields = (args.prefix, args.start, args.stop, args.width, args.key)
    file_prefix = '%s_xxmotif_start%s_stop%s_width%s_sort_%s' % name_fields

    sites = ParclipSiteContainer.from_file(args.inputfile)
    if args.filterGFF != '':
        sites.remove_gff_sites(args.filterGFF, args.awidth)
    sites.sort(by=args.key, ascending=False)
    sites = sites[args.start:args.stop]

    gen_file = os.path.join(args.outdir, file_prefix + '.fa')
    with EfficientGenome(args.genome) as genome:
        sites.save2Fasta(genome, gen_file, width=args.width)

    # Motif search.
    xxmotif_cmd = ['XXmotif', args.outdir, gen_file]
    xxmotif_cmd.extend([
        '--zoops',
        '--merge-motif-threshold LOW',
        '--max-match-positions 10',
    ])
    if args.negSet:
        xxmotif_cmd.append('--negSet %s' % args.negSet)
    execute(xxmotif_cmd)

    # Distribution plot, using the script inside the tmp directory.
    tmp_dir = os.path.join(args.outdir, 'tmp')
    mini_plot_script = os.path.join(tmp_dir, 'plotDistribution.R')
    execute([
        'R',
        '-q',
        '--slave',
        '-f %r' % mini_plot_script,
        '--args',
        '%r' % args.outdir,
    ])

    # Optional PWM logo plot.
    plot_script = os.path.join(here, '..', 'plots', 'weblogo.R')
    pwm_file = os.path.join(args.outdir, file_prefix + '.pwm')
    plot_cmd = ['R', '-q', '--slave', '-f %s' % plot_script, '--args',
                pwm_file, args.outdir, file_prefix, args.plotPWM]
    if args.plotPWM > 0:
        execute(plot_cmd)

    if not args.keep_tmp_files:
        shutil.rmtree(tmp_dir, ignore_errors=True)
def main(parclipA, parclipB, start, stop, width, anno=None, annowidth=100,
         logRatio=False, verbose=False):
    """Return the mean co-localisation value of table B around sites of A.

    Table A is sorted by ``'occ'`` and up to ``stop - start`` sites are
    collected beginning at index *start*; when *anno* is given only sites
    for which ``anno.isInside(..., annowidth, annowidth)[1]`` is truthy are
    taken.  For every collected site the maximum of
    ``dataB.getValues(..., True, width, width)`` is accumulated.  Both the
    sum and the counter start at 1, so the result is
    ``(1 + sum of maxima) / (1 + number of sites with values)``.

    Args:
        parclipA: path of the table providing the query sites.
        parclipB: path of the table providing the values.
        start, stop: index range into the sorted table A; the process exits
            with status 1 if ``start < 0``, ``stop < start`` or
            ``stop >= number of sites``.
        width: window passed twice to ``getValues``.
        anno: optional annotation object offering ``isInside``.
        annowidth: margin passed twice to ``isInside``.
        logRatio: if true, return ``log2(result / median of dataB.occ)``.
        verbose: if true, report progress via ``functions.showProgress``.

    Returns:
        The mean described above, or its log2 ratio when *logRatio* is set.
    """
    tmpA = ParclipSiteContainer()
    dataB = ParclipSiteContainer()
    tmpA.loadFromFile(parclipA)
    tmpA.sort(key='occ')
    dataB.loadFromFile(parclipB)

    if start < 0 or stop < start or stop >= tmpA.size():
        print('Bullshit start and stop indices. Come on! Concentrate!')
        # Invalid arguments are an error: exit non-zero (was status 0).
        sys.exit(1)

    dataA = parclipsites.ParclipSites('')
    total = stop - start
    count = 0
    i = start
    # NOTE(review): the bound tmpA.size() - 1 never visits the last site -
    # confirm whether that is intended.
    while count < total and i < (tmpA.size() - 1):
        if verbose:
            functions.showProgress(count, total - 1, 'Selecting PAR-CLIP sites')
        # Identity test: "== None" would invoke a custom __eq__ on anno.
        if anno is None or anno.isInside(tmpA.chrs[i], tmpA.pos[i],
                                         tmpA.strand[i], annowidth,
                                         annowidth)[1]:
            dataA.addSite(tmpA.chrs[i], tmpA.pos[i], tmpA.m[i], tmpA.r[i],
                          tmpA.result[i], tmpA.strand[i], tmpA.occ[i])
            count += 1
        i += 1

    # Both accumulators start at 1, as in the original implementation.
    coloc = 1
    count_coloc = 1
    if verbose:
        print('\n')
    for i in range(dataA.size()):
        values = dataB.getValues(dataA.chrs[i], dataA.pos[i], dataA.strand[i],
                                 True, width, width)
        # "!= None" is evaluated element-wise by array-like results.
        if values is not None:
            count_coloc += 1
            coloc += max(values)
        if verbose:
            functions.showProgress(i, (dataA.size() - 1),
                                   'Collecting colocolization data')
    coloc = coloc / count_coloc
    if verbose:
        print('')

    if logRatio:
        return math.log(coloc / functions.getQuantile(dataB.occ, 0.5), 2)
    return coloc
def _get_container(self):
    """Return the site container loaded from the path in ``TABLE_DIR``."""
    return ParclipSiteContainer.from_file(TABLE_DIR)
def main(inputfile, outputfile, gfffile, gffmin, gffmax, takeStop, upstream,
         downstream, verbose):
    """Save the sites that lie around annotation starts (or stops).

    The annotations are size-filtered with ``filterSize(gffmin, gffmax)``.
    A site is copied to the output table when
    ``anno.isAround(chr, pos, strand, takeStart, upstream, downstream)[1]``
    is truthy, where ``takeStart`` is the negation of *takeStop*.

    Args:
        inputfile: path of the site table to filter.
        outputfile: path of the table to write.
        gfffile: path of the GFF annotation file.
        gffmin, gffmax: bounds passed to ``anno.filterSize``.
        takeStop: if true, ``isAround`` is called with ``takeStart=False``.
        upstream, downstream: forwarded to ``isAround``.
        verbose: if true, show progress each time the integer percentage of
            processed sites increases.
    """
    takeStart = not takeStop

    sites = ParclipSiteContainer()
    sites.loadFromFile(inputfile)

    anno = gff.GFF(gfffile)
    anno.filterSize(gffmin, gffmax)
    anno.getChromosomePositions()
    if anno.size() < 10:
        print('Warning: Low number of annotation enries! ' + str(anno.size()))

    fsites = ParclipSiteContainer()
    n_sites = sites.size()
    percent_old = 0
    for i in range(n_sites):
        if anno.isAround(sites.chrs[i], sites.pos[i], sites.strand[i],
                         takeStart, upstream, downstream)[1]:
            fsites.addSite(sites.chrs[i], sites.pos[i], sites.m[i],
                           sites.r[i], sites.result[i], sites.strand[i],
                           sites.occ[i])
        percent_new = round(i / n_sites * 100)
        if percent_new > percent_old:
            if verbose:
                # The loop runs over sites, so the bar's total must be the
                # last site index (it used to be the annotation count).
                functions.showProgress(i, n_sites - 1, 'selecting sites')
            percent_old = percent_new
    fsites.save2File(outputfile)
def main(parclip, outdir, prefix, genomepath, negset, gfffile, kmer, key,
         useQuantiles, verbose, args):
    """Compute kmer log-odds over bins of sorted sites, then save and plot.

    The site table is optionally GFF-filtered and sorted by *key* in
    descending order.  ``getkmerLogs`` is evaluated on a series of index
    ranges:

    * with *useQuantiles*: the range ``[0, 1000)`` followed by ranges of
      +/- 500 indices around ``functions.getQuantileIndex(len(pc), q)`` for
      each entry of ``quantiles``; the series stops at the first range
      whose end exceeds ``len(pc) - 2``;
    * otherwise: consecutive ranges of 1000 indices until the end exceeds
      ``len(pc) - 2`` or 50000.

    The results are written with ``sortAndSave`` and plotted with
    ``plotKmerLogOdds.R`` (located next to this file).  The table is removed
    afterwards unless ``args.keep_tmp_files`` is set.

    Args:
        parclip: path of the PAR-CLIP site table.
        outdir: directory for the table and PDF.
        prefix: file name prefix.
        genomepath: path handed to ``EfficientGenome``.
        negset: path handed to ``loadNegTable``.
        gfffile: GFF used for site removal, or ``None`` to skip filtering.
        kmer: kmer length.
        key: sort key for the sites.
        useQuantiles: selects the binning scheme described above.
        verbose: if true, report progress via ``functions.showProgress``.
        args: parsed arguments; only ``keep_tmp_files`` is read.
    """
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotKmerLogOdds.R')

    pc = ParclipSiteContainer.from_file(parclip)
    if gfffile is not None:
        pc.remove_gff_sites(gfffile)
    pc.sort(by=key, ascending=False)
    n_sites = len(pc)

    kmers = functions.makekmers(kmer, list('ACGT'))[kmer - 1]
    negfreq = loadNegTable(negset)

    allfreqs = []
    fileprefix = '%s_logodds_%smer_sort_%s' % (prefix, kmer, key)
    with EfficientGenome(genomepath) as genomeseq:
        if useQuantiles:
            fileprefix = fileprefix + '_quantiles'
            allfreqs.append(
                getkmerLogs(pc, genomeseq, negfreq, kmers, 0, 1000, 15))
            quantiles = [
                0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.125, 0.15, 0.175, 0.2,
                0.225, 0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.45, 0.5,
                0.55, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9
            ]
            stop = 1000
            # Label of the previously stored bin for the overlap warning.
            # The leading [0, 1000) bin has no quantile and is reported as 0.
            prev_q = 0
            for count, q in enumerate(quantiles, start=1):
                if verbose:
                    functions.showProgress(
                        count, len(quantiles),
                        'Getting kmer log-odds from quantiles...')
                old_stop = stop
                center = functions.getQuantileIndex(n_sites, q)
                start = max(center - 500, 0)
                stop = center + 500
                if stop > n_sites - 2:
                    break
                if (stop - 500) < old_stop:
                    # Name the previous and the current bin; the message
                    # used to print the current quantile twice.
                    msg_pat = 'Bin %s and %s are overlapping by %s sites!'
                    msg = msg_pat % (prev_q, q, old_stop - (stop - 500))
                    print(msg, file=sys.stderr)
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                prev_q = q
        else:
            maxsize = 50000
            stepsize = 1000
            start = 0
            stop = 1000
            while stop <= n_sites - 2 and stop <= maxsize:
                if verbose:
                    functions.showProgress(
                        stop, maxsize, 'Getting kmer log-odds from bins...')
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop,
                                15))
                start = stop
                stop = stop + stepsize
            print()
            print('STOP at: %s' % stop)

    table_file = os.path.join(outdir, fileprefix + '.table')
    pdf_file = os.path.join(outdir, fileprefix + '.pdf')
    sortAndSave(allfreqs, table_file, kmers)

    cmd = [
        'R',
        '-q',
        '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % table_file,
        '%r' % pdf_file,
    ]
    execute(cmd)
    if not args.keep_tmp_files:
        os.remove(table_file)
from mockinbird.utils import ParclipSiteContainer

# Script: save genomic sequences around PAR-CLIP sites as a FASTA file.
# NOTE(review): argparse, genome and gff are used below but their imports
# are not visible in this part of the file - confirm they exist.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Takes PAR-CLIP sites and a genome and saves genomic sequences as fasta file around PAR-CLIP sites according to the given parameters.', epilog="contact: [email protected]")
    parser.add_argument('sites', help='PAR-CLIP file *.table')
    parser.add_argument('genome', help='path to genome')
    parser.add_argument('fafile', help='output filename')
    # argparse ignores ``default`` on a positional argument unless it is
    # optional, so the defaults advertised in the help texts were never
    # applied.  nargs='?' makes the trailing positionals optional; a command
    # line that supplies all of them is parsed exactly as before.
    parser.add_argument('filterGFF', nargs='?', help='set path to GFF if sites should be removed that overlap with the GFF [default = '']', default='')
    parser.add_argument('start', nargs='?', help='start index of PAR-CLIP sites [default=0]', type=int, default=0)
    parser.add_argument('stop', nargs='?', help='stop index of PAR-CLIP sites [default=1500]', type=int, default=1500)
    parser.add_argument('width', nargs='?', help='number of nt +/- the crosslink site [default=15]', type=int, default=15)
    parser.add_argument('additionalFilterWidth', nargs='?', help='number of nt that are added to the start/stop indices of the GFF annotations', type=int, default=20)
    parser.add_argument('key', nargs='?', help='set key that is used for PAR-CLIP site ordering [default = \'occ\'], options: [\'occ\', \'m\', \'r\', \'mr\', \'pvalue\']', default='occ')
    parser.add_argument('-v', '--verbose', dest='verbose', action="store_true", default=False, help='verbose output')
    args = parser.parse_args()

    yeast = genome.Genome(args.genome, False)
    sites = ParclipSiteContainer()
    sites.loadFromFile(args.sites)
    if args.verbose:
        print('#sites               : '+str(sites.size()))

    # Optionally drop sites that fall into (widened) GFF annotations.
    if args.filterGFF != '':
        anno = gff.GFF(args.filterGFF)
        sites = sites.removeSitesLocatedInGFF(anno, args.additionalFilterWidth)
        print('#sites after removal: '+str(sites.size()))

    sites.sort(args.key)
    sites.save2Fasta(yeast, args.fafile, args.start, args.stop, args.width)