def runBiSeq_liver(infiles, outfile):
    '''Call ``RRBS.runBiSeq`` with the label "Liver".

    ``infiles`` and ``outfile`` are forwarded unchanged; the call is made
    with ``submit=True``.
    '''
    # Resource request forwarded with the call (looks like SGE syntax --
    # confirm against the submission layer).
    RRBS.runBiSeq(
        infiles,
        outfile,
        "Liver",
        submit=True,
        job_options="-l mem_free=10G -pe dedicated 10")
def runM3D(infile, outfile, root, design):
    '''Run ``RRBS.calculateM3DStat`` for each informative pair of experiments.

    Every 2-combination of ``EXPERIMENTS`` is considered.  Each experiment
    name has "-agg" removed and is split on "-".  A pair is compared only
    when it differs in exactly one of the first two fields (e.g. treatment
    or tissue but not both); the two differing field values are passed on
    as ``groups``.

    Arguments
    ---------
    infile
        Forwarded unchanged to ``RRBS.calculateM3DStat``.
    outfile
        Not used as given: each comparison is written to
        ``<root><pair1>_vs_<pair2>.tsv`` instead.
        NOTE(review): presumably a pipeline sentinel -- confirm.
    root
        Prefix used to build the per-pair output file names.
    design
        Forwarded unchanged to ``RRBS.calculateM3DStat``.

    Raises
    ------
    IndexError
        If an experiment name has fewer than two "-"-separated fields.
    '''
    job_options = "-l mem_free=4G -pe dedicated 1"

    # **code repeated - refactor**
    for experiment_pair in itertools.combinations(EXPERIMENTS, 2):
        pair = [re.sub("-agg", "", str(x)) for x in experiment_pair]
        pair1, pair2 = pair
        fields1 = pair1.split("-")
        fields2 = pair2.split("-")

        first_differs = fields1[0] != fields2[0]
        second_differs = fields1[1] != fields2[1]

        # only want pairs with one difference
        # e.g treatment or tissue but not both
        if first_differs and second_differs:
            continue

        if first_differs:
            groups = [fields1[0], fields2[0]]
        elif second_differs:
            groups = [fields1[1], fields2[1]]
        else:
            E.error(
                "This pair does not contain any comparisons: %s" % (pair,))
            # Nothing to compare: skip instead of submitting a job with a
            # stale ``groups`` value left over from an earlier iteration.
            continue

        pair_outfile = "%s%s_vs_%s.tsv" % (root, pair1, pair2)

        RRBS.calculateM3DStat(
            infile,
            pair_outfile,
            design,
            pair=pair,
            groups=groups,
            submit=True,
            job_options=job_options)
def runBiSeq_germline(infiles, outfile):
    '''Call ``RRBS.runBiSeq`` with the label "Germline".

    ``infiles`` and ``outfile`` are forwarded unchanged; the call is made
    with ``submit=True``.
    '''
    # Resource request forwarded with the call (looks like SGE syntax --
    # confirm against the submission layer).
    RRBS.runBiSeq(
        infiles,
        outfile,
        "Germline",
        submit=True,
        job_options="-l mem_free=10G -pe dedicated 10")
def clusterSpikeInsPowerAnalysis(infiles, outfile):
    '''Call ``RRBS.spikeInClustersAnalysis`` on ``infiles`` and ``outfile``.

    Both arguments are forwarded unchanged; the call is made with
    ``submit=True`` and a 23G ``mem_free`` resource request.
    '''
    RRBS.spikeInClustersAnalysis(
        infiles,
        outfile,
        submit=True,
        job_options="-l mem_free=23G")
def runM3DSpikeClusters(infiles, outfile):
    '''Call ``RRBS.calculateM3DStat`` for a (data, design) input pair.

    ``infiles`` must unpack into exactly two items: the first is passed
    as the input file and the second as the design argument.
    '''
    data_file, design_file = infiles

    RRBS.calculateM3DStat(
        data_file,
        outfile,
        design_file,
        submit=True,
        job_options="-l mem_free=4G -pe dedicated 1")
def findCpGs(outfile):
    '''Call ``RRBS.fasta2CpG`` on the configured genome fasta.

    The input path is read from
    ``PARAMS["methylation_summary_genome_fasta"]``; the call is made with
    ``submit=True`` and a 2G ``mem_free`` resource request.
    '''
    RRBS.fasta2CpG(
        PARAMS["methylation_summary_genome_fasta"],
        outfile,
        submit=True,
        job_options="-l mem_free=2G")
def addTreatmentMeans(infile, outfile):
    '''Call ``RRBS.addTreatmentMean`` on ``infile`` and ``outfile``.

    Both arguments are forwarded unchanged; the call is made with
    ``submit=True`` and a 48G ``mem_free`` resource request.
    '''
    RRBS.addTreatmentMean(
        infile,
        outfile,
        submit=True,
        job_options="-l mem_free=48G")
def extractRepeatCpGs(outfile):
    '''extract repeat sequences and identify CpG locations

    The genome fasta and the repeats annotation are taken from
    ``PARAMS["methylation_summary_genome_fasta"]`` and
    ``PARAMS["annotation_repeats_gff"]`` and handed to
    ``RRBS.findRepeatCpGs``.
    '''
    genome_fasta = PARAMS["methylation_summary_genome_fasta"]
    repeats_gff = PARAMS["annotation_repeats_gff"]

    RRBS.findRepeatCpGs(
        outfile,
        genome_fasta,
        repeats_gff,
        submit=True,
        job_memory="4G")
def categorisePromoterCpGs(outfile):
    '''extract promoter sequences and categorise them by CpG density

    The genome fasta and the annotation database are taken from
    ``PARAMS["methylation_summary_genome_fasta"]`` and
    ``PARAMS["annotation_database"]`` and handed to
    ``RRBS.categorisePromoterCpGs``.
    '''
    genome_fasta = PARAMS["methylation_summary_genome_fasta"]
    annotation_db = PARAMS["annotation_database"]

    RRBS.categorisePromoterCpGs(
        outfile,
        genome_fasta,
        annotation_db,
        submit=True,
        job_memory="4G")
def subsetCpGsToCovered(infile, outfile):
    '''Call ``RRBS.subsetToCovered`` with a coverage threshold of 10.

    ``infile`` and ``outfile`` are forwarded unchanged; the call is made
    with ``submit=True`` and a 48G ``mem_free`` resource request.
    '''
    RRBS.subsetToCovered(
        infile,
        outfile,
        cov_threshold=10,
        submit=True,
        job_options="-l mem_free=48G")
def calculateM3DSpikeClustersPvalue(infiles, outfile):
    '''Call ``RRBS.calculateM3DSpikepvalue`` and then touch ``outfile``.

    The last element of ``infiles`` is passed as the design argument;
    all preceding elements are passed as the input files.
    '''
    stat_files, design_file = infiles[:-1], infiles[-1]

    RRBS.calculateM3DSpikepvalue(
        stat_files,
        outfile,
        design_file,
        submit=True,
        job_options="-l mem_free=4G -pe dedicated 1")

    P.touch(outfile)
def splitClustersDataframe(infile, outfiles):
    '''Call ``RRBS.splitDataframeClusters`` on ``infile``.

    Output naming is delegated to the callee via a fixed prefix
    ("subframes.dir/cluster_subframe_") and suffix (".tsv"); the
    ``outfiles`` argument is not used by this function.
    '''
    subframe_prefix = "subframes.dir/cluster_subframe_"
    subframe_suffix = ".tsv"

    RRBS.splitDataframeClusters(
        infile,
        subframe_prefix,
        subframe_suffix,
        submit=True,
        job_options="-l mem_free=8G -pe dedicated 1")
def extractDMRCpGs(outfile):
    '''extract sequences for the regions in the DMR annotation and
    identify CpG locations

    The genome fasta and the region file are taken from
    ``PARAMS["methylation_summary_genome_fasta"]`` and
    ``PARAMS["annotation_dmr"]``; results are labelled "DMR" and both
    strands are requested (``both_strands=True``).
    '''
    genome_fasta = PARAMS["methylation_summary_genome_fasta"]
    # presumably a BED file, given the callee's name -- confirm
    dmr_regions = PARAMS["annotation_dmr"]

    RRBS.findCpGsFromBed(
        outfile,
        genome_fasta,
        dmr_regions,
        "DMR",
        both_strands=True,
        submit=True,
        job_memory="4G")
def plotReadBias(infile, outfile):
    '''Call ``RRBS.plotReadBias`` on the M-bias file matching ``infile``.

    The M-bias file name is derived from ``infile`` by replacing the
    ".bismark.cov" suffix (via ``P.snip``) with ".M-bias.txt".  The
    derived name is printed before the call is made.
    '''
    prefix = P.snip(infile, ".bismark.cov")
    m_bias_path = prefix + ".M-bias.txt"
    print(m_bias_path)

    RRBS.plotReadBias(
        m_bias_path,
        outfile,
        submit=True,
        job_options="-l mem_free=1G")
def mergeCoverage(infiles, outfile):
    '''Call ``RRBS.mergeAndDrop`` to merge coverage files with the CpG file.

    The last element of ``infiles`` is passed as the CpG input; all
    preceding elements are passed as the coverage inputs.
    '''
    coverage_infiles, cpgs_infile = infiles[:-1], infiles[-1]

    # this should be replaced with a non-pandas based solution
    # very memory intensive! - find out why and re-code
    job_options = "-l mem_free=48G"
    # NOTE(review): job_threads is assigned but never passed to the call
    # below -- confirm whether it was meant to be forwarded.
    job_threads = 2

    RRBS.mergeAndDrop(
        cpgs_infile,
        coverage_infiles,
        outfile,
        submit=True,
        job_options=job_options)
def calculateM3DClustersPvalue(infiles, outfile, pair1, pair2):
    '''Call ``RRBS.calculateM3Dpvalue`` for the pair ``[pair1, pair2]``.

    The last element of ``infiles`` is dropped before the call.  The
    pair members and the final call arguments are printed first.
    '''
    stat_files = infiles[:-1]
    pair = [pair1, pair2]

    print("pair1: %s" % pair1)
    print("pair2: %s" % pair2)
    print(stat_files, outfile, pair)

    RRBS.calculateM3Dpvalue(
        stat_files,
        outfile,
        pair,
        submit=True,
        job_options="-l mem_free=4G -pe dedicated 1")
def mergeCpGAnnotations(infiles, outfile):
    '''merge together the CpG annotations for plotting

    ``infiles`` must unpack into exactly five items, in this order:
    methylation, promoter, repeat, HCNE and DMR inputs.
    '''
    (methylation_file,
     promoter_file,
     repeat_file,
     hcne_file,
     dmr_file) = infiles

    RRBS.mergeCpGAnnotations(
        methylation_file,
        promoter_file,
        repeat_file,
        hcne_file,
        dmr_file,
        outfile,
        submit=True,
        job_memory="4G")
def addCpGIs(infiles, outfile):
    '''Left-merge the CpG island table onto ``infile`` by contig/position.

    ``infiles`` must unpack into exactly two items: the table to
    annotate and the CpG island table.  The merge is delegated to
    ``RRBS.pandasMerge`` with ``merge_type="left"``.
    '''
    table_file, cpgi_file = infiles
    merge_keys = ['contig', 'position']

    # TS: still memory intensive even after supplying data types
    # for all columns!
    # this should be replaced with a non-pandas based solution
    job_memory = "40G"
    # NOTE(review): job_threads is assigned but never passed to the call
    # below -- confirm whether it was meant to be forwarded.
    job_threads = 1

    RRBS.pandasMerge(
        table_file,
        cpgi_file,
        outfile,
        merge_type="left",
        left=list(merge_keys),
        right=list(merge_keys),
        submit=True,
        job_memory=job_memory)
def plotCoverage(infile, outfiles):
    '''Call ``RRBS.plotCoverage`` on ``infile`` and ``outfiles``.

    Both arguments are forwarded unchanged; the call is made with
    ``submit=True`` and ``job_memory="6G"``.
    '''
    RRBS.plotCoverage(
        infile,
        outfiles,
        submit=True,
        job_memory="6G")
def makeSummaryPlots(infile, outfile):
    '''Call ``RRBS.summaryPlots`` and then touch ``outfile``.

    The call is made with ``submit=True`` and a 48G ``mem_free``
    resource request.
    '''
    RRBS.summaryPlots(
        infile,
        outfile,
        submit=True,
        job_options="-l mem_free=48G")

    P.touch(outfile)
def plotMethylationFrequency(infile, outfile):
    '''Call ``RRBS.plotMethFrequency`` on ``infile`` and ``outfile``.

    Both arguments are forwarded unchanged; the call is made with
    ``submit=True`` and ``job_memory="2G"``.
    '''
    RRBS.plotMethFrequency(
        infile,
        outfile,
        job_memory="2G",
        submit=True)
def plotCpGAnnotations(infile, outfiles):
    '''make histogram and boxplots for the CpGs facetted per annotation

    ``outfiles`` must unpack into exactly two items: the histogram
    output followed by the boxplot output.  Unlike the other plotting
    tasks here, the call is made without ``submit=True``.
    '''
    histogram_file, boxplot_file = outfiles

    RRBS.plotCpGAnnotations(infile, histogram_file, boxplot_file)
def calculateCoverage(infile, outfile):
    '''Call ``RRBS.calculateCoverage`` on ``infile`` and ``outfile``.

    Both arguments are forwarded unchanged; the call is made with
    ``submit=True`` and ``job_memory="2G"``.
    '''
    RRBS.calculateCoverage(
        infile,
        outfile,
        submit=True,
        job_memory="2G")
def summariseM3D(infile, outfile):
    '''summarise the number of clusters passing threshold

    The arguments and the threshold are printed, then
    ``RRBS.summariseM3D`` is called with ``submit=True``.
    '''
    # adjusted p-value threshold
    padj_threshold = 0.05

    print(infile, outfile, padj_threshold)

    RRBS.summariseM3D(
        infile,
        outfile,
        padj_threshold,
        submit=True)