def test_basic_prune(self):
    """Pruning one job must not affect unrelated jobs."""
    ppg.FileGeneratingJob("A", lambda: write("A", "A"))
    pruned = ppg.FileGeneratingJob("B", lambda: write("B", "B"))
    pruned.prune()
    ppg.run_pipegraph()
    # the un-pruned job produced its output ...
    assert Path("A").read_text() == "A"
    # ... while the pruned one was skipped entirely
    assert not Path("B").exists()
def test_invalidation_redoes_output(self, new_pipegraph):
    """Changing the cached calc function invalidates the cache and reruns the writer."""
    dummy = Dummy()
    out_file = "out/A"

    def write_attr():
        write(out_file, dummy.a)

    def calc():
        return ", ".join(str(x) for x in range(0, 100))

    cache_job = ppg.CachedAttributeLoadingJob("out/mycalc", dummy, "a", calc)
    ppg.FileGeneratingJob(out_file, write_attr).depends_on(cache_job)
    ppg.run_pipegraph()
    assert read(out_file) == ", ".join(str(x) for x in range(0, 100))

    new_pipegraph.new_pipegraph()

    def calc2():
        return ", ".join(str(x) for x in range(0, 200))

    # the changed function body invalidates the cache - jobB should be redone
    cache_job = ppg.CachedAttributeLoadingJob("out/mycalc", dummy, "a", calc2)
    ppg.FileGeneratingJob(out_file, write_attr).depends_on(cache_job)
    ppg.run_pipegraph()
    assert read(out_file) == ", ".join(str(x) for x in range(0, 200))
def test_invalidation_ignored_does_not_redo_output(self, new_pipegraph):
    """ignore_code_changes() suppresses invalidation; once the job is rebuilt
    without it, code/result consistency is enforced and the output is redone."""
    dummy = Dummy()
    out_file = "out/A"

    def write_attr():
        write(out_file, dummy.a)

    def calc():
        return ", ".join(str(x) for x in range(0, 100))

    job = ppg.CachedAttributeLoadingJob("out/mycalc", dummy, "a", calc)
    ppg.FileGeneratingJob(out_file, write_attr).depends_on(job)
    ppg.run_pipegraph()
    assert read(out_file) == ", ".join(str(x) for x in range(0, 100))

    new_pipegraph.new_pipegraph()

    def calc2():
        return ", ".join(str(x) for x in range(0, 200))

    job = ppg.CachedAttributeLoadingJob("out/mycalc", dummy, "a", calc2)
    job.ignore_code_changes()
    ppg.FileGeneratingJob(out_file, write_attr).depends_on(job)
    ppg.run_pipegraph()
    # the code change was ignored, so the old output survives
    assert read(out_file) == ", ".join(str(x) for x in range(0, 100))

    new_pipegraph.new_pipegraph()
    job = ppg.CachedAttributeLoadingJob("out/mycalc", dummy, "a", calc2)
    ppg.FileGeneratingJob(out_file, write_attr).depends_on(job)
    ppg.run_pipegraph()
    # The new stuff - you either have an explicit ignore_code_changes in our
    # codebase, or we enforce consistency between code and result
    assert read(out_file) == ", ".join(str(x) for x in range(0, 200))
def run_alevin_on_sample(self, lane, genome, method):
    """Run alevin for one lane and attach an alevinQC report job.

    Returns a (alevin_job, qc_job) tuple; the QC job depends on the alevin job.
    """
    out_dir = Path("results/alevin/") / lane.name

    def do_alevin():
        out_dir.mkdir(exist_ok=True, parents=True)
        self.run_alevin(
            out_dir, [lane.get_aligner_input_filenames()], genome, method
        )
        # sentinel marks successful completion for the pipegraph
        (out_dir / "sentinel.txt").write_text("done")

    alevin_job = ppg.FileGeneratingJob(out_dir / "sentinel.txt", do_alevin).depends_on(
        genome.build_index(self), lane.prepare_input()
    )

    def do_qc():
        (out_dir / "QC").mkdir(exist_ok=True)
        import rpy2.robjects as ro

        ro.r("library('alevinQC')")
        ro.r("alevinQCReport")(
            baseDir=str(out_dir.absolute()),
            sampleId=lane.name,
            outputFile="alevinReport.html",
            outputFormat="html_document",
            outputDir=str((out_dir / "QC").absolute()),
            forceOverwrite=True,
        )

    qc_job = ppg.FileGeneratingJob(
        out_dir / "QC" / "alevinReport.html", do_qc
    ).depends_on(alevin_job)
    return alevin_job, qc_job
def test_unpickle_bug_prevents_single_job_from_unpickling(self):
    """A broken ParameterInvariant pickle must only rerun the affected job."""

    def do_a():
        write("out/A", "A")
        append("out/As", "A")

    def do_b():
        write("out/B", "A")
        append("out/Bs", "A")

    ppg.FileGeneratingJob("out/A", do_a)
    job_b = ppg.FileGeneratingJob("out/B", do_b)
    cd = CantDepickle()
    problem_invariant = ppg.ParameterInvariant("C", (cd,))
    job_b.depends_on(problem_invariant)
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/As") == "A"
    assert read("out/B") == "A"
    assert read("out/Bs") == "A"

    print("second run")
    ppg.new_pipegraph(dump_graph=False)
    ppg.FileGeneratingJob("out/A", do_a)
    job_b = ppg.FileGeneratingJob("out/B", do_b)
    problem_invariant = ppg.ParameterInvariant("C", (cd,))
    job_b.depends_on(problem_invariant)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    # A was untouched ...
    assert read("out/A") == "A"
    assert read("out/As") == "A"
    assert read("out/B") == "A"
    # ... but B got rerun because we could not load the invariant
    assert read("out/Bs") == "AA"
def test_jobs_concurrent_jobs_run_concurrently(self):
    """Two single-core jobs on a two-core coordinator must overlap in time.

    Concurrency is detected from the recorded start/stop times: the job that
    started first must still be running when the second one starts.
    """
    # we'll determine this by the start respective end times..
    ppg.new_pipegraph(
        ppg.resource_coordinators.LocalSystem(max_cores_to_use=2),
        quiet=True,
        dump_graph=False,
    )
    jobA = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
    jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
    jobA.cores_needed = 1
    jobB.cores_needed = 1
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/B") == "B"
    # Guard BEFORE comparing start times: the original compared
    # jobA.start_time < jobB.start_time first, so a job that never ran
    # (start_time is None) raised a TypeError and masked the real failure.
    if jobA.start_time is None:
        raise ValueError("JobA did not run")
    if jobB.start_time is None:
        raise ValueError("JobB did not run")
    if jobA.start_time < jobB.start_time:
        first_job, second_job = jobA, jobB
    else:
        first_job, second_job = jobB, jobA
    print(
        "times",
        first_job.start_time,
        first_job.stop_time,
        second_job.start_time,
        second_job.stop_time,
    )
    # overlap: the first job was still running when the second started
    assert first_job.stop_time > second_job.start_time
def test_raises_on_non_dependend_job_injection2(self):
    """Injecting a job that the dependent job never uses must violate the contract."""
    o = Dummy()
    of = "out/A"

    def do_write():
        write(of, o.A + o.B)

    consumer = ppg.FileGeneratingJob(of, do_write)
    ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

    def generate_deps():
        def load_a():
            return "A"

        def load_b():
            return "B"

        dlA = ppg.AttributeLoadingJob("dlA", o, "A", load_a)
        ppg.AttributeLoadingJob("dlB", o, "B", load_b)
        # dlB is created but never wired into the consumer
        consumer.depends_on(dlA)

    gen_job = ppg.DependencyInjectionJob("C", generate_deps)
    consumer.depends_on(gen_job)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert not (os.path.exists(of))  # since the gen job crashed
    # out/D has no relation to the gen job and must still have run
    assert os.path.exists("out/D")
    assert isinstance(gen_job.exception, ppg.JobContractError)
    assert "case 1" in str(gen_job.exception)
def test_raises_on_non_dependend_job_injection2_can_be_ignored(self):
    """With check_for_dependency_injections=False the contract check is skipped.

    Same setup as test_raises_on_non_dependend_job_injection2, but the
    DependencyInjectionJob is told not to verify that every injected job was
    actually wired into the dependent job - so the graph runs through.
    """
    o = Dummy()
    of = "out/A"

    def do_write():
        write(of, o.A)  # + o.B - but B is not in the dependency chain!

    job = ppg.FileGeneratingJob(of, do_write)
    ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

    def generate_deps():
        def load_a():
            return "A"

        def load_b():
            return "B"

        dlA = ppg.AttributeLoadingJob("dlA", o, "A", load_a)
        ppg.AttributeLoadingJob("dlB", o, "B", load_b)
        job.depends_on(dlA)  # dlB is deliberately left unconnected

    gen_job = ppg.DependencyInjectionJob(
        "C", generate_deps, check_for_dependency_injections=False
    )
    job.depends_on(gen_job)
    ppg.run_pipegraph()
    # The gen job did NOT crash here (the check is disabled), so the output
    # exists. The original comment ("since the gen job crashed") was a
    # copy-paste leftover from the raising variant of this test.
    assert os.path.exists(of)
def test_registration_and_pruning(self, new_pipegraph):
    """Only jobs may be registered; prune_qc's filter decides which jobs stay."""
    with pytest.raises(TypeError):
        register_qc("shu")

    jobA = ppg.FileGeneratingJob("a", lambda: Path("a").write_text("hello"))
    register_qc(jobA)
    print(list(get_qc_jobs()))
    assert jobA in list(get_qc_jobs())
    assert not jobA._pruned

    jobc = register_qc(
        ppg.FileGeneratingJob("c", lambda: Path("b").write_text("hello"))
    )

    def check_prune(job):
        return job.job_id.lower()[-1] == "c"

    prune_qc(check_prune)
    assert jobc in list(get_qc_jobs())
    assert not jobc._pruned

    # jobs registered after prune_qc() are pruned unless the filter keeps them
    jobB = register_qc(
        ppg.FileGeneratingJob("b", lambda: Path("b").write_text("hello"))
    )
    assert jobB in list(get_qc_jobs())
    assert jobB._pruned

    jobC = register_qc(
        ppg.FileGeneratingJob("C", lambda: Path("b").write_text("hello"))
    )
    assert not jobC._pruned
    assert len(list(get_qc_jobs())) == 4

    # prune_qc() without a filter prunes everything
    prune_qc()
    assert jobA._pruned
    assert jobB._pruned
    assert jobc._pruned
    assert jobC._pruned
    for registered in get_qc_jobs():
        assert registered._pruned
def test_filegen_invalidated_jobgen_created_filegen_later_also_invalidated(
    self, new_pipegraph
):
    """Invalidating A also invalidates the generated job C that depends on it."""

    def make_a():
        return ppg.FileGeneratingJob(
            "out/A", lambda: writeappend("out/A", "out/Ac", "A")
        )

    def gen():
        c = ppg.FileGeneratingJob(
            "out/C", lambda: writeappend("out/C", "out/Cx", "C")
        )
        c.depends_on(a)

    a = make_a()
    a.depends_on(ppg.ParameterInvariant("p", "p"))
    ppg.JobGeneratingJob("b", gen)
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/Ac") == "A"
    assert read("out/C") == "C"
    assert read("out/Cx") == "C"

    new_pipegraph.new_pipegraph()
    a = make_a()
    # changed parameter -> A is invalid, and so is the generated C
    a.depends_on(ppg.ParameterInvariant("p", "p2"))
    ppg.JobGeneratingJob("b", gen)
    ppg.run_pipegraph()
    assert read("out/Ac") == "AA"
    assert read("out/Cx") == "CC"
def test_basic_prune2(self):
    """A pruned job is skipped even when its dependency ran successfully."""
    upstream = ppg.FileGeneratingJob('A', lambda: write('A', 'A'))
    downstream = ppg.FileGeneratingJob('B', lambda: write('B', 'B'))
    downstream.depends_on(upstream)
    downstream.prune()
    ppg.run_pipegraph()
    assert Path('A').read_text() == 'A'
    assert not Path('B').exists()
def test_older_jobs_added_back_to_new_pipegraph(self, new_pipegraph):
    """Jobs from an already-run graph cannot be wired into a fresh graph."""
    old_job = ppg.FileGeneratingJob("out/A", lambda of: write(of, "a"))
    ppg.util.global_pipegraph.run()
    new_pipegraph.new_pipegraph()
    fresh_job = ppg.FileGeneratingJob("out/B", lambda of: write(of, "b"))
    # wiring in either direction must be rejected
    with pytest.raises(ppg.PyPipeGraphError):
        old_job.depends_on(fresh_job)
    with pytest.raises(ppg.PyPipeGraphError):
        fresh_job.depends_on(old_job)
def test_pruning_final_jobs_directly(self):
    """A pruned FinalJob does not run; ordinary jobs are untouched."""
    ppg.FileGeneratingJob("A", lambda: write("A", "A"))
    ppg.FileGeneratingJob("B", lambda: write("B", "B"))
    final = ppg.FinalJob("shu", lambda: write("C", "C"))
    final.prune()
    ppg.run_pipegraph()
    assert Path("A").read_text() == "A"
    assert Path("B").read_text() == "B"
    assert not Path("C").exists()
def test_pruning_final_jobs_directly(self):
    """A pruned FinalJob must not run even though all other jobs do.

    The two FileGeneratingJob handles were bound to unused locals (``a``/``b``)
    in the original; the bindings are dropped since only the jobs' side
    effects matter here.
    """
    ppg.FileGeneratingJob('A', lambda: write('A', 'A'))
    ppg.FileGeneratingJob('B', lambda: write('B', 'B'))
    c = ppg.FinalJob('shu', lambda: write('C', 'C'))
    c.prune()
    ppg.run_pipegraph()
    assert Path('A').read_text() == 'A'
    assert Path('B').read_text() == 'B'
    assert not Path('C').exists()
def test_ignored_if_generating_within_filegenerating(self):
    """Creating a job inside a running FileGeneratingJob callback is ignored."""

    def load():
        # job creation while a FileGeneratingJob runs - should be a no-op
        ppg.FileGeneratingJob("out/B", lambda: write("out/B", "aa"))
        write("out/C", "c")

    writer = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "aa"))
    creator = ppg.FileGeneratingJob("out/C", load)
    writer.depends_on(creator)
    ppg.run_pipegraph()
    assert read("out/C") == "c"
def test_tempfile_still_run_if_needed_for_other(self):
    """A temp job still runs (and is cleaned up) when an un-pruned consumer needs it."""
    temp = ppg.TempFileGeneratingJob("A", lambda: write("A", "A"))
    pruned_consumer = ppg.FileGeneratingJob("B", lambda: write("B", "B" + read("A")))
    kept_consumer = ppg.FileGeneratingJob("C", lambda: write("C", "C" + read("A")))
    pruned_consumer.depends_on(temp)
    kept_consumer.depends_on(temp)
    pruned_consumer.prune()
    ppg.run_pipegraph()
    assert not Path("B").exists()  # the pruned consumer was skipped
    assert Path("C").exists()
    assert Path("C").read_text() == "CA"
    assert not Path("A").exists()  # the temp file was removed after use
def test_invalidation(self, new_pipegraph):
    """MemMappedDataLoadingJob: unchanged calc -> no rerun; changed calc -> rerun."""
    import numpy

    o = {}

    def calc():
        return numpy.array(range(0, 10), dtype=numpy.uint32)

    def store(value):
        o[0] = value

    def cleanup():
        del o[0]

    dl = ppg.MemMappedDataLoadingJob("out/A", calc, store, numpy.uint32)
    dl.cleanup = cleanup
    of = "out/B"

    def do_write():
        # numpy.memmap is the public name; numpy.core is a private module and
        # numpy.core.memmap (used originally) is the very same class
        assert isinstance(o[0], numpy.memmap)
        write(of, ",".join(str(x) for x in o[0]))
        append("out/C", "a")

    ppg.FileGeneratingJob(of, do_write).depends_on(dl)
    ppg.run_pipegraph()
    assert read("out/B") == "0,1,2,3,4,5,6,7,8,9"
    assert read("out/C") == "a"

    # second run, nothing changed - the writer must not rerun
    new_pipegraph.new_pipegraph()
    dl = ppg.MemMappedDataLoadingJob("out/A", calc, store, numpy.uint32)
    dl.cleanup = cleanup
    ppg.FileGeneratingJob(of, do_write).depends_on(dl)
    ppg.run_pipegraph()
    assert read("out/C") == "a"

    # third run with a changed calc - everything downstream reruns
    new_pipegraph.new_pipegraph()

    def calc2():
        append("out/D", "a")
        return numpy.array(range(0, 12), dtype=numpy.uint32)

    dl = ppg.MemMappedDataLoadingJob("out/A", calc2, store, numpy.uint32)
    ppg.FileGeneratingJob(of, do_write).depends_on(dl)
    dl.cleanup = cleanup
    ppg.run_pipegraph()
    assert read("out/D") == "a"
    assert read("out/B") == "0,1,2,3,4,5,6,7,8,9,10,11"
    assert read("out/C") == "aa"
def test_indirect_cicle(self):
    """A -> C -> B -> A dependency cycle must be detected at run time."""
    ppg.new_pipegraph(quiet=True, dump_graph=False)
    jobA = ppg.FileGeneratingJob("A", lambda: write("A", "A"))
    jobB = ppg.FileGeneratingJob("B", lambda: write("B", "A"))
    jobC = ppg.FileGeneratingJob("C", lambda: write("C", "A"))
    jobC.depends_on(jobB)
    jobB.depends_on(jobA)
    jobA.depends_on(jobC)
    # the original called a bare, undefined `assertRaises` (unittest idiom in a
    # pytest-style test); pytest.raises is what the rest of this file uses
    with pytest.raises(ppg.CycleError):
        ppg.run_pipegraph()
def test_basic_prune3(self):
    """Pruning propagates to downstream jobs but not to siblings."""
    root = ppg.FileGeneratingJob('A', lambda: write('A', 'A'))
    pruned = ppg.FileGeneratingJob('B', lambda: write('B', 'B'))
    below_pruned = ppg.FileGeneratingJob('C', lambda: write('C', 'C'))
    sibling = ppg.FileGeneratingJob('D', lambda: write('D', 'D'))
    pruned.depends_on(root)
    pruned.prune()
    # depending on an already-pruned job is ok - pruning happens after the
    # graph is completely built
    below_pruned.depends_on(pruned)
    sibling.depends_on(root)
    ppg.run_pipegraph()
    assert Path('A').read_text() == 'A'
    assert Path('D').read_text() == 'D'
    assert not Path('B').exists()
    assert not Path('C').exists()
def b():
    """Jobs created by a JobGeneratingJob may depend on pre-existing jobs."""
    jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
    jobD = ppg.FileGeneratingJob("out/D", lambda: write("out/D", "D"))

    def genA():
        # generated at run time, wired to both pre-existing jobs
        jobC = ppg.FileGeneratingJob("out/C", lambda: write("out/C", "C"))
        jobC.depends_on(jobB)
        jobC.depends_on(jobD)

    generator = ppg.JobGeneratingJob("A", genA)
    jobB.depends_on(generator)
    ppg.run_pipegraph()
    assert read("out/B") == "B"
    assert read("out/C") == "C"
def test_basic_prune3(self):
    """Pruning B skips B and its downstream C; _pruned records the cause."""
    job_a = ppg.FileGeneratingJob("A", lambda: write("A", "A"))
    job_b = ppg.FileGeneratingJob("B", lambda: write("B", "B"))
    job_c = ppg.FileGeneratingJob("C", lambda: write("C", "C"))
    job_d = ppg.FileGeneratingJob("D", lambda: write("D", "D"))
    job_b.depends_on(job_a)
    job_b.prune()
    # ok - pruning is resolved only once the graph is completely built
    job_c.depends_on(job_b)
    job_d.depends_on(job_a)
    ppg.run_pipegraph()
    assert Path("A").read_text() == "A"
    assert Path("D").read_text() == "D"
    assert not Path("B").exists()
    assert not Path("C").exists()
    # downstream jobs record the id of the job whose pruning removed them
    assert job_c._pruned == job_b.job_id
def prerequisite_jobs(self) -> List[Job]:
    """
    Returns a list of global prerequisite jobs.

    Returns a list of global prerequisite jobs necessary for all samples:
    copying and reindexing the reference fasta, creating a dictionary file.

    Returns
    -------
    List[ppg.Job]
        List of jobs that have to be done before the actual mutation calling.
    """

    def copy_genome(gatk_compliant_genome_file):
        # GATK needs a dictionary in the same directory as the genome so we copy
        shutil.copy(
            self.genome.find_file("genome.fasta"), str(gatk_compliant_genome_file)
        )

    def create_index(gatk_compliant_genome_file):
        # create the .fai index file next to the copied genome
        cmd = ["samtools", "faidx", self.gatk_compliant_genome_file]
        with Path(str(gatk_compliant_genome_file) + ".fai.stderr").open("wb") as stderr:
            # capture stderr (samtools' diagnostics stream) into the *.stderr
            # file, consistent with create_dict below; the original captured
            # stdout instead, leaving the named stderr log effectively empty
            subprocess.check_call(cmd, stderr=stderr)

    def create_dict():
        # create the sequence dictionary via GATK's CreateSequenceDictionary
        arguments = [
            "CreateSequenceDictionary",
            "-R",
            str(self.gatk_compliant_genome_file),
        ]
        cmd = self.build_cmd(self.gatk_compliant_genome_file.parent, 1, arguments)
        with Path(str(self.gatk_compliant_genome_file) + ".stderr").open("wb") as stderr:
            subprocess.check_call(cmd, stderr=stderr)

    job1 = ppg.FileGeneratingJob(self.gatk_compliant_genome_file, copy_genome)
    job2 = ppg.FileGeneratingJob(
        str(self.gatk_compliant_genome_file) + ".fai", create_index
    ).depends_on(job1)
    job3 = ppg.FileGeneratingJob(
        self.gatk_compliant_genome_file.parent
        / (self.gatk_compliant_genome_file.stem + ".dict"),
        create_dict,
    ).depends_on([job1, job2])
    return [job1, job2, job3]
def fastq_dump(self):
    """Return a job that runs fasterq-dump for this accession into target_dir.

    Writes the tool's stdout/stderr next to the output and a "sentinel" file
    on success; raises ValueError when the dump fails or the accession is
    invalid (fasterq-dump does not always signal that via its return code).
    """

    def dump():
        import subprocess  # the unused `import os` from the original was dropped

        cmd = [
            self.algo.path / "bin/" "fasterq-dump",
            "-O",
            str(self.target_dir),
            "-t",
            str(self.cache_dir),
            "-e",
            "4",
        ]
        if self.paired:
            cmd.append("-S")  # split paired reads into separate files
        cmd.append(self.accession)
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        (self.target_dir / "stdout").write_bytes(stdout)
        (self.target_dir / "stderr").write_bytes(stderr)
        if p.returncode != 0 or b'invalid accession' in stderr:
            raise ValueError("fasterq-dump", p.returncode)
        (self.target_dir / "sentinel").write_text("done")

    return ppg.FileGeneratingJob(self.target_dir / "sentinel", dump)
def prep_fasta(self, input_filenames, output_filename):
    """Normalize input fasta(s) into one file and index it with faidx.

    Headers are truncated at the first space, sequences wrapped at 80 columns.
    Accepts a ppg.Job, a single path, or an iterable of paths.
    """
    if isinstance(input_filenames, ppg.Job):
        filenames = input_filenames.filenames
        deps = input_filenames
    else:
        if isinstance(input_filenames, (str, Path)):
            filenames = [str(input_filenames)]
        else:
            filenames = [str(x) for x in input_filenames]
        deps = PrebuildFileInvariantsExploding(
            str(output_filename) + "_prep_fasta", filenames
        )

    def prep(output_filename):
        import pysam

        with open(output_filename, "wb") as op:
            for fn in filenames:
                for key, seq in iter_fasta(
                    fn, lambda x: x[:x.find(b" ")] if b" " in x else x
                ):
                    op.write(
                        b">%s\n%s\n" % (key, b"\n".join(wrappedIterator(80)(seq)))
                    )
        pysam.faidx(output_filename)

    Path(output_filename).parent.mkdir(exist_ok=True)
    job = ppg.FileGeneratingJob(output_filename, prep)
    job.depends_on(deps)
    self._download_jobs.append(job)
    return job
def plot(self):
    """Render the heatmap - as a pipegraph job under ppg, eagerly otherwise."""
    normed = self.normed_ddf(self.ddf)
    ordered = self.ordered_ddf(normed)
    names = self.handle_names()

    def do_plot():
        p = self.plot_strategy.plot(ordered.df, names, self.plot_options)
        self.plot_strategy.render(str(self.output_filename), p)

    if not ppg.inside_ppg():
        # no pipegraph - plot right away and hand back the filename
        do_plot()
        return self.output_filename

    ppg.util.global_pipegraph.quiet = False
    deps = [
        ordered.load(),
        ppg.FunctionInvariant(
            "mbf_heatmap." + self.plot_strategy.name + "plot_func",
            self.plot_strategy.__class__.plot,
        ),
        ppg.FunctionInvariant(
            "mbf_heatmap" + self.plot_strategy.name + "render_func",
            self.plot_strategy.__class__.render,
        ),
        ppg.ParameterInvariant(
            self.output_filename, freeze((self.names, self.plot_options))
        ),
    ]
    return ppg.FileGeneratingJob(self.output_filename, do_plot).depends_on(deps)
def generate_file(self, filename, write_callback, dependencies, empty_ok=False):
    """Create a FileGeneratingJob for *filename* wired to *dependencies* and
    this object's load job; returns (job, Path(filename))."""
    job = ppg.FileGeneratingJob(filename, write_callback, empty_ok=empty_ok)
    job.depends_on(dependencies)
    job.depends_on(self.load())
    return job, Path(filename)
def write_gct(genes_or_dataframe: Union[Genes, DataFrame],
              output_directory: Path,
              phenotypes: Tuple[str, str],
              columns_a_b: Tuple[List[str], List[str]],
              dependencies: Union[List[Job], None] = None) -> FileGeneratingJob:
    """
    Creates a Job that writes expression data for GSEA at a specified folder.

    A file named input.gct is created.

    Parameters
    ----------
    genes_or_dataframe : Union[Genes, DataFrame]
        Genes object or DataFrame holding the expression values to dump.
    output_directory : Path
        The output directory in which an input.gct file is created.
    phenotypes : Tuple[str, str]
        The phenotype/class names of the groups to be compared.
    columns_a_b : Tuple[List[str], List[str]]
        The DataFrame columns of the relevant expression values.
    dependencies : List[Job], optional
        List of prerequisite jobs, by default None. (The original default was a
        mutable ``[]`` which was mutated via ``append`` - the shared default
        accumulated jobs across calls, and caller-supplied lists were mutated
        too; both are fixed by copying into a local list.)

    Returns
    -------
    FileGeneratingJob
        The job that creates the file.
    """
    # defensive copy: never mutate the caller's list or a shared default
    dependencies = list(dependencies) if dependencies is not None else []
    output_directory.mkdir(parents=True, exist_ok=True)
    outfile = output_directory / "input.gct"
    if isinstance(genes_or_dataframe, Genes):
        dependencies.append(
            genes_or_dataframe.add_annotator(
                mbf_genomics.genes.annotators.Description()
            )
        )

    def __dump():
        df = genes_or_dataframe
        if isinstance(genes_or_dataframe, Genes):
            df = genes_or_dataframe.df.copy()
        elif isinstance(genes_or_dataframe, DataFrame):
            df = df.copy()
        else:
            raise ValueError(
                f"Parameter genes_or_dataframe must be an instance of Genes or DataFrame, was {type(genes_or_dataframe)}."
            )
        with outfile.open("w") as handle:
            # GCT header: version line, then "<rows>\t<columns>"
            handle.write("#1.2\n")
            handle.write(
                f"{len(df)}\t{len(columns_a_b[0]) + len(columns_a_b[1])}\n")
            handle.write("ProbeName\tDescription\t")
            handle.write("\t".join(columns_a_b[0] + columns_a_b[1]))
            handle.write("\n")
            df = df.rename(columns={"gene_stable_id": "NAME"})
            description = [
                f"{x} {y}" for x, y in zip(df["name"], df["description"])
            ]
            df["Description"] = description
            df = df[["NAME", "Description"] + columns_a_b[0] + columns_a_b[1]]
            df = df.fillna(0)
            for _, row in df.iterrows():
                handle.write("\t".join([str(x) for x in row]) + "\n")

    return ppg.FileGeneratingJob(outfile, __dump).depends_on(dependencies)
def register(self, prerequisites):
    """Call the webservice and register this project"""
    sentinel_file = Path("web/registration_sentinel")

    def do_register():
        import requests

        # project name = last path component of the anysnake project path
        project_path = os.environ["ANYSNAKE_PROJECT_PATH"]
        project_name = project_path[project_path.rfind("/") + 1:]
        print("registration for", project_name)
        auth = requests.auth.HTTPBasicAuth("feed", "feed")
        print(
            requests.get(
                "http://mbf.imt.uni-marburg.de/bil2/register?",
                params={"project_name": project_name},
                auth=auth,
            )
        )
        print(
            requests.get(
                "http://mbf.imt.uni-marburg.de/bil2/gbrowse_dump", auth=auth
            )
        )
        sentinel_file.write_text("Done")

    return ppg.FileGeneratingJob(sentinel_file, do_register).depends_on(prerequisites)
def register_qc_biotypes(self):
    """Register a QC job plotting read counts per gene biotype for this sample.

    Builds a horizontal bar chart (one bar per biotype, flipped coordinates)
    from the unstranded per-gene read counts and registers the resulting
    FileGeneratingJob with the QC registry.
    """
    output_filename = self.result_dir / f"{self.name}_reads_per_biotype.png"
    from mbf_genomics.genes import Genes
    from mbf_genomics.genes.anno_tag_counts import GeneUnstranded

    genes = Genes(self.genome)
    anno = GeneUnstranded(self)

    def plot(output_filename):
        print(genes.df.columns)
        # dppd/plotnine chain: sum the count column per biotype, then render.
        # anno.columns[0] is the sample's read-count column added by the
        # annotator dependency below.
        return (
            dp(genes.df)
            .groupby("biotype")
            .summarize((anno.columns[0], lambda x: x.sum(), "read count"))
            .mutate(sample=self.name)
            .p9()
            .theme_bw()
            .annotation_stripes()
            .add_bar("biotype", "read count", stat="identity")
            # format tick labels with 2 significant digits
            .scale_y_continuous(labels=lambda xs: ["%.2g" % x for x in xs])
            # .turn_x_axis_labels()
            .coord_flip()
            .title(self.name)
            .render(
                output_filename,
                width=6,
                # grow the figure with the number of biotypes shown
                height=2 + len(genes.df.biotype.unique()) * 0.25,
            )
        )

    return register_qc(
        ppg.FileGeneratingJob(output_filename, plot).depends_on(
            genes.add_annotator(anno)
        )
    )
def dump_rsync_list(self):
    """Job writing web/scb/rsync_list.txt: project-relative paths to rsync."""
    output_filename = "web/scb/rsync_list.txt"

    def dump(output_filename):
        collected = set()
        # gather every referenced file from the metadata entries
        for group in self.meta_data:
            for entry in self.meta_data[group]:
                for key in [
                    "table_path",
                    "path_bigbed",
                    "path_table",
                    "path_bam",
                    "path",
                ]:
                    if key in entry:  # genes
                        collected.add(Path(entry[key]).absolute())
                if "path_bam" in entry:
                    # bam files need their index shipped as well
                    collected.add(Path(entry["path_bam"] + ".bai").absolute())
        for fn in Path("web/scb").glob("*"):
            collected.add(fn.absolute())
        output = ""
        # collected.add("/project/web/scb/metadata.json")
        for candidate in sorted([str(p) for p in collected]):
            if candidate.startswith("/project"):
                output += candidate[len("/project/"):]
                output += "\n"
        Path(output_filename).write_text(output)

    return (
        ppg.FileGeneratingJob(output_filename, dump)
        .depends_on(self.deps)
        .depends_on(self.dump_meta_data_json())
    )