def test_tempfile_not_run_on_prune(self):
    a = ppg.TempFileGeneratingJob("A", lambda: write("A", "A"))
    b = ppg.FileGeneratingJob("B", lambda: write("B", "B" + read("A")))
    b.depends_on(a)
    b.prune()
    ppg.run_pipegraph()
    assert not Path("B").exists()
    assert not Path("A").exists()
def test_tempfile_still_run_if_needed_for_other(self):
    a = ppg.TempFileGeneratingJob("A", lambda: write("A", "A"))
    b = ppg.FileGeneratingJob("B", lambda: write("B", "B" + read("A")))
    c = ppg.FileGeneratingJob("C", lambda: write("C", "C" + read("A")))
    b.depends_on(a)
    c.depends_on(a)
    b.prune()
    ppg.run_pipegraph()
    assert not Path("B").exists()
    assert Path("C").exists()
    assert Path("C").read_text() == "CA"
    assert not Path("A").exists()
def job_download(info):
    """Download the package defined in info"""
    target_fn = (
        f'/anysnake/bioconductor_download/{info["repo"]}/'
        f'{info["name"]}_{info["version"]}.tar.gz'
    )

    def download():
        p = Path(target_fn)
        p.parent.mkdir(exist_ok=True, parents=False)
        r = requests.get(info["url"], stream=True)
        if r.status_code != 200:
            raise ValueError("Error return on %s %s " % (info["url"], r.status_code))
        # stream into a temp file, then rename, so a partial download
        # never looks like a finished one
        with open(str(target_fn) + "_temp", "wb") as op:
            for block in r.iter_content(1024 * 1024):
                op.write(block)
        shutil.move(str(target_fn) + "_temp", str(target_fn))

    job = ppg.TempFileGeneratingJob(target_fn, download)
    job.ignore_code_changes()  # don't re-download just because this function's code changed
    return job
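A sketch of how such a download job might be consumed downstream, assuming a hypothetical unpack step, target directory, and sentinel file; only job_download() and the ppg calls already shown above are taken from the original code.

def job_unpack(info):
    """Hypothetical downstream job: unpack the downloaded tarball.
    Target directory and sentinel file are assumptions made for this sketch."""
    download_job = job_download(info)
    target_dir = Path(f'/anysnake/bioconductor/{info["name"]}')  # assumed location
    sentinel = target_dir / ".unpacked"

    def unpack():
        import tarfile

        target_dir.mkdir(exist_ok=True, parents=True)
        # .filenames is the same attribute downstream code relies on in prepare_input below
        with tarfile.open(download_job.filenames[0]) as tf:
            tf.extractall(target_dir)
        sentinel.write_text("done")

    job = ppg.FileGeneratingJob(sentinel, unpack)
    # depending on the TempFileGeneratingJob keeps the tarball around until unpack has run
    job.depends_on(download_job)
    return job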
def test_accepts(self):
    # all job classes should accept pathlib.Path as well as str for their (file) names
    import pathlib

    write("aaa", "hello")
    write("bbb", "hello")
    write("ccc", "hello")
    a = ppg.FileTimeInvariant(pathlib.Path("aaa"))
    a1 = ppg.MultiFileInvariant([pathlib.Path("bbb"), "ccc"])
    b = ppg.FileGeneratingJob(
        pathlib.Path("b"),
        lambda of: write(of, "bb" + read("aaa") + read("bbb") + read("ccc")),
    )
    b.depends_on(a)
    b.depends_on(a1)
    dd = Dummy()

    def mf():
        write("c", "cc" + read("g"))
        write("d", "dd" + read("h") + dd.attr)
        write("e", "ee" + read("i") + read("j"))

    c = ppg.MultiFileGeneratingJob([pathlib.Path("c"), "d", pathlib.Path("e")], mf)
    c.depends_on(b)
    d = ppg.FunctionInvariant(pathlib.Path("f"), lambda x: x + 1)
    c.depends_on(d)
    e = ppg.ParameterInvariant(pathlib.Path("c"), "hello")
    c.depends_on(e)
    f = ppg.TempFileGeneratingJob(pathlib.Path("g"), lambda: write("g", "gg"))
    c.depends_on(f)

    def tmf():
        write("h", "hh")
        write("i", "ii")

    g = ppg.MultiTempFileGeneratingJob([pathlib.Path("h"), "i"], tmf)
    c.depends_on(g)

    def tpf():
        write("j", "jjjj")
        write("k", "kkkk")

    h = ppg.TempFilePlusGeneratingJob(pathlib.Path("j"), pathlib.Path("k"), tpf)
    c.depends_on(h)
    i = ppg.CachedDataLoadingJob(
        pathlib.Path("l"), lambda: write("l", "llll"), lambda res: res
    )
    c.depends_on(i)
    m = ppg.CachedAttributeLoadingJob(pathlib.Path("m"), dd, "attr", lambda: "55")
    c.depends_on(m)
    ppg.run_pipegraph()
    assert read("aaa") == "hello"
    assert read("b") == "bbhellohellohello"
    assert read("c") == "ccgg"
    assert read("d") == "ddhh55"
    assert read("e") == "eeiijjjj"
    assert not (os.path.exists("g"))
    assert not (os.path.exists("h"))
    assert not (os.path.exists("i"))
    assert not (os.path.exists("j"))
    assert read("k") == "kkkk"
def prepare_input(self):
    # input_strategy returns a list of
    # paired fastq files
    # i.e. [('A_R1_.fastq1', 'A_R2.fastq', ...), ...]
    input_pairs = self.input_strategy()
    any_r2 = any([len(x) > 1 for x in input_pairs])
    # Single end - works from flat list
    if self.pairing == "single":
        if any_r2:
            raise PairingError(
                f"{self.name}: paired end lane defined as single end - you need to change the pairing parameter"
            )
        input_filenames = [str(f[0]) for f in input_pairs]
    elif self.pairing == "paired_as_single":
        input_filenames = [str(f) for fl in input_pairs for f in fl]
    elif self.pairing == "only_first":
        input_filenames = [str(f[0]) for f in input_pairs]
    elif self.pairing == "only_second":
        input_filenames = [str(f[1]) for f in input_pairs]
    elif self.pairing == "paired":
        if not any_r2:
            raise PairingError(
                f"Paired end lane, but no R2 reads found. Found files: {input_pairs}"
            )
        input_filenames = [
            (str(f[0]), str(f[1])) for f in input_pairs
        ]  # throwing away all later...
    else:
        raise PairingError("unknown pairing")  # pragma: no cover
    if self.pairing == "paired":
        flat_input_filenames = [f for fl in input_pairs for f in fl]
    else:
        flat_input_filenames = input_filenames

    if hasattr(self.input_strategy, "dependencies"):
        deps = self.input_strategy.dependencies
    else:
        deps = [ppg.FileChecksumInvariant(f) for f in flat_input_filenames]
    output_filenames = self.get_aligner_input_filenames()

    if self.pairing == "paired":
        if hasattr(self.fastq_processor, "generate_aligner_input_paired"):

            def prep_aligner_input():
                import shutil

                self.fastq_processor.generate_aligner_input_paired(
                    str(output_filenames[0]) + ".temp",
                    str(output_filenames[1]) + ".temp",
                    input_filenames,
                    self.reverse_reads,
                )
                shutil.move(str(output_filenames[0]) + ".temp", output_filenames[0])
                shutil.move(str(output_filenames[1]) + ".temp", output_filenames[1])

            job = ppg.MultiTempFileGeneratingJob(output_filenames, prep_aligner_input)
            job.depends_on(
                self.fastq_processor.get_dependencies(
                    [str(x) for x in output_filenames]
                )
            )
        else:

            def prep_aligner_input_r1():
                import shutil

                self.fastq_processor.generate_aligner_input(
                    str(output_filenames[0]) + ".temp",
                    [x[0] for x in input_filenames],
                    self.reverse_reads,
                )
                shutil.move(str(output_filenames[0]) + ".temp", output_filenames[0])

            def prep_aligner_input_r2():
                import shutil

                self.fastq_processor.generate_aligner_input(
                    str(output_filenames[1]) + ".temp",
                    [x[1] for x in input_filenames],
                    self.reverse_reads,
                )
                shutil.move(str(output_filenames[1]) + ".temp", output_filenames[1])

            jobR1 = ppg.TempFileGeneratingJob(output_filenames[0], prep_aligner_input_r1)
            jobR2 = ppg.TempFileGeneratingJob(output_filenames[1], prep_aligner_input_r2)
            jobR1.depends_on(
                self.fastq_processor.get_dependencies(str(output_filenames[0]))
            )
            jobR2.depends_on(
                self.fastq_processor.get_dependencies(str(output_filenames[1]))
            )
            job = ppg.JobList([jobR1, jobR2])
            # needed by downstream code.
            job.filenames = [output_filenames[0], output_filenames[1]]
    else:

        def prep_aligner_input(output_filename):
            import shutil

            self.fastq_processor.generate_aligner_input(
                str(output_filename) + ".temp", input_filenames, self.reverse_reads
            )
            shutil.move(str(output_filename) + ".temp", output_filename)

        job = ppg.TempFileGeneratingJob(output_filenames[0], prep_aligner_input)
        job.depends_on(
            self.fastq_processor.get_dependencies(str(output_filenames[0]))
        )

    job.depends_on(
        deps,
        ppg.ParameterInvariant(
            self.name + "input_files",
            tuple(sorted(input_filenames))
            + (self.reverse_reads, self.fastq_processor.__class__.__name__),
        ),
    )
    return job
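For reference, a minimal sketch of the interface prepare_input expects from self.fastq_processor, inferred only from the calls above; the class name and the straight-copy behaviour are assumptions, not part of the original code.

class StraightCopyProcessor:
    """Hypothetical fastq processor; the method signatures mirror the calls in prepare_input."""

    def generate_aligner_input(self, output_filename, input_filenames, reverse_reads):
        # concatenate the input fastqs into the (temp) output file;
        # reverse_reads is ignored in this simplified sketch
        import shutil

        with open(output_filename, "wb") as op:
            for fn in input_filenames:
                with open(fn, "rb") as ip:
                    shutil.copyfileobj(ip, op)

    def generate_aligner_input_paired(
        self, output_r1, output_r2, input_filename_pairs, reverse_reads
    ):
        # input_filename_pairs is the list of (R1, R2) tuples built in the "paired" branch
        self.generate_aligner_input(
            output_r1, [x[0] for x in input_filename_pairs], reverse_reads
        )
        self.generate_aligner_input(
            output_r2, [x[1] for x in input_filename_pairs], reverse_reads
        )

    def get_dependencies(self, output_filenames):
        # invalidate the prep jobs whenever this processor's code changes
        return ppg.FunctionInvariant(
            str(output_filenames) + "_fastq_processor", self.generate_aligner_input
        )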