def plot_venn_from_genes_with_comparisons(output_prefix, a_dict, id_column="gene_stable_id"):
    if len(a_dict) not in (2, 3):
        raise ValueError("Only 2 or 3 sets are currently supported")

    def plot():
        up = {}
        down = {}
        for name, genes_ddf in sorted(a_dict.items()):
            df = genes_ddf.df
            stable_ids = df[id_column]
            column = genes_ddf.venn_annotator["log2FC"]
            up[name] = set(stable_ids[df[column] > 0])
            down[name] = set(stable_ids[df[column] < 0])
        plt.figure(figsize=(4, 4))
        venn.venn(up)
        plt.savefig(str(output_prefix) + ".up.png", dpi=72)
        plt.figure(figsize=(4, 4))
        venn.venn(down)
        plt.savefig(str(output_prefix) + ".down.png", dpi=72)

    return (
        ppg.MultiFileGeneratingJob(
            [str(output_prefix) + ".up.png", str(output_prefix) + ".down.png"], plot
        )
        .depends_on([x.add_annotator(x.venn_annotator) for x in a_dict.values()])
        .depends_on(ppg.ParameterInvariant(output_prefix, id_column))
    )
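For orientation, a minimal self-contained sketch of the plotting core in plot() above. It assumes the venn package (as used by plot()) whose venn.venn() takes a dict mapping set names to sets; the gene sets and output path here are made up:

import matplotlib.pyplot as plt
import venn  # assumption: the same "venn" package plot() relies on

# Toy stand-ins for the up-regulated sets built from log2FC > 0 above.
up = {"a_vs_ctrl": {"g1", "g2"}, "b_vs_ctrl": {"g2", "g3"}}
plt.figure(figsize=(4, 4))
venn.venn(up)  # draws a 2-set Venn diagram onto the current figure
plt.savefig("venn.up.png", dpi=72)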
def test_multifilegenerating_job_requires_string_filenames(self):
    import pathlib

    x = lambda: 5  # noqa:E731
    ppg.MultiFileGeneratingJob(["a"], x)
    ppg.MultiFileGeneratingJob([pathlib.Path("a")], x)

    def inner():
        ppg.MultiFileGeneratingJob([0], x)

    assertRaises(TypeError, inner)

    def inner():
        ppg.MultiFileGeneratingJob([b"a"], x)  # bytes is not a string type

    assertRaises(TypeError, inner)
def save_input(self):
    """Also store the filtered input in files for later reference"""
    import gzip

    temp_job = self.prepare_input()
    output_dir = self.result_dir / "aligner_input"
    output_dir.mkdir(exist_ok=True)
    output_names = [output_dir / (Path(x).name + ".gz") for x in temp_job.filenames]
    pairs = zip(temp_job.filenames, output_names)

    def do_store():
        block_size = 10 * 1024 * 1024
        for input_filename, output_filename in pairs:
            op = open(input_filename, "rb")
            op_out = gzip.GzipFile(output_filename, "wb")
            f = op.read(block_size)
            while f:
                op_out.write(f)
                f = op.read(block_size)
            op_out.close()
            op.close()

    return ppg.MultiFileGeneratingJob(output_names, do_store).depends_on(temp_job)
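The chunked copy loop in do_store() can be written more defensively with context managers; shutil.copyfileobj performs the same block-wise read/write. A minimal standalone sketch (the function name is illustrative):

import gzip
import shutil

def gzip_copy(src, dst, block_size=10 * 1024 * 1024):
    # Same chunked copy as do_store(), but the handles are closed
    # even if a read or write fails part-way through.
    with open(src, "rb") as fin, gzip.GzipFile(dst, "wb") as fout:
        shutil.copyfileobj(fin, fout, block_size)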
def align_job(
    self,
    input_fastq,
    paired_end_filename,
    index_basename,
    output_bam_filename,
    parameters,
):
    def align():
        with open(output_bam_filename, "w") as op:
            json.dump(
                [
                    open(input_fastq).read(200),
                    open(paired_end_filename).read(200)
                    if paired_end_filename
                    else "",
                    index_basename,
                    str(parameters),
                ],
                op,
            )
        with open(str(output_bam_filename) + ".bai", "w") as op:
            op.write("Done")

    job = ppg.MultiFileGeneratingJob(
        [output_bam_filename, str(output_bam_filename) + ".bai"], align
    )
    job.depends_on_params("")
    return job
def register_qc_pca(self):
    output_filename = self.result_dir / "pca.png"

    def plot():
        import sklearn.decomposition as decom

        pca = decom.PCA(n_components=2, whiten=False)
        data = self.get_df()
        # min-max scaling 0..1 per gene
        data = data.sub(data.min(axis=1), axis=0)
        data = data.div(data.max(axis=1), axis=0)
        data = data[~pd.isnull(data).any(axis=1)]  # can't do PCA on NaN values
        pca.fit(data.T)
        xy = pca.transform(data.T)
        title = "PCA %s (%s)\nExplained variance: x %.2f%%, y %.2f%%" % (
            self.ddf.name,
            self.find_variable_name(),
            pca.explained_variance_ratio_[0] * 100,
            pca.explained_variance_ratio_[1] * 100,
        )
        plot_df = pd.DataFrame(
            {
                "x": xy[:, 0],
                "y": xy[:, 1],
                "label": [self.get_plot_name(c) for (a, c) in self.samples],
                "group": [self.sample_column_to_group[c] for (a, c) in self.samples],
            }
        )
        p = dp(plot_df).p9().theme_bw().add_scatter("x", "y", color="group")
        if data.shape[1] < 15:
            p = p.add_text(
                "x",
                "y",
                "label",
                _alpha=0.5,
                # _adjust_text={
                #     "expand_points": (2, 2),
                #     "arrowprops": {"arrowstyle": "->", "color": "darkgrey"},
                # },
            )
        p = (
            p.scale_color_many_categories()
            .title(title)
            .render(output_filename, width=8, height=6, dpi=72)
        )
        plot_df.to_csv(output_filename.with_suffix(".tsv"), sep="\t")

    return register_qc(
        ppg.MultiFileGeneratingJob(
            [output_filename, output_filename.with_suffix(".tsv")], plot
        ).depends_on(self.deps())
    )
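To make the scaling step concrete: each gene (row) is shifted by its row minimum and then divided by the resulting row maximum, so every row spans exactly 0..1 before the PCA (constant rows become NaN and are dropped by the filter that follows in plot()). A tiny self-contained illustration:

import pandas as pd

df = pd.DataFrame({"s1": [1.0, 10.0], "s2": [3.0, 20.0]})
df = df.sub(df.min(axis=1), axis=0)  # per-row minimum becomes 0
df = df.div(df.max(axis=1), axis=0)  # per-row maximum becomes 1
assert (df.min(axis=1) == 0).all() and (df.max(axis=1) == 1).all()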
def align_job(
    self,
    input_fastq,
    paired_end_filename,
    index_basename,
    output_bam_filename,
    parameters,
):
    job = ppg.MultiFileGeneratingJob(
        [output_bam_filename, str(output_bam_filename) + ".bai"], lambda: 5
    )
    # job.depends_on_params("")  # that's the line we check
    return job
def call(self):
    """
    Creates the vcf generating job.

    Creates a pypipegraph Job that does the variant calling and takes care
    of preprocessing and dependencies.

    Returns
    -------
    pypipegraph.FileGeneratingJob
        The job that does the variant calling.
    """
    run_callable = self.caller.run()

    def run():
        run_callable(self.input_samples, self.output_file)

    job = ppg.FileGeneratingJob(self.output_file, run, empty_ok=False)
    # job can depend on preprocessor dependencies, caller dependencies
    # and the preprocessor job
    job.depends_on(self.caller.get_dependencies())
    lanes_loaded = [
        input_sample.load()
        for sample_list in self.input_samples
        for input_sample in sample_list
    ]
    # If a SNP caller needs some preparation, this is the place to do it.
    if self.caller.preprocessor is not None:
        job.depends_on(self.caller.preprocessor.get_dependencies())
        job.depends_on(self.caller.preprocessor.prerequisite_jobs())
        preprocessor_output = self.caller.preprocessor.get_preprocessed_output(
            self.input_samples
        )
        if len(preprocessor_output) > 0:
            preprocessor_job = ppg.MultiFileGeneratingJob(
                preprocessor_output,
                self.caller.preprocessor.preprocess(self.input_samples),
            )
            preprocessor_job.depends_on(lanes_loaded)
            preprocessor_job.depends_on(
                self.caller.preprocessor.get_dependencies()
            ).depends_on(self.caller.preprocessor.prerequisite_jobs())
            job.depends_on(preprocessor_job)
    job.depends_on(lanes_loaded)
    job.depends_on(
        ppg.FunctionInvariant(
            f"{self.caller.__class__.__name__}.run",
            self.caller.__class__.run,
        )
    )
    for sample_list in self.input_samples:
        for input_sample in sample_list:
            job.depends_on(input_sample.load())
    job.cores_needed = self.caller.get_cores_needed()
    return job
def download_files(self):
    result = []
    for url, target_fn in zip(self.urls, self.target_files):

        def download(url=url, target_fn=target_fn):
            Path(target_fn).parent.mkdir(exist_ok=True, parents=True)
            target_fn.with_name(target_fn.name + ".url").write_text(url)
            with open(str(target_fn) + "_temp", "wb") as op:
                download_file(url, op)
            shutil.move(str(target_fn) + "_temp", target_fn)

        job = ppg.MultiFileGeneratingJob(
            [target_fn, target_fn.with_name(target_fn.name + ".url")], download
        )
        result.append(job)
    return result
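Note the url=url, target_fn=target_fn defaults in download() above: closures created in a loop capture variables, not values, so without that binding every job would download the final URL. A minimal self-contained demonstration of the pitfall and the fix:

# Without default-argument binding, every callback sees the loop's last value.
wrong = [lambda: url for url in ("a", "b")]
right = [lambda url=url: url for url in ("a", "b")]
assert [f() for f in wrong] == ["b", "b"]
assert [f() for f in right] == ["a", "b"]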
def test_creating_index_for_mfg(self):
    def gen():
        shutil.copy(get_sample_data(Path("mbf_align/ex2.bam")), "sample.bam")

    ppg.util.global_pipegraph.quiet = False
    job = ppg.MultiFileGeneratingJob(["sample.bam"], gen)
    genome = object()
    lane = mbf_align.AlignedSample("test_lane", job, genome, False, "AA123")
    assert isinstance(lane.load()[1], ppg.FileGeneratingJob)
    assert lane.load()[0] in lane.load()[1].prerequisites
    ppg.run_pipegraph()
    assert Path("sample.bam").exists()
    assert Path("sample.bam.bai").exists()
def job_reheader_and_rename_chromosomes(input_bam_path, output_bam_path, replacements):
    input_bam_path = Path(input_bam_path)
    output_bam_path = Path(output_bam_path)

    def do_replace(replacements=replacements):
        reheader_and_rename_chromosomes(input_bam_path, output_bam_path, replacements)

    output_bam_path.parent.mkdir(exist_ok=True, parents=True)
    return ppg.MultiFileGeneratingJob(
        [output_bam_path, output_bam_path.with_suffix(".bam.bai")], do_replace
    ).depends_on(
        ppg.FileInvariant(input_bam_path),
        ppg.FunctionInvariant(
            "mbf_bam.reheader_and_rename_chromosomes",
            reheader_and_rename_chromosomes,
        ),
    )
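A hedged usage sketch of the helper above, assuming an active pipegraph and an existing input BAM; the paths and the renaming map are hypothetical:

job = job_reheader_and_rename_chromosomes(
    "input/sample.bam",            # hypothetical input path
    "results/sample.renamed.bam",  # hypothetical output path
    {"1": "chr1", "MT": "chrM"},   # hypothetical old -> new chromosome names
)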
def test_accepts(self):
    import pathlib

    write("aaa", "hello")
    write("bbb", "hello")
    write("ccc", "hello")
    a = ppg.FileTimeInvariant(pathlib.Path("aaa"))
    a1 = ppg.MultiFileInvariant([pathlib.Path("bbb"), "ccc"])
    b = ppg.FileGeneratingJob(
        pathlib.Path("b"),
        lambda of: write(of, "bb" + read("aaa") + read("bbb") + read("ccc")),
    )
    b.depends_on(a)
    b.depends_on(a1)
    dd = Dummy()

    def mf():
        write("c", "cc" + read("g"))
        write("d", "dd" + read("h") + dd.attr)
        write("e", "ee" + read("i") + read("j"))

    c = ppg.MultiFileGeneratingJob([pathlib.Path("c"), "d", pathlib.Path("e")], mf)
    c.depends_on(b)
    d = ppg.FunctionInvariant(pathlib.Path("f"), lambda x: x + 1)
    c.depends_on(d)
    e = ppg.ParameterInvariant(pathlib.Path("c"), "hello")
    c.depends_on(e)
    f = ppg.TempFileGeneratingJob(pathlib.Path("g"), lambda: write("g", "gg"))
    c.depends_on(f)

    def tmf():
        write("h", "hh")
        write("i", "ii")

    g = ppg.MultiTempFileGeneratingJob([pathlib.Path("h"), "i"], tmf)
    c.depends_on(g)

    def tpf():
        write("j", "jjjj")
        write("k", "kkkk")

    h = ppg.TempFilePlusGeneratingJob(pathlib.Path("j"), pathlib.Path("k"), tpf)
    c.depends_on(h)
    i = ppg.CachedDataLoadingJob(
        pathlib.Path("l"), lambda: write("l", "llll"), lambda res: res
    )
    c.depends_on(i)
    m = ppg.CachedAttributeLoadingJob(pathlib.Path("m"), dd, "attr", lambda: "55")
    c.depends_on(m)
    ppg.run_pipegraph()
    assert read("aaa") == "hello"
    assert read("b") == "bbhellohellohello"
    assert read("c") == "ccgg"
    assert read("d") == "ddhh55"
    assert read("e") == "eeiijjjj"
    assert not (os.path.exists("g"))
    assert not (os.path.exists("h"))
    assert not (os.path.exists("i"))
    assert not (os.path.exists("j"))
    assert read("k") == "kkkk"
def test_lane_raises_on_multifilegeneratingJobWithNoBAM(self):
    mfg = ppg.MultiFileGeneratingJob(["a.sam"], lambda: 5)
    genome = object()
    with pytest.raises(ValueError):
        mbf_align.AlignedSample("test_lane", mfg, genome, False, "AA123")
def run_exports(gen_additional_jobs=None, handle_ppg=True, settings="ovca"):
    if settings == "ovca":
        apply_ovca_settings()
    else:
        raise ValueError("unknown settings value", settings)
    old = Path(os.getcwd()).absolute()
    os.chdir("/project")
    if handle_ppg:
        ppg.new_pipegraph()
    # os.chdir(old)
    to_wide_columns = {}
    jobs = []
    for cls in exporting_classes:
        instance = cls()
        if hasattr(instance, "exports"):
            instance.exports()
        out_prefix = getattr(instance, "out_prefix", "")
        for method_name in dir(instance):
            method = getattr(instance, method_name)
            if hasattr(method, "_output_name"):
                print(cls.__name__, method.__name__)
                output_filename = (
                    "/project/processed/" + out_prefix + method._output_name + ".units"
                )
                cwd = str(Path(method._abs_filename).parent)

                def write(output_filename=output_filename, method=method, cwd=cwd):
                    os.chdir(cwd)
                    df = method()
                    os.chdir("/project")
                    check_dataframe(out_prefix + method._output_name, df)
                    Path(output_filename).parent.mkdir(exist_ok=True, parents=True)
                    if "unit" in df:
                        for ii, (unit, sub_df) in enumerate(
                            df.groupby("unit", sort=True)
                        ):
                            try:
                                sub_df.to_parquet(
                                    output_filename[: output_filename.rfind(".")]
                                    + "."
                                    + str(ii)
                                    + ".parquet"
                                )
                            except Exception:
                                # keep a copy for debugging, then re-raise
                                sub_df.to_pickle("debug.pickle")
                                raise
                        Path(output_filename).write_text(
                            json.dumps(sorted(df.unit.unique()))
                        )
                    else:
                        df.to_parquet(
                            output_filename[: output_filename.rfind(".")]
                            + ".0.parquet"
                        )
                        Path(output_filename).write_text(json.dumps(["nounit"]))
                    Path(output_filename + ".desc").write_text(method._description)

                job = ppg.MultiFileGeneratingJob(
                    [output_filename, output_filename + ".desc"], write
                )
                job.depends_on(
                    ppg.FunctionInvariant(output_filename + "_inner_func", method)
                )
                if method._input_files:
                    job.depends_on(ppg.MultiFileInvariant(method._input_files))
                if method._deps:
                    if hasattr(method._deps, "__call__"):
                        deps = method._deps(method.__self__)
                    else:
                        deps = method._deps
                    job.depends_on(deps)
                print(output_filename)
                print("")
                os.chdir("/project")
                jobs.append(job)
                to_wide_columns[out_prefix + method._output_name] = method._wide_columns

    def dump_to_wide_columns(output_filename):
        Path(output_filename).write_text(json.dumps(to_wide_columns))

    jobs.append(
        ppg.FileGeneratingJob(
            "/project/processed/_to_wide_columns.json", dump_to_wide_columns
        ).depends_on(
            ppg.ParameterInvariant(
                "/project/processed/_to_wide_columns.json",
                ppg.util.freeze(to_wide_columns),
            )
        )
    )
    old = Path(os.getcwd()).absolute()
    if handle_ppg:
        os.chdir("/project")
        ppg.run_pipegraph()
    os.chdir(old)
    return jobs