def test_unpickle_bug_prevents_single_job_from_unpickling(self):
    def do_a():
        write("out/A", "A")
        append("out/As", "A")

    ppg.FileGeneratingJob("out/A", do_a)

    def do_b():
        write("out/B", "A")
        append("out/Bs", "A")

    job_B = ppg.FileGeneratingJob("out/B", do_b)
    cd = CantDepickle()
    job_parameter_unpickle_problem = ppg.ParameterInvariant("C", (cd,))
    job_B.depends_on(job_parameter_unpickle_problem)
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/As") == "A"
    assert read("out/B") == "A"
    assert read("out/Bs") == "A"
    print("second run")
    ppg.new_pipegraph(dump_graph=False)

    ppg.FileGeneratingJob("out/A", do_a)
    job_B = ppg.FileGeneratingJob("out/B", do_b)
    job_parameter_unpickle_problem = ppg.ParameterInvariant("C", (cd,))
    job_B.depends_on(job_parameter_unpickle_problem)
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/As") == "A"
    assert read("out/B") == "A"
    # this one got rerun because we could not load the invariant...
    assert read("out/Bs") == "AA"
def test_filegen_invalidated_jobgen_created_filegen_later_also_invalidated(
    self, new_pipegraph
):
    a = ppg.FileGeneratingJob("out/A", lambda: writeappend("out/A", "out/Ac", "A"))
    p = ppg.ParameterInvariant("p", "p")
    a.depends_on(p)

    def gen():
        c = ppg.FileGeneratingJob(
            "out/C", lambda: writeappend("out/C", "out/Cx", "C")
        )
        c.depends_on(a)

    ppg.JobGeneratingJob("b", gen)
    ppg.run_pipegraph()
    assert read("out/A") == "A"
    assert read("out/Ac") == "A"
    assert read("out/C") == "C"
    assert read("out/Cx") == "C"

    new_pipegraph.new_pipegraph()
    a = ppg.FileGeneratingJob("out/A", lambda: writeappend("out/A", "out/Ac", "A"))
    p = ppg.ParameterInvariant("p", "p2")
    a.depends_on(p)
    ppg.JobGeneratingJob("b", gen)
    ppg.run_pipegraph()
    assert read("out/Ac") == "AA"
    assert read("out/Cx") == "CC"
def __init__(
    self,
    genes_or_dataframe: Union[Genes, DataFrame],
    phenotypes: Tuple[str, str],
    columns_a_b: Tuple[List[str], List[str]],
    name: str = "Gct_df_default",
    dependencies: List[Job] = [],
):
    # copy, so the appends below never mutate the caller's list
    # (or the shared mutable default)
    self.dependencies = list(dependencies)
    if isinstance(genes_or_dataframe, Genes):
        self.name = (
            f"Gct_{genes_or_dataframe.name}_{phenotypes[0]}_vs_{phenotypes[1]}"
        )
        self.dependencies.append(genes_or_dataframe.load())
    else:
        self.name = f"{name}_{phenotypes[0]}_vs_{phenotypes[1]}"
    self.cache_dir = Path("cache") / "gct" / self.name
    self.columns_a_b = columns_a_b
    self.phenotypes = phenotypes
    self.genes_or_dataframe = genes_or_dataframe
    self.dependencies.append(
        ppg.ParameterInvariant(
            self.name,
            list(phenotypes) + columns_a_b[0] + columns_a_b[1] + [self.name],
        )
    )
    self._gct = self.cache_dir / "input.gct"
def calc_regions(self):
    def calc():
        return self.do_calc_regions()

    key = hashlib.md5(
        ",".join(
            [self.gr_to_draw.name, self.region_strategy.name]
            # sorted, so the key is stable across runs (set iteration order is not)
            + sorted(set([x.name for x in self.lanes_to_draw]))
        ).encode()
    ).hexdigest()
    # technically, we could share the regions job between heatmaps with the
    # same regions but different lanes, but we're using a
    # CachedAttributeLoadingJob and that would... complicate things quite a bit
    of = self.cache_dir / "regions" / key
    of.parent.mkdir(exist_ok=True, parents=True)
    return ppg.CachedAttributeLoadingJob(of, self, "regions_", calc).depends_on(
        [
            ppg.ParameterInvariant(
                of, (self.region_strategy.name, self.gr_to_draw.name)
            ),
            ppg.FunctionInvariant(
                "genomics.regions.heatmap."
                + self.region_strategy.name
                + "calc_func",
                self.region_strategy.__class__.calc,
            ),
        ]
        + self.region_strategy.get_dependencies(self.gr_to_draw)
    )
def FromDifference(name, a, b, sheet_name="Differences"):
    """a minus b"""

    def do_load(df):
        remove_ids = set(b.df["gene_stable_id"])
        keep = ~np.array(
            [stable_id in remove_ids for stable_id in a.df["gene_stable_id"]],
            dtype=bool,  # np.bool was removed in numpy 1.24; plain bool is equivalent
        )
        return keep

    if a.load_strategy.build_deps:
        deps = [
            a.load(),
            b.load(),
            # so if you swap out the gr, it's detected...
            ppg.ParameterInvariant("Genes_%s_parents" % name, (a.name, b.name)),
        ]
    else:
        deps = []
    res = a.filter(
        name,
        do_load,
        dependencies=deps,
        sheet_name=sheet_name,
        vid=["Difference", a.vid, b.vid],
    )
    res.parent = a
    return res
def calc_order(self):
    def calc():
        return self.do_calc_order()

    of = self.cache_dir / "order"
    deps = self.order_strategy.get_dependencies(
        self.heatmap.gr_to_draw, self.heatmap.lanes_to_draw
    )
    if len(deps) == 2:
        order_deps, order_params = deps
        order_func = None
    else:
        order_deps, order_params, order_func = deps
    return ppg.CachedAttributeLoadingJob(of, self, "order_", calc).depends_on(
        [
            self.heatmap.calc_raw_data(),
            self.calc_norm_data(),
            ppg.ParameterInvariant(of, (self.order_strategy.name,) + order_params),
            ppg.FunctionInvariant(of.name + "_secondary_func", order_func),
            ppg.FunctionInvariant(
                "genomics.regions.heatmap."
                + self.order_strategy.name
                + "calc_func",
                self.order_strategy.__class__.calc,
            ),
        ]
        + order_deps
    )
def plot(self):
    normed = self.normed_ddf(self.ddf)
    ordered = self.ordered_ddf(normed)
    names = self.handle_names()

    def plot():
        p = self.plot_strategy.plot(ordered.df, names, self.plot_options)
        self.plot_strategy.render(str(self.output_filename), p)

    if ppg.inside_ppg():
        ppg.util.global_pipegraph.quiet = False
        deps = [
            ordered.load(),
            ppg.FunctionInvariant(
                "mbf_heatmap." + self.plot_strategy.name + "plot_func",
                self.plot_strategy.__class__.plot,
            ),
            ppg.FunctionInvariant(
                # "mbf_heatmap." with the dot, matching the plot_func invariant above
                "mbf_heatmap." + self.plot_strategy.name + "render_func",
                self.plot_strategy.__class__.render,
            ),
            ppg.ParameterInvariant(
                self.output_filename, freeze((self.names, self.plot_options))
            ),
        ]
        return ppg.FileGeneratingJob(self.output_filename, plot).depends_on(deps)
    else:
        plot()
        return self.output_filename
def __init__(
    self,
    collection_name: str,
    genome: EnsemblGenome,
    version: str = "7.1",
    subset: str = "all",
):
    name = ".".join([collection_name, subset, "v" + version])
    super().__init__(name, genome)
    if int(self.genome.revision) < 97:
        raise ValueError("Please use an Ensembl Genome from revision 97 onward.")
    try:
        v = float(version)
    except ValueError:
        raise ValueError(
            f"Cannot understand version {version}. It should be something like 7.1."
        )
    if v < 7:
        raise ValueError(
            f"MSigDB Ensembl mapping only works for version 7.0 onward. Version was {version}."
        )
    self.version = version
    self.subset = subset
    self.collection_name = name
    self.input_file = self.cache_dir / (self.name + ".gmt")
    self.dependencies = [
        ppg.ParameterInvariant(
            self.name, [self.collection_name, self.version, self.subset]
        )
    ]
def plot_venn_from_genes_with_comparisons(
    output_prefix, a_dict, id_column="gene_stable_id"
):
    if len(a_dict) not in (2, 3):
        raise ValueError("Only 2 or 3 sets are currently supported")

    def plot():
        up = {}
        down = {}
        for name, genes_ddf in sorted(a_dict.items()):
            df = genes_ddf.df
            stable_ids = df[id_column]
            column = genes_ddf.venn_annotator["log2FC"]
            up[name] = set(stable_ids[df[column] > 0])
            down[name] = set(stable_ids[df[column] < 0])
        plt.figure(figsize=(4, 4))
        venn.venn(up)
        plt.savefig(str(output_prefix) + ".up.png", dpi=72)
        plt.figure(figsize=(4, 4))
        venn.venn(down)
        plt.savefig(str(output_prefix) + ".down.png", dpi=72)

    return (
        ppg.MultiFileGeneratingJob(
            [str(output_prefix) + ".up.png", str(output_prefix) + ".down.png"], plot
        )
        .depends_on([x.add_annotator(x.venn_annotator) for x in a_dict.values()])
        .depends_on(ppg.ParameterInvariant(output_prefix, id_column))
    )
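# A hedged usage sketch for the venn plotter above; genes_a and genes_b are
# hypothetical Genes objects that already carry a venn_annotator exposing a
# "log2FC" column, as the plot function expects.
genes_by_condition = {"condition_a": genes_a, "condition_b": genes_b}
venn_job = plot_venn_from_genes_with_comparisons("out/venn_a_vs_b", genes_by_condition)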
def GenomicRegions_Union(name, list_of_grs, summit_annotator=None, sheet_name="Overlaps"):
    """Combine several GRs into one. Do not set on_overlap."""
    verify_same_genome(list_of_grs)

    def load():
        dfs = [x.df[["chr", "start", "stop"]] for x in list_of_grs]
        return pd.concat(dfs, axis=0)

    if ppg.inside_ppg():
        deps = [x.load() for x in list_of_grs]
        deps.append(
            ppg.ParameterInvariant(
                name + "_input_grs", list(sorted([x.name for x in list_of_grs]))
            )
        )
    else:
        deps = []
    vid = ("union", [x.vid for x in list_of_grs])
    return GenomicRegions(
        name,
        load,
        deps,
        list_of_grs[0].genome,
        on_overlap="merge",
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
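# Usage sketch: unioning two hypothetical GRs (peaks_a, peaks_b). Because the
# ParameterInvariant hashes the sorted input names, swapping an input for a
# differently-named one invalidates the cached union.
union = GenomicRegions_Union("peaks_union", [peaks_a, peaks_b])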
def test_random_same_number(self):
    def sample_data():
        return pd.DataFrame(
            {
                "chr": ["1", "2", "1"],
                "start": [10, 100, 1000],
                "stop": [12, 110, 1110],
                "column_that_will_disappear": ["A", "b", "c"],
            }
        )

    def convert(df):
        res = df[["chr", "start", "stop"]]
        res = res.assign(start=res["start"] + 1)
        return res

    if ppg.inside_ppg():
        # ("hello") is just the string "hello"; a one-tuple needs the trailing comma
        deps = [ppg.ParameterInvariant("shuParam", ("hello",))]
    else:
        deps = []
    a = regions.GenomicRegions("sharum", sample_data, [], get_genome_chr_length())
    a.add_annotator(Constant("Constant", 5))
    a.annotate()
    b = a.convert("a+1", convert, dependencies=deps)
    force_load(b.load())
    for d in deps:
        assert d in b.load().lfg.prerequisites
    run_pipegraph()
    assert len(a.df) == len(b.df)
    assert (a.df["start"] == b.df["start"] - 1).all()
    assert "column_that_will_disappear" in a.df.columns
    assert "column_that_will_disappear" not in b.df.columns
def get_dependencies(self):
    import mbf_bam

    return super().get_dependencies() + [
        self.other_alignment.load(),
        ppg.ParameterInvariant(
            "SubtractOtherLane.mbf_bam.version", mbf_bam.__version__
        ),
    ]
def deps(self):
    import rpy2.robjects as ro

    ro.r("library('DESeq2')")
    version = str(ro.r("packageVersion")("DESeq2"))
    return ppg.ParameterInvariant(
        self.__class__.__name__ + "_" + self.name, (version,)
    )
def deps(self):
    import rpy2.robjects as ro

    ro.r("library('edgeR')")
    version = str(ro.r("packageVersion")("edgeR"))
    return ppg.ParameterInvariant(
        self.__class__.__name__ + "_" + self.name,
        (version, self.ignore_if_max_count_less_than),
    )
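# The two deps() methods above pin an R package version into a
# ParameterInvariant, so results are recomputed after an upgrade. The same idea
# works for any dependency; a minimal sketch with numpy as an arbitrary
# stand-in (the invariant name here is illustrative):
import numpy

numpy_version_dep = ppg.ParameterInvariant(
    "MyAnalysis_numpy_version", (numpy.__version__,)
)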
def get_dependencies(self):
    return [
        ppg.ParameterInvariant(
            self.name + "parameters",
            [self.no_of_clusters, self.threshold, self.affinity, self.linkage],
        )
    ]
def deps(self):
    input_columns = []
    for k in sorted(self.groups_to_samples):
        for ac in self.groups_to_samples[k]:
            input_columns.append(ac[1])

    return [
        self.ddf.add_annotator(ac[0]) for ac in self.samples if ac[0] is not None
    ] + [
        self.ddf.load(),
        ppg.ParameterInvariant(self.name, freeze(input_columns)),
    ]  # you might be working with an annotator-less ddf after all
def __init__(self, phenotypes: Tuple[str, str], columns_a_b: Tuple[List[str], List[str]]):
    self.name = f"Cls_{phenotypes[0]}_vs_{phenotypes[1]}"
    self.cache_dir = Path("cache") / "cls" / self.name
    self.columns_a_b = columns_a_b
    self.phenotypes = phenotypes
    self.dependencies = [
        ppg.ParameterInvariant(
            self.name, list(phenotypes) + columns_a_b[0] + columns_a_b[1]
        )
    ]
    self._cls = self.cache_dir / "input.cls"
def generate_download_jobs():
    jobs = []
    global urls  # we use a global to communicate with the later job.
    # Please note that this is only possible in job generating and data
    # loading jobs, not in the output jobs.
    urls = retrieve_urls()
    for url in urls:
        jobs.append(download_job(url))
    jobs.append(
        # this makes sure that if you remove urls, the output job would also
        # rerun. Adding urls not seen before would make it rerun either way.
        pypipegraph.ParameterInvariant("retrieved_urls", urls)
    )
    return jobs
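# Hypothetical implementations of the two helpers the generator above calls,
# sketched only to make the pattern self-contained; the real ones would live
# elsewhere and do actual work.
def retrieve_urls():
    # e.g. query an index page; a fixed list here for illustration
    return ["https://example.com/a.txt", "https://example.com/b.txt"]


def download_job(url):
    target = "incoming/" + url.split("/")[-1]

    def download():
        import urllib.request

        urllib.request.urlretrieve(url, target)

    return pypipegraph.FileGeneratingJob(target, download)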
def __init__(self, urls):
    if isinstance(urls, str):
        urls = [urls]
    self.urls = sorted(urls)
    self.target_files = self.name_files()
    self.jobs = self.download_files()
    self.dependencies = self.jobs + [
        ppg.ParameterInvariant(
            hashlib.md5(("".join(self.urls)).encode("utf8")).hexdigest(),
            sorted(self.urls),
        )
    ]
def get_dependencies(self):
    res = []
    for peakset in self._get_regions():
        res.append(peakset.load())
        res.append(peakset.write_bigbed()[0])
        res.append(peakset.write()[0])
    res.append(
        ppg.ParameterInvariant(
            self.get_filename(),
            tuple(sorted([x.name for x in self._get_regions()])),
        )
    )
    return res
def generate_stitched_fastq(
    output_file: Path,
    r1: Path,
    r2: Path,
    dependencies: List[Job] = [],
    options: Dict[str, str] = {},
):
    """
    generate_stitched_fastq wrapper for ngmerge.

    Parameters
    ----------
    output_file : Path
        Output file path for the new fastq file.
    r1 : Path
        Path to R1 file.
    r2 : Path
        Path to R2 file.
    dependencies : List[Job], optional
        List of dependencies, by default [].
    options : Dict[str, str], optional
        Additional options to pass to ngmerge, by default {}.

    Returns
    -------
    Job
        FileGeneratingJob that creates the merged fastq file.
    """
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # copy, so we don't mutate the caller's list (or the shared mutable default)
    deps = list(dependencies)
    # hash option values as well as keys, so changing a value also invalidates
    deps.append(ppg.ParameterInvariant(f"PI_{output_file}", sorted(options.items())))

    def __dump():
        if not output_file.exists():
            cmd = [
                "/project/code/NGmerge/NGmerge",
                "-1",
                str(r1),
                "-2",
                str(r2),
                "-s",
                "-o",
                str(output_file),
            ]
            for k, v in options.items():
                if v == "":
                    cmd.append(k)
                else:
                    cmd.extend([k, v])
            print(" ".join(cmd))
            subprocess.check_call(cmd)

    job = ppg.FileGeneratingJob(output_file, __dump).depends_on(deps)
    return job
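# Usage sketch under the wrapper's own assumptions (NGmerge at the hard-coded
# path, gzipped R1/R2 on disk). "-m"/"20" is an illustrative ngmerge option
# (minimum overlap), not taken from the source.
stitch_job = generate_stitched_fastq(
    Path("results/stitched/sample1.fastq.gz"),
    Path("incoming/sample1_R1.fastq.gz"),
    Path("incoming/sample1_R2.fastq.gz"),
    options={"-m": "20"},
)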
def allow_access(self, groups):
    """Write the permissions file that says which groups may access this project."""
    filename = Path("web/permissions.dat")

    def do_dump(groups=groups):
        if not hasattr(groups, "__iter__"):
            groups = [groups]
        filename.parent.mkdir(exist_ok=True)
        filename.write_text("\n".join(groups) + "\n")

    return ppg.FileGeneratingJob(filename, do_dump).depends_on(
        ppg.ParameterInvariant(filename, groups)
    )
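# Usage sketch; `project` is a hypothetical object exposing allow_access.
# Because the group list is baked into the ParameterInvariant, changing it
# rewrites web/permissions.dat on the next pipegraph run.
project.allow_access(["lab_members", "collaborators"])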
def align_job(
    self,
    input_fastq,
    paired_end_filename,
    index_basename,
    output_bam_filename,
    parameters,
):
    cmd = [
        "FROM_ALIGNER",
        str(
            self.path
            / f"STAR-{self.version}"
            / "bin"
            / "Linux_x86_64_static"
            / "STAR"
        ),
        "--genomeDir",
        Path(index_basename).absolute(),
        "--genomeLoad",
        "NoSharedMemory",
        "--readFilesIn",
    ]
    if "," in str(input_fastq) or (
        paired_end_filename and "," in str(paired_end_filename)
    ):  # pragma: no cover
        raise ValueError("STAR does not handle fastq filenames with a comma")
    if paired_end_filename:
        cmd.extend(
            [
                '"%s"' % Path(paired_end_filename).absolute(),
                '"%s"' % Path(input_fastq).absolute(),
            ]
        )
    else:
        cmd.extend([Path(input_fastq).absolute()])
    cmd.extend(["--outSAMtype", "BAM", "SortedByCoordinate"])
    for k, v in parameters.items():
        cmd.append(k)
        cmd.append(str(v))

    def rename_after_alignment():
        ob = Path(output_bam_filename)
        (ob.parent / "Aligned.sortedByCoord.out.bam").rename(ob.parent / ob.name)

    job = self.run(
        Path(output_bam_filename).parent,
        cmd,
        cwd=Path(output_bam_filename).parent,
        call_afterwards=rename_after_alignment,
        additional_files_created=[output_bam_filename],
    )
    job.depends_on(
        ppg.ParameterInvariant(output_bam_filename, sorted(parameters.items()))
    )
    return job
def __init__(
    self,
    name,
    scaler=None,
    imputer=None,
    missing_value=np.nan,
    cluster_columns=False,
    eps=0.5,
    min_samples=5,
    metric="euclidean",
    algorithm="auto",
    leaf_size=30,
    p=None,
    n_jobs=1,
    dependencies=None,
):
    """
    This is a wrapper for DBSCAN.

    @param eps maximum neighborhood distance
    @param min_samples minimum number of neighbors for a point to be a valid core point
    @param metric distance metric, allowed is string (metrics.pairwise.calculate_distance) or callable
    @param algorithm nearest neighbor algorithm, allowed is 'auto', 'ball_tree', 'kd_tree' or 'brute'
    @param leaf_size leaf_size passed to ball_tree or kd_tree
    @param p power for minkowski metric to calculate distances
    """
    self.eps = eps
    self.name = name
    self.min_samples = min_samples
    self.metric = metric
    self.algorithm = algorithm
    self.leaf_size = leaf_size
    self.p = p
    self.n_jobs = n_jobs
    # a None default avoids mutating a shared default list across instances
    if dependencies is None:
        dependencies = []
    dependencies += [
        ppg.ParameterInvariant(
            self.name + "_parameters",
            # the original listed p twice; once is enough
            [eps, min_samples, metric, algorithm, leaf_size, p],
        ),
        ppg.FunctionInvariant(self.name + "_fit", self.fit),
    ]
    ClusteringMethod.__init__(self, name)
    self.clustering = sklearn.cluster.DBSCAN(
        eps=self.eps,
        min_samples=self.min_samples,
        metric=self.metric,
        algorithm=self.algorithm,
        leaf_size=self.leaf_size,
        p=self.p,
        n_jobs=self.n_jobs,
    )
    self.clustering.predict = self.clustering.fit_predict
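# Hypothetical instantiation (the wrapper's class name is not shown above;
# "DBSCAN" is assumed here). Everything that can change the clustering result
# ends up in the "_parameters" ParameterInvariant, so tweaking eps or
# min_samples reruns the clustering.
clusterer = DBSCAN("my_dbscan", eps=0.3, min_samples=10, metric="euclidean")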
def __call__(self):
    norm_job = self.calc_norm_data()
    order_job = self.calc_order()
    names_in_order = [
        self.handle_name(self.names, x, ii)
        for (ii, x) in enumerate(self.heatmap.lanes_to_draw)
    ]

    def plot():
        p = self.do_plot()
        self.plot_strategy.render(self.output_filename, p)

    plot_job = ppg.FileGeneratingJob(self.output_filename, plot)
    plot_job.ignore_code_changes()
    plot_job.depends_on(norm_job)
    plot_job.depends_on(order_job)
    plot_job.depends_on(
        ppg.FunctionInvariant(
            "genomics.regions._HeatmapPlot.do_plot", _HeatmapPlot.do_plot
        )
    )
    plot_job.depends_on(
        ppg.FunctionInvariant(
            "genomics.regions.heatmap." + self.plot_strategy.name + "plot_func",
            self.plot_strategy.__class__.plot,
        )
    )
    plot_job.depends_on(
        ppg.FunctionInvariant(
            "genomics.regions.heatmap." + self.plot_strategy.name + "render_func",
            self.plot_strategy.__class__.render,
        )
    )
    plot_job.depends_on(self.heatmap.gr_to_draw.load())
    plot_job.depends_on(
        ppg.ParameterInvariant(
            self.output_filename,
            self.plot_strategy.get_parameters(
                self.plot_options, self.heatmap.lanes_to_draw
            )
            + (names_in_order,),
        )
    )
    plot_job.depends_on(
        self.plot_strategy.get_dependencies(self, self.plot_options)
    )
    if hasattr(self.names, "__call__"):
        plot_job.depends_on(
            ppg.FunctionInvariant(self.output_filename + "_names", self.names)
        )
def GenomicRegions_Intersection(
    new_name, gr_a, gr_b, summit_annotator=None, sheet_name="intersection"
):
    """Create an intersection of all intervals...

    [(10, 100), (400, 450)], [(80, 120), (600, 700)] becomes [(80, 100)]

    Note that all interval-set based operations (L{union}, L{intersection},
    L{difference}) drop all columns but chr, start, stop (annotators are
    merged and readded from all sets involved)
    """
    verify_same_genome([gr_a, gr_b])

    def do_load():
        new_rows = []
        for chr, start, stop in gr_a._iter_intersections(gr_b):
            new_rows.append({"chr": chr, "start": start, "stop": stop})
        if new_rows:
            return pd.DataFrame(new_rows)
        else:
            return pd.DataFrame({"chr": [], "start": [], "stop": []})

    if gr_a.load_strategy.build_deps:
        deps = [
            gr_b.load(),
            gr_a.load(),
            # so if you swap out the gr, it's detected...
            ppg.ParameterInvariant(
                "GenomicRegions_%s_parents" % new_name, (gr_a.name, gr_b.name)
            ),
        ]
    else:
        deps = []
        gr_b.load()
    result = GenomicRegions(
        new_name,
        do_load,
        deps,
        gr_a.genome,
        on_overlap="merge",
        summit_annotator=summit_annotator,
        vid=["intersection"] + gr_a.vid + gr_b.vid,
        sheet_name=sheet_name,
    )
    return result
def write(self, output_filename=None, mangler_function=None, float_format="%4g"):
    """Job: Store the internal DataFrame (df) in a table.

    To sort, filter, remove columns, etc. before output, pass in a
    mangler_function (takes df, returns df).

    Returns a (Job, Path) tuple - job is None if outside ppg.
    """
    output_filename = self.pathify(
        output_filename, self.get_table_filename().absolute()
    )

    def write(output_filename):
        if mangler_function:
            df = mangler_function(self.df.copy())
        else:
            df = self.mangle_df_for_write(self.df)
        if str(output_filename).endswith(".xls"):
            try:
                df.to_excel(output_filename, index=False, float_format=float_format)
            except ValueError:
                df.to_csv(
                    output_filename,
                    sep="\t",
                    index=False,
                    float_format=float_format,
                )
        else:
            df.to_csv(
                output_filename,
                sep="\t",
                index=False,
                encoding="utf-8",
                float_format=float_format,
            )

    if self.load_strategy.build_deps:
        deps = [
            self.annotate(),
            ppg.FunctionInvariant(
                str(output_filename) + "_mangler", mangler_function
            ),
            ppg.ParameterInvariant(str(output_filename), float_format),
        ]
    else:
        deps = []
    return self.load_strategy.generate_file(output_filename, write, deps)
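# Usage sketch with a hypothetical `genes` object carrying the write method
# above; the mangler runs on a copy of df just before writing, so it may sort,
# filter, or drop columns freely. Unpacking follows the docstring's
# (Job, Path) contract.
write_job, write_path = genes.write(
    "out/genes_sorted.tsv",
    mangler_function=lambda df: df.sort_values("gene_stable_id"),
)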
def __init__(self, species: str = "Homo_sapiens", version: str = "7.1"):
    """
    An ensembl chip object that takes care of file download and input.chip
    generation for GSEA.

    Parameters
    ----------
    version : str, optional
        The MSigDB version, by default "7.1".
    species : str, optional
        The species, by default "Homo_sapiens". Currently only supports
        Human, Mouse and Rat data.

    Raises
    ------
    ValueError
        If an unsupported species is provided.
    """
    if species == "Homo_sapiens":
        self.species = "Human"
        self.url = f"https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/{self.species}_ENSEMBL_Gene_MSigDB.v{version}.chip"
    elif species == "Mus_musculus":
        self.species = "Mouse"
        if version == "7.0":
            self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Mouse_ENSEMBL_Gene_ID_MSigDB.v7.0.chip"
        elif version == "7.1":
            self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Mouse_ENSEMBL_Gene_ID_to_Human_Orthologs_MSigDB.v7.1.chip"
        elif version == "7.2":
            self.url = f"https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Mouse_ENSEMBL_Gene_ID_Human_Orthologs_MSigDB.v{version}.chip"
    elif species == "Rattus_norvegicus":
        self.species = "Rat"
        if version == "7.0":
            self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Rat_ENSEMBL_Gene_ID_MSigDB.v7.0.chip"
        elif version == "7.1":
            self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Rat_ENSEMBL_Gene_ID_to_Human_Orthologs_MSigDB.v7.1.chip"
        elif version == "7.2":
            self.url = f"https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Rat_ENSEMBL_Gene_ID_Human_Orthologs_MSigDB.v{version}.chip"
    else:
        raise ValueError(
            f"Currently the species {species} is not supported. Check MsigDB chip files at https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/."
        )
    self.name = self.__class__.generate_name(species, version)
    self.version = version
    self.cache_dir = Path("cache") / "chip" / self.name
    self.cache_dir.mkdir(parents=True, exist_ok=True)
    self._chip = self.cache_dir / "input.chip"
    self.dependencies = [
        ppg.ParameterInvariant(self.name, [self.species, self.version])
    ]
def get_dependencies(self):
    return [
        ppg.ParameterInvariant(
            self.name + "parameters",
            [
                self.eps,
                self.min_samples,
                self.metric,
                self.algorithm,
                self.leaf_size,
                self.p,
                self.n_jobs,
            ],
        )
    ]
def get_dependencies(self):
    deps = []
    for lane in self._get_lanes():
        deps.append(lane.load())
        # if hasattr(lane, "read_distribution_dict"):
        #     deps.append(lane.read_distribution_dict())
        # else:
        #     deps.append(lane.count_aligned_reads())
    # for lane in self._get_browser_lanes():
    #     deps.append(lane.dump_gbrowes_adjustments())
    deps.append(
        ppg.ParameterInvariant(
            self.get_filename(),
            tuple(sorted([x.name for x in self._get_lanes()]))
            + tuple(sorted([x.name for x in self._get_browser_lanes()])),
        )
    )
    return deps