def plot(self):
    """Render the heatmap to self.output_filename.

    Normalizes the ddf, orders it, then delegates drawing to the plot
    strategy. Inside a pipegraph this returns a FileGeneratingJob;
    outside it plots immediately and returns the output filename.
    """
    normed = self.normed_ddf(self.ddf)
    ordered = self.ordered_ddf(normed)
    names = self.handle_names()

    # NOTE(review): this inner function shadows the method name `plot`.
    def plot():
        p = self.plot_strategy.plot(ordered.df, names, self.plot_options)
        self.plot_strategy.render(str(self.output_filename), p)

    if ppg.inside_ppg():
        # NOTE(review): globally un-quiets the pipegraph as a side effect —
        # looks like a debugging leftover; confirm it is intentional.
        ppg.util.global_pipegraph.quiet = False
        deps = [
            ordered.load(),
            ppg.FunctionInvariant(
                "mbf_heatmap." + self.plot_strategy.name + "plot_func",
                self.plot_strategy.__class__.plot,
            ),
            # NOTE(review): missing "." after "mbf_heatmap", inconsistent with
            # the invariant above; renaming now would invalidate existing
            # invariants, so left as-is.
            ppg.FunctionInvariant(
                "mbf_heatmap" + self.plot_strategy.name + "render_func",
                self.plot_strategy.__class__.render,
            ),
            ppg.ParameterInvariant(
                self.output_filename, freeze((self.names, self.plot_options))
            ),
        ]
        return ppg.FileGeneratingJob(self.output_filename, plot).depends_on(deps)
    else:
        # No pipegraph: plot right away.
        plot()
        return self.output_filename
def calc_order(self):
    """Job: compute and cache the heatmap row order (stored as self.order_)."""

    def calc():
        return self.do_calc_order()

    of = self.cache_dir / "order"
    # Order strategies may return (deps, params) or
    # (deps, params, secondary_func) — normalize to three values.
    deps = self.order_strategy.get_dependencies(
        self.heatmap.gr_to_draw, self.heatmap.lanes_to_draw
    )
    if len(deps) == 2:
        order_deps, order_params = deps
        order_func = None
    else:
        order_deps, order_params, order_func = deps
    return ppg.CachedAttributeLoadingJob(of, self, "order_", calc).depends_on(
        [
            self.heatmap.calc_raw_data(),
            self.calc_norm_data(),
            ppg.ParameterInvariant(of, (self.order_strategy.name,) + order_params),
            # order_func may be None — the invariant is then a constant.
            ppg.FunctionInvariant(of.name + "_secondary_func", order_func),
            ppg.FunctionInvariant(
                "genomics.regions.heatmap."
                + self.order_strategy.name
                + "calc_func",
                self.order_strategy.__class__.calc,
            ),
        ]
        + order_deps
    )
def __call__(self):
    """Wire up the heatmap plot job: normalization, ordering, invariants, render.

    NOTE(review): the job is not returned; presumably registration with the
    global pipegraph on construction is sufficient — confirm against callers.
    """
    norm_job = self.calc_norm_data()
    order_job = self.calc_order()
    # Resolve display names up front so they can enter the ParameterInvariant.
    names_in_order = [
        self.handle_name(self.names, x, ii)
        for (ii, x) in enumerate(self.heatmap.lanes_to_draw)
    ]

    def plot():
        p = self.do_plot()
        self.plot_strategy.render(self.output_filename, p)

    plot_job = ppg.FileGeneratingJob(self.output_filename, plot)
    # Code changes are tracked via the explicit FunctionInvariants below.
    plot_job.ignore_code_changes()
    plot_job.depends_on(norm_job)
    plot_job.depends_on(order_job)
    plot_job.depends_on(
        ppg.FunctionInvariant(
            "genomics.regions._HeatmapPlot.do_plot", _HeatmapPlot.do_plot
        )
    )
    plot_job.depends_on(
        ppg.FunctionInvariant(
            "genomics.regions.heatmap." + self.plot_strategy.name + "plot_func",
            self.plot_strategy.__class__.plot,
        )
    )
    plot_job.depends_on(
        ppg.FunctionInvariant(
            "genomics.regions.heatmap." + self.plot_strategy.name + "render_func",
            self.plot_strategy.__class__.render,
        )
    )
    plot_job.depends_on(self.heatmap.gr_to_draw.load())
    plot_job.depends_on(
        ppg.ParameterInvariant(
            self.output_filename,
            self.plot_strategy.get_parameters(
                self.plot_options, self.heatmap.lanes_to_draw
            )
            + (names_in_order,),
        )
    )
    plot_job.depends_on(
        self.plot_strategy.get_dependencies(self, self.plot_options)
    )
    # If names is a callable, its code also influences the output.
    if hasattr(self.names, "__call__"):
        plot_job.depends_on(
            ppg.FunctionInvariant(self.output_filename + "_names", self.names)
        )
def plot(self, output_filename, plot_func, calc_func=None, annotators=None):
    """Create a file-generating job that renders a plot of this dataframe.

    plot_func receives the (optionally calc_func-transformed) DataFrame and
    must return an object with a .save(filename) method (dppd wrappers with
    a .pd attribute are unwrapped first).
    """
    output_filename = self.pathify(output_filename)

    def _render(output_filename=output_filename):
        # Optionally transform the frame before handing it to plot_func.
        frame = self.df if calc_func is None else calc_func(self.df)
        figure = plot_func(frame)
        if hasattr(figure, "pd"):  # unwrap dppd-style wrappers
            figure = figure.pd
        figure.save(output_filename, verbose=False)

    if not self.load_strategy.build_deps:
        deps = []
        # No dependency tracking: apply the annotators directly.
        if annotators is not None:
            for anno in annotators:
                self += anno
    else:
        invariant_name = output_filename.with_name(
            output_filename.name + "_plot_func"
        )
        deps = [ppg.FunctionInvariant(invariant_name, plot_func)]
        if annotators is None:
            deps.append(self.annotate())
        else:
            deps.extend(self.add_annotator(x) for x in annotators)
    return self.load_strategy.generate_file(output_filename, _render, deps)
def normed_ddf(self, input_ddf):
    """Return a DelayedDataFrame holding our columns after normalization."""

    def load():
        wanted = [ac[1] for ac in self.columns]
        raw = input_ddf.df[wanted]
        return self.normalization_strategy.calc(raw, wanted)

    output_name = input_ddf.name + "_heatmap_" + self.normalization_strategy.name
    if not ppg.inside_ppg():
        deps = []
    else:
        # Annotator jobs for every column that carries an annotator.
        deps = [
            self.ddf.add_annotator(ac[0])
            for ac in self.columns
            if ac[0] is not None
        ]
        deps.append(self.normalization_strategy.deps())
        deps.append(input_ddf.load())
        deps.append(
            ppg.FunctionInvariant(
                output_name + "_calc", self.normalization_strategy.calc
            )
        )
    return DelayedDataFrame(output_name, load, deps, input_ddf.result_dir)
def get_dependencies(self, lane):
    """Jobs that must run before calc() may be invoked for this lane."""
    return [
        lane.load(),
        self.background[lane.name].load(),
        ppg.FunctionInvariant(
            "genomics.regions.heatmaps." + self.name, self.__class__.calc
        ),
    ]
def inject_auto_invariants(self):
    """Depend on the callback's code unless code-change tracking is disabled.

    Idiom fix: guard clause replaces the former `if ...: ... else: pass`.
    Behavior is unchanged.
    """
    if self.do_ignore_code_changes:
        return
    self.depends_on(
        ppg.FunctionInvariant(self.job_id + "_func", self.inner_callback)
    )
def calc_regions(self):
    """Job: compute and cache the regions to draw (stored as self.regions_).

    The cache key combines the GR name, the region strategy and the set of
    lane names, so heatmaps differing only in lanes get distinct caches.
    """

    def calc():
        return self.do_calc_regions()

    # Bug fix: the lane-name set was previously hashed in raw set iteration
    # order, which varies between interpreter runs (hash randomization),
    # so the md5 cache key was not stable. sorted() makes it deterministic.
    key = hashlib.md5(
        ",".join(
            [self.gr_to_draw.name, self.region_strategy.name]
            + sorted(set(x.name for x in self.lanes_to_draw))
        ).encode()
    ).hexdigest()
    # Technically, we could share the regions job between heatmaps with the
    # same regions but different lanes, but we're using a
    # CachedAttributeLoadingJob and that would... complicate things quite a bit.
    of = self.cache_dir / "regions" / key
    of.parent.mkdir(exist_ok=True, parents=True)
    return ppg.CachedAttributeLoadingJob(of, self, "regions_", calc).depends_on(
        [
            ppg.ParameterInvariant(
                of, (self.region_strategy.name, self.gr_to_draw.name)
            ),
            ppg.FunctionInvariant(
                "genomics.regions.heatmap."
                + self.region_strategy.name
                + "calc_func",
                self.region_strategy.__class__.calc,
            ),
        ]
        + self.region_strategy.get_dependencies(self.gr_to_draw)
    )
def _anno_cache_and_calc(self, anno):
    """Job: calculate an annotator's columns (cached) and merge them into ddf.df."""

    def calc():
        # Annotators may implement calc_ddf (receives the DelayedDataFrame)
        # or plain calc (receives the DataFrame).
        if not isinstance(anno.columns, list):
            raise ValueError("Columns was not a list")
        if hasattr(anno, "calc_ddf"):
            df = anno.calc_ddf(self.ddf)
        else:
            df = anno.calc(self.ddf.df)
        # A single-column annotator may return a Series; wrap it.
        if isinstance(df, pd.Series) and len(anno.columns) == 1:
            df = pd.DataFrame({anno.columns[0]: df})
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                "result was no dataframe (or series and len(anno.columns) == 1)"
            )
        return df

    def load(df):
        # Verify the annotator delivered exactly its declared columns
        # and none of them collide with existing ones.
        s_should = set(anno.columns)
        if not len(s_should):
            raise ValueError("anno.columns was empty")
        s_actual = set(df.columns)
        if s_should != s_actual:
            raise ValueError(
                "Annotator declared different columns from those actualy calculated: %s"
                % (s_should.symmetric_difference(s_actual))
            )
        if set(df.columns).intersection(self.ddf.df.columns):
            raise ValueError(
                "Annotator created columns that were already present.",
                self.ddf.name,
                anno.get_cache_name(),
                set(df.columns).intersection(self.ddf.df.columns),
            )
        self.ddf.df = _combine_annotator_df_and_old_df(df, self.ddf.df)

    (self.ddf.cache_dir / anno.__class__.__name__).mkdir(exist_ok=True)
    job = ppg.CachedDataLoadingJob(
        self.ddf.cache_dir / anno.__class__.__name__ / anno.get_cache_name(),
        calc,
        load,
    )
    ppg.Job.depends_on(
        job, self.load()
    )  # both the load and the calc need our ddf.df
    # NOTE(review): self.load() is added again just below — presumably a
    # harmless duplicate; confirm whether one of the two can be dropped.
    job.depends_on(
        self.load(),
        ppg.FunctionInvariant(
            self.ddf.cache_dir / (anno.get_cache_name() + "_calc_func"),
            anno.calc if hasattr(anno, "calc") else anno.calc_ddf,
        ),
    )
    # Annotators this one depends on must have run first.
    for d in anno.dep_annos():
        if d is not None:
            job.depends_on(self.ddf.anno_jobs[d.get_cache_name()])
    job.depends_on(anno.deps(self.ddf))
    job.lfg.cores_needed = getattr(anno, "cores_needed", 1)
    return job
def get_convert_func(self, key, keep_name=False, filter_to_these_chromosomes=None):
    """Note that filter_to_these_chromosomes is after the replacements have kicked in"""
    # NOTE(review): keep_name is accepted but never used in this function.
    chain_file = self.data_path / (key + ".over.chain")
    if not chain_file.exists():  # pragma: no cover
        raise ValueError("invalid liftover key, file not found: %s" % chain_file)
    if filter_to_these_chromosomes:
        filter_to_these_chromosomes = set(filter_to_these_chromosomes)

    def do_convert(df):
        # Results are re-keyed by the original index, hence uniqueness.
        if df.index.duplicated().any():  # pragma: no cover
            raise ValueError("liftover only works with unique indices")
        df.index = [str(x) for x in df.index]
        # Chain files use the UCSC 'chr...' naming convention.
        input_tuples = [
            ("chr" + row["chr"], row["start"], row["stop"], idx)
            for idx, row in df.iterrows()
        ]
        output_tuples = self.do_liftover(input_tuples, chain_file)
        output_lists = list(zip(*output_tuples))
        res = pd.DataFrame(
            {
                "chr": output_lists[0],
                "start": output_lists[1],
                "stop": output_lists[2],
                "parent": [x.decode("utf-8") for x in output_lists[3]],
            }
        ).set_index("parent")
        new_chr = []
        for x in res["chr"]:
            x = x[3:]  # strip the 'chr' prefix again
            # these are untested as of 2019-03-27
            if x == "m":  # pragma: no cover
                x = "MT"
            elif (
                key in self.replacements and x in self.replacements[key]
            ):  # pragma: no cover
                x = self.replacements[key][x]
            new_chr.append(x)
        res["chr"] = new_chr
        # Carry over any additional input columns not produced by liftover.
        for col in df.columns:
            if col not in res.columns:
                res = res.assign(**{col: df[col]})
        if filter_to_these_chromosomes:
            res = res[res["chr"].isin(filter_to_these_chromosomes)]
        return res

    if ppg.inside_ppg():
        # Attach dependencies for callers that build jobs from this converter.
        do_convert.dependencies = [
            ppg.FileTimeInvariant(chain_file),
            ppg.FunctionInvariant(
                "genomics.regions.convert.LiftOver.do_liftover",
                LiftOver.do_liftover,
            ),
        ]
    return do_convert
def get_dependencies(self) -> List[ppg.Job]:
    """Return the jobs that must run before the actual mutation analysis.

    Overrides the superclass method.

    Returns
    -------
    typing.List[ppg.Job]
        List of pypipegraph.Job instances.
    """
    prefix = self.instance_name
    return [
        ppg.FunctionInvariant(prefix + "_pre_process", self.preprocess),
        ppg.FunctionInvariant(prefix + "_run_modifier", self.run_modifier),
    ]
def get_dependencies(self, gr, lanes):
    """Return (jobs, parameters) this ordering strategy depends on."""
    jobs = [
        gr.load(),
        ppg.FunctionInvariant(
            "genomics.regions.heatmap._OrderKMeans.do_cluster",
            self.__class__.do_cluster,
        ),
    ]
    params = (self.lane_to_sort_by,)
    return (jobs, params)
def __init__(self, objs, top_level_genes=None):
    """Collect and validate everything needed for an SCB submission.

    Extracts genomes, lanes, genomic regions, genes and TPM annotators from
    `objs`; raises if any object went unused or any extraction recorded errors.
    """
    self.output_path = Path("web/scb")
    self.output_path.mkdir(exist_ok=True, parents=True)
    self.objs = objs
    self.used = set()          # objects consumed by the extract_* passes
    self.genomes = set()
    self.meta_data = {}
    self.deps = []             # ppg jobs/invariants accumulated for submission
    self.errors = []           # collected problems, raised in bulk below
    self.vids = []
    self.master_gene_list_by_genome = {}
    if top_level_genes:
        self.genes = top_level_genes
    else:
        self.genes = {}
    self.genes_to_dump = {}
    # Each pass moves matching objects from self.objs into self.used.
    self.extract_genomes()
    self.extract_lanes()
    self.extract_genomic_regions()
    self.extract_genes()
    self.extract_tpm_annos()
    if len(self.used) != len(self.objs):
        missing = set(self.objs) - self.used
        raise ValueError("unused objects", missing)
    if self.errors:
        raise ValueError(self.errors)
    self.deps.append(
        ppg.ParameterInvariant("SCBSubmission_vids", str(self.vids)))
    self.deps.append(
        ppg.FunctionInvariant("SCBSubmission.extract_genomes",
                              self.__class__.extract_genomes))
    self.deps.append(
        ppg.FunctionInvariant("SCBSubmission.extract_lanes",
                              self.__class__.extract_lanes))
    self.deps.append(
        ppg.FunctionInvariant(
            "SCBSubmission.extract_genomic_regions",
            self.__class__.extract_genomic_regions,
        ))
def load(self):
    """Build the cached loading job that fills ddf.df via the loading function."""

    def store(df):
        self.ddf.df = df
        self.ddf.non_annotator_columns = self.ddf.df.columns

    job = ppg.CachedDataLoadingJob(
        self.ddf.cache_dir / "calc", self.loading_function, store
    )
    invariant = ppg.FunctionInvariant(
        self.ddf.__class__.__name__ + "_" + self.ddf.name + "_load",
        self.loading_function,
    )
    return job.depends_on(self.deps).depends_on(invariant)
def call(self):
    """
    Creates the vcf generating job.

    Creates a pypipegraph Job that does the variant calling and takes care
    of preprocessing and dependencies.

    Returns
    -------
    pypipegraph.FileGeneratingJob
        The job that does the variant calling.
    """
    run_callable = self.caller.run()

    def run():
        run_callable(self.input_samples, self.output_file)

    job = ppg.FileGeneratingJob(self.output_file, run, empty_ok=False)
    # job can depend on preprocessor dependencies, caller dependencies and
    # the preprocessor job
    job.depends_on(self.caller.get_dependencies())
    lanes_loaded = [
        input_sample.load()
        for sample_list in self.input_samples
        for input_sample in sample_list
    ]
    # If a snp caller needs some preparation, this is the place to do it.
    if self.caller.preprocessor is not None:
        job.depends_on(self.caller.preprocessor.get_dependencies())
        job.depends_on(self.caller.preprocessor.prerequisite_jobs())
        preprocessor_output = self.caller.preprocessor.get_preprocessed_output(
            self.input_samples)
        if len(preprocessor_output) > 0:
            preprocessor_job = ppg.MultiFileGeneratingJob(
                preprocessor_output,
                self.caller.preprocessor.preprocess(self.input_samples),
            )
            preprocessor_job.depends_on(lanes_loaded)
            preprocessor_job.depends_on(
                self.caller.preprocessor.get_dependencies()).depends_on(
                    self.caller.preprocessor.prerequisite_jobs())
            job.depends_on(preprocessor_job)
    job.depends_on(lanes_loaded)
    job.depends_on(
        ppg.FunctionInvariant(
            f"{self.caller.__class__.__name__}.run",
            self.caller.__class__.run,
        ))
    # NOTE(review): this loop re-adds the same sample load jobs already in
    # lanes_loaded above — presumably a harmless duplicate; confirm.
    for sample_list in self.input_samples:
        for input_sample in sample_list:
            job.depends_on(input_sample.load())
    job.cores_needed = self.caller.get_cores_needed()
    return job
def __init__(
    self,
    name,
    scaler=None,
    imputer=None,
    missing_value=np.NaN,
    cluster_columns=False,
    eps=0.5,
    min_samples=5,
    metric="euclidean",
    algorithm="auto",
    leaf_size=30,
    p=None,
    n_jobs=1,
    dependencies=None,
):
    """
    This is a wrapper for DBSCAN
    @param eps maximum neighborhood distance
    @param min_samples = minimum number of neighbors for a point to be a valid core point
    @param metric distance metric, allowed is string (metrics.pairwise.calculate_distance) or callable
    @param algorithm nearest neighbor algorithm, allowed is 'auto', 'ball_tree', 'kd_tree' or 'brute'
    @param leaf_size leaf_size passed to ball_tree or kd_tree
    @param p power for minkowski metric to calculate distances
    """
    self.eps = eps
    self.name = name
    self.min_samples = min_samples
    self.metric = metric
    self.algorithm = algorithm
    self.leaf_size = leaf_size
    self.p = p
    self.n_jobs = n_jobs
    # Bug fix: the old signature used `dependencies=[]` and then mutated it
    # with `+=`, which both mutates the caller's list and accumulates jobs
    # across instances via the shared default. Copy instead.
    dependencies = list(dependencies) if dependencies is not None else []
    dependencies += [
        ppg.ParameterInvariant(
            self.name + "_parameters",
            # NOTE(review): `p` appears twice (and `eps` is followed by `p`) —
            # kept as-is so existing invariant hashes stay valid.
            [eps, p, min_samples, metric, algorithm, leaf_size, p],
        ),
        ppg.FunctionInvariant(self.name + "_fit", self.fit),
    ]
    ClusteringMethod.__init__(self, name)
    self.clustering = sklearn.cluster.DBSCAN(
        eps=self.eps,
        min_samples=self.min_samples,
        metric=self.metric,
        algorithm=self.algorithm,
        leaf_size=self.leaf_size,
        p=self.p,
        n_jobs=self.n_jobs,
    )
    # DBSCAN has no separate predict; expose fit_predict under that name.
    self.clustering.predict = self.clustering.fit_predict
def GenomicRegions_FromTable(
    name,
    filename,
    genome,
    on_overlap="raise",
    summit_annotator=None,
    filter_func=None,
    vid=None,
    sheet_name="FromTable",
    drop_further_columns=True,
    chr_column="chr",
    start_column="start",
    stop_column="stop",
    one_based=False,
    reader=read_pandas,
):
    """Read a table file (csv/tsv/xls) with the chr/start/stop columns (renamed?),
    optionally drop all further columns"""

    def load():
        table = reader(filename)
        # Canonicalize the coordinate columns to chr/start/stop.
        table["chr"] = table[chr_column].astype(str)
        table["start"] = table[start_column].astype(int)
        if one_based:  # pragma: no cover
            table["start"] -= 1
        table["stop"] = table[stop_column].astype(int)
        if drop_further_columns:  # pragma: no cover
            table = table[["chr", "start", "stop"]]
        if filter_func:  # pragma: no cover
            table = filter_func(table)
        return table

    deps = (
        [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_filter_func", filter_func),
        ]
        if ppg.inside_ppg()
        else []
    )
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def register_qc_volcano(self, genes, filtered=None, filter_func=None):
    """Register a QC job that renders a volcano plot (log2FC vs p).

    Points are colored by `significant` (from filter_func, or the literal
    "tbd." when no filter_func is given); the output lands next to the
    filtered genes if provided, else next to `genes`.
    """
    if filtered is None:
        output_filename = genes.result_dir / "volcano.png"
    else:
        output_filename = filtered.result_dir / "volcano.png"

    def plot(output_filename):
        df = (
            dp(genes.df)
            .mutate(
                significant=filter_func(genes.df)
                if filter_func is not None
                else "tbd."
            )
            .pd
        )
        # Counts of significant genes going down / up, for the title.
        no_sig_lower = (df["significant"] & (df[self["log2FC"]] < 0)).sum()
        no_sig_higher = (df["significant"] & (df[self["log2FC"]] > 0)).sum()
        (
            dp(df)
            .p9()
            .scale_color_many_categories(name="regulated", shift=3)
            .scale_y_continuous(
                name="p",
                trans=dp.reverse_transform("log10"),
                labels=lambda xs: ["%.2g" % x for x in xs],
            )
            .add_vline(xintercept=1, _color="blue")
            .add_vline(xintercept=-1, _color="blue")
            .add_hline(yintercept=0.05, _color="blue")
            .add_rect(  # shade 'simply' significant regions
                xmin="xmin",
                xmax="xmax",
                ymin="ymin",
                ymax="ymax",
                _fill="lightgrey",
                data=pd.DataFrame(
                    {
                        "xmin": [-np.inf, 1],
                        "xmax": [-1, np.inf],
                        "ymin": [0, 0],
                        "ymax": [0.05, 0.05],
                    }
                ),
                _alpha=0.8,
            )
            .add_scatter(self["log2FC"], self["p"], color="significant")
            .title(f"# regulated down/ up: {no_sig_lower} / {no_sig_higher}")
            # .coord_trans(x="reverse", y="reverse") #broken as of 2019-01-31
            .render(output_filename, width=8, height=6, dpi=300)
        )

    return register_qc(
        ppg.FileGeneratingJob(output_filename, plot).depends_on(
            genes.add_annotator(self),
            ppg.FunctionInvariant(
                str(output_filename) + "_filter_func", filter_func
            ),
        )
    )
def write(self, output_filename=None, mangler_function=None, float_format="%4g"):
    """Job: Store the internal DataFrame (df) in a table.

    To sort, filter, remove columns, etc before output,
    pass in a mangler_function (takes df, returns df).

    Returns a (Job, Path) tuple - job is None if outside ppg
    """
    output_filename = self.pathify(
        output_filename, self.get_table_filename().absolute())

    def write(output_filename):
        if mangler_function:
            # Caller-supplied mangler gets a copy so it can't corrupt self.df.
            df = mangler_function(self.df.copy())
        else:
            df = self.mangle_df_for_write(self.df)
        if str(output_filename).endswith(".xls"):
            try:
                df.to_excel(output_filename, index=False, float_format=float_format)
            except (ValueError):
                # Excel writer refused (e.g. too many rows) — fall back to TSV
                # under the same filename.
                df.to_csv(
                    output_filename,
                    sep="\t",
                    index=False,
                    float_format=float_format,
                )
        else:
            df.to_csv(
                output_filename,
                sep="\t",
                index=False,
                encoding="utf-8",
                float_format=float_format,
            )

    if self.load_strategy.build_deps:
        deps = [
            self.annotate(),
            ppg.FunctionInvariant(
                str(output_filename) + "_mangler", mangler_function),
            ppg.ParameterInvariant(str(output_filename), float_format),
        ]
    else:
        deps = []
    return self.load_strategy.generate_file(output_filename, write, deps)
def test_prebuild_job_raises_on_executing_if_dep_on_anything_not_prebuild(
        self, new_pipegraph):
    """Prebuild jobs must reject dependencies on non-prebuild jobs."""
    Path("prebuilt").mkdir()
    count_file = Path("count")
    count_file.write_text("0")
    mgr = PrebuildManager("prebuilt", "test_host")

    def calc_05(output_path):
        # Writes the output and bumps a counter so reruns would be detectable.
        (output_path / "A").write_text("0.5")
        c = int(count_file.read_text())
        count_file.write_text(str(c + 1))

    jobA = mgr.prebuild("partA", "0.5", [], "A", calc_05)
    # A FunctionInvariant is not a prebuild job -> contract violation.
    with pytest.raises(ppg.JobContractError):
        jobA.depends_on(ppg.FunctionInvariant("shu", lambda: 5))
def _msg_pack_job(self, property_name, filename, callback_function,
                  files_to_invariant_on):
    """FileGeneratingJob that msgpack-dumps callback_function(self) to cache_dir/lookup."""
    out_dir = self.cache_dir / "lookup"
    out_dir.mkdir(exist_ok=True)

    def dump(output_filename):
        df = callback_function(self)
        pandas_msgpack.to_msgpack(output_filename, df)

    # NOTE(review): the invariant id is a path *below* the output file —
    # unusual, but keeps it unique per (file, property); confirm intent.
    j = ppg.FileGeneratingJob(out_dir / filename, dump).depends_on(
        ppg.FunctionInvariant(out_dir / filename / property_name,
                              callback_function))
    self._prebuilds.append(j)
    for f in files_to_invariant_on:
        j.depends_on_file(f)
    return j
def job_reheader_and_rename_chromosomes(input_bam_path, output_bam_path, replacements):
    """Job: rewrite a BAM's header, renaming chromosomes per `replacements`.

    Returns a MultiFileGeneratingJob producing the renamed BAM and its
    .bai index, invariant on the input BAM and the rename function's code.
    """
    # Bug fix: the Path() conversion was assigned to a misspelled, unused
    # variable (input_path_bam), so input_bam_path stayed a plain string.
    input_bam_path = Path(input_bam_path)
    output_bam_path = Path(output_bam_path)

    def do_replace(replacements=replacements):
        reheader_and_rename_chromosomes(input_bam_path, output_bam_path, replacements)

    output_bam_path.parent.mkdir(exist_ok=True, parents=True)
    return ppg.MultiFileGeneratingJob(
        [output_bam_path, output_bam_path.with_suffix(".bam.bai")], do_replace
    ).depends_on(
        ppg.FileInvariant(input_bam_path),
        ppg.FunctionInvariant(
            "mbf_bam.reheader_and_rename_chromosomes",
            reheader_and_rename_chromosomes,
        ),
    )
def GenomicRegions_FromBigBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a BigBed file.

    @chromosome_mangler translates genome chromosomes into the bigbed's chromosomes!
    """
    from mbf_fileformats.bed import read_bigbed

    def load():
        res = read_bigbed(filename, genome.get_chromosome_lengths(),
                          chromosome_mangler)
        # Drop a strand column that carries no information.
        if (res["strand"] == 1).all():
            res = res.drop("strand", axis=1)
        if len(res) == 0:  # pragma: no cover
            # Bug fix: error message typo "Emtpty" -> "Empty".
            raise ValueError(
                "Empty BigBed file (or wrong chromosome names)- %s" % filename)
        return res

    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler),
        ]
    else:
        deps = []
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
def get_dependencies(self) -> List[Job]:
    """Return the jobs that must run before the actual variant calling.

    Returns
    -------
    List[Job]
        List of dependencies.
    """
    option_params = OptionHandler.options_as_list_str(self.options)
    return [
        ppg.ParameterInvariant(f"{self.instance_name}", option_params),
        ppg.FunctionInvariant(f"{self.instance_name}_runfunc", self.run),
    ]
def _anno_load(self, anno):
    """DataLoadingJob that copies an annotator's columns from the parent ddf."""

    def load():
        parent_cols = self.ddf.parent.df[anno.columns].reindex(self.ddf.df.index)
        self.ddf.df = pd.concat([self.ddf.df, parent_cols], axis=1)

    job = ppg.DataLoadingJob(self.ddf.cache_dir / anno.get_cache_name(), load)
    job.depends_on(
        ppg.FunctionInvariant(
            self.ddf.cache_dir / (anno.get_cache_name() + "_funcv"), anno.calc
        ),
        self.ddf.parent.anno_jobs[anno.get_cache_name()],
        self.ddf.load(),
    )
    return job
def calc_norm_data(self):
    """Job: attach self.norm_data_ — a dict of lane_name -> 2d matrix."""

    def calc():
        """Normalized data is a dictionary: lane_name: 2d matrix"""
        return self.do_calc_norm_data()

    of = self.cache_dir / "norm_data"
    job = ppg.AttributeLoadingJob(of, self, "norm_data_", calc)
    deps = [
        ppg.ParameterInvariant(of, (self.normalization_strategy.name,)),
        self.heatmap.calc_raw_data(),
        ppg.FunctionInvariant(
            "genomics.regions.heatmap."
            + self.normalization_strategy.name
            + "calc_func",
            self.normalization_strategy.__class__.calc,
        ),
    ]
    deps += self.normalization_strategy.get_dependencies(self.heatmap.lanes_to_draw)
    return job.depends_on(deps)
def regions_exons_merged(self):
    """Return positions of all exonic regions - possibly overlapping"""
    if self.load_strategy.build_deps:
        deps = [
            self.load(),
            ppg.FunctionInvariant(
                # Bug fix: the f-prefix was missing, so every instance shared
                # the literal invariant name "GenomicRegions_{self.name}_...".
                f"GenomicRegions_{self.name}_exons_merged_actual_load",
                type(self.genome).df_exons,
            ),
        ]
    else:
        deps = []
    return GenomicRegions(
        self.name + "_exons_merged",
        lambda: self.genome.df_exons,
        deps,
        self.genome,
        on_overlap="merge",
    )
def inject_auto_invariants(self):
    """Depend on the callback's code plus the (sorted) names of our objects.

    Idiom fix: guard clause replaces the former `if ...: ... else: pass`.
    Behavior is unchanged.
    """
    if self.do_ignore_code_changes:
        return
    self.depends_on(
        ppg.FunctionInvariant(self.job_id + "_func", self.inner_callback))
    names = []
    for obj in self.objects:
        if hasattr(obj, 'name'):
            names.append(obj.name)
        elif hasattr(obj, 'columns'):
            names.append(obj.columns[0])
        else:
            # NOTE(review): debug print kept to preserve behavior — consider
            # folding the type into the ValueError instead.
            print(type(obj))
            raise ValueError(dir(obj))
    # Sorted so the invariant is independent of object order.
    self.depends_on(
        ppg.ParameterInvariant(
            self.job_id,
            tuple(sorted(names)),
        ))
def _msg_pack_job(self, property_name, filename, callback_function,
                  files_to_invariant_on):
    """Msgpack-dump callback_function(self) into cache_dir/lookup.

    Inside a pipegraph this builds a FileGeneratingJob (returned); outside
    it writes the file directly if missing and returns None.
    """
    out_dir = self.cache_dir / "lookup"
    out_dir.mkdir(exist_ok=True)
    if not ppg.util.inside_ppg():
        # Bug fix: this used to test Path(filename).exists() — a path
        # relative to the cwd — while writing to out_dir / filename, so the
        # existence check never matched the actual output location.
        if not (out_dir / filename).exists():  # pragma: no branch
            df = callback_function(self)
            pandas_msgpack.to_msgpack(out_dir / filename, df)
        return None
    else:
        def dump(output_filename):
            df = callback_function(self)
            pandas_msgpack.to_msgpack(output_filename, df)

        j = ppg.FileGeneratingJob(out_dir / filename, dump).depends_on(
            ppg.FunctionInvariant(out_dir / filename / property_name,
                                  callback_function))
        for f in files_to_invariant_on:
            j.depends_on_file(f)
        self._prebuilds.append(j)
        return j
def _anno_cache_and_calc(self, anno):
    """Job: calculate an annotator's columns (cached) and concat them onto ddf.df."""

    def calc():
        # Annotators may implement calc_ddf (receives the DelayedDataFrame)
        # or plain calc (receives the DataFrame).
        if not isinstance(anno.columns, list):
            raise ValueError("Columns was not a list")
        if hasattr(anno, "calc_ddf"):
            df = anno.calc_ddf(self.ddf)
        else:
            df = anno.calc(self.ddf.df)
        # A single-column annotator may return a Series; wrap it.
        if isinstance(df, pd.Series) and len(anno.columns) == 1:
            df = pd.DataFrame({anno.columns[0]: df})
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                "result was no dataframe (or series and len(anno.columns) == 1)"
            )
        return df

    def load(df):
        # Verify the annotator delivered exactly the declared columns
        # and none of them collide with existing ones.
        s_should = set(anno.columns)
        if not len(s_should):
            raise ValueError("anno.columns was empty")
        s_actual = set(df.columns)
        if s_should != s_actual:
            raise ValueError(
                "Annotator declared different columns from those actualy calculated: %s"
                % (s_should.symmetric_difference(s_actual)))
        if set(df.columns).intersection(self.ddf.df.columns):
            raise ValueError(
                "Annotator created columns that were already present.",
                self.ddf.name,
                anno.get_cache_name(),
                set(df.columns).intersection(self.ddf.df.columns),
            )
        # A fresh RangeIndex is taken to mean "ordered like ddf.df";
        # anything else must align by index.
        if isinstance(df.index, pd.RangeIndex):
            if len(df) == len(
                    self.ddf.df):  # assume it's simply ordered by the df
                df.index = self.ddf.df.index
            else:
                raise ValueError(
                    "Length and index mismatch between DataFrame and Annotator result - "
                    "Annotator must return either a DF with a compatible index "
                    "or at least one with the same length (and a RangeIndex)"
                )
        self.ddf.df = pd.concat([self.ddf.df, df], axis=1)

    (self.ddf.cache_dir / anno.__class__.__name__).mkdir(exist_ok=True)
    job = ppg.CachedDataLoadingJob(
        self.ddf.cache_dir / anno.__class__.__name__ / anno.get_cache_name(),
        calc,
        load,
    )
    ppg.Job.depends_on(
        job, self.load())  # both the load and the calc need our ddf.df
    # NOTE(review): self.load() is added again just below — presumably a
    # harmless duplicate; confirm whether one of the two can be dropped.
    job.depends_on(
        self.load(),
        ppg.FunctionInvariant(
            self.ddf.cache_dir / (anno.get_cache_name() + "_calc_func"),
            anno.calc if hasattr(anno, "calc") else anno.calc_ddf,
        ),
    )
    # Annotators this one depends on must have run first.
    for d in anno.dep_annos():
        if d is not None:
            job.depends_on(self.ddf.anno_jobs[d.get_cache_name()])
    job.depends_on(anno.deps(self.ddf))
    job.lfg.cores_needed = getattr(anno, "cores_needed", 1)
    return job