Example 1
    def plot(self):
        normed = self.normed_ddf(self.ddf)
        ordered = self.ordered_ddf(normed)
        names = self.handle_names()

        def plot():
            p = self.plot_strategy.plot(ordered.df, names, self.plot_options)
            self.plot_strategy.render(str(self.output_filename), p)

        if ppg.inside_ppg():
            ppg.util.global_pipegraph.quiet = False
            deps = [
                ordered.load(),
                ppg.FunctionInvariant(
                    "mbf_heatmap." + self.plot_strategy.name + "plot_func",
                    self.plot_strategy.__class__.plot,
                ),
                ppg.FunctionInvariant(
                    "mbf_heatmap" + self.plot_strategy.name + "render_func",
                    self.plot_strategy.__class__.render,
                ),
                ppg.ParameterInvariant(self.output_filename,
                                       freeze(
                                           (self.names, self.plot_options))),
            ]
            return ppg.FileGeneratingJob(self.output_filename,
                                         plot).depends_on(deps)
        else:
            plot()
            return self.output_filename
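Example 1 shows a pattern that recurs throughout this page: when a pipegraph is active, return a FileGeneratingJob guarded by invariant dependencies; otherwise just do the work and return the filename. A minimal, hypothetical sketch of that split (plot_or_file and make_figure are illustrative names, not part of any library):

    import pypipegraph as ppg

    def plot_or_file(output_filename, make_figure):
        def do_plot():
            fig = make_figure()
            fig.savefig(output_filename)  # assuming a matplotlib-like figure

        if ppg.inside_ppg():
            # inside a pipegraph: defer the work, and re-run it whenever
            # make_figure's code changes
            return ppg.FileGeneratingJob(output_filename, do_plot).depends_on(
                ppg.FunctionInvariant(output_filename + "_plot_func", make_figure)
            )
        else:
            do_plot()
            return output_filename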
Example 2
    def calc_order(self):
        def calc():
            return self.do_calc_order()

        of = self.cache_dir / "order"
        deps = self.order_strategy.get_dependencies(
            self.heatmap.gr_to_draw, self.heatmap.lanes_to_draw
        )
        if len(deps) == 2:
            order_deps, order_params = deps
            order_func = None
        else:
            order_deps, order_params, order_func = deps

        return ppg.CachedAttributeLoadingJob(of, self, "order_", calc).depends_on(
            [
                self.heatmap.calc_raw_data(),
                self.calc_norm_data(),
                ppg.ParameterInvariant(of, (self.order_strategy.name,) + order_params),
                ppg.FunctionInvariant(
                    of.name + "_secondary_func", order_func
                ),
                ppg.FunctionInvariant(
                    "genomics.regions.heatmap."
                    + self.order_strategy.name
                    + "calc_func",
                    self.order_strategy.__class__.calc,
                ),
            ]
            + order_deps
        )
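CachedAttributeLoadingJob, as used above, takes (cache_filename, object, attribute_name, calc_function): calc runs once, its result is cached on disk, and later runs load the cache and store it as object.attribute instead of recomputing. A hedged sketch of the calling convention (Holder and slow_order are made-up names):

    import pypipegraph as ppg

    class Holder:
        pass

    holder = Holder()

    def slow_order():
        # stand-in for an expensive ordering/clustering computation
        return list(range(1000))

    job = ppg.CachedAttributeLoadingJob("cache/order", holder, "order_", slow_order)
    # re-run the calculation when its code changes, not just its inputs
    job.depends_on(ppg.FunctionInvariant("cache/order_calc", slow_order))
    # after the pipegraph has run, holder.order_ holds the cached result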
Example 3
    def __call__(self):
        norm_job = self.calc_norm_data()
        order_job = self.calc_order()
        names_in_order = [
            self.handle_name(self.names, x, ii)
            for (ii, x) in enumerate(self.heatmap.lanes_to_draw)
        ]

        def plot():
            p = self.do_plot()
            self.plot_strategy.render(self.output_filename, p)

        plot_job = ppg.FileGeneratingJob(self.output_filename, plot)
        plot_job.ignore_code_changes()
        plot_job.depends_on(norm_job)
        plot_job.depends_on(order_job)
        plot_job.depends_on(
            ppg.FunctionInvariant(
                "genomics.regions._HeatmapPlot.do_plot", _HeatmapPlot.do_plot
            )
        )
        plot_job.depends_on(
            ppg.FunctionInvariant(
                "genomics.regions.heatmap." + self.plot_strategy.name + "plot_func",
                self.plot_strategy.__class__.plot,
            )
        )
        plot_job.depends_on(
            ppg.FunctionInvariant(
                "genomics.regions.heatmap." + self.plot_strategy.name + "render_func",
                self.plot_strategy.__class__.render,
            )
        )
        plot_job.depends_on(self.heatmap.gr_to_draw.load())
        plot_job.depends_on(
            ppg.ParameterInvariant(
                self.output_filename,
                self.plot_strategy.get_parameters(
                    self.plot_options, self.heatmap.lanes_to_draw
                )
                + (names_in_order,),
            )
        )
        plot_job.depends_on(
            self.plot_strategy.get_dependencies(self, self.plot_options)
        )
        if callable(self.names):
            plot_job.depends_on(
                ppg.FunctionInvariant(self.output_filename + "_names", self.names)
            )
Example 4
    def plot(self,
             output_filename,
             plot_func,
             calc_func=None,
             annotators=None):
        output_filename = self.pathify(output_filename)

        def do_plot(output_filename=output_filename):
            df = self.df
            if calc_func is not None:
                df = calc_func(df)
            p = plot_func(df)
            if hasattr(p, "pd"):
                p = p.pd
            p.save(output_filename, verbose=False)

        if self.load_strategy.build_deps:
            deps = [
                ppg.FunctionInvariant(
                    output_filename.with_name(output_filename.name +
                                              "_plot_func"),
                    plot_func,
                )
            ]
            if annotators is None:
                deps.append(self.annotate())
            else:
                deps.extend([self.add_annotator(x) for x in annotators])
        else:
            deps = []
            if annotators is not None:
                for anno in annotators:
                    self += anno
        return self.load_strategy.generate_file(output_filename, do_plot, deps)
Example 5
    def normed_ddf(self, input_ddf):
        def load():
            df = input_ddf.df[[ac[1] for ac in self.columns]]
            normed_df = self.normalization_strategy.calc(
                df, [ac[1] for ac in self.columns])
            return normed_df

        output_name = input_ddf.name + "_heatmap_" + self.normalization_strategy.name
        if ppg.inside_ppg():
            deps = [
                self.ddf.add_annotator(ac[0])
                for ac in self.columns if ac[0] is not None
            ] + [
                self.normalization_strategy.deps(),
                input_ddf.load(),
                ppg.FunctionInvariant(output_name + '_calc',
                                      self.normalization_strategy.calc)
            ]
        else:
            deps = []

        return DelayedDataFrame(
            output_name,
            load,
            deps,
            input_ddf.result_dir,
        )
Example 6
 def get_dependencies(self, lane):
     deps = [lane.load()]
     deps.append(self.background[lane.name].load())
     deps.append(
         ppg.FunctionInvariant("genomics.regions.heatmaps." + self.name,
                               self.__class__.calc))
     return deps
Example 7
 def inject_auto_invariants(self):
     if not self.do_ignore_code_changes:
         self.depends_on(
             ppg.FunctionInvariant(self.job_id + "_func", self.inner_callback)
         )
     else:
         pass
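inject_auto_invariants is the hook through which a job attaches a FunctionInvariant on its own callback; the invariant tracks the function's code, so editing the function between runs invalidates every dependent job. A minimal sketch of the effect, assuming pypipegraph's v1-style new_pipegraph()/run_pipegraph() entry points:

    import pypipegraph as ppg

    ppg.new_pipegraph()

    def build_report():
        with open("report.txt", "w") as op:
            op.write("version 1")  # edit this body and the job re-runs

    job = ppg.FileGeneratingJob("report.txt", build_report)
    # without this invariant (e.g. after ignore_code_changes()), editing
    # build_report would not trigger a re-run once report.txt exists
    job.depends_on(ppg.FunctionInvariant("report.txt_func", build_report))

    ppg.run_pipegraph()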
Example 8
    def calc_regions(self):
        def calc():
            return self.do_calc_regions()

        key = hashlib.md5(
            ",".join(
                [self.gr_to_draw.name, self.region_strategy.name]
                + list(set([x.name for x in self.lanes_to_draw]))
            ).encode()
        ).hexdigest()
        # Technically, we could share the regions job between heatmaps with the
        # same regions but different lanes, but we're using a
        # CachedAttributeLoadingJob and that would complicate things quite a bit.
        of = self.cache_dir / "regions" / key
        of.parent.mkdir(exist_ok=True, parents=True)
        return ppg.CachedAttributeLoadingJob(of, self, "regions_", calc).depends_on(
            [
                ppg.ParameterInvariant(
                    of, (self.region_strategy.name, self.gr_to_draw.name)
                ),
                ppg.FunctionInvariant(
                    "genomics.regions.heatmap."
                    + self.region_strategy.name
                    + "calc_func",
                    self.region_strategy.__class__.calc,
                ),
            ]
            + self.region_strategy.get_dependencies(self.gr_to_draw)
        )
Example 9
    def _anno_cache_and_calc(self, anno):
        def calc():
            if not isinstance(anno.columns, list):
                raise ValueError("Columns was not a list")

            if hasattr(anno, "calc_ddf"):
                df = anno.calc_ddf(self.ddf)
            else:
                df = anno.calc(self.ddf.df)
            if isinstance(df, pd.Series) and len(anno.columns) == 1:
                df = pd.DataFrame({anno.columns[0]: df})
            if not isinstance(df, pd.DataFrame):
                raise ValueError(
                    "result was no dataframe (or series and len(anno.columns) == 1)"
                )
            return df

        def load(df):
            s_should = set(anno.columns)
            if not len(s_should):
                raise ValueError("anno.columns was empty")
            s_actual = set(df.columns)
            if s_should != s_actual:
                raise ValueError(
                    "Annotator declared different columns from those actualy calculated: %s"
                    % (s_should.symmetric_difference(s_actual))
                )
            if set(df.columns).intersection(self.ddf.df.columns):
                raise ValueError(
                    "Annotator created columns that were already present.",
                    self.ddf.name,
                    anno.get_cache_name(),
                    set(df.columns).intersection(self.ddf.df.columns),
                )
            self.ddf.df = _combine_annotator_df_and_old_df(df, self.ddf.df)

        (self.ddf.cache_dir / anno.__class__.__name__).mkdir(exist_ok=True)
        job = ppg.CachedDataLoadingJob(
            self.ddf.cache_dir / anno.__class__.__name__ / anno.get_cache_name(),
            calc,
            load,
        )
        ppg.Job.depends_on(
            job, self.load()
        )  # both the load and the calc need our ddf.df
        job.depends_on(
            self.load(),
            ppg.FunctionInvariant(
                self.ddf.cache_dir / (anno.get_cache_name() + "_calc_func"),
                anno.calc if hasattr(anno, "calc") else anno.calc_ddf,
            ),
        )
        for d in anno.dep_annos():
            if d is not None:
                job.depends_on(self.ddf.anno_jobs[d.get_cache_name()])
        job.depends_on(anno.deps(self.ddf))
        job.lfg.cores_needed = getattr(anno, "cores_needed", 1)
        return job
Example 10
    def get_convert_func(self,
                         key,
                         keep_name=False,
                         filter_to_these_chromosomes=None):
        """Note that filter_to_these_chromosomes is after the replacements have kicked in"""
        chain_file = self.data_path / (key + ".over.chain")
        if not chain_file.exists():  # pragma: no cover
            raise ValueError("invalid liftover key, file not found: %s" %
                             chain_file)
        if filter_to_these_chromosomes:
            filter_to_these_chromosomes = set(filter_to_these_chromosomes)

        def do_convert(df):
            if df.index.duplicated().any():  # pragma: no cover
                raise ValueError("liftover only works with unique indices")
            df.index = [str(x) for x in df.index]
            input_tuples = [("chr" + row["chr"], row["start"], row["stop"],
                             idx) for idx, row in df.iterrows()]

            output_tuples = self.do_liftover(input_tuples, chain_file)
            output_lists = list(zip(*output_tuples))
            res = pd.DataFrame({
                "chr":
                output_lists[0],
                "start":
                output_lists[1],
                "stop":
                output_lists[2],
                "parent": [x.decode("utf-8") for x in output_lists[3]],
            }).set_index("parent")
            new_chr = []
            for x in res["chr"]:
                x = x[3:]
                # these are untested as of 2019-03-27
                if x == "m":  # pragma: no cover
                    x = "MT"
                elif (key in self.replacements
                      and x in self.replacements[key]):  # pragma: no cover
                    x = self.replacements[key][x]
                new_chr.append(x)
            res["chr"] = new_chr
            for col in df.columns:
                if col not in res.columns:
                    res = res.assign(**{col: df[col]})
            if filter_to_these_chromosomes:
                res = res[res["chr"].isin(filter_to_these_chromosomes)]
            return res

        if ppg.inside_ppg():
            do_convert.dependencies = [
                ppg.FileTimeInvariant(chain_file),
                ppg.FunctionInvariant(
                    "genomics.regions.convert.LiftOver.do_liftover",
                    LiftOver.do_liftover,
                ),
            ]
        return do_convert
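Example 10 does not return a job at all: it hangs the dependency list on the returned converter itself, leaving the caller to pick up do_convert.dependencies when wiring its own jobs. A sketch of that convention (get_upper_func is a made-up example):

    import pypipegraph as ppg

    def get_upper_func(config_filename):
        def do_upper(s):
            return s.upper()

        if ppg.inside_ppg():
            # convention from above: attach deps to the callable so a consumer
            # can write job.depends_on(func.dependencies)
            do_upper.dependencies = [
                ppg.FileTimeInvariant(config_filename),
                ppg.FunctionInvariant("do_upper", do_upper),
            ]
        return do_upper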
Example 11
    def get_dependencies(self) -> List[ppg.Job]:
        """
        Returns a list of dependencies.

        Returns a list of pypipegraph.Job instances that
        need to run before the actual mutation analysis. Overrides the 
        superclass method.

        Returns
        -------
        typing.List[ppg.Job]
            List of pypipegraph.Job instances.
        """
        return [
            ppg.FunctionInvariant(self.instance_name + "_pre_process",
                                  self.preprocess),
            ppg.FunctionInvariant(self.instance_name + "_run_modifier",
                                  self.run_modifier),
        ]
Example 12
 def get_dependencies(self, gr, lanes):
     return (
         [
             gr.load(),
             ppg.FunctionInvariant(
                 "genomics.regions.heatmap._OrderKMeans.do_cluster",
                 self.__class__.do_cluster,
             ),
         ],
         (self.lane_to_sort_by,),
     )
Example 13
    def __init__(self, objs, top_level_genes=None):
        self.output_path = Path("web/scb")
        self.output_path.mkdir(exist_ok=True, parents=True)
        self.objs = objs
        self.used = set()
        self.genomes = set()
        self.meta_data = {}
        self.deps = []
        self.errors = []
        self.vids = []
        self.master_gene_list_by_genome = {}
        if top_level_genes:
            self.genes = top_level_genes
        else:
            self.genes = {}

        self.genes_to_dump = {}
        self.extract_genomes()
        self.extract_lanes()
        self.extract_genomic_regions()
        self.extract_genes()
        self.extract_tpm_annos()
        if len(self.used) != len(self.objs):
            missing = set(self.objs) - self.used
            raise ValueError("unused objects", missing)
        if self.errors:
            raise ValueError(self.errors)
        self.deps.append(
            ppg.ParameterInvariant("SCBSubmission_vids", str(self.vids)))

        self.deps.append(
            ppg.FunctionInvariant("SCBSubmission.extract_genomes",
                                  self.__class__.extract_genomes))
        self.deps.append(
            ppg.FunctionInvariant("SCBSubmission.extract_lanes",
                                  self.__class__.extract_lanes))
        self.deps.append(
            ppg.FunctionInvariant(
                "SCBSubmission.extract_genomic_regions",
                self.__class__.extract_genomic_regions,
            ))
Example 14
    def load(self):
        def load_func(df):
            self.ddf.df = df
            self.ddf.non_annotator_columns = self.ddf.df.columns

        job = ppg.CachedDataLoadingJob(self.ddf.cache_dir / "calc",
                                       self.loading_function, load_func)
        job.depends_on(self.deps).depends_on(
            ppg.FunctionInvariant(
                self.ddf.__class__.__name__ + "_" + self.ddf.name + "_load",
                self.loading_function,
            ))
        return job
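CachedDataLoadingJob splits the work into calc, whose return value is written to the cache file and only recomputed when invalidated, and load, which receives that value on every run and stores it where it belongs. A sketch of the convention (store and make_df are illustrative):

    import pandas as pd
    import pypipegraph as ppg

    store = {}

    def make_df():
        # runs only when the cache is missing or invalidated;
        # the return value is persisted to cache/df
        return pd.DataFrame({"a": [1, 2, 3]})

    def load_df(df):
        # runs on every pipegraph run, receiving the cached DataFrame
        store["df"] = df

    job = ppg.CachedDataLoadingJob("cache/df", make_df, load_df)
    job.depends_on(ppg.FunctionInvariant("cache/df_calc", make_df))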
Example 15
    def call(self):
        """
        Creates the VCF-generating job.

        Creates a pypipegraph Job that does the variant calling and takes care
        of preprocessing and dependencies.

        Returns
        -------
        pypipegraph.FileGeneratingJob
            The job that does the variant calling.
        """
        run_callable = self.caller.run()

        def run():
            run_callable(self.input_samples, self.output_file)

        job = ppg.FileGeneratingJob(self.output_file, run, empty_ok=False)
        # job can depend on preprocessor dependencies, caller dependencies and the preprocessor job
        job.depends_on(self.caller.get_dependencies())
        lanes_loaded = [
            input_sample.load() for sample_list in self.input_samples
            for input_sample in sample_list
        ]
        # If a snp caller needs some preparation, this is the place to do it.
        if self.caller.preprocessor is not None:
            job.depends_on(self.caller.preprocessor.get_dependencies())
            job.depends_on(self.caller.preprocessor.prerequisite_jobs())
            preprocessor_output = self.caller.preprocessor.get_preprocessed_output(
                self.input_samples)
            if len(preprocessor_output) > 0:
                preprocessor_job = ppg.MultiFileGeneratingJob(
                    preprocessor_output,
                    self.caller.preprocessor.preprocess(self.input_samples),
                )
                preprocessor_job.depends_on(lanes_loaded)
                preprocessor_job.depends_on(
                    self.caller.preprocessor.get_dependencies()).depends_on(
                        self.caller.preprocessor.prerequisite_jobs())
                job.depends_on(preprocessor_job)
        job.depends_on(lanes_loaded)
        job.depends_on(
            ppg.FunctionInvariant(
                f"{self.caller.__class__.__name__}.run",
                self.caller.__class__.run,
            ))
        for sample_list in self.input_samples:
            for input_sample in sample_list:
                job.depends_on(input_sample.load())
        job.cores_needed = self.caller.get_cores_needed()
        return job
Example 16
 def __init__(
     self,
     name,
     scaler=None,
     imputer=None,
     missing_value=np.NaN,
     cluster_columns=False,
     eps=0.5,
     min_samples=5,
     metric="euclidean",
     algorithm="auto",
     leaf_size=30,
     p=None,
     n_jobs=1,
     dependencies=[],
 ):
     """
     This is a wrapper for DBSCAN
     @param eps maximum neighborhood distance
     @param min_samples = minimum number of neighbors for a point to be a valid core point
     @param metric distance metric, allowed is string (metrics.pairwise.calculate_distance) or callable
     @param algorithm nearest neighbor algorithm, allowed is 'auto', 'ball_tree', 'kd_tree' or 'brute'
     @param leaf_size leaf_size passed to ball_tree or kd_tree
     @param p power for minkowski metric to calculate distances
     """
     self.eps = eps
     self.name = name
     self.min_samples = min_samples
     self.metric = metric
     self.algorithm = algorithm
     self.leaf_size = leaf_size
     self.p = p
     self.n_jobs = n_jobs
     dependencies += [
         ppg.ParameterInvariant(
             self.name + "_parameters",
             [eps, p, min_samples, metric, algorithm, leaf_size],
         ),
         ppg.FunctionInvariant(self.name + "_fit", self.fit),
     ]
     ClusteringMethod.__init__(self, name)
     self.clustering = sklearn.cluster.DBSCAN(
         eps=self.eps,
         min_samples=self.min_samples,
         metric=self.metric,
         algorithm=self.algorithm,
         leaf_size=self.leaf_size,
         p=self.p,
         n_jobs=self.n_jobs,
     )
     self.clustering.predict = self.clustering.fit_predict
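ParameterInvariant complements FunctionInvariant: it records a parameter tuple under a name and invalidates dependent jobs when the values change between runs, which is why the constructor above registers its clustering parameters. A minimal sketch (the names are illustrative):

    import pypipegraph as ppg

    eps, min_samples = 0.5, 5

    def cluster():
        with open("clusters.txt", "w") as op:
            op.write("eps=%s, min_samples=%s" % (eps, min_samples))

    job = ppg.FileGeneratingJob("clusters.txt", cluster)
    # changing eps or min_samples between runs invalidates clusters.txt
    job.depends_on(ppg.ParameterInvariant("clusters_params", (eps, min_samples)))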
Example 17
def GenomicRegions_FromTable(
    name,
    filename,
    genome,
    on_overlap="raise",
    summit_annotator=None,
    filter_func=None,
    vid=None,
    sheet_name="FromTable",
    drop_further_columns=True,
    chr_column="chr",
    start_column="start",
    stop_column="stop",
    one_based=False,
    reader=read_pandas,
):
    """Read a table file (csv/tsv/xls) with the chr/start/stop columns (renamed?), optionally
    drop all further columns"""
    def load():

        df = reader(filename)
        df["chr"] = df[chr_column].astype(str)
        df["start"] = df[start_column].astype(int)
        if one_based:  # pragma: no cover
            df["start"] -= 1
        df["stop"] = df[stop_column].astype(int)
        if drop_further_columns:  # pragma: no cover
            df = df[["chr", "start", "stop"]]
        if filter_func:  # pragma: no cover
            df = filter_func(df)
        return df

    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_filter_func", filter_func),
        ]
    else:
        deps = []
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Example 18
    def register_qc_volcano(self, genes, filtered=None, filter_func=None):
        """perform a volcano plot
        """
        if filtered is None:
            output_filename = genes.result_dir / "volcano.png"
        else:
            output_filename = filtered.result_dir / "volcano.png"

        def plot(output_filename):
            df = (dp(genes.df).mutate(significant=filter_func(genes.df) if
                                      filter_func is not None else "tbd.").pd)

            no_sig_lower = (df["significant"] & (df[self["log2FC"]] < 0)).sum()
            no_sig_higher = (df["significant"] &
                             (df[self["log2FC"]] > 0)).sum()

            (dp(df).p9().scale_color_many_categories(
                name="regulated", shift=3).scale_y_continuous(
                    name="p",
                    trans=dp.reverse_transform("log10"),
                    labels=lambda xs: ["%.2g" % x for x in xs],
                ).add_vline(xintercept=1, _color="blue").add_vline(
                    xintercept=-1, _color="blue").add_hline(yintercept=0.05,
                                                            _color="blue").
             add_rect(  # shade 'simply' significant regions
                 xmin="xmin",
                 xmax="xmax",
                 ymin="ymin",
                 ymax="ymax",
                 _fill="lightgrey",
                 data=pd.DataFrame({
                     "xmin": [-np.inf, 1],
                     "xmax": [-1, np.inf],
                     "ymin": [0, 0],
                     "ymax": [0.05, 0.05],
                 }),
                 _alpha=0.8,
             ).add_scatter(
                 self["log2FC"], self["p"], color="significant").title(
                     f"# regulated down/ up: {no_sig_lower} / {no_sig_higher}")
             # .coord_trans(x="reverse", y="reverse")  #broken as of 2019-01-31
             .render(output_filename, width=8, height=6, dpi=300))

        return register_qc(
            ppg.FileGeneratingJob(output_filename, plot).depends_on(
                genes.add_annotator(self),
                ppg.FunctionInvariant(
                    str(output_filename) + "_filter_func", filter_func),
            ))
Example 19
    def write(self,
              output_filename=None,
              mangler_function=None,
              float_format="%4g"):
        """Job: Store the internal DataFrame (df) in a table.
        To sort, filter, remove columns, etc before output,
        pass in a mangler_function (takes df, returns df)

        Returns a (Job, Path) tuple - job is None if outside ppg
        """
        output_filename = self.pathify(output_filename,
                                       self.get_table_filename().absolute())

        def write(output_filename):
            if mangler_function:
                df = mangler_function(self.df.copy())
            else:
                df = self.mangle_df_for_write(self.df)
            if str(output_filename).endswith(".xls"):
                try:
                    df.to_excel(output_filename,
                                index=False,
                                float_format=float_format)
                except ValueError:
                    df.to_csv(
                        output_filename,
                        sep="\t",
                        index=False,
                        float_format=float_format,
                    )
            else:
                df.to_csv(
                    output_filename,
                    sep="\t",
                    index=False,
                    encoding="utf-8",
                    float_format=float_format,
                )

        if self.load_strategy.build_deps:
            deps = [
                self.annotate(),
                ppg.FunctionInvariant(
                    str(output_filename) + "_mangler", mangler_function),
                ppg.ParameterInvariant(str(output_filename), float_format),
            ]
        else:
            deps = []
        return self.load_strategy.generate_file(output_filename, write, deps)
Example 20
    def test_prebuild_job_raises_on_executing_if_dep_on_anything_not_prebuild(
            self, new_pipegraph):
        Path("prebuilt").mkdir()
        count_file = Path("count")
        count_file.write_text("0")
        mgr = PrebuildManager("prebuilt", "test_host")

        def calc_05(output_path):
            (output_path / "A").write_text("0.5")
            c = int(count_file.read_text())
            count_file.write_text(str(c + 1))

        jobA = mgr.prebuild("partA", "0.5", [], "A", calc_05)
        with pytest.raises(ppg.JobContractError):
            jobA.depends_on(ppg.FunctionInvariant("shu", lambda: 5))
Example 21
    def _msg_pack_job(self, property_name, filename, callback_function,
                      files_to_invariant_on):
        out_dir = self.cache_dir / "lookup"
        out_dir.mkdir(exist_ok=True)

        def dump(output_filename):
            df = callback_function(self)
            pandas_msgpack.to_msgpack(output_filename, df)

        j = ppg.FileGeneratingJob(out_dir / filename, dump).depends_on(
            ppg.FunctionInvariant(out_dir / filename / property_name,
                                  callback_function))
        self._prebuilds.append(j)
        for f in files_to_invariant_on:
            j.depends_on_file(f)
        return j
Example 22
def job_reheader_and_rename_chromosomes(input_bam_path, output_bam_path,
                                        replacements):
    input_bam_path = Path(input_bam_path)
    output_bam_path = Path(output_bam_path)

    def do_replace(replacements=replacements):
        reheader_and_rename_chromosomes(input_bam_path, output_bam_path,
                                        replacements)

    output_bam_path.parent.mkdir(exist_ok=True, parents=True)
    return ppg.MultiFileGeneratingJob(
        [output_bam_path,
         output_bam_path.with_suffix(".bam.bai")], do_replace).depends_on(
             ppg.FileInvariant(input_bam_path),
             ppg.FunctionInvariant("mbf_bam.reheader_and_rename_chromosomes",
                                   reheader_and_rename_chromosomes),
         )
Example 23
def GenomicRegions_FromBigBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a BigBed file.
    @chromosome_mangler translates genome chromosomes into the bigbed's chromosomes!

    """
    from mbf_fileformats.bed import read_bigbed

    def load():
        res = read_bigbed(filename, genome.get_chromosome_lengths(),
                          chromosome_mangler)
        if (res["strand"] == 1).all():
            res = res.drop("strand", axis=1)
        if len(res) == 0:  # pragma: no cover
            raise ValueError(
                "Emtpty BigBed file (or wrong chromosome names)- %s" %
                filename)
        return res

    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler),
        ]
    else:
        deps = []

    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Example 24
    def get_dependencies(self) -> List[Job]:
        """
        Returns a list of dependencies for the pypipegraph.

        Returns a list of pypipegraph.Jobs that need to run before the actual
        variant calling.

        Returns
        -------
        List[Job]
            List of dependencies.
        """
        return [
            ppg.ParameterInvariant(
                f"{self.instance_name}",
                OptionHandler.options_as_list_str(self.options)),
            ppg.FunctionInvariant(f"{self.instance_name}_runfunc", self.run),
        ]
Example 25
    def _anno_load(self, anno):
        def load():
            self.ddf.df = pd.concat(
                [
                    self.ddf.df,
                    self.ddf.parent.df[anno.columns].reindex(self.ddf.df.index),
                ],
                axis=1,
            )

        job = ppg.DataLoadingJob(self.ddf.cache_dir / anno.get_cache_name(), load)
        job.depends_on(
            ppg.FunctionInvariant(
                self.ddf.cache_dir / (anno.get_cache_name() + "_funcv"), anno.calc
            ),
            self.ddf.parent.anno_jobs[anno.get_cache_name()],
            self.ddf.load(),
        )
        return job
Example 26
    def calc_norm_data(self):
        def calc():
            """Normalized data is a dictionary: lane_name: 2d matrix"""
            return self.do_calc_norm_data()

        of = self.cache_dir / "norm_data"
        return ppg.AttributeLoadingJob(of, self, "norm_data_", calc).depends_on(
            [
                ppg.ParameterInvariant(of, (self.normalization_strategy.name,)),
                self.heatmap.calc_raw_data(),
                ppg.FunctionInvariant(
                    "genomics.regions.heatmap."
                    + self.normalization_strategy.name
                    + "calc_func",
                    self.normalization_strategy.__class__.calc,
                ),
            ]
            + self.normalization_strategy.get_dependencies(self.heatmap.lanes_to_draw)
        )
Example 27
    def regions_exons_merged(self):
        """Return positions of all exonic regions - possibly overlapping"""
        if self.load_strategy.build_deps:
            deps = [
                self.load(),
                ppg.FunctionInvariant(
                    "GenomicRegions_{self.name}_exons_merged_actual_load",
                    type(self.genome).df_exons,
                ),
            ]
        else:
            deps = []

        return GenomicRegions(
            self.name + "_exons_merged",
            lambda: self.genome.df_exons,
            deps,
            self.genome,
            on_overlap="merge",
        )
Example 28
 def inject_auto_invariants(self):
     if not self.do_ignore_code_changes:
         self.depends_on(
             ppg.FunctionInvariant(self.job_id + "_func",
                                   self.inner_callback))
         names = []
         for obj in self.objects:
             if hasattr(obj, 'name'):
                 names.append(obj.name)
             elif hasattr(obj, 'columns'):
                 names.append(obj.columns[0])
             else:
                 raise ValueError(
                     "could not derive a name for %r (no .name or .columns)"
                     % (obj,))
         self.depends_on(
             ppg.ParameterInvariant(
                 self.job_id,
                 tuple(sorted(names)),
             ))
     else:
         pass
Example 29
    def _msg_pack_job(self, property_name, filename, callback_function,
                      files_to_invariant_on):
        out_dir = self.cache_dir / "lookup"
        out_dir.mkdir(exist_ok=True)

        if not ppg.util.inside_ppg():
            if not (out_dir / filename).exists():  # pragma: no branch
                df = callback_function(self)
                pandas_msgpack.to_msgpack(out_dir / filename, df)
        else:

            def dump(output_filename):
                df = callback_function(self)
                pandas_msgpack.to_msgpack(output_filename, df)

            j = ppg.FileGeneratingJob(out_dir / filename, dump).depends_on(
                ppg.FunctionInvariant(out_dir / filename / property_name,
                                      callback_function))
            for f in files_to_invariant_on:
                j.depends_on_file(f)
            self._prebuilds.append(j)
            return j
        return
Example 30
    def _anno_cache_and_calc(self, anno):
        def calc():
            if not isinstance(anno.columns, list):
                raise ValueError("Columns was not a list")

            if hasattr(anno, "calc_ddf"):
                df = anno.calc_ddf(self.ddf)
            else:
                df = anno.calc(self.ddf.df)
            if isinstance(df, pd.Series) and len(anno.columns) == 1:
                df = pd.DataFrame({anno.columns[0]: df})
            if not isinstance(df, pd.DataFrame):
                raise ValueError(
                    "result was no dataframe (or series and len(anno.columns) == 1)"
                )
            return df

        def load(df):
            s_should = set(anno.columns)
            if not len(s_should):
                raise ValueError("anno.columns was empty")
            s_actual = set(df.columns)
            if s_should != s_actual:
                raise ValueError(
                    "Annotator declared different columns from those actualy calculated: %s"
                    % (s_should.symmetric_difference(s_actual)))
            if set(df.columns).intersection(self.ddf.df.columns):
                raise ValueError(
                    "Annotator created columns that were already present.",
                    self.ddf.name,
                    anno.get_cache_name(),
                    set(df.columns).intersection(self.ddf.df.columns),
                )
            if isinstance(df.index, pd.RangeIndex):
                if len(df) == len(
                        self.ddf.df):  # assume it's simply ordered by the df
                    df.index = self.ddf.df.index
                else:
                    raise ValueError(
                        "Length and index mismatch between DataFrame and Annotator result - "
                        "Annotator must return either a DF with a compatible index "
                        "or at least one with the same length (and a RangeIndex)"
                    )

            self.ddf.df = pd.concat([self.ddf.df, df], axis=1)

        (self.ddf.cache_dir / anno.__class__.__name__).mkdir(exist_ok=True)
        job = ppg.CachedDataLoadingJob(
            self.ddf.cache_dir / anno.__class__.__name__ /
            anno.get_cache_name(),
            calc,
            load,
        )
        ppg.Job.depends_on(
            job, self.load())  # both the load and the calc need our ddf.df
        job.depends_on(
            self.load(),
            ppg.FunctionInvariant(
                self.ddf.cache_dir / (anno.get_cache_name() + "_calc_func"),
                anno.calc if hasattr(anno, "calc") else anno.calc_ddf,
            ),
        )
        for d in anno.dep_annos():
            if d is not None:
                job.depends_on(self.ddf.anno_jobs[d.get_cache_name()])
        job.depends_on(anno.deps(self.ddf))
        job.lfg.cores_needed = getattr(anno, "cores_needed", 1)
        return job