Esempio n. 1
0
    def test_unpickle_bug_prevents_single_job_from_unpickling(self):
        """A ParameterInvariant whose parameters cannot be unpickled on a
        later run must fail that run (RuntimeError) while rerunning only
        the jobs that depend on it.

        out/B depends on the unpicklable invariant, so do_b runs again
        (out/Bs accumulates "AA"); out/A has no such dependency and stays
        untouched (out/As still "A").
        """
        def do_a():
            write("out/A", "A")
            append("out/As", "A")

        ppg.FileGeneratingJob("out/A", do_a)

        def do_b():
            write("out/B", "A")
            append("out/Bs", "A")

        job_B = ppg.FileGeneratingJob("out/B", do_b)
        cd = CantDepickle()
        # the invariant's parameter tuple contains an object that pickles
        # fine but refuses to unpickle later
        job_parameter_unpickle_problem = ppg.ParameterInvariant("C", (cd,))
        job_B.depends_on(job_parameter_unpickle_problem)
        ppg.run_pipegraph()
        # first run: every job executed exactly once
        assert read("out/A") == "A"
        assert read("out/As") == "A"
        assert read("out/B") == "A"
        assert read("out/Bs") == "A"
        print("second run")
        ppg.new_pipegraph(dump_graph=False)

        # rebuild the same graph; loading the stored invariant now fails
        ppg.FileGeneratingJob("out/A", do_a)
        job_B = ppg.FileGeneratingJob("out/B", do_b)
        job_parameter_unpickle_problem = ppg.ParameterInvariant("C", (cd,))
        job_B.depends_on(job_parameter_unpickle_problem)
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        assert read("out/A") == "A"
        assert read("out/As") == "A"
        assert read("out/B") == "A"
        assert (
            read("out/Bs") == "AA"
        )  # this one got rerun because we could not load the invariant...
Esempio n. 2
0
    def test_filegen_invalidated_jobgen_created_filegen_later_also_invalidated(
            self, new_pipegraph):
        """Invalidating a FileGeneratingJob must also invalidate jobs that a
        JobGeneratingJob creates at runtime as its dependents.

        Run 1: A and C each execute once. Run 2: the ParameterInvariant
        changes ("p" -> "p2"), so A reruns - and C, created during the run
        with a dependency on A, must rerun too (both append-logs double).
        """
        a = ppg.FileGeneratingJob("out/A",
                                  lambda: writeappend("out/A", "out/Ac", "A"))
        p = ppg.ParameterInvariant("p", "p")
        a.depends_on(p)

        def gen():
            # C only comes into existence while the graph is running
            c = ppg.FileGeneratingJob(
                "out/C", lambda: writeappend("out/C", "out/Cx", "C"))
            c.depends_on(a)

        ppg.JobGeneratingJob("b", gen)
        ppg.run_pipegraph()
        assert read("out/A") == "A"
        assert read("out/Ac") == "A"
        assert read("out/C") == "C"
        assert read("out/Cx") == "C"
        new_pipegraph.new_pipegraph()

        # second run: changed parameter invalidates A, and transitively C
        a = ppg.FileGeneratingJob("out/A",
                                  lambda: writeappend("out/A", "out/Ac", "A"))
        p = ppg.ParameterInvariant("p", "p2")
        a.depends_on(p)
        ppg.JobGeneratingJob("b", gen)
        ppg.run_pipegraph()
        assert read("out/Ac") == "AA"
        assert read("out/Cx") == "CC"
Esempio n. 3
0
 def __init__(
     self,
     genes_or_dataframe: Union[Genes, DataFrame],
     phenotypes: Tuple[str, str],
     columns_a_b: Tuple[List[str], List[str]],
     name: str = "Gct_df_default",
     dependencies: List[Job] = [],
 ):
     """Prepare a GCT input-file definition for GSEA.

     Parameters
     ----------
     genes_or_dataframe : Genes or DataFrame
         Source of expression values; a Genes object contributes its
         name and load() job.
     phenotypes : Tuple[str, str]
         The two phenotype labels being compared.
     columns_a_b : Tuple[List[str], List[str]]
         Expression columns for each phenotype group.
     name : str
         Base name used when a plain DataFrame is given.
     dependencies : List[Job]
         Additional upstream jobs.
     """
     # Copy the list: the default [] is a single shared object, and the
     # appends below would otherwise accumulate across instances (and
     # mutate a caller-supplied list).
     self.dependencies = list(dependencies)
     if isinstance(genes_or_dataframe, Genes):
         self.name = (
             f"Gct_{genes_or_dataframe.name}_{phenotypes[0]}_vs_{phenotypes[1]}"
         )
         self.dependencies.append(genes_or_dataframe.load())
     else:
         self.name = f"{name}_{phenotypes[0]}_vs_{phenotypes[1]}"
     self.cache_dir = Path("cache") / "gct" / self.name
     self.columns_a_b = columns_a_b
     self.phenotypes = phenotypes
     self.genes_or_dataframe = genes_or_dataframe
     # rerun when phenotypes, column selections or the derived name change
     self.dependencies.append(
         ppg.ParameterInvariant(
             self.name,
             list(phenotypes) + columns_a_b[0] + columns_a_b[1] +
             [self.name],
         ))
     self._gct = self.cache_dir / "input.gct"
Esempio n. 4
0
    def calc_regions(self):
        """Job: compute and cache the regions to draw (attribute ``regions_``).

        The cache key hashes the region strategy, the GR and the lane names.
        """

        def calc():
            return self.do_calc_regions()

        # sorted(): iterating a set of strings varies with hash randomization
        # between interpreter runs, so the previous list(set(...)) produced a
        # different md5 key (and a spurious recalculation) per process.
        key = hashlib.md5(
            ",".join(
                [self.gr_to_draw.name, self.region_strategy.name]
                + sorted({x.name for x in self.lanes_to_draw})
            ).encode()
        ).hexdigest()
        # technically, we could share the regions job between heatmaps with the
        # same regions but different lanes, but we're using a
        # CachedAttributeLoadingJob and that would complicate things quite a bit
        of = self.cache_dir / "regions" / key
        of.parent.mkdir(exist_ok=True, parents=True)
        return ppg.CachedAttributeLoadingJob(of, self, "regions_", calc).depends_on(
            [
                ppg.ParameterInvariant(
                    of, (self.region_strategy.name, self.gr_to_draw.name)
                ),
                ppg.FunctionInvariant(
                    "genomics.regions.heatmap."
                    + self.region_strategy.name
                    + "calc_func",
                    self.region_strategy.__class__.calc,
                ),
            ]
            + self.region_strategy.get_dependencies(self.gr_to_draw)
        )
Esempio n. 5
0
def FromDifference(name, a, b, sheet_name="Differences"):
    """a minus b: genes present in *a* whose stable id is absent from *b*.

    Returns the filtered Genes object; its ``parent`` is set to *a*.
    """
    def do_load(df):
        remove_ids = set(b.df["gene_stable_id"])
        keep = ~np.array(
            [stable_id in remove_ids for stable_id in a.df["gene_stable_id"]],
            # np.bool was removed in NumPy 1.24; the builtin is equivalent
            dtype=bool,
        )
        return keep

    if a.load_strategy.build_deps:
        deps = [
            a.load(),
            b.load(),
            ppg.ParameterInvariant(
                "Genes_%s_parents" % name,
                (a.name,
                 b.name)),  # so if you swap out the gr, it's detected...
        ]
    else:
        deps = []

    res = a.filter(
        name,
        do_load,
        dependencies=deps,
        sheet_name=sheet_name,
        vid=["Difference", a.vid, b.vid],
    )
    res.parent = a
    return res
Esempio n. 6
0
    def calc_order(self):
        """Job: compute and cache the row order (attribute ``order_``)."""

        def calc():
            return self.do_calc_order()

        cache_file = self.cache_dir / "order"
        strategy_result = self.order_strategy.get_dependencies(
            self.heatmap.gr_to_draw, self.heatmap.lanes_to_draw
        )
        # Strategies return either (deps, params) or (deps, params, func).
        if len(strategy_result) == 2:
            order_deps, order_params = strategy_result
            order_func = None
        else:
            order_deps, order_params, order_func = strategy_result

        job = ppg.CachedAttributeLoadingJob(cache_file, self, "order_", calc)
        invariants = [
            self.heatmap.calc_raw_data(),
            self.calc_norm_data(),
            ppg.ParameterInvariant(
                cache_file, (self.order_strategy.name,) + order_params
            ),
            ppg.FunctionInvariant(cache_file.name + "_secondary_func", order_func),
            ppg.FunctionInvariant(
                "genomics.regions.heatmap."
                + self.order_strategy.name
                + "calc_func",
                self.order_strategy.__class__.calc,
            ),
        ]
        return job.depends_on(invariants + order_deps)
Esempio n. 7
0
    def plot(self):
        """Render the heatmap: a FileGeneratingJob inside a pipegraph,
        an immediate render (returning the output path) outside one."""
        normed = self.normed_ddf(self.ddf)
        ordered = self.ordered_ddf(normed)
        names = self.handle_names()

        def plot():
            p = self.plot_strategy.plot(ordered.df, names, self.plot_options)
            self.plot_strategy.render(str(self.output_filename), p)

        if not ppg.inside_ppg():
            plot()
            return self.output_filename

        ppg.util.global_pipegraph.quiet = False
        strategy = self.plot_strategy
        deps = [
            ordered.load(),
            ppg.FunctionInvariant(
                "mbf_heatmap." + strategy.name + "plot_func",
                strategy.__class__.plot,
            ),
            ppg.FunctionInvariant(
                "mbf_heatmap" + strategy.name + "render_func",
                strategy.__class__.render,
            ),
            ppg.ParameterInvariant(
                self.output_filename,
                freeze((self.names, self.plot_options)),
            ),
        ]
        return ppg.FileGeneratingJob(self.output_filename, plot).depends_on(deps)
Esempio n. 8
0
 def __init__(
     self,
     collection_name: str,
     genome: EnsemblGenome,
     version: str = "7.1",
     subset: str = "all",
 ):
     """MSigDB gene-set collection bound to an Ensembl genome.

     Raises
     ------
     ValueError
         If the genome revision is < 97, the version string is not
         numeric, or the version predates the Ensembl mapping (< 7.0).
     """
     name = ".".join([collection_name, subset, "v" + version])
     super().__init__(name, genome)
     if int(self.genome.revision) < 97:
         raise ValueError(
             "Please use an Ensembl Genome from revision 97 onward.")
     try:
         v = float(version)
     except ValueError:
         raise ValueError(
             f"Cannot understand version {version}. It should be something like 7.1."
         )
     if v < 7:
         # the f-prefix was missing here, so the message printed the
         # literal "{version}" instead of the actual value
         raise ValueError(
             f"MSigDB Ensembl mapping only works for version 7.0 onward. Version was {version}."
         )
     self.version = version
     self.subset = subset
     self.collection_name = name
     self.input_file = self.cache_dir / (self.name + ".gmt")
     self.dependencies = [
         ppg.ParameterInvariant(
             self.name, [self.collection_name, self.version, self.subset])
     ]
Esempio n. 9
0
def plot_venn_from_genes_with_comparisons(output_prefix,
                                          a_dict,
                                          id_column="gene_stable_id"):
    """Plot two venn diagrams (up- and down-regulated gene ids) for the
    2 or 3 Genes objects in *a_dict*; returns the MultiFileGeneratingJob."""
    if len(a_dict) not in (2, 3):
        raise ValueError("Max support 3 sets currently")

    def plot():
        up = {}
        down = {}
        for name, genes_ddf in sorted(a_dict.items()):
            df = genes_ddf.df
            stable_ids = df[id_column]
            column = genes_ddf.venn_annotator["log2FC"]
            up[name] = set(stable_ids[df[column] > 0])
            down[name] = set(stable_ids[df[column] < 0])
        for id_sets, suffix in ((up, ".up.png"), (down, ".down.png")):
            plt.figure(figsize=(4, 4))
            venn.venn(id_sets)
            plt.savefig(str(output_prefix) + suffix, dpi=72)

    output_files = [
        str(output_prefix) + ".up.png",
        str(output_prefix) + ".down.png",
    ]
    job = ppg.MultiFileGeneratingJob(output_files, plot)
    job.depends_on(
        [x.add_annotator(x.venn_annotator) for x in a_dict.values()])
    return job.depends_on(ppg.ParameterInvariant(output_prefix, id_column))
Esempio n. 10
0
def GenomicRegions_Union(name,
                         list_of_grs,
                         summit_annotator=None,
                         sheet_name="Overlaps"):
    """Combine several GRs into one.

    Do not set on_overlap


    """
    verify_same_genome(list_of_grs)

    def load():
        frames = [gr.df[["chr", "start", "stop"]] for gr in list_of_grs]
        return pd.concat(frames, axis=0)

    deps = []
    if ppg.inside_ppg():
        deps.extend(gr.load() for gr in list_of_grs)
        # rerun if the set of input GRs changes
        deps.append(
            ppg.ParameterInvariant(
                name + "_input_grs",
                sorted(x.name for x in list_of_grs)))
    vid = ("union", [x.vid for x in list_of_grs])
    return GenomicRegions(
        name,
        load,
        deps,
        list_of_grs[0].genome,
        on_overlap="merge",
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Esempio n. 11
0
    def test_random_same_number(self):
        """convert() keeps the row count, shifts start by +1 and drops
        extra columns; explicit dependencies end up on the converted GR."""

        def sample_data():
            return pd.DataFrame({
                "chr": ["1", "2", "1"],
                "start": [10, 100, 1000],
                "stop": [12, 110, 1110],
                "column_that_will_disappear": ["A", "b", "c"],
            })

        def convert(df):
            res = df[["chr", "start", "stop"]]
            res = res.assign(start=res["start"] + 1)
            return res

        if ppg.inside_ppg():
            # ("hello",): the previous ("hello") was just a parenthesized
            # string, not the one-element tuple the test intended
            deps = [ppg.ParameterInvariant("shuParam", ("hello",))]
        else:
            deps = []
        a = regions.GenomicRegions("sharum", sample_data, [],
                                   get_genome_chr_length())
        a.add_annotator(Constant("Constant", 5))
        a.annotate()
        b = a.convert("a+1", convert, dependencies=deps)
        force_load(b.load())
        for d in deps:
            assert d in b.load().lfg.prerequisites
        run_pipegraph()
        assert len(a.df) == len(b.df)
        assert (a.df["start"] == b.df["start"] - 1).all()
        assert "column_that_will_disappear" in a.df.columns
        assert not ("column_that_will_disappear" in b.df.columns)
Esempio n. 12
0
    def get_dependencies(self):
        """Base dependencies plus the other lane's load job and an
        invariant on the installed mbf_bam version."""
        import mbf_bam

        extra = [
            self.other_alignment.load(),
            ppg.ParameterInvariant("SubtractOtherLane.mbf_bam.version",
                                   mbf_bam.__version__),
        ]
        return super().get_dependencies() + extra
Esempio n. 13
0
    def deps(self):
        """ParameterInvariant on the installed DESeq2 package version, so
        results are recomputed when DESeq2 is upgraded."""
        import rpy2.robjects as ro

        ro.r("library('DESeq2')")
        deseq2_version = str(ro.r("packageVersion")("DESeq2"))
        invariant_name = self.__class__.__name__ + "_" + self.name
        return ppg.ParameterInvariant(invariant_name, (deseq2_version,))
Esempio n. 14
0
    def deps(self):
        """ParameterInvariant on the installed edgeR version and the
        minimum-count filter, so results recompute when either changes."""
        import rpy2.robjects as ro

        ro.r("library('edgeR')")
        edger_version = str(ro.r("packageVersion")("edgeR"))
        invariant_name = self.__class__.__name__ + "_" + self.name
        return ppg.ParameterInvariant(
            invariant_name,
            (edger_version, self.ignore_if_max_count_less_than),
        )
Esempio n. 15
0
 def get_dependencies(self):
     """Single ParameterInvariant over the clustering parameters."""
     params = [
         self.no_of_clusters,
         self.threshold,
         self.affinity,
         self.linkage,
     ]
     return [ppg.ParameterInvariant(self.name + "parameters", params)]
Esempio n. 16
0
 def deps(self):
     """Annotator jobs, the ddf load job, and an invariant over the input
     columns (in stable group order)."""
     input_columns = [
         ac[1]
         for group in sorted(self.groups_to_samples)
         for ac in self.groups_to_samples[group]
     ]
     # you might be working with an anno-less ddf after all
     annotator_jobs = [
         self.ddf.add_annotator(ac[0])
         for ac in self.samples
         if ac[0] is not None
     ]
     return annotator_jobs + [
         self.ddf.load(),
         ppg.ParameterInvariant(self.name, freeze(input_columns)),
     ]
Esempio n. 17
0
 def __init__(self, phenotypes: Tuple[str, str],
              columns_a_b: Tuple[List[str], List[str]]):
     """Phenotype/class-label (.cls) input-file definition for GSEA."""
     self.name = f"Cls_{phenotypes[0]}_vs_{phenotypes[1]}"
     self.cache_dir = Path("cache") / "cls" / self.name
     self.columns_a_b = columns_a_b
     self.phenotypes = phenotypes
     invariant_params = list(phenotypes) + columns_a_b[0] + columns_a_b[1]
     self.dependencies = [
         ppg.ParameterInvariant(self.name, invariant_params)
     ]
     self._cls = self.cache_dir / "input.cls"
Esempio n. 18
0
def generate_download_jobs():
    """Create one download job per retrieved URL plus an invariant on the
    URL list itself."""
    # 'urls' is a global on purpose: later data-loading / job-generating
    # jobs read it (this is only possible there, not in the output jobs).
    global urls
    urls = retrieve_urls()
    jobs = [download_job(url) for url in urls]
    # Ensures the output job reruns when URLs are *removed*; URLs not seen
    # before would make it rerun either way.
    jobs.append(pypipegraph.ParameterInvariant('retrieved_urls', urls))
    return jobs
Esempio n. 19
0
 def __init__(self, urls):
     """Set up download jobs for *urls* (a string or an iterable of
     strings) plus an invariant keyed on a digest of the URL list."""
     self.urls = sorted([urls] if isinstance(urls, str) else urls)
     self.target_files = self.name_files()
     self.jobs = self.download_files()
     url_digest = hashlib.md5("".join(self.urls).encode("utf8")).hexdigest()
     self.dependencies = self.jobs + [
         ppg.ParameterInvariant(url_digest, sorted(self.urls))
     ]
Esempio n. 20
0
 def get_dependencies(self):
     """Load/write jobs for every peak set plus an invariant over the
     (sorted) peak-set names."""
     res = []
     for peakset in self._get_regions():
         res.extend((
             peakset.load(),
             peakset.write_bigbed()[0],
             peakset.write()[0],
         ))
     res.append(
         ppg.ParameterInvariant(
             self.get_filename(),
             tuple(sorted(x.name for x in self._get_regions())),
         ))
     return res
Esempio n. 21
0
def generate_stitched_fastq(output_file: Path,
                            r1: Path,
                            r2: Path,
                            dependencies: List[Job] = [],
                            options: Dict[str, str] = {}):
    """
    generate_stitched_fastq wrapper for ngmerge.

    Parameters
    ----------
    output_file : Path
        Output file path for the new fastq file.
    r1 : Path
        Path to R1 file.
    r2 : Path
        Path to R2 file.
    dependencies : List[Job], optional
        List of dependencies, by default [].
    options : Dict[str, str], optional
        Additional options to pass to ngmerge, by default {}.

    Returns
    -------
    Job
        FileGeneratingJob that creates the merged bam file.
    """
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # Copy: appending to the caller's list (or to the shared mutable
    # default []) would leak this job's invariant into unrelated calls.
    deps = list(dependencies)
    # sorted(options.items()): the previous list(options) captured only the
    # keys, so changing an option's *value* never invalidated the job.
    deps.append(
        ppg.ParameterInvariant(f"PI_{output_file}", sorted(options.items())))

    def __dump():
        if not output_file.exists():
            cmd = [
                "/project/code/NGmerge/NGmerge",
                "-1",
                str(r1),
                "-2",
                str(r2),
                "-s",
                "-o",
                str(output_file),
            ]
            for k, v in options.items():
                if v == "":
                    # flag-style option without a value
                    cmd.append(k)
                else:
                    cmd.extend([k, v])
            print(" ".join(cmd))
            subprocess.check_call(cmd)

    job = ppg.FileGeneratingJob(output_file, __dump).depends_on(deps)
    return job
Esempio n. 22
0
    def allow_access(self, groups):
        """Job: write web/permissions.dat listing the groups that may
        access this project."""
        filename = Path("web/permissions.dat")

        def do_dump(groups=groups):
            # a single group may be passed as a plain string
            if not hasattr(groups, "__iter__"):
                groups = [groups]
            filename.parent.mkdir(exist_ok=True)
            filename.write_text("\n".join(groups) + "\n")

        invariant = ppg.ParameterInvariant(filename, groups)
        job = ppg.FileGeneratingJob(filename, do_dump)
        return job.depends_on(invariant)
Esempio n. 23
0
    def align_job(
        self,
        input_fastq,
        paired_end_filename,
        index_basename,
        output_bam_filename,
        parameters,
    ):
        """Build and return the job that runs STAR on the given fastq(s).

        ``parameters`` is a dict of extra STAR command-line flags; a
        ParameterInvariant on them reruns the alignment when they change.
        The sorted BAM is renamed to ``output_bam_filename`` afterwards.
        """
        # argv list handed to self.run(); argument order matters to STAR
        cmd = [
            "FROM_ALIGNER",
            str(self.path / f"STAR-{self.version}" / "bin" /
                "Linux_x86_64_static" / "STAR"),
            "--genomeDir",
            Path(index_basename).absolute(),
            "--genomeLoad",
            "NoSharedMemory",
            "--readFilesIn",
        ]
        # STAR uses ',' to separate multiple fastq files per mate, so a
        # comma inside a filename would be silently split
        if ',' in str(input_fastq) or (
                paired_end_filename
                and ',' in str(paired_end_filename)):  # pragma: no cover
            raise ValueError(
                "STAR does not handle fastq filenames with a comma")
        if paired_end_filename:
            # NOTE(review): the paired-end file is passed before input_fastq
            # here - presumably matching the lane's mate ordering; confirm.
            cmd.extend([
                '"%s"' % Path(paired_end_filename).absolute(),
                '"%s"' % Path(input_fastq).absolute(),
            ])
        else:
            cmd.extend([Path(input_fastq).absolute()])
        cmd.extend(["--outSAMtype", "BAM", "SortedByCoordinate"])
        # append user-supplied STAR flags verbatim (values stringified)
        for k, v in parameters.items():
            cmd.append(k)
            cmd.append(str(v))

        def rename_after_alignment():
            # STAR always writes this fixed name; move it to the target name
            ob = Path(output_bam_filename)
            (ob.parent / "Aligned.sortedByCoord.out.bam").rename(ob.parent /
                                                                 ob.name)

        job = self.run(
            Path(output_bam_filename).parent,
            cmd,
            cwd=Path(output_bam_filename).parent,
            call_afterwards=rename_after_alignment,
            additional_files_created=[output_bam_filename],
        )
        # sorted() keeps the invariant stable regardless of dict order
        job.depends_on(
            ppg.ParameterInvariant(output_bam_filename,
                                   sorted(parameters.items())))
        return job
Esempio n. 24
0
 def __init__(
     self,
     name,
     scaler=None,
     imputer=None,
     missing_value=np.nan,
     cluster_columns=False,
     eps=0.5,
     min_samples=5,
     metric="euclidean",
     algorithm="auto",
     leaf_size=30,
     p=None,
     n_jobs=1,
     dependencies=None,
 ):
     """
     This is a wrapper for DBSCAN
     @param eps maximum neighborhood distance
     @param min_samples = minimum number of neighbors for a point to be a valid core point
     @param metric distance metric, allowed is string (metrics.pairwise.calculate_distance) or callable
     @param algorithm nearest neighbor algorithm, allowed is 'auto', 'ball_tree', 'kd_tree' or 'brute'
     @param leaf_size leaf_size passed to ball_tree or kd_tree
     @param p power for minkowski metric to calculate distances
     @param dependencies optional list of upstream jobs; extended in place
         with this method's invariants when provided
     """
     self.eps = eps
     self.name = name
     self.min_samples = min_samples
     self.metric = metric
     self.algorithm = algorithm
     self.leaf_size = leaf_size
     self.p = p
     self.n_jobs = n_jobs
     if dependencies is None:
         # default was previously a shared mutable [] that '+=' below
         # polluted across instances
         dependencies = []
     # '+=' intentionally extends a caller-supplied list in place so the
     # caller collects this clusterer's invariants
     dependencies += [
         ppg.ParameterInvariant(
             self.name + "_parameters",
             # previous list had `p` twice and omitted n_jobs; now matches
             # the parameter list used by get_dependencies
             [eps, min_samples, metric, algorithm, leaf_size, p, n_jobs],
         ),
         ppg.FunctionInvariant(self.name + "_fit", self.fit),
     ]
     ClusteringMethod.__init__(self, name)
     self.clustering = sklearn.cluster.DBSCAN(
         eps=self.eps,
         min_samples=self.min_samples,
         metric=self.metric,
         algorithm=self.algorithm,
         leaf_size=self.leaf_size,
         p=self.p,
         n_jobs=self.n_jobs,
     )
     # DBSCAN has no separate predict; expose fit_predict under that name
     self.clustering.predict = self.clustering.fit_predict
0
    def __call__(self):
        """Wire up and return nothing: creates the FileGeneratingJob that
        renders this heatmap plot, with all invariants and data jobs it
        depends on (norm data, row order, plot/render functions, GR load,
        plot parameters, and - if callable - the names function).
        """
        norm_job = self.calc_norm_data()
        order_job = self.calc_order()
        # display names, one per lane, in lane order
        names_in_order = [
            self.handle_name(self.names, x, ii)
            for (ii, x) in enumerate(self.heatmap.lanes_to_draw)
        ]

        def plot():
            p = self.do_plot()
            self.plot_strategy.render(self.output_filename, p)

        plot_job = ppg.FileGeneratingJob(self.output_filename, plot)
        # the local plot() closure changes with refactors; invariants below
        # track the actually relevant functions instead
        plot_job.ignore_code_changes()
        plot_job.depends_on(norm_job)
        plot_job.depends_on(order_job)
        plot_job.depends_on(
            ppg.FunctionInvariant(
                "genomics.regions._HeatmapPlot.do_plot", _HeatmapPlot.do_plot
            )
        )
        plot_job.depends_on(
            ppg.FunctionInvariant(
                "genomics.regions.heatmap." + self.plot_strategy.name + "plot_func",
                self.plot_strategy.__class__.plot,
            )
        )
        plot_job.depends_on(
            ppg.FunctionInvariant(
                "genomics.regions.heatmap." + self.plot_strategy.name + "render_func",
                self.plot_strategy.__class__.render,
            )
        )
        plot_job.depends_on(self.heatmap.gr_to_draw.load())
        # rerun when plot options or the rendered lane names change
        plot_job.depends_on(
            ppg.ParameterInvariant(
                self.output_filename,
                self.plot_strategy.get_parameters(
                    self.plot_options, self.heatmap.lanes_to_draw
                )
                + (names_in_order,),
            )
        )
        plot_job.depends_on(
            self.plot_strategy.get_dependencies(self, self.plot_options)
        )
        if hasattr(self.names, "__call__"):
            # names may be computed by a function; track its code too
            plot_job.depends_on(
                ppg.FunctionInvariant(self.output_filename + "_names", self.names)
            )
Esempio n. 26
0
def GenomicRegions_Intersection(new_name,
                                gr_a,
                                gr_b,
                                summit_annotator=None,
                                sheet_name="intersection"):
    """Create an intersection of all intervals...

        [(10, 100), (400, 450)] and [(80, 120), (600, 700)]
        become [(80, 100)].

    Note that all interval-set based operations (union, intersection,
    difference) drop all columns but chr, start, stop (annotators are
    merged and re-added from all sets involved).
    """
    verify_same_genome([gr_a, gr_b])

    def do_load():
        rows = [
            {"chr": chr, "start": start, "stop": stop}
            for chr, start, stop in gr_a._iter_intersections(gr_b)
        ]
        if rows:
            return pd.DataFrame(rows)
        return pd.DataFrame({"chr": [], "start": [], "stop": []})

    if gr_a.load_strategy.build_deps:
        deps = [
            gr_b.load(),
            gr_a.load(),
            # so if you swap out the gr, it's detected...
            ppg.ParameterInvariant(
                "GenomicRegions_%s_parents" % new_name,
                (gr_a.name, gr_b.name)),
        ]
    else:
        deps = []
        # NOTE(review): only gr_b is loaded eagerly here - presumably gr_a
        # is already loaded in this strategy; confirm before relying on it
        gr_b.load()

    return GenomicRegions(
        new_name,
        do_load,
        deps,
        gr_a.genome,
        on_overlap="merge",
        summit_annotator=summit_annotator,
        vid=["intersection"] + gr_a.vid + gr_b.vid,
        sheet_name=sheet_name,
    )
Esempio n. 27
0
    def write(self,
              output_filename=None,
              mangler_function=None,
              float_format="%4g"):
        """Job: Store the internal DataFrame (df) in a table.
        To sort, filter, remove columns, etc before output,
        pass in a mangler_function (takes df, returns df)

        Retruns a (Job, Path) tuple - job is None if outside ppg
        """
        output_filename = self.pathify(output_filename,
                                       self.get_table_filename().absolute())

        def write(output_filename):
            df = (mangler_function(self.df.copy())
                  if mangler_function
                  else self.mangle_df_for_write(self.df))
            if str(output_filename).endswith(".xls"):
                # fall back to tab-separated output if Excel writing raises
                try:
                    df.to_excel(output_filename,
                                index=False,
                                float_format=float_format)
                except ValueError:
                    df.to_csv(output_filename,
                              sep="\t",
                              index=False,
                              float_format=float_format)
            else:
                df.to_csv(output_filename,
                          sep="\t",
                          index=False,
                          encoding="utf-8",
                          float_format=float_format)

        deps = []
        if self.load_strategy.build_deps:
            deps = [
                self.annotate(),
                ppg.FunctionInvariant(
                    str(output_filename) + "_mangler", mangler_function),
                ppg.ParameterInvariant(str(output_filename), float_format),
            ]
        return self.load_strategy.generate_file(output_filename, write, deps)
Esempio n. 28
0
    def __init__(self, species: str = "Homo_sapiens", version: str = "7.1"):
        """
        An ensembl chip object that takes care of file download and input.chip
        generation for GSEA.

        Parameters
        ----------
        version : str, optional
            The MSigDB version, by default "7.1"
        species : str, optional
            The species, by default "Homo_sapiens". Currently only supports
            Human, Mouse and Rat data.

        Raises
        ------
        ValueError
            If an unsupported species is provided, or an unsupported
            version for Mouse/Rat (previously ``self.url`` was silently
            left unset, causing an AttributeError much later).
        """
        if species == "Homo_sapiens":
            self.species = "Human"
            self.url = f"https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/{self.species}_ENSEMBL_Gene_MSigDB.v{version}.chip"
        elif species == "Mus_musculus":
            self.species = "Mouse"
            if version == "7.0":
                self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Mouse_ENSEMBL_Gene_ID_MSigDB.v7.0.chip"
            elif version == "7.1":
                self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Mouse_ENSEMBL_Gene_ID_to_Human_Orthologs_MSigDB.v7.1.chip"
            elif version == "7.2":
                self.url = f"https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Mouse_ENSEMBL_Gene_ID_Human_Orthologs_MSigDB.v{version}.chip"
            else:
                raise ValueError(
                    f"Unsupported MSigDB version {version} for species {species}."
                )
        elif species == "Rattus_norvegicus":
            self.species = "Rat"
            if version == "7.0":
                self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Rat_ENSEMBL_Gene_ID_MSigDB.v7.0.chip"
            elif version == "7.1":
                # the URL previously contained a stray tab plus
                # "30-Mar-2020 16:56" copied from the directory listing,
                # which broke the download
                self.url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Rat_ENSEMBL_Gene_ID_to_Human_Orthologs_MSigDB.v7.1.chip"
            elif version == "7.2":
                self.url = f"https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/Rat_ENSEMBL_Gene_ID_Human_Orthologs_MSigDB.v{version}.chip"
            else:
                raise ValueError(
                    f"Unsupported MSigDB version {version} for species {species}."
                )
        else:
            raise ValueError(
                f"Currently the species {species} is not supported. Check MsigDB chip files at https://data.broadinstitute.org/gsea-msigdb/msigdb/annotations_versioned/."
            )
        self.name = self.__class__.generate_name(species, version)
        self.version = version
        self.cache_dir = Path("cache") / "chip" / self.name
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._chip = self.cache_dir / "input.chip"
        self.dependencies = [
            ppg.ParameterInvariant(self.name, [self.species, self.version])
        ]
Esempio n. 29
0
 def get_dependencies(self):
     """Single ParameterInvariant over all DBSCAN parameters."""
     params = [
         self.eps,
         self.min_samples,
         self.metric,
         self.algorithm,
         self.leaf_size,
         self.p,
         self.n_jobs,
     ]
     return [ppg.ParameterInvariant(self.name + "parameters", params)]
Esempio n. 30
0
 def get_dependencies(self):
     """Load jobs for every lane plus a filename invariant over the
     (sorted) lane and browser-lane names."""
     # NOTE: read-distribution and gbrowse-adjustment dependencies used to
     # be collected here as well; currently disabled
     deps = [lane.load() for lane in self._get_lanes()]
     lane_names = tuple(sorted(x.name for x in self._get_lanes()))
     browser_lane_names = tuple(
         sorted(x.name for x in self._get_browser_lanes()))
     deps.append(
         ppg.ParameterInvariant(
             self.get_filename(),
             lane_names + browser_lane_names,
         ))
     return deps