Example #1
def GenomicRegions_Union(name,
                         list_of_grs,
                         summit_annotator=None,
                         sheet_name="Overlaps"):
    """Combine serveral GRs into one.

    Do not set on_overlap


    """
    verify_same_genome(list_of_grs)

    def load():
        dfs = [x.df[["chr", "start", "stop"]] for x in list_of_grs]
        return pd.concat(dfs, axis=0)

    if ppg.inside_ppg():
        deps = [x.load() for x in list_of_grs]
        deps.append(
            ppg.ParameterInvariant(name + "_input_grs",
                                   list(sorted([x.name
                                                for x in list_of_grs]))))
    else:
        deps = []
    vid = ("union", [x.vid for x in list_of_grs])
    return GenomicRegions(
        name,
        load,
        deps,
        list_of_grs[0].genome,
        on_overlap="merge",
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
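A minimal usage sketch (hedged: `peaks_a` and `peaks_b` stand for existing GenomicRegions on the same genome; the names are hypothetical):

union = GenomicRegions_Union("peaks_union", [peaks_a, peaks_b])
if ppg.inside_ppg():
    ppg.run_pipegraph()  # deferred: the load/merge jobs run here
print(union.df[["chr", "start", "stop"]].head())  # direct mode loaded eagerly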
Example #2
    def __init__(self, ddf, groups_to_samples, name=None):
        if not isinstance(ddf, DelayedDataFrame):
            raise ValueError("Ddf must be a DelayedDataFrame")
        self.ddf = ddf
        self.groups_to_samples = self._check_input_dict(groups_to_samples)
        self.sample_column_to_group = self._sample_columns_to_group()
        self.samples = functools.reduce(
            list.__add__, [x[1] for x in sorted(self.groups_to_samples.items())]
        )
        if name is None:
            self.name = "comparison__" + "_".join(sorted(self.groups_to_samples.keys()))
        else:
            self.name = "comparison__" + name
        self.result_dir = self.ddf.result_dir / self.name
        self.result_dir.mkdir(exist_ok=True, parents=True)
        if ppg.inside_ppg():
            ppg.assert_uniqueness_of_object(self)
            if not hasattr(ppg.util.global_pipegraph, "_mbf_comparisons_name_dedup"):
                ppg.util.global_pipegraph._mbf_comparisons_name_dedup = set()
            for group_name in self.groups_to_samples:
                if group_name in ppg.util.global_pipegraph._mbf_comparisons_name_dedup:
                    raise ValueError(
                        f"Comparisons group {group_name} defined in multiple Comparisons - not supported"
                    )
                ppg.util.global_pipegraph._mbf_comparisons_name_dedup.add(group_name)

        self.register_qc()
Example #3
    def plot(self):
        normed = self.normed_ddf(self.ddf)
        ordered = self.ordered_ddf(normed)
        names = self.handle_names()

        def plot():
            p = self.plot_strategy.plot(ordered.df, names, self.plot_options)
            self.plot_strategy.render(str(self.output_filename), p)

        if ppg.inside_ppg():
            ppg.util.global_pipegraph.quiet = False
            deps = [
                ordered.load(),
                ppg.FunctionInvariant(
                    "mbf_heatmap." + self.plot_strategy.name + "plot_func",
                    self.plot_strategy.__class__.plot,
                ),
                ppg.FunctionInvariant(
                    "mbf_heatmap" + self.plot_strategy.name + "render_func",
                    self.plot_strategy.__class__.render,
                ),
                ppg.ParameterInvariant(self.output_filename,
                                       freeze(
                                           (self.names, self.plot_options))),
            ]
            return ppg.FileGeneratingJob(self.output_filename,
                                         plot).depends_on(deps)
        else:
            plot()
            return self.output_filename
Example #4
    def __new__(cls, *args, **kwargs):
        cn = cls.__name__
        if ppg.inside_ppg():
            if not hasattr(ppg.util.global_pipegraph,
                           "_annotator_singleton_dict"):
                ppg.util.global_pipegraph._annotator_singleton_dict = {
                    "lookup": []
                }
            singleton_dict = ppg.util.global_pipegraph._annotator_singleton_dict
        else:
            singleton_dict = annotator_singletons
        if cn not in singleton_dict:
            singleton_dict[cn] = {}
        key = {}
        for ii, arg in enumerate(args):
            key["arg_%i" % ii] = arg
        key.update(kwargs)
        for k, v in key.items():
            key[k] = freeze(v)
        key = tuple(sorted(key.items()))
        if key not in singleton_dict[cn]:
            singleton_dict[cn][key] = object.__new__(cls)
            singleton_dict["lookup"].append(singleton_dict[cn][key])

        return singleton_dict[cn][key]
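The ppg plumbing can obscure the memoization itself; here is the same idea as a self-contained sketch (plain Python, with repr() standing in for freeze() to make arguments hashable):

class _SingletonByArgs:
    _instances = {}  # (class name, frozen args) -> instance

    def __new__(cls, *args, **kwargs):
        key = (cls.__name__,
               tuple(repr(a) for a in args),
               tuple(sorted((k, repr(v)) for k, v in kwargs.items())))
        if key not in _SingletonByArgs._instances:
            _SingletonByArgs._instances[key] = object.__new__(cls)
        return _SingletonByArgs._instances[key]

assert _SingletonByArgs(1, x=2) is _SingletonByArgs(1, x=2)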
Example #5
    def __init__(self,
                 name,
                 loading_function,
                 dependencies=[],
                 result_dir=None):
        # assert_uniqueness_of_object is taken care of by the load_strategy
        self.name = name
        if result_dir:
            self.result_dir = Path(result_dir)
        else:
            self.result_dir = Path(
                "results") / self.__class__.__name__ / self.name
        self.result_dir.mkdir(parents=True, exist_ok=True)
        if isinstance(loading_function, pd.DataFrame):
            # don't you just love lambda variable binding?
            loading_function = (
                lambda loading_function=loading_function: loading_function)

        if not ppg.inside_ppg():
            self.load_strategy = Load_Direct(self, loading_function)
        else:
            self.load_strategy = Load_PPG(self, loading_function, dependencies)
        self.column_to_annotators = {}
        self.annotators = {}
        self.parent = None
        self.children = []
        # this prevents writing the same file with two different mangler functions
        # but still allows you to call write() in ppg settings multiple times
        # if different parts need to ensure it's being written out
        self.mangler_dict = {self.get_table_filename(): None}
        self.load()
Example #6
    def normed_ddf(self, input_ddf):
        def load():
            df = input_ddf.df[[ac[1] for ac in self.columns]]
            normed_df = self.normalization_strategy.calc(
                df, [ac[1] for ac in self.columns])
            return normed_df

        output_name = input_ddf.name + "_heatmap_" + self.normalization_strategy.name
        if ppg.inside_ppg():
            deps = [
                self.ddf.add_annotator(ac[0])
                for ac in self.columns if ac[0] is not None
            ] + [
                self.normalization_strategy.deps(),
                input_ddf.load(),
                ppg.FunctionInvariant(output_name + '_calc',
                                      self.normalization_strategy.calc)
            ]
        else:
            deps = []

        return DelayedDataFrame(
            output_name,
            load,
            deps,
            input_ddf.result_dir,
        )
Example #7
    def test_do_load_only_happens_once(self):
        df = pd.DataFrame([{
            "gene_stable_id": "fake1",
            "chr": "1",
            "strand": 1,
            "tss": 5000,
            "tes": 5500,
            "description": "bla",
        }])
        counter = [0]

        def load():
            counter[0] += 1
            return df

        g = genes.Genes(get_genome_chr_length(), load, name="shu")
        if ppg.inside_ppg():
            assert counter[0] == 0
            g.load()
            assert counter[0] == 0
            g.load()
            assert counter[0] == 0
            ppg.run_pipegraph()
        else:
            assert counter[0] == 1
            g.load()
            assert counter[0] == 1
Example #8
    def test_random_same_number(self):
        def sample_data():
            return pd.DataFrame({
                "chr": ["1", "2", "1"],
                "start": [10, 100, 1000],
                "stop": [12, 110, 1110],
                "column_that_will_disappear": ["A", "b", "c"],
            })

        def convert(df):
            res = df[["chr", "start", "stop"]]
            res = res.assign(start=res["start"] + 1)
            return res

        if ppg.inside_ppg():
            deps = [ppg.ParameterInvariant("shuParam", ("hello"))]
        else:
            deps = []
        a = regions.GenomicRegions("sharum", sample_data, [],
                                   get_genome_chr_length())
        a.add_annotator(Constant("Constant", 5))
        a.annotate()
        b = a.convert("a+1", convert, dependencies=deps)
        force_load(b.load())
        for d in deps:
            assert d in b.load().lfg.prerequisites
        run_pipegraph()
        assert len(a.df) == len(b.df)
        assert (a.df["start"] == b.df["start"] - 1).all()
        assert "column_that_will_disappear" in a.df.columns
        assert "column_that_will_disappear" not in b.df.columns
Example #9
    def test_multi_plus_filter(self, clear_annotators):
        d = DelayedDataFrame(
            "ex1",
            pd.DataFrame({
                "a1": [1 / 0.99, 2 / 0.99, 3 / 0.99],
                "a2": [1 * 0.99, 2 * 0.99, 3 * 0.99],
                "b1": [2 * 0.99, 8 * 0.99, (16 * 3) * 0.99],
                "b2": [2 / 0.99, 8 / 0.99, (16 * 3) / 0.99],
                "delta": [10, 20, 30],
            }),
        )
        c = Comparisons(d, {"a": ["a1", "a2"], "b": ["b1", "b2"]})
        a = c.a_vs_b("a", "b", Log2FC(), laplace_offset=0)
        anno1 = Constant("shu1", 5)
        anno2 = Constant("shu2", 5)  # noqa: F841
        anno3 = Constant("shu3", 5)  # noqa: F841
        to_test = [
            (("log2FC", "==", -1.0), [-1.0]),
            (("log2FC", ">", -2.0), [-1.0]),
            (("log2FC", "<", -2.0), [-4.0]),
            (("log2FC", ">=", -2.0), [-1.0, -2.0]),
            (("log2FC", "<=", -2.0), [-2.0, -4.0]),
            (("log2FC", "|>", 2.0), [-4.0]),
            (("log2FC", "|<", 2.0), [-1.0]),
            (("log2FC", "|>=", 2.0), [-2.0, -4.0]),
            (("log2FC", "|<=", 2.0), [-1.0, -2.0]),
            ((a["log2FC"], "<", -2.0), [-4.0]),
            (("log2FC", "|", -2.0), ValueError),
            ([("log2FC", "|>=", 2.0), ("log2FC", "<=", 0)], [-2.0, -4.0]),
            ((anno1, ">=", 5), [-1, -2.0, -4.0]),
            (((anno1, 0), ">=", 5), [-1, -2.0, -4.0]),
            (("shu2", ">=", 5), [-1, -2.0, -4.0]),
            (("delta", ">", 10), [-2.0, -4.0]),
        ]
        if not ppg.inside_ppg():  # can't test for missing columns in ppg.
            to_test.extend([(("log2FC_no_such_column", "<", -2.0), KeyError)])
        filtered = {}
        for ii, (f, r) in enumerate(to_test):
            if r in (ValueError, KeyError):
                with pytest.raises(r):
                    a.filter([f], "new%i" % ii)
            else:
                filtered[tuple(f)] = a.filter(
                    [f] if isinstance(f, tuple) else f, "new%i" % ii)
                assert filtered[tuple(f)].name == "new%i" % ii
                force_load(filtered[tuple(f)].annotate(),
                           filtered[tuple(f)].name)

        force_load(d.add_annotator(a), "somethingsomethingjob")
        run_pipegraph()
        c = a["log2FC"]
        assert (d.df[c] == [-1.0, -2.0, -4.0]).all()
        for f, r in to_test:
            if r not in (ValueError, KeyError):
                try:
                    assert filtered[tuple(f)].df[c].values == approx(r)
                except AssertionError:
                    print(f)
                    raise
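The filter specs exercised above are (column_or_annotator, operator, value) tuples, where a "|" prefix on the operator compares absolute values and a list of tuples ANDs the conditions. A usage sketch reusing `a` from the test:

strong = a.filter([("log2FC", "|>=", 2.0)], "strong_changes")
up_with_delta = a.filter([("log2FC", ">=", 1.0), ("delta", ">", 10)], "up_with_delta")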
Example #10
    def get_convert_func(self,
                         key,
                         keep_name=False,
                         filter_to_these_chromosomes=None):
        """Note that filter_to_these_chromosomes is after the replacements have kicked in"""
        chain_file = self.data_path / (key + ".over.chain")
        if not chain_file.exists():  # pragma: no cover
            raise ValueError("invalid liftover key, file not found: %s" %
                             chain_file)
        if filter_to_these_chromosomes:
            filter_to_these_chromosomes = set(filter_to_these_chromosomes)

        def do_convert(df):
            if df.index.duplicated().any():  # pragma: no cover
                raise ValueError("liftover only works with unique indices")
            df.index = [str(x) for x in df.index]
            input_tuples = [("chr" + row["chr"], row["start"], row["stop"],
                             idx) for idx, row in df.iterrows()]

            output_tuples = self.do_liftover(input_tuples, chain_file)
            output_lists = list(zip(*output_tuples))
            res = pd.DataFrame({
                "chr": output_lists[0],
                "start": output_lists[1],
                "stop": output_lists[2],
                "parent": [x.decode("utf-8") for x in output_lists[3]],
            }).set_index("parent")
            new_chr = []
            for x in res["chr"]:
                x = x[3:]
                # these are untested as of 2019-03-27
                if x == "m":  # pragma: no cover
                    x = "MT"
                elif (key in self.replacements
                      and x in self.replacements[key]):  # pragma: no cover
                    x = self.replacements[key][x]
                new_chr.append(x)
            res["chr"] = new_chr
            for col in df.columns:
                if col not in res.columns:
                    res = res.assign(**{col: df[col]})
            if filter_to_these_chromosomes:
                res = res[res["chr"].isin(filter_to_these_chromosomes)]
            return res

        if ppg.inside_ppg():
            do_convert.dependencies = [
                ppg.FileTimeInvariant(chain_file),
                ppg.FunctionInvariant(
                    "genomics.regions.convert.LiftOver.do_liftover",
                    LiftOver.do_liftover,
                ),
            ]
        return do_convert
Example #11
    def calc(self, df):
        if ppg.inside_ppg():
            data = self._data
        else:
            data = self.calc_data()
        lookup = self.count_strategy.extract_lookup(data)
        result = []
        for gene_stable_id in df["gene_stable_id"]:
            result.append(lookup.get(gene_stable_id, 0))
        result = np.array(result, dtype=float)
        return pd.Series(result)
Example #12
    def calc(self, df):
        if ppg.inside_ppg():
            data = self._data
        else:
            data = self.calc_data()
        lookup = self.count_strategy.extract_lookup(data)
        result = []
        for idx in df.index:
            result.append(lookup.get(str(idx), 0))
        result = np.array(result, dtype=float)
        return pd.Series(result)
Example #13
def force_load(job, prefix=None):
    """make sure a dataloadingjob has been loaded (if applicable)"""
    if ppg.inside_ppg():
        if not isinstance(job, ppg.Job):
            if prefix is None:
                global fl_count
                fl_count += 1
                prefix = "fl_%i" % fl_count
        else:
            prefix = job.job_id
        return ppg.JobGeneratingJob(prefix + "_force_load",
                                    lambda: None).depends_on(job)
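A usage sketch (names hypothetical; compare the real call in Example #9's test): force_load hangs a trivial JobGeneratingJob off the given job, so the data is materialized even though no other job consumes it.

force_load(ddf.add_annotator(my_annotator), "my_annotator_loaded")
run_pipegraph()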
Example #14
def GenomicRegions_FromTable(
    name,
    filename,
    genome,
    on_overlap="raise",
    summit_annotator=None,
    filter_func=None,
    vid=None,
    sheet_name="FromTable",
    drop_further_columns=True,
    chr_column="chr",
    start_column="start",
    stop_column="stop",
    one_based=False,
    reader=read_pandas,
):
    """Read a table file (csv/tsv/xls) with the chr/start/stop columns (renamed?), optionally
    drop all further columns"""
    def load():

        df = reader(filename)
        df["chr"] = df[chr_column].astype(str)
        df["start"] = df[start_column].astype(int)
        if one_based:  # pragma: no cover
            df["start"] -= 1
        df["stop"] = df[stop_column].astype(int)
        if drop_further_columns:  # pragma: no cover
            df = df[["chr", "start", "stop"]]
        if filter_func:  # pragma: no cover
            df = filter_func(df)
        return df

    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_filter_func", filter_func),
        ]
    else:
        deps = []
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
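A call sketch for this factory (file name and genome object are hypothetical), mapping one-based, differently named columns onto chr/start/stop:

gr = GenomicRegions_FromTable(
    "my_peaks",
    "peaks.csv",
    genome,
    chr_column="chromosome",
    start_column="peak_start",
    stop_column="peak_end",
    one_based=True,
)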
Example #15
    def __init__(self, species, revision, prebuild_manager):
        super().__init__()
        self.prebuild_manager = prebuild_manager

        self.species = species
        if not re.match(r"^[A-Z][a-z]+_[a-z]+$", species):
            raise ValueError("Species must be capitalized like 'Homo_sapiens")
        self.revision = str(int(revision))
        self.name = f"{self.species}_{self.revision}"
        if ppg.inside_ppg():
            ppg.util.assert_uniqueness_of_object(self)
        self.genetic_code = EukaryoticCode
        self.download_genome()
        self._seq_region_is_canonical = {}
        self._canonical_cache = {}
Example #16
    def test_find_annos_from_column(self, both_ppg_and_no_ppg_no_qc,
                                    clear_annotators):
        a = Constant("shu", 5)
        assert find_annos_from_column("shu") == [a]
        assert find_annos_from_column("shu")[0] is a
        with pytest.raises(KeyError):
            find_annos_from_column("nosuchcolumn")

        b = PolyConstant(["shu"], [10])
        assert find_annos_from_column("shu") == [a, b]

        if ppg.inside_ppg():
            both_ppg_and_no_ppg_no_qc.new_pipegraph()
            with pytest.raises(KeyError):
                find_annos_from_column("shu")
Example #17
    def test_filtering_by_definition(self):

        a = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        c = XAnno("C", [1, 2])
        a += c
        d = XAnno("D", [4, 5])

        # native column
        a1 = a.filter("a1", ("A", "==", 1))
        # search for the anno
        a2 = a.filter("a2", ("C", "==", 2))
        # extract the column name from the anno - anno already added
        a4 = a.filter("a4", (d, "==", 5))
        # extract the column name from the anno - anno not already added
        a3 = a.filter("a3", (c, "==", 1))
        # lookup column to name
        a6 = a.filter("a6", ("X", "==", 2), column_lookup={"X": "C"})
        # lookup column to anno
        a7 = a.filter("a7", ("X", "==", 2), column_lookup={"X": c})

        if not ppg.inside_ppg():
            e1 = XAnno("E", [6, 7])
            e2 = XAnno("E", [6, 8])
            assert find_annos_from_column("E") == [e1, e2]
            # column name no longer unique
            with pytest.raises(KeyError):
                a.filter("a5", ("E", "==", 5))
            with pytest.raises(KeyError):
                a.filter("a5", ((c, "D"), "==", 5))
        force_load(a1.annotate())
        force_load(a2.annotate())
        force_load(a3.annotate())
        force_load(a4.annotate())
        force_load(a6.annotate())
        force_load(a7.annotate())
        run_pipegraph()

        assert (a1.df["A"] == [1]).all()

        assert (a2.df["A"] == [2]).all()

        assert (a3.df["A"] == [1]).all()

        assert (a4.df["A"] == [2]).all()
        assert (a6.df["A"] == [2]).all()
        assert (a7.df["A"] == [2]).all()
Example #18
    def __iadd__(self, other):
        """Add and return self"""
        if isinstance(other, Annotator):
            if ppg.inside_ppg():
                if not self.has_annotator(other):
                    self.load_strategy.add_annotator(other)
                elif self.get_annotator(other.get_cache_name()) is not other:
                    raise ValueError(
                        "trying to add different annotators with identical cache_names\n%s\n%s"
                        % (other, self.get_annotator(other.get_cache_name())))
            else:
                self.load_strategy.add_annotator(other)

            return self
        else:
            return NotImplemented
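What this buys callers, sketched (`ddf` is a hypothetical DelayedDataFrame; because annotators with identical arguments are singletons, see Example #4, repeating the += is accepted as a no-op):

ddf += Constant("shu", 5)
ddf += Constant("shu", 5)  # same singleton object - no error, nothing added twice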
Example #19
def GenomicRegions_FromBigBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a BigBed file.
    @chromosome_mangler translates genome chromosomes into the bigbed's chromosomes!

    """
    from mbf_fileformats.bed import read_bigbed

    def load():
        res = read_bigbed(filename, genome.get_chromosome_lengths(),
                          chromosome_mangler)
        if (res["strand"] == 1).all():
            res = res.drop("strand", axis=1)
        if len(res) == 0:  # pragma: no cover
            raise ValueError(
                "Empty BigBed file (or wrong chromosome names) - %s" %
                filename)
        return res

    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler),
        ]
    else:
        deps = []

    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Example #20
    def __exit__(self, *tp):
        from _pytest.outcomes import fail

        if ppg.inside_ppg():
            with pytest.raises(ppg.RuntimeError) as e:
                run_pipegraph()
            assert isinstance(e.value.exceptions[0], self.expected_exception)
            if self.search_message:
                assert self.search_message in str(e.value.exceptions[0])
        else:
            __tracebackhide__ = True
            if tp[0] is None:
                fail(self.message)
            self.excinfo.__init__(tp)
            suppress_exception = issubclass(self.excinfo.type,
                                            self.expected_exception)
            if sys.version_info[0] == 2 and suppress_exception:
                sys.exc_clear()
            return suppress_exception
Example #21
def find_annos_from_column(k):
    from . import annotator
    import pypipegraph as ppg

    if ppg.inside_ppg():
        if not hasattr(ppg.util.global_pipegraph, "_annotator_singleton_dict"):
            ppg.util.global_pipegraph._annotator_singleton_dict = {"lookup": []}
        singleton_dict = ppg.util.global_pipegraph._annotator_singleton_dict
    else:
        singleton_dict = annotator.annotator_singletons

    res = []
    for anno in singleton_dict["lookup"]:
        if k in anno.columns:
            res.append(anno)
    if res:
        return res
    else:
        raise KeyError("No anno for column '%s' found" % (k, ))
Example #22
def GenomicRegions_CommonInAtLeastX(name,
                                    list_of_grs,
                                    X,
                                    summit_annotator=None,
                                    sheet_name="Overlaps"):
    """Combine serveral GRs into one. Keep only those (union) regions occuring in at least x."""
    def load():
        union = merge_df_intervals(
            pd.concat([x.df[["chr", "start", "stop"]]
                       for x in list_of_grs])).reset_index()
        keep = np.zeros((len(union),), dtype=bool)
        for ii, row in union.iterrows():
            count = 0
            for gr in list_of_grs:
                if gr.has_overlapping(row["chr"], row["start"], row["stop"]):
                    count += 1
            keep[ii] = count >= X
        if not keep.any():  # pragma: no cover
            raise ValueError("Filtered all of them")
        return union.iloc[keep]

    if len(set([x.genome for x in list_of_grs])) > 1:  # pragma: no cover
        raise ValueError(
            "Can only merge GenomicRegions that have the same genome")
    if ppg.inside_ppg():
        deps = [x.load() for x in list_of_grs]
        deps.append(
            ppg.ParameterInvariant(name + "_input_grs",
                                   sorted([x.name for x in list_of_grs])))
    else:
        deps = []
        for x in list_of_grs:
            x.load()
    vid = ("common at least %i" % X, [x.vid for x in list_of_grs])
    return GenomicRegions(
        name,
        load,
        deps,
        list_of_grs[0].genome,
        on_overlap="raise",
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Example #23
def GenomicRegions_FromWig(
    name,
    filename,
    genome,
    enlarge_5prime=0,
    enlarge_3prime=0,
    on_overlap="raise",
    comment_char=None,
    summit_annotator=None,
    vid=None,
):
    """Create GenomicRegions from a Wiggle file.

    @enlarge_5prime and @enlarge_3prime increase the size of the fragments described in the wig in
    the respective direction (for example, if a ChIP-chip array did not cover every base).
    @comment_char defines which lines to ignore in the wiggle (see {mbf_fileformats.wiggle_to_intervals})

    The resulting GenomicRegions has a column 'Score' that contains the wiggle score"""
    from mbf_fileformats.wiggle import wiggle_to_intervals

    def load():
        df = wiggle_to_intervals(filename, comment_char=comment_char)
        df["chr"] = [to_string(x) for x in df["chr"]]
        df["start"] -= enlarge_5prime
        df["stop"] += enlarge_3prime
        return df

    if ppg.inside_ppg():
        deps = [ppg.FileTimeInvariant(filename)]
    else:
        deps = []

    return GenomicRegions(name,
                          load,
                          deps,
                          genome,
                          on_overlap,
                          summit_annotator=summit_annotator,
                          vid=vid)
Example #24
def GenomicRegions_Common(name,
                          list_of_grs,
                          summit_annotator=None,
                          sheet_name="Overlaps"):
    """Combine serveral GRs into one. Keep only those (union) regions occuring in all."""
    def load():
        union = merge_df_intervals(
            pd.concat([x.df[["chr", "start", "stop"]]
                       for x in list_of_grs])).reset_index(drop=True)
        keep = np.ones((len(union),), dtype=bool)
        for gr in list_of_grs:
            for ii, row in union.iterrows():
                if keep[ii]:  # no point in checking if we already falsified - short circuit...
                    if not gr.has_overlapping(row["chr"], row["start"],
                                              row["stop"]):
                        keep[ii] = False
        return union[keep]

    verify_same_genome(list_of_grs)
    if ppg.inside_ppg():
        deps = [x.load() for x in list_of_grs]
        deps.append(
            ppg.ParameterInvariant(name + "_input_grs",
                                   sorted([x.name for x in list_of_grs])))
    else:
        for x in list_of_grs:
            x.load()
        deps = []
    vid = ("common", [x.vid for x in list_of_grs])
    return GenomicRegions(
        name,
        load,
        deps,
        list_of_grs[0].genome,
        on_overlap="raise",
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Example #25
def run_pipegraph():
    if ppg.inside_ppg():
        ppg.run_pipegraph()
    else:
        pass
Example #26
    def test_both_fixtures(self, both_ppg_and_no_ppg_no_qc):
        if not ppg.inside_ppg():
            assert qc_disabled()
        else:
            assert qc_disabled()
Example #27
def GenomicRegions_FromBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    filter_invalid_chromosomes=False,
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a Bed file.

    The resulting GenomicRegions has a column 'Score' that contains the wiggle score"""
    from mbf_fileformats.bed import read_bed

    def load():
        valid_chromosomes = set(genome.get_chromosome_lengths())
        data = {}
        entries = read_bed(filename)
        data["chr"] = np.array(
            [chromosome_mangler(to_string(e.refseq)) for e in entries],
            dtype=np.object)
        data["start"] = np.array([e.position for e in entries], dtype=np.int32)
        data["stop"] = np.array([e.position + e.length for e in entries],
                                dtype=np.int32)
        data["score"] = np.array([e.score for e in entries], dtype=np.float)
        data["strand"] = np.array([e.strand for e in entries], dtype=np.int8)
        data["name"] = np.array([to_string(e.name) for e in entries],
                                dtype=np.object)
        data = pd.DataFrame(data)
        if filter_invalid_chromosomes:  # pragma: no cover
            keep = [x in valid_chromosomes for x in data["chr"]]
            data = data[keep]
        res = data
        if len(res) == 0:
            raise ValueError("Emtpty Bed file - %s" % filename)
        if (np.isnan(res["score"])).all():
            res = res.drop(["score"], axis=1)
        if (len(res["name"]) > 1) and (len(res["name"].unique()) == 1):
            res = res.drop(["name"], axis=1)
        return res

    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler),
        ]
    else:
        deps = []

    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Example #28
def GenomicRegions_FromGFF(
    name,
    filename,
    genome,
    filter_function=None,
    comment_char=None,
    on_overlap="raise",
    chromosome_mangler=None,
    fix_negative_coordinates=False,
    alternative_class=None,
    summit_annotator=None,
    vid=None,
):
    """Create a GenomicRegions from a gff file.
    You can filter entries with @filter_function(gff_entry_dict) -> Bool,
    remove comment lines starting with a specific character with @comment_char,
    mangle the chromosomes with @chromosome_mangler(str) -> str,
    replace negative coordinates with 0 (@fix_negative_coordinates),
    or provide an alternative constructor to call with @alternative_class
    """
    def load():
        from mbf_fileformats.gff import gffToDict

        entries = gffToDict(filename, comment_char=comment_char)
        data = {
            "chr": [],
            "start": [],
            "stop": [],
            "score": [],
            "strand": [],
            "name": [],
        }
        name_found = False
        for entry in entries:
            if filter_function and not filter_function(entry):
                continue
            if chromosome_mangler:
                chr = chromosome_mangler(entry["seqname"])
            else:
                chr = entry["seqname"]
            data["chr"].append(to_string(chr))
            start = entry["start"]
            if fix_negative_coordinates and start < 0:
                start = 0
            data["start"].append(start)
            data["stop"].append(entry["end"])
            data["score"].append(entry["score"])
            data["strand"].append(entry["strand"])
            name = entry["attributes"].get("Name", [""])[0]
            data["name"].append(name)
            if name:
                name_found = True
        if not name_found:
            del data["name"]
        return pd.DataFrame(data)

    if alternative_class is None:  # pragma: no cover
        alternative_class = GenomicRegions
    if ppg.inside_ppg():
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.ParameterInvariant(
                name + "_params_GenomicRegions_FromGFF",
                (comment_char, fix_negative_coordinates),
            ),
            ppg.FunctionInvariant(name + "_filter_func_GenomicRegions_FromGFF",
                                  filter_function),
            ppg.FunctionInvariant(
                name + "_chromosome_mangler_GenomicRegions_FromGFF",
                chromosome_mangler),
        ]
    else:
        deps = []
    return alternative_class(name,
                             load,
                             deps,
                             genome,
                             on_overlap,
                             summit_annotator=summit_annotator,
                             vid=vid)
Example #29
    def prebuild(  # noqa: C901
        self,
        name,
        version,
        input_files,
        output_files,
        calculating_function,
        minimum_acceptable_version=None,
        maximum_acceptable_version=None,
        further_function_deps={},
    ):
        """Create a job that will prebuilt the files if necessary

        @further_function_deps is a dictionary name => func,
        and will end up as PrebuildFunctionInvariantFileStoredExploding
        in the correct directory

        """
        if minimum_acceptable_version is None:
            minimum_acceptable_version = version

        available_versions = self._find_versions(name)
        if version in available_versions:
            output_path = available_versions[version]
        else:
            # these are within minimum..maximum_acceptable_version
            acceptable_versions = sort_versions([
                (v, p) for v, p in available_versions.items()
                if ((Version(v) >= minimum_acceptable_version) and (
                    maximum_acceptable_version is None or
                    (Version(v) < maximum_acceptable_version)))
            ])
            ok_versions = []

            (
                new_source,
                new_funchash,
                new_closure,
            ) = ppg.FunctionInvariant._hash_function(calculating_function)

            for v, p in acceptable_versions:
                func_md5sum_path = p / "mbf_func.md5sum"
                func_md5sum_path2 = p / "mbf_func.md5sum2"
                try:
                    func_md5sum = json.loads(func_md5sum_path2.read_text())
                except OSError:
                    func_md5sum = func_md5sum_path.read_text()
                try:
                    ppg.FunctionInvariant._compare_new_and_old(
                        new_source, new_funchash, new_closure, func_md5sum)
                    ok = False  # no NothingChanged raised: the function differs
                except ppg.NothingChanged:
                    ok = True
                if ok:
                    ok_versions.append((v, p))

            if ok_versions:
                version, output_path = ok_versions[-1]
            else:  # no version that is within the acceptable range and had the same build function
                output_path = self.prebuilt_path / self.hostname / name / version

        if isinstance(output_files, (str, Path)):
            output_files = [output_files]
        output_files = [Path(of) for of in output_files]
        if ppg.inside_ppg():
            job = PrebuildJob(output_files, calculating_function, output_path)
            job.depends_on(
                _PrebuildFileInvariantsExploding(output_path, input_files))
            job.version = version
            return job
        else:
            for of in output_files:
                if not (output_path / of).exists():
                    raise ValueError(
                        "%s was missing and prebuild used outside of ppg - can't build it"
                        % (output_path / of).absolute())

            class DummyJob:
                """just enough of the Jobs interface to ignore the various calls
                and allow finding the msgpack jobs
                """
                def __init__(self, output_path, filenames):
                    self.output_path = output_path
                    self.filenames = PrebuildJob._normalize_output_files(
                        filenames, output_path)
                    # self.job_id = ":".join(sorted(str(x) for x in filenames))

                def depends_on(self, _other_job):  # pragma: no cover
                    return self

                def depends_on_func(self, _name, _func):  # pragma: no cover
                    return self

                def depends_on_file(self, _filename):  # pragma: no cover
                    return self

                def name_file(self, output_filename):
                    """Adjust path of output_filename by job path"""
                    return self.output_path / output_filename

                def find_file(self, output_filename):
                    """Search for a file named output_filename in the job's known created files"""
                    of = self.name_file(output_filename)
                    for fn in self.filenames:
                        if of.resolve() == Path(fn).resolve():
                            return of
                    else:
                        raise KeyError("file not found: %s" % output_filename)

                def __iter__(self):
                    yield self

            return DummyJob(output_path, output_files)
Example #30
def qc_disabled():
    if not ppg.inside_ppg():
        return True
    return getattr(ppg.util.global_pipegraph, "_qc_keep_function", True) is False
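Taken together, these examples share one convention: declare invariant/dependency jobs only when ppg.inside_ppg() is true, and fall back to eager, dependency-free execution otherwise. A distilled helper sketch using only invariants that appear above (the helper itself is illustrative, not library API):

def build_deps(name, params, filename=None):
    """Collect invariant jobs inside a pipegraph; outside, return no deps."""
    if not ppg.inside_ppg():
        return []
    deps = [ppg.ParameterInvariant(name + "_params", params)]
    if filename is not None:
        deps.append(ppg.FileTimeInvariant(filename))
    return deps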