def get_convert_func(self,
                         key,
                         keep_name=False,
                         filter_to_these_chromosomes=None):
        """Build a function that lifts a regions DataFrame over with the chain file for @key.

        The chain file is looked up as ``self.data_path / (key + ".over.chain")``.

        @key names the liftover (and the entry in self.replacements, if any).
        @keep_name is accepted but not used anywhere in this function.
        @filter_to_these_chromosomes, if given, restricts the result to these
        chromosome names. Note that the filter is applied after the
        replacements have kicked in.

        Returns a function ``do_convert(df)``. It expects ``chr``, ``start`` and
        ``stop`` columns plus a unique index, and returns a new DataFrame that is
        indexed by the stringified original index (index name ``parent``). The
        DataFrame passed in is not modified.

        When running inside a pipegraph, the returned function carries a
        ``dependencies`` attribute (chain file + do_liftover invariants).

        Raises ValueError if the chain file does not exist.
        """
        chain_file = self.data_path / (key + ".over.chain")
        if not chain_file.exists():  # pragma: no cover
            raise ValueError("invalid liftover key, file not found: %s" %
                             chain_file)
        if filter_to_these_chromosomes:
            filter_to_these_chromosomes = set(filter_to_these_chromosomes)

        def do_convert(df):
            """Lift @df over; raises ValueError if its index is not unique."""
            if df.index.duplicated().any():  # pragma: no cover
                raise ValueError("liftover only works with unique indices")
            # Work on a copy: the index is replaced by its string form below and
            # the caller's DataFrame must not be changed as a side effect.
            df = df.copy()
            df.index = [str(x) for x in df.index]
            input_tuples = [("chr" + row["chr"], row["start"], row["stop"],
                             idx) for idx, row in df.iterrows()]

            output_tuples = self.do_liftover(input_tuples, chain_file)
            # Transpose the (chr, start, stop, parent) tuples into columns.
            output_lists = list(zip(*output_tuples))
            if not output_lists:
                # Nothing could be lifted over - produce an empty result
                # instead of failing on output_lists[0].
                output_lists = [(), (), (), ()]
            res = pd.DataFrame({
                "chr":
                output_lists[0],
                "start":
                output_lists[1],
                "stop":
                output_lists[2],
                # the parent ids come back as bytes
                "parent": [x.decode("utf-8") for x in output_lists[3]],
            }).set_index("parent")
            new_chr = []
            for x in res["chr"]:
                # drop the first three characters (the "chr" prefix added above)
                x = x[3:]
                # these are untested as of 2019-03-27
                # NOTE(review): this compares against lowercase "m" - confirm
                # the casing do_liftover actually returns.
                if x == "m":  # pragma: no cover
                    x = "MT"
                elif (key in self.replacements
                      and x in self.replacements[key]):  # pragma: no cover
                    x = self.replacements[key][x]
                new_chr.append(x)
            res["chr"] = new_chr
            # Carry over every further input column, aligned on the
            # (stringified) index.
            for col in df.columns:
                if col not in res.columns:
                    res = res.assign(**{col: df[col]})
            if filter_to_these_chromosomes:
                res = res[res["chr"].isin(filter_to_these_chromosomes)]
            return res

        if ppg.inside_ppg():
            do_convert.dependencies = [
                ppg.FileTimeInvariant(chain_file),
                ppg.FunctionInvariant(
                    "genomics.regions.convert.LiftOver.do_liftover",
                    LiftOver.do_liftover,
                ),
            ]
        return do_convert
Exemple #2
0
def GenomicRegions_FromTable(
    name,
    filename,
    genome,
    on_overlap="raise",
    summit_annotator=None,
    filter_func=None,
    vid=None,
    sheet_name="FromTable",
    drop_further_columns=True,
    chr_column="chr",
    start_column="start",
    stop_column="stop",
    one_based=False,
    reader=read_pandas,
):
    """Create GenomicRegions from a table file read by @reader.

    The columns named by @chr_column / @start_column / @stop_column are copied
    into 'chr' (as str), 'start' and 'stop' (as int).
    @one_based: subtract 1 from every start.
    @drop_further_columns: keep only chr/start/stop.
    @filter_func: optional DataFrame -> DataFrame function, applied last.
    """
    def load():
        table = reader(filename)
        table["chr"] = table[chr_column].astype(str)
        table["start"] = table[start_column].astype(int)
        if one_based:  # pragma: no cover
            # shift the start coordinate down by one
            table["start"] -= 1
        table["stop"] = table[stop_column].astype(int)
        if drop_further_columns:  # pragma: no cover
            table = table[["chr", "start", "stop"]]
        # the filter sees the table after the optional column drop
        return filter_func(table) if filter_func else table  # pragma: no cover

    # Invariants only exist inside a pipegraph.
    deps = ([
        ppg.FileTimeInvariant(filename),
        ppg.FunctionInvariant(name + "_filter_func", filter_func),
    ] if ppg.inside_ppg() else [])
    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Exemple #3
0
def GenomicRegions_FromBigBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Build GenomicRegions out of a BigBed file.

    @chromosome_mangler translates genome chromosomes into the bigbed's chromosomes!
    A 'strand' column that is 1 for every row is dropped.
    Loading raises ValueError if no regions were read.
    """
    from mbf_fileformats.bed import read_bigbed

    def load():
        chromosome_lengths = genome.get_chromosome_lengths()
        regions = read_bigbed(filename, chromosome_lengths, chromosome_mangler)
        all_strand_one = (regions["strand"] == 1).all()
        if all_strand_one:
            # the strand column carries no information, so leave it out
            regions = regions.drop("strand", axis=1)
        if len(regions) == 0:  # pragma: no cover
            raise ValueError(
                "Emtpty BigBed file (or wrong chromosome names)- %s" %
                filename)
        return regions

    # Invariants only exist inside a pipegraph.
    deps = ([
        ppg.FileTimeInvariant(filename),
        ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler),
    ] if ppg.inside_ppg() else [])

    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )
Exemple #4
0
def GenomicRegions_FromWig(
    name,
    filename,
    genome,
    enlarge_5prime=0,
    enlarge_3prime=0,
    on_overlap="raise",
    comment_char=None,
    summit_annotator=None,
    vid=None,
):
    """Build GenomicRegions out of a Wiggle file.

    @enlarge_5prime is subtracted from every start and @enlarge_3prime is added
    to every stop, growing the fragments described in the wig (for example if
    a chip-chip array did not cover every base).
    @comment_char is handed to mbf_fileformats.wiggle.wiggle_to_intervals and
    defines which lines of the wiggle to ignore.

    The resulting GenomicRegions has a column 'Score' that contains the wiggle score"""
    from mbf_fileformats.wiggle import wiggle_to_intervals

    def load():
        intervals = wiggle_to_intervals(filename, comment_char=comment_char)
        intervals["chr"] = [to_string(c) for c in intervals["chr"]]
        # grow every interval at both ends
        intervals["start"] -= enlarge_5prime
        intervals["stop"] += enlarge_3prime
        return intervals

    # The file invariant only exists inside a pipegraph.
    deps = [ppg.FileTimeInvariant(filename)] if ppg.inside_ppg() else []

    return GenomicRegions(name,
                          load,
                          deps,
                          genome,
                          on_overlap,
                          summit_annotator=summit_annotator,
                          vid=vid)
    def test_accepts(self):
        """Jobs take pathlib.Path objects (also mixed with str) as their ids/filenames."""
        from pathlib import Path

        for input_name in ("aaa", "bbb", "ccc"):
            write(input_name, "hello")

        # invariants on the input files
        single_invariant = ppg.FileTimeInvariant(Path("aaa"))
        multi_invariant = ppg.MultiFileInvariant([Path("bbb"), "ccc"])
        job_b = ppg.FileGeneratingJob(
            Path("b"),
            lambda of: write(of, "bb" + read("aaa") + read("bbb") + read("ccc")),
        )
        job_b.depends_on(single_invariant)
        job_b.depends_on(multi_invariant)

        holder = Dummy()

        def write_c_d_e():
            # consumes the temp files g, h, i, j and the attribute loaded onto holder
            write("c", "cc" + read("g"))
            write("d", "dd" + read("h") + holder.attr)
            write("e", "ee" + read("i") + read("j"))

        job_cde = ppg.MultiFileGeneratingJob([Path("c"), "d", Path("e")],
                                             write_c_d_e)
        job_cde.depends_on(job_b)

        func_invariant = ppg.FunctionInvariant(Path("f"), lambda x: x + 1)
        job_cde.depends_on(func_invariant)

        param_invariant = ppg.ParameterInvariant(Path("c"), "hello")
        job_cde.depends_on(param_invariant)

        temp_g = ppg.TempFileGeneratingJob(Path("g"), lambda: write("g", "gg"))
        job_cde.depends_on(temp_g)

        def write_h_i():
            write("h", "hh")
            write("i", "ii")

        temp_h_i = ppg.MultiTempFileGeneratingJob([Path("h"), "i"], write_h_i)
        job_cde.depends_on(temp_h_i)

        def write_j_k():
            write("j", "jjjj")
            write("k", "kkkk")

        temp_j_plus_k = ppg.TempFilePlusGeneratingJob(Path("j"), Path("k"),
                                                      write_j_k)
        job_cde.depends_on(temp_j_plus_k)

        cached_data = ppg.CachedDataLoadingJob(
            Path("l"), lambda: write("l", "llll"), lambda res: res
        )
        job_cde.depends_on(cached_data)

        cached_attr = ppg.CachedAttributeLoadingJob(Path("m"), holder, "attr",
                                                    lambda: "55")
        job_cde.depends_on(cached_attr)

        ppg.run_pipegraph()

        expected_contents = (
            ("aaa", "hello"),
            ("b", "bbhellohellohello"),
            ("c", "ccgg"),
            ("d", "ddhh55"),
            ("e", "eeiijjjj"),
        )
        for output_name, content in expected_contents:
            assert read(output_name) == content
        # g, h, i and j must be gone after the run; k must remain
        for temp_name in ("g", "h", "i", "j"):
            assert not os.path.exists(temp_name)
        assert read("k") == "kkkk"
Exemple #6
0
def GenomicRegions_FromGFF(
    name,
    filename,
    genome,
    filter_function=None,
    comment_char=None,
    on_overlap="raise",
    chromosome_mangler=None,
    fix_negative_coordinates=False,
    alternative_class=None,
    summit_annotator=None,
    vid=None,
):
    """Build a GenomicRegions from a gff file.

    @filter_function(gff_entry_dict) -> Bool decides which entries are kept,
    @comment_char removes comment lines starting with that character,
    @chromosome_mangler(str) -> str rewrites the chromosome names,
    @fix_negative_coordinates replaces negative start coordinates with 0,
    @alternative_class is called instead of GenomicRegions to construct the result.

    The loaded table has the columns chr, start, stop, score and strand, plus
    name if at least one entry has a non-empty 'Name' attribute.
    """
    def load():
        from mbf_fileformats.gff import gffToDict

        columns = {
            "chr": [],
            "start": [],
            "stop": [],
            "score": [],
            "strand": [],
            "name": [],
        }
        any_name = False
        for entry in gffToDict(filename, comment_char=comment_char):
            if filter_function and not filter_function(entry):
                continue
            seqname = entry["seqname"]
            chromosome = (chromosome_mangler(seqname)
                          if chromosome_mangler else seqname)
            columns["chr"].append(to_string(chromosome))
            begin = entry["start"]
            if fix_negative_coordinates and begin < 0:
                begin = 0
            columns["start"].append(begin)
            columns["stop"].append(entry["end"])
            columns["score"].append(entry["score"])
            columns["strand"].append(entry["strand"])
            # first value of the 'Name' attribute, "" if there is none
            entry_name = entry["attributes"].get("Name", [""])[0]
            columns["name"].append(entry_name)
            any_name = any_name or bool(entry_name)
        if not any_name:
            # no entry had a name, so the column is left out
            columns.pop("name")
        return pd.DataFrame(columns)

    if alternative_class is None:  # pragma: no cover
        alternative_class = GenomicRegions
    # Invariants only exist inside a pipegraph.
    deps = ([
        ppg.FileTimeInvariant(filename),
        ppg.ParameterInvariant(
            name + "_params_GenomicRegions_FromGFF",
            (comment_char, fix_negative_coordinates),
        ),
        ppg.FunctionInvariant(name + "_filter_func_GenomicRegions_FromGFF",
                              filter_function),
        ppg.FunctionInvariant(
            name + "_chromosome_manlger_GenomicRegions_FromGFF",
            chromosome_mangler),
    ] if ppg.inside_ppg() else [])
    return alternative_class(name,
                             load,
                             deps,
                             genome,
                             on_overlap,
                             summit_annotator=summit_annotator,
                             vid=vid)
Exemple #7
0
def GenomicRegions_FromBed(
    name,
    filename,
    genome,
    chromosome_mangler=lambda x: x,
    on_overlap="raise",
    filter_invalid_chromosomes=False,
    summit_annotator=None,
    sheet_name=None,
    vid=None,
):
    """Create GenomicRegions from a Bed file.

    @chromosome_mangler(str) -> str is applied to every chromosome name read
    from the bed file.
    @filter_invalid_chromosomes drops all entries whose (mangled) chromosome
    is not among genome.get_chromosome_lengths().

    The loaded table has the columns chr, start, stop and strand. The 'score'
    column is left out if every score is NaN, and the 'name' column is left
    out if there is more than one row and all rows share the same name.

    Loading raises ValueError if no entries remain.
    """
    from mbf_fileformats.bed import read_bed

    def load():
        valid_chromosomes = set(genome.get_chromosome_lengths())
        data = {}
        # NOTE(review): entries is iterated once per column, so read_bed
        # presumably returns a re-iterable sequence - confirm.
        entries = read_bed(filename)
        # np.object / np.float were removed in NumPy 1.24; the builtins are
        # what those aliases always referred to.
        data["chr"] = np.array(
            [chromosome_mangler(to_string(e.refseq)) for e in entries],
            dtype=object)
        data["start"] = np.array([e.position for e in entries], dtype=np.int32)
        data["stop"] = np.array([e.position + e.length for e in entries],
                                dtype=np.int32)
        data["score"] = np.array([e.score for e in entries], dtype=float)
        data["strand"] = np.array([e.strand for e in entries], dtype=np.int8)
        data["name"] = np.array([to_string(e.name) for e in entries],
                                dtype=object)
        data = pd.DataFrame(data)
        if filter_invalid_chromosomes:  # pragma: no cover
            keep = [x in valid_chromosomes for x in data["chr"]]
            data = data[keep]
        res = data
        if len(res) == 0:
            raise ValueError("Emtpty Bed file - %s" % filename)
        # drop columns that carry no information
        if (np.isnan(res["score"])).all():
            res = res.drop(["score"], axis=1)
        if (len(res["name"]) > 1) and (len(res["name"].unique()) == 1):
            res = res.drop(["name"], axis=1)
        return res

    if ppg.inside_ppg():
        # invariants only exist inside a pipegraph
        deps = [
            ppg.FileTimeInvariant(filename),
            ppg.FunctionInvariant(name + "_chrmangler", chromosome_mangler),
        ]
    else:
        deps = []

    return GenomicRegions(
        name,
        load,
        deps,
        genome,
        on_overlap=on_overlap,
        summit_annotator=summit_annotator,
        sheet_name=sheet_name,
        vid=vid,
    )