Example #1
def download_graph(graph_data: Dict, root: str):
    """Download and extract the given graph_data.

    Parameters
    ----------
    graph_data: Dict
        The dictionary with the metadata.
    root: str
        The root directory where the files will be stored.
    """
    url = graph_data["url"]
    folder = os.path.join(root, graph_data["folder_name"])
    path = os.path.join(root, url.split("/")[-1])
    if not os.path.exists(path):
        try:
            logger.info("Downloading %s -> %s", url, path)
            download(url, path, cache=True)
        except Exception:
            # Remove the partially downloaded file before re-raising
            if os.path.exists(path):
                os.remove(path)
            raise

    extracted_folder = os.path.join(folder,
                                    graph_data.get("extraction_path",
                                                   "")).strip("/")
    if not os.path.exists(extracted_folder):
        logger.info("Extracting %s -> %s", path, extracted_folder)
        extract(path, extracted_folder)
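
A minimal usage sketch for download_graph; the metadata dictionary below is hypothetical and only illustrates the keys the function actually reads (url, folder_name and the optional extraction_path):

graph_data = {
    "url": "https://example.org/graphs/my_graph.tar.gz",  # hypothetical URL
    "folder_name": "my_graph",
    "extraction_path": ""  # optional, defaults to ""
}
download_graph(graph_data, root="graphs")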
Example #2
def get_cell_line(root: str, cell_line: str, states: int, genome: str,
                  enhancers_labels: List[str], promoters_labels: List[str],
                  url: str, nrows: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Return enhancers and promoters for given cell line.

    Parameters
    -----------------------------------
    cell_line: str,
        The chosen cell line standard name.
    states: int,
        The number of states of the chosen model.
    enhancers_labels: List[str],
        The labels to use for active enhancers.
    promoters_labels: List[str],
        The labels to use for active promoters.
    url: str
        Url for downloading the chosen cell line.
    nrows: int,
        the number of rows to read, usefull when testing pipelines for creating smaller datasets.

    Returns
    ------------------------------------
    Return tuple containing the dataframe of the enhancers
    and the dataframe of the promoters for the given cell line.
    """
    path = f"{root}/{genome}/{states}/{cell_line}.bed.gz"
    try:
        download(url, path, cache=True)
    except ValueError:
        warnings.warn(
            "Unable to retrieve the data relative to cell line {}".format(
                cell_line), UserWarning)
        return None, None

    roadmap_data = pd.read_csv(path,
                               sep="\t",
                               skiprows=[0, 1],
                               header=None,
                               names=["chrom", "start", "end", cell_line],
                               nrows=nrows)

    roadmap_data = roadmap_data.set_index(["chrom", "start", "end"])

    enhancers = roadmap_data[roadmap_data[cell_line].isin(
        enhancers_labels)].copy()
    promoters = roadmap_data[roadmap_data[cell_line].isin(
        promoters_labels)].copy()
    enhancers[cell_line] = 1  # Encode active enhancers as 1
    promoters[cell_line] = 1  # Encode active promoters as 1

    return enhancers, promoters
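
A sketch of how get_cell_line might be called; the state labels and the URL below are illustrative placeholders, not values from the source:

enhancers, promoters = get_cell_line(
    root="roadmap",
    cell_line="K562",
    states=18,
    genome="hg19",
    enhancers_labels=["7_Enh"],  # hypothetical active-enhancer labels
    promoters_labels=["1_TssA"],  # hypothetical active-promoter labels
    url="https://example.org/roadmap/K562.bed.gz",  # placeholder URL
    nrows=1000
)
if enhancers is not None:
    print(enhancers.head())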
Example #3
def load_matrix(root: str, genome: str, region: str, info: Dict, nrows: int) -> pd.DataFrame:
    """Return the matrix with the CAGE peaks data.

    Parameters
    ----------
    root: str
        Root directory where the downloaded data will be stored.
    genome: str
        The genomic assembly.
    region: str
        Name of the regions to consider.
    info: Dict
        Dictionary with the URLs of the data.
    nrows: int
        The number of rows to read; useful when testing pipelines on smaller datasets.

    Returns
    -------
    Pandas dataframe with the CAGE peaks data.
    """
    matrix_path = f"{root}/{genome}/{region}/matrix.tsv.gz"
    bed_path = f"{root}/{genome}/{region}/regions.bed.gz"
    download(info[genome][region]["matrix"], matrix_path, cache=True)
    download(info[genome][region]["bed"], bed_path, cache=True)
    if nrows is not None and region == "promoters":
        nrows += 2
    matrix = pd.read_csv(
        matrix_path,
        comment="#",
        sep="\t",
        low_memory=False,
        nrows=nrows
    )
    if region == "promoters":
        matrix.drop(index=[0, 1], inplace=True)
        matrix.reset_index(drop=True, inplace=True)
    matrix.set_index(matrix.columns[0], inplace=True)
    bed = load_bed(bed_path)
    bed.set_index("name", inplace=True)
    matrix = pd.concat(
        [
            bed.loc[matrix.index],
            matrix
        ],
        axis=1
    )
    matrix.reset_index(drop=True, inplace=True)
    return matrix
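
A hedged sketch of the nested info dictionary this function expects and of a call; the URLs are placeholders:

info = {
    "hg38": {
        "promoters": {
            "matrix": "https://example.org/fantom/matrix.tsv.gz",  # placeholder
            "bed": "https://example.org/fantom/regions.bed.gz"  # placeholder
        }
    }
}
matrix = load_matrix("fantom", "hg38", "promoters", info, nrows=100)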
Example #4
def fantom_available_cell_lines(
    root: str = "fantom",
) -> pd.DataFrame:
    """Return the supported cell lines available in the FANTOM dataset.

    Parameters
    ----------
    root: str = "fantom"
        Where to store / load the downloaded data.

    Returns
    -------
    Dataframe with the supported cell lines mapped to their FANTOM names.
    """
    info = compress_json.local_load("fantom.json")
    path = f"{root}/cell_lines.tsv"
    download(info["cell_lines"], path, cache=True)
    df = pd.read_csv(
        path,
        sep="\t",
        header=None
    )
    cell_lines_names = df[0].str.split("cell line:", expand=True)
    # Use .loc to avoid chained-assignment pitfalls when normalizing names
    cell_lines_names.loc[
        cell_lines_names[0].str.startswith("H1") &
        cell_lines_names[0].str.contains("day00"),
        1
    ] = "H1"
    cell_lines_names.loc[
        cell_lines_names[0].str.startswith("H9") &
        cell_lines_names[0].str.contains("H9ES"),
        1
    ] = "H9"
    # Keep only the rows that actually contain a cell line name
    notnull_mask = pd.notnull(cell_lines_names[1])
    cell_lines_names = cell_lines_names[notnull_mask]
    # Drop the cell lines relative to infection experiments
    not_infected_mask = ~cell_lines_names[1].str.contains("infection")
    cell_lines_names = cell_lines_names[not_infected_mask]
    cell_lines_names[1] = cell_lines_names[1].str.split("/").str[0]
    cell_lines_names[1] = cell_lines_names[1].str.split(",").str[0]
    cell_lines_codes = pd.concat(
        objs=[
            cell_lines_names[1].apply(lambda x: x.split("ENCODE")[
                                      0].strip()).str.upper().str.replace("-", ""),
            df[notnull_mask][not_infected_mask][1],
        ],
        axis=1
    )
    cell_lines_codes.columns = ["cell_line", "code"]
    return cell_lines_codes.reset_index(drop=True).groupby("cell_line").first().reset_index()
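
A usage sketch, assuming the bundled fantom.json metadata is available to compress_json:

cell_lines = fantom_available_cell_lines(root="fantom")
print(cell_lines.head())  # two columns: cell_line and code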
Example #5
def cafa4_mapping() -> pd.DataFrame:
    """Return DataFrame containing CAFA4 and Uniprot IDs."""
    # List of the paths considered in the function
    paths = ["cafa4.tar.gz", "CAFA4-export/TargetFiles/sp_species.9606.tfa"]
    # Download and extract only when the extracted FASTA file is missing
    if not os.path.exists(paths[1]):
        # Download the archive to the given path
        download(
            url="https://www.biofunctionprediction.org/cafa-targets/CAFA4-export.tgz",
            path=paths[0]
        )
        # Extract the acquired archive
        shutil.unpack_archive(paths[0], ".")
        # Delete the archived file
        os.remove(paths[0])
    # Parse the FASTA file and retrieve the IDs from its header lines
    with open(paths[1], "r") as f:
        df = pd.DataFrame(
            (line[1:-1].split(" ")
             for line in f if line.startswith(">")),
            columns=["cafa4_id", "uniprot_id"]
        )
    # Return the obtained IDs
    return df
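
A usage sketch; the function downloads and extracts the CAFA4 targets on first call and reuses the extracted file afterwards:

mapping = cafa4_mapping()
print(mapping.head())  # two columns: cafa4_id and uniprot_id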
Example #6
def job(
    bed_path: str,
    epigenome_path: str,
    target_path: str,
    url: str,
    nan_threshold: float,
    clear_download: bool
):
    """Download the epigenome bigwig at the given URL, extract the features
    for the given BED regions and store them at the target path."""
    # Download file if it does not already exist
    if not os.path.exists(epigenome_path):
        download(url, epigenome_path)
    # Extract the features
    bed, scores = extract(
        bed_path=bed_path,
        bigwig_path=epigenome_path,
        nan_threshold=nan_threshold
    )
    # Save the obtained features
    pd.concat([bed, scores], axis=1).to_csv(target_path, sep="\t")

    # Remove the bigwig file if required
    if clear_download:
        os.remove(epigenome_path)
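
A sketch of a call to job; every path, the URL and the threshold below are illustrative placeholders, and extract is assumed to be the bigwig feature-extraction helper imported alongside this function:

job(
    bed_path="regions.bed",  # hypothetical input BED file
    epigenome_path="epigenome.bigWig",  # hypothetical download target
    target_path="features.tsv",  # hypothetical output TSV
    url="https://example.org/epigenome.bigWig",  # placeholder URL
    nan_threshold=0.1,  # illustrative threshold
    clear_download=True
)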
Example #7
def roadmap_available_cell_lines(root: str) -> pd.DataFrame:
    """Return Roadmap supported available cell lines.

    Parameters
    ---------------------------------------
    root: str,
        Where to store / load from the downloaded data.

    Returns
    ---------------------------------------
    Return dataframe with the cell lines supported available in Roadmap dataset.
    """
    info = compress_json.local_load("roadmap.json")
    filename = f"{root}/cell_lines.tsv"
    download(info["cell_lines"], filename, cache=True)
    cell_lines_codes = pd.read_csv(filename, sep="\t")
    cell_lines_codes = cell_lines_codes[
        (cell_lines_codes.TYPE != "ESCDerived")
        & cell_lines_codes.GROUP.isin(["ENCODE2012", "ESC", "IMR90"])]
    cell_lines_codes["cell_line"] = cell_lines_codes.MNEMONIC.str.split(
        ".").str[1].str.replace("-", "")
    cell_lines_codes["code"] = cell_lines_codes.EID
    return cell_lines_codes[["cell_line", "code"]].reset_index(drop=True)
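
A usage sketch, assuming the bundled roadmap.json metadata is available to compress_json:

cell_lines = roadmap_available_cell_lines(root="roadmap")
print(cell_lines.head())  # two columns: cell_line and code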
Example #8
def test_download():
    download(
        "https://encode-public.s3.amazonaws.com/2012/07/01/074e1b37-2be1-4f6a-aa42-6c512fd1834b/ENCFF000XOW.bigWig"
    )
    os.remove("ENCFF000XOW.bigWig")