def download_graph(graph_data: Dict, root: str):
    """Download and extract the given graph_data.

    Parameters
    ---------------------------
    graph_data: Dict,
        The dictionary with the metadata.
    root: str,
        The root of the directory where to store the files.
    """
    url = graph_data["url"]
    folder = os.path.join(root, graph_data["folder_name"])
    # The archive is stored next to the extraction folder, named
    # after the last component of the URL.
    path = os.path.join(root, url.split("/")[-1])
    if not os.path.exists(path):
        try:
            logger.info("Downloading %s -> %s", url, path)
            download(url, path, cache=True)
        except Exception:
            # Remove the partially downloaded file so that a retry
            # starts from a clean state.
            if os.path.exists(path):
                os.remove(path)
            # Bare raise preserves the original traceback (idiomatic,
            # unlike `raise e` which rebinds the exception).
            raise
    extracted_folder = os.path.join(
        folder, graph_data.get("extraction_path", "")
    ).strip("/")
    if not os.path.exists(extracted_folder):
        logger.info("Extracting %s -> %s", path, extracted_folder)
        extract(path, extracted_folder)
def get_cell_line(root: str, cell_line: str, states: int, genome: str, enhancers_labels: List[str], promoters_labels: List[str], url: str, nrows: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Return enhancers and promoters for given cell line.

    Parameters
    -----------------------------------
    root: str,
        The root of the directory where to store the downloaded data.
    cell_line: str,
        The chosen cell line standard name.
    states: int,
        The number of states of the chosen model.
    genome: str,
        The genomic assembly to use.
    enhancers_labels: List[str],
        The labels to use for active enhancers.
    promoters_labels: List[str],
        The labels to use for active promoters.
    url: str,
        Url for downloading the chosen cell line.
    nrows: int,
        The number of rows to read, useful when testing pipelines
        for creating smaller datasets.

    Returns
    ------------------------------------
    Return tuple containing the dataframe of the enhancers and the
    dataframe of the promoters for the given cell line, or
    ``(None, None)`` when the data cannot be retrieved.
    """
    path = f"{root}/{genome}/{states}/{cell_line}.bed.gz"
    try:
        download(url, path, cache=True)
    except ValueError:
        warnings.warn(
            "Unable to retrieve the data relative to cell line {}".format(
                cell_line),
            UserWarning)
        return None, None
    # The first two rows of the Roadmap bed files are metadata, hence skipped.
    roadmap_data = pd.read_csv(
        path,
        sep="\t",
        skiprows=[0, 1],
        header=None,
        names=["chrom", "start", "end", cell_line],
        nrows=nrows)
    roadmap_data = roadmap_data.set_index(["chrom", "start", "end"])
    enhancers = roadmap_data[roadmap_data[cell_line].isin(
        enhancers_labels)].copy()
    promoters = roadmap_data[roadmap_data[cell_line].isin(
        promoters_labels)].copy()
    enhancers[cell_line] = 1  # Encode active enhancers as 1
    promoters[cell_line] = 1  # Encode active promoters as 1
    return enhancers, promoters
def load_matrix(root: str, genome: str, region: str, info: Dict, nrows: int) -> pd.DataFrame:
    """Return the matrix with the CAGE peaks data.

    Parameters
    ----------------------
    root: str,
        Root where to store the downloaded data.
    genome: str,
        Genomic assembly.
    region: str,
        Name of the regions to consider.
    info: Dict,
        URls data.
    nrows: int= None,
        the number of rows to read, usefull when testing
        pipelines for creating smaller datasets.

    Returns
    ----------------------
    Pandas dataframe with CAGE peaks data.
    """
    base_path = f"{root}/{genome}/{region}"
    matrix_path = f"{base_path}/matrix.tsv.gz"
    bed_path = f"{base_path}/regions.bed.gz"
    region_urls = info[genome][region]
    download(region_urls["matrix"], matrix_path, cache=True)
    download(region_urls["bed"], bed_path, cache=True)
    # The promoters matrix carries two extra metadata rows that get
    # dropped below, so read that many additional rows.
    is_promoters = region == "promoters"
    if nrows is not None and is_promoters:
        nrows += 2
    matrix = pd.read_csv(
        matrix_path,
        comment="#",
        sep="\t",
        low_memory=False,
        nrows=nrows
    )
    if is_promoters:
        matrix.drop(index=[0, 1], inplace=True)
        matrix.reset_index(drop=True, inplace=True)
    matrix.set_index(matrix.columns[0], inplace=True)
    # Align the bed regions to the matrix rows via the region name.
    regions = load_bed(bed_path)
    regions.set_index("name", inplace=True)
    matrix = pd.concat(
        [regions.loc[matrix.index], matrix],
        axis=1
    )
    matrix.reset_index(drop=True, inplace=True)
    return matrix
def fantom_available_cell_lines(
    root: str = "fantom",
) -> pd.DataFrame:
    """Return supported cell lines available within FANTOM dataset.

    Parameters
    ---------------------------------------
    root: str = "fantom",
        Where to store / load from the downloaded data.

    Returns
    ---------------------------------------
    Return dataframe with the supported cell lines mapped to FANTOM name.
    """
    info = compress_json.local_load("fantom.json")
    path = f"{root}/cell_lines.tsv"
    download(info["cell_lines"], path, cache=True)
    df = pd.read_csv(
        path,
        sep="\t",
        header=None
    )
    cell_lines_names = df[0].str.split("cell line:", expand=True)
    # Use .loc for the masked assignments: the original chained
    # indexing (df[1][mask] = ...) raises SettingWithCopyWarning and
    # may silently fail to write back.
    cell_lines_names.loc[
        cell_lines_names[0].str.startswith("H1") &
        cell_lines_names[0].str.contains("day00"),
        1
    ] = "H1"
    cell_lines_names.loc[
        cell_lines_names[0].str.startswith("H9") &
        cell_lines_names[0].str.contains("H9ES"),
        1
    ] = "H9"
    # Keep only the rows where a cell line name was actually parsed.
    nan_mask = pd.notnull(cell_lines_names[1])
    cell_lines_names = cell_lines_names[nan_mask]
    # Drop infected samples.
    infected_mask = ~cell_lines_names[1].str.contains("infection")
    cell_lines_names = cell_lines_names[infected_mask]
    # Keep only the first token before "/" and "," qualifiers.
    cell_lines_names[1] = cell_lines_names[1].str.split("/").str[0]
    cell_lines_names[1] = cell_lines_names[1].str.split(",").str[0]
    cell_lines_codes = pd.concat(
        objs=[
            cell_lines_names[1].apply(lambda x: x.split("ENCODE")[
                0].strip()).str.upper().str.replace("-", ""),
            df[nan_mask][infected_mask][1],
        ],
        axis=1
    )
    cell_lines_codes.columns = ["cell_line", "code"]
    # One row per cell line: keep the first code encountered.
    return cell_lines_codes.reset_index(drop=True).groupby("cell_line").first().reset_index()
def cafa4_mapping() -> pd.DataFrame:
    """Return DataFrame containing CAFA4 and Uniprot IDs."""
    # Paths considered: the downloaded archive and the extracted fasta file.
    paths = ["cafa4.tar.gz", "CAFA4-export/TargetFiles/sp_species.9606.tfa"]
    if not any(os.path.exists(path) for path in paths):
        # Downloading the url to the given path
        download(
            url="https://www.biofunctionprediction.org/cafa-targets/CAFA4-export.tgz",
            path=paths[0])
        # Extracting the acquired archive
        shutil.unpack_archive(paths[0], ".")
        # Delete the archived file
        os.remove(paths[0])
    # Parse the fasta headers (lines starting with ">") to retrieve the IDs;
    # the context manager guarantees the file is closed even on error
    # (the original open/readlines/close leaked the handle on exceptions).
    with open(paths[1], "r") as f:
        df = pd.DataFrame(
            (line[1:-1].split(" ")
             for line in f
             if line.startswith(">")),
            columns=["cafa4_id", "uniprot_id"])
    # Return the obtained IDs
    return df
def job(
    bed_path: str,
    epigenome_path: str,
    target_path: str,
    url: str,
    nan_threshold: float,
    clear_download: bool
):
    """Download an epigenome track, extract its features and save them as TSV.

    Fetches the bigwig at ``url`` into ``epigenome_path`` (unless it is
    already on disk), extracts the features over the regions in
    ``bed_path`` and writes the combined table to ``target_path``.
    When ``clear_download`` is True the bigwig is deleted afterwards.
    """
    # Fetch the bigwig only when it is not already available locally.
    if not os.path.exists(epigenome_path):
        download(url, epigenome_path)
    # Compute the features over the requested regions.
    bed, scores = extract(
        bed_path=bed_path,
        bigwig_path=epigenome_path,
        nan_threshold=nan_threshold
    )
    # Persist regions and scores side by side.
    features = pd.concat([bed, scores], axis=1)
    features.to_csv(target_path, sep="\t")
    # Optionally reclaim the disk space used by the bigwig file.
    if clear_download:
        os.remove(epigenome_path)
def roadmap_available_cell_lines(root: str) -> pd.DataFrame:
    """Return Roadmap supported available cell lines.

    Parameters
    ---------------------------------------
    root: str,
        Where to store / load from the downloaded data.

    Returns
    ---------------------------------------
    Return dataframe with the cell lines supported available in Roadmap dataset.
    """
    info = compress_json.local_load("roadmap.json")
    filename = f"{root}/cell_lines.tsv"
    download(info["cell_lines"], filename, cache=True)
    codes = pd.read_csv(filename, sep="\t")
    # Keep only the supported groups, excluding ESC-derived lines.
    supported = (
        (codes.TYPE != "ESCDerived") &
        codes.GROUP.isin(["ENCODE2012", "ESC", "IMR90"])
    )
    codes = codes[supported]
    # The mnemonic is "<group>.<name>"; normalize the name by removing dashes.
    codes["cell_line"] = (
        codes.MNEMONIC.str.split(".").str[1].str.replace("-", "")
    )
    codes["code"] = codes.EID
    return codes[["cell_line", "code"]].reset_index(drop=True)
def test_download():
    # Smoke test: download a public ENCODE bigWig file to the current
    # directory (default target name is the URL basename) and remove it.
    download(
        "https://encode-public.s3.amazonaws.com/2012/07/01/074e1b37-2be1-4f6a-aa42-6c512fd1834b/ENCFF000XOW.bigWig"
    )
    os.remove("ENCFF000XOW.bigWig")