def get_drug_target_df(data_root: str, int_dtype: type = np.int8):
    """df = get_drug_target_df('./data/')

    This function loads the drug property dataframe and reduces it to a
    dataframe of integer-encoded drug target families.

    The processing includes:
        * keeping nothing but the 'TARGET' column;
        * dropping the drugs/rows whose target is not in the TGT_FAMS list;
        * encoding the target families into integer labels (using the
            'drug_target_dict.txt' dictionary);
        * casting to int_dtype for a more compact structure;

    Args:
        data_root (str): path to the data root folder.
        int_dtype (type): int dtype for storage in RAM.

    Returns:
        pd.DataFrame: drug target families dataframe.
    """
    drug_prop_df = get_drug_prop_df(data_root=data_root)

    # Keep only the rows whose target family is used for classification,
    # and only the column holding that family.
    in_tgt_fams = drug_prop_df['TARGET'].isin(TGT_FAMS)
    tgt_df = drug_prop_df.loc[in_tgt_fams, ['TARGET']]

    # Turn the str formatted target families into integer labels.
    tgt_labels = encode_label_to_int(data_root=data_root,
                                     dict_name='drug_target_dict.txt',
                                     labels=tgt_df['TARGET'])

    # Swap in the encoded labels and shrink the dtype. The original author
    # notes that int8 is safe here because there are only 10 classes.
    return tgt_df.assign(TARGET=tgt_labels).astype(int_dtype)
def get_cl_meta_df(data_root: str, encoding: bool = True,
                   int_dtype: type = np.int8):
    """df = get_cl_meta_df('./data/')

    This function loads the metadata for cell lines, processes it and
    returns it as a dataframe.

    The processing includes:
        * change column names to ['data_src', 'site', 'type', 'category'];
        * remove the '-' in cell line names;
        * (encoding only) encode every column into integer labels, using
            one '<column>_dict.txt' dictionary per column;
        * (encoding only) convert data types for more compact structure;

    Note that the processed (encoded) dataframe is cached as a pickle in
    the processed folder. When encoding is True and the cache exists, the
    function simply reads it and returns after converting dtypes. When
    encoding is False the cache is neither read nor written, and the
    string-valued dataframe is returned as is.

    Args:
        data_root (str): path to the data root folder.
        encoding (bool): indicator for encoding types/sites into integers.
        int_dtype (type): int dtype for storage in RAM (only applied when
            encoding is True).

    Returns:
        pd.DataFrame: processed cell line metadata dataframe.
    """
    df_filename = 'cl_meta_df.pkl'
    proc_folder = os.path.join(data_root, PROC_FOLDER)
    df_path = os.path.join(proc_folder, df_filename)

    # If the encoded dataframe already exists, load and return ################
    # The cache only ever holds the encoded version, hence the encoding check.
    if encoding and os.path.exists(df_path):
        # NOTE: unpickling can execute arbitrary code; this is only
        # acceptable because the file is expected to be the cache written
        # by this very function. Do not point data_root at untrusted data.
        return pd.read_pickle(df_path).astype(int_dtype)

    # Otherwise load from raw files and process ###############################
    logger.debug('Processing cell line meta dataframe ... ')

    # Download the raw file if not exist
    raw_folder = os.path.join(data_root, RAW_FOLDER)
    download_files(filenames=CL_METADATA_FILENAME, target_folder=raw_folder)

    # (raw column name, shorter column name) pairs, in output order
    renamed_columns = (
        ('dataset', 'data_src'),
        ('simplified_tumor_site', 'site'),
        ('simplified_tumor_type', 'type'),
        ('sample_category', 'category'),
    )

    # read_csv keeps the columns in file order no matter how usecols is
    # ordered, so the index and the columns are picked by name rather than
    # by position to avoid silently mislabelling them.
    df = pd.read_csv(os.path.join(raw_folder, CL_METADATA_FILENAME),
                     sep='\t',
                     header=0,
                     index_col='sample_name',
                     usecols=['sample_name'] +
                             [raw for raw, _ in renamed_columns],
                     dtype=str)

    # Renaming columns for shorter and better column names, and pin down
    # the column order regardless of the layout of the raw file
    df = df.rename(columns=dict(renamed_columns))
    df = df.reindex(columns=[new for _, new in renamed_columns])
    df.index.names = ['sample']

    # Delete '-', which could be inconsistent between seq and meta
    df.index = df.index.str.replace('-', '')

    # Without encoding there is nothing to convert or to cache
    if not encoding:
        return df

    # Convert all the categorical data from text to numeric
    for col in list(df.columns):
        df[col] = encode_label_to_int(data_root=data_root,
                                      dict_name=col + '_dict.txt',
                                      labels=df[col])

    # Convert data type into generic python types before saving, so that
    # the cache does not depend on the int_dtype of this particular call
    df = df.astype(int)

    # save to disk for future usage
    os.makedirs(proc_folder, exist_ok=True)
    df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe
    return df.astype(int_dtype)
def get_drug_resp_df(data_root: str, grth_scaling: str,
                     int_dtype: type = np.int8,
                     float_dtype: type = np.float32):
    """df = get_drug_resp_df('./data/', 'std')

    This function loads the whole drug response file, processes it and
    returns it as a dataframe.

    The processing includes:
        * remove the '-' in cell line names;
        * encode str format data sources into integer;
        * scaling the growth accordingly;
        * convert data types for more compact structure;

    Note that if the dataframe is already stored in the processed folder
    (one pickle per growth scaling strategy), the function simply reads it
    from file and returns after converting dtypes.

    Args:
        data_root (str): path to the data root folder.
        grth_scaling (str): scaling strategy for growth in drug response.
        int_dtype (type): int dtype for storage in RAM.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: processed drug response dataframe.
    """
    df_filename = 'drug_resp_df(scaling=%s).pkl' % grth_scaling
    proc_folder = os.path.join(data_root, PROC_FOLDER)
    df_path = os.path.join(proc_folder, df_filename)

    float_columns = ('LOG_CONCENTRATION', 'GROWTH')

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path):
        # NOTE: unpickling can execute arbitrary code; this is only
        # acceptable because the file is expected to be the cache written
        # by this very function. Do not point data_root at untrusted data.
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing drug response dataframe ... ')

        # Download the raw file if not exist
        raw_folder = os.path.join(data_root, RAW_FOLDER)
        download_files(filenames=DRUG_RESP_FILENAME,
                       target_folder=raw_folder)

        # NOTE(review): columns are selected by position; the code below
        # relies on 'SOURCE', 'CELLNAME', 'LOG_CONCENTRATION' and 'GROWTH'
        # being among them - confirm against the raw file layout.
        df = pd.read_csv(os.path.join(raw_folder, DRUG_RESP_FILENAME),
                         sep='\t',
                         header=0,
                         index_col=None,
                         usecols=[0, 1, 2, 4, 6, ])

        # Delete '-', which could be inconsistent between seq and meta
        df['CELLNAME'] = df['CELLNAME'].str.replace('-', '')

        # Encode data sources into numeric
        df['SOURCE'] = encode_label_to_int(data_root=data_root,
                                           dict_name='data_src_dict.txt',
                                           labels=df['SOURCE'].tolist())

        # Scaling the growth with given scaling method
        df['GROWTH'] = scale_dataframe(df['GROWTH'], grth_scaling)

        # Convert data type into generic python types before saving, so
        # that the cache does not depend on the dtypes of this call
        df['SOURCE'] = df['SOURCE'].astype(int)
        for col in float_columns:
            df[col] = df[col].astype(float)

        # save to disk for future usage
        os.makedirs(proc_folder, exist_ok=True)
        df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe ##############
    df['SOURCE'] = df['SOURCE'].astype(int_dtype)
    for col in float_columns:
        df[col] = df[col].astype(float_dtype)
    return df