def get_drug_qed_df(
        data_root: str,
        qed_scaling: str,
        float_dtype: type = np.float32):
    """df = get_drug_qed_df('./data/', 'none')

    This function loads the drug property dataframe, processes it and
    returns the drug weighted QED dataframe. The processing includes:
        * removing all columns but 'QED';
        * dropping drugs/rows that have NaN as weighted QED;
        * scaling the QED accordingly;
        * converting data types for a more compact structure;

    Args:
        data_root (str): path to the data root folder.
        qed_scaling (str): scaling strategy for weighted QED.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: drug weighted QED dataframe.
    """
    df = get_drug_prop_df(data_root=data_root)[['QED']]

    # Drop all the NaN values before scaling
    df.dropna(axis=0, inplace=True)

    # Note that weighted QED is by default already in the range of [0, 1]
    # Scaling the weighted QED with given scaling method
    df = scale_dataframe(df, qed_scaling)

    # Convert the dtypes for a more efficient, compact dataframe
    return df.astype(float_dtype)
def get_drug_lat_df(
        data_root: str,
        lat_scaling: str,
        float_dtype: type = np.float32):
    """df = get_drug_lat_df('./data/', 'std')

    This function loads the drug latent representation file, processes it
    and returns it as a dataframe. The processing includes:
        * scaling all the latent features accordingly;
        * converting data types for a more compact structure;

    Note that if the dataframe is already stored in the processed folder,
    the function simply reads from file and returns after converting dtypes.

    Args:
        data_root (str): path to the data root folder.
        lat_scaling (str): scaling strategy for the latent features.
        float_dtype (type): float dtype for storage in RAM.
            NOTE: the previous default of np.int8 truncated the scaled
            (float-valued) latent features; it is now np.float32 for
            consistency with the sibling loaders.

    Returns:
        pd.DataFrame: processed drug latent dataframe.
    """
    df_filename = 'drug_lat_df(scaling=%s).pkl' % lat_scaling
    df_path = os.path.join(data_root, PROC_FOLDER, df_filename)

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path):
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing drug latent dataframe ... ')

        # Download the raw file if not exist
        download_files(filenames=LAT_FILENAME,
                       target_folder=os.path.join(data_root, RAW_FOLDER))

        df = pd.read_csv(
            os.path.join(data_root, RAW_FOLDER, LAT_FILENAME),
            sep='\t',
            header=0,
            index_col=0)

        # Scaling the descriptor with given scaling method
        df = scale_dataframe(df, lat_scaling)

        # Convert data type into generic python types
        df = df.astype(float)

        # save to disk for future usage
        os.makedirs(os.path.join(data_root, PROC_FOLDER), exist_ok=True)
        df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe ##############
    df = df.astype(float_dtype)
    return df
def get_rna_seq_df(
        data_root: str,
        rnaseq_feature_usage: str,
        rnaseq_scaling: str,
        float_dtype: type = np.float32):
    """df = get_rna_seq_df('./data/', 'source_scale', 'std')

    This function loads the RNA sequence file, processes it and returns it
    as a dataframe. The processing includes:
        * removing the '-' in cell line names;
        * removing duplicate indices;
        * scaling all the sequence features accordingly;
        * converting data types for a more compact structure;

    Note that if the dataframe is already stored in the processed folder,
    the function simply reads from file and returns after converting dtypes.

    Args:
        data_root (str): path to the data root folder.
        rnaseq_feature_usage (str): feature usage indicator. Choose among
            'source_scale', 'combat' and 'livermore'.
        rnaseq_scaling (str): scaling strategy for RNA sequence.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: processed RNA sequence dataframe.

    Raises:
        ValueError: if rnaseq_feature_usage is not one of the known usages.
    """
    df_filename = 'rnaseq_df(%s, scaling=%s).pkl' \
        % (rnaseq_feature_usage, rnaseq_scaling)
    df_path = os.path.join(data_root, PROC_FOLDER, df_filename)

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path):
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing RNA sequence dataframe ... ')

        if rnaseq_feature_usage == 'source_scale':
            raw_data_filename = RNASEQ_SOURCE_SCALE_FILENAME
        elif rnaseq_feature_usage == 'combat':
            raw_data_filename = RNASEQ_COMBAT_FILENAME
        elif rnaseq_feature_usage == 'livermore':
            raw_data_filename = RNASEQ_LIVERMORE_FILENAME
        else:
            logger.error('Unknown RNA feature %s.'
                         % rnaseq_feature_usage, exc_info=True)
            # The message now lists every usage the branches above accept
            # (the original omitted 'livermore').
            raise ValueError('RNA feature usage must be one of '
                             '\'source_scale\', \'combat\' or '
                             '\'livermore\'.')

        # Download the raw file if not exist
        download_files(filenames=raw_data_filename,
                       target_folder=os.path.join(data_root, RAW_FOLDER))

        df = pd.read_csv(os.path.join(data_root, RAW_FOLDER,
                                      raw_data_filename),
                         sep='\t', header=0, index_col=0)

        # Delete '-', which could be inconsistent between seq and meta
        df.index = df.index.str.replace('-', '')

        # Note that after this name changing, some rows will have the same
        # name like 'GDSC.TT' and 'GDSC.T-T', but they are actually the same
        # Drop the duplicates for consistency
        df = df[~df.index.duplicated(keep='first')]

        # Scaling the descriptor with given scaling method
        df = scale_dataframe(df, rnaseq_scaling)

        # Convert data type into generic python types
        df = df.astype(float)

        # save to disk for future usage
        os.makedirs(os.path.join(data_root, PROC_FOLDER), exist_ok=True)
        df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe ##############
    df = df.astype(float_dtype)
    return df
def get_drug_resp_df(
        data_root: str,
        grth_scaling: str,
        int_dtype: type = np.int8,
        float_dtype: type = np.float32):
    """df = get_drug_resp_df('./data/', 'std')

    This function loads the whole drug response file, processes it and
    returns it as a dataframe. The processing includes:
        * removing the '-' in cell line names;
        * encoding str format data sources into integer;
        * scaling the growth accordingly;
        * converting data types for a more compact structure;

    Note that if the dataframe is already stored in the processed folder,
    the function simply reads from file and returns after converting dtypes.

    Args:
        data_root (str): path to the data root folder.
        grth_scaling (str): scaling strategy for growth in drug response.
        int_dtype (type): int dtype for storage in RAM.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: processed drug response dataframe.
    """
    df_filename = 'drug_resp_df(scaling=%s).pkl' % grth_scaling
    df_path = os.path.join(data_root, PROC_FOLDER, df_filename)

    # If the dataframe already exists, load and continue ######################
    if os.path.exists(df_path):
        df = pd.read_pickle(df_path)

    # Otherwise load from raw files, process it and save ######################
    else:
        logger.debug('Processing drug response dataframe ... ')

        # Download the raw file if not exist
        download_files(filenames=DRUG_RESP_FILENAME,
                       target_folder=os.path.join(data_root, RAW_FOLDER))

        df = pd.read_csv(os.path.join(data_root, RAW_FOLDER,
                                      DRUG_RESP_FILENAME),
                         sep='\t', header=0, index_col=None,
                         usecols=[0, 1, 2, 4, 6, ])

        # Delete '-', which could be inconsistent between seq and meta
        df['CELLNAME'] = df['CELLNAME'].str.replace('-', '')

        # Encode data sources into numeric
        df['SOURCE'] = encode_label_to_int(data_root=data_root,
                                           dict_name='data_src_dict.txt',
                                           labels=df['SOURCE'].tolist())

        # Scaling the growth with given scaling method
        df['GROWTH'] = scale_dataframe(df['GROWTH'], grth_scaling)

        # Convert data type into generic python types
        df[['SOURCE']] = df[['SOURCE']].astype(int)
        df[['LOG_CONCENTRATION', 'GROWTH']] = \
            df[['LOG_CONCENTRATION', 'GROWTH']].astype(float)

        # save to disk for future usage
        os.makedirs(os.path.join(data_root, PROC_FOLDER), exist_ok=True)
        df.to_pickle(df_path)

    # Convert the dtypes for a more efficient, compact dataframe ##############
    df[['SOURCE']] = df[['SOURCE']].astype(int_dtype)
    df[['LOG_CONCENTRATION', 'GROWTH']] = \
        df[['LOG_CONCENTRATION', 'GROWTH']].astype(float_dtype)

    return df
def get_drug_dscptr_df(
        data_root: str,
        dscptr_scaling: str,
        dscptr_nan_thresh: float,
        float_dtype: type = np.float32):
    """df = get_drug_dscptr_df('./data/', 'std', 0.0)

    Load the drug descriptor file, process it and return it as a dataframe.
    Processing steps:
        * drop columns (features) and rows (drugs) whose ratio of NaN
          values exceeds dscptr_nan_thresh;
        * fill the remaining NaN cells with the column means;
        * scale all descriptor features with the given strategy;
        * convert data types for a more compact in-memory structure.

    If a processed dataframe already exists on disk, it is loaded directly
    and only the dtype conversion is applied.

    Args:
        data_root (str): path to the data root folder.
        dscptr_scaling (str): scaling strategy for all descriptor features.
        dscptr_nan_thresh (float): threshold ratio of NaN values.
        float_dtype (type): float dtype for storage in RAM.

    Returns:
        pd.DataFrame: processed drug descriptor dataframe.
    """
    df_filename = 'drug_dscptr_df(scaling=%s, nan_thresh=%.2f).pkl' \
        % (dscptr_scaling, dscptr_nan_thresh)
    df_path = os.path.join(data_root, PROC_FOLDER, df_filename)

    # Fast path: a previously processed dataframe is cached on disk ##########
    if os.path.exists(df_path):
        dscptr_df = pd.read_pickle(df_path)
        return dscptr_df.astype(float_dtype)

    # Slow path: build the dataframe from the raw descriptor file ############
    logger.debug('Processing drug descriptor dataframe ... ')

    # Make sure the raw file is present locally
    download_files(filenames=DSCPTR_FILENAME,
                   target_folder=os.path.join(data_root, RAW_FOLDER))

    raw_path = os.path.join(data_root, RAW_FOLDER, DSCPTR_FILENAME)
    dscptr_df = pd.read_csv(raw_path, sep='\t', header=0, index_col=0,
                            na_values='na')

    # Require at least (1 - nan_thresh) valid cells per column, then per row
    # (columns/features are pruned before rows/drugs)
    keep_ratio = 1.0 - dscptr_nan_thresh
    dscptr_df.dropna(axis=1, inplace=True,
                     thresh=int(dscptr_df.shape[0] * keep_ratio))
    dscptr_df.dropna(axis=0, inplace=True,
                     thresh=int(dscptr_df.shape[1] * keep_ratio))

    # Impute whatever NaN survived the pruning with column means
    dscptr_df.fillna(dscptr_df.mean(), inplace=True)

    # Apply the requested feature scaling
    dscptr_df = scale_dataframe(dscptr_df, dscptr_scaling)

    # Store with generic python floats, cast to the compact dtype on return
    dscptr_df = dscptr_df.astype(float)

    # Cache the processed dataframe for future calls
    os.makedirs(os.path.join(data_root, PROC_FOLDER), exist_ok=True)
    dscptr_df.to_pickle(df_path)

    return dscptr_df.astype(float_dtype)