def cell_name_to_ids(name, source=None):
    path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt')
    df1 = pd.read_csv(path, sep='\t')
    hits1 = candle.lookup(df1, name, 'NCI60.ID', ['NCI60.ID', 'CELLNAME', 'Name'], match='contains')
    path = get_file(DATA_URL + 'cl_mapping')
    df2 = pd.read_csv(path, sep='\t', header=None)
    hits2 = candle.lookup(df2, name, [0, 1], [0, 1], match='contains')
    hits = hits1 + hits2
    if source:
        hits = [x for x in hits if x.startswith(source.upper() + '.')]
    return hits

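# Usage sketch (illustrative only; assumes the mapping files behind DATA_URL are
# reachable and that 'MCF7' appears in them):
#
#     hits = cell_name_to_ids('MCF7')                         # search all mapping files
#     nci60_hits = cell_name_to_ids('MCF7', source='NCI60')   # keep only 'NCI60.'-prefixed IDs
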
def load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=None, usecols=None,
                              scaling=None, imputing=None, add_prefix=False):
    path = get_file(DATA_URL + '{}_dragon7_descriptors.tsv'.format(drug_set))

    df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0)
    total = df_cols.shape[1] - 1
    if usecols is not None:
        usecols = [x for x in usecols if x in df_cols.columns]
        if usecols[0] != 'NAME':
            usecols = ['NAME'] + usecols
        df_cols = df_cols.loc[:, usecols]
    elif ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        usecols = np.append([0], np.add(sorted(usecols), 1))
        df_cols = df_cols.iloc[:, usecols]

    dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:])
    df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict,
                     na_values=['na', '-', ''])

    df1 = pd.DataFrame(df.loc[:, 'NAME'])
    df1.rename(columns={'NAME': 'Drug'}, inplace=True)

    df2 = df.drop('NAME', axis=1)
    if add_prefix:
        df2 = df2.add_prefix('dragon7.')

    df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing, dropna=None)

    df = pd.concat([df1, df2], axis=1)
    return df

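# Usage sketch (illustrative only; parameter values are examples, not defaults):
#
#     df_desc = load_drug_set_descriptors(drug_set='Combined_PubChem', ncols=1000,
#                                         scaling='std', imputing='mean', add_prefix=True)
#     # -> a 'Drug' key column followed by float32 dragon7 descriptor columns,
#     #    imputed and scaled by candle.drop_impute_and_scale_dataframe
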
def load_single_dose_response(combo_format=False, fraction=True):
    # path = get_file(DATA_URL + 'combined_single_drug_growth')
    path = get_file(DATA_URL + 'rescaled_combined_single_drug_growth')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep='\t', engine='c',
                         na_values=['na', '-', ''],
                         # nrows=10,
                         dtype={'SOURCE': str, 'DRUG_ID': str,
                                'CELLNAME': str, 'CONCUNIT': str,
                                'LOG_CONCENTRATION': np.float32,
                                'EXPID': str, 'GROWTH': np.float32})
        global_cache[path] = df

    df['DOSE'] = -df['LOG_CONCENTRATION']

    df = df.rename(columns={'CELLNAME': 'CELL', 'DRUG_ID': 'DRUG', 'EXPID': 'STUDY'})
    df = df[['SOURCE', 'CELL', 'DRUG', 'DOSE', 'GROWTH', 'STUDY']]

    if fraction:
        df['GROWTH'] /= 100

    if combo_format:
        df = df.rename(columns={'DRUG': 'DRUG1', 'DOSE': 'DOSE1'})
        df['DRUG2'] = np.nan
        df['DOSE2'] = np.nan
        df['DRUG2'] = df['DRUG2'].astype(object)
        df['DOSE2'] = df['DOSE2'].astype(np.float32)
        df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']]

    return df

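# Usage sketch (illustrative only):
#
#     single = load_single_dose_response()                    # GROWTH as a fraction, DOSE = -LOG_CONCENTRATION
#     as_combo = load_single_dose_response(combo_format=True) # adds empty DRUG2/DOSE2 columns for combo-style joins
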
def drug_name_to_ids(name, source=None):
    df1 = load_drug_info()
    path = get_file(DATA_URL + 'NCI_IOA_AOA_drugs')
    df2 = pd.read_csv(path, sep='\t', dtype=str)
    df2['NSC'] = 'NSC.' + df2['NSC']
    hits1 = candle.lookup(df1, name, 'ID', ['ID', 'NAME', 'CLEAN_NAME', 'PUBCHEM'])
    hits2 = candle.lookup(df2, name, 'NSC', ['NSC', 'Generic Name', 'Preffered Name'])
    hits = hits1 + hits2
    if source:
        hits = [x for x in hits if x.startswith(source.upper() + '.')]
    return hits

def load_combo_dose_response(fraction=True):
    path = get_file(DATA_URL + 'ComboDrugGrowth_Nov2017.csv')
    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep=',', engine='c',
                         na_values=['na', '-', ''],
                         usecols=['CELLNAME', 'NSC1', 'CONC1', 'NSC2', 'CONC2',
                                  'PERCENTGROWTH', 'VALID', 'SCREENER', 'STUDY'],
                         # nrows=10000,
                         dtype={'CELLNAME': str, 'NSC1': str, 'NSC2': str,
                                'CONC1': np.float32, 'CONC2': np.float32,
                                'PERCENTGROWTH': np.float32, 'VALID': str,
                                'SCREENER': str, 'STUDY': str},
                         error_bad_lines=False, warn_bad_lines=True)
        global_cache[path] = df

    df = df[df['VALID'] == 'Y']

    df['SOURCE'] = 'ALMANAC.' + df['SCREENER']

    cellmap_path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.txt')
    df_cellmap = pd.read_csv(cellmap_path, sep='\t')
    df_cellmap.set_index('Name', inplace=True)
    cellmap = df_cellmap[['NCI60.ID']].to_dict()['NCI60.ID']

    df['CELL'] = df['CELLNAME'].map(lambda x: cellmap[x])

    df['DOSE1'] = -np.log10(df['CONC1'])
    df['DOSE2'] = -np.log10(df['CONC2'])

    df['DRUG1'] = 'NSC.' + df['NSC1']
    df['DRUG2'] = 'NSC.' + df['NSC2']

    if fraction:
        df['GROWTH'] = df['PERCENTGROWTH'] / 100
    else:
        df['GROWTH'] = df['PERCENTGROWTH']

    df = df[['SOURCE', 'CELL', 'DRUG1', 'DOSE1', 'DRUG2', 'DOSE2', 'GROWTH', 'STUDY']]

    return df

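# Usage sketch (illustrative only):
#
#     combo = load_combo_dose_response()                  # ALMANAC pairwise growth as fractions
#     combo_pct = load_combo_dose_response(fraction=False)  # keep GROWTH in percent
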
def load_drug_set_fingerprints(drug_set='Combined_PubChem', ncols=None, usecols=None,
                               scaling=None, imputing=None, add_prefix=False):
    fps = ['PFP', 'ECFP']
    usecols_all = usecols
    df_merged = None
    for fp in fps:
        path = get_file(DATA_URL + '{}_dragon7_{}.tsv'.format(drug_set, fp))
        df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0, skiprows=1, header=None)
        total = df_cols.shape[1] - 1
        if usecols_all is not None:
            usecols = [x.replace(fp + '.', '') for x in usecols_all]
            usecols = [int(x) for x in usecols if x.isdigit()]
            usecols = [x for x in usecols if x in df_cols.columns]
            if usecols[0] != 0:
                usecols = [0] + usecols
            df_cols = df_cols.loc[:, usecols]
        elif ncols and ncols < total:
            usecols = np.random.choice(total, size=ncols, replace=False)
            usecols = np.append([0], np.add(sorted(usecols), 1))
            df_cols = df_cols.iloc[:, usecols]

        dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:])
        df = pd.read_csv(path, engine='c', sep='\t', skiprows=1, header=None,
                         usecols=usecols, dtype=dtype_dict)
        df.columns = ['{}.{}'.format(fp, x) for x in df.columns]

        col1 = '{}.0'.format(fp)
        df1 = pd.DataFrame(df.loc[:, col1])
        df1.rename(columns={col1: 'Drug'}, inplace=True)

        df2 = df.drop(col1, axis=1)
        if add_prefix:
            df2 = df2.add_prefix('dragon7.')

        df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing, dropna=None)

        df = pd.concat([df1, df2], axis=1)

        df_merged = df if df_merged is None else df_merged.merge(df)

    return df_merged

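# Usage sketch (illustrative only; feature names follow the '<FP>.<index>' convention
# assigned above, and the index values here are examples):
#
#     fps = load_drug_set_fingerprints(ncols=512)                        # random PFP/ECFP bit subset
#     subset = load_drug_set_fingerprints(usecols=['PFP.1', 'ECFP.1'])   # explicit bit selection
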
def load_aggregated_single_response(target='AUC', min_r2_fit=0.3, max_ec50_se=3,
                                    combo_format=False, rename=True):
    path = get_file(DATA_URL + 'combined_single_response_agg')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, engine='c', sep='\t',
                         dtype={'SOURCE': str, 'CELL': str, 'DRUG': str, 'STUDY': str,
                                'AUC': np.float32, 'IC50': np.float32,
                                'EC50': np.float32, 'EC50se': np.float32,
                                'R2fit': np.float32, 'Einf': np.float32,
                                'HS': np.float32, 'AAC1': np.float32,
                                'AUC1': np.float32, 'DSS1': np.float32})
        global_cache[path] = df

    total = len(df)

    df = df[(df['R2fit'] >= min_r2_fit) & (df['EC50se'] <= max_ec50_se)]
    df = df[['SOURCE', 'CELL', 'DRUG', target, 'STUDY']]
    df = df[~df[target].isnull()]

    logger.info('Loaded %d dose-independent response samples (filtered by EC50se <= %f & R2fit >= %f from a total of %d).',
                len(df), max_ec50_se, min_r2_fit, total)

    if combo_format:
        df = df.rename(columns={'DRUG': 'DRUG1'})
        df['DRUG2'] = np.nan
        df['DRUG2'] = df['DRUG2'].astype(object)
        df = df[['SOURCE', 'CELL', 'DRUG1', 'DRUG2', target, 'STUDY']]
        if rename:
            df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample',
                                    'DRUG1': 'Drug1', 'DRUG2': 'Drug2', 'STUDY': 'Study'})
    else:
        if rename:
            df = df.rename(columns={'SOURCE': 'Source', 'CELL': 'Sample',
                                    'DRUG': 'Drug', 'STUDY': 'Study'})

    return df

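# Usage sketch (illustrative only; target can be any of the fitted metrics typed above,
# e.g. 'AUC' or 'IC50'):
#
#     auc = load_aggregated_single_response(target='AUC')                       # renamed columns: Source/Sample/Drug/Study
#     auc_combo = load_aggregated_single_response(target='AUC', combo_format=True,
#                                                 rename=False)                 # keep upper-case column names
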
def load_cell_rnaseq(ncols=None, scaling='std', imputing='mean', add_prefix=True,
                     use_landmark_genes=False, use_filtered_genes=False,
                     feature_subset=None, preprocess_rnaseq=None,
                     embed_feature_source=False, sample_set=None, index_by_sample=False):
    if use_landmark_genes:
        filename = 'combined_rnaseq_data_lincs1000'
    elif use_filtered_genes:
        filename = 'combined_rnaseq_data_filtered'
    else:
        filename = 'combined_rnaseq_data'

    if preprocess_rnaseq and preprocess_rnaseq != 'none':
        scaling = None
        filename += ('_' + preprocess_rnaseq)  # 'source_scale' or 'combat'

    path = get_file(DATA_URL + filename)

    df_cols = pd.read_csv(path, engine='c', sep='\t', nrows=0)
    total = df_cols.shape[1] - 1  # remove Sample column
    if 'Cancer_type_id' in df_cols.columns:
        total -= 1
    usecols = None
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        usecols = np.append([0], np.add(sorted(usecols), 2))
        df_cols = df_cols.iloc[:, usecols]
    if feature_subset:
        with_prefix = lambda x: 'rnaseq.' + x if add_prefix else x
        usecols = [0] + [i for i, c in enumerate(df_cols.columns) if with_prefix(c) in feature_subset]
        df_cols = df_cols.iloc[:, usecols]

    dtype_dict = dict((x, np.float32) for x in df_cols.columns[1:])
    df = pd.read_csv(path, engine='c', sep='\t', usecols=usecols, dtype=dtype_dict)
    if 'Cancer_type_id' in df.columns:
        df.drop('Cancer_type_id', axis=1, inplace=True)

    prefixes = df['Sample'].str.extract('^([^.]*)', expand=False).rename('Source')
    sources = prefixes.drop_duplicates().reset_index(drop=True)
    df_source = pd.get_dummies(sources, prefix='rnaseq.source', prefix_sep='.')
    df_source = pd.concat([sources, df_source], axis=1)

    df1 = df['Sample']
    if embed_feature_source:
        df_sample_source = pd.concat([df1, prefixes], axis=1)
        df1 = df_sample_source.merge(df_source, on='Source', how='left').drop('Source', axis=1)
        logger.info('Embedding RNAseq data source into features: %d additional columns',
                    df1.shape[1] - 1)

    df2 = df.drop('Sample', axis=1)
    if add_prefix:
        df2 = df2.add_prefix('rnaseq.')

    df2 = candle.drop_impute_and_scale_dataframe(df2, scaling, imputing)

    df = pd.concat([df1, df2], axis=1)

    # scaling needs to be done before subsampling
    if sample_set:
        chosen = df['Sample'].str.startswith(sample_set)
        df = df[chosen].reset_index(drop=True)

    if index_by_sample:
        df = df.set_index('Sample')

    logger.info('Loaded combined RNAseq data: %s', df.shape)

    return df

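# Usage sketch (illustrative only; 'source_scale' is one of the preprocess_rnaseq suffixes
# noted above, and 'NCI60' is an example Sample-ID prefix):
#
#     rnaseq = load_cell_rnaseq(use_landmark_genes=True)          # LINCS landmark-gene subset
#     rnaseq_bc = load_cell_rnaseq(use_landmark_genes=True,
#                                  preprocess_rnaseq='source_scale',  # pre-normalized file; disables extra scaling
#                                  embed_feature_source=True,
#                                  sample_set='NCI60')                # keep samples whose IDs start with 'NCI60'
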
def load_cell_metadata():
    path = get_file(DATA_URL + 'cl_metadata')
    df = pd.read_csv(path, sep='\t')
    return df

def load_drug_info():
    path = get_file(DATA_URL + 'drug_info')
    df = pd.read_csv(path, sep='\t', dtype=object)
    df['PUBCHEM'] = 'PubChem.CID.' + df['PUBCHEM']
    return df