def select_and_evaluate_decoys(f, target, file_loc='./', output_loc='./',
                               dataset='ALL', num_cand_dec_per_act=100,
                               num_dec_per_act=50, max_idx_cmpd=10000):
    """Select the top-5 decoys per active by PSS and evaluate the selection.

    Reads a paired (active, decoy) SMILES file, drops pairs whose active
    fails to parse or is too small, ranks candidate decoys by mean
    property-similarity score (PSS), keeps the 5 best per active, writes
    the selected pairs to ``tmp.smi`` in the CWD, and computes evaluation
    metrics (cross-validation performance, DOE, LADS, Doppelganger).

    Parameters
    ----------
    f : str
        File name of the paired SMILES file, resolved against ``file_loc``.
    target : str
        Target identifier. Unused here; kept so the signature matches the
        sibling variants of this function.
    file_loc, output_loc : str
        Input directory / output directory. ``output_loc`` is currently
        unused — the selection is written to ``tmp.smi``.
    dataset : str
        Dataset label recorded as the second entry of the results row.
    num_cand_dec_per_act, num_dec_per_act, max_idx_cmpd : int
        Currently unused; retained for backward compatibility.

    Returns
    -------
    list
        ``[f, dataset, n_unique_actives, *xval_metrics, doe_score,
        mean_lads, mean_dg, max_dg]``.
    """
    print("Processing: ", f)
    dec_results = [f, dataset]

    # Read paired (active, decoy) SMILES records.
    data = decoy_utils.read_paired_file(file_loc + f)

    # Record the number of unique actives before any filtering.
    dec_results.append(len(set(d[0] for d in data)))

    # Drop pairs whose active does not parse or is too small.
    # NOTE(review): ``min_active_size`` is a free (module-level) name not
    # defined in this function — confirm it is set before this runs.
    parsed = [Chem.MolFromSmiles(d[0]) for d in data]
    data = [d for idx, d in enumerate(data)
            if parsed[idx] is not None
            and parsed[idx].GetNumHeavyAtoms() > min_active_size]
    data = pd.DataFrame(data, columns=['act', 'dec'])

    # Score each candidate decoy by its mean PSS against its active.
    pss = get_pss_from_smiles(data['act'].values, data['dec'].values)
    data['pss'] = pss.mean(0)
    data['score'] = data['pss']

    # Keep the 5 best-scoring decoys for every active.
    result = []
    for act, grp in data.groupby('act'):
        grp = grp.sort_values('score', ascending=False).reset_index(drop=True)
        for i in range(min(5, grp.shape[0])):
            result.append([act, grp['dec'].values[i]])
    result = pd.DataFrame(result, columns=['act', 'dec'])
    result.to_csv('tmp.smi', index=False, header=None, sep=' ')

    # Unique actives/decoys from the selection, re-parsed to Mol objects.
    decoy_smis_gen = list(set(result['dec']))
    decoy_mols_gen = [Chem.MolFromSmiles(smi) for smi in decoy_smis_gen]
    active_smis_gen = list(set(result['act']))
    active_mols_gen = [Chem.MolFromSmiles(smi) for smi in active_smis_gen]

    print('Calc props for chosen decoys')
    actives_feat = decoy_utils.calc_dataset_props_dude(active_mols_gen)
    decoys_feat = decoy_utils.calc_dataset_props_dude(decoy_mols_gen)

    print('ML model performance')
    print(actives_feat.shape)
    print(decoys_feat.shape)
    dec_results.extend(list(decoy_utils.calc_xval_performance(
        actives_feat, decoys_feat, n_jobs=1)))

    print('DEKOIS paper metrics (LADS, DOE, Doppelganger score)')
    dec_results.append(decoy_utils.doe_score(actives_feat, decoys_feat))
    lads_scores = decoy_utils.lads_score_v2(active_mols_gen, decoy_mols_gen)
    dec_results.append(np.mean(lads_scores))
    dg_scores, dg_ids = decoy_utils.dg_score(active_mols_gen, decoy_mols_gen)
    dec_results.extend([np.mean(dg_scores), max(dg_scores)])

    print('Save decoy mols')
    print(dec_results)
    return dec_results
def select_and_evaluate_decoys(f, target, file_loc='./', output_loc='./'):
    """Select the single best decoy per active using PSS plus a style bonus.

    For ``target == 'SA'`` the style bonus is a rescaled synthetic-
    accessibility score of the decoy; otherwise it is ``1 - predicted
    toxicity`` read from ``./eval/results/predict_TOX.csv``. The top-1
    decoy per active (by ``pss + style``) is written, deduplicated, to
    ``{output_loc}/{target}_results.smi``.

    Parameters
    ----------
    f : str
        File name of the paired SMILES file, resolved against ``file_loc``.
    target : str
        ``'SA'`` selects the synthetic-accessibility bonus; any other
        value selects the toxicity-prediction bonus.
    file_loc, output_loc : str
        Input and output directories.

    Returns
    -------
    None
        NOTE(review): ``dec_results`` is accumulated but never returned —
        confirm whether callers expect a results row here as in the
        evaluation variant of this function.
    """
    print("Processing: ", f)
    dec_results = [f]

    # Read paired (active, decoy) SMILES records.
    data = decoy_utils.read_paired_file(file_loc + f)

    # Record the number of unique actives before any filtering.
    dec_results.append(len(set(d[0] for d in data)))

    # Drop pairs whose active does not parse or has <= 10 heavy atoms.
    parsed = [Chem.MolFromSmiles(d[0]) for d in data]
    data = [d for idx, d in enumerate(data)
            if parsed[idx] is not None
            and parsed[idx].GetNumHeavyAtoms() > 10]
    data = pd.DataFrame(data, columns=['act', 'dec'])

    if target == 'SA':
        # Rescale the SA score into a bonus where easier-to-make decoys
        # score higher. Assumes get_sa returns roughly [2, 5] so the
        # bonus lands near [0, 1] — TODO confirm against get_sa's range.
        data['style'] = data['dec'].apply(get_sa)
        data['style'] = (5 - data['style']) / 3
    else:
        # Join precomputed toxicity predictions on the decoy SMILES;
        # inner merge silently drops decoys without a prediction.
        style = pd.read_csv('./eval/results/predict_TOX.csv')
        style = style.rename(columns={
            'smiles': 'dec',
            'pred_0': 'style'
        })[['dec', 'style']]
        data = data.merge(style, on='dec', how='inner')
        data['style'] = 1 - data['style']

    # Combined score: property similarity plus the style bonus.
    pss = get_pss_from_smiles(data['act'].values, data['dec'].values)
    data['pss'] = pss.mean(0)
    data['score'] = data['pss'] + data['style']

    # Keep the single best-scoring decoy for every active.
    result = []
    for act, grp in data.groupby('act'):
        grp = grp.sort_values('score', ascending=False).reset_index(drop=True)
        for i in range(min(1, grp.shape[0])):
            result.append([act, grp['dec'].values[i]])
    result = pd.DataFrame(result, columns=['act', 'dec'])

    output_name = output_loc + f'/{target}_results.smi'
    result = result.drop_duplicates().reset_index(drop=True)
    result.to_csv(output_name, index=False, header=None, sep=' ')
def select_and_evaluate_decoys(f, target, file_loc='./', output_loc='./',
                               T_simi=0.15, N=5):
    """Select up to N decoys per active after a scaffold-similarity filter.

    Pairs whose scaffold fingerprint similarity is >= ``T_simi`` are
    discarded; survivors are ranked by mean PSS and the top ``N`` decoys
    per active are written, deduplicated, to ``{target}_decoys.smi`` in
    the CWD. ``output_loc`` is currently unused.

    Parameters
    ----------
    f : str
        File name of the paired SMILES file, resolved against ``file_loc``.
    target : str
        Target identifier, used only for the output file name.
    file_loc, output_loc : str
        Input directory / (unused) output directory.
    T_simi : float
        Scaffold-similarity threshold; only pairs strictly below it are kept.
    N : int
        Maximum number of decoys retained per active.
    """
    print("Processing: ", f)

    # Read paired (active, decoy) SMILES records.
    data = decoy_utils.read_paired_file(file_loc + f)
    data = pd.DataFrame(data, columns=['act', 'dec'])

    # Parse molecules and compute fingerprints for both sides of each pair.
    mol_acts = [Chem.MolFromSmiles(smi) for smi in data['act'].values]
    mol_decs = [Chem.MolFromSmiles(smi) for smi in data['dec'].values]
    fp_acts = [get_fp(mol) for mol in mol_acts]
    fp_decs = [get_fp(mol) for mol in mol_decs]

    # Keep only pairs whose scaffold similarity is below the threshold.
    simi = [get_scaffold_simi(fa, fd) for fa, fd in zip(fp_acts, fp_decs)]
    idxs = np.where(np.array(simi) < T_simi)
    data = pd.DataFrame(data.values[idxs], columns=['act', 'dec'])

    # BUG FIX: the original passed lists of RDKit Mol objects here, while
    # both sibling variants of this function pass SMILES arrays to
    # get_pss_from_smiles (as its name indicates). Pass the filtered
    # SMILES columns for consistency.
    pss = get_pss_from_smiles(data['act'].values, data['dec'].values)
    data['pss'] = pss.mean(0)
    data['score'] = data['pss']

    # Keep the N best-scoring decoys for every active.
    result = []
    for act, grp in data.groupby('act'):
        grp = grp.sort_values('score', ascending=False).reset_index(drop=True)
        for i in range(min(N, grp.shape[0])):
            result.append([act, grp['dec'].values[i]])
    result = pd.DataFrame(result, columns=['act', 'dec'])
    result = result.drop_duplicates().reset_index(drop=True)
    result.to_csv(f'{target}_decoys.smi', index=False, header=None, sep=' ')
# Flat script section: enriches an existing ``df`` (defined earlier, outside
# this view — presumably with 'content' and 'gene' SMILES columns; confirm
# against the preceding code) with toxicity predictions, PSS, and a
# fingerprint-dot-product similarity.

# Load precomputed predictions keyed by SMILES and rename to match df's
# 'content' column; only the two needed columns are kept.
pred = pd.read_csv('./eval/zinc_all_alerts_pred.csv',
                   usecols=['smiles', 'pred']).rename(columns={
                       'smiles': 'content',
                       'pred': 'content_TOX'
                   })[['content', 'content_TOX']]
# Left merge: rows of df without a prediction get NaN in 'content_TOX'.
df = df.merge(pred, on='content', how='left')
df = df.drop_duplicates(['content', 'gene'])
# Pack each (content, gene) pair into a list-valued helper column.
df['tmp'] = df[['content', 'gene']].values.tolist()
print(df.shape)
# Mean property-similarity score per row (semantics of get_pss_from_smiles
# are defined elsewhere in the project).
pss = get_pss_from_smiles(df['content'].values, df['gene'].values)
df['PSS'] = pss.mean(0)
# Per-row feature vectors for both SMILES columns.
df['content_fp'] = df['content'].apply(lambda x: get_mol_features(x))
df['gene_fp'] = df['gene'].apply(lambda x: get_mol_features(x))
df['similarity'] = df[['content_fp', 'gene_fp']].values.tolist()
# Dot product of the two feature vectors. higher is better
df['similarity'] = df['similarity'].apply(
    lambda x: np.sum(np.array(x[0]) * np.array(x[1])))