def encode_smiles(series):
    # Parameter is a pd.Series with ZINC IDs as the indices and SMILES as the elements.
    print("Encoding smiles")
    encoded_smiles = DDModel.process_smiles(series.values, 100, fit_range=100, use_padding=True, normalize=True)
    # Returns a dict mapping each ZINC ID to its encoded SMILES.
    encoded_dict = dict(zip(series.keys(), encoded_smiles))
    return encoded_dict
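# Hedged usage sketch for encode_smiles (helper name, ZINC IDs, and SMILES
# below are made-up placeholders; assumes pandas is importable):
def _encode_smiles_example():
    import pandas as pd
    smiles_ser = pd.Series(['CCO', 'c1ccccc1'],
                           index=['ZINC000000001', 'ZINC000000002'])
    encoded = encode_smiles(smiles_ser)
    # Each value is the fixed-length (100) encoding produced by DDModel.process_smiles.
    return encoded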
try:
    os.mkdir(file_path + '/iteration_' + str(it) + '/morgan_1024_predictions')
except OSError:
    print(file_path + '/iteration_' + str(it) + '/morgan_1024_predictions', "already exists")

# Load the per-model prediction thresholds saved alongside the best models.
thresholds = pd.read_csv(file_path + '/iteration_' + str(it) + '/best_models/thresholds.txt', header=None)
thresholds.columns = ['model_no', 'thresh', 'cutoff']

tr = []
models = []
for f in glob.glob(file_path + '/iteration_' + str(it) + '/best_models/model_*'):
    if "." not in f:  # skipping over the .ddss & .csv files
        mn = int(f.split('/')[-1].split('_')[1])
        tr.append(thresholds[thresholds.model_no == mn].thresh.iloc[0])
        models.append(DDModel.load(file_path + '/iteration_' + str(it) + '/best_models/model_' + str(mn)))

print("Number of models to predict:", len(models))
t = time.time()
returned = prediction_morgan(fn, models, tr)
print(time.time() - t)

with open(file_path + '/iteration_' + str(it) + '/morgan_1024_predictions/passed_file_ct.txt', 'a') as ref:
    ref.write(fn + ',' + str(returned) + '\n')
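# Hedged helper sketch (name is hypothetical, not part of the original script):
# look up the stored prediction threshold for one model number in the
# thresholds DataFrame loaded above (a headerless CSV read as
# model_no, thresh, cutoff).
def _thresh_for_model(thresholds_df, model_no):
    return thresholds_df[thresholds_df.model_no == model_no].thresh.iloc[0]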
if CONTINUOUS:
    print('Using continuous labels...')
    y_valid = valid_data.r_i_docking_score
    y_test = test_data.r_i_docking_score
    y_train = train_data.r_i_docking_score

    if NORMALIZE:
        print('Adding cutoff to be normalized')
        cutoff_ser = pd.Series([cf], index=['cutoff'])
        # Note: Series.append requires pandas < 2.0 (removed in pandas 2.0).
        y_train = y_train.append(cutoff_ser)

        print("Normalizing docking scores...")
        # Normalize the docking scores
        y_valid = DDModel.normalize(y_valid)
        y_test = DDModel.normalize(y_test)
        y_train = DDModel.normalize(y_train)

        print('Extracting normalized cutoff...')
        cf_norm = y_train['cutoff']
        y_train.drop(labels=['cutoff'], inplace=True)  # removing it from the dataset
        cf_to_use = cf_norm
    else:
        cf_to_use = cf

    # Getting all the IDs of hits and non-hits (scores below the cutoff count as hits).
    y_pos = y_train[y_train < cf_to_use]
    y_neg = y_train[y_train >= cf_to_use]
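# Hedged sketch of the cutoff trick used above: appending cf to y_train before
# normalizing means the same transform is applied to the cutoff itself, so the
# hit/non-hit boundary stays consistent in normalized space. The
# standardization below is a placeholder assumption; the real transform is
# whatever DDModel.normalize implements. Helper name is hypothetical.
def _normalize_with_cutoff_example(scores, cf):
    import pandas as pd
    ser = pd.concat([scores, pd.Series([cf], index=['cutoff'])])
    norm = (ser - ser.mean()) / ser.std()  # placeholder scaling
    cf_norm = norm['cutoff']
    return norm.drop(labels=['cutoff']), cf_norm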
all_sc = {}
path_to_model = SAVE_PATH + '/iteration_' + str(n_iteration) + '/all_models/'
print('Model_to_use_with_cf:', model_to_use_with_cf)

for i in range(len(model_to_use_with_cf)):
    cf = model_to_use_with_cf[i][0]
    # y < cf returns a boolean array marking which scores beat the cutoff.
    y_test_cf = y_test < cf
    y_valid_cf = y_valid < cf

    models = []
    # Loading the models matching the cutoff and appending them to the models list.
    for mn in model_to_use_with_cf[i][-1]:
        print('\tLoading model:', path_to_model + '/model_' + str(mn))
        models.append(DDModel.load(path_to_model + '/model_' + str(mn)))
    print('num models:', len(models))

    prediction_valid = []
    scc = []
    for model in models:
        print('using valid as validation')
        model_pred = model.predict(X_valid)
        if model.output_activation == 'linear':
            # Converting regression outputs back to binary labels to get stats.
            model_pred = model_pred < cf
        prediction_valid.append(model_pred)
        precision, recall, thresholds = precision_recall_curve(y_valid_cf, model_pred)
        scc.append([precision, recall, thresholds])

    tr = []
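# Hedged sketch of how a decision threshold could be read off the
# precision-recall curves collected in scc above (e.g., the largest threshold
# that still meets a target recall); the selection rule actually used
# downstream may differ, and the helper name is hypothetical.
def _threshold_at_recall(precision, recall, thresholds, target_recall=0.9):
    import numpy as np
    # precision/recall have one more entry than thresholds; align by slicing.
    ok = np.where(recall[:-1] >= target_recall)[0]
    return thresholds[ok[-1]] if len(ok) else thresholds[0]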