Example #1
def encode_smiles(series):
    # parameter is a pd.Series with ZINC IDs as the indices and SMILES strings as the elements
    print("Encoding smiles")
    encoded_smiles = DDModel.process_smiles(series.values, 100, fit_range=100,
                                            use_padding=True, normalize=True)
    # returns a dict mapping each ZINC ID to its encoded SMILES representation
    encoded_dict = dict(zip(series.keys(), encoded_smiles))
    return encoded_dict
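
# A minimal usage sketch; the ZINC IDs and SMILES below are made-up example
# values, and DDModel must be importable from the Deep Docking codebase:
import pandas as pd

smiles = pd.Series(['CCO', 'c1ccccc1'],
                   index=['ZINC000000001', 'ZINC000000002'])
encoded = encode_smiles(smiles)
# encoded['ZINC000000001'] -> padded, normalized encoding of 'CCO'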
Example #2

import glob
import os
import time

import pandas as pd

# DDModel, prediction_morgan, file_path, it, and fn are provided by the
# surrounding Deep Docking script.
try:
    os.mkdir(file_path + '/iteration_' + str(it) + '/morgan_1024_predictions')
except OSError:
    print(file_path + '/iteration_' + str(it) + '/morgan_1024_predictions',
          "already exists")

thresholds = pd.read_csv(file_path + '/iteration_' + str(it) +
                         '/best_models/thresholds.txt',
                         header=None)
thresholds.columns = ['model_no', 'thresh', 'cutoff']
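# thresholds.txt is expected to hold one comma-separated row per model,
# e.g. (made-up values):  3,0.47,-9.2  ->  model_no=3, thresh=0.47, cutoff=-9.2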

tr = []
models = []
for f in glob.glob(file_path + '/iteration_' + str(it) +
                   '/best_models/model_*'):
    if "." not in f:  # skipping over the .ddss & .csv files
        mn = int(f.split('/')[-1].split('_')[1])
        tr.append(thresholds[thresholds.model_no == mn].thresh.iloc[0])
        models.append(
            DDModel.load(file_path + '/iteration_' + str(it) +
                         '/best_models/model_' + str(mn)))

print("Number of models to predict:", len(models))
t = time.time()
returned = prediction_morgan(fn, models, tr)
print('Prediction time:', time.time() - t, 'seconds')

with open(
        file_path + '/iteration_' + str(it) +
        '/morgan_1024_predictions/passed_file_ct.txt', 'a') as ref:
    ref.write(fn + ',' + str(returned) + '\n')
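
# A minimal sketch of how the per-model thresholds might be combined inside a
# prediction_morgan-style helper (hypothetical function and voting rule; the
# real helper is defined elsewhere in the Deep Docking scripts):
import numpy as np

def consensus_pass_count_sketch(fingerprints, models, thresholds):
    # Count molecules whose predicted score clears every model's own
    # threshold (a unanimous vote across the loaded models).
    passed = np.ones(len(fingerprints), dtype=bool)
    for model, thresh in zip(models, thresholds):
        scores = np.ravel(model.predict(fingerprints))
        passed &= scores > thresh
    return int(passed.sum())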
Example #3

import pandas as pd
from sklearn.metrics import precision_recall_curve

# DDModel, the train/valid/test DataFrames, cf, X_valid, and
# model_to_use_with_cf come from the surrounding Deep Docking script.

if CONTINUOUS:
    print('Using continuous labels...')
    y_valid = valid_data.r_i_docking_score
    y_test = test_data.r_i_docking_score
    y_train = train_data.r_i_docking_score

    if NORMALIZE:
        print('Adding cutoff to be normalized')
        # Append the cutoff as one extra element so it is rescaled together
        # with the training scores (pd.concat replaces the removed Series.append).
        cutoff_ser = pd.Series([cf], index=['cutoff'])
        y_train = pd.concat([y_train, cutoff_ser])

        print("Normalizing docking scores...")
        # Normalize the docking scores
        y_valid = DDModel.normalize(y_valid)
        y_test = DDModel.normalize(y_test)
        y_train = DDModel.normalize(y_train)

        print('Extracting normalized cutoff...')
        cf_norm = y_train['cutoff']
        y_train.drop(labels=['cutoff'], inplace=True)   # removing it from the dataset

        cf_to_use = cf_norm
    else:
        cf_to_use = cf

    # Splitting the training scores into hits (docking score below the
    # cutoff) and non-hits (at or above it); more negative scores are better.
    y_pos = y_train[y_train < cf_to_use]
    y_neg = y_train[y_train >= cf_to_use]
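
# Why the cutoff rides along through normalization: a minimal, self-contained
# sketch (DDModel.normalize's real implementation lives in the Deep Docking
# codebase; a z-score transform is assumed here purely for illustration):
def normalize_sketch(scores):
    # Every element, including the appended 'cutoff' entry, is shifted and
    # scaled identically, so the cutoff stays directly comparable to the
    # normalized docking scores and can be popped back out afterwards.
    return (scores - scores.mean()) / scores.std()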
all_sc = {}
path_to_model = SAVE_PATH + '/iteration_' + str(n_iteration) + '/all_models'

print('Model_to_use_with_cf:', model_to_use_with_cf)
for i in range(len(model_to_use_with_cf)):
    cf = model_to_use_with_cf[i][0]

    # y_test < cf yields a boolean array: True marks a hit under this cutoff
    y_test_cf = y_test < cf
    y_valid_cf = y_valid < cf

    models = []
    # loading the models matching the cutoff and appending them to the models list
    for mn in model_to_use_with_cf[i][-1]:
        print('\tLoading model:', path_to_model + '/model_'+str(mn))
        models.append(DDModel.load(path_to_model+'/model_'+str(mn)))
    print('num models:', len(models))
    
    prediction_valid = []
    scc = []
    for model in models:
        print('using valid as validation')
        model_pred = model.predict(X_valid)
        if model.output_activation == 'linear':
            # Converting back to binary values to get stats
            model_pred = model_pred < cf
        prediction_valid.append(model_pred)
        precision, recall, thresholds = precision_recall_curve(y_valid_cf, model_pred)
        scc.append([precision, recall, thresholds])

    tr = []
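
# A minimal sketch of how a decision threshold might then be picked from each
# stored precision/recall curve in scc (hypothetical helper; the 0.9 recall
# floor is an assumed example value, not taken from the original script):
import numpy as np

def pick_threshold_sketch(precision, recall, thresholds, min_recall=0.9):
    # precision_recall_curve returns len(thresholds) + 1 precision/recall
    # points; drop the trailing sentinel so the arrays align with thresholds.
    precision = np.asarray(precision)[:-1]
    recall = np.asarray(recall)[:-1]
    eligible = recall >= min_recall
    if not eligible.any():
        return thresholds[0]
    # Among thresholds that keep recall above the floor, take the most precise.
    return thresholds[np.argmax(np.where(eligible, precision, -np.inf))]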