コード例 #1
0
def _ensure_dir(path):
    """Create directory *path* (and any missing parents) and return it."""
    os.makedirs(path, exist_ok=True)
    return path


def _rank_rows(drug_names, target_name, y_pred, binary, ascending):
    """Build the result-table rows (without the rank column), best score first.

    Each row is ``[drug, target, score]`` for regression models, or
    ``[drug, target, "YES"/"NO", score]`` for binary models, where the label
    is "YES" when the score is above 0.5. Scores are formatted with two
    decimals. Rows are sorted on the raw score, ascending when *ascending* is
    true and descending otherwise; the sort is stable.
    """
    scored = []
    for drug, tgt, score in zip(drug_names, target_name, y_pred):
        row = [drug, tgt]
        if binary:
            row.append("YES" if score > 0.5 else "NO")
        row.append("{0:.2f}".format(score))
        scored.append((row, score))
    scored.sort(key=lambda item: item[1], reverse=not ascending)
    return [row for row, _ in scored]


def virtual_screening(target,
                      X_repurpose=None,
                      target_name=None,
                      drug_names=None,
                      train_drug=None,
                      train_target=None,
                      train_y=None,
                      save_dir='./save_folder',
                      pretrained_dir=None,
                      finetune_epochs=10,
                      finetune_LR=0.01,
                      finetune_batch_size=32,
                      convert_y=True,
                      subsample_frac=1,
                      pretrained=True,
                      split='random',
                      frac=None,
                      agg='agg_mean_max',
                      output_len=30):
    """Score drug-target pairs with one or more models and write a ranked table.

    For every entry of the internal model list (currently only
    ``Daylight``/``AAC``) a model is loaded -- and, when ``train_drug`` is
    given, trained on the supplied data -- and then used through
    ``models.virtual_screening`` to score the pairs. The per-model
    predictions are aggregated according to ``agg``; the aggregate is ranked,
    written to ``<save_dir>/results_aggregation/virtual_screening.txt`` and
    pickled next to it together with the mean / extreme / mean-extreme
    aggregates.

    Args:
        target: Targets, forwarded unchanged to ``models.virtual_screening``
            (presumably one sequence per entry of ``X_repurpose`` -- confirm
            against that helper).
        X_repurpose: Drug candidates (a list of SMILES strings). Required.
        target_name: Display names of the targets; defaults to
            ``'Target <i>'`` for each entry of ``X_repurpose``.
        drug_names: Display names of the drugs; defaults to ``'Drug <i>'``.
        train_drug, train_target, train_y: Optional custom training data
            passed to ``data_process``. Training happens only when
            ``train_drug`` is not None. A single string ``train_target`` is
            wrapped in a list.
        save_dir: Root output directory; created if missing.
        pretrained_dir: Directory holding ``model_<drug>_<target>``
            sub-folders. Ignored (re-downloaded) when ``pretrained`` is false;
            downloaded when it is None and ``pretrained`` is true.
        finetune_epochs, finetune_LR, finetune_batch_size: Written into
            ``model.config['train_epoch' | 'LR' | 'batch_size']`` before
            training.
        convert_y: Forwarded to ``models.virtual_screening``. When true,
            smaller scores rank first and ``min`` is the "strongest effect"
            reducer; otherwise larger scores rank first and ``max`` is used.
        subsample_frac: Forwarded to ``data_process`` as ``sample_frac``.
        pretrained: Load pretrained weights (true) or only the model configs
            for training from scratch (false).
        split: Forwarded to ``data_process`` as ``split_method``.
        frac: Split fractions forwarded to ``data_process``; defaults to
            ``[0.7, 0.1, 0.2]``.
        agg: One of ``'mean'``, ``'max_effect'`` or ``'agg_mean_max'``.
        output_len: Number of table lines echoed to stdout, in addition to
            the first three lines of the rendered table.

    Raises:
        ValueError: If ``X_repurpose`` is None or ``agg`` is not recognised.
        FileNotFoundError: If a user-supplied ``pretrained_dir`` is missing.
    """
    # Validate cheap preconditions before touching the file system or network.
    if X_repurpose is None:
        raise ValueError(
            "Virtual Screening requires drug candidates input (a list of SMILESs)"
        )
    valid_aggs = ('mean', 'max_effect', 'agg_mean_max')
    if agg not in valid_aggs:
        raise ValueError('Unknown agg ' + repr(agg) + '; expected one of ' +
                         ', '.join(valid_aggs))
    # A fresh list per call: the default must not be shared between calls in
    # case a downstream helper mutates it.
    if frac is None:
        frac = [0.7, 0.1, 0.2]

    if not os.path.exists(save_dir):
        print(
            'Save path not found or given and set to default: \'./save_folder/\'. '
        )
        os.makedirs(save_dir, exist_ok=True)

    n_pairs = len(X_repurpose)
    if target_name is None:
        target_name = ['Target ' + str(i) for i in range(n_pairs)]
    if drug_names is None:
        drug_names = ['Drug ' + str(i) for i in range(n_pairs)]
    print("Loading customized repurposing dataset...")

    pretrained_model_names = [['Daylight', 'AAC']]

    y_preds_models = []

    if pretrained_dir is None and pretrained:
        print('Beginning Downloading Pretrained Model...')
        print(
            'Note: if you have already download the pretrained model before, please stop the program and set the input parameter \'pretrained_dir\' to the path'
        )
        pretrained_dir = download_pretrained_model('pretrained_models')
    elif not pretrained:
        print(
            'Beginning Downloading Configs Files for training from scratch...')
        pretrained_dir = download_pretrained_model('models_configs')
    else:
        print('Checking if pretrained directory is valid...')
        if not os.path.exists(pretrained_dir):
            # Every model below is loaded from this directory, so fail here
            # with a clear message instead of deep inside the model loader.
            raise FileNotFoundError(
                'The directory to pretrained model is not found. Please double check, or download it again by setting the input parameter \'pretrained_dir\' to be \'None\''
            )
        print('Beginning to load the pretrained models...')

    fine_tune = train_drug is not None
    if fine_tune:
        print('Training on your own customized data...')
        new_trained_models_dir = _ensure_dir(
            os.path.join(save_dir, 'new_trained_models'))
        if isinstance(train_target, str):
            train_target = [train_target]
    else:
        print('Using pretrained model and making predictions...')

    # idx is 1-based so both code paths report the same model number.
    for idx, (drug_encoding, target_encoding) in enumerate(
            pretrained_model_names, start=1):
        tag = drug_encoding + '_' + target_encoding
        model_path = os.path.join(pretrained_dir, 'model_' + tag)

        if fine_tune:
            train, val, test = data_process(train_drug,
                                            train_target,
                                            train_y,
                                            drug_encoding,
                                            target_encoding,
                                            split_method=split,
                                            frac=frac,
                                            sample_frac=subsample_frac)
            if pretrained:
                model = models.model_pretrained(model_path)
                print('Use pretrained model...')
            else:
                config = load_dict(model_path)
                model = models.model_initialize(**config)
                print('Training from scrtach...')
            print('Begin to train model ' + str(idx) + ' with drug encoding ' +
                  drug_encoding + ' and target encoding ' + target_encoding)
            model.config['train_epoch'] = finetune_epochs
            model.config['LR'] = finetune_LR
            model.config['batch_size'] = finetune_batch_size
        else:
            model = models.model_pretrained(model_path)

        result_folder_path = _ensure_dir(
            os.path.join(save_dir, 'results_' + tag))

        if fine_tune:
            model.config['result_folder'] = result_folder_path
            model.train(train, val, test)
            print('model training finished, now doing virtual screening')

        y_pred = models.virtual_screening(X_repurpose,
                                          target,
                                          model,
                                          drug_names,
                                          target_name,
                                          convert_y=convert_y,
                                          result_folder=result_folder_path,
                                          verbose=False)
        y_preds_models.append(y_pred)
        print('Predictions from model ' + str(idx) + ' with drug encoding ' +
              drug_encoding + ' and target encoding ' + target_encoding +
              ' are done...')

        if fine_tune:
            model.save_model(
                os.path.join(new_trained_models_dir, 'model_' + tag))
        else:
            print('-------------')

    result_folder_path = _ensure_dir(
        os.path.join(save_dir, 'results_aggregation'))

    print('models prediction finished...')
    print('aggregating results...')

    # "Strongest effect" is the smallest value when convert_y is set and the
    # largest otherwise; the same reducer feeds both the ranking and the
    # pickled files so they cannot disagree.
    strongest = np.min if convert_y else np.max
    mean_pred = np.mean(y_preds_models, axis=0)
    strongest_pred = strongest(y_preds_models, axis=0)
    mean_strongest_pred = (strongest_pred + mean_pred) / 2

    for file_name, values in (('logits_VS_mean.pkl', mean_pred),
                              ('logits_VS_max.pkl', strongest_pred),
                              ('logits_VS_mean_max.pkl', mean_strongest_pred)):
        with open(os.path.join(result_folder_path, file_name), 'wb') as f:
            pickle.dump(values, f, pickle.HIGHEST_PROTOCOL)

    # Rank on the aggregate selected by `agg` rather than re-predicting with
    # the last model only. With a single model the aggregate equals that
    # model's prediction.
    # NOTE(review): assumes models.virtual_screening already returns scores
    # in the converted unit when convert_y is set -- confirm.
    y_pred = {
        'mean': mean_pred,
        'max_effect': strongest_pred,
        'agg_mean_max': mean_strongest_pred,
    }[agg]

    fo = os.path.join(result_folder_path, "virtual_screening.txt")

    # `model` is the last model of the loop; all models in the list are
    # assumed to share the same task type.
    if model.binary:
        table_header = [
            "Rank", "Drug Name", "Target Name", "Interaction", "Probability"
        ]
    else:
        ### regression
        table_header = ["Rank", "Drug Name", "Target Name", "Binding Score"]
    table = PrettyTable(table_header)

    print('virtual screening...')
    print('---------------')
    print('Virtual Screening Result')
    print_list = _rank_rows(drug_names,
                            target_name,
                            y_pred,
                            binary=model.binary,
                            ascending=bool(convert_y))
    for rank, row in enumerate(print_list, start=1):
        table.add_row([str(rank)] + row)

    table_text = table.get_string()
    with open(fo, 'w') as fout:
        fout.write(table_text)

    # Echo only the head of the table; the full list lives in the file.
    for idx, line in enumerate(table_text.splitlines(keepends=True)):
        if idx < output_len + 3:
            print(line, end='')
        else:
            print('checkout ' + fo + ' for the whole list')
            break
    print()

    with open(os.path.join(result_folder_path, 'output_list_VS.pkl'),
              'wb') as f:
        pickle.dump(print_list, f, pickle.HIGHEST_PROTOCOL)
コード例 #2
0
ファイル: demo.py プロジェクト: giangnt/DeepPurpose
                                split_method='cold_protein',
                                frac=[0.7, 0.1, 0.2])

# Build a model configuration for the chosen drug/target encodings. Extra
# keyword arguments override individual settings (here the number of
# transformer layers on the target side), then instantiate the model from it.
config = generate_config(drug_encoding,
                         target_encoding,
                         transformer_n_layer_target=8)
net = models.model_initialize(**config)

# Train the new model on the train/validation/test splits.
# Per the original note, detailed output (a table of validation loss and
# metrics, AUC curve figures, etc.) is stored in the ./result folder.
net.train(train, val, test)

# Alternatively, load a pretrained model from a model directory path or from
# the name of a reproduced model such as DeepDTA.
# NOTE(review): MODEL_PATH_DIR and MODEL_NAME look like placeholders for the
# reader to fill in; they are not defined in the visible snippet.
net = models.model_pretrained(MODEL_PATH_DIR or MODEL_NAME)

# Repurposing with the trained or pretrained model.
# This example loads the Broad Repurposing Hub drug library and the
# SARS-CoV 3CL protease target.
X_repurpose, drug_name, drug_cid = load_broad_repurposing_hub(SAVE_PATH)
target, target_name = load_SARS_CoV_Protease_3CL()

_ = models.repurpose(X_repurpose, target, net, drug_name, target_name)

# Virtual screening with the trained or pretrained model.
# The four lists below are, in order: drug SMILES strings, drug names,
# target sequences and target names. The `...` entries are elisions in this
# illustrative snippet, not real data.
X_repurpose, drug_name, target, target_name = [
    'CCCCCCCOc1cccc(c1)C([O-])=O', ...
], ['16007391', ...], ['MLARRKPVLPALTINPTIAEGPSPTSEGASEANLVDLQKKLEEL...',
                       ...], ['P36896', 'P00374']

_ = models.virtual_screening(X_repurpose, target, net, drug_name, target_name)