Beispiel #1
0
def main(model, config):
    """Generate style-transfer samples for config.target and pickle them.

    `model` is a model-name key into the MODELS registry; weights are
    loaded from config.model_load and results are written under
    ./eval/results/.
    """
    # NOTE(review): fixed seed and hard-coded './temp' config path — confirm
    # these are intentional for this script.
    set_seed(2048)

    model_config = torch.load('./temp')
    model_state = torch.load(config.model_load)
    model_vocab = None  # presumably this model builds its vocab internally
    model = MODELS.get_model_class(model)(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.cuda()
    model.eval()

    model.model_loaded = True
    _, smi2vec = load_model()

    # Fix: close the dataset files instead of leaking the handles.
    with open(f'./data/content_test_{config.target}.pkl', 'rb') as fh:
        content_test = pickle.load(fh)  # [:100]
    with open(f'./data/style_instance_test_{config.target}.pkl', 'rb') as fh:
        style_instance = pickle.load(fh)  # [:30000]

    latent_content_test = model.heteroencoder.encode(smi2vec(content_test))
    latent_style_instance = model.heteroencoder.encode(smi2vec(style_instance))
    print(latent_content_test.shape, latent_style_instance.shape)
    samples = model.sample_per_act(latent_content_test,
                                   latent_style_instance,
                                   num_per_act=config.num_per_act,
                                   n_ins=config.n_ins)
    os.makedirs('./eval/results', exist_ok=True)
    with open(f'./eval/results/{config.target}_result.pkl', 'wb') as fh:
        pickle.dump(samples, fh)
Beispiel #2
0
def main(config):
    """Sample SMILES strings from a trained VAE and write them to a CSV."""
    set_seed(config.seed)

    # Restore config, vocabulary and weights from their checkpoints.
    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    model = VAE(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    generated = []
    remaining = config.n_samples
    progress = tqdm.tqdm(range(config.n_samples), desc='Generating mols')
    while remaining > 0:
        # Take the final element of sample(): the decoded token-id tensors.
        batch = model.sample(min(remaining, config.n_batch),
                             config.max_len)[-1]
        decoded = [model_vocab.ids2string(ids.tolist()) for ids in batch]
        remaining -= len(decoded)
        progress.update(len(decoded))
        progress.refresh()
        generated.extend(decoded)

    pd.DataFrame(generated, columns=['SMILES']).to_csv(config.gen_save,
                                                       index=False)
Beispiel #3
0
def main(model, config):
    """Train a style-transfer model on a target's content/style datasets.

    `model` is a model-name key into the MODELS registry; checkpoints are
    managed by the trainer under config.model_save.
    """
    os.makedirs(config.model_save, exist_ok=True)
    set_seed(config.seed)
    device = torch.device(config.device)

    if config.config_save is not None:
        torch.save(config, config.config_save)

    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    # Fix: close the dataset files instead of leaking the handles.
    with open(f'./data/content_train_{config.target}.pkl', 'rb') as fh:
        content_train = pickle.load(fh)
    with open(f'./data/style_instance_train_{config.target}.pkl', 'rb') as fh:
        style_instance = pickle.load(fh)

    trainer = MODELS.get_model_trainer(model)(config)

    vocab = None  # presumably this model builds its vocab internally
    model = MODELS.get_model_class(model)(vocab, config)
    if config.model_load is not None:
        print(f'load model from {config.model_load}')
        model_state = torch.load(config.model_load)
        model.load_state_dict(model_state)
    model = model.to(device)
    trainer.fit(model,
                content_train=content_train,
                style_instance=style_instance)
Beispiel #4
0
def main(config):
    """Train an AAE on SMILES data, optionally conditioned on labels."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    if config.conditional_model:
        labels = read_label_csv(config.train_load)
        config.labels_size = len(labels[0])
        # Each label string becomes a list of per-character int flags.
        labels = [[int(x) for x in list(l)] for l in labels]
        # Idiom fix: list(zip(...)) replaces the redundant comprehension.
        train_data = list(zip(train, labels))
    else:
        # Unconditional samples stay bare SMILES strings, as before
        # ([(x) for x in train] was just a copy — (x) is not a tuple).
        train_data = list(train)
    shuffle(train_data)
    train_data = train_data[:500000]  # cap the training-set size
    vocab = CharVocab.from_data(train)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)

    model = AAE(vocab, config)
    model = model.to(device)

    trainer = AAETrainer(config)
    trainer.fit(model, train_data)

    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
Beispiel #5
0
def main(model, config):
    """Train a registry model on SMILES data and save vocab/weights."""
    set_seed(config.seed)
    device = torch.device(config.device)

    if config.config_save is not None:
        torch.save(config, config.config_save)

    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    train_data = read_smiles_csv(config.train_load)
    val_data = read_smiles_csv(config.val_load) if config.val_load else None

    trainer = MODELS.get_model_trainer(model)(config)

    # Load a pre-built vocabulary when given, otherwise derive it from data.
    if config.vocab_load is not None:
        assert os.path.exists(config.vocab_load), \
            'vocab_load path does not exist!'
        vocab = torch.load(config.vocab_load)
    else:
        vocab = trainer.get_vocabulary(train_data)

    if config.vocab_save is not None:
        torch.save(vocab, config.vocab_save)

    model = MODELS.get_model_class(model)(vocab, config).to(device)
    trainer.fit(model, train_data, val_data)

    model = model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
Beispiel #6
0
def main(config):
    """Sample SMILES from a trained AAE checkpoint and write them to CSV."""
    set_seed(config.seed)

    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    device = torch.device(config.device)

    model = AAE(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    collected = []
    remaining = config.n_samples
    with tqdm.tqdm(total=config.n_samples, desc='Generating samples') as bar:
        while remaining > 0:
            batch = model.sample(min(remaining, config.n_batch),
                                 config.max_len)
            collected.extend(batch)
            remaining -= len(batch)
            bar.update(len(batch))

    pd.DataFrame(collected, columns=['SMILES']).to_csv(config.gen_save,
                                                       index=False)
Beispiel #7
0
def main(model, config):
    """Sample SMILES from a trained registry model and write them to CSV."""
    set_seed(config.seed)
    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    model = MODELS.get_model_class(model)(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    collected = []
    remaining = config.n_samples
    with tqdm(total=config.n_samples, desc='Generating samples') as bar:
        while remaining > 0:
            batch = model.sample(min(remaining, config.n_batch),
                                 config.max_len)
            collected.extend(batch)
            remaining -= len(batch)
            bar.update(len(batch))

    pd.DataFrame(collected, columns=['SMILES']).to_csv(config.gen_save,
                                                       index=False)
Beispiel #8
0
def main(config):
    """Sample SMILES from a trained CharRNN checkpoint and write a CSV."""
    set_seed(config.seed)

    model_vocab = torch.load(config.vocab_load)
    model_config = torch.load(config.config_load)
    model_state = torch.load(config.model_load)

    device = torch.device(config.device)

    model = CharRNN(model_vocab, model_config.hidden, model_config.num_layers,
                    model_config.dropout, device)
    model.load_state_dict(model_state)
    model = model.to(device=device)
    model.eval()

    gen_smiles = []

    # TODO: n_samples % batch = 0
    for _ in tqdm.tqdm(range(config.n_samples // config.n_batch)):
        # Decode each sampled token-id sequence back into a SMILES string.
        for token_ids in model.sample_smiles(config.max_len, config.n_batch):
            gen_smiles.append(
                model_vocab.ids2string([tok.item() for tok in token_ids]))

    pd.DataFrame(gen_smiles, columns=['SMILES']).to_csv(config.gen_save,
                                                        index=False)
Beispiel #9
0
def main(config):
    """Encode DeepAffinity SMILES with a pretrained VAE encoder to a CSV."""
    set_seed(config.seed)
    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    # Hardcode: checkpoint locations are fixed to the vae_1 run.
    ckpt_dir = pjoin(BASE_DIR, 'checkpoints', 'vae', 'vae_1')
    model_config = torch.load(pjoin(ckpt_dir, 'vae_config.pt'))
    model_vocab = torch.load(pjoin(ckpt_dir, 'vae_vocab.pt'))
    model_state = torch.load(pjoin(ckpt_dir, 'vae_model.pt'))

    model = VAEEncode(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    smile = pd.read_csv(pjoin('/mol_data', 'DeepAffinity', 'merged_data.tsv'),
                        delimiter='\t')
    smile = smile['Canonical SMILE'].values

    embeds = []
    for smi in tqdm(smile, desc='Running VAE encoder'):
        tensor = model.string2tensor(smi)
        # Noise-free encoding; take the single batch element.
        embeds.append(model.forward_encoder_no_noise([tensor]).cpu().numpy()[0])

    pd.DataFrame(embeds).to_csv(pjoin(ckpt_dir, 'embeds.csv'), index=False)
Beispiel #10
0
def main(config):
    """Train a junction-tree VAE on SMILES data and save the artifacts."""
    set_seed(config.seed)

    device = torch.device(config.device)

    data = read_smiles_csv(config.train_load)

    # Reuse a previously saved vocabulary when one exists on disk.
    vocab = None
    if config.vocab_save is not None and os.path.exists(config.vocab_save):
        vocab = torch.load(config.vocab_save)

    corpus = JTreeCorpus(config.n_batch, device).fit(dataset=data,
                                                     vocabulary=vocab,
                                                     n_jobs=config.n_jobs)
    torch.save(corpus.vocab, config.vocab_save)
    train_dataloader = corpus.transform(data, num_workers=config.n_jobs)

    model = JTNNVAE(corpus.vocab, config.hidden, config.latent, config.depth)
    model = model.to(device=device)

    # Biases (1-D params) to zero, weight matrices Xavier-initialized.
    for param in model.parameters():
        if param.dim() == 1:
            nn.init.constant_(param, 0)
        else:
            nn.init.xavier_normal_(param)

    JTreeTrainer(config).fit(model, train_dataloader)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
Beispiel #11
0
def main(config):
    """Train an ORGAN model, optionally conditioned on fingerprints."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(train)
    torch.save(vocab, config.vocab_save)
    torch.save(config, config.config_save)
    device = torch.device(config.device)

    # condition mode
    if config.conditional:
        raw_fps = fps_to_list(read_fps_csv(config.train_load))
        fps = [torch.tensor(f, dtype=torch.float, device=device)
               for f in raw_fps]
        # fingerprints length
        fps_len = len(fps[0])
    else:
        fps = None
        fps_len = 0

    with Pool(config.n_jobs) as pool:
        reward_func = MetricsReward(train, config.n_ref_subsample,
                                    config.rollouts, pool,
                                    config.addition_rewards)
        model = ORGAN(vocab, config, fps_len, reward_func).to(device)

        ORGANTrainer(config).fit(model, train, fps)

    torch.save(model.state_dict(), config.model_save)
Beispiel #12
0
def main(model, config):
    """Generate (and optionally reconstruct) samples from a trained model.

    Supports loading either a regular PyTorch state dict or LBANN-exported
    weights when config.lbann_weights_dir is set.
    """
    set_seed(config.seed)
    device = torch.device(config.device)
    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    if config.lbann_weights_dir:
        assert os.path.exists(config.lbann_weights_dir), (
            "LBANN inference mode is specified but directory "
            " to load weights does not exist: '{}'".format(
                config.lbann_weights_dir))

    model_config = torch.load(config.config_load)
    trainer = MODELS.get_model_trainer(model)(model_config)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    model = MODELS.get_model_class(model)(model_vocab, model_config)
    # Fix: guard the truthiness first — os.path.exists(None) raises
    # TypeError when no LBANN directory is configured.
    if config.lbann_weights_dir and os.path.exists(config.lbann_weights_dir):
        model.load_lbann_weights(config.lbann_weights_dir,
                                 config.lbann_epoch_counts)
    else:
        # assume that a non-LBANN model is being loaded
        model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    if config.save_reconstruction:
        test_data = read_smiles_csv(config.test_path)
        print("Reconstructing ", len(test_data), " of ", config.test_path,
              " test samples")
        test_loader = trainer.get_dataloader(model, test_data, shuffle=False)
        tqdm_data = tqdm(test_loader, desc='Reconstruction')
        model.reconstruct(tqdm_data, config.pred_save)
        print("Reconstructed samples of ", config.test_path, " saved to ",
              config.pred_save)

    samples = []
    n = config.n_samples
    print("Generating Samples")
    with tqdm(total=config.n_samples, desc='Generating samples') as T:
        while n > 0:
            current_samples = model.sample(min(n, config.n_batch),
                                           config.max_len)
            samples.extend(current_samples)

            n -= len(current_samples)
            T.update(len(current_samples))

    samples = pd.DataFrame(samples, columns=['SMILES'])
    print("Save generated samples to ", config.gen_save)
    samples.to_csv(config.gen_save, index=False)
Beispiel #13
0
def main(config):
    """Run the GuacaMol distribution-learning benchmark on an ORGAN model."""
    setup_default_logger()
    set_seed(config.seed)

    generator = OrganGenerator(config)

    json_path = os.path.join(config.output_dir,
                             'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=config.dist_file,
                                 json_output_file=json_path,
                                 benchmark_version=config.suite)
Beispiel #14
0
def main(config):
    """Run the GuacaMol distribution-learning benchmark on a VAE model."""
    setup_default_logger()
    set_seed(config.seed)

    # Default the output directory to the directory containing this script.
    if config.output_dir is None:
        config.output_dir = os.path.dirname(os.path.realpath(__file__))

    generator = VaeGenerator(config)

    json_path = os.path.join(config.output_dir,
                             'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=config.dist_file,
                                 json_output_file=json_path,
                                 benchmark_version=config.suite)
Beispiel #15
0
def main(config):
    """Train a CharRNN language model on SMILES and save the artifacts."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)

    device = torch.device(config.device)

    corpus = OneHotCorpus(config.batch, device)
    train_dataloader = corpus.fit(train).transform(train)

    model = CharRNN(corpus.vocab, config.hidden, config.num_layers,
                    config.dropout, device).to(device)
    CharRNNTrainer(config).fit(model, train_dataloader)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
Beispiel #16
0
def main(config):
    """Train an unconditional AAE on SMILES data and save the artifacts."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)

    vocab = CharVocab.from_data(train)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)

    model = AAE(vocab, config).to(device)

    AAETrainer(config).fit(model, train)

    # Move back to CPU before serializing the weights.
    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
Beispiel #17
0
def main(config):
    """Sample SMILES from a trained (optionally conditional) AAE.

    In conditional mode, fingerprint-center labels are read from
    config.label_load and passed to the sampler; n_labels samples are drawn
    per iteration, for config.n_samples iterations.
    """
    set_seed(config.seed)

    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    device = torch.device(config.device)

    model = AAE(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    if model_config.conditional_model:
        test = pd.read_csv(config.label_load,
                           usecols=['fingerprints_center'],
                           squeeze=True).astype(str).tolist()
        # Each label string becomes a list of per-character int flags.
        labels = [[int(x) for x in list(t)] for t in test]
        labels = np.array(labels)
        # NOTE(review): uses .cuda() directly instead of .to(device) —
        # confirm this path is CUDA-only.
        labels = torch.FloatTensor(labels).cuda()
    else:
        labels = None

    samples = []
    n = config.n_samples
    n_labels = config.n_labels
    with tqdm.tqdm(total=config.n_samples * n_labels,
                   desc='Generating samples') as T:
        while n > 0:
            current_samples = model.sample(n_labels, config.max_len, labels)
            samples.append(current_samples)
            n -= 1
            T.update(n_labels)

    samples = np.transpose(np.array(samples)).tolist()
    # Fix: write through a context manager so the handle is closed even if
    # a write fails.
    with open(config.gen_save, 'w') as output:
        output.write('SMILES\n')
        for i in range(len(samples)):
            for j in range(len(samples[0])):
                output.write('{0}\n'.format(samples[i][j]))
Beispiel #18
0
def main(config):
    """Train an unconditional ORGAN model and save model/config/vocab."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(train)
    device = torch.device(config.device)

    # The metrics reward needs a worker pool for the rollout evaluations.
    with Pool(config.n_jobs) as pool:
        reward_func = MetricsReward(train, config.n_ref_subsample,
                                    config.rollouts, pool,
                                    config.addition_rewards)
        model = ORGAN(vocab, config, reward_func).to(device)

        ORGANTrainer(config).fit(model, train)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)
Beispiel #19
0
def main(config):
    """Train a VAE on SMILES data, optionally fingerprint-conditional."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)

    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    corpus = OneHotCorpus(config.n_batch, device)
    train = corpus.fit(train).transform(train)

    # condition mode
    if config.conditional:
        fps = read_fps_csv(config.train_load)
        fps = fps_to_list(fps)
        fps = [torch.tensor(f, dtype=torch.float, device=device) for f in fps]

        # fingerprints length
        fps_len = len(fps[0])

        # fingerprints dataloader
        fps = corpus.fps_transform(fps)

        # Fix: random.shuffle needs a mutable sequence; shuffling the lazy
        # zip iterator raised TypeError. Materialize the pairs first.
        train = list(zip(train, fps))
        shuffle(train)
    else:
        fps_len = 0

    model = VAE(corpus.vocab, fps_len, config).to(device)

    trainer = VAETrainer(config)

    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
    trainer.fit(model, train, config.conditional)
def main(config):
    """Train an unconditional VAE on SMILES data and save the artifacts."""
    set_seed(config.seed)

    train = read_smiles(config.train_load)

    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    corpus = OneHotCorpus(config.n_batch, device)
    batches = corpus.fit(train).transform(train)

    model = VAE(corpus.vocab, config).to(device)

    trainer = VAETrainer(config)

    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
    trainer.fit(model, batches)
Beispiel #21
0
def main(config):
    """Sample SMILES from a trained junction-tree VAE and write a CSV."""
    set_seed(config.seed)

    model_vocab = torch.load(config.vocab_load)
    model_config = torch.load(config.config_load)
    model_state = torch.load(config.model_load)

    device = torch.device(config.device)

    model = JTNNVAE(model_vocab, model_config.hidden, model_config.latent,
                    model_config.depth)
    model.load_state_dict(model_state)
    model = model.to(device=device)
    model.eval()

    # One molecule per prior draw, with stochastic decoding.
    gen_smiles = [model.sample_prior(prob_decode=True)
                  for _ in tqdm.trange(config.n_samples)]

    pd.DataFrame(gen_smiles, columns=['SMILES']).to_csv(config.gen_save,
                                                        index=False)
Beispiel #22
0
def main(config):
    """Train a binding-affinity model on DeepAffinity embeddings/sequences."""
    set_seed(config.seed)
    device = torch.device(config.device)

    if config.config_save is not None:
        torch.save(config, config.config_save)

    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    aff_data = pd.read_csv(pjoin('/mol_data', 'DeepAffinity',
                                 'merged_data.tsv'), delimiter='\t')
    print(aff_data)
    embed_data = pd.read_csv(pjoin('/mol_data', 'embeds.csv'), delimiter='\t')

    affinity_cols = ['pEC50_[M]', 'pIC50_[M]', 'pKd_[M]', 'pKi_[M]']
    # First 10k rows are held out for validation; the rest train.
    train_data = (
        embed_data.values[10000:],
        aff_data['Sequence'].values[10000:],
        aff_data[affinity_cols].values[10000:],
    )
    val_data = (
        embed_data.values[:10000],
        aff_data['Sequence'].values[:10000],
        aff_data[affinity_cols].values[:10000],
    )
    trainer = BindingTrainer(config)

    # Vocabulary is built from the protein sequences.
    vocab = trainer.get_vocabulary(train_data[1])

    if config.vocab_save is not None:
        torch.save(vocab, config.vocab_save)

    model = Binding(vocab, config).to(device)
    trainer.fit(model, train_data, val_data)

    model = model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
Beispiel #23
0
def main(config):
    """Sample from a trained ORGAN model, optionally fingerprint-conditioned.

    In conditional mode each target fingerprint from config.condition_load
    is iterated; a Tanimoto similarity summary is computed afterwards.
    """
    set_seed(config.seed)

    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    device = torch.device(config.device)

    # condition mode
    if config.conditional:
        print('Conditional generation')
        # target fingerprints (deduplicated)
        raw = fps_to_list(list(set(read_fps_csv(config.condition_load))))
        fps_center = [torch.tensor(f, dtype=torch.float, device=device)
                      for f in raw]
        fps_len = len(fps_center[0])  # target fingerprints length
        fps_num = len(fps_center)     # target fingerprints number
    else:
        fps_center = [None]
        fps_len = 0
        fps_num = 1

    model = ORGAN(model_vocab, model_config, fps_len)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    # NOTE(review): `n` is shared across the fingerprint loop, so with
    # fps_num > 1 only the first fingerprint generates samples — confirm
    # this is the intended behavior.
    gen_samples = []
    n = config.n_samples

    with tqdm.tqdm(total=config.n_samples, desc='Generating samples') as T:
        for i in range(fps_num):
            samples = []
            while n > 0:
                fps = fps_center[i]
                if config.conditional:
                    fps = fps_center[i].unsqueeze(0)

                batch = model.sample(fps, config.conditional,
                                     min(n, config.n_batch),
                                     config.max_len)
                samples.extend(batch)

                n -= len(batch)
                T.update(len(batch))

            gen_samples.extend(samples)

        df = pd.DataFrame(gen_samples, columns=['SMILES'])
        df.to_csv(config.gen_save, index=False)

    # tanimoto similarity score and summary
    if config.conditional:
        calculate_score(config.gen_save, config.condition_load,
                        config.n_samples)
Beispiel #24
0
def main(config):
    """Sample from a trained VAE, optionally conditioned on fingerprints.

    In conditional mode config.n_samples molecules are generated per target
    fingerprint, and a Tanimoto similarity summary is computed afterwards.
    """
    set_seed(config.seed)

    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    # condition mode
    if config.conditional:
        # target fingerprints (deduplicated)
        fps_center = read_fps_csv(config.condition_load)
        fps_center = fps_to_list(list(set(fps_center)))
        fps_center = [
            torch.tensor(f, dtype=torch.float, device=device)
            for f in fps_center
        ]
        # target fingerprints length
        fps_len = len(fps_center[0])
        # target fingerprints number
        fps_num = len(fps_center)
    else:
        fps_center = [None]
        fps_len = 0
        fps_num = 1

    model = VAE(model_vocab, fps_len, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    gen_samples = []

    # sample (one progress bar shared across all fingerprints)
    T = tqdm.tqdm(range(config.n_samples), desc='Generating mols')

    for i in range(fps_num):
        # Fix: removed the dead pre-loop `gen, n = [], config.n_samples` —
        # both names were always re-assigned here before first use.
        gen, n = [], config.n_samples
        while n > 0:
            x = model.sample(fps_center[i], config.conditional,
                             min(n, config.n_batch), config.max_len)[-1]
            mols = [model_vocab.ids2string(i_x.tolist()) for i_x in x]
            n -= len(mols)
            T.update(len(mols))
            T.refresh()
            gen.extend(mols)

        gen_samples.extend(gen)

    df = pd.DataFrame(gen_samples, columns=['SMILES'])
    df.to_csv(config.gen_save, index=False)

    # tanimoto similarity score and summary
    if config.conditional:
        calculate_score(config.gen_save, config.condition_load,
                        config.n_samples)