Ejemplo n.º 1
0
def main(model, config):
    """Train the requested model on SMILES data and persist weights/vocab/config."""
    set_seed(config.seed)
    device = torch.device(config.device)

    # Persist the run configuration up front, if requested.
    if config.config_save is not None:
        torch.save(config, config.config_save)

    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    train_data = read_smiles_csv(config.train_load)
    val_data = read_smiles_csv(config.val_load) if config.val_load else None
    trainer = MODELS.get_model_trainer(model)(config)

    # Load a prebuilt vocabulary when a path is given, otherwise derive
    # one from the training data.
    if config.vocab_load is None:
        vocab = trainer.get_vocabulary(train_data)
    else:
        assert os.path.exists(config.vocab_load), \
            'vocab_load path does not exist!'
        vocab = torch.load(config.vocab_load)

    if config.vocab_save is not None:
        torch.save(vocab, config.vocab_save)

    model = MODELS.get_model_class(model)(vocab, config).to(device)
    trainer.fit(model, train_data, val_data)

    # Move to CPU before serializing the state dict.
    model = model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
Ejemplo n.º 2
0
def main(config, print_metrics=True):
    """Compute generation metrics; print as CSV lines or return the dict."""
    # Optional reference datasets stay None when their paths are absent.
    test = read_smiles_csv(config.test_path) if config.test_path else None
    test_scaffolds = None
    if config.test_scaffolds_path is not None:
        test_scaffolds = read_smiles_csv(config.test_scaffolds_path)
    train = None
    if config.train_path is not None:
        train = read_smiles_csv(config.train_path)
    # Precomputed statistics are stored as a pickled dict under 'stats'.
    ptest = None
    if config.ptest_path is not None:
        ptest = np.load(config.ptest_path, allow_pickle=True)['stats'].item()
    ptest_scaffolds = None
    if config.ptest_scaffolds_path is not None:
        ptest_scaffolds = np.load(
            config.ptest_scaffolds_path, allow_pickle=True)['stats'].item()
    gen = read_smiles_csv(config.gen_path)
    metrics = get_all_metrics(gen=gen,
                              k=config.ks,
                              n_jobs=config.n_jobs,
                              device=config.device,
                              test_scaffolds=test_scaffolds,
                              ptest=ptest,
                              ptest_scaffolds=ptest_scaffolds,
                              test=test,
                              train=train)

    if not print_metrics:
        return metrics
    for name, value in metrics.items():
        print('{},{}'.format(name, value))
Ejemplo n.º 3
0
def main(config, print_metrics=True):
    """Evaluate generated SMILES against a test set; print or return metrics."""
    test = read_smiles_csv(config.test_path)
    test_scaffolds = None
    ptest = None
    ptest_scaffolds = None
    if config.test_scaffolds_path is not None:
        test_scaffolds = read_smiles_csv(config.test_scaffolds_path)
    # Precomputed stats are optional: warn (don't fail) when files are missing.
    if config.ptest_path is not None:
        if os.path.exists(config.ptest_path):
            ptest = np.load(config.ptest_path)['stats'].item()
        else:
            warnings.warn(f'{config.ptest_path} does not exist')
    if config.ptest_scaffolds_path is not None:
        if os.path.exists(config.ptest_scaffolds_path):
            ptest_scaffolds = np.load(
                config.ptest_scaffolds_path)['stats'].item()
        else:
            warnings.warn(f'{config.ptest_scaffolds_path} does not exist')
    gen = read_smiles_csv(config.gen_path)
    metrics = get_all_metrics(test,
                              gen,
                              k=config.ks,
                              n_jobs=config.n_jobs,
                              device=config.device,
                              test_scaffolds=test_scaffolds,
                              ptest=ptest,
                              ptest_scaffolds=ptest_scaffolds)

    if not print_metrics:
        return metrics
    for name, value in metrics.items():
        print('{},{}'.format(name, value))
Ejemplo n.º 4
0
Archivo: eval.py Proyecto: sparel/moses
def main(config, print_metrics=True):
    """Compare generated SMILES to a reference set; print or return metrics."""
    ref = read_smiles_csv(config.ref_path)
    gen = read_smiles_csv(config.gen_path)
    metrics = get_all_metrics(ref, gen, k=config.ks, n_jobs=config.n_jobs,
                              gpu=config.device_code)

    if not print_metrics:
        return metrics
    print('Metrics:')
    for name, value in metrics.items():
        print('\t' + name + ' = {}'.format(value))
Ejemplo n.º 5
0
def main(config):
    """Build the JTree vocabulary and persist it to disk."""
    # NOTE(review): the CSV is read but never handed to the trainer —
    # get_vocabulary() takes no arguments here; confirm this is intended.
    data = read_smiles_csv(config.train_load)

    vocab = JTreeTrainer(config).get_vocabulary()
    torch.save(vocab, config.vocab_save)
Ejemplo n.º 6
0
def main(config):
    """Train an AAE (optionally label-conditional) and save its artifacts."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    if config.conditional_model:
        labels = read_label_csv(config.train_load)
        config.labels_size = len(labels[0])
        # Expand each label string into a list of integer flags.
        labels = [[int(ch) for ch in list(lab)] for lab in labels]
        train_data = list(zip(train, labels))
    else:
        train_data = list(train)
    shuffle(train_data)
    # Cap the training set size at 500k samples.
    train_data = train_data[:500000]
    # The vocabulary is built from the full (uncapped) SMILES list.
    vocab = CharVocab.from_data(train)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)

    model = AAE(vocab, config).to(device)

    trainer = AAETrainer(config)
    trainer.fit(model, train_data)

    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
Ejemplo n.º 7
0
def main(config):
    """Train a JTNN-VAE on SMILES data and save model/vocab/config."""
    set_seed(config.seed)

    device = torch.device(config.device)

    data = read_smiles_csv(config.train_load)

    # Reuse a previously saved vocabulary if one already exists on disk.
    vocab = None
    if config.vocab_save is not None and os.path.exists(config.vocab_save):
        vocab = torch.load(config.vocab_save)
    corpus = JTreeCorpus(config.n_batch, device).fit(dataset=data,
                                                     vocabulary=vocab,
                                                     n_jobs=config.n_jobs)
    torch.save(corpus.vocab, config.vocab_save)
    train_dataloader = corpus.transform(data, num_workers=config.n_jobs)

    model = JTNNVAE(corpus.vocab, config.hidden, config.latent, config.depth)
    model = model.to(device=device)

    # 1-D parameters (biases) start at zero; all others get Xavier init.
    for p in model.parameters():
        if p.dim() == 1:
            nn.init.constant_(p, 0)
        else:
            nn.init.xavier_normal_(p)

    trainer = JTreeTrainer(config)
    trainer.fit(model, train_dataloader)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
Ejemplo n.º 8
0
def main(config):
    """Train an ORGAN model, optionally conditioned on fingerprints."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(train)
    torch.save(vocab, config.vocab_save)
    torch.save(config, config.config_save)
    device = torch.device(config.device)

    # condition mode
    if config.conditional:
        fps = fps_to_list(read_fps_csv(config.train_load))
        fps = [torch.tensor(f, dtype=torch.float, device=device) for f in fps]
        fps_len = len(fps[0])  # fingerprints length
    else:
        fps = None
        fps_len = 0

    # The metrics-based reward needs a worker pool for its lifetime.
    with Pool(config.n_jobs) as pool:
        reward_func = MetricsReward(train, config.n_ref_subsample,
                                    config.rollouts, pool,
                                    config.addition_rewards)
        model = ORGAN(vocab, config, fps_len, reward_func).to(device)

        trainer = ORGANTrainer(config)
        trainer.fit(model, train, fps)

    torch.save(model.state_dict(), config.model_save)
Ejemplo n.º 9
0
def main(model, config):
    """Load a trained model, optionally reconstruct a test set, and sample.

    Fix: guard ``os.path.exists`` against a falsy ``config.lbann_weights_dir``
    — ``os.path.exists(None)`` raises TypeError, and the assert above shows
    the value may legitimately be unset.
    """
    set_seed(config.seed)
    device = torch.device(config.device)
    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    if config.lbann_weights_dir:
        assert os.path.exists(config.lbann_weights_dir), (
            "LBANN inference mode is specified but directory "
            " to load weights does not exist: '{}'".format(
                config.lbann_weights_dir))

    model_config = torch.load(config.config_load)
    trainer = MODELS.get_model_trainer(model)(model_config)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    model = MODELS.get_model_class(model)(model_vocab, model_config)
    # BUG FIX: previously os.path.exists() was called unconditionally and
    # raised TypeError when lbann_weights_dir was None; check truthiness first.
    if config.lbann_weights_dir and os.path.exists(config.lbann_weights_dir):
        model.load_lbann_weights(config.lbann_weights_dir,
                                 config.lbann_epoch_counts)
    else:
        # assume that a non-LBANN model is being loaded
        model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    if config.save_reconstruction:
        test_data = read_smiles_csv(config.test_path)
        print("Reconstructing ", len(test_data), " of ", config.test_path,
              " test samples")
        test_loader = trainer.get_dataloader(model, test_data, shuffle=False)
        tqdm_data = tqdm(test_loader, desc='Reconstruction')
        model.reconstruct(tqdm_data, config.pred_save)
        print("Reconstructed samples of ", config.test_path, " saved to ",
              config.pred_save)

    # Sample in batches of at most n_batch until n_samples are generated.
    samples = []
    n = config.n_samples
    print("Generating Samples")
    with tqdm(total=config.n_samples, desc='Generating samples') as T:
        while n > 0:
            current_samples = model.sample(min(n, config.n_batch),
                                           config.max_len)
            samples.extend(current_samples)

            n -= len(current_samples)
            T.update(len(current_samples))

    samples = pd.DataFrame(samples, columns=['SMILES'])
    print("Save generated samples to ", config.gen_save)
    samples.to_csv(config.gen_save, index=False)
Ejemplo n.º 10
0
def main(config):
    """Collect the unique cluster SMILES from training data into a JTreeVocab."""
    data = read_smiles_csv(config.train_load)

    clusters = set()
    for smiles in tqdm.tqdm(data):
        # Each molecule contributes the SMILES of its junction-tree nodes.
        for node in MolTree(smiles).nodes:
            clusters.add(node.smiles)

    vocab = JTreeVocab(sorted(clusters))
    torch.save(vocab, config.vocab_save)
Ejemplo n.º 11
0
def eval_metrics(eval_config, print_metrics=True):
    """Compute MOSES metrics for generated SMILES against a test set.

    Fixes: the body referenced an undefined global ``model_config`` instead
    of the ``eval_config`` parameter (NameError at call time); also collapsed
    the trailing if/else, which returned ``metrics`` on both branches.
    """
    # need to detect if file has the header or not
    test = read_smiles_csv(eval_config.test_path)
    test_scaffolds = None
    ptest = None
    ptest_scaffolds = None
    if eval_config.test_scaffolds_path is not None:
        test_scaffolds = read_smiles_csv(eval_config.test_scaffolds_path)
    # Precomputed stats are optional: warn (don't fail) when files are missing.
    if eval_config.ptest_path is not None:
        if not os.path.exists(eval_config.ptest_path):
            warnings.warn(f"{eval_config.ptest_path} does not exist")
        else:
            ptest = np.load(eval_config.ptest_path)["stats"].item()
    if eval_config.ptest_scaffolds_path is not None:
        if not os.path.exists(eval_config.ptest_scaffolds_path):
            warnings.warn(f"{eval_config.ptest_scaffolds_path} does not exist")
        else:
            ptest_scaffolds = np.load(
                eval_config.ptest_scaffolds_path)["stats"].item()
    gen = read_smiles_csv(eval_config.gen_save)
    metrics = get_all_metrics(
        test,
        gen,
        k=eval_config.ks,
        n_jobs=eval_config.n_jobs,
        gpu=eval_config.gpu,
        test_scaffolds=test_scaffolds,
        ptest=ptest,
        ptest_scaffolds=ptest_scaffolds,
    )

    if print_metrics:
        print("Metrics:")
        for name, value in metrics.items():
            print("\t" + name + " = {}".format(value))
    return metrics
Ejemplo n.º 12
0
def main(config):
    """Train a CharRNN language model on SMILES and save its artifacts."""
    set_seed(config.seed)

    smiles = read_smiles_csv(config.train_load)

    device = torch.device(config.device)

    corpus = OneHotCorpus(config.batch, device)
    dataloader = corpus.fit(smiles).transform(smiles)

    model = CharRNN(corpus.vocab, config.hidden, config.num_layers,
                    config.dropout, device).to(device)
    trainer = CharRNNTrainer(config)
    trainer.fit(model, dataloader)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
Ejemplo n.º 13
0
def main(config):
    """Train an unconditional AAE and save model, vocab, and config."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)

    vocab = CharVocab.from_data(train)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)

    model = AAE(vocab, config).to(device)

    trainer = AAETrainer(config)
    trainer.fit(model, train)

    # Serialize weights from the CPU.
    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
Ejemplo n.º 14
0
def main(config):
    """Train an ORGAN model with a metrics-based reward function."""
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(train)
    device = torch.device(config.device)

    # The reward function needs a worker pool for metric computation.
    with Pool(config.n_jobs) as pool:
        reward_func = MetricsReward(train, config.n_ref_subsample,
                                    config.rollouts, pool,
                                    config.addition_rewards)
        model = ORGAN(vocab, config, reward_func).to(device)

        trainer = ORGANTrainer(config)
        trainer.fit(model, train)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)
Ejemplo n.º 15
0
def main(config):
    """Train a VAE, optionally conditioned on molecular fingerprints.

    Fix: ``shuffle`` was called on a ``zip`` object; in Python 3 ``zip``
    returns an iterator and ``random.shuffle`` requires a mutable sequence,
    so the original raised TypeError in conditional mode. The pairs are now
    materialized with ``list`` first.
    """
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)

    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    corpus = OneHotCorpus(config.n_batch, device)
    train = corpus.fit(train).transform(train)

    # condition mode
    if config.conditional:
        fps = read_fps_csv(config.train_load)
        fps = fps_to_list(fps)
        fps = [torch.tensor(f, dtype=torch.float, device=device) for f in fps]

        # fingerprints length
        fps_len = len(fps[0])

        # fingerprints dataloader
        fps = corpus.fps_transform(fps)

        # BUG FIX: materialize the (sample, fingerprint) pairs so that
        # shuffle() can operate on a mutable sequence.
        train = list(zip(train, fps))
        shuffle(train)
    else:
        fps_len = 0

    model = VAE(corpus.vocab, fps_len, config).to(device)

    trainer = VAETrainer(config)

    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
    trainer.fit(model, train, config.conditional)
Ejemplo n.º 16
0
def main(config):
    """Train a VAE on one-hot encoded SMILES and save its artifacts."""
    set_seed(config.seed)

    smiles = read_smiles_csv(config.train_load)

    device = torch.device(config.device)

    # CUDNN requires the active CUDA device to be set explicitly.
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    corpus = OneHotCorpus(config.n_batch, device)
    dataloader = corpus.fit(smiles).transform(smiles)

    model = VAE(corpus.vocab, config).to(device)
    trainer = VAETrainer(config)

    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
    trainer.fit(model, dataloader)
Ejemplo n.º 17
0
    print("Validity % ", (is_valid.count(1)/data_size)*100)
    print("Same % ", (is_same.count(1)/data_size)*100)
    valid_tani_dist = [ t for t in tani_dist if t >= 0 ] 
    print("Average tanimoto ", np.mean(np.array(valid_tani_dist)))
    

    if output_file is not None:
        output_columns = ['original', 'decoded', 'is_valid', 'is_same', 'smile_accuracy','tanimoto_distance','total_avg_accuracy']
        res_df.to_csv(output_file, index=False, columns=output_columns)
    return(res_df)


# Script entry: decode LBANN tensor dumps into SMILES and compare the
# decoded molecules against the ground-truth inputs, writing a metrics CSV.
fdir = sys.argv[1] #directory of LBANN tensor outputs
sd = sys.argv[2]   #tag for different noise perturbation values
vocab_file = sys.argv[3] #vocab file from PyTorch MOSES library

sequence_length = 102 #max sequence length used in LBANN training (100+bos+eos)
zdim = 512 #latent space dimension
batch_num = 0 #used to control loading different batches of the dump (default 0)

get_smiles_from_lbann_tensors(fdir,sequence_length, zdim,vocab_file)

# Ground-truth vs. predicted SMILES files produced by the decode step above.
orig_file = read_smiles_csv(fdir+"gt_batch"+str(batch_num)+"smiles.txt")
pred_file = read_smiles_csv(fdir+"pred_batch"+str(batch_num)+"smiles.txt")
diff_file = fdir+"sd"+sd+"_smiles_metrics.csv"

print("Input/pred SMILES file sizes ", len(orig_file), " ", len(pred_file))

compare_decoded_to_original_smiles(orig_file, pred_file, diff_file)
print("Input/pred SMILES diff file saved to", diff_file)