def run_addcarbon(smi_train, results_base, seed=None):
    """Run the AddCarbon distribution-learning baseline.

    Builds an AddCarbonSampler on the training SMILES, samples 10k
    molecules, writes them to a timestamped results directory, runs the
    GuacaMol distribution-learning assessment, and prints each
    benchmark's score.

    Args:
        smi_train: training SMILES source, as expected by
            AddCarbonSampler and assess_distribution_learning.
        results_base: base directory; a timestamped subdirectory is
            created under it for this run's outputs.
        seed: optional integer seed for NumPy's global RNG.
    """
    # Seed exactly once, and only when a seed was supplied. The original
    # code additionally called np.random.seed(seed) unconditionally a
    # second time, which re-seeds from OS entropy when seed is None.
    if seed is not None:
        np.random.seed(seed)

    # setup results directory (timestamped so repeated runs don't collide)
    dir_results = Path(results_base) / timestamp()
    dir_results.mkdir(parents=True)

    add_carbon = AddCarbonSampler(smi_train)

    # dump the raw samples alongside the benchmark results
    sampled_smiles = add_carbon.generate(10000)
    fn_sampled = dir_results / 'addcarbon_smiles.txt'
    with open(fn_sampled, 'w') as f:
        f.write('\n'.join(sampled_smiles))

    fn_guacamol_results = str(dir_results / 'guacamol_results.json')
    assess_distribution_learning(add_carbon,
                                 smi_train,
                                 json_output_file=fn_guacamol_results)

    # re-read the JSON the assessment wrote and echo a per-benchmark summary
    with open(fn_guacamol_results) as f:
        results = json.load(f)

    for b in results['results']:
        print(f"{b['benchmark_name']}: {b['score']}")

    print(f'Saved results in: {dir_results}')
def main(params):
    """Load an autoregressive SMILES-transformer checkpoint and either run
    the GuacaMol distribution-learning suite or just dump generated SMILES.
    """
    # fix random seeds and force autoregressive decoding
    set_seed(params.seed)
    params.ar = True

    if not os.path.isdir(params.output_dir):
        os.makedirs(params.output_dir)
    print("Loading the model from {0}".format(params.model_path))
    # restore hyper-parameters, dictionary and weights from the checkpoint
    model_params, dico, model = reload_ar_checkpoint(params.model_path)
    if params.local_cpu is False:
        model = model.cuda()

    # wrap the model in a GuacaMol-compatible generator
    generator = SmilesTransformerGenerator(params, dico, model,
                                           params.sample_temperature)
    json_file_path = os.path.join(params.output_dir,
                                  'distribution_learning_results.json')
    smiles_output_path = os.path.join(params.output_dir,
                                      'generated_smiles.txt')

    print("Starting distributional evaluation")
    start = time.time()
    if params.evaluate is True:
        # full benchmark suite; writes its own JSON report
        assess_distribution_learning(generator,
                                     chembl_training_file=params.dist_file,
                                     json_output_file=json_file_path,
                                     benchmark_version=params.suite)
    else:
        # sampling only: write one SMILES per line
        generated = generator.generate(params.num_samples)
        with open(smiles_output_path, 'w') as f:
            f.writelines(s + '\n' for s in generated)
    elapsed = time.time() - start
    print("Total time taken {}".format(elapsed))
# Example #3
def main():
    """CLI entry point for the graph-based MCTS distribution-learning baseline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--smiles_file',
        default='data/guacamol_v1_all.smiles',
        help='Location of the ChEMBL dataset to use for the distribution benchmarks.')
    parser.add_argument(
        '--pickle_directory',
        default=None,
        help='Directory containing pickle files with the distribution statistics')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n_jobs', type=int, default=-1)
    parser.add_argument('--generations', type=int, default=1000)
    parser.add_argument('--population_size', type=int, default=100)
    parser.add_argument('--num_sims', type=int, default=40)
    parser.add_argument('--max_children', type=int, default=25)
    parser.add_argument('--max_atoms', type=int, default=60)
    parser.add_argument('--init_smiles', type=str, default='CC')
    parser.add_argument('--random_start', action='store_true')
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--suite', default='v2')
    args = parser.parse_args()

    # default both directories to the location of this script
    script_dir = os.path.dirname(os.path.realpath(__file__))
    if args.output_dir is None:
        args.output_dir = script_dir
    if args.pickle_directory is None:
        args.pickle_directory = script_dir

    np.random.seed(args.seed)

    setup_default_logger()

    # save command line args next to the results for reproducibility
    params_path = os.path.join(args.output_dir,
                               'distribution_learning_params.json')
    with open(params_path, 'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    sampler = GB_MCTS_Sampler(pickle_directory=args.pickle_directory,
                              n_jobs=args.n_jobs,
                              random_start=args.random_start,
                              num_sims=args.num_sims,
                              max_children=args.max_children,
                              init_smiles=args.init_smiles,
                              max_atoms=args.max_atoms,
                              generations=args.generations,
                              population_size=args.population_size)

    json_file_path = os.path.join(args.output_dir,
                                  'distribution_learning_results.json')
    assess_distribution_learning(sampler,
                                 json_output_file=json_file_path,
                                 chembl_training_file=args.smiles_file,
                                 benchmark_version=args.suite)
# Example #4
def main(config):
    """Run the GuacaMol distribution-learning suite with an ORGAN generator."""
    setup_default_logger()
    set_seed(config.seed)

    generator = OrganGenerator(config)

    # results JSON is written by the assessment itself
    results_path = os.path.join(config.output_dir,
                                'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=config.dist_file,
                                 json_output_file=results_path,
                                 benchmark_version=config.suite)
# Example #5
def main(config):
    """Run the GuacaMol distribution-learning suite with a VAE generator."""
    setup_default_logger()
    set_seed(config.seed)

    # default the output directory to this script's location
    if config.output_dir is None:
        config.output_dir = os.path.dirname(os.path.realpath(__file__))

    generator = VaeGenerator(config)

    results_path = os.path.join(config.output_dir,
                                'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=config.dist_file,
                                 json_output_file=results_path,
                                 benchmark_version=config.suite)
# Example #6
def main(params):
    """Load an AR SMILES-transformer checkpoint and run the GuacaMol
    distribution-learning benchmarks against it.
    """
    # setup random seeds
    set_seed(params.seed)
    params.ar = True

    print("Loading the model from {0}".format(params.model_path))
    # load everything from checkpoint
    model_params, dico, model = reload_ar_checkpoint(params.model_path)
    if params.local_cpu is False:
        model = model.cuda()
    # evaluate distributional results
    generator = SmilesTransformerGenerator(params, dico, model)
    # write the results JSON next to the checkpoint file
    json_file_path = os.path.join('/'.join(params.model_path.split('/')[:-1]),
                                  'distribution_learning_results.json')
    print("Starting distributional evaluation")
    t1 = time.time()
    assess_distribution_learning(generator,
                                 chembl_training_file=params.dist_file,
                                 json_output_file=json_file_path,
                                 benchmark_version=params.suite)
    t2 = time.time()
    print("Total time taken {}".format(t2 - t1))
    # NOTE(review): everything from here down looks like an unrelated script
    # (argparse setup and SMILES-RNN loading for a *different* benchmark run)
    # that was concatenated into this function body. It executes after the
    # evaluation above, re-seeds, loads a second model, and runs the
    # assessment again. Verify whether it belongs in this function at all.
    parser = argparse.ArgumentParser(description='Distribution learning benchmark for SMILES RNN',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', default=42, type=int, help='Random seed')
    parser.add_argument('--model_path', default=None, help='Full path to SMILES RNN model')
    parser.add_argument('--output_dir', default=None, help='Output directory')
    parser.add_argument('--dist_file', default='data/guacamol_v1_all.smiles', help='Distribution file')

    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info(f'device:\t{device}')

    set_random_seed(args.seed, device)

    # default output dir / model path to locations relative to this script
    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    if args.model_path is None:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        args.model_path = os.path.join(dir_path, 'pretrained_model', 'model_final_0.473.pt')

    # the model definition JSON is expected alongside the .pt weights
    model_def = Path(args.model_path).with_suffix('.json')
    model = load_rnn_model(model_def, args.model_path, device, copy_to_cpu=True)
    generator = SmilesRnnGenerator(model=model, device=device)

    json_file_path = os.path.join(args.output_dir, 'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=args.dist_file,
                                 json_output_file=json_file_path)
# Example #8
    # NOTE(review): this is a fragment — the enclosing function/parser setup
    # was cut off above (presumably the same SMILES-RNN argparse block as in
    # the previous example, plus a '--suite' option). Code below is unchanged.
    parser.add_argument('--suite', default='v2')

    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info(f'device:\t{device}')

    set_random_seed(args.seed, device)

    # default output dir / model path to locations relative to this script
    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    if args.model_path is None:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        args.model_path = os.path.join(dir_path, 'pretrained_model',
                                       'model_final_0.473.pt')

    # the model definition JSON is expected alongside the .pt weights
    model_def = Path(args.model_path).with_suffix('.json')
    model = load_rnn_model(model_def,
                           args.model_path,
                           device,
                           copy_to_cpu=True)
    generator = SmilesRnnGenerator(model=model, device=device)

    json_file_path = os.path.join(args.output_dir,
                                  'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=args.dist_file,
                                 json_output_file=json_file_path,
                                 benchmark_version=args.suite)
# Example #9
from guacamol.assess_distribution_learning import assess_distribution_learning
from generative_playground.molecules.guacamol_utils import DummyMoleculeGenerator
from generative_playground.molecules.lean_settings import get_data_location

# # naive sampling
# train_mols = get_data_location(source="ChEMBL:train")['source_data']
# my_gen = DummyMoleculeGenerator('distribution_naive_smiles.zip', maximize_reward=False)
# assess_distribution_learning(my_gen, train_mols)

# stable-ish run with discriminator sampling
#
# Archives of previously generated SMILES to evaluate; the index into
# `files` selects which experiment's samples are assessed.
files = [
    # no-priors run (two shards)
    ['distribution_naive_no_priors_smiles.zip',
     'distribution_naive_no_priors_smiles_2.zip'],
    # unconditional priors
    'distribution_naive_uncond_priors_smiles.zip',
    # conditional priors
    'distribution_naive_smiles.zip',
    # discriminator sampling, eps=0.2
    'distribution_discr_eps0.2_smiles.zip',
]

train_mols = get_data_location(source="ChEMBL:train")['source_data']
# NOTE(review): keep_last is passed as the float 1e4 — confirm the generator
# accepts a float where a sample count is presumably expected.
my_gen = DummyMoleculeGenerator(files[0],
                                maximize_reward=False,
                                keep_last=1e4)
assess_distribution_learning(my_gen, train_mols)