Example 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--smiles_file',
        help='Location of the ChEMBL dataset to use for the distribution benchmarks.',
        default='data/guacamol_v1_all.smiles')
    parser.add_argument(
        '--pickle_directory',
        help='Directory containing pickle files with the distribution statistics',
        default=None)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n_jobs', type=int, default=-1)
    parser.add_argument('--generations', type=int, default=1000)
    parser.add_argument('--population_size', type=int, default=100)
    parser.add_argument('--num_sims', type=int, default=40)
    parser.add_argument('--max_children', type=int, default=25)
    parser.add_argument('--max_atoms', type=int, default=60)
    parser.add_argument('--init_smiles', type=str, default='CC')
    parser.add_argument('--random_start', action='store_true')
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--suite', default='v2')
    args = parser.parse_args()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    if args.pickle_directory is None:
        args.pickle_directory = os.path.dirname(os.path.realpath(__file__))

    np.random.seed(args.seed)

    setup_default_logger()

    # save command line args
    with open(
            os.path.join(args.output_dir, 'distribution_learning_params.json'),
            'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    sampler = GB_MCTS_Sampler(pickle_directory=args.pickle_directory,
                              n_jobs=args.n_jobs,
                              random_start=args.random_start,
                              num_sims=args.num_sims,
                              max_children=args.max_children,
                              init_smiles=args.init_smiles,
                              max_atoms=args.max_atoms,
                              generations=args.generations,
                              population_size=args.population_size)

    json_file_path = os.path.join(args.output_dir,
                                  'distribution_learning_results.json')
    assess_distribution_learning(sampler,
                                 json_output_file=json_file_path,
                                 chembl_training_file=args.smiles_file,
                                 benchmark_version=args.suite)
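
Like the other snippets on this page, the function above is shown without its module-level imports. A minimal header that would make it self-contained could look as follows; the import path for GB_MCTS_Sampler is an assumption, since only its call site is visible here.

import argparse
import json
import os

import numpy as np

from guacamol.assess_distribution_learning import assess_distribution_learning
from guacamol.utils.helpers import setup_default_logger

from graph_mcts.distribution_learning import GB_MCTS_Sampler  # hypothetical import path
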
Example 2
def main():
    max_oracle_num = 10000  ### total oracle-call budget; assumed value, defined elsewhere in the original
    population_size = 100  ### each generation costs one oracle call per molecule in the population
    max_children = 10
    generations_num = int(max_oracle_num / population_size / max_children)  ### e.g. 10000 / 100 / 10 = 10
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--pickle_directory',
        help='Directory containing pickle files with the distribution statistics',
        default=None)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n_jobs', type=int, default=-1)
    parser.add_argument('--generations', type=int, default=generations_num)
    parser.add_argument('--population_size', type=int, default=population_size)
    parser.add_argument('--num_sims', type=int, default=40)
    parser.add_argument('--max_children', type=int,
                        default=max_children)  ### reduced from the original default of 25
    parser.add_argument('--max_atoms', type=int, default=60)
    parser.add_argument('--init_smiles', type=str, default='CC')
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--suite', default='v3')
    args = parser.parse_args()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    if args.pickle_directory is None:
        args.pickle_directory = os.path.dirname(os.path.realpath(__file__))

    np.random.seed(args.seed)

    setup_default_logger()

    # save command line args
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'),
              'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = GB_MCTS_Generator(pickle_directory=args.pickle_directory,
                                  n_jobs=args.n_jobs,
                                  num_sims=args.num_sims,
                                  max_children=args.max_children,
                                  init_smiles=args.init_smiles,
                                  max_atoms=args.max_atoms,
                                  patience=args.patience,
                                  generations=args.generations,
                                  population_size=args.population_size)

    json_file_path = os.path.join(args.output_dir,
                                  'goal_directed_results.json')
    assess_goal_directed_generation(optimiser,
                                    json_output_file=json_file_path,
                                    benchmark_version=args.suite)
Example 3
def main(config):
    setup_default_logger()

    set_seed(config.seed)

    generator = OrganGenerator(config)

    json_file_path = os.path.join(config.output_dir,
                                  'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=config.dist_file,
                                 json_output_file=json_file_path,
                                 benchmark_version=config.suite)
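
assess_distribution_learning accepts any object implementing guacamol's DistributionMatchingGenerator interface, which OrganGenerator is expected to satisfy. A minimal sketch of that contract (DummyGenerator is illustrative and not part of the original code):

from typing import List

from guacamol.distribution_matching_generator import DistributionMatchingGenerator


class DummyGenerator(DistributionMatchingGenerator):
    # The benchmark repeatedly requests batches of SMILES from the generator.
    def generate(self, number_samples: int) -> List[str]:
        return ['CC'] * number_samples  # placeholder; a real model samples molecules here
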
Example 4
def entry_point():
    parser = argparse.ArgumentParser()
    parser.add_argument('--smiles_file', type=str)
    parser.add_argument('--db_fname', type=str)
    parser.add_argument('--selection_size', type=int, default=10)
    parser.add_argument('--radius', type=int, default=3)
    parser.add_argument('--replacements', type=int, default=1000)
    parser.add_argument('--min_size', type=int, default=0)
    parser.add_argument('--max_size', type=int, default=10)
    parser.add_argument('--min_inc', type=int, default=-7)
    parser.add_argument('--max_inc', type=int, default=7)
    parser.add_argument('--generations', type=int, default=1000)
    parser.add_argument('--ncpu', type=int, default=1)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--suite', default='v2')

    args = parser.parse_args()

    np.random.seed(args.seed)

    setup_default_logger()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    # save command line args
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'),
              'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = CREM_Generator(smi_file=args.smiles_file,
                               selection_size=args.selection_size,
                               db_fname=args.db_fname,
                               radius=args.radius,
                               min_size=args.min_size,
                               max_size=args.max_size,
                               min_inc=args.min_inc,
                               max_inc=args.max_inc,
                               replacements=args.replacements,
                               generations=args.generations,
                               ncpu=args.ncpu,
                               random_start=True,
                               output_dir=args.output_dir)

    json_file_path = os.path.join(args.output_dir,
                                  'goal_directed_results.json')
    assess_goal_directed_generation(optimiser,
                                    json_output_file=json_file_path,
                                    benchmark_version=args.suite)
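
The goal-directed benchmarks drive the optimiser through guacamol's GoalDirectedGenerator interface, which CREM_Generator and the other optimisers on this page implement. A minimal sketch of the expected contract (DummyOptimiser is illustrative only):

from typing import List, Optional

from guacamol.goal_directed_generator import GoalDirectedGenerator
from guacamol.scoring_function import ScoringFunction


class DummyOptimiser(GoalDirectedGenerator):
    # For each benchmark, return the requested number of (ideally high-scoring) SMILES.
    def generate_optimized_molecules(self, scoring_function: ScoringFunction,
                                     number_molecules: int,
                                     starting_population: Optional[List[str]] = None) -> List[str]:
        return ['CC'] * number_molecules  # placeholder; a real optimiser maximises scoring_function
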
Example 5
def main():
    setup_default_logger()

    parser = argparse.ArgumentParser(
        description='Generate pickle files for the statistics of a training set for MCTS',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--smiles_file',
        default='data/moses.smiles',
        help='Full path to SMILES file from which to generate the distribution statistics')
    parser.add_argument('--output_dir',
                        default=None,
                        help='Output directory for the pickle files')
    args = parser.parse_args()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    logger.info('Generating probabilities for MCTS...')

    t0 = time()

    stats = StatsCalculator(args.smiles_file)

    size_stats = stats.size_statistics()
    rxn_smarts_rings = stats.rxn_smarts_rings()
    rxn_smarts_make_rings = stats.rxn_smarts_make_rings()
    p_rings = stats.ring_probs()

    with open(os.path.join(args.output_dir, 'size_stats.p'), 'wb') as f:
        pickle.dump(size_stats, f)
    with open(os.path.join(args.output_dir, 'p_ring.p'), 'wb') as f:
        pickle.dump(p_rings, f)
    with open(os.path.join(args.output_dir, 'rs_ring.p'), 'wb') as f:
        pickle.dump(rxn_smarts_rings, f)
    with open(os.path.join(args.output_dir, 'rs_make_ring.p'), 'wb') as f:
        pickle.dump(rxn_smarts_make_rings, f)

    p = stats.pair_probs()
    rxn_smarts = stats.rxn_smarts()

    with open(os.path.join(args.output_dir, 'p1.p'), 'wb') as f:
        pickle.dump(p, f)
    with open(os.path.join(args.output_dir, 'r_s1.p'), 'wb') as f:
        pickle.dump(rxn_smarts, f)

    print(f'Total time: {datetime.timedelta(seconds=int(time() - t0))}')
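
The six pickle files written above are the distribution statistics that the MCTS scripts later read back through their pickle_directory argument. A sketch of the reload side, assuming only that the file names match the dumps:

import os
import pickle

pickle_directory = '.'  # wherever --output_dir pointed
with open(os.path.join(pickle_directory, 'size_stats.p'), 'rb') as f:
    size_stats = pickle.load(f)
with open(os.path.join(pickle_directory, 'p_ring.p'), 'rb') as f:
    p_rings = pickle.load(f)
# rs_ring.p, rs_make_ring.p, p1.p and r_s1.p are loaded the same way.
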
Example 6
def main(config):
    setup_default_logger()

    set_seed(config.seed)

    if config.output_dir is None:
        config.output_dir = os.path.dirname(os.path.realpath(__file__))

    generator = VaeGenerator(config)

    json_file_path = os.path.join(config.output_dir, 'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=config.dist_file,
                                 json_output_file=json_file_path,
                                 benchmark_version=config.suite)
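
Examples 3 and 6 receive a ready-made config object rather than building a parser inline. The fields this function reads suggest a namespace along these lines (argument names inferred from usage; the real parser lives elsewhere in the original project):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--output_dir', default=None)
parser.add_argument('--dist_file', default='data/guacamol_v1_all.smiles')
parser.add_argument('--suite', default='v1')
config = parser.parse_args()
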
Example 7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_jobs', type=int, default=-1)
    parser.add_argument('--episode_size', type=int, default=8192)
    parser.add_argument('--batch_size', type=int, default=1024)
    parser.add_argument('--entropy_weight', type=int, default=1)
    parser.add_argument('--kl_div_weight', type=int, default=10)
    parser.add_argument('--output_dir', default=None)
    parser.add_argument('--clip_param', type=float, default=0.2)
    parser.add_argument('--num_epochs', type=int, default=20)
    parser.add_argument('--model_path', default=None)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--suite', default='v3')

    args = parser.parse_args()

    np.random.seed(args.seed)

    setup_default_logger()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    if args.model_path is None:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        args.model_path = os.path.join(dir_path, 'pretrained_model',
                                       'model_final_0.473.pt')

    # save command line args
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'),
              'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = PPODirectedGenerator(pretrained_model_path=args.model_path,
                                     num_epochs=args.num_epochs,
                                     episode_size=args.episode_size,
                                     batch_size=args.batch_size,
                                     entropy_weight=args.entropy_weight,
                                     kl_div_weight=args.kl_div_weight,
                                     clip_param=args.clip_param)

    json_file_path = os.path.join(args.output_dir,
                                  'goal_directed_results.json')
    assess_goal_directed_generation(optimiser,
                                    json_output_file=json_file_path,
                                    benchmark_version=args.suite)
Example 8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--pickle_directory', help='Directory containing pickle files with the distribution statistics',
                        default=None)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--n_jobs', type=int, default=-1)
    parser.add_argument('--generations', type=int, default=1000)
    parser.add_argument('--population_size', type=int, default=100)
    parser.add_argument('--num_sims', type=int, default=40)
    parser.add_argument('--max_children', type=int, default=25)
    parser.add_argument('--max_atoms', type=int, default=60)
    parser.add_argument('--init_smiles', type=str, default='CC')
    parser.add_argument('--random_start', action='store_true')
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--patience', type=int, default=5)
    args = parser.parse_args()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    if args.pickle_directory is None:
        args.pickle_directory = os.path.dirname(os.path.realpath(__file__))

    np.random.seed(args.seed)

    setup_default_logger()

    # save command line args
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'), 'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = GB_MCTS_Generator(pickle_directory=args.pickle_directory,
                                  n_jobs=args.n_jobs,
                                  random_start=args.random_start,
                                  num_sims=args.num_sims,
                                  max_children=args.max_children,
                                  init_smiles=args.init_smiles,
                                  max_atoms=args.max_atoms,
                                  patience=args.patience,
                                  generations=args.generations,
                                  population_size=args.population_size)

    json_file_path = os.path.join(args.output_dir, 'goal_directed_results.json')
    assess_goal_directed_generation(optimiser, json_output_file=json_file_path)
Example 9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--smiles_file', default='data/guacamol_v1_all.smiles')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--population_size', type=int, default=100)
    parser.add_argument('--n_mutations', type=int, default=200)
    parser.add_argument('--gene_size', type=int, default=300)
    parser.add_argument('--generations', type=int, default=1000)
    parser.add_argument('--n_jobs', type=int, default=-1)
    parser.add_argument('--random_start', action='store_true')
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--suite', default='v1')

    args = parser.parse_args()

    np.random.seed(args.seed)

    setup_default_logger()

    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    # save command line args
    with open(os.path.join(args.output_dir, 'goal_directed_params.json'),
              'w') as jf:
        json.dump(vars(args), jf, sort_keys=True, indent=4)

    optimiser = ChemGEGenerator(smi_file=args.smiles_file,
                                population_size=args.population_size,
                                n_mutations=args.n_mutations,
                                gene_size=args.gene_size,
                                generations=args.generations,
                                n_jobs=args.n_jobs,
                                random_start=args.random_start,
                                patience=args.patience)

    json_file_path = os.path.join(args.output_dir,
                                  'goal_directed_results.json')
    assess_goal_directed_generation(optimiser,
                                    json_output_file=json_file_path,
                                    benchmark_version=args.suite)
Example 10
def main(config):
    setup_default_logger()

    set_seed(config.seed)

    train = read_smiles(config.train_load)

    vocab = CharVocab.from_data(train)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)

    model = AAE(vocab, config)
    model = model.to(device)

    trainer = AAETrainer(config)
    trainer.fit(model, train)

    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
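
The trainer persists three artifacts: the run config, the vocabulary, and the model weights. A sketch of how a sampling script would plausibly restore them (the paths mirror the torch.save calls above; rebuilding AAE this way is an assumption, and AAE comes from the same module as the training script):

import torch

config = torch.load('config.pt')  # whatever --config_save pointed to
vocab = torch.load('vocab.pt')    # --vocab_save
model = AAE(vocab, config)
model.load_state_dict(torch.load('model.pt'))  # --model_save
model.eval()
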
Example 11
import argparse
import logging
import os
from pathlib import Path

import torch

from guacamol.assess_distribution_learning import assess_distribution_learning
from guacamol.utils.helpers import setup_default_logger

from .rnn_utils import load_rnn_model, set_random_seed
from .smiles_rnn_generator import SmilesRnnGenerator

if __name__ == '__main__':
    setup_default_logger()
    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(description='Distribution learning benchmark for SMILES RNN',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', default=42, type=int, help='Random seed')
    parser.add_argument('--model_path', default=None, help='Full path to SMILES RNN model')
    parser.add_argument('--output_dir', default=None, help='Output directory')
    parser.add_argument('--dist_file', default='data/guacamol_v1_all.smiles', help='Distribution file')

    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info(f'device:\t{device}')

    set_random_seed(args.seed, device)
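
The listing breaks off right after the seed is set. Given the imports at the top, the script plausibly continues by loading the RNN and running the distribution-learning benchmark; the sketch below assumes load_rnn_model takes a model-definition file, a weights file, and the device, which is not visible here.

    # Plausible continuation of the truncated script (signatures assumed).
    if args.output_dir is None:
        args.output_dir = os.path.dirname(os.path.realpath(__file__))

    model_def = Path(args.model_path).with_suffix('.json')  # assumed convention
    model = load_rnn_model(model_def, args.model_path, device, copy_to_cpu=True)
    generator = SmilesRnnGenerator(model=model, device=device)

    json_file_path = os.path.join(args.output_dir, 'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=args.dist_file,
                                 json_output_file=json_file_path)
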
Example 12
def main():
    """ Get Chembl-23.

    Preprocessing steps:

    1) remove SMILES shorter than 5 or longer than 200 characters, and those with forbidden symbols
    2) canonicalize and neutralize; only keep SMILES shorter than 100 characters
    3) shuffle, write files, check if they are consistently hashed.
    """
    setup_default_logger()

    argparser = get_argparser()
    args = argparser.parse_args()

    # Set constants
    np.random.seed(1337)
    neutralization_rxns = initialise_neutralisation_reactions()
    smiles_dict = AllowedSmilesCharDictionary()

    print("Preprocessing ChEMBL molecules...")

    chembl_file = os.path.join(args.destination, CHEMBL_FILE_NAME)

    data = (
        pkgutil.get_data("guacamol.data", "holdout_set_gcm_v1.smiles").decode("utf-8").splitlines()
    )

    holdout_mols = [i.split(" ")[0] for i in data]
    holdout_set = set(canonicalize_list(holdout_mols, False))
    holdout_fps = get_fingerprints_from_smileslist(holdout_set)

    # Download Chembl23 if needed.
    download_if_not_present(chembl_file, uri=CHEMBL_URL)
    raw_smiles = get_raw_smiles(
        chembl_file, smiles_char_dict=smiles_dict, open_fn=gzip.open, extract_fn=extract_chembl
    )

    file_prefix = "chembl24_canon"

    print(
        f"Standardizing {len(raw_smiles)} molecules using {args.n_jobs} cores, "
        f"and excluding molecules based on ECFP4 similarity of > {TANIMOTO_CUTOFF} to the holdout set."
    )

    # Process all the SMILES in parallel
    runner = Parallel(n_jobs=args.n_jobs, verbose=2)

    joblist = (
        delayed(filter_and_canonicalize)(
            smiles_str, holdout_set, holdout_fps, neutralization_rxns, TANIMOTO_CUTOFF, False
        )
        for smiles_str in raw_smiles
    )

    output = runner(joblist)

    # Put all nonzero molecules in a list, remove duplicates, sort and shuffle

    all_good_mols = sorted({item[0] for item in output if item})
    np.random.shuffle(all_good_mols)
    print(f"Ended up with {len(all_good_mols)} molecules. Preparing splits...")

    # Split into train-dev-test
    # Check whether the md5-hashes of the generated smiles files match
    # the precomputed hashes, this ensures everyone works with the same splits.

    VALID_SIZE = int(0.05 * len(all_good_mols))
    TEST_SIZE = int(0.15 * len(all_good_mols))

    dev_set = all_good_mols[0:VALID_SIZE]
    dev_path = os.path.join(args.destination, f"{file_prefix}_dev-valid.smiles")
    write_smiles(dev_set, dev_path)

    test_set = all_good_mols[VALID_SIZE : VALID_SIZE + TEST_SIZE]
    test_path = os.path.join(args.destination, f"{file_prefix}_test.smiles")
    write_smiles(test_set, test_path)

    train_set = all_good_mols[VALID_SIZE + TEST_SIZE :]
    train_path = os.path.join(args.destination, f"{file_prefix}_train.smiles")
    write_smiles(train_set, train_path)

    # check the hashes
    valid_hashes = [
        compare_hash(train_path, TRAIN_HASH),
        compare_hash(dev_path, VALID_HASH),
        compare_hash(test_path, TEST_HASH),
    ]

    if not all(valid_hashes):
        raise SystemExit("Invalid hashes for the dataset files")

    print("Dataset generation successful. You are ready to go.")