def assess_goal_directed_generation(
        goal_directed_molecule_generator: GoalDirectedGenerator,
        json_output_file='output_goal_directed.json',
        benchmark_version='v3') -> None:
    """
    Run a goal-directed benchmark suite against a generator and save a JSON report.

    The report holds the guacamol version, the suite version, a timestamp and
    the attribute dictionary (``vars``) of every benchmark result.

    Args:
        goal_directed_molecule_generator: Model to evaluate
        json_output_file: Path of the JSON file the report is written to
        benchmark_version: Name of the benchmark suite version to execute
    """
    logger.info(
        f'Benchmarking goal-directed molecule generation, version {benchmark_version}'
    )

    suite = goal_directed_benchmark_suite(version_name=benchmark_version)
    outcomes = _evaluate_goal_directed_benchmarks(
        goal_directed_molecule_generator=goal_directed_molecule_generator,
        benchmarks=suite)

    # Key order is part of the output format, hence the ordered mapping.
    report: Dict[str, Any] = OrderedDict([
        ('guacamol_version', guacamol.__version__),
        ('benchmark_suite_version', benchmark_version),
        ('timestamp', get_time_string()),
        ('results', [vars(outcome) for outcome in outcomes]),
    ])

    logger.info(f'Save results to file {json_output_file}')
    with open(json_output_file, 'wt') as f:
        f.write(json.dumps(report, indent=4))
Exemple #2
0
def Main():
    """
    Score the input SMILES with every benchmark objective and save the best ones.

    For each benchmark of the selected suite the top-scoring molecules are
    retrieved with ``TopN`` and stored under the benchmark's name. The resulting
    mapping is written to ``args.output`` as JSON.
    """
    args = ParseArgs()

    # One joblib.Parallel pool is shared by the loading and scoring steps.
    thread_pool = joblib.Parallel(n_jobs=args.n_threads)

    # Keep all input SMILES in memory; they are re-scored for every benchmark.
    smiles = LoadSMILES(thread_pool, args.input)
    print(f"{len(smiles)} SMILES loaded.")

    starting_populations = {}
    suite = goal_directed_benchmark_suite(
        version_name=args.benchmark_suite_version)
    for benchmark in suite:
        # Write out at least as many molecules as the benchmark's largest
        # top-count, even if fewer were asked for on the command line.
        n = max(args.n_molecules,
                max(benchmark.contribution_specification.top_counts))

        # Score the loaded SMILES with the benchmark's objective and time it.
        start_time = time.time()
        print(f"Scoring molecules with '{benchmark.name}' scoring function.")
        top_n_smiles = TopN(thread_pool, smiles, benchmark.objective, n)
        time_spend = time.time() - start_time
        print(f"{time_spend}s spent retrieving {len(top_n_smiles)} molecules.")

        starting_populations[benchmark.name] = top_n_smiles

    # Persist the per-benchmark molecules as a single JSON document.
    with open(args.output, "w") as file:
        json.dump(starting_populations, file, indent=4)
def Main():
    """
    Run LEADD on the selected GuacaMol goal-directed benchmarks.

    Every ID in ``args.benchmark_ids`` is used as a 1-based index into the
    benchmark suite. Each benchmark gets its own sub-directory of
    ``args.output_directory`` (named after the ID) holding the LEADD output and
    a ``goal_directed_results.json`` file.
    """
    args = ParseArgs()

    # The starting populations JSON is expected to contain the top N molecules
    # found by screening the GuacaMol "all" dataset with the benchmark suite's
    # own scoring functions. The original GuacaMol examples do this screen as
    # part of the benchmark; it is a separate step here so that it is not
    # repeated for every replica.
    populations_by_benchmark = (
        LoadStartingPopulations(args.starting_populations)
        if args.starting_populations else None)

    suite = goal_directed_benchmark_suite(version_name=args.benchmark_suite_version)
    for benchmark_id in args.benchmark_ids:
        benchmark = suite[benchmark_id - 1]
        print(f"Current benchmark: {benchmark.name}")

        # Look up this benchmark's starting population, if any were supplied.
        starting_population = (
            populations_by_benchmark[benchmark.name]
            if args.starting_populations else None)

        # Each benchmark writes into its own directory; os.mkdir raises if the
        # directory already exists.
        run_directory = os.path.join(args.output_directory, str(benchmark_id))
        os.mkdir(run_directory)

        leadd = LEADD(fragmentation_settings_file=args.fragmentation_settings_file,
                      reconstruction_settings_file=args.reconstruction_settings_file,
                      output_directory=run_directory,
                      starting_population=starting_population,
                      benchmark=benchmark,
                      n_threads=args.n_threads)

        # Let the benchmark drive LEADD, then attach the finer-grained
        # computational performance data to the result's metadata.
        result = benchmark.assess_model(leadd)
        leadd.add_performance_data_to_guacamol_benchmark_result(result)

        # The operation frequencies must be reported before LEADD's resources
        # are released.
        leadd.leadd.WriteOperationFrequenciesToReport()
        leadd.clean_up()

        WriteGuacaMolBenchmarkResult(
            result,
            os.path.join(run_directory, "goal_directed_results.json"))
Exemple #4
0
    def generate_optimized_molecules(self,
                                     scoring_function: ScoringFunction,
                                     number_molecules: int,
                                     starting_population: Optional[List[str]] = None) -> List[str]:
        """
        Return molecules read from a precomputed cache instead of optimizing.

        The molecules come from a ``DummyMoleculeGenerator`` built on a fixed
        cache file. Each call also advances ``self.obj_num``, which wraps back
        to 0 once it reaches ``self.num_benchmarks``.

        Args:
            scoring_function: scoring function (not used by this implementation)
            number_molecules: number of molecules to generate
            starting_population: molecules to start the optimization from
                (optional, not used by this implementation)

        Returns:
            Whatever ``DummyMoleculeGenerator.generate`` returns for
            ``number_molecules``; presumably a list of SMILES strings.
        """
        cache_files = ['guac_v2_{}do_0.5_lr4e-5_mark'.format(idx) for idx in range(20)]
        # TODO: put the directory for the cache file here
        # NOTE(review): the last cache file is always used, regardless of
        # self.obj_num - confirm whether it should be selected per benchmark.
        gen = DummyMoleculeGenerator([cache_files[19]], maximize_reward=True)

        # Advance the per-benchmark counter and wrap around after the last one.
        self.obj_num += 1
        if self.obj_num == self.num_benchmarks:
            self.obj_num = 0
        return gen.generate(number_molecules)
Exemple #5
0
 def __init__(self, version):
     """
     Store the benchmark suite version and initialise the benchmark counter.

     Args:
         version: version name passed to ``goal_directed_benchmark_suite``
     """
     self.version = version
     # Only the size of the suite is kept, not the benchmarks themselves.
     suite = goal_directed_benchmark_suite(self.version)
     self.num_benchmarks = len(suite)
     self.obj_num = 0
Exemple #6
0
def guacamol_goal_scoring_functions(version_name):
    """
    Wrap every benchmark of a goal-directed suite in a ``GuacamolGoalWrapper``.

    Args:
        version_name: suite version; must be one of ``version_name_list``

    Returns:
        A list with one ``GuacamolGoalWrapper`` per benchmark, in suite order.
    """
    assert version_name in version_name_list, f"Version name must be in {version_name_list!s}"
    return [
        GuacamolGoalWrapper(benchmark)
        for benchmark in goal_directed_benchmark_suite(version_name=version_name)
    ]