def test_validity_correct_molecules():
    """Check that is_valid accepts each of three SMILES strings expected to be valid.

    The inputs are two single-character strings and one longer string with
    branches and a ring closure; all are checked in order.
    """
    expected_valid = ('O', 'C', 'CC(ONONOC)CCCc1ccccc1')

    for candidate in expected_valid:
        assert is_valid(candidate)
def sample_valid_molecules(model: DistributionMatchingGenerator, number_molecules: int, max_tries=10) -> List[str]:
    """
    Draw samples from a generator, keeping only those accepted by is_valid,
    until enough have been collected or the sampling budget is used up.

    Args:
        model: generator to draw samples from (via ``model.generate(n)``)
        number_molecules: how many valid molecules are wanted
        max_tries: multiplier for the sampling budget; at most
            ``number_molecules * max_tries`` samples are requested in total

    Returns:
        The valid molecules collected. The list may hold fewer than
        number_molecules entries when the budget ran out first.
    """
    budget = max_tries * number_molecules
    requested_so_far = 0
    accepted: List[str] = []

    while True:
        # Stop once the target is met or the budget is exhausted.
        if len(accepted) >= number_molecules or requested_so_far >= budget:
            break

        # Only ask for as many molecules as are still missing.
        shortfall = number_molecules - len(accepted)
        batch = model.generate(shortfall)

        # The budget is charged by the number requested, not the number returned.
        requested_so_far += shortfall

        accepted.extend([candidate for candidate in batch if is_valid(candidate)])

    return accepted
def assess_model(
    self, model: DistributionMatchingGenerator
) -> DistributionLearningBenchmarkResult:
    """
    Score a generator by the fraction of its samples accepted by is_valid.

    Args:
        model: generator to assess; ``self.number_samples`` molecules are
            requested from it in a single ``generate`` call

    Returns:
        A DistributionLearningBenchmarkResult whose score is
        ``number_valid / number_samples``, with the wall-clock duration of
        the ``generate`` call as sampling_time and both counts in metadata.

    Raises:
        Exception: if the generator returns a different number of molecules
            than was requested.
    """
    # Time only the generation call itself, not the validity checks.
    t_begin = time.time()
    generated = model.generate(number_samples=self.number_samples)
    t_finish = time.time()

    if len(generated) != self.number_samples:
        raise Exception(
            "The model did not generate the correct number of molecules")

    valid_count = len([smiles for smiles in generated if is_valid(smiles)])
    ratio = valid_count / self.number_samples

    details = {
        "number_samples": self.number_samples,
        "number_valid": valid_count
    }
    elapsed = t_finish - t_begin

    return DistributionLearningBenchmarkResult(
        benchmark_name=self.name,
        score=ratio,
        sampling_time=elapsed,
        metadata=details,
    )
def test_validity_incorrect_syntax():
    """Check that is_valid rejects a string containing non-SMILES text."""
    assert not is_valid('CCCincorrectsyntaxCCC')
def test_validity_empty_molecule():
    """Check that is_valid rejects the empty string."""
    assert not is_valid('')
def test_validity_incorrect_valence():
    """Check that is_valid rejects a string whose central atom carries too many bonds."""
    # The branching atom has four substituents plus a double bond to oxygen.
    assert not is_valid('CCC(CC)(CC)(=O)CCC')