def test_sample_unique_molecules_if_not_enough_unique_generated():
    # does not raise an exception if not enough unique molecules can be generated
    molecules = ['CO' for _ in range(20)]
    molecules[-1] = 'CC'
    generator = MockGenerator(molecules)

    # with max_tries=9, at most 9*2 = 18 molecules are sampled, so the second
    # unique molecule ('CC', at position 20) is never reached; the list of
    # generated molecules contains just 'CO'
    mols = sample_unique_molecules(generator, 2, max_tries=9)
    assert mols == ['CO']

    # with the default max_tries=10, up to 10*2 = 20 molecules are sampled,
    # so both unique molecules can be found
    generator = MockGenerator(molecules)
    mols = sample_unique_molecules(generator, 2)
    assert mols == ['CO', 'CC']
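
These tests rely on a MockGenerator test double; the snippet below is a minimal sketch of how such a helper could behave, inferred from the test expectations rather than taken from the library: it hands out the preset SMILES strings slice by slice.

from typing import List

class MockGenerator:
    """Hypothetical test double: yields a preset list of SMILES strings in order."""

    def __init__(self, molecules: List[str]) -> None:
        self.molecules = molecules
        self.cursor = 0

    def generate(self, number_samples: int) -> List[str]:
        # return the next slice of the preset list and advance the cursor
        batch = self.molecules[self.cursor:self.cursor + number_samples]
        self.cursor += number_samples
        return batch
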
Example #2
    def assess_model(
        self, model: DistributionMatchingGenerator
    ) -> DistributionLearningBenchmarkResult:
        """
        Assess a distribution-matching generator model.

        Args:
            model: model to assess
        """
        start_time = time.time()
        molecules = sample_unique_molecules(
            model=model, number_molecules=self.number_samples, max_tries=2)
        end_time = time.time()

        if len(molecules) != self.number_samples:
            logger.warning(
                "The model could not generate enough unique molecules. The score will be penalized."
            )

        # canonicalize_list in order to remove stereo information (also removes duplicates and invalid molecules, but there shouldn't be any)
        unique_molecules = set(
            canonicalize_list(molecules, include_stereocenters=False))

        novel_molecules = unique_molecules.difference(
            self.training_set_molecules)

        novel_ratio = len(novel_molecules) / self.number_samples

        metadata = {
            "number_samples": self.number_samples,
            "number_novel": len(novel_molecules)
        }

        return DistributionLearningBenchmarkResult(
            benchmark_name=self.name,
            score=novel_ratio,
            sampling_time=end_time - start_time,
            metadata=metadata,
        )
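
Note that novel_ratio divides by self.number_samples rather than by the number of molecules actually returned, so a model that cannot produce enough unique molecules is penalized exactly as the warning announces. A small worked example (numbers are illustrative only):

number_samples = 1000   # molecules requested from the generator
number_novel = 750      # unique, novel molecules actually obtained
score = number_novel / number_samples   # 0.75, even if fewer than 1000 were generated
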
Example #3
def save_metrics(model: EstimatorGenerator,
                 training_set_file: Union[str, Path],
                 output_file: Union[str, Path]) -> None:
    with open(training_set_file) as f:
        training_set = [s.strip() for s in f]
    training_set_molecules = set(
        canonicalize_list(training_set, include_stereocenters=False))
    LOG.info('Loaded %d unique molecules from %s', len(training_set_molecules),
             training_set_file)

    metrics = GraphMolecularMetrics(None, None)
    gen_molecules = sample_unique_molecules(model, 10000)
    pbar = tqdm(gen_molecules, desc='Computing metrics', total=10000)

    indices = []
    samples = defaultdict(list)  # per-metric lists of values, one entry per valid molecule
    for i, smi in enumerate(pbar):
        if smi is None or not ValidityScore.is_valid_smiles(smi):
            continue

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue

        values = metrics.get_validation_metrics([mol])
        values['SMILES'] = smi
        values['is_novel'] = 0 if smi in training_set_molecules else 1

        for key, val in values.items():
            if isinstance(val, list):
                assert len(val) == 1
                val = val[0]
            samples[key].append(val)
        indices.append(i)

    df = pd.DataFrame.from_dict(samples)
    df.index = indices
    LOG.info('Saving metrics to %s', output_file)
    df.to_csv(output_file)
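
A hedged usage sketch for save_metrics; the model object and both file paths below are placeholders, and loading the trained generator is assumed to happen elsewhere:

# hypothetical invocation; `trained_model` must be an EstimatorGenerator built beforehand
save_metrics(trained_model,
             training_set_file='data/training_smiles.smi',
             output_file='results/generation_metrics.csv')
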
def test_sample_unique_molecules_with_duplicate_molecules():
    generator = MockGenerator(['CO', 'C(O)', 'CCCC', 'CC'])

    mols = sample_unique_molecules(generator, 2)

    assert mols == ['CO', 'CCCC']
def test_sample_unique_molecules_with_invalid_molecules():
    generator = MockGenerator(['invalid1', 'invalid2', 'inv3', 'CCCC', 'CC'])

    mols = sample_unique_molecules(generator, 2)

    assert mols == ['CCCC', 'CC']
def test_sample_unique_molecules_for_valid_only():
    generator = MockGenerator(['CCCC', 'CC'])

    mols = sample_unique_molecules(generator, 2)

    assert mols == ['CCCC', 'CC']
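
Taken together, these tests pin down the contract of sample_unique_molecules: draw batches from the generator, canonicalize them, drop invalid SMILES and duplicates, and stop once enough unique molecules are collected or max_tries batches have been drawn. The sketch below is consistent with the tests but is an illustration, not the library's actual implementation:

def sample_unique_molecules_sketch(generator, number_molecules, max_tries=10):
    """Illustrative re-implementation based on the observed test behaviour."""
    unique, seen = [], set()
    for _ in range(max_tries):
        if len(unique) >= number_molecules:
            break
        batch = generator.generate(number_molecules)
        # canonicalize_list drops invalid SMILES and in-batch duplicates
        for canonical in canonicalize_list(batch):
            if canonical not in seen:
                seen.add(canonical)
                unique.append(canonical)
                if len(unique) >= number_molecules:
                    break
    return unique
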
    def assess_model(
        self, model: DistributionMatchingGenerator
    ) -> DistributionLearningBenchmarkResult:
        """
        Assess a distribution-matching generator model.

        Args:
            model: model to assess
        """
        start_time = time.time()
        molecules = sample_unique_molecules(
            model=model, number_molecules=self.number_samples, max_tries=2)
        end_time = time.time()

        if len(molecules) != self.number_samples:
            logger.warning(
                'The model could not generate enough unique molecules. The score will be penalized.'
            )

        # canonicalize_list in order to remove stereo information (also removes duplicates and invalid molecules, but there shouldn't be any)
        unique_molecules = set(
            canonicalize_list(molecules, include_stereocenters=False))

        # first we calculate the descriptors, which are np.arrays of size n_samples x n_descriptors
        d_sampled = calculate_pc_descriptors(unique_molecules,
                                             self.pc_descriptor_subset)
        d_chembl = calculate_pc_descriptors(self.training_set_molecules,
                                            self.pc_descriptor_subset)

        kldivs = {}

        # now we calculate the kl divergence for the float valued descriptors ...
        for i in range(4):
            kldiv = continuous_kldiv(X_baseline=d_chembl[:, i],
                                     X_sampled=d_sampled[:, i])
            kldivs[self.pc_descriptor_subset[i]] = kldiv

        # ... and for the int valued ones.
        for i in range(4, 9):
            kldiv = discrete_kldiv(X_baseline=d_chembl[:, i],
                                   X_sampled=d_sampled[:, i])
            kldivs[self.pc_descriptor_subset[i]] = kldiv

        # pairwise similarity

        chembl_sim = calculate_internal_pairwise_similarities(
            self.training_set_molecules)
        chembl_sim = chembl_sim.max(axis=1)

        sampled_sim = calculate_internal_pairwise_similarities(
            unique_molecules)
        sampled_sim = sampled_sim.max(axis=1)

        kldiv_int_int = continuous_kldiv(X_baseline=chembl_sim,
                                         X_sampled=sampled_sim)
        kldivs['internal_similarity'] = kldiv_int_int

        # for some reason, this runs into problems when both sets are identical.
        # cross_set_sim = calculate_pairwise_similarities(self.training_set_molecules, unique_molecules)
        # cross_set_sim = cross_set_sim.max(axis=1)
        #
        # kldiv_ext = discrete_kldiv(chembl_sim, cross_set_sim)
        # kldivs['external_similarity'] = kldiv_ext
        # kldiv_sum += kldiv_ext

        metadata = {'number_samples': self.number_samples, 'kl_divs': kldivs}

        # Each KL divergence value is transformed to be in [0, 1].
        # Then their average delivers the final score.
        partial_scores = [np.exp(-score) for score in kldivs.values()]
        score = sum(partial_scores) / len(partial_scores)

        return DistributionLearningBenchmarkResult(
            benchmark_name=self.name,
            score=score,
            sampling_time=end_time - start_time,
            metadata=metadata,
        )
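
The final score maps each KL divergence through exp(-KL): a perfect match (KL = 0) contributes 1, large divergences tend towards 0, and the benchmark score is the mean of these partial scores. For reference, the sketch below shows one way a continuous KL divergence between two descriptor samples could be estimated; it is an illustration only and not necessarily how continuous_kldiv is implemented in the library:

import numpy as np
from scipy.stats import entropy, gaussian_kde

def continuous_kldiv_sketch(X_baseline: np.ndarray, X_sampled: np.ndarray) -> float:
    # estimate both densities with a Gaussian KDE and evaluate them on a shared grid
    grid = np.linspace(min(X_baseline.min(), X_sampled.min()),
                       max(X_baseline.max(), X_sampled.max()), 1000)
    p = gaussian_kde(X_baseline)(grid) + 1e-10  # baseline density (offset avoids zeros)
    q = gaussian_kde(X_sampled)(grid) + 1e-10   # sampled density
    return float(entropy(p, q))  # scipy's entropy(p, q) computes KL(P || Q)
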