Esempio n. 1
0
def test_isomeric_canonicalisation():
    """Check canonicalize() output with and without stereo information.

    The input SMILES carries chirality tags (``@``) and double-bond
    direction markers (``/`` and ``\\``).  With ``include_stereocenters=True``
    the expected string retains such markers; with
    ``include_stereocenters=False`` the expected string contains none.
    """
    endiandric_acid = r'OC(=O)[C@H]5C2\C=C/C3[C@@H]5CC4[C@H](C\C=C\C=C\c1ccccc1)[C@@H]2[C@@H]34'

    # Run both canonicalisations first, then compare against the references.
    isomeric = canonicalize(endiandric_acid, include_stereocenters=True)
    non_isomeric = canonicalize(endiandric_acid, include_stereocenters=False)

    assert isomeric == 'O=C(O)[C@H]1C2C=CC3[C@@H]1CC1[C@H](C/C=C/C=C/c4ccccc4)[C@@H]2[C@@H]31'
    assert non_isomeric == 'O=C(O)C1C2C=CC3C1CC1C(CC=CC=Cc4ccccc4)C2C31'
def process_smis(smis,
                 scoring_function,
                 pool,
                 canonicalization,
                 duplicate_removal,
                 scoring_parallelization,
                 max_smi_len=100):
    """Optionally canonicalize, de-duplicate and score a batch of SMILES.

    Args:
        smis: SMILES strings to process.
        scoring_function: callable mapping one SMILES to a score, or ``None``
            to skip scoring entirely.
        pool: callable that consumes an iterable of ``delayed`` jobs and
            returns their results (presumably a ``joblib.Parallel`` instance
            -- confirm against the caller).
        canonicalization: when truthy, canonicalize every SMILES without
            stereocenters, then drop entries that canonicalize to ``None``
            or whose length is not strictly below ``max_smi_len``.
        duplicate_removal: when truthy, drop repeated SMILES while keeping
            the first occurrence of each one.
        scoring_parallelization: when truthy, score through ``pool``;
            otherwise score sequentially in this process.
        max_smi_len: exclusive upper bound on SMILES length; only applied
            when ``canonicalization`` is enabled.

    Returns:
        The processed list of SMILES when ``scoring_function`` is ``None``;
        otherwise the ``(smis, scores)`` pair produced by
        ``filter_by_score``.
    """
    if canonicalization:
        # Hand the function and its arguments to ``delayed`` directly
        # instead of wrapping them in a lambda, so the job can also be
        # serialized by backends that rely on plain pickle.
        canonical = pool(
            delayed(canonicalize)(smi, include_stereocenters=False)
            for smi in smis)
        smis = [
            smi for smi in canonical
            if smi is not None and len(smi) < max_smi_len
        ]

    if duplicate_removal:
        # dict.fromkeys keeps first-seen order, so the output is
        # reproducible across runs (set() ordering depends on string
        # hash randomization).
        smis = list(dict.fromkeys(smis))

    if scoring_function is None:
        return smis

    if scoring_parallelization:
        scores = pool(delayed(scoring_function)(smi) for smi in smis)
    else:
        scores = [scoring_function(smi) for smi in smis]

    # NOTE(review): filter_by_score is defined elsewhere; the -1e-8
    # threshold presumably discards negative scores -- confirm.
    smis, scores = filter_by_score(smis, scores, -1e-8)

    return smis, scores
Esempio n. 3
0
    def vec2smiles(self, vec, rem_bos, rem_eos):
        """Decode ``vec`` to a string and return its canonical SMILES.

        The vector is first decoded with ``self.vec2string`` (``rem_bos`` and
        ``rem_eos`` are forwarded unchanged) and the result is passed to
        ``canonicalize``.

        Returns:
            The canonical SMILES, or ``None`` when decoding fails, when
            canonicalization yields ``None`` or an empty string, or when the
            result is longer than ``self.max_smiles_length``.
        """
        decoded = self.vec2string(vec, rem_bos, rem_eos)
        if decoded is None:
            return None

        canonical = canonicalize(decoded)
        if canonical is None:
            return None

        # Accept only non-empty results within the configured length limit.
        within_limits = 0 < len(canonical) <= self.max_smiles_length
        return canonical if within_limits else None
Esempio n. 4
0
def test_list_canonicalization_removes_none():
    """Check that canonicalize_list() omits entries that cannot be canonicalized.

    One string in the input is not meant to be a molecule; the expected
    output is the canonical form of the other three, in input order.
    """
    not_a_molecule = 'this.is.not.a.molecule'
    valid_molecules = ['CCC(OCOCO)CC(=O)NCC', 'c1ccccc1', 'CC(OCON=N)CC']

    # Insert the invalid entry right after the first valid one.
    molecules = valid_molecules[:1] + [not_a_molecule] + valid_molecules[1:]
    canonicalized_molecules = canonicalize_list(molecules)

    expected = []
    for smiles in valid_molecules:
        expected.append(canonicalize(smiles))

    assert canonicalized_molecules == expected
Esempio n. 5
0
    def canonicalize_and_score_smiles(self, smis, scoring_function, pool):
        """Canonicalize, filter and score SMILES, dropping corrupt scores.

        Each SMILES is canonicalized without stereocenters through ``pool``.
        Entries that canonicalize to ``None`` or are rejected by
        ``self.char_dict.allowed`` are discarded.  The survivors are scored
        with ``scoring_function.score`` (also through ``pool``), and only
        pairs whose score is strictly greater than
        ``scoring_function.scoring_function.corrupt_score`` are kept.

        Returns:
            A ``(smis, scores)`` pair of equally long lists; both are empty
            when nothing survives the filtering.
        """
        def _strip_stereo(smi):
            return canonicalize(smi, include_stereocenters=False)

        canonical = pool(delayed(_strip_stereo)(smi) for smi in smis)
        smis = [
            smi for smi in canonical
            if smi is not None and self.char_dict.allowed(smi)
        ]
        scores = pool(delayed(scoring_function.score)(smi) for smi in smis)

        kept_pairs = [
            (smi, score)
            for smi, score in zip(smis, scores)
            if score > scoring_function.scoring_function.corrupt_score
        ]
        if not kept_pairs:
            return [], []

        # Transpose the list of pairs back into two parallel lists.
        smis, scores = map(list, zip(*kept_pairs))
        return smis, scores
Esempio n. 6
0
def sample_unique_molecules(model: DistributionMatchingGenerator,
                            number_molecules: int,
                            max_tries=10) -> List[str]:
    """
    Draw samples from the generator until enough distinct molecules are collected.

    Samples are canonicalized; those that canonicalize to None or that
    duplicate an earlier canonical SMILES are ignored.

    Args:
        model: model to sample from
        number_molecules: number of unique (distinct) molecules to generate
        max_tries: bounds the total number N of samples requested from the model,
            N = number_molecules * max_tries

    Returns:
        A list of unique molecules in canonicalized form, in the order in which
        they were first generated. Sampling stops once at least number_molecules
        have been collected; if the sampling budget runs out first, the list may
        be shorter.
    """

    sampling_budget = max_tries * number_molecules
    requested_so_far = 0

    seen: Set[str] = set()
    ordered: List[str] = []

    while True:
        if len(ordered) >= number_molecules or requested_so_far >= sampling_budget:
            break

        # Only ask for as many samples as are still missing.
        shortfall = number_molecules - len(ordered)
        batch = model.generate(shortfall)
        # The budget counts requested samples, not the number actually returned.
        requested_so_far += shortfall

        for candidate in batch:
            canonical = canonicalize(candidate)
            if canonical is None or canonical in seen:
                continue
            seen.add(canonical)
            ordered.append(canonical)

    # Invariant: the set and the list are always updated together.
    assert len(seen) == len(ordered)

    return ordered
def mutate(p_gene, scoring_function):
    """Create a scored child molecule by mutating a parent gene.

    The parent gene is passed through ``mutation``; the resulting child gene
    is converted with ``gene_to_cfg``, decoded with ``cfg_util.decode`` and
    canonicalized.  The canonical SMILES is scored with
    ``scoring_function.score``.

    Returns:
        ``Molecule(score, smiles, gene)`` for the child.
    """
    child_gene = mutation(p_gene)
    child_cfg = gene_to_cfg(child_gene)
    child_smiles = canonicalize(cfg_util.decode(child_cfg))
    child_score = scoring_function.score(child_smiles)
    return Molecule(child_score, child_smiles, child_gene)
Esempio n. 8
0
    def _canonicalize(self):
        if self._canonical_smiles is not None:
            return

        canonical = [canonicalize(mol) for mol in self._smiles]
        self._canonical_smiles = [s for s in canonical if s is not None]