Example #1
0
def test_kekulize_parser(test_name, column_name, dataset_samples):
    """Tests the kekulization of SMILES, which is the first step of
    selfies.encoder().

    Each sampled SMILES is split on ``.``, every fragment is run through
    ``kekulize_parser(_parse_smiles(fragment))``, and the resulting tokens
    are re-joined into a SMILES string that is compared against the input
    with ``is_same_mol``. Inputs that fail the comparison are logged, one
    per line, to ``error_sets/errors_kekulize_<test_name>.csv`` next to
    this file, and the test fails if any were logged.

    :param test_name: base name of the data file ``test_sets/<test_name>.txt``
        located next to this file; it is read with ``pd.read_csv`` using
        the first line as the header.
    :param column_name: name of the column holding the SMILES strings.
    :param dataset_samples: number of rows to randomly sample; when it is
        not in ``(0, N]`` (N = number of data rows) every row is used.
    """

    # file I/O
    curr_dir = os.path.dirname(__file__)
    test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
    error_path = os.path.join(curr_dir, 'error_sets',
                              "errors_kekulize_{}.csv".format(test_name))

    # (re)create the error log containing only its header line
    os.makedirs(os.path.dirname(error_path), exist_ok=True)
    with open(error_path, "w+") as error_log:
        error_log.write("In\n")
    error_found_flag = False

    # make pandas reader
    # Count data rows (all lines minus the header); the context manager
    # guarantees the handle is closed before pandas reopens the file.
    with open(test_path) as test_file:
        num_rows = sum(1 for _ in test_file) - 1
    num_samples = (dataset_samples
                   if (0 < dataset_samples <= num_rows) else num_rows)
    # Sampling is done by choosing which data lines (1-based, line 0 being
    # the header) pandas should skip.
    skip = sorted(random.sample(range(1, num_rows + 1),
                                num_rows - num_samples))
    reader = pd.read_csv(test_path, chunksize=10000, header=0, skiprows=skip)

    # kekulize testing
    for chunk in reader:
        error_list = []

        for smiles in chunk[column_name]:

            # skip inputs that MolFromSmiles rejects or that contain '*'
            if (MolFromSmiles(smiles) is None) or ('*' in smiles):
                continue

            # build kekulized SMILES
            kekule_fragments = []

            for fragment in smiles.split("."):

                kekule_gen = kekulize_parser(_parse_smiles(fragment))

                k = []
                for bond, symbol, symbol_type in kekule_gen:
                    # the bond reported for a branch token is not emitted
                    if symbol_type == BRANCH_TYPE:
                        bond = ''
                    k.append(bond)

                    # two-character ring symbols get a '%' prefix
                    if symbol_type == RING_TYPE and len(symbol) == 2:
                        k.append('%')
                    k.append(symbol)

                kekule_fragments.append(''.join(k))

            kekule_smiles = '.'.join(kekule_fragments)

            if not is_same_mol(smiles, kekule_smiles):
                error_list.append(smiles)

        # Flush this chunk's failures. Every entry is newline-terminated so
        # that the first failure of the next chunk starts on its own line
        # instead of being appended to the last failure of this one.
        if error_list:
            with open(error_path, "a") as error_log:
                error_log.writelines(entry + "\n" for entry in error_list)
            error_found_flag = True

    assert not error_found_flag
Example #2
0
def _translate_smiles(smiles: str) -> str:
    """Helper for ``selfies.encoder`` that converts a single SMILES string
    (assumed to contain no dots) into its SELFIES form.

    :param smiles: the SMILES to be translated.
    :return: the SELFIES translation of SMILES.
    :raises ValueError: if ring entries are still pending once the whole
        input has been derived.
    """

    token_stream = _parse_smiles(smiles)

    # The parsed stream is passed through the kekulizer only when the input
    # contains at least one of these lowercase characters.
    if not set('cnopas').isdisjoint(smiles):
        token_stream = kekulize_parser(token_stream)

    # Bookkeeping for rings that have been opened but not yet closed. For a
    # ring with id X joining the i-th and j-th derived atoms (i < j) via
    # bond symbol s, pending_rings[X] = (s, i) from the moment the i-th atom
    # is derived until the j-th atom is derived, at which point the entry
    # is removed and the ring is made.
    pending_rings = {}

    # One-element list used as a mutable counter recording which atom was
    # the i-th derived atom.
    atom_counter = [0]

    selfies, _ = _translate_smiles_derive(
        token_stream, pending_rings, atom_counter
    )

    if not pending_rings:
        return selfies

    raise ValueError("malformed ring numbering or ring numbering "
                     "across a dot symbol")
def time_kekulize(file_path: str, sample_size: int = -1):
    """Times selfies kekulization against RDKit kekulization and prints
    the elapsed seconds of each.

    :param file_path: path of the data file, joined onto the directory
        containing this file. The first line is discarded as a header and
        every following line (right-stripped) is used as one SMILES.
    :param sample_size: number of SMILES to draw at random. When it is not
        positive, or not smaller than the number of SMILES available, all
        of them are timed.
    :return: ``None``; results are reported with ``print``.
    """
    curr_dir = os.path.dirname(__file__)
    file_path = os.path.join(curr_dir, file_path)

    # load data
    with open(file_path, 'r') as file:
        smiles = [line.rstrip() for line in file]
    # drop the header line; slicing (unlike pop(0)) tolerates an empty file
    smiles = smiles[1:]

    # only sample when the request is satisfiable, otherwise time everything
    if 0 < sample_size < len(smiles):
        smiles = random.sample(smiles, sample_size)

    print(f"Timing Kekulization of {len(smiles)} SMILES from {file_path}")

    # time selfies kekulization
    # perf_counter is monotonic and high-resolution, so the measurement is
    # not distorted by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    for s in smiles:
        # list() exhausts the kekulize_parser result so that all of its
        # work happens inside the timed region
        list(kekulize_parser(_parse_smiles(s)))
    selfies_time = time.perf_counter() - start
    print(f"--> selfies kekulize: {selfies_time:0.7f}s")

    # time RDKit kekulization
    start = time.perf_counter()
    for s in smiles:
        m = MolFromSmiles(s)
        Kekulize(m)
        MolToSmiles(m, kekuleSmiles=True)
    rdkit_time = time.perf_counter() - start
    print(f"--> RDKit kekulize: {rdkit_time:0.7f}s")