Ejemplo n.º 1
def test_get_semantic_constraints():
    """Tests selfies.get_semantic_constraints().

    Checks that two consecutive calls return distinct objects and that the
    returned constraints contain the '?' entry.
    """

    # Getting the constraints does not return aliases
    assert sf.get_semantic_constraints() is not sf.get_semantic_constraints()

    # The appropriate symbols are in the constraints
    constraints = sf.get_semantic_constraints()
    assert '?' in constraints
Ejemplo n.º 2
def test_change_constraints_cache_clear():
    """Tests that installing new semantic constraints changes the results.

    Records the robust alphabet and the decoding of "[C][#C]" under the
    current constraints, limits "C" to a single bond, and checks that both
    the robust alphabet and the decoder output differ afterwards.
    """
    alphabet = sf.get_semantic_robust_alphabet()
    assert alphabet == sf.get_semantic_robust_alphabet()
    assert sf.decoder("[C][#C]") == "C#C"

    new_constraints = sf.get_semantic_constraints()
    new_constraints["C"] = 1
    # Editing the returned mapping alone changes nothing in the library:
    # the edit has to be installed explicitly before it can be observed.
    sf.set_semantic_constraints(new_constraints)

    try:
        new_alphabet = sf.get_semantic_robust_alphabet()
        assert new_alphabet != alphabet
        assert sf.decoder("[C][#C]") == "CC"
    finally:
        # Restore the defaults even if an assertion fails, so the modified
        # constraints do not leak into other tests.
        sf.set_semantic_constraints()  # re-set alphabet
Ejemplo n.º 3
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained.

    Decodes "[Xe-2]" followed by eight fluorine branches, first under the
    current constraints and then after setting the '?' constraint to 2.
    """

    f_branch = "[Branch1][C][F]"
    s = "[Xe-2]" + (f_branch * 8)
    assert decode_eq(s, "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF")

    # change default semantic constraints
    constraints = sf.get_semantic_constraints()
    constraints["?"] = 2
    # the edited mapping must be installed for the decoder to use it
    sf.set_semantic_constraints(constraints)

    try:
        assert decode_eq(s, "[Xe-2](F)CF")
    finally:
        # restore defaults even on failure so other tests are unaffected
        sf.set_semantic_constraints()

Ejemplo n.º 4
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly.

    Installs constraints of 1 for "Sn+4" and 2 for "O-2" and checks that the
    bond to the following carbon is reduced accordingly when decoding.
    """

    constraints = sf.get_semantic_constraints()
    constraints["Sn+4"] = 1
    constraints["O-2"] = 2
    # the edited mapping must be installed for the decoder to use it
    sf.set_semantic_constraints(constraints)

    try:
        # the following molecules don't make sense, but we use them to test
        # selfies. Hence, we can't verify them with RDKit
        assert decode_eq("[Sn+4][=C]", "[Sn+4]C")
        assert decode_eq("[O-2][#C]", "[O-2]=C")

        # mixing many symbol types
        assert decode_eq("[17O@@H1-2][#C]", "[17O@@H1-2]C")
    finally:
        # restore defaults even on failure so other tests are unaffected
        sf.set_semantic_constraints()

Ejemplo n.º 5
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly.

    Installs constraints of 1 for 'Sn+4' and 2 for 'O-2' and checks the
    decoder output for both spellings of each charge ("++++" / "+4" and
    "--" / "-2").
    """

    constraints = sf.get_semantic_constraints()
    constraints['Sn+4'] = 1
    constraints['O-2'] = 2
    # the edited mapping must be installed for the decoder to use it
    sf.set_semantic_constraints(constraints)

    try:
        # the following molecules don't make sense, but we use them to test
        # selfies. Hence, we can't verify them with RDKit
        assert sf.decoder("[Sn++++expl][=C]") == "[Sn++++]C"
        assert sf.decoder("[Sn+4expl][=C]") == "[Sn+4]C"
        assert sf.decoder("[O--expl][#C]") == "[O--]=C"
        assert sf.decoder("[O-2expl][#C]") == "[O-2]=C"

        # mixing many symbol types
        assert sf.decoder("[17O@@H-2expl][#C]") == "[17O@@H-2]C"
    finally:
        # restore defaults even on failure so other tests are unaffected
        sf.set_semantic_constraints()

Ejemplo n.º 6
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained.

    Decodes "[Xe-2expl]" followed by eight fluorine branches, first under
    the current constraints and then after setting the '?' constraint to 2.
    """

    # The expected SMILES below contains eight F atoms, so the input needs
    # eight fluorine branches.
    f_branch = "[Branch1_1][C][F]"
    s = "[Xe-2expl]" + (f_branch * 8)
    assert sf.decoder(s) == "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF"

    # change default semantic constraints
    constraints = sf.get_semantic_constraints()
    constraints['?'] = 2
    # the edited mapping must be installed for the decoder to use it
    sf.set_semantic_constraints(constraints)

    try:
        assert sf.decoder(s) == "[Xe-2](F)CF"
    finally:
        # restore defaults even on failure so other tests are unaffected
        sf.set_semantic_constraints()

Ejemplo n.º 7
def test_roundtrip_translation():
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.

    The data is read in chunks from EMOL_PATH. After each chunk the chunk
    index is written to a checkpoint file, so an interrupted run resumes
    after the last completed chunk. Every input whose roundtrip fails is
    appended to the error CSV, and the test fails if any error was recorded
    during this run.
    """

    # modify constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    constraints['Br'] = 7
    constraints['Cl'] = 7
    constraints['I'] = 7
    # the edited mapping must be installed for encoder/decoder to use it
    sf.set_semantic_constraints(constraints)

    # file I/O
    ckpt_path = os.path.join(curr_dir, 'checkpoints', 'emolecule_ckpt.txt')
    error_path = os.path.join(curr_dir, 'error_sets', 'errors_emolecules.csv')

    error_found_flag = False

    try:
        # check if a previous checkpoint exists to continue tests
        if os.path.exists(ckpt_path):
            with open(ckpt_path, 'r') as ckpt_file:
                checkpoint = int(ckpt_file.readlines()[0])

        # if no path to a checkpoint exists,
        # create a new directory for error logging and checkpoints
        else:
            os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
            os.makedirs(os.path.dirname(error_path), exist_ok=True)

            with open(error_path, "w+") as error_log:
                error_log.write("In, Out\n")
            checkpoint = -1

        # make pandas reader
        # NOTE(review): a chunksize is required for the chunk-wise loop
        # below, but the original value is not recoverable from this
        # listing; 10000 is a placeholder -- confirm against the project.
        reader = pd.read_csv(EMOL_PATH,
                             delimiter=' ',
                             chunksize=10000)

        # roundtrip testing
        for chunk_idx, chunk in enumerate(reader):

            # chunks up to and including the checkpoint were already tested
            if chunk_idx <= checkpoint:
                continue

            error_list = []

            for in_smiles in chunk[COL_NAME]:

                # check if SMILES in chunk is a valid RDKit molecule.
                # if not, skip testing
                # All inputted SMILES must be valid
                # RDKit Mol objects to be encoded.
                if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                    continue

                # encode selfies
                selfies = sf.encoder(in_smiles)

                # if unable to encode SMILES, write to list of errors
                # and move on: there is nothing to decode
                if selfies is None:
                    error_list.append((in_smiles, ''))
                    continue

                # take encoded SELFIES and decode
                out_smiles = sf.decoder(selfies)

                # compare original SMILES to decoded SELFIE string.
                # if not the same string, write to list of errors.
                if not is_same_mol(in_smiles, out_smiles):
                    # str() keeps ','.join() below safe for non-str output
                    error_list.append((in_smiles, str(out_smiles)))

            # open and write all errors to errors_emolecule.csv
            with open(error_path, "a") as error_log:
                for error in error_list:
                    error_log.write(','.join(error) + "\n")
            error_found_flag = error_found_flag or bool(error_list)

            # create checkpoint from the current pandas reader chunk,
            # to load from and continue testing.
            with open(ckpt_path, 'w+') as ckpt_file:
                ckpt_file.write(str(chunk_idx))
    finally:
        # restore defaults even on failure so other tests are unaffected
        sf.set_semantic_constraints()

    # Only reached when every chunk was processed; the checkpoint is kept
    # on an interrupted run so the next run can resume.
    os.remove(ckpt_path)  # remove checkpoint

    assert not error_found_flag
Ejemplo n.º 8
def test_get_semantic_constraints():
    """Check that each call returns a distinct object containing '?'."""
    first = sf.get_semantic_constraints()
    second = sf.get_semantic_constraints()

    # two calls must not hand back the same object
    assert first is not second
    assert "?" in first
Ejemplo n.º 9
def test_roundtrip_translation(test_name, column_name, dataset_samples):
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.

    :param test_name: base name of the data set; the data is read from
        ``test_sets/<test_name>.txt`` next to this file.
    :param column_name: name of the column that holds the SMILES strings.
    :param dataset_samples: number of rows to sample at random. If it is
        not in the range ``1..N`` (N = number of data rows), all rows are
        used.

    Every input whose roundtrip fails is written to the error log, and the
    test fails if any error was recorded.
    """

    # modify semantic bond constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    # the edited mapping must be installed for encoder/decoder to use it
    sf.set_semantic_constraints(constraints)

    error_found_flag = False

    try:
        # file I/O
        curr_dir = os.path.dirname(__file__)
        test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
        # NOTE(review): file name reconstructed from the
        # "errors_{test_name}.csv" comment further down -- confirm.
        error_path = os.path.join(curr_dir, 'error_sets',
                                  "errors_{}.csv".format(test_name))

        # create error directory
        os.makedirs(os.path.dirname(error_path), exist_ok=True)

        # add header in error log text file
        with open(error_path, "w+") as error_log:
            error_log.write("In, Out\n")

        # make pandas reader
        # count the data rows (all lines minus the header) without
        # leaking the file handle
        with open(test_path) as test_file:
            N = sum(1 for _ in test_file) - 1
        S = dataset_samples if (0 < dataset_samples <= N) else N
        # skip a random selection of N - S data rows (row 0 is the header)
        skip = sorted(random.sample(range(1, N + 1), N - S))
        reader = pd.read_csv(test_path, chunksize=10000, header=0,
                             skiprows=skip)

        # roundtrip testing
        for chunk in reader:
            error_list = []

            for in_smiles in chunk[column_name]:
                # check if SMILES in chunk is a valid RDKit molecule.
                # if not, skip testing
                # All inputted SMILES must be valid
                # RDKit Mol objects to be encoded.
                if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                    continue

                # encode SELFIE string
                selfies = sf.encoder(in_smiles)

                # if unable to encode SMILES, write to list of errors
                # and move on: there is nothing to decode
                if selfies is None:
                    error_list.append((in_smiles, ''))
                    continue

                # take encoded SELFIES and decode
                out_smiles = sf.decoder(selfies)

                # compare original SMILES to decoded SELFIE string.
                # if not the same string, write to list of errors.
                if not is_same_mol(in_smiles, out_smiles):
                    error_list.append((in_smiles, str(out_smiles)))

            # open and write all errors to errors_{test_name}.csv
            with open(error_path, "a") as error_log:
                for error in error_list:
                    error_log.write(','.join(error) + "\n")
            error_found_flag = error_found_flag or bool(error_list)
    finally:
        # restore defaults even on failure so other tests are unaffected
        sf.set_semantic_constraints()

    assert not error_found_flag