def test_get_semantic_constraints():
    """Checks the basic contract of selfies.get_semantic_constraints().
    """

    # Two successive calls must hand back distinct objects, never an alias
    first = sf.get_semantic_constraints()
    second = sf.get_semantic_constraints()
    assert first is not second

    # The '?' key must be present in the returned constraints
    constraints = sf.get_semantic_constraints()
    assert '?' in constraints
def test_change_constraints_cache_clear():
    """Tests that changing the semantic constraints is reflected in both the
    semantic robust alphabet and the decoder output.

    The constraints are module-level state in selfies, so they are restored
    in a ``finally`` clause: a failing assertion must not leak the modified
    constraints into tests that run afterwards.
    """

    # baseline behaviour under the constraints currently in effect
    alphabet = sf.get_semantic_robust_alphabet()
    assert alphabet == sf.get_semantic_robust_alphabet()
    assert sf.decoder("[C][#C]") == "C#C"

    # restrict carbon to a single bond
    new_constraints = sf.get_semantic_constraints()
    new_constraints["C"] = 1
    sf.set_semantic_constraints(new_constraints)

    try:
        # both the alphabet and the decoder must pick up the new constraints
        new_alphabet = sf.get_semantic_robust_alphabet()
        assert new_alphabet != alphabet
        assert sf.decoder("[C][#C]") == "CC"
    finally:
        sf.set_semantic_constraints()  # re-set alphabet
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained.

    The modified constraints are reset in a ``finally`` clause so that a
    failing assertion cannot leave them active for later tests.
    """

    # [Xe-2] followed by eight fluorine branches
    f_branch = "[Branch1][C][F]"
    s = "[Xe-2]" + (f_branch * 8)
    assert decode_eq(s, "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF")

    # change default semantic constraints
    constraints = sf.get_semantic_constraints()
    constraints["?"] = 2
    sf.set_semantic_constraints(constraints)

    try:
        # with "?" set to 2, the same SELFIES decodes to a shorter SMILES
        assert decode_eq(s, "[Xe-2](F)CF")
    finally:
        sf.set_semantic_constraints()
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly.

    The modified constraints are reset in a ``finally`` clause so that a
    failing assertion cannot leave them active for later tests.
    """

    constraints = sf.get_semantic_constraints()
    constraints["Sn+4"] = 1
    constraints["O-2"] = 2
    sf.set_semantic_constraints(constraints)

    try:
        # the following molecules don't make sense, but we use them to test
        # selfies. Hence, we can't verify them with RDKit
        assert decode_eq("[Sn+4][=C]", "[Sn+4]C")
        assert decode_eq("[O-2][#C]", "[O-2]=C")

        # mixing many symbol types
        assert decode_eq("[17O@@H1-2][#C]", "[17O@@H1-2]C")
    finally:
        sf.set_semantic_constraints()
def test_charged_symbols():
    """Tests that SELFIES symbols with charges are constrained properly.

    Both spellings of a charge (repeated signs and sign plus count) are
    decoded. The modified constraints are reset in a ``finally`` clause so
    that a failing assertion cannot leave them active for later tests.
    """

    constraints = sf.get_semantic_constraints()
    constraints['Sn+4'] = 1
    constraints['O-2'] = 2
    sf.set_semantic_constraints(constraints)

    try:
        # the following molecules don't make sense, but we use them to test
        # selfies. Hence, we can't verify them with RDKit
        assert sf.decoder("[Sn++++expl][=C]") == "[Sn++++]C"
        assert sf.decoder("[Sn+4expl][=C]") == "[Sn+4]C"
        assert sf.decoder("[O--expl][#C]") == "[O--]=C"
        assert sf.decoder("[O-2expl][#C]") == "[O-2]=C"

        # mixing many symbol types
        assert sf.decoder("[17O@@H-2expl][#C]") == "[17O@@H-2]C"
    finally:
        sf.set_semantic_constraints()
def test_unconstrained_symbols():
    """Tests SELFIES with symbols that are not semantically constrained.

    The same SELFIES string is decoded before and after changing the "?"
    constraint. The modified constraints are reset in a ``finally`` clause
    so that a failing assertion cannot leave them active for later tests.
    """

    # [Xe-2expl] followed by eight fluorine branches; built once and reused
    # for both assertions instead of repeating the literal
    f_branch = "[Branch1_1][C][F]"
    s = "[Xe-2expl]" + (f_branch * 8)
    assert sf.decoder(s) == "[Xe-2](F)(F)(F)(F)(F)(F)(F)CF"

    # change default semantic constraints
    constraints = sf.get_semantic_constraints()
    constraints['?'] = 2
    sf.set_semantic_constraints(constraints)

    try:
        assert sf.decoder(s) == "[Xe-2](F)CF"
    finally:
        sf.set_semantic_constraints()
def test_roundtrip_translation():
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES stored in the gzip-compressed file at ``EMOL_PATH``.

    The file is read in chunks of 10000 rows. After every chunk the
    mismatches are appended to the error log and the chunk index is written
    to a checkpoint file, so an interrupted run resumes after the last
    completed chunk. The checkpoint is removed once all chunks have been
    processed.

    The semantic constraints are module-level state in selfies, so they are
    restored in a ``finally`` clause even if the run is interrupted.
    """

    # modify constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    constraints['Br'] = 7
    constraints['Cl'] = 7
    constraints['I'] = 7
    sf.set_semantic_constraints(constraints)

    try:
        # file I/O
        ckpt_path = os.path.join(curr_dir, 'checkpoints',
                                 'emolecule_ckpt.txt')
        error_path = os.path.join(curr_dir, 'error_sets',
                                  'errors_emolecules.csv')

        # check if a previous checkpoint exists to continue tests
        if os.path.exists(ckpt_path):
            with open(ckpt_path, 'r') as ckpt_file:
                checkpoint = int(ckpt_file.readlines()[0])

        # if no path to a checkpoint exists,
        # create a new directory for error logging and checkpoints
        else:
            os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
            os.makedirs(os.path.dirname(error_path), exist_ok=True)

            with open(error_path, "w+") as error_log:
                error_log.write("In, Out\n")
            checkpoint = -1

        error_list = []
        error_found_flag = False

        # make pandas reader
        reader = pd.read_csv(EMOL_PATH,
                             chunksize=10000,
                             compression='gzip',
                             delimiter=' ',
                             header=0)

        # roundtrip testing
        for chunk_idx, chunk in enumerate(reader):

            # chunks up to the checkpoint were handled by an earlier run
            if chunk_idx <= checkpoint:
                continue

            for in_smiles in chunk[COL_NAME]:

                # check if SMILES in chunk is a valid RDKit molecule.
                # if not, skip testing
                # All inputted SMILES must be valid
                # RDKit Mol objects to be encoded.
                if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                    continue

                # encode selfies
                selfies = sf.encoder(in_smiles)

                # if unable to encode SMILES, write to list of errors
                if selfies is None:
                    error_list.append((in_smiles, ''))
                    continue

                # take encoded SELFIES and decode
                out_smiles = sf.decoder(selfies)

                # compare original SMILES to decoded SELFIE string.
                # if not the same string, write to list of errors.
                # str() keeps the later ','.join() from raising TypeError
                # should the decoder return a non-string such as None.
                if not is_same_mol(in_smiles, out_smiles):
                    error_list.append((in_smiles, str(out_smiles)))

            # open and write all errors to errors_emolecule.csv
            with open(error_path, "a") as error_log:
                error_log.writelines(','.join(error) + "\n"
                                     for error in error_list)
            error_found_flag = error_found_flag or bool(error_list)
            error_list = []

            # create checkpoint from the current pandas reader chunk,
            # to load from and continue testing.
            with open(ckpt_path, 'w+') as ckpt_file:
                ckpt_file.write(str(chunk_idx))
    finally:
        sf.set_semantic_constraints()  # restore defaults

    # remove checkpoint; it is absent when no chunk was processed at all
    if os.path.exists(ckpt_path):
        os.remove(ckpt_path)

    assert not error_found_flag
def test_get_semantic_constraints():
    """Checks that selfies.get_semantic_constraints() returns a fresh object
    on every call and that the result contains the "?" key.
    """
    constraints = sf.get_semantic_constraints()
    another = sf.get_semantic_constraints()

    assert constraints is not another  # not alias
    assert "?" in constraints
def test_roundtrip_translation(test_name, column_name, dataset_samples):
    """Tests a roundtrip SMILES -> SELFIES -> SMILES translation of the
    SMILES examples in QM9, NonFullerene, Zinc, etc.

    :param test_name: base name of the data set; the SMILES are read from
        ``test_sets/<test_name>.txt`` and mismatches are logged to
        ``error_sets/errors_<test_name>.csv``, both relative to this file.
    :param column_name: name of the column holding the SMILES strings.
    :param dataset_samples: number of rows to sample at random from the data
        set. Values outside ``(0, N]``, where ``N`` is the number of data
        rows, cause every row to be tested.

    The semantic constraints are module-level state in selfies, so they are
    restored in a ``finally`` clause even if the run fails part-way.
    """

    # modify semantic bond constraints
    constraints = sf.get_semantic_constraints()
    constraints['N'] = 6
    sf.set_semantic_constraints(constraints)

    try:
        # file I/O
        curr_dir = os.path.dirname(__file__)
        test_path = os.path.join(curr_dir, 'test_sets', test_name + ".txt")
        error_path = os.path.join(curr_dir, 'error_sets',
                                  "errors_{}.csv".format(test_name))

        # create error directory
        os.makedirs(os.path.dirname(error_path), exist_ok=True)
        error_list = []

        # add header in error log text file
        with open(error_path, "w+") as error_log:
            error_log.write("In, Out\n")
        error_found_flag = False

        # count the data rows (every line except the header); the context
        # manager makes sure the file handle is closed again
        with open(test_path) as test_file:
            N = sum(1 for _ in test_file) - 1

        # make pandas reader, skipping a random selection of N - S data rows
        # (row 0 is the header and is never skipped)
        S = dataset_samples if (0 < dataset_samples <= N) else N
        skip = sorted(random.sample(range(1, N + 1), N - S))
        reader = pd.read_csv(test_path,
                             chunksize=10000,
                             header=0,
                             skiprows=skip)

        # roundtrip testing
        for chunk in reader:
            for in_smiles in chunk[column_name]:

                # check if SMILES in chunk is a valid RDKit molecule.
                # if not, skip testing
                # All inputted SMILES must be valid
                # RDKit Mol objects to be encoded.
                if (MolFromSmiles(in_smiles) is None) or ('*' in in_smiles):
                    continue

                # encode SELFIE string
                selfies = sf.encoder(in_smiles)

                # if unable to encode SMILES, write to list of errors
                if selfies is None:
                    error_list.append((in_smiles, ''))
                    continue

                # take encoded SELFIES and decode
                out_smiles = sf.decoder(selfies)

                # compare original SMILES to decoded SELFIE string.
                # if not the same string, write to list of errors.
                if not is_same_mol(in_smiles, out_smiles):
                    error_list.append((in_smiles, str(out_smiles)))

            # open and write all errors to errors_{test_name}.csv
            with open(error_path, "a") as error_log:
                error_log.writelines(','.join(error) + "\n"
                                     for error in error_list)
            error_found_flag = error_found_flag or bool(error_list)
            error_list = []
    finally:
        sf.set_semantic_constraints()  # restore defaults

    assert not error_found_flag