import numpy as np
import selfies as sf


def get_selfie_and_smiles_encodings_for_dataset(smiles_list):
    """Returns encoding, alphabet and length of largest molecule in SMILES and
    SELFIES, given a list of SMILES molecules.

    input:
        list of SMILES strings.
    output:
        - selfies encoding
        - selfies alphabet
        - longest selfies string
        - smiles encoding (equivalent to the input list)
        - smiles alphabet (character based)
        - longest smiles string
    """
    # To read the molecules from a csv file instead (column name must be
    # 'smiles'):
    # df = pd.read_csv(file_path)
    # smiles_list = np.asanyarray(df.smiles)

    smiles_alphabet = list(set(''.join(smiles_list)))
    smiles_alphabet.append(' ')  # for padding

    largest_smiles_len = len(max(smiles_list, key=len))

    print('--> Translating SMILES to SELFIES...')
    selfies_list = list(map(sf.encoder, smiles_list))

    all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
    all_selfies_symbols.add('[nop]')  # padding symbol
    selfies_alphabet = list(all_selfies_symbols)

    largest_selfies_len = max(sf.len_selfies(s) for s in selfies_list)

    print('Finished translating SMILES to SELFIES.')
    return selfies_list, selfies_alphabet, largest_selfies_len, \
        smiles_list, smiles_alphabet, largest_smiles_len
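# A minimal usage sketch for the function above, assuming a small in-memory
# SMILES list; the three molecule strings are illustrative placeholders.
demo_smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']
(selfies_list, selfies_alphabet, largest_selfies_len,
 smiles_list, smiles_alphabet, largest_smiles_len) = \
    get_selfie_and_smiles_encodings_for_dataset(demo_smiles)
print(selfies_alphabet)  # symbol set of the dataset, including '[nop]'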
def main(args):
    # Load the SELFIES samples used to build the tokenizer alphabet.
    print("Loading SELFIES in directory: {}.".format(args.directory))
    selfies = load_selfies(args.directory)

    print("Extracting alphabet from SELFIES samples")
    print("The longest sample in the dataset is {}".format(
        max(sf.len_selfies(s) for s in selfies)))

    alphabet = sf.get_alphabet_from_selfies(selfies)
    alphabet.add('[start]')
    alphabet.add('[end]')
    alphabet.add('[pad]')
    alphabet.add('[unk]')
    alphabet = list(alphabet)

    # Forward and reverse symbol/index mappings.
    symbol_to_idx = {s: i for i, s in enumerate(alphabet)}
    idx_to_symbol = {i: s for i, s in enumerate(alphabet)}

    # Write the alphabet to disk, one symbol per line, in index order.
    with open(args.output_file, 'w') as w:
        for i in range(len(idx_to_symbol)):
            w.write("{}\n".format(idx_to_symbol[i]))
    print("Alphabet written")
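# Hedged sketch of a command-line entry point for main(). The argument names
# mirror the attributes used above (args.directory, args.output_file); the
# help strings are assumptions, not taken from the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract a SELFIES alphabet from a dataset.")
    parser.add_argument('--directory', required=True,
                        help="Directory containing the SELFIES samples.")
    parser.add_argument('--output_file', required=True,
                        help="Destination file, one symbol per line.")
    main(parser.parse_args())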
def selfies_to_hot(selfie, largest_selfie_len, alphabet):
    """Go from a single SELFIES string to a one-hot encoding."""
    symbol_to_int = dict((c, i) for i, c in enumerate(alphabet))

    # pad with [nop]
    selfie += '[nop]' * (largest_selfie_len - sf.len_selfies(selfie))

    # integer encode
    symbol_list = sf.split_selfies(selfie)
    integer_encoded = [symbol_to_int[symbol] for symbol in symbol_list]

    # one-hot encode the integer-encoded selfie
    onehot_encoded = list()
    for index in integer_encoded:
        letter = [0] * len(alphabet)
        letter[index] = 1
        onehot_encoded.append(letter)

    return integer_encoded, np.array(onehot_encoded)
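# Minimal usage sketch for selfies_to_hot, assuming a toy alphabet built from
# a single encoded molecule plus the '[nop]' padding symbol.
example = sf.encoder('CCO')  # '[C][C][O]'
toy_alphabet = sorted(sf.get_alphabet_from_selfies([example]) | {'[nop]'})
ints, onehot = selfies_to_hot(example, 5, toy_alphabet)
# onehot has shape (5, len(toy_alphabet)): 3 real symbols + 2 '[nop]' pads.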
def get_largest_selfie_len(smiles_list):
    """Returns the length of the largest SELFIES string from a list of SMILES."""
    selfies_list = list(map(sf.encoder, smiles_list))
    return max(sf.len_selfies(s) for s in selfies_list)
def test_len_selfies(test_cases):
    for s, (length, _) in test_cases.items():
        assert sf.len_selfies(s) == length
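# A hedged example of the fixture shape inferred from the loop above: a dict
# mapping each SELFIES string to its expected (length, symbols) pair. The
# entries are illustrative, not taken from the repo's test data.
example_test_cases = {
    '[C][C][O]': (3, ['[C]', '[C]', '[O]']),
    '[C][=C]': (2, ['[C]', '[=C]']),
}
test_len_selfies(example_test_cases)  # sf.len_selfies matches each length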
def test_len_selfies(dataset):
    for entry in dataset[0]:
        assert sf.len_selfies(entry.selfies) == len(entry.symbols)
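# Hedged sketch of the fixture shape this test expects: dataset[0] is an
# iterable of entries, each with a .selfies string and a .symbols list. The
# Entry namedtuple is an illustrative stand-in, not the repo's own type.
from collections import namedtuple

Entry = namedtuple('Entry', ['selfies', 'symbols'])
example_dataset = ([Entry('[C][O]', ['[C]', '[O]'])],)
test_len_selfies(example_dataset)  # passes: sf.len_selfies('[C][O]') == 2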