# Example 1
def get_selfie_and_smiles_encodings_for_dataset(smiles_list):
    """Return encodings, alphabets and max lengths for SMILES and SELFIES.

    input:
        smiles_list: list of SMILES strings (one molecule per entry).
    output:
        - selfies encoding (SELFIES translation of each SMILES)
        - selfies alphabet (list of SELFIES symbols, includes '[nop]')
        - length of the longest SELFIES string (in symbols)
        - smiles encoding (the input list, unchanged)
        - smiles alphabet (character based, includes ' ' for padding)
        - length of the longest SMILES string (in characters)
    """
    # Character-level alphabet of the raw SMILES strings.
    smiles_alphabet = list(set(''.join(smiles_list)))
    smiles_alphabet.append(' ')  # for padding
    largest_smiles_len = len(max(smiles_list, key=len))

    print('--> Translating SMILES to SELFIES...')

    selfies_list = list(map(sf.encoder, smiles_list))

    # Symbol-level alphabet of the SELFIES strings; '[nop]' is the
    # no-operation symbol used for padding.
    all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
    all_selfies_symbols.add('[nop]')
    selfies_alphabet = list(all_selfies_symbols)

    largest_selfies_len = max(sf.len_selfies(s) for s in selfies_list)

    print('Finished translating SMILES to SELFIES.')

    return selfies_list, selfies_alphabet, largest_selfies_len, \
           smiles_list, smiles_alphabet, largest_smiles_len
def main(args):
    """Build a SELFIES alphabet from samples in ``args.directory`` and
    write it to ``args.output_file``, one symbol per line (the line
    number is the symbol's integer index)."""
    # Load Tokenizer
    print("Loading selfied in directory: {}.".format(args.directory))
    selfies = load_selfies(args.directory)
    print("Extracting alphabet from smiles samples")
    print("The longest sample in dataset is {}".format(
        max(sf.len_selfies(s) for s in selfies)))
    alphabet = sf.get_alphabet_from_selfies(selfies)
    # Special tokens needed by the tokenizer on top of the data-derived symbols.
    alphabet.update(('[start]', '[end]', '[pad]', '[unk]'))
    alphabet = list(alphabet)
    # One symbol per line; a reader can rebuild symbol<->index maps by line number.
    with open(args.output_file, 'w') as w:
        w.writelines("{}\n".format(symbol) for symbol in alphabet)
    print("Alphabet written")
# Example 3
def selfies_to_hot(selfie, largest_selfie_len, alphabet):
    """Convert a single SELFIES string to integer and one-hot encodings.

    Args:
        selfie: SELFIES string to encode.
        largest_selfie_len: target length (in symbols); shorter strings
            are padded with '[nop]'.
        alphabet: sequence of SELFIES symbols; position defines the
            integer code of each symbol.

    Returns:
        (integer_encoded, onehot_encoded): a list of symbol indices and
        a 2-D integer numpy array of shape (largest_selfie_len, len(alphabet)).

    Raises:
        KeyError: if the string contains a symbol not in `alphabet`.
    """
    symbol_to_int = dict((c, i) for i, c in enumerate(alphabet))

    # pad with [nop] — rebinding the local only, the caller's string is unchanged
    selfie += '[nop]' * (largest_selfie_len - sf.len_selfies(selfie))

    # integer encode
    integer_encoded = [symbol_to_int[symbol] for symbol in sf.split_selfies(selfie)]

    # one-hot encode: row k of the identity matrix is the unit vector
    # for symbol index k, so fancy-indexing builds the whole matrix at once
    onehot_encoded = np.eye(len(alphabet), dtype=int)[integer_encoded]

    return integer_encoded, onehot_encoded
# Example 4
def get_largest_selfie_len(smiles_list):
    """Translate every SMILES string to SELFIES and return the symbol
    length of the longest resulting SELFIES string."""
    return max(sf.len_selfies(sf.encoder(smiles)) for smiles in smiles_list)
def test_len_selfies(test_cases):
    """Check sf.len_selfies against the expected length of each case.

    `test_cases` maps a SELFIES string to a (length, ...) tuple; only
    the first element of the tuple is compared.
    """
    for selfies_str, expected in test_cases.items():
        expected_length = expected[0]
        assert sf.len_selfies(selfies_str) == expected_length
# Example 6
def test_len_selfies(dataset):
    """For each record in the first split of `dataset`, check that
    sf.len_selfies agrees with the record's symbol count."""
    first_split = dataset[0]
    for record in first_split:
        assert sf.len_selfies(record.selfies) == len(record.symbols)