    def test_untokenise_smiles_with_simplify_rings(self, smiles_str, split,
                                                   tokens):
        token_list = ['?', 'C', '1', '2', '%12', '%']

        tokeniser = smiles.SmilesTokeniser(token_list,
                                           splitting_method=split,
                                           simplify_rings=True)
        smiles_result = tokeniser.untokenise_smiles(tokens)
        assert smiles_result == smiles_str

    def test_tokenise_smiles_all_tokens(self, smiles_str, tokens):
        token_list = [
            '?', 'C', 'N', 'O', 'Cl', 'Br', '[Pt]', '(', ')', '=', '1', '2',
            '3', '%10'
        ]

        tokeniser = smiles.SmilesTokeniser(token_list)
        tokens_result = tokeniser.tokenise_smiles(smiles_str)
        assert tokens_result == tokens

    def test_tokenise_smiles_with_sos_eos(self, smiles_str, token_sos,
                                          token_eos, tokens):
        token_list = ['C', 'N', 'O', 'Cl', 'Br', '<SOS>', '<EOS>']

        tokeniser = smiles.SmilesTokeniser(token_list,
                                           token_sos=token_sos,
                                           token_eos=token_eos)
        tokens_result = tokeniser.tokenise_smiles(smiles_str)
        assert tokens_result == tokens

    def test_tokenise_smiles_with_simplify_rings(self, smiles_str, split,
                                                 tokens):
        token_list = ['?', 'C', '1', '2', '3']

        tokeniser = smiles.SmilesTokeniser(token_list,
                                           splitting_method=split,
                                           simplify_rings=True)
        tokens_result = tokeniser.tokenise_smiles(smiles_str)
        assert tokens_result == tokens

    def test_tokenise_smiles_with_padding(self, smiles_str, token_pad, length,
                                          truncate, tokens):
        token_list = ['C', 'N', 'O', 'Cl', 'Br', '_', ' ']

        tokeniser = smiles.SmilesTokeniser(token_list,
                                           token_padding=token_pad,
                                           sequence_length=length,
                                           truncate_sequence=truncate)
        tokens_result = tokeniser.tokenise_smiles(smiles_str)
        assert tokens_result == tokens

    def test_tokenise_smiles_with_unknown_placeholder(self, smiles_str, split,
                                                      placeholder, tokens):
        token_list = ['C', 'Br', 'B', 'r', '?', '[Pt]', '[', 'P', 't', ']']

        tokeniser = smiles.SmilesTokeniser(token_list,
                                           splitting_method=split,
                                           token_unknown=placeholder)
        tokens_result = tokeniser.tokenise_smiles(smiles_str)
        assert tokens_result == tokens
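        # 'S' and 'N' are absent from the vocabulary, so the tokeniser
        # should report them as missing.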
        assert tokeniser.missing_tokens == {'S', 'N'}

    def test_tokenise_smiles_characters(self, smiles_str, tokens):
        token_list = [
            '?', 'C', 'N', 'O', 'l', 'B', '[', 'P', 't', ']', '(', ')', '=',
            '1', '2', '3', '%', '0', 'r'
        ]

        tokeniser = smiles.SmilesTokeniser(token_list,
                                           splitting_method='characters')
        tokens_result = tokeniser.tokenise_smiles(smiles_str)
        assert tokens_result == tokens

    def test_tokenise_smiles_halogens_only(self, smiles_str, tokens):
        token_list = [
            '?', 'C', 'N', 'O', 'Cl', 'Br', '[', 'P', 't', ']', '(', ')', '=',
            '1', '2', '3', '%', '0'
        ]

        tokeniser = smiles.SmilesTokeniser(token_list,
                                           splitting_method='halogens_only')
        tokens_result = tokeniser.tokenise_smiles(smiles_str)
        assert tokens_result == tokens

    def test_tokenise_smiles_with_pad_sos_eos(self, smiles_str, token_sos,
                                              token_eos, tokens):
        token_list = ['C', 'N', 'O', 'Cl', 'Br', '<SOS>', '<EOS>', '_']

        tokeniser = smiles.SmilesTokeniser(token_list,
                                           token_sos=token_sos,
                                           token_eos=token_eos,
                                           token_padding='_',
                                           sequence_length=7)
        tokens_result = tokeniser.tokenise_smiles(smiles_str)
        assert tokens_result == tokens

    def test_exception_if_seq_length_not_padding(self):
        token_list = ['?', 'C', '1', '2', '%12', '%']

        # Requesting a sequence_length without a token_padding is invalid.
        with pytest.raises(ValueError):
            _ = smiles.SmilesTokeniser(token_list, sequence_length=50)

    def test_exception_if_padding_not_seq_length(self):
        token_list = ['?', 'C', '1', '2', '%12', '%']

        # Supplying a token_padding without a sequence_length is invalid.
        with pytest.raises(ValueError):
            _ = smiles.SmilesTokeniser(token_list, token_padding='?')