Example no. 1
    def test_sampling_with_padding(self):
        tokens = Token.tokenize(Token.augment(self.smiles_string))
        n_steps = 20
        sampler = SMILESConsecutiveSampler(self.vocabulary.corpus,
                                           n_steps=n_steps)

        step_i = 0
        for sample in sampler:
            input_s = ''.join(tokens[step_i:step_i + n_steps])
            output_s = ''.join(tokens[step_i + 1:step_i + n_steps + 1])

            if sample.valid_length < n_steps:
                # The output slice runs one token further past the end of the
                # sequence, so it needs one more pad than the input slice.
                input_s += Token.PAD * (n_steps - sample.valid_length - 1)
                output_s += Token.PAD * (n_steps - sample.valid_length)

            self.assertEqual(
                input_s,
                ''.join(self.vocabulary.get_tokens(sample.inputs)),
            )
            self.assertEqual(
                output_s,
                ''.join(self.vocabulary.get_tokens(sample.outputs)),
            )

            step_i += n_steps
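The pad arithmetic in the branch above can be mirrored by a small standalone generator. The following is only a hypothetical sketch of the windowing behaviour the test expects, not the `SMILESConsecutiveSampler` implementation; the `'_'` pad marker is an illustrative placeholder rather than the library's actual `Token.PAD` value.

def padded_windows(tokens, n_steps, pad='_'):
    """Yield (inputs, outputs, valid_length) over non-overlapping windows.

    Outputs are the inputs shifted one token to the right; the final,
    possibly short, window is topped up with the pad marker.
    """
    for start in range(0, len(tokens) - 1, n_steps):
        inputs = tokens[start:start + n_steps]
        outputs = tokens[start + 1:start + n_steps + 1]
        valid_length = len(outputs)
        # Near the end of the sequence the output slice is one token shorter
        # than the input slice, hence one extra pad for the outputs.
        inputs = inputs + [pad] * (n_steps - len(inputs))
        outputs = outputs + [pad] * (n_steps - len(outputs))
        yield inputs, outputs, valid_length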
Example no. 2
    def test_crop_without_padding(self):
        self.assertEqual(
            self.smiles,
            Token.crop(self.aug_w_pad, padding=True),
        )
        self.assertEqual(
            self.smiles,
            Token.crop(self.smiles),
        )
Example no. 3
    def test_2_tokens_are_valid(self):
        valid_tokens = Token.get_all_tokens() - {Token.UNK}

        for _ in range(100):
            smiles = self.predictor()

            for token in Token.tokenize(smiles):
                # Aromatic atoms are lowercase in SMILES; check the uppercase form.
                if len(token) == 1 and token.islower():
                    token = token.upper()

                self.assertIn(token, valid_tokens)
Example no. 4
    def test_augment_without_padding(self):
        self.assertEqual(
            self.aug_w_pad,
            Token.augment(self.smiles, padding_len=self.n_pad),
        )
        self.assertEqual(
            self.aug_no_pad,
            Token.augment(self.smiles, padding_len=0),
        )
        self.assertEqual(
            self.aug_w_pad,
            Token.augment(self.aug_w_pad),
        )
Example no. 5
    def test_crop(self):
        self.assertEqual(
            self.smiles,
            Token.crop(self.aug_no_pad),
        )
        self.assertEqual(
            self.smiles,
            Token.crop((Token.BOS * 5) + self.aug_no_pad),
        )
        self.assertEqual(
            self.smiles,
            Token.crop(self.aug_no_pad + (Token.EOS * 3)),
        )
        self.assertEqual(
            self.smiles,
            Token.crop((Token.BOS * 4) + self.aug_no_pad + (Token.EOS * 4)),
        )
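The four assertions above pin down only observable behaviour: any run of leading begin-of-SMILES markers and trailing end-of-SMILES markers is stripped, leaving the bare SMILES string. A minimal sketch of such a helper, assuming single-character markers; the '{' and '}' defaults are illustrative placeholders, not necessarily the library's actual Token.BOS/Token.EOS values.

def crop_sketch(smiles: str, bos: str = '{', eos: str = '}') -> str:
    # Strip every leading BOS marker and every trailing EOS marker.
    return smiles.lstrip(bos).rstrip(eos)


assert crop_sketch('{{{{CCO}}}') == 'CCO'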
Example no. 6
    def test_augment(self):
        self.assertEqual(
            self.aug_no_pad,
            Token.augment(self.smiles),
        )
        self.assertEqual(
            self.aug_no_pad,
            Token.augment((Token.BOS * 5) + self.aug_no_pad),
        )
        self.assertEqual(
            self.aug_no_pad,
            Token.augment(self.aug_no_pad + (Token.EOS * 3)),
        )
        self.assertEqual(
            self.aug_no_pad,
            Token.augment((Token.BOS * 4) + self.aug_no_pad + (Token.EOS * 4)),
        )
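Likewise, the augment assertions imply a normalisation to exactly one leading BOS and one trailing EOS, however many markers the input already carries. A hypothetical sketch under the same single-character-marker assumption (padding, covered in Example no. 4, is left out):

def augment_sketch(smiles: str, bos: str = '{', eos: str = '}') -> str:
    # Drop any existing markers first, then wrap the bare SMILES exactly once.
    return bos + smiles.lstrip(bos).rstrip(eos) + eos


assert augment_sketch('CCO') == '{CCO}'
assert augment_sketch('{{{{{CCO') == '{CCO}'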
Example no. 7
    def test_2_getitem(self):
        for i in range(len(self.only_smiles)):
            self.assertEqual(
                self.only_smiles[i],
                # `augment` was set to `True` during init.
                Token.crop(self.dataset[i]),
            )
Example no. 8
    def test_corpus(self):
        # Original SMILES list without padded special tokens.
        smiles_list = self.temp_file.smiles_strings.split('\n')

        self.assertEqual(len(self.vocab.corpus), len(smiles_list))

        for ids, smiles in zip(self.vocab.corpus, smiles_list):
            # Add the special tokens so that the string corresponds to the
            # loaded corpus used for data sampling and model fitting.
            smiles = Token.augment(smiles)
            # Test id-to-token mapping.
            self.assertEqual(
                ''.join(self.vocab.get_tokens(ids)),
                smiles,
            )
            # Test token-to-id mapping.
            self.assertListEqual(ids, self.vocab[Token.tokenize(smiles)])
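The round trip this test relies on, id list to tokens and back, can be pictured with a toy two-way mapping. This stand-in is for illustration only and is not the project's vocabulary class:

class ToyVocabulary:
    """Minimal token/index mapping mirroring `get_tokens` and `__getitem__` above."""

    def __init__(self, tokens):
        self.idx_to_token = sorted(set(tokens))
        self.token_to_idx = {t: i for i, t in enumerate(self.idx_to_token)}

    def __getitem__(self, tokens):
        # Token-to-id mapping.
        return [self.token_to_idx[t] for t in tokens]

    def get_tokens(self, ids):
        # Id-to-token mapping.
        return [self.idx_to_token[i] for i in ids]


vocab = ToyVocabulary(['C', 'O', 'N', '(', ')'])
assert vocab.get_tokens(vocab[['C', 'O']]) == ['C', 'O']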
Example no. 9
    def test_contains(self):
        self.assertNotIn(Token.UNK, self.vocab)

        all_tokens = Token.get_all_tokens()

        for token in self.vocab:
            if len(token) == 1 and token.islower():
                token = token.upper()
            self.assertIn(token, all_tokens)
Example no. 10
    def test_sampling_without_padding(self):
        tokens = Token.tokenize(Token.augment(self.smiles_string))
        n_steps = len(tokens) - 1  # Exactly one full window: inputs tokens[:-1], outputs tokens[1:].
        sampler = SMILESConsecutiveSampler(self.vocabulary.corpus,
                                           n_steps=n_steps)

        step_i = 0
        for n_samples, sample in enumerate(sampler, start=1):
            self.assertListEqual(
                tokens[step_i:step_i + n_steps],
                self.vocabulary.get_tokens(sample.inputs),
            )
            self.assertListEqual(
                tokens[step_i + 1:step_i + n_steps + 1],
                self.vocabulary.get_tokens(sample.outputs),
            )
            self.assertEqual(n_steps, sample.valid_length)

            step_i += n_steps

        self.assertEqual(n_samples, 1)
Example no. 11
    def test_1_read(self):
        self.assertTrue(
            all(
                s.startswith(Token.BOS) and s.endswith(Token.EOS)
                for s in self.dataset))
        self.assertListEqual(
            self.item_list,
            [Token.crop(s) for s in self.dataset],
        )

        self.assertEqual(
            len(self.item_list),
            len(self.dataset),
        )
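Examples no. 7 and no. 11 both assume a dataset that wraps every stored SMILES with BOS/EOS on read, so that Token.crop recovers the original string. A toy stand-in, with hypothetical '{'/'}' markers, showing only that round trip (not the project's actual dataset class):

class ToySMILESDataset:
    def __init__(self, smiles_strings, bos='{', eos='}'):
        self._items = list(smiles_strings)
        self._bos, self._eos = bos, eos

    def __len__(self):
        return len(self._items)

    def __getitem__(self, i):
        # Augment on read: wrap the stored string with the special markers.
        return self._bos + self._items[i] + self._eos


dataset = ToySMILESDataset(['CCO', 'CCN'])
assert all(s.startswith('{') and s.endswith('}') for s in dataset)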
Example no. 12
    def test_tokens_and_idx(self):
        self.assertSequenceEqual(
            # Tokenize the entire dataset to get a set of unique tokens.
            sorted(
                set(
                    Token.tokenize(
                        self.temp_file.smiles_strings.replace('\n', '')))),
            # The temporary file is not augmented with the special tokens.
            sorted(set(self.vocab.token_to_idx) - Token.SPECIAL),
        )
        self.assertSequenceEqual(
            sorted(
                set(self.vocab.token_to_idx)
                # The pad and unknown tokens do not appear in the original set.
                - {Token.PAD, Token.UNK}),
            sorted(set(self.vocab.token_freqs)),
        )
Example no. 13
    # Helper presumably nested in a test method; `self` is the enclosing TestCase.
    def test(smiles, match_bracket_atoms=False):
        self.assertListEqual(
            Token.tokenize(''.join(smiles), match_bracket_atoms),
            smiles,
        )
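Inside the enclosing test method, the helper is presumably invoked with a pre-tokenized SMILES, so joining and re-tokenizing should round-trip, e.g. (assuming every token of this molecule is a single character):

test(['C', 'C', 'O'])  # hypothetical call; ethanol, all single-character tokens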