def test_sampling_with_padding(self):
    smiles_string = Token.tokenize(Token.augment(self.smiles_string))
    n_steps = 20
    sampler = SMILESConsecutiveSampler(self.vocabulary.corpus, n_steps=n_steps)

    step_i = 0
    for sample in sampler:
        input_s = ''.join(smiles_string[step_i:step_i + n_steps])
        output_s = ''.join(smiles_string[step_i + 1:step_i + n_steps + 1])
        if sample.valid_length < n_steps:
            # Inputs lead outputs by one token, so they need one pad less.
            input_s += Token.PAD * (n_steps - sample.valid_length - 1)
            output_s += Token.PAD * (n_steps - sample.valid_length)

        self.assertEqual(
            input_s,
            ''.join(self.vocabulary.get_tokens(sample.inputs)),
        )
        self.assertEqual(
            output_s,
            ''.join(self.vocabulary.get_tokens(sample.outputs)),
        )

        step_i += n_steps
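# A worked illustration of the padding arithmetic above (hypothetical
# sizes, not taken from the fixtures): with 10 augmented tokens and
# n_steps=4, the final sample starts at step_i=8 and has only
# valid_length=1 real step. Its input slice holds the 2 remaining tokens
# (inputs lead outputs by one position), so it needs
# n_steps - valid_length - 1 == 2 pad tokens, while its output slice
# holds 1 token and needs n_steps - valid_length == 3.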
def test_crop_with_padding(self):
    self.assertEqual(
        self.smiles,
        Token.crop(self.aug_w_pad, padding=True),
    )
    self.assertEqual(
        self.smiles,
        Token.crop(self.smiles),
    )
def test_2_tokens_are_valid(self):
    # Use a set literal so the whole unknown-token string is excluded,
    # not its individual characters.
    valid_tokens = Token.get_all_tokens() - {Token.UNK}
    for _ in range(100):
        smiles = self.predictor()
        for token in Token.tokenize(smiles):
            # Aromatic (lowercase) single-character atoms are listed in
            # their uppercase form.
            if len(token) == 1 and token.islower():
                token = token.upper()
            self.assertIn(token, valid_tokens)
def test_augment_with_padding(self):
    self.assertEqual(
        self.aug_w_pad,
        Token.augment(self.smiles, padding_len=self.n_pad),
    )
    self.assertEqual(
        self.aug_no_pad,
        Token.augment(self.smiles, padding_len=0),
    )
    self.assertEqual(
        self.aug_w_pad,
        Token.augment(self.aug_w_pad),
    )
def test_crop(self):
    self.assertEqual(
        self.smiles,
        Token.crop(self.aug_no_pad),
    )
    self.assertEqual(
        self.smiles,
        Token.crop((Token.BOS * 5) + self.aug_no_pad),
    )
    self.assertEqual(
        self.smiles,
        Token.crop(self.aug_no_pad + (Token.EOS * 3)),
    )
    self.assertEqual(
        self.smiles,
        Token.crop((Token.BOS * 4) + self.aug_no_pad + (Token.EOS * 4)),
    )
def test_augment(self):
    self.assertEqual(
        self.aug_no_pad,
        Token.augment(self.smiles),
    )
    self.assertEqual(
        self.aug_no_pad,
        Token.augment((Token.BOS * 5) + self.aug_no_pad),
    )
    self.assertEqual(
        self.aug_no_pad,
        Token.augment(self.aug_no_pad + (Token.EOS * 3)),
    )
    self.assertEqual(
        self.aug_no_pad,
        Token.augment((Token.BOS * 4) + self.aug_no_pad + (Token.EOS * 4)),
    )
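# Round-trip sketch (illustration only; 'CCO' and the single-character
# markers are assumptions, not fixtures from this suite): if
# Token.BOS == '{' and Token.EOS == '}', then
# Token.augment('CCO') == '{CCO}' and Token.crop('{CCO}') == 'CCO',
# i.e. Token.crop(Token.augment(s)) == s for any plain SMILES s,
# and both operations collapse repeated leading/trailing markers.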
def test_2_getitem(self):
    for i in range(len(self.only_smiles)):
        self.assertEqual(
            self.only_smiles[i],
            # `augment` was set to `True` during init, so crop the
            # special tokens back off before comparing.
            Token.crop(self.dataset[i]),
        )
def test_corpus(self):
    # Original SMILES list without padded special tokens.
    smiles_list = self.temp_file.smiles_strings.split('\n')

    self.assertEqual(len(self.vocab.corpus), len(smiles_list))

    for idx, tokens in zip(self.vocab.corpus, smiles_list):
        # Add special tokens so the strings match the loaded corpus used
        # for data sampling and model fitting.
        tokens = Token.augment(tokens)
        # Test id-to-token mapping.
        self.assertEqual(
            ''.join(self.vocab.get_tokens(idx)),
            tokens,
        )
        # Test token-to-id mapping.
        self.assertListEqual(idx, self.vocab[Token.tokenize(tokens)])
def test_contains(self):
    self.assertNotIn(Token.UNK, self.vocab)

    all_tokens = Token.get_all_tokens()
    for token in self.vocab:
        # Aromatic (lowercase) single-character atoms are listed in
        # their uppercase form.
        if len(token) == 1 and token.islower():
            token = token.upper()
        self.assertIn(token, all_tokens)
def test_sampling_without_padding(self):
    tokens = Token.tokenize(Token.augment(self.smiles_string))
    n_steps = len(tokens) - 1
    sampler = SMILESConsecutiveSampler(self.vocabulary.corpus, n_steps=n_steps)

    step_i = 0
    for n_samples, sample in enumerate(sampler, start=1):
        self.assertListEqual(
            tokens[step_i:step_i + n_steps],
            self.vocabulary.get_tokens(sample.inputs),
        )
        self.assertListEqual(
            tokens[step_i + 1:step_i + n_steps + 1],
            self.vocabulary.get_tokens(sample.outputs),
        )
        self.assertEqual(n_steps, sample.valid_length)

        step_i += n_steps

    self.assertEqual(n_samples, 1)
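# Minimal usage sketch of the sampler under test (hypothetical
# `vocabulary` fixture; mirrors the loop above): every sample pairs an
# id sequence with the same sequence shifted one step forward.
#
#     sampler = SMILESConsecutiveSampler(vocabulary.corpus, n_steps=8)
#     for sample in sampler:
#         model_inputs = sample.inputs        # ids at steps t .. t+7
#         model_targets = sample.outputs      # ids at steps t+1 .. t+8
#         n_real_steps = sample.valid_length  # == 8 unless the tail is padded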
def test_1_read(self):
    self.assertTrue(
        all(
            s.startswith(Token.BOS) and s.endswith(Token.EOS)
            for s in self.dataset
        )
    )
    self.assertListEqual(
        self.item_list,
        [Token.crop(s) for s in self.dataset],
    )
    self.assertEqual(
        len(self.item_list),
        len(self.dataset),
    )
def test_tokens_and_idx(self):
    self.assertSequenceEqual(
        # Tokenize the entire dataset to get a set of unique tokens.
        sorted(
            set(
                Token.tokenize(
                    self.temp_file.smiles_strings.replace('\n', '')
                )
            )
        ),
        # The temporary file is not augmented with the special tokens.
        sorted(set(self.vocab.token_to_idx) - Token.SPECIAL),
    )
    self.assertSequenceEqual(
        sorted(
            set(self.vocab.token_to_idx)
            # The pad and unknown tokens do not appear in the original set.
            - {Token.PAD, Token.UNK}
        ),
        sorted(set(self.vocab.token_freqs)),
    )
def test(smiles, match_bracket_atoms=False):
    self.assertListEqual(
        Token.tokenize(''.join(smiles), match_bracket_atoms),
        smiles,
    )
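# Hypothetical invocations of the helper above (the token lists are
# illustrative, not fixtures): the round trip only holds when the input
# list is already split exactly as the tokenizer would split it.
#
#     test(['C', 'C', '(', '=', 'O', ')', 'O'])
#     test(['[C@@H]'], match_bracket_atoms=True)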