Example #1
def setUp(self):
    words = ['the', '.', chr(256) + 't', '<S>', '</S>', '<UNK>']
    (_, tmp) = tempfile.mkstemp()
    with open(tmp, 'w') as fout:
        fout.write('\n'.join(words))
    self.vocab = UnicodeCharsVocabulary(tmp, 5)
    self._tmp = tmp
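
The 5 passed to the constructor is max_word_length: every word is rendered as a fixed row of 5 char ids. A minimal sketch of the id layout the later tests expect, assuming the usual bilm convention (ids 0-255 are the word's UTF-8 bytes, 256-260 are reserved markers); char_row is a hypothetical helper for illustration, not part of the library:

BOS_CHAR, EOS_CHAR = 256, 257   # begin/end-of-sentence pseudo-words
BOW_CHAR, EOW_CHAR = 258, 259   # begin/end-of-word markers
PAD_CHAR = 260                  # padding out to max_word_length

def char_row(word, max_word_length=5):
    # UTF-8 bytes, truncated to leave room for the begin/end markers
    body = list(word.encode('utf-8'))[:max_word_length - 2]
    ids = [BOW_CHAR] + body + [EOW_CHAR]
    return ids + [PAD_CHAR] * (max_word_length - len(ids))

print(char_row('th'))            # [258, 116, 104, 259, 260]
print(char_row(chr(256) + 't'))  # [258, 196, 128, 116, 259]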
Example #2
def test_original_dataset_implementation():
    """Trying to show how the original `LMDataset`
    and `BidirectionalLMDataset` works.
    """
    from bilm.data import LMDataset, BidirectionalLMDataset, \
         UnicodeCharsVocabulary

    test_prefix = 'data/test/violin_test.txt'
    vocab_path = 'dump/bilm_pretrain/vocab-2016-09-10.txt'

    vocabulary = UnicodeCharsVocabulary(vocab_path, max_word_length=50)
    dataset = LMDataset(test_prefix, vocabulary)
    a = dataset.iter_batches(batch_size=10, num_steps=50)
    b = next(a)
    print(f'Keys: {b.keys()}')
    for k, v in b.items():
        print(f'Shape of {k}: {v.shape}')

    print(vocabulary.decode(b['token_ids'][0]))
    print(vocabulary.decode(b['next_token_id'][0]))
    print(vocabulary.decode_chars(b['tokens_characters'][0]))

    # drop into an interactive shell for manual inspection, then hard-exit
    from IPython import embed
    embed()
    import os
    os._exit(1)
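
For reference, the printed shapes should follow the arguments: with batch_size=10, num_steps=50 and max_word_length=50, token_ids and next_token_id should come out as (10, 50) and tokens_characters as (10, 50, 50). A hedged variant of the loop that inspects a few batches without the interactive embed()/os._exit() stop:

def peek_batches(dataset, n_batches=2, batch_size=10, num_steps=50):
    # consume a few batches and report the shape of each array
    gen = dataset.iter_batches(batch_size=batch_size, num_steps=num_steps)
    for i in range(n_batches):
        batch = next(gen)
        for key, value in batch.items():
            print(f'batch {i}, {key}: {value.shape}')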
Example #3
def test_vocab_encode_chars_reverse(self):
    sentence = ' '.join(reversed(['th', 'thhhhh', chr(256) + 't']))
    vocab = UnicodeCharsVocabulary(self._tmp, 5)
    char_ids = vocab.encode_chars(sentence, reverse=True)
    expected = np.array(
        [[258, 256, 259, 260, 260], [258, 116, 104, 259, 260],
         [258, 116, 104, 104, 259], [258, 196, 128, 116, 259],
         [258, 257, 259, 260, 260]],
        dtype=np.int32)[::-1, :]
    self.assertTrue((char_ids == expected).all())
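
The trailing [::-1, :] encodes the invariant under test: with reverse=True the row order of the whole sentence flips (EOS row first, BOS row last), while the char ids inside each row keep their forward layout. Assuming that semantics, the two calls relate as in this sketch (a fragment reusing vocab from above):

words = ['th', 'thhhhh', chr(256) + 't']
forward = vocab.encode_chars(' '.join(words))
backward = vocab.encode_chars(' '.join(reversed(words)), reverse=True)
# reversing the sentence and setting reverse=True flips every row
assert (backward == forward[::-1, :]).all()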
Example #4
class TestUnicodeCharsVocabulary(unittest.TestCase):
    def setUp(self):
        words = ['the', '.', chr(256) + 't', '<S>', '</S>', '<UNK>']
        (_, tmp) = tempfile.mkstemp()
        with open(tmp, 'w') as fout:
            fout.write('\n'.join(words))
        self.vocab = UnicodeCharsVocabulary(tmp, 5)
        self._tmp = tmp

    def test_vocab_word_to_char_ids(self):
        char_ids = self.vocab.word_to_char_ids('th')
        expected = np.array([258, 116, 104, 259, 260], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

        char_ids = self.vocab.word_to_char_ids('thhhhh')
        expected = np.array([258, 116, 104, 104, 259], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

        char_ids = self.vocab.word_to_char_ids(chr(256) + 't')
        expected = np.array([258, 196, 128, 116, 259], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

    def test_bos_eos(self):
        bos_ids = self.vocab.word_to_char_ids('<S>')
        self.assertTrue((bos_ids == self.vocab.bos_chars).all())

        bos_ids = self.vocab.word_char_ids[self.vocab.word_to_id('<S>')]
        self.assertTrue((bos_ids == self.vocab.bos_chars).all())

        eos_ids = self.vocab.word_to_char_ids('</S>')
        self.assertTrue((eos_ids == self.vocab.eos_chars).all())

        eos_ids = self.vocab.word_char_ids[self.vocab.word_to_id('</S>')]
        self.assertTrue((eos_ids == self.vocab.eos_chars).all())

    def test_vocab_encode_chars(self):
        sentence = ' '.join(['th', 'thhhhh', chr(256) + 't'])
        char_ids = self.vocab.encode_chars(sentence)
        expected = np.array(
            [[258, 256, 259, 260, 260],
             [258, 116, 104, 259, 260],
             [258, 116, 104, 104, 259],
             [258, 196, 128, 116, 259],
             [258, 257, 259, 260, 260]], dtype=np.int32)
        self.assertTrue((char_ids == expected).all())

    def test_vocab_encode_chars_reverse(self):
        sentence = ' '.join(reversed(['th', 'thhhhh', chr(256) + 't']))
        vocab = UnicodeCharsVocabulary(self._tmp, 5)
        char_ids = vocab.encode_chars(sentence, reverse=True)
        expected = np.array(
            [[258, 256, 259, 260, 260],
             [258, 116, 104, 259, 260],
             [258, 116, 104, 104, 259],
             [258, 196, 128, 116, 259],
             [258, 257, 259, 260, 260]], dtype=np.int32)[::-1, :]
        self.assertTrue((char_ids == expected).all())

    def tearDown(self):
        os.remove(self._tmp)
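
The suite above is self-contained apart from its imports; a hedged header to run it standalone (bilm.data as the source module is inferred from Example #2):

import os
import tempfile
import unittest

import numpy as np

from bilm.data import UnicodeCharsVocabulary

if __name__ == '__main__':
    unittest.main()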
Example #5
    def __init__(self):
        self._elmo = hub.Module(
            ROOT_DIR +
            "/bidirectional_lms/elmo_ru_news/tf_hub_model_epoch_n_3/",
            trainable=True)
        base_path = ROOT_DIR + "/bidirectional_lms/elmo_ru_news"
        ckpt_prefixed_path = base_path + "/model.ckpt-0003"
        # metafile_path = base_path + "/model.ckpt-0003.meta"
        # ckpt_prefixed_path = base_path + "/model.ckpt-1327437"
        # metafile_path = base_path + "/model.ckpt-1327437.meta"

        self.softmax_w = tf.train.load_variable(ckpt_prefixed_path,
                                                'lm/softmax/W')
        self.softmax_bias = tf.train.load_variable(ckpt_prefixed_path,
                                                   'lm/softmax/b')

        # read vocabulary (n_tokens_vocab holds the raw lines, one token per line)
        path_to_vocab = base_path + "/tokens_set.txt"
        with open(path_to_vocab, "r") as vocab_file:
            self.n_tokens_vocab = vocab_file.readlines()

        # TODO finish me
        self._lm_vocab = UnicodeCharsVocabulary(path_to_vocab, 200)

        self.words = self._lm_vocab._id_to_word
        self.word_index = {word: i for i, word in enumerate(self.words)}

        # index of unknown token:
        self.IDX_UNK_TOKEN = self.word_index.get("<UNK>")
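
lm/softmax/W and lm/softmax/b are the language model's output projection; a minimal sketch of how they would map a hidden state to vocabulary logits, assuming the common bilm shape convention of W as [vocab_size, hidden_dim]:

import numpy as np

def vocab_logits(hidden, softmax_w, softmax_bias):
    # hidden: [batch, hidden_dim] -> logits: [batch, vocab_size]
    return hidden @ softmax_w.T + softmax_bias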
Example #6
    def _load_data(self, reverse, chars, bidirectional=False):
        if chars:
            vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
        else:
            vocab = Vocabulary(self._tmp_vocab)

        if not bidirectional:
            data = LMDataset(self._tmp_train, vocab, reverse=reverse)
        else:
            data = BidirectionalLMDataset(self._tmp_train, vocab)

        return data
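
A hypothetical driver for the helper, spelling out the variants it can produce (a fragment in the same class; the _tmp_vocab/_tmp_train fixtures are set up elsewhere in the suite):

# forward char-level, reversed word-level, and bidirectional char-level data
data_fwd_chars = self._load_data(reverse=False, chars=True)
data_rev_words = self._load_data(reverse=True, chars=False)
data_bidirectional = self._load_data(reverse=False, chars=True,
                                     bidirectional=True)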
Example #7
    def __init__(self):
        self.load_model()

        # read vocabulary
        self.path_to_vocab = SELF_DIR + "/elmo_transformer_pretrained_models/vocabulary/tokens.txt"
        self._lm_vocab = UnicodeCharsVocabulary(self.path_to_vocab, 200)

        self.words = self._lm_vocab._id_to_word
        self.word_index = {word: i for i, word in enumerate(self.words)}

        # index of unknown token:
        self.IDX_UNK_TOKEN = self.word_index.get("<UNK>")
Example #8
    def __init__(self):
        self.load_model()

        # read vocabulary
        path_to_vocab = ROOT_DIR + "/bidirectional_lms/elmo_ru_news/tokens_set.txt"
        # with open(path_to_vocab, "r") as vocab_file:
        #     self.n_tokens_vocab = vocab_file.readlines()
        # print(self.n_tokens_vocab)
        self._lm_vocab = UnicodeCharsVocabulary(path_to_vocab, 200)

        self.words = self._lm_vocab._id_to_word
        self.word_index = {word: i for i, word in enumerate(self.words)}

        # index of unknown token:
        self.IDX_UNK_TOKEN = self.word_index.get("<UNK>")
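
The word_index/IDX_UNK_TOKEN pair set up in these constructors supports an out-of-vocabulary fallback; a hypothetical lookup helper built on it (not in the original class):

def token_to_index(self, token):
    # known tokens map to their id; anything else maps to <UNK>
    return self.word_index.get(token, self.IDX_UNK_TOKEN)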
Example #9
def load_vocab(vocab_file, max_word_length=None):
    if max_word_length:
        return UnicodeCharsVocabulary(vocab_file, max_word_length,
                                      validate_file=True)
    else:
        return Vocabulary(vocab_file, validate_file=True)
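
Usage follows directly from the branch: passing max_word_length yields the character-aware vocabulary, omitting it yields the plain word-level one (the path below is a placeholder):

char_vocab = load_vocab('vocab.txt', max_word_length=50)  # UnicodeCharsVocabulary
word_vocab = load_vocab('vocab.txt')                      # Vocabulary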
Example #10
print('====> encoded result with split with reverse: {}'.format(
    vocab_chars.encode(word, split=True, reverse=True)))
word = '阿道 夫'
print('====> encoded result without split: {}'.format(
    vocab_chars.encode(word)))
print('====> encoded result with split: {}'.format(
    vocab_chars.encode(word, split=True)))
print('====> encoded result with split with reverse: {}'.format(
    vocab_chars.encode(word, split=True, reverse=True)))
'''
UE for UnicodeCharsVocabulary
'''
print('\n\n\tUE for UnicodeCharsVocabulary:')
vocab_file = '../data/vocab_seg_words_elmo.txt'
vocab_file1 = '../data/vocab_seg_chars_elmo.txt'  # defined but unused below
vocab_unicodechars = UnicodeCharsVocabulary(vocab_file,
                                            max_word_length=10,
                                            validate_file=True)
print('====> bos: {}'.format(vocab_unicodechars.bos))
print('====> eos: {}'.format(vocab_unicodechars.eos))
print('====> unk: {}'.format(vocab_unicodechars.unk))
print('====> size: {}'.format(vocab_unicodechars.size))

word = '阿道夫'
print('====> word to id: {}'.format(vocab_unicodechars.word_to_id(word)))
word = '阿'
print('====> word to id: {}'.format(vocab_unicodechars.word_to_id(word)))

word_id = 234
print('====> id to word: {}'.format(vocab_unicodechars.id_to_word(word_id)))
word_id = 234234
print('====> id to word: {}'.format(vocab_unicodechars.id_to_word(word_id)))
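
With a vocabulary smaller than 234234 entries, the last call would likely read past the id table in a plain list-backed implementation; a hedged guard if that matters:

def safe_id_to_word(vocab, word_id):
    # fall back to <UNK> for ids outside the vocabulary
    if 0 <= word_id < vocab.size:
        return vocab.id_to_word(word_id)
    return '<UNK>'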