Example #1
    def test_vocab_without_unk(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        oov_word = 'OOVWORD'
        self.assertNotIn(oov_word, c)

        # tests for specials_first=True
        v_first = vocab.Vocab(c, min_freq=3, specials=['<pad>'], specials_first=True)
        expected_itos_first = ['<pad>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
        expected_stoi_first = {x: index for index, x in enumerate(expected_itos_first)}
        self.assertEqual(v_first.itos, expected_itos_first)
        self.assertEqual(dict(v_first.stoi), expected_stoi_first)
        self.assertNotIn(oov_word, v_first.itos)
        self.assertNotIn(oov_word, v_first.stoi)

        # tests for specials_first=False
        v_last = vocab.Vocab(c, min_freq=3, specials=['<pad>'], specials_first=False)
        expected_itos_last = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', '<pad>']
        expected_stoi_last = {x: index for index, x in enumerate(expected_itos_last)}
        self.assertEqual(v_last.itos, expected_itos_last)
        self.assertEqual(dict(v_last.stoi), expected_stoi_last)
        self.assertNotIn(oov_word, v_last.itos)
        self.assertNotIn(oov_word, v_last.stoi)

        # check if pad is mapped to the first index
        self.assertEqual(v_first.stoi['<pad>'], 0)
        # check if pad is mapped to the last index
        self.assertEqual(v_last.stoi['<pad>'], max(v_last.stoi.values()))

        # check if an oovword is not in vocab and a default unk_id is not assigned to it
        self.assertRaises(KeyError, v_first.stoi.__getitem__, oov_word)
        self.assertRaises(KeyError, v_last.stoi.__getitem__, oov_word)
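By contrast, a minimal sketch of the default behaviour (assuming the legacy torchtext vocab.Vocab exercised throughout these examples; on newer releases the import path is torchtext.legacy.vocab): when '<unk>' is among the specials, stoi falls back to the unk index for out-of-vocabulary words instead of raising KeyError.

from collections import Counter
from torchtext import vocab

c = Counter({'hello': 4, 'world': 3})
v = vocab.Vocab(c, specials=['<unk>', '<pad>'])
assert v.stoi['OOVWORD'] == v.stoi['<unk>'] == 0  # OOV falls back to the unk index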
    def test_errors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        # A proper error should be raised when an unknown string alias is used.
        # Each call gets its own assertRaises block; in a shared block the second
        # call would never run, because the first one already raises.
        with self.assertRaises(ValueError):
            vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors=["fasttext.english.300d"])
        with self.assertRaises(ValueError):
            vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors="fasttext.english.300d")
        with self.assertRaises(ValueError):
            # A proper error should also be raised when the vectors argument is
            # neither a string nor a Vectors object.
            vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors={"word": [1, 2, 3]})
def get_dicts(words=('hello', 'world'), glove='glove.6B.50d'):
    c = Counter(words)
    v = vocab.Vocab(c, vectors=glove)
    # Map each word to its pretrained embedding vector.
    return {word: v.vectors.numpy()[v.stoi[word]] for word in words}
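A quick usage sketch for get_dicts (assumes the 'glove.6B.50d' vectors can be downloaded or are already cached; the shape follows from the 50-dimensional GloVe vectors):

word_vecs = get_dicts(['hello', 'world'])
print(word_vecs['hello'].shape)  # (50,)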
    def __init__(self,
                 corpus,
                 reserved_tokens=None,
                 n_vocab=None,
                 min_freq=1,
                 type="paired"):
        """Extract the vocabulary from the given corpus.

        Keyword Arguments:
            reserved_tokens {list} -- extra reserved tokens, added after the
                built-in specials <pad>, <unk>, <bos>, <eos> (default: {None})
            n_vocab {int} -- maximum vocabulary size, by frequency; None or -1
                means unbounded (default: {None})
            min_freq {int} -- minimum frequency for a token to be kept (default: {1})
            type {str} -- corpus type, e.g. "paired", "dialog" or "translation"
                (default: {"paired"})
        """
        if n_vocab == -1:
            n_vocab = None
        if reserved_tokens is None:
            reserved_tokens = []

        reserved_tokens = [
            "<pad>",
            "<unk>",
            "<bos>",
            "<eos>",
        ] + reserved_tokens
        self.vocab = vocab.Vocab(self._build_vocab(corpus, type=type),
                                 specials=reserved_tokens,
                                 max_size=n_vocab,
                                 min_freq=min_freq)
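The constructor above calls a _build_vocab helper that is not part of the snippet; a hypothetical sketch of what it might look like (the corpus layout, a list of token-list pairs in the "paired" case, is an assumption):

    def _build_vocab(self, corpus, type="paired"):
        # Hypothetical helper: count token frequencies over the corpus.
        from collections import Counter
        counter = Counter()
        for example in corpus:
            sides = example if type == "paired" else (example,)
            for side in sides:
                counter.update(side)  # each side is assumed to be a list of tokens
        return counter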
    def test_serialization(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])
        pickle_path = os.path.join(self.test_dir, "vocab.pkl")
        with open(pickle_path, "wb") as f:
            pickle.dump(v, f)
        with open(pickle_path, "rb") as f:
            v_loaded = pickle.load(f)
        assert v == v_loaded
    def test_vocab_extend(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        # Build a vocab and get vectors twice to test caching.
        for i in range(2):
            f = FastText(language='simple')
            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=f)
            n_vocab = len(v)
            v.extend(f)  # extend the vocab with the words contained in f.itos
            self.assertGreater(len(v), n_vocab)

            self.assertEqual(v.itos[:6], ['<unk>', '<pad>', '<bos>',
                             'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
            conditional_remove(vec_file)
Example #7
    def test_vocab_specials_first(self):
        c = Counter("a a b b c c".split())

        # add specials into vocabulary at first
        v = vocab.Vocab(c, max_size=2, specials=['<pad>', '<eos>'])
        expected_itos = ['<pad>', '<eos>', 'a', 'b']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)

        # add specials into vocabulary at last
        v = vocab.Vocab(c, max_size=2, specials=['<pad>', '<eos>'], specials_first=False)
        expected_itos = ['a', 'b', '<pad>', '<eos>']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)
def get_vocab(data, stopword):
    tokenized_data = get_tokenized(data, stopword)  # tokenize and remove stopwords
    counter = collections.Counter([tk for st in tokenized_data
                                   for tk in st])  # count word frequencies
    # Keep words that appear at least 5 times; <pad> is the padding token
    # (index 0) and <unk> stands in for low-frequency and unknown words (index 1).
    return Vocab.Vocab(counter, min_freq=5, specials=['<pad>', '<unk>'])
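get_tokenized is not shown above; one plausible sketch (lower-casing, whitespace tokenization, and `stopword` being a set of words to drop are all assumptions):

def get_tokenized(data, stopword):
    # Hypothetical helper: lowercase, whitespace-tokenize, and remove stopwords.
    return [[tok for tok in text.lower().split() if tok not in stopword]
            for text in data]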
Example #9
def prepair_text(text: list):
    """
    Обрабатывает сырой текст. Создает итератор
    :param text: Необработанный лист с тектом
    :return: Загрузчик
    """
    text_df = pd.DataFrame(text, columns=['text'])
    text_df['target'] = 0
    text_df['text'] = text_df['text'].apply(clearing)

    field = data.Field(tokenize=tokenizer, include_lengths=True)
    field.vocab = load_vocab('vocab.pkl')
    label = data.LabelField(dtype=torch.float)
    label.vocab = vocab.Vocab({'<unk>': 0, '<pad>': 1, 0: 2, 1: 1})
    fields = [('text', field), ('label', label)]
    text_ds = DataFrameDataset(text_df, fields)

    text_loader = data.BucketIterator(text_ds,
                                      sort_within_batch=True,
                                      batch_size=1)
    print()
    print(text[0])
    print(text_df['text'].values)
    print(vars(text_ds[0])['text'])

    return text_loader
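load_vocab('vocab.pkl') above is an external helper; a minimal sketch assuming the vocabulary was pickled beforehand:

import pickle

def load_vocab(path):
    # Hypothetical helper: restore a previously pickled torchtext Vocab.
    with open(path, 'rb') as f:
        return pickle.load(f)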
Example #10
 def _paragraphs_to_vocab_and_data(
         self, paragraphs: typing.List[Paragraph]
 ) -> typing.Tuple[list, vocab.Vocab]:
     '''
     Helper function to read in the raw data, tokenize it appropriately, and
     generate the Vocab object. Returns the tokenized paragraphs together with
     the vocabulary built from them.
     '''
     # tokenize paragraphs
     paragraphs_tokenized = [
         self._tokenize_paragraph(paragraph) for paragraph in paragraphs
     ]
     counter = collections.Counter()
     for tokenized_paragraph in paragraphs_tokenized:
         for sentence in tokenized_paragraph:
             for token in sentence:
                 counter[token] += 1
     return (
         paragraphs_tokenized,
         vocab.Vocab(
             counter,
             max_size=self.max_vocab_size,
             min_freq=5,
             specials=[
                 const.PAD_TOKEN, '<mask>', const.CLASS_TOKEN, '<sep>'
             ],
         ),
     )
Example #11
    def test_vocab_vectors_custom_cache(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        vector_cache = os.path.join('/tmp', 'vector_cache')
        # Build a vocab and get vectors twice to test caching.
        for i in range(2):
            if i == 1:
                self.assertTrue(os.path.exists(vector_cache))

            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=Vectors('wiki.simple.vec', cache=vector_cache,
                                            url=FastText.url_base.format('simple')))

            self.assertEqual(v.itos, ['<unk>', '<pad>', '<bos>',
                                      'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(vector_cache, "wiki.simple.vec")
            conditional_remove(vec_file)
Example #12
def get_vocab_imdb(data):
    '''
    :param data: all reviews; each review is a list of words
    :return: a Vocab built from the word counts (min_freq=5)
    '''
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)
def get_vocab_imdb(data):
    '''
    :param data: all reviews; each review is a list of words
    :return: a Vocab built from the word counts (min_freq=5)
    '''
    tokenized_data = get_tokenized_imdb(data)  # lower-case and tokenize each review
    counter = collections.Counter(
        [tk for st in tokenized_data for tk in st])  # count every word's occurrences
    return Vocab.Vocab(counter, min_freq=5)
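Both get_vocab_imdb variants rely on a get_tokenized_imdb helper that is not shown; a common sketch, assuming data is a list of (review, label) pairs:

def get_tokenized_imdb(data):
    # Hypothetical helper: lower-case each review and split it on spaces.
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    return [tokenizer(review) for review, _ in data]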
    def test_vocab_basic(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])

        expected_itos = ['<unk>', '<pad>', '<bos>',
                         'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)
Example #15
    def test_has_unk(self):
        c = Counter({
            'hello': 4,
            'world': 3,
            'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
            'freq_too_low': 2
        })
        v = vocab.Vocab(c)
        self.assertEqual(v['not_in_it'], 0)
Example #16
def get_vocab(data):
    '''
    @params:
        data: same as above
    @return: the vocabulary over the dataset, an instance of Vocab (freqs, stoi, itos)
    '''
    tokenized_data = get_tokenized(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=1)
Example #17
def get_vocab_evaluation(data):
    '''
    @params:
        data: same as above
    @return: the vocabulary over the dataset, an instance of Vocab (freqs, stoi, itos)
    '''
    tokenized_data = get_tokenized_evaluation(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    # count over all of the data
    return Vocab.Vocab(counter, min_freq=2)  # build the vocabulary; minimum frequency is 2
def create_vocab(convos, min_freq=5):
    # flatten convos
    flat = [text for pair in convos for text in pair]

    counter = Counter(' '.join(flat).split())

    voc = vocab.Vocab(counter, min_freq=min_freq,
                      specials=[PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN])

    return voc
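A toy usage sketch for create_vocab (the special-token constants are assumed to be defined at module level; the frequency threshold is passed positionally):

PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN = '<pad>', '<sos>', '<eos>', '<unk>'  # assumed constants
convos = [('hi there', 'hello'), ('how are you', 'i am fine')]
voc = create_vocab(convos, 1)  # threshold of 1 so every toy token is kept
print(len(voc), voc.stoi[PAD_TOKEN])  # vocabulary size, and 0 for the first special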
Example #19
def get_vocab(data):
    """
    获取词汇表,filter:词频需要超过5次
    :param data:
    :return:
    """
    tokenized_data = tokenize(data)
    counter = collections.Counter(
        [token for st in tokenized_data for token in st])
    return Vocab.Vocab(counter, min_freq=5)
    def __init__(self):
        glove = torchvocab.Vectors(name=os.path.join(
            Constants.Data.datadir, Constants.Data.glove_path))
        counter = Counter([w for w in glove.stoi])
        self.vocab = torchvocab.Vocab(counter,
                                      vectors=glove,
                                      specials=[
                                          Constants.SpecialTokens.pad,
                                          Constants.SpecialTokens.unk
                                      ])
        self.embedding_layer = nn.Embedding.from_pretrained(self.vocab.vectors)
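A lookup sketch for the wrapper above (the instance name `embedder` is a placeholder, and Constants.* must point at a real GloVe file):

import torch

# `embedder` stands for an instance of the class whose __init__ is shown above.
token_ids = torch.tensor([[embedder.vocab.stoi[w] for w in ['hello', 'world']]])
embedded = embedder.embedding_layer(token_ids)  # shape: (1, 2, embedding_dim)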
Example #21
def build_data(all_tokens, all_seqs):
    """
    Build the vocabulary from all tokens, then convert the tokens in every
    sequence to indices and construct a Tensor.
    :param all_tokens:
    :param all_seqs:
    :return:
    """
    vocab = Vocab.Vocab(collections.Counter(all_tokens),
                        specials=[PAD, BOS, EOS])
    indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs]
    return vocab, torch.tensor(indices)
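A toy usage sketch for build_data; torch.tensor(indices) requires all sequences in all_seqs to share one length, so they are assumed to be padded with PAD in advance (the PAD/BOS/EOS constants are assumptions):

PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'  # assumed constants
all_tokens = ['hi', 'there', 'hi']
all_seqs = [['hi', 'there', EOS], ['hi', EOS, PAD]]  # already padded to equal length
voc, seq_tensor = build_data(all_tokens, all_seqs)
print(seq_tensor.shape)  # torch.Size([2, 3])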
    def test_vocab_set_vectors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
                     'test': 4, 'freq_too_low': 2})
        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])
        stoi = {"hello": 0, "world": 1, "test": 2}
        vectors = torch.FloatTensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
        dim = 2
        v.set_vectors(stoi, vectors, dim)
        # Rows follow v.itos: the three specials and the unicode token get zero
        # vectors; 'hello', 'test' and 'world' pick up their rows from `vectors`.
        expected_vectors = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0],
                                     [0.0, 0.0], [0.1, 0.2], [0.5, 0.6],
                                     [0.3, 0.4]])
        assert_allclose(v.vectors.numpy(), expected_vectors)
Example #23
def load_vocab():
    """
    Load the vocabulary file data/cnews.vocab.txt.
    :return: the vocabulary as a Vocab instance
    """
    with open('data/cnews.vocab.txt', 'rb') as vocab_file:
        # split the file contents into a list of entries
        review = vocab_file.read().decode('utf-8').split('\r\n')
        # wrap the entries in a Counter
        counter = collections.Counter(review)
        # convert the Counter into a Vocab
        return Vocab.Vocab(counter)
Example #24
    def test_vocab_download_charngram_vectors(self):
        c = Counter({
            'hello': 4,
            'world': 3,
            'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
            'freq_too_low': 2
        })
        # Build a vocab and get vectors twice to test caching, then once more
        # to test string aliases.
        for i in range(3):
            if i == 2:
                vectors = "charngram.100d"
            else:
                vectors = CharNGram()
            v = vocab.Vocab(c,
                            min_freq=3,
                            specials=['<unk>', '<pad>', '<bos>'],
                            vectors=vectors)
            expected_itos = [
                '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
            ]
            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
            self.assertEqual(v.itos, expected_itos)
            self.assertEqual(dict(v.stoi), expected_stoi)
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_charngram = {
                'hello': [
                    -0.44782442, -0.08937783, -0.34227219, -0.16233221,
                    -0.39343098
                ],
                'world': [
                    -0.29590717, -0.05275926, -0.37334684, 0.27117205,
                    -0.3868292
                ],
            }

            for word in expected_charngram:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_charngram[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(100))
            assert_allclose(vectors[v.stoi['OOV token']], np.zeros(100))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            conditional_remove(
                os.path.join(self.project_root, ".vector_cache",
                             "charNgram.txt"))
            conditional_remove(
                os.path.join(self.project_root, ".vector_cache",
                             "jmt_pre-trained_embeddings.tar.gz"))
Example #25
def get_tokenized_vocab(tokenized_data, min_freq=5):
    """ Build the vocabulary.

        @params:
            tokenized_data - tokenized data: a list of word lists, [[words], [words]].
            min_freq - minimum word frequency used when building the Vocab.

        @return:
            On success - a Vocab object.
            On failure - an error message.
    """
    tokenized_counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(tokenized_counter, min_freq=min_freq)
Example #26
    def test_vocab_download_glove_vectors(self):
        c = Counter({
            'hello': 4,
            'world': 3,
            'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
            'freq_too_low': 2
        })

        # Build a vocab and get vectors twice to test caching, then once more
        # to test string aliases.
        for i in range(3):
            if i == 2:
                vectors = "glove.twitter.27B.25d"
            else:
                vectors = GloVe(name='twitter.27B', dim='25')
            v = vocab.Vocab(c,
                            min_freq=3,
                            specials=['<unk>', '<pad>', '<bos>'],
                            vectors=vectors)

            expected_itos = [
                '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
            ]
            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
            self.assertEqual(v.itos, expected_itos)
            self.assertEqual(dict(v.stoi), expected_stoi)

            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_twitter = {
                'hello': [-0.77069, 0.12827, 0.33137, 0.0050893, -0.47605],
                'world': [0.10301, 0.095666, -0.14789, -0.22383, -0.14775],
            }

            for word in expected_twitter:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_twitter[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(25))
            assert_allclose(vectors[v.stoi['OOV token']], np.zeros(25))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            zip_file = os.path.join(self.project_root, ".vector_cache",
                                    "glove.twitter.27B.zip")
            conditional_remove(zip_file)
            for dim in ["25", "50", "100", "200"]:
                conditional_remove(
                    os.path.join(self.project_root, ".vector_cache",
                                 "glove.twitter.27B.{}d.txt".format(dim)))
Example #27
    def __init__(self, data_path, max_length, max_vocab_size, min_freq,
                 eos_token, pad_token, unk_token, embed_dim, special_tokens,
                 threshold, pre_trained=False):
        """
        Args:
            data_path (str): path to data file
            max_length (int): maximum length of each sentence, including <eos>
            max_vocab_size (int): maximum number of words allowed in vocabulary
            min_freq (int): minimum frequency to add word to vocabulary
            eos_token (str): end of sentence token (tells decoder to start or stop)
            pad_token (str): padding token
            unk_token (str): unknown word token
            embed_dim (int): dimension of embedding vectors
            special_tokens (list of str): other tokens to add to vocabulary
            threshold (int): count of unknown words required to prune sentence
            pre_trained (Vector): pre trained word embeddings
        """
        special_tokens = [pad_token, unk_token, eos_token] + special_tokens
        # the value 0 will be regarded as padding
        assert special_tokens[0] == pad_token
        inputs, targets, counter, xlen = process_data(data_path, max_length,
                                                      eos_token, pad_token)
        self.vocab = vocab.Vocab(counter=counter, max_size=max_vocab_size,
                                 min_freq=min_freq, specials=special_tokens)
        if pre_trained is not False:
            self.vocab.load_vectors(pre_trained)
        assert len(inputs) == len(targets) and len(inputs) == len(xlen)

        self.nwords = len(self.vocab)
        self.max_len = max_length
        self.eos_idx = self.vocab.stoi[eos_token]
        self.pad_idx = self.vocab.stoi[pad_token]
        self.unk_idx = self.vocab.stoi[unk_token]
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.embed_dim = embed_dim
        self.unk_count = 0  # number of unknown words in dataset
        self.total_tokens = 0  # number of tokens in dataset not counting padding
        self.special_tokens = special_tokens
        self.x_lens = xlen
        self.x_data = np.zeros((len(inputs), max_length), dtype=np.int32)
        self.y_data = np.zeros((len(targets), max_length), dtype=np.int32)

        convert_to_index(inputs, self, self.x_data)
        convert_to_index(targets, self, self.y_data)
        self.x_data, self.y_data, self.x_lens = prune_data(self.x_data, self.y_data,
                                                           self.x_lens, self, threshold)
        self.x_data = torch.from_numpy(self.x_data)
        self.y_data = torch.from_numpy(self.y_data)
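convert_to_index and prune_data are external helpers; a hypothetical sketch of convert_to_index that is consistent with how it is called above (dataset is the object under construction, out the preallocated index array):

def convert_to_index(sentences, dataset, out):
    # Hypothetical helper: write each word's vocabulary index into `out`;
    # unknown words fall back to dataset.unk_idx, and rows stay padded with
    # zeros (the pad index) beyond the sentence length.
    for row, sentence in enumerate(sentences):
        for col, word in enumerate(sentence[:dataset.max_len]):
            out[row, col] = dataset.vocab.stoi.get(word, dataset.unk_idx)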
Example #28
def build_data(all_tokens, all_seqs):
    """ 使用所有的词来构造词典。并将所有序列中的词变换为词索引后构造Tensor.

        @params:
            all_tokens - 所有的词.
            all_seqs - 所有序列.

        @return:
            On success - vocab,转换为词索引后的Tensor.
            On failure - 错误信息.
    """
    vocab = Vocab.Vocab(collections.Counter(all_tokens),
                        specials=[Const.PAD, Const.BOS, Const.EOS])
    indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs]
    return vocab, torch.tensor(indices)
    def build_vocab(self, labels, dataset):
        # Seed the counter so that every label is present even if it does not
        # happen to occur in the data.
        counter = Counter({label: 0 for label in labels})
        for example in dataset.examples:
            counter.update(example.label)
        self.vocab = vocab.Vocab(counter, specials=labels)

        # Inverse-frequency class weights, normalized to sum to 1.
        weights = []
        factor = 0
        for label in self.vocab.itos:
            freq = self.vocab.freqs[label]
            weight = 1 / freq if freq else 0
            factor += weight
            weights.append(weight)
        self.weights = torch.tensor(weights) / factor
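A usage sketch for the inverse-frequency weights computed above (the instance name `label_field` is a placeholder; label indices in the training targets are assumed to follow self.vocab.itos order):

import torch.nn as nn

criterion = nn.CrossEntropyLoss(weight=label_field.weights)  # one weight per label in vocab.itos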
Example #30
    def test_vocab_download_fasttext_vectors(self):
        c = Counter({
            'hello': 4,
            'world': 3,
            'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
            'freq_too_low': 2
        })
        # Build a vocab and get vectors twice to test caching, then once more
        # to test string aliases.
        for i in range(3):
            if i == 2:
                vectors = str("fasttext.simple.300d")  # must handle str on Py2
            else:
                vectors = FastText(language='simple')

            v = vocab.Vocab(c,
                            min_freq=3,
                            specials=['<unk>', '<pad>', '<bos>'],
                            vectors=vectors)

            expected_itos = [
                '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
            ]
            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
            self.assertEqual(v.itos, expected_itos)
            self.assertEqual(dict(v.stoi), expected_stoi)
            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_fasttext_simple_en = {
                'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
                'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
            }

            for word in expected_fasttext_simple_en:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_fasttext_simple_en[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
            assert_allclose(vectors[v.stoi['OOV token']], np.zeros(300))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            vec_file = os.path.join(self.project_root, ".vector_cache",
                                    "wiki.simple.vec")
            conditional_remove(vec_file)