def test_vocab_without_unk(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
    oov_word = 'OOVWORD'
    self.assertNotIn(oov_word, c)

    # tests for specials_first=True
    v_first = vocab.Vocab(c, min_freq=3, specials=['<pad>'], specials_first=True)
    expected_itos_first = ['<pad>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
    expected_stoi_first = {x: index for index, x in enumerate(expected_itos_first)}
    self.assertEqual(v_first.itos, expected_itos_first)
    self.assertEqual(dict(v_first.stoi), expected_stoi_first)
    self.assertNotIn(oov_word, v_first.itos)
    self.assertNotIn(oov_word, v_first.stoi)

    # tests for specials_first=False
    v_last = vocab.Vocab(c, min_freq=3, specials=['<pad>'], specials_first=False)
    expected_itos_last = ['ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world', '<pad>']
    expected_stoi_last = {x: index for index, x in enumerate(expected_itos_last)}
    self.assertEqual(v_last.itos, expected_itos_last)
    self.assertEqual(dict(v_last.stoi), expected_stoi_last)
    self.assertNotIn(oov_word, v_last.itos)
    self.assertNotIn(oov_word, v_last.stoi)

    # check if pad is mapped to the first index
    self.assertEqual(v_first.stoi['<pad>'], 0)
    # check if pad is mapped to the last index
    self.assertEqual(v_last.stoi['<pad>'], max(v_last.stoi.values()))

    # check if an OOV word is not in vocab and a default unk_id is not assigned to it
    self.assertRaises(KeyError, v_first.stoi.__getitem__, oov_word)
    self.assertRaises(KeyError, v_last.stoi.__getitem__, oov_word)
def test_errors(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
    with self.assertRaises(ValueError):
        # Test proper error raised when using unknown string alias
        vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                    vectors=["fasttext.english.300d"])
        vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                    vectors="fasttext.english.300d")
    with self.assertRaises(ValueError):
        # Test proper error is raised when vectors argument is
        # non-string or non-Vectors
        vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                    vectors={"word": [1, 2, 3]})
def get_dicts(words=['hello', 'world'], glove='glove.6B.50d'):
    # Build a vocab over the given words and map each word to its GloVe vector.
    c = Counter(words)
    v = vocab.Vocab(c, vectors=glove)
    dicts = {}
    for i in words:
        dicts[i] = v.vectors.numpy()[v.stoi[i]]
    return dicts
def __init__(self, corpus, reserved_tokens=[], n_vocab=None, min_freq=1, type="paired"):
    """Extract the vocabulary from the given corpus.

    Keyword Arguments:
        reserved_tokens {list} -- reserved tokens, appended after the built-in
            specials (default: {[]})
        n_vocab {int} -- maximum vocabulary size based on frequency; -1 or None
            means unlimited (default: {None})
        min_freq {int} -- minimum frequency for a token to be kept (default: {1})
        type {str} -- corpus type passed on to self._build_vocab, e.g. "dialog"
            or "translation" (default: {"paired"})
    """
    if n_vocab == -1:
        n_vocab = None
    reserved_tokens = [
        "<pad>",
        "<unk>",
        "<bos>",
        "<eos>",
    ] + reserved_tokens
    self.vocab = vocab.Vocab(self._build_vocab(corpus, type=type),
                             specials=reserved_tokens,
                             max_size=n_vocab,
                             min_freq=min_freq)
def test_serialization(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
    v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])
    pickle_path = os.path.join(self.test_dir, "vocab.pkl")
    pickle.dump(v, open(pickle_path, "wb"))
    v_loaded = pickle.load(open(pickle_path, "rb"))
    assert v == v_loaded
def test_vocab_extend(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
    # Build a vocab and get vectors twice to test caching.
    for i in range(2):
        f = FastText(language='simple')
        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors=f)
        n_vocab = len(v)
        v.extend(f)  # extend the vocab with the words contained in f.itos
        self.assertGreater(len(v), n_vocab)
        self.assertEqual(v.itos[:6], ['<unk>', '<pad>', '<bos>',
                                      'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
        vectors = v.vectors.numpy()

        # The first 5 entries in each vector.
        expected_fasttext_simple_en = {
            'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
            'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
        }

        for word in expected_fasttext_simple_en:
            assert_allclose(vectors[v.stoi[word], :5],
                            expected_fasttext_simple_en[word])

        assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))

    # Delete the vectors after we're done to save disk space on CI
    if os.environ.get("TRAVIS") == "true":
        vec_file = os.path.join(self.project_root, ".vector_cache", "wiki.simple.vec")
        conditional_remove(vec_file)
def test_vocab_specials_first(self):
    c = Counter("a a b b c c".split())

    # add specials into vocabulary at first
    v = vocab.Vocab(c, max_size=2, specials=['<pad>', '<eos>'])
    expected_itos = ['<pad>', '<eos>', 'a', 'b']
    expected_stoi = {x: index for index, x in enumerate(expected_itos)}
    self.assertEqual(v.itos, expected_itos)
    self.assertEqual(dict(v.stoi), expected_stoi)

    # add specials into vocabulary at last
    v = vocab.Vocab(c, max_size=2, specials=['<pad>', '<eos>'], specials_first=False)
    expected_itos = ['a', 'b', '<pad>', '<eos>']
    expected_stoi = {x: index for index, x in enumerate(expected_itos)}
    self.assertEqual(v.itos, expected_itos)
    self.assertEqual(dict(v.stoi), expected_stoi)
def get_vocab(data, stopword):
    tokenized_data = get_tokenized(data, stopword)  # tokenize and remove stopwords
    counter = collections.Counter([tk for st in tokenized_data for tk in st])  # count word frequencies
    # Keep words that appear at least 5 times; <pad> is the padding token (index 0
    # in the vocab), <unk> covers low-frequency words, stopwords and other unknown
    # words (index 1).
    return Vocab.Vocab(counter, min_freq=5, specials=['<pad>', '<unk>'])
def prepair_text(text: list):
    """
    Processes raw text and builds an iterator over it.
    :param text: unprocessed list of texts
    :return: data loader
    """
    text_df = pd.DataFrame(text, columns=['text'])
    text_df['target'] = 0
    text_df['text'] = text_df['text'].apply(clearing)

    field = data.Field(tokenize=tokenizer, include_lengths=True)
    field.vocab = load_vocab('vocab.pkl')
    label = data.LabelField(dtype=torch.float)
    label.vocab = vocab.Vocab({'<unk>': 0, '<pad>': 1, 0: 2, 1: 1})

    fields = [('text', field), ('label', label)]
    text_ds = DataFrameDataset(text_df, fields)
    text_loader = data.BucketIterator(text_ds, sort_within_batch=True, batch_size=1)

    print()
    print(text[0])
    print(text_df['text'].values)
    print(vars(text_ds[0])['text'])

    return text_loader
def _paragraphs_to_vocab_and_data(
        self, paragraphs: typing.List[Paragraph]
) -> typing.Tuple[typing.List[typing.List[typing.List[str]]], vocab.Vocab]:
    '''
    Helper function to tokenize the raw paragraphs and generate the Vocab
    object. Returns the tokenized paragraphs together with the Vocab.
    '''
    # tokenize paragraphs
    paragraphs_tokenized = [
        self._tokenize_paragraph(paragraph) for paragraph in paragraphs
    ]

    counter = collections.Counter()
    for tokenized_paragraph in paragraphs_tokenized:
        for sentence in tokenized_paragraph:
            for token in sentence:
                counter[token] += 1

    return (
        paragraphs_tokenized,
        vocab.Vocab(
            counter,
            max_size=self.max_vocab_size,
            min_freq=5,
            specials=[
                const.PAD_TOKEN, '<mask>', const.CLASS_TOKEN, '<sep>'
            ],
        ),
    )
def test_vocab_vectors_custom_cache(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
    vector_cache = os.path.join('/tmp', 'vector_cache')
    # Build a vocab and get vectors twice to test caching.
    for i in range(2):
        if i == 1:
            self.assertTrue(os.path.exists(vector_cache))

        v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                        vectors=Vectors('wiki.simple.vec',
                                        cache=vector_cache,
                                        url=FastText.url_base.format('simple')))

        self.assertEqual(v.itos, ['<unk>', '<pad>', '<bos>',
                                  'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'])
        vectors = v.vectors.numpy()

        # The first 5 entries in each vector.
        expected_fasttext_simple_en = {
            'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
            'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
        }

        for word in expected_fasttext_simple_en:
            assert_allclose(vectors[v.stoi[word], :5],
                            expected_fasttext_simple_en[word])

        assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))

    # Delete the vectors after we're done to save disk space on CI
    if os.environ.get("TRAVIS") == "true":
        vec_file = os.path.join(vector_cache, "wiki.simple.vec")
        conditional_remove(vec_file)
def get_vocab_imdb(data):
    '''
    :param data: all reviews, where each review is a list of words
    :return: Vocab.Vocab(counter, min_freq=5)
    '''
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)
def get_vocab_imdb(data):
    '''
    :param data: all reviews, where each review is a list of words
    :return: Vocab.Vocab(counter, min_freq=5)
    '''
    tokenized_data = get_tokenized_imdb(data)  # lower-case the review tokens
    # count how often each word occurs across all sentences [word, cnt]
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)
def test_vocab_basic(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})
    v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])

    expected_itos = ['<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
    expected_stoi = {x: index for index, x in enumerate(expected_itos)}
    self.assertEqual(v.itos, expected_itos)
    self.assertEqual(dict(v.stoi), expected_stoi)
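# A minimal standalone sketch of the behavior the tests above exercise, assuming
# the legacy torchtext API in which torchtext.vocab.Vocab is built directly from
# a collections.Counter (on torchtext 0.9-0.11 the same class lives under
# torchtext.legacy.vocab). Tokens below min_freq are dropped, specials come
# first, and the remaining tokens are ordered by descending frequency.
from collections import Counter
from torchtext import vocab

counter = Counter("the cat sat on the mat the cat".split())
v = vocab.Vocab(counter, min_freq=2, specials=['<pad>', '<unk>'])

print(v.itos)          # ['<pad>', '<unk>', 'the', 'cat']
print(v.stoi['the'])   # 2
print(len(v))          # 4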
def test_has_unk(self):
    c = Counter({
        'hello': 4,
        'world': 3,
        'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
        'freq_too_low': 2
    })
    v = vocab.Vocab(c)
    self.assertEqual(v['not_in_it'], 0)
def get_vocab(data):
    '''
    @params:
        data: same as above
    @return: the vocabulary over the dataset, an instance of Vocab (freqs, stoi, itos)
    '''
    tokenized_data = get_tokenized(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=1)
def get_vocab_evaluation(data):
    '''
    @params:
        data: same as above
    @return: the vocabulary over the dataset, an instance of Vocab (freqs, stoi, itos)
    '''
    tokenized_data = get_tokenized_evaluation(data)
    # count tokens over all the data
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=2)  # build the vocabulary; minimum frequency here is 2
def create_vocab(convos, max_freq=5):
    # flatten convos
    flat = [text for pair in convos for text in pair]
    counter = Counter(' '.join(flat).split())
    # note: despite its name, max_freq is used as the minimum frequency cutoff
    voc = vocab.Vocab(counter, min_freq=max_freq,
                      specials=[PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN])
    return voc
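# Hypothetical usage of create_vocab above; the conversation pairs, the imports
# and the special-token constants are illustrative assumptions, not values from
# the original project.
from collections import Counter
from torchtext import vocab

PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN = '<pad>', '<sos>', '<eos>', '<unk>'

convos = [("hi there", "hello hello hello hello hello"),
          ("how are you", "fine fine fine fine fine")]
voc = create_vocab(convos, max_freq=5)  # only tokens seen at least 5 times survive
print(voc.itos)                         # the 4 specials, then 'fine' and 'hello'
print(voc.stoi['hello'])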
def get_vocab(data):
    """
    Build the vocabulary, keeping only words that appear at least 5 times.
    :param data:
    :return:
    """
    tokenized_data = tokenize(data)
    counter = collections.Counter(
        [token for st in tokenized_data for token in st])
    return Vocab.Vocab(counter, min_freq=5)
def __init__(self):
    # Load GloVe vectors from disk and build a Vocab over every word they cover.
    glove = torchvocab.Vectors(name=os.path.join(
        Constants.Data.datadir, Constants.Data.glove_path))
    counter = Counter([w for w in glove.stoi])
    self.vocab = torchvocab.Vocab(counter,
                                  vectors=glove,
                                  specials=[
                                      Constants.SpecialTokens.pad,
                                      Constants.SpecialTokens.unk
                                  ])
    # Frozen embedding layer whose rows follow self.vocab.itos order.
    self.embedding_layer = nn.Embedding.from_pretrained(self.vocab.vectors)
def build_data(all_tokens, all_seqs):
    """
    Build the vocabulary from all tokens, then convert the words in every
    sequence into word indices and pack them into a Tensor.
    :param all_tokens:
    :param all_seqs:
    :return:
    """
    vocab = Vocab.Vocab(collections.Counter(all_tokens), specials=[PAD, BOS, EOS])
    indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs]
    return vocab, torch.tensor(indices)
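# Hypothetical usage of build_data above; the PAD/BOS/EOS constants, imports and
# toy sequences are assumptions for illustration only. The sequences must already
# be padded to equal length, otherwise torch.tensor() on the nested list fails.
import collections
import torch
from torchtext import vocab as Vocab

PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'

all_seqs = [['i', 'like', 'tea', EOS, PAD],
            ['he', 'likes', 'tea', EOS, PAD]]
all_tokens = [tok for seq in all_seqs for tok in seq]

voc, indices = build_data(all_tokens, all_seqs)
print(indices.shape)   # torch.Size([2, 5]): one row of word indices per sequence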
def test_vocab_set_vectors(self):
    c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
                 'test': 4, 'freq_too_low': 2})
    v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'])
    stoi = {"hello": 0, "world": 1, "test": 2}
    vectors = torch.FloatTensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
    dim = 2
    v.set_vectors(stoi, vectors, dim)
    expected_vectors = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0],
                                 [0.0, 0.0], [0.1, 0.2], [0.5, 0.6],
                                 [0.3, 0.4]])
    assert_allclose(v.vectors.numpy(), expected_vectors)
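# Follow-on sketch (not part of the test above): once set_vectors has aligned the
# rows to the vocab's itos order, the matrix can back a frozen nn.Embedding. The
# toy Counter and 2-dimensional vectors below are illustrative assumptions.
import torch
import torch.nn as nn
from collections import Counter
from torchtext import vocab

v = vocab.Vocab(Counter({'hello': 3, 'world': 3}), specials=['<unk>', '<pad>'])
v.set_vectors({'hello': 0, 'world': 1},
              torch.FloatTensor([[0.1, 0.2], [0.3, 0.4]]), dim=2)

emb = nn.Embedding.from_pretrained(v.vectors)    # rows follow v.itos order
print(emb(torch.tensor([v.stoi['hello']])))      # tensor([[0.1000, 0.2000]])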
def load_vocab():
    """
    Load the vocabulary from data/cnews.vocab.txt.
    :return: a Vocab instance
    """
    with open('data/cnews.vocab.txt', 'rb') as vocab_file:
        # split the file contents into a list of entries
        review = vocab_file.read().decode('utf-8').split('\r\n')
        # wrap the entries in a Counter
        counter = collections.Counter(review)
        # convert the Counter into a Vocab
        return Vocab.Vocab(counter)
def test_vocab_download_charngram_vectors(self):
    c = Counter({
        'hello': 4,
        'world': 3,
        'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
        'freq_too_low': 2
    })
    # Build a vocab and get vectors twice to test caching, then once more
    # to test string aliases.
    for i in range(3):
        if i == 2:
            vectors = "charngram.100d"
        else:
            vectors = CharNGram()
        v = vocab.Vocab(c,
                        min_freq=3,
                        specials=['<unk>', '<pad>', '<bos>'],
                        vectors=vectors)
        expected_itos = [
            '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
        ]
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)
        vectors = v.vectors.numpy()

        # The first 5 entries in each vector.
        expected_charngram = {
            'hello': [-0.44782442, -0.08937783, -0.34227219,
                      -0.16233221, -0.39343098],
            'world': [-0.29590717, -0.05275926, -0.37334684,
                      0.27117205, -0.3868292],
        }

        for word in expected_charngram:
            assert_allclose(vectors[v.stoi[word], :5],
                            expected_charngram[word])

        assert_allclose(vectors[v.stoi['<unk>']], np.zeros(100))
        assert_allclose(vectors[v.stoi['OOV token']], np.zeros(100))

    # Delete the vectors after we're done to save disk space on CI
    if os.environ.get("TRAVIS") == "true":
        conditional_remove(
            os.path.join(self.project_root, ".vector_cache", "charNgram.txt"))
        conditional_remove(
            os.path.join(self.project_root, ".vector_cache",
                         "jmt_pre-trained_embeddings.tar.gz"))
def get_tokenized_vocab(tokenized_data, min_freq=5):
    """
    Build the vocabulary.
    @params:
        tokenized_data - tokenized data, a list of token lists: [[words], [words]].
        min_freq - minimum word frequency used when building the Vocab.
    @return:
        On success - a Vocab object.
        On failure - an error message.
    """
    tokenized_counter = collections.Counter([tk for st in tokenized_data
                                             for tk in st])
    # use the min_freq argument rather than a hard-coded value
    return Vocab.Vocab(tokenized_counter, min_freq=min_freq)
def test_vocab_download_glove_vectors(self):
    c = Counter({
        'hello': 4,
        'world': 3,
        'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
        'freq_too_low': 2
    })

    # Build a vocab and get vectors twice to test caching, then once more
    # to test string aliases.
    for i in range(3):
        if i == 2:
            vectors = "glove.twitter.27B.25d"
        else:
            vectors = GloVe(name='twitter.27B', dim='25')
        v = vocab.Vocab(c,
                        min_freq=3,
                        specials=['<unk>', '<pad>', '<bos>'],
                        vectors=vectors)

        expected_itos = [
            '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
        ]
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)
        vectors = v.vectors.numpy()

        # The first 5 entries in each vector.
        expected_twitter = {
            'hello': [-0.77069, 0.12827, 0.33137, 0.0050893, -0.47605],
            'world': [0.10301, 0.095666, -0.14789, -0.22383, -0.14775],
        }

        for word in expected_twitter:
            assert_allclose(vectors[v.stoi[word], :5],
                            expected_twitter[word])

        assert_allclose(vectors[v.stoi['<unk>']], np.zeros(25))
        assert_allclose(vectors[v.stoi['OOV token']], np.zeros(25))

    # Delete the vectors after we're done to save disk space on CI
    if os.environ.get("TRAVIS") == "true":
        zip_file = os.path.join(self.project_root, ".vector_cache",
                                "glove.twitter.27B.zip")
        conditional_remove(zip_file)
        for dim in ["25", "50", "100", "200"]:
            conditional_remove(
                os.path.join(self.project_root, ".vector_cache",
                             "glove.twitter.27B.{}d.txt".format(dim)))
def __init__(self, data_path, max_length, max_vocab_size, min_freq, eos_token,
             pad_token, unk_token, embed_dim, special_tokens, threshold,
             pre_trained=False):
    """
    Args:
        data_path (str): path to data file
        max_length (int): maximum length of each sentence, including <eos>
        max_vocab_size (int): maximum number of words allowed in vocabulary
        min_freq (int): minimum frequency to add word to vocabulary
        eos_token (str): end of sentence token (tells decoder to start or stop)
        pad_token (str): padding token
        unk_token (str): unknown word token
        embed_dim (int): dimension of embedding vectors
        special_tokens (list of str): other tokens to add to vocabulary
        threshold (int): count of unknown words required to prune sentence
        pre_trained (Vector): pre-trained word embeddings
    """
    special_tokens = [pad_token, unk_token, eos_token] + special_tokens
    # the value 0 will be regarded as padding
    assert special_tokens[0] == pad_token

    inputs, targets, counter, xlen = process_data(data_path, max_length,
                                                  eos_token, pad_token)
    self.vocab = vocab.Vocab(counter=counter, max_size=max_vocab_size,
                             min_freq=min_freq, specials=special_tokens)
    if pre_trained is not False:
        self.vocab.load_vectors(pre_trained)

    assert len(inputs) == len(targets) and len(inputs) == len(xlen)

    self.nwords = len(self.vocab)
    self.max_len = max_length
    self.eos_idx = self.vocab.stoi[eos_token]
    self.pad_idx = self.vocab.stoi[pad_token]
    self.unk_idx = self.vocab.stoi[unk_token]
    self.eos_token = eos_token
    self.pad_token = pad_token
    self.unk_token = unk_token
    self.embed_dim = embed_dim
    self.unk_count = 0      # number of unknown words in dataset
    self.total_tokens = 0   # number of tokens in dataset, not counting padding
    self.special_tokens = special_tokens
    self.x_lens = xlen

    self.x_data = np.zeros((len(inputs), max_length), dtype=np.int32)
    self.y_data = np.zeros((len(targets), max_length), dtype=np.int32)
    convert_to_index(inputs, self, self.x_data)
    convert_to_index(targets, self, self.y_data)

    self.x_data, self.y_data, self.x_lens = prune_data(self.x_data, self.y_data,
                                                       self.x_lens, self, threshold)
    self.x_data = torch.from_numpy(self.x_data)
    self.y_data = torch.from_numpy(self.y_data)
def build_data(all_tokens, all_seqs):
    """
    Build the vocabulary from all tokens, then convert the words in every
    sequence into word indices and pack them into a Tensor.
    @params:
        all_tokens - all tokens.
        all_seqs - all sequences.
    @return:
        On success - the vocab and a Tensor of word indices.
        On failure - an error message.
    """
    vocab = Vocab.Vocab(collections.Counter(all_tokens),
                        specials=[Const.PAD, Const.BOS, Const.EOS])
    indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs]
    return vocab, torch.tensor(indices)
def build_vocab(self, labels, dataset):
    # Do this so that all labels are present even if they incidentally
    # aren't included in the data.
    counter = Counter({label: 0 for label in labels})
    for example in dataset.examples:
        counter.update(example.label)
    self.vocab = vocab.Vocab(counter, specials=labels)

    # Inverse-frequency class weights, normalized to sum to 1.
    weights = []
    factor = 0
    for label in self.vocab.itos:
        freq = self.vocab.freqs[label]
        weight = 1 / freq if freq else 0
        factor += weight
        weights.append(weight)
    self.weights = torch.tensor(weights) / factor
def test_vocab_download_fasttext_vectors(self):
    c = Counter({
        'hello': 4,
        'world': 3,
        'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5,
        'freq_too_low': 2
    })

    # Build a vocab and get vectors twice to test caching, then once more
    # to test string aliases.
    for i in range(3):
        if i == 2:
            vectors = str("fasttext.simple.300d")  # must handle str on Py2
        else:
            vectors = FastText(language='simple')

        v = vocab.Vocab(c,
                        min_freq=3,
                        specials=['<unk>', '<pad>', '<bos>'],
                        vectors=vectors)

        expected_itos = [
            '<unk>', '<pad>', '<bos>', 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world'
        ]
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}
        self.assertEqual(v.itos, expected_itos)
        self.assertEqual(dict(v.stoi), expected_stoi)
        vectors = v.vectors.numpy()

        # The first 5 entries in each vector.
        expected_fasttext_simple_en = {
            'hello': [0.39567, 0.21454, -0.035389, -0.24299, -0.095645],
            'world': [0.10444, -0.10858, 0.27212, 0.13299, -0.33165],
        }

        for word in expected_fasttext_simple_en:
            assert_allclose(vectors[v.stoi[word], :5],
                            expected_fasttext_simple_en[word])

        assert_allclose(vectors[v.stoi['<unk>']], np.zeros(300))
        assert_allclose(vectors[v.stoi['OOV token']], np.zeros(300))

    # Delete the vectors after we're done to save disk space on CI
    if os.environ.get("TRAVIS") == "true":
        vec_file = os.path.join(self.project_root, ".vector_cache",
                                "wiki.simple.vec")
        conditional_remove(vec_file)