def test_vocab_with_special_token(tokens, frequencies):
    vocab = Vocab(
        tokens,
        frequencies=frequencies,
        min_frequency=5,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
    )
    assert len(vocab) == len(tokens) - 1
    assert "c" not in vocab
    assert "##b" in vocab
    assert vocab["a"] == 2
    assert vocab.token2idx(["a", "##b", "<unk>"]) == [2, 3, 5]
    assert vocab.idx2token([2, 3, 5]) == ["a", "##b", "<unk>"]
    assert vocab.sorted_tokens == [
        "<s>",
        "</s>",
        "a",
        "##b",
        "ddd",
        "<unk>",
        "<pad>",
    ]
    assert vocab.sorted_token_lengths == [1, 1, 1, 1, 3, 1, 1]

def test_vocab_ichar2itoken(
    char_start, char_end, token_start, token_end, input_tokens, tokens, frequencies
):
    vocab = Vocab(
        tokens,
        frequencies=frequencies,
        min_frequency=5,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
    )
    start_mapping, end_mapping = vocab.create_ichar2itoken_mapping(input_tokens)
    assert start_mapping[char_start] == token_start
    assert end_mapping[char_end] == token_end

def test_word2vec():
    vocab = Vocab(["a", "b", "c"])
    wv = Word2vec(vocab, 10)
    token_ids = np.array([1, 2, 0, 0])
    embeddings = wv(token_ids)
    assert embeddings.shape == (4, 10)
    assert embeddings._keras_mask.numpy().tolist() == [True, True, False, False]

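# Background sketch for the mask assertion above: the behaviour matches a Keras
# Embedding layer built with mask_zero=True, where id 0 (the pad id) is masked
# out. This is an illustrative assumption about how Word2vec is implemented,
# not part of the original tests.
def example_embedding_mask_zero():
    layer = tf.keras.layers.Embedding(input_dim=5, output_dim=10, mask_zero=True)
    embeddings = layer(np.array([1, 2, 0, 0]))
    assert embeddings.shape == (4, 10)
    assert embeddings._keras_mask.numpy().tolist() == [True, True, False, False]
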
def test_jieba_tokenizer():
    vocab = Vocab(["你们", "我们", "好"])
    tokenizer = JiebaTokenizer(vocab)
    tokens = tokenizer.tokenize("你们与我们好")
    assert tokens[0] == "你们"
    assert tokens[1] == "与"
    assert tokens[2] == "我们"
    assert tokens[3] == "好"

def test_vocab_serialization(tokens, frequencies):
    vocab = Vocab(
        tokens,
        frequencies,
        min_frequency=5,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<bos>",
        eos_token="<eos>",
    )
    new_vocab = Vocab.from_json(vocab.to_json())
    assert len(vocab) == len(new_vocab)
    assert vocab.sorted_tokens == new_vocab.sorted_tokens
    assert vocab.pad == new_vocab.pad
    assert vocab.unk == new_vocab.unk
    assert vocab.bos == new_vocab.bos
    assert vocab.eos == new_vocab.eos
    assert str(vocab) == str(new_vocab)

def build_vocab(
    texts: Sequence[str],
    segment_func: Callable[[str], Sequence[str]],
    min_frequency=5,
) -> Vocab:
    counter = Counter(
        itertools.chain.from_iterable(segment_func(text) for text in texts)
    )
    return Vocab(counter, min_frequency=min_frequency)

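# Minimal usage sketch for build_vocab, assuming a plain character segmenter
# (`list` splits a string into characters); the texts are hypothetical:
def example_build_vocab_char_level():
    texts = ["我们好", "我们", "你好"]
    vocab = build_vocab(texts, list, min_frequency=2)
    assert "我" in vocab      # frequency 2, kept
    assert "你" not in vocab  # frequency 1, dropped by min_frequency
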
def test_empty_vocab():
    vocab = Vocab([])
    assert vocab.pad == "<pad>"
    assert vocab.unk == "<unk>"
    assert vocab.bos == "<bos>"
    assert vocab.eos == "<eos>"
    assert vocab[vocab.pad] == 0
    assert vocab[vocab.unk] == 1
    assert vocab[vocab.bos] == 2
    assert vocab[vocab.eos] == 3

def load(cls, directory: str, epoch: Optional[int] = None) -> "BaseNLPModel":
    # Restore the module configuration, vocabulary and serialized Keras model
    # from the directory written by save().
    with open(os.path.join(directory, "meta.json"), encoding="UTF-8") as f:
        meta = json.loads(f.read())
    with open(os.path.join(directory, "vocab.json"), encoding="UTF-8") as f:
        vocab = Vocab.from_json(f.read())
    module = cls.from_config({"vocab": vocab, **meta})
    module._model = tf.keras.models.load_model(
        os.path.join(directory, cls._get_model_filename(epoch=epoch))
    )
    module._built = True
    return module

def test_transform_func(text_with_empty):
    vocab = Vocab(["x", "y"])
    d = ClassificationDataset(
        vocab,
        ["1", "2"],
        segmenter="char",
        is_multilabel=True,
        csv_file=text_with_empty,
        max_length=2,
    )
    data = d.py_transform(tf.constant("xz"), tf.constant("1|2"))
    np.testing.assert_array_equal(data[0], [vocab["x"], vocab[vocab.unk]])
    np.testing.assert_array_equal(data[1], [1, 1])

def test_classification_dataset_transform():
    vocab = Vocab(["x", "y"])
    df = pd.DataFrame({"text": ["xxx", "yyyyy"], "label": ["1|2", "2"]})
    d = ClassificationDataset(
        vocab,
        ["2", "1"],
        segmenter="char",
        is_multilabel=True,
        X=df.text,
        y=df.label,
    )
    dataset = d.batchify(2, shuffle=False)
    for text, label in dataset:
        text = text.numpy()
        assert text[0][-1] == vocab[vocab.pad]
        assert text[1][0] == vocab["y"]
        label = label.numpy()
        np.testing.assert_array_equal(label, [[1, 1], [1, 0]])

def test_create_from_csv(text_without_empty):
    vocab = Vocab(["你", "啊", "拿", "好", "我"])
    labels = ["1", "2"]
    for in_memory in (True, False):
        d = ClassificationDataset(
            vocab,
            labels,
            is_multilabel=True,
            csv_file=text_without_empty,
            in_memory=in_memory,
        )
        dataset = d.batchify(2, shuffle=False)
        for text, label in dataset:
            text = text.numpy()
            assert text[1][-1] == vocab[vocab.pad]
            assert text[1][0] == vocab["我"]
            label = label.numpy()
            np.testing.assert_array_equal(label, [[1, 0], [1, 1]])

def from_checkpoint_file(
    cls,
    model_type: BertFamily,
    checkpoint_directory: str,
    config_filename: Optional[str] = None,
) -> "ModelCheckpoint":
    cls.create_checkpoint_file(checkpoint_directory)
    with open(
        os.path.join(checkpoint_directory, "vocab.txt"), encoding="UTF-8"
    ) as f:
        token_list = f.read().strip("\n").split("\n")
    # Standard BERT vocab.txt layout: [PAD] at index 0, [UNK] at 100,
    # [CLS] at 101 and [SEP] at 102.
    vocab = Vocab(
        token_list,
        pad_token=token_list[0],
        unk_token=token_list[100],
        bos_token=token_list[101],
        eos_token=token_list[102],
    )
    if not config_filename:
        config_filename = cls.search_config_file(checkpoint_directory)
    config = BertConfig.from_json_file(
        os.path.join(checkpoint_directory, config_filename)
    )
    return cls(model_type, checkpoint_directory, config, vocab)

def vocab():
    return Vocab("甲乙丙丁葵", bos_token="[CLS]", eos_token="[SEP]")

def from_word2vec_format(cls, filename: str, segmenter: str = "jieba") -> "Word2vec":
    pad = "<pad>"
    unk = "<unk>"
    bos = "<s>"
    eos = "</s>"
    num_special_tokens = 0
    vocab_size = 0
    embedding_size = 0
    has_header = False
    has_unk = False
    # First pass: detect the optional "<vocab_size> <dim>" header, the
    # embedding size, and any special tokens already present in the file.
    with open(filename) as f:
        for line in f:
            line = line.strip("\n")
            if line == "":
                continue
            if embedding_size == 0 and not line.startswith(" "):
                if len(line.split()) == 2:
                    embedding_size = int(line.split(" ")[1])
                    has_header = True
                    continue
                else:
                    embedding_size = len(line.split(" ")) - 1
            if (
                line.startswith(pad)
                or line.startswith(unk)
                or line.startswith(bos)
                or line.startswith(eos)
            ):
                num_special_tokens += 1
            vocab_size += 1
    num_adding_tokens = 4 - num_special_tokens
    weights = np.zeros((vocab_size + num_adding_tokens, embedding_size))
    # Random init for <s> and </s>; <pad> stays zero, <unk> is filled below.
    weights[2:4, :] = np.random.uniform(-0.1, 0.1, size=(2, embedding_size))
    tokens = [pad, unk, bos, eos]
    # Second pass: read the vectors, remapping special tokens onto indices 0-3.
    with open(filename) as f:
        for i, line in enumerate(f):
            line = line.strip("\n")
            if line == "":
                continue
            if has_header:
                i -= 1
            if i < 0:
                continue
            cells = line.split(" ")
            if cells[0] == "":
                # The token itself is a space character.
                del cells[0]
                cells[0] = " "
            token = cells[0]
            # File row i shifted past the 4 reserved rows, minus the special
            # rows already remapped.
            idx = i + num_adding_tokens + num_special_tokens
            if token == pad:
                idx = 0
                num_special_tokens -= 1
            elif token == unk:
                idx = 1
                has_unk = True
                num_special_tokens -= 1
            elif token == bos:
                idx = 2
                num_special_tokens -= 1
            elif token == eos:
                idx = 3
                num_special_tokens -= 1
            else:
                tokens.append(token)
            vec = list(map(float, cells[1:]))
            weights[idx, :] = vec
    vocab = Vocab(
        tokens,
        pad_token=pad,
        unk_token=unk,
        bos_token=bos,
        eos_token=eos,
    )
    if not has_unk:
        # No <unk> vector in the file: fall back to the mean of the real vectors.
        weights[1, :] = weights[4:, :].mean(axis=0)
    word2vec = cls(vocab, embedding_size, segmenter=segmenter)
    word2vec.build()
    word2vec._model.set_weights([weights])
    return word2vec

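# Format assumption behind the parser above: the textual word2vec layout, an
# optional "<vocab_size> <dim>" header followed by one space-separated
# "token v1 ... vd" row per token. The round-trip below is a hypothetical
# pytest-style sketch (it assumes from_word2vec_format is exposed as a
# classmethod on Word2vec and uses pytest's tmp_path fixture; the tokens and
# vectors are made up):
def example_from_word2vec_format(tmp_path):
    path = tmp_path / "vectors.txt"
    path.write_text("2 3\n我们 0.1 0.2 0.3\n你们 0.4 0.5 0.6\n", encoding="UTF-8")
    wv = Word2vec.from_word2vec_format(str(path))
    # Ids 4 and 5 are the two file tokens, appended after the reserved
    # <pad>/<unk>/<s>/</s> rows 0-3.
    embeddings = wv(np.array([4, 5]))
    assert embeddings.shape == (2, 3)
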
def test_label_binarizer(text_with_empty):
    vocab = Vocab(["x", "y"])
    d = ClassificationDataset(vocab, ["a", "c", "b"], csv_file=text_with_empty)
    np.testing.assert_array_equal(d.py_label_binarizer(["c", "a", "d"]), [1, 1, 0])

def test_vocab_without_special_token(tokens):
    vocab = Vocab(tokens[:-4])
    assert len(vocab) == len(tokens)
    assert vocab.pad == "<pad>"