Example #1
    def _get_vocabs(self):
        word_list = []
        char_list = []

        for ds in self._datasets:
            for item in ds:
                words = self._get_word_tokens(item[1])
                word_list.extend(words)

                for word in words:
                    char_list.extend(iter(word))

        word_counter = data.count_tokens(word_list)
        char_counter = data.count_tokens(char_list)

        word_vocab = Vocab(word_counter)
        char_vocab = Vocab(char_counter)

        # embedding_zh = gluonnlp.embedding.create('fasttext', source='cc.zh.300')
        # embedding_eng = gluonnlp.embedding.create('fasttext', source='cc.en.300')
        # embedding_ko = gluonnlp.embedding.create('fasttext', source='cc.ko.300')
        # word_vocab.set_embedding(embedding_eng, embedding_zh, embedding_ko)
        #
        # count = 0
        # for token, times in word_counter.items():
        #     if (word_vocab.embedding[token].sum() != 0).asscalar():
        #         count += 1
        #     else:
        #         print(token)
        #
        # print("{}/{} words have embeddings".format(count, len(word_counter)))

        return word_vocab, char_vocab
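
A minimal, runnable sketch of the same word/character vocabulary construction outside the class; the toy sentences and the whitespace tokenizer below are hypothetical stand-ins for the datasets and self._get_word_tokens.

from gluonnlp import Vocab, data

sentences = ['machine translation is fun', 'translation needs data']
word_list, char_list = [], []
for sentence in sentences:
    words = sentence.split()       # stand-in for self._get_word_tokens(item[1])
    word_list.extend(words)
    for word in words:
        char_list.extend(word)     # every character of every word

word_vocab = Vocab(data.count_tokens(word_list))
char_vocab = Vocab(data.count_tokens(char_list))

# Convert one sentence to word indices and per-word character indices.
word_ids = word_vocab[sentences[0].split()]
char_ids = [char_vocab[list(w)] for w in sentences[0].split()]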
Example #2
def test_berttokenizer():

    # test WordpieceTokenizer
    vocab_tokens = ["want", "##want", "##ed", "wa", "un", "runn", "##ing"]
    vocab = Vocab(
        count_tokens(vocab_tokens),
        reserved_tokens=["[CLS]", "[SEP]"],
        unknown_token="[UNK]", padding_token=None, bos_token=None, eos_token=None)
    tokenizer = t.BERTTokenizer(vocab=vocab)

    assert tokenizer(u"unwanted running") == [
        "un", "##want", "##ed", "runn", "##ing"]
    assert tokenizer(u"unwantedX running") == ["[UNK]", "runn", "##ing"]
    assert tokenizer.is_first_subword('un')
    assert not tokenizer.is_first_subword('##want')

    # test BERTTokenizer
    vocab_tokens = ["[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
                    "##ing", ","]

    vocab = Vocab(
        count_tokens(vocab_tokens),
        reserved_tokens=["[CLS]", "[SEP]"],
        unknown_token="[UNK]", padding_token=None, bos_token=None, eos_token=None)
    tokenizer = t.BERTTokenizer(vocab=vocab)
    tokens = tokenizer(u"UNwant\u00E9d,running")
    assert tokens == ["un", "##want", "##ed", ",", "runn", "##ing"]
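
As a hedged follow-up sketch, the subword tokens produced above can be looked up in the same toy vocabulary to obtain integer ids; BERTTokenizer is assumed to be importable from gluonnlp.data, matching the `t.BERTTokenizer` alias used in the test.

from gluonnlp import Vocab
from gluonnlp.data import count_tokens, BERTTokenizer

vocab = Vocab(count_tokens(["want", "##want", "##ed", "wa", "un", "runn", "##ing"]),
              reserved_tokens=["[CLS]", "[SEP]"], unknown_token="[UNK]",
              padding_token=None, bos_token=None, eos_token=None)
tokenizer = BERTTokenizer(vocab=vocab)

tokens = tokenizer(u"unwanted running")   # ['un', '##want', '##ed', 'runn', '##ing']
token_ids = vocab[tokens]                 # subword tokens -> integer ids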
Example #3
def test_bert_sentences_transform():
    text_a = u'is this jacksonville ?'
    text_b = u'no it is not'
    vocab_tokens = ['is', 'this', 'jack', '##son', '##ville', '?', 'no', 'it', 'is', 'not']

    bert_vocab = BERTVocab(count_tokens(vocab_tokens))
    tokenizer = t.BERTTokenizer(vocab=bert_vocab)

    # test BERTSentenceTransform
    bert_st = t.BERTSentenceTransform(tokenizer, 15, pad=True, pair=True)
    token_ids, length, type_ids = bert_st((text_a, text_b))

    text_a_tokens = ['is', 'this', 'jack', '##son', '##ville', '?']
    text_b_tokens = ['no', 'it', 'is', 'not']
    text_a_ids = bert_vocab[text_a_tokens]
    text_b_ids = bert_vocab[text_b_tokens]

    cls_ids = bert_vocab[[bert_vocab.cls_token]]
    sep_ids = bert_vocab[[bert_vocab.sep_token]]
    pad_ids = bert_vocab[[bert_vocab.padding_token]]

    concated_ids = cls_ids + text_a_ids + sep_ids + text_b_ids + sep_ids + pad_ids
    valid_token_ids = np.array([pad_ids[0]] * 15, dtype=np.int32)
    for i, x in enumerate(concated_ids):
        valid_token_ids[i] = x
    valid_type_ids = np.zeros((15,), dtype=np.int32)
    start = len(text_a_tokens) + 2
    end = len(text_a_tokens) + 2 + len(text_b_tokens) + 1
    valid_type_ids[start:end] = 1

    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
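
A short sketch of the single-sentence case (pair=False), under the assumption that BERTSentenceTransform and BERTTokenizer are importable from gluonnlp.data and BERTVocab from gluonnlp.vocab, as in the test above.

from gluonnlp.data import count_tokens, BERTTokenizer, BERTSentenceTransform
from gluonnlp.vocab import BERTVocab

bert_vocab = BERTVocab(count_tokens(
    ['is', 'this', 'jack', '##son', '##ville', '?', 'no', 'it', 'is', 'not']))
tokenizer = BERTTokenizer(vocab=bert_vocab)

bert_st = BERTSentenceTransform(tokenizer, 15, pad=True, pair=False)
token_ids, valid_length, segment_ids = bert_st((u'is this jacksonville ?',))
# token_ids: [CLS] + subword ids + [SEP], right-padded with [PAD] to length 15
# segment_ids: all zeros, since there is only one segment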
Example #4
    def test_gluon_nlp(self):
        # get corpus statistics
        counter = count_tokens(['alpha', 'beta', 'gamma', 'beta'])
        # create Vocab
        vocab = Vocab(counter)
        # find index based on token: indices 0-3 hold the default special tokens
        # (<unk>, <pad>, <bos>, <eos>), so the most frequent word 'beta' is at 4
        self.assertEqual(4, vocab['beta'])
Example #5
def test_bert_dataset_transform():
    text_a = u'is this jacksonville ?'
    text_b = u'no it is not'
    label_cls = 0
    vocab_tokens = [
        'is', 'this', 'jack', '##son', '##ville', '?', 'no', 'it', 'is', 'not'
    ]

    bert_vocab = BERTVocab(count_tokens(vocab_tokens))
    tokenizer = BERTTokenizer(vocab=bert_vocab)

    # test BERTDatasetTransform for classification task
    bert_cls_dataset_t = BERTDatasetTransform(tokenizer,
                                              15,
                                              labels=[label_cls],
                                              pad=True,
                                              pair=True,
                                              label_dtype='int32')
    token_ids, length, type_ids, label_ids = bert_cls_dataset_t(
        (text_a, text_b, label_cls))

    text_a_tokens = ['is', 'this', 'jack', '##son', '##ville', '?']
    text_b_tokens = ['no', 'it', 'is', 'not']
    text_a_ids = bert_vocab[text_a_tokens]
    text_b_ids = bert_vocab[text_b_tokens]

    cls_ids = bert_vocab[[bert_vocab.cls_token]]
    sep_ids = bert_vocab[[bert_vocab.sep_token]]
    pad_ids = bert_vocab[[bert_vocab.padding_token]]

    concated_ids = cls_ids + text_a_ids + sep_ids + text_b_ids + sep_ids + pad_ids
    valid_token_ids = np.array([pad_ids[0]] * 15, dtype=np.int32)
    for i, x in enumerate(concated_ids):
        valid_token_ids[i] = x
    valid_type_ids = np.zeros((15, ), dtype=np.int32)
    start = len(text_a_tokens) + 2
    end = len(text_a_tokens) + 2 + len(text_b_tokens) + 1
    valid_type_ids[start:end] = 1

    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
    assert all(label_ids == np.array([label_cls], dtype=np.int32))

    # test BERTDatasetTransform for regression task
    label_reg = 0.2
    bert_reg_dataset_t = BERTDatasetTransform(tokenizer,
                                              15,
                                              pad=True,
                                              pair=True,
                                              label_dtype='float32')
    token_ids, length, type_ids, label_reg_val = bert_reg_dataset_t(
        (text_a, text_b, label_reg))
    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
    assert all(label_reg_val == np.array([label_reg], dtype=np.float32))
Example #7
    def _create_squad_vocab(tokenization_fn, dataset):
        all_tokens = []

        for data_item in dataset:
            all_tokens.extend(tokenization_fn(data_item[1]))
            all_tokens.extend(tokenization_fn(data_item[2]))

        counter = data.count_tokens(all_tokens)
        vocab = Vocab(counter)
        return vocab
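
A hypothetical invocation sketch: it assumes _create_squad_vocab is in scope together with `from gluonnlp import data, Vocab`, and that fields 1 and 2 of each dataset item hold the two text fields the function tokenizes.

toy_dataset = [
    (0, 'what is gluon nlp ?', 'gluon nlp is a toolkit for natural language processing .'),
    (1, 'what does it provide ?', 'it provides text data pipelines and models .'),
]
vocab = _create_squad_vocab(str.split, toy_dataset)
print(len(vocab), vocab['gluon'])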
Example #8
    def __call__(self, example):
        """Maps examples into distinct tokens

        Parameters
        ----------
        example : dict
            Example to process with context_tokens and ques_tokens keys

        Returns
        -------
        mapped_values : List[Tuple]
            Result of mapping process. Each tuple of (token, count) format
        """
        para_counter = data.count_tokens(example['context_tokens'] if not self._iterate_over_example
                                         else [c for tkn in example['context_tokens'] for c in tkn])
        ques_counter = data.count_tokens(example['ques_tokens'] if not self._iterate_over_example
                                         else [c for tkn in example['ques_tokens'] for c in tkn])
        counter = para_counter + ques_counter
        return list(counter.items())
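
A minimal sketch of the counting done above for the word-level case (i.e. when self._iterate_over_example is False); the example dict below is hypothetical.

from gluonnlp import data

example = {'context_tokens': ['the', 'cat', 'sat', 'on', 'the', 'mat'],
           'ques_tokens': ['where', 'is', 'the', 'cat']}
para_counter = data.count_tokens(example['context_tokens'])
ques_counter = data.count_tokens(example['ques_tokens'])
counter = para_counter + ques_counter
print(list(counter.items()))   # e.g. ('the', 3), ('cat', 2), ('sat', 1), ...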
Example #9
def get_vocab(datasets):
    all_words = [
        word for dataset in datasets for item in dataset for word in item[0]
    ]
    vocab = Vocab(data.count_tokens(all_words))
    glove = embedding.create(
        'glove', source='glove.6B.{}d'.format(args.embedding_dim))
    vocab.set_embedding(glove)
    return vocab
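
A hedged sketch of what the attached embedding provides; 'glove.6B.50d' is chosen here only to keep the download small, whereas the function above picks the source from args.embedding_dim.

import gluonnlp as nlp

vocab = nlp.Vocab(nlp.data.count_tokens(['hello', 'world', 'hello']))
glove = nlp.embedding.create('glove', source='glove.6B.50d')
vocab.set_embedding(glove)
print(vocab.embedding['hello'].shape)   # (50,) -- the GloVe vector attached to 'hello'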
Example #10
def read_data(word_path, label_path, nature_path, max_seq_len, PAD, NOT, PAD_NATURE, UNK):
    '''
    Read, for every sentence in the data, its words, the part-of-speech (nature) tags of those
    words and the entity labels of those words. Truncate or pad each sentence to the given
    max_seq_len: words are padded with PAD, nature tags with PAD_NATURE and labels with NOT.
    Build the word, label and nature vocabularies, each of which reserves the unknown-token
    symbol UNK.
    Args:
        word_path: path to the file containing the words of each sentence
        label_path: path to the file containing the labels of the words of each sentence
        nature_path: path to the file containing the nature (POS) tags of the words of each sentence
        max_seq_len: maximum sentence length, in words
        PAD: padding symbol for words
        NOT: padding symbol for labels
        PAD_NATURE: padding symbol for nature tags
        UNK: unknown-token symbol
    Returns:
        word_vocab: vocabulary of the words
        label_vocab: vocabulary of the entity labels of the words
        nature_vocab: vocabulary of the nature (POS) tags of the words
        input_seqs: list of the word sequences of all sentences [[word1, word2, ...], [word1, word2, ...], ...]
        output_seqs: list of the label sequences of all sentences [[label1, label2, ...], [label1, label2, ...], ...]
        nature_seqs: list of the nature-tag sequences of all sentences [[nature1, nature2, ...], [nature1, nature2, ...], ...]
    '''
    input_tokens, output_tokens, nature_tokens = [], [], []
    input_seqs, output_seqs, nature_seqs = [], [], []

    with open(word_path, 'r', encoding='utf-8') as fx, open(label_path, 'r', encoding='utf-8') as fy, open(nature_path, 'r', encoding='utf-8') as fn:
        word_lines = fx.readlines()
        label_lines = fy.readlines()
        word_natures = fn.readlines()
        assert len(word_lines) == len(word_natures)
        assert len(word_natures) == len(label_lines)

        for word_line, label_line, word_nature in zip(word_lines, label_lines, word_natures):
            input_seq = word_line.strip()
            output_seq = label_line.strip()
            nature_seq = word_nature.strip()

            cur_input_tokens = input_seq.split(' ')
            cur_output_tokens = output_seq.split(' ')
            cur_nature_tokens = nature_seq.split(' ')
            assert len(cur_input_tokens) == len(cur_output_tokens)
            assert len(cur_output_tokens) == len(cur_nature_tokens)

            # skip sentences with odd entity-label annotations (empty label tokens)
            if '' in cur_output_tokens:
                continue

            # if-else: truncate long sequences, pad short ones
            if len(cur_input_tokens) < max_seq_len or len(cur_output_tokens) < max_seq_len or len(cur_nature_tokens) < max_seq_len:

                # pad with PAD / NOT / PAD_NATURE until every sequence has length max_seq_len
                while len(cur_input_tokens) < max_seq_len:
                    cur_input_tokens.append(PAD)
                    cur_output_tokens.append(NOT)
                    cur_nature_tokens.append(PAD_NATURE)
            else:
                cur_input_tokens = cur_input_tokens[0:max_seq_len]
                cur_output_tokens = cur_output_tokens[0:max_seq_len]
                cur_nature_tokens = cur_nature_tokens[0:max_seq_len]

            input_tokens.extend(cur_input_tokens)
            output_tokens.extend(cur_output_tokens)
            nature_tokens.extend(cur_nature_tokens)

            # record the sequences
            input_seqs.append(cur_input_tokens)
            output_seqs.append(cur_output_tokens)
            nature_seqs.append(cur_nature_tokens)

        # build the vocabularies
        word_vocab = Vocab(count_tokens(input_tokens), unknown_token=UNK, padding_token=PAD)
        label_vocab = Vocab(count_tokens(output_tokens), unknown_token=UNK, padding_token=NOT)
        nature_vocab = Vocab(count_tokens(nature_tokens), unknown_token=UNK, padding_token=PAD_NATURE)

    return word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs
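
A minimal runnable sketch of the vocabulary-building step above; the toy tokens and the '<unk>'/'<pad>' symbols stand in for UNK and PAD.

from gluonnlp import Vocab
from gluonnlp.data import count_tokens

tokens = ['北京', '欢迎', '你', '你']
word_vocab = Vocab(count_tokens(tokens), unknown_token='<unk>', padding_token='<pad>')
print(word_vocab['你'])        # the most frequent word gets the first non-special index
print(word_vocab['没见过'])    # unseen words fall back to the <unk> index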