def _get_vocabs(self):
    word_list = []
    char_list = []
    for ds in self._datasets:
        for item in ds:
            words = self._get_word_tokens(item[1])
            word_list.extend(words)
            for word in words:
                char_list.extend(iter(word))

    word_counter = data.count_tokens(word_list)
    char_counter = data.count_tokens(char_list)
    word_vocab = Vocab(word_counter)
    char_vocab = Vocab(char_counter)

    # embedding_zh = gluonnlp.embedding.create('fasttext', source='cc.zh.300')
    # embedding_eng = gluonnlp.embedding.create('fasttext', source='cc.en.300')
    # embedding_ko = gluonnlp.embedding.create('fasttext', source='cc.ko.300')
    # word_vocab.set_embedding(embedding_eng, embedding_zh, embedding_ko)
    #
    # count = 0
    # for token, times in word_counter.items():
    #     if (word_vocab.embedding[token].sum() != 0).asscalar():
    #         count += 1
    #     else:
    #         print(token)
    #
    # print("{}/{} words have embeddings".format(count, len(word_counter)))

    return word_vocab, char_vocab
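# A minimal, self-contained sketch of the same word/char vocabulary construction,
# assuming whitespace-tokenized sentences in place of the class's self._datasets and
# self._get_word_tokens helpers (both are internal to the class above, so the inputs
# here are hypothetical stand-ins).
from gluonnlp import Vocab, data

sentences = ['hello world', 'hello gluonnlp']
word_list, char_list = [], []
for sentence in sentences:
    words = sentence.split()          # stand-in for self._get_word_tokens(item[1])
    word_list.extend(words)
    for word in words:
        char_list.extend(iter(word))  # every character of every word

word_vocab = Vocab(data.count_tokens(word_list))
char_vocab = Vocab(data.count_tokens(char_list))
print(word_vocab['hello'], char_vocab['h'])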
def test_join_embedding():
    counter = data.Counter(["love", "走秀", "vacation"])
    vocab1 = Vocab(counter)
    vocab2 = Vocab(counter)
    chinese_embedding = gluonnlp.embedding.create('fasttext', source='wiki.zh')
    eng_embedding = gluonnlp.embedding.create('fasttext', source='wiki.simple')
    vocab1.set_embedding(chinese_embedding)
    vocab2.set_embedding(eng_embedding)
    print(vocab1.embedding['vacation'] + vocab2.embedding['vacation'])
def test_gluon_nlp(self):
    # get corpus statistics
    counter = count_tokens(['alpha', 'beta', 'gamma', 'beta'])
    # create Vocab
    vocab = Vocab(counter)
    # find index based on token
    self.assertEqual(4, vocab['beta'])
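# Why vocab['beta'] == 4: with default settings, Vocab reserves the first indices for
# its special tokens ('<unk>', '<pad>', '<bos>', '<eos>' at 0-3) and assigns the
# remaining indices by decreasing frequency, so 'beta' (count 2) comes right after them.
# A small sketch of that layout:
from gluonnlp import Vocab
from gluonnlp.data import count_tokens

vocab = Vocab(count_tokens(['alpha', 'beta', 'gamma', 'beta']))
print(vocab.idx_to_token[:5])    # ['<unk>', '<pad>', '<bos>', '<eos>', 'beta']
print(vocab[['alpha', 'beta']])  # list lookup returns a list of indices, e.g. [5, 4]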
def _create_squad_vocab(tokenization_fn, dataset):
    all_tokens = []
    for data_item in dataset:
        all_tokens.extend(tokenization_fn(data_item[1]))
        all_tokens.extend(tokenization_fn(data_item[2]))
    counter = data.count_tokens(all_tokens)
    vocab = Vocab(counter)
    return vocab
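# A hedged usage sketch: _create_squad_vocab only needs items whose positions 1 and 2
# hold the two text fields to be tokenized (e.g. question and context), so a toy
# in-memory dataset and a whitespace tokenizer are enough to exercise it. The toy
# records below are made up for illustration.
toy_dataset = [
    (0, 'What is GluonNLP ?', 'GluonNLP is a toolkit for natural language processing .'),
    (1, 'What does it build on ?', 'It builds on Apache MXNet .'),
]
vocab = _create_squad_vocab(str.split, toy_dataset)
print(len(vocab), vocab['GluonNLP'])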
def get_vocab(datasets):
    all_words = [
        word
        for dataset in datasets
        for item in dataset
        for word in item[0]
    ]
    vocab = Vocab(data.count_tokens(all_words))
    glove = embedding.create('glove', source='glove.6B.' + str(args.embedding_dim) + 'd')
    vocab.set_embedding(glove)
    return vocab
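# A hedged follow-up sketch: once set_embedding has attached GloVe vectors, the matrix
# vocab.embedding.idx_to_vec (shape (len(vocab), dim)) can seed a gluon Embedding layer.
# The 50-dimensional GloVe source and the toy counter are assumptions, not values taken
# from get_vocab above.
import mxnet as mx
from mxnet import gluon
from gluonnlp import Vocab, data, embedding

vocab = Vocab(data.count_tokens(['movie', 'vacation', 'movie']))
vocab.set_embedding(embedding.create('glove', source='glove.6B.50d'))

layer = gluon.nn.Embedding(len(vocab), 50)
layer.initialize()
layer.weight.set_data(vocab.embedding.idx_to_vec)  # copy the pretrained vectors
print(layer(mx.nd.array([vocab['movie']])).shape)  # (1, 50)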
def _get_vocabs(train_examples, dev_examples, emb_file_name, is_cased_embedding,
                shrink_word_vocab, pool):
    """Create both word-level and character-level vocabularies.

    Vocabularies are built using data from both train and dev datasets.

    Parameters
    ----------
    train_examples : List[dict]
        Tokenized training examples
    dev_examples : List[dict]
        Tokenized dev examples
    emb_file_name : str
        Glove embedding file name
    is_cased_embedding : bool
        When True, the provided embedding file is cased, uncased otherwise
    shrink_word_vocab : bool
        When True, only tokens that have embeddings in the embedding file are
        retained in the word_vocab; otherwise tokens without embeddings are kept as well
    pool : Pool
        Multiprocessing pool to use

    Returns
    -------
    word_vocab : Vocab
        Word-level vocabulary
    char_vocab : Vocab
        Char-level vocabulary
    """
    tic = time.time()
    print('Word counters receiving started.')

    word_mapper = SQuADAsyncVocabMapper()
    word_reducer = SQuADAsyncVocabReducer()
    word_mapped = list(
        tqdm.tqdm(word_mapper.run_async(
            itertools.chain(train_examples, dev_examples), pool),
                  total=len(train_examples) + len(dev_examples)))
    word_partitioned = tqdm.tqdm(SQuADDataPipeline._partition(
        itertools.chain(*word_mapped)), total=len(word_mapped))
    word_counts = list(
        tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
                  total=len(word_partitioned)))
    print('Word counters received in {:.3f} sec'.format(time.time() - tic))

    tic = time.time()
    print('Char counters receiving started.')

    char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
    char_reducer = SQuADAsyncVocabReducer()
    char_mapped = list(
        tqdm.tqdm(char_mapper.run_async(
            itertools.chain(train_examples, dev_examples), pool),
                  total=len(train_examples) + len(dev_examples)))
    char_partitioned = SQuADDataPipeline._partition(
        itertools.chain(*char_mapped))
    char_counts = list(
        tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
                  total=len(char_partitioned)))
    print('Char counters received in {:.3f} sec'.format(time.time() - tic))

    embedding = nlp.embedding.create('glove', source=emb_file_name)

    if is_cased_embedding:
        word_counts = itertools.chain(
            *[[(item[0], item[1]),
               (item[0].lower(), item[1]),
               (item[0].capitalize(), item[1]),
               (item[0].upper(), item[1])] for item in word_counts])
    else:
        word_counts = [(item[0].lower(), item[1]) for item in word_counts]

    word_vocab = Vocab(
        {item[0]: item[1]
         for item in word_counts
         if not shrink_word_vocab or item[0] in embedding.token_to_idx},
        bos_token=None, eos_token=None)
    word_vocab.set_embedding(embedding)

    char_vocab = Vocab({item[0]: item[1] for item in char_counts},
                       bos_token=None, eos_token=None)

    return word_vocab, char_vocab
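# A hedged standalone sketch of the vocabulary-shrinking step above: keep a counted
# token only if the pretrained embedding has a vector for it. The toy word_counts and
# the 'glove.6B.50d' source are assumptions, not the pipeline's real inputs.
import gluonnlp as nlp
from gluonnlp import Vocab

embedding = nlp.embedding.create('glove', source='glove.6B.50d')
word_counts = [('the', 120), ('qanet', 3), ('answer', 17)]  # toy (token, count) pairs

shrink_word_vocab = True
word_vocab = Vocab(
    {token: count
     for token, count in word_counts
     if not shrink_word_vocab or token in embedding.token_to_idx},
    bos_token=None, eos_token=None)
word_vocab.set_embedding(embedding)
print(word_vocab.idx_to_token)  # 'qanet' is dropped if GloVe has no vector for it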
def read_data(word_path, label_path, nature_path, max_seq_len, PAD, NOT, PAD_NATURE, UNK):
    '''Read, for every sentence, its words, their part-of-speech tags and their entity labels.

    Each sentence is truncated or padded to exactly max_seq_len words: words are padded
    with PAD, part-of-speech tags with PAD_NATURE and entity labels with NOT. Vocabularies
    are then built for the words, the part-of-speech tags and the labels, each reserving
    the unknown symbol UNK.

    Args:
        word_path: path to the file containing the words of every sentence
        label_path: path to the file containing the entity labels of the words
        nature_path: path to the file containing the part-of-speech tags of the words
        max_seq_len: maximum sentence length, counted in words
        PAD: padding symbol for words
        NOT: padding symbol for labels
        PAD_NATURE: padding symbol for part-of-speech tags
        UNK: unknown symbol

    Returns:
        word_vocab: vocabulary of the words
        label_vocab: vocabulary of the entity labels of the words
        nature_vocab: vocabulary of the part-of-speech tags of the words
        input_seqs: list of word sequences, one per sentence
            [[word1, word2, ...], [word1, word2, ...], ...]
        output_seqs: list of label sequences, one per sentence
            [[label1, label2, ...], [label1, label2, ...], ...]
        nature_seqs: list of part-of-speech sequences, one per sentence
            [[nature1, nature2, ...], [nature1, nature2, ...], ...]
    '''
    input_tokens, output_tokens, nature_tokens = [], [], []
    input_seqs, output_seqs, nature_seqs = [], [], []

    with open(word_path, 'r', encoding='utf-8') as fx, \
            open(label_path, 'r', encoding='utf-8') as fy, \
            open(nature_path, 'r', encoding='utf-8') as fn:
        word_lines = fx.readlines()
        label_lines = fy.readlines()
        word_natures = fn.readlines()
        assert len(word_lines) == len(word_natures)
        assert len(word_natures) == len(label_lines)

        for word_line, label_line, word_nature in zip(word_lines, label_lines, word_natures):
            input_seq = word_line.strip()
            output_seq = label_line.strip()
            nature_seq = word_nature.strip()

            cur_input_tokens = input_seq.split(' ')
            cur_output_tokens = output_seq.split(' ')
            cur_nature_tokens = nature_seq.split(' ')
            assert len(cur_input_tokens) == len(cur_output_tokens)
            assert len(cur_output_tokens) == len(cur_nature_tokens)

            # skip sentences with malformed entity labels
            if '' in cur_output_tokens:
                continue

            # if-else: pad short sequences, truncate long ones
            if len(cur_input_tokens) < max_seq_len or len(cur_output_tokens) < max_seq_len \
                    or len(cur_nature_tokens) < max_seq_len:
                # append padding symbols until every sequence has length max_seq_len
                while len(cur_input_tokens) < max_seq_len:
                    cur_input_tokens.append(PAD)
                    cur_output_tokens.append(NOT)
                    cur_nature_tokens.append(PAD_NATURE)
            else:
                cur_input_tokens = cur_input_tokens[0:max_seq_len]
                cur_output_tokens = cur_output_tokens[0:max_seq_len]
                cur_nature_tokens = cur_nature_tokens[0:max_seq_len]

            input_tokens.extend(cur_input_tokens)
            output_tokens.extend(cur_output_tokens)
            nature_tokens.extend(cur_nature_tokens)

            # record the padded/truncated sequences
            input_seqs.append(cur_input_tokens)
            output_seqs.append(cur_output_tokens)
            nature_seqs.append(cur_nature_tokens)

    # build the vocabularies
    word_vocab = Vocab(count_tokens(input_tokens), unknown_token=UNK, padding_token=PAD)
    label_vocab = Vocab(count_tokens(output_tokens), unknown_token=UNK, padding_token=NOT)
    nature_vocab = Vocab(count_tokens(nature_tokens), unknown_token=UNK, padding_token=PAD_NATURE)

    return word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs
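# A hedged follow-up sketch: the vocabularies returned by read_data can turn the padded
# token sequences into integer id sequences for training. The file paths and the padding
# symbols below are hypothetical placeholders, not values used elsewhere in this code.
import mxnet as mx

word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs = read_data(
    word_path='data/words.txt', label_path='data/labels.txt', nature_path='data/natures.txt',
    max_seq_len=50, PAD='<pad>', NOT='O', PAD_NATURE='<pad_nature>', UNK='<unk>')

# Indexing a Vocab with a list of tokens returns the corresponding list of indices.
X = mx.nd.array([word_vocab[seq] for seq in input_seqs])
Y = mx.nd.array([label_vocab[seq] for seq in output_seqs])
print(X.shape, Y.shape)  # (num_sentences, max_seq_len) each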