Example #1
    def _load_processed_data(self):
        """ Load processed data from the disk
        Returns
        -------
        train_examples : List[Tuple]
            Processed train examples. Each tuple consists of question_id, record_index,
            context_tokens_indices, question_tokens_indices, context_chars_indices,
            question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
            context, context_tokens_spans
        dev_examples : List[Tuple]
            Processed dev examples. Each tuple consists of question_id, record_index,
            context_tokens_indices, question_tokens_indices, context_chars_indices,
            question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
            context, context_tokens_spans
        word_vocab : Vocab
            Word-level vocabulary
        char_vocab : Vocab
            Char-level vocabulary
        """
        with open(os.path.join(self._data_root_path, self._processed_train_data_file_name),
                  'r') as f:
            train_examples = json.load(f)

        with open(os.path.join(self._data_root_path, self._processed_dev_data_file_name), 'r') as f:
            dev_examples = json.load(f)

        with open(os.path.join(self._data_root_path, self._word_vocab_file_name), 'r') as f:
            word_vocab = Vocab.from_json(json.load(f))

        with open(os.path.join(self._data_root_path, self._char_vocab_file_name), 'r') as f:
            char_vocab = Vocab.from_json(json.load(f))

        return train_examples, dev_examples, word_vocab, char_vocab
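For context, a minimal sketch of the matching save step that could have produced the four files read above; the method name _save_processed_data is hypothetical, the file-name attributes simply mirror the loader, and Vocab.to_json() is dumped through json so that json.load() in the loader returns the string Vocab.from_json() expects:

    def _save_processed_data(self, train_examples, dev_examples, word_vocab, char_vocab):
        """Counterpart of _load_processed_data (a sketch, not code from the source)."""
        with open(os.path.join(self._data_root_path, self._processed_train_data_file_name),
                  'w') as f:
            json.dump(train_examples, f)

        with open(os.path.join(self._data_root_path, self._processed_dev_data_file_name), 'w') as f:
            json.dump(dev_examples, f)

        # Vocab.to_json() returns a JSON string; dumping it keeps the loader's
        # Vocab.from_json(json.load(f)) round-trip intact
        with open(os.path.join(self._data_root_path, self._word_vocab_file_name), 'w') as f:
            json.dump(word_vocab.to_json(), f)

        with open(os.path.join(self._data_root_path, self._char_vocab_file_name), 'w') as f:
            json.dump(char_vocab.to_json(), f)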
Example #2
    def _get_vocabs(self):
        word_list = []
        char_list = []

        for ds in self._datasets:
            for item in ds:
                words = self._get_word_tokens(item[1])
                word_list.extend(words)

                for word in words:
                    char_list.extend(iter(word))

        word_counter = data.count_tokens(word_list)
        char_counter = data.count_tokens(char_list)

        word_vocab = Vocab(word_counter)
        char_vocab = Vocab(char_counter)

        # embedding_zh = gluonnlp.embedding.create('fasttext', source='cc.zh.300')
        # embedding_eng = gluonnlp.embedding.create('fasttext', source='cc.en.300')
        # embedding_ko = gluonnlp.embedding.create('fasttext', source='cc.ko.300')
        # word_vocab.set_embedding(embedding_eng, embedding_zh, embedding_ko)
        #
        # count = 0
        # for token, times in word_counter.items():
        #     if (word_vocab.embedding[token].sum() != 0).asscalar():
        #         count += 1
        #     else:
        #         print(token)
        #
        # print("{}/{} words have embeddings".format(count, len(word_counter)))

        return word_vocab, char_vocab
Example #3
def get_vocab(datasets):
    all_words = [
        word for dataset in datasets for item in dataset for word in item[0]
    ]
    vocab = Vocab(data.count_tokens(all_words))
    glove = embedding.create('glove',
                             source='glove.6B.' + str(args.embedding_dim) +
                             'd')
    vocab.set_embedding(glove)
    return vocab
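A hedged usage sketch of get_vocab with toy data; the dataset shape (item[0] holding a token list) and the 50-dimensional GloVe source standing in for args.embedding_dim are assumptions for illustration:

from gluonnlp import Vocab, data, embedding

# toy datasets: each item is a tuple whose first element is a token list,
# which is what get_vocab reads via item[0]
toy_datasets = [[(['the', 'movie', 'was', 'great'],), (['awful', 'movie'],)]]

all_words = [word for dataset in toy_datasets for item in dataset for word in item[0]]
vocab = Vocab(data.count_tokens(all_words))

# attach 50-dimensional GloVe vectors (stand-in for args.embedding_dim)
vocab.set_embedding(embedding.create('glove', source='glove.6B.50d'))

print(len(vocab))                      # vocabulary size including special tokens
print(vocab.embedding['movie'].shape)  # (50,)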
Example #4
    def __init__(self, num_classes: int, embedding_dim: int, k_max: int, vocab: Vocab) -> None:
        """Instantiating VDCNN class

        Args:
            num_classes (int): the number of classes
            embedding_dim (int): the dimension of embedding vector for token
            k_max (int): the parameter of k-max pooling following last convolution block
            vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
        """
        super(VDCNN, self).__init__()
        self._extractor = nn.Sequential(nn.Embedding(len(vocab), embedding_dim, vocab.to_indices(vocab.padding_token)),
                                        Permute(),
                                        nn.Conv1d(embedding_dim, 64, 3, 1, 1),
                                        ConvBlock(64, 64),
                                        ConvBlock(64, 64),
                                        nn.MaxPool1d(2, 2),
                                        ConvBlock(64, 128),
                                        ConvBlock(128, 128),
                                        nn.MaxPool1d(2, 2),
                                        ConvBlock(128, 256),
                                        ConvBlock(256, 256),
                                        nn.MaxPool1d(2, 2),
                                        ConvBlock(256, 512),
                                        ConvBlock(512, 512),
                                        nn.AdaptiveMaxPool1d(k_max),
                                        Flatten())

        self._classifier = nn.Sequential(nn.Linear(512 * k_max, 2048),
                                         nn.ReLU(),
                                         nn.Linear(2048, 2048),
                                         nn.ReLU(),
                                         nn.Linear(2048, num_classes))
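The constructor splits the network into a feature extractor and a classifier; a typical forward pass over those two sub-modules would look like the sketch below (an assumption about usage, not code from the source), where x is a batch of token-index sequences:

    def forward(self, x):
        # x: (batch, seq_len) token indices
        feature = self._extractor(x)       # (batch, 512 * k_max) after Flatten
        score = self._classifier(feature)  # (batch, num_classes) raw logits
        return score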
Example #5
    def __init__(self, num_classes: int, embedding_dim: int,
                 vocab: Vocab) -> None:
        """Instantiating CharCNN class

        Args:
            num_classes (int): the number of classes
            embedding_dim (int): the dimension of embedding vector for token
            vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
        """
        super(CharCNN, self).__init__()
        self._extractor = nn.Sequential(
            nn.Embedding(len(vocab), embedding_dim,
                         vocab.to_indices(vocab.padding_token)), Permute(),
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=256,
                      kernel_size=7), nn.ReLU(), nn.MaxPool1d(3, 3),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=7),
            nn.ReLU(), nn.MaxPool1d(3, 3),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3),
            nn.ReLU(),
            nn.Conv1d(in_channels=256, out_channels=256, kernel_size=3),
            nn.ReLU(), nn.MaxPool1d(3, 3), Flatten())

        self._classifier = nn.Sequential(
            nn.Linear(in_features=1792, out_features=512), nn.ReLU(),
            nn.Dropout(), nn.Linear(in_features=512, out_features=512),
            nn.ReLU(), nn.Dropout(),
            nn.Linear(in_features=512, out_features=num_classes))

        self.apply(self._init_weights)
Example #6
    def test_gluon_nlp(self):
        # get corpus statistics
        counter = count_tokens(['alpha', 'beta', 'gamma', 'beta'])
        # create Vocab
        vocab = Vocab(counter)
        # find index based on token
        self.assertEqual(4, vocab['beta'])
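The expected index 4 follows from Vocab's defaults: the unknown, padding, BOS and EOS tokens take indices 0 to 3, and the remaining tokens are ordered by descending frequency, so 'beta' (counted twice) lands at index 4. A short inspection sketch:

from gluonnlp import Vocab
from gluonnlp.data import count_tokens

counter = count_tokens(['alpha', 'beta', 'gamma', 'beta'])
vocab = Vocab(counter)

print(vocab.idx_to_token[:4])  # the reserved tokens: ['<unk>', '<pad>', '<bos>', '<eos>']
print(vocab['beta'])           # 4 -- the most frequent corpus token comes right after them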
Example #7
    def __init__(self, label_vocab: Vocab, token_vocab: Vocab, lstm_hidden_dim: int) -> None:
        """Instantiating BilstmCRF class

        Args:
            label_vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab that has label information
            token_vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab that has token information
            lstm_hidden_dim (int): the number of hidden dimension of lstm
        """
        super(BilstmCRF, self).__init__()
        self._embedding = Embedding(token_vocab, padding_idx=token_vocab.to_indices(token_vocab.padding_token),
                                    freeze=False, permuting=False, tracking=True)
        self._pipe = Linker(permuting=False)
        self._bilstm = BiLSTM(self._embedding._ops.embedding_dim, lstm_hidden_dim, using_sequence=True)
        self._fc = nn.Linear(2 * lstm_hidden_dim, len(label_vocab))
        self._crf = CRF(len(label_vocab), bos_tag_id=label_vocab.to_indices(label_vocab.bos_token),
                        eos_tag_id=label_vocab.to_indices(label_vocab.eos_token),
                        pad_tag_id=label_vocab.to_indices(label_vocab.padding_token))
Example #8
    def __init__(self, vocab: Vocab, word_dropout_ratio: float = .2) -> None:
        """Instantiating MultiChannelEmbedding class

        Args:
            vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
            word_dropout_ratio (float): ratio of tokens in the sequence that are replaced with "<unk>"
        """
        super(MultiChannelEmbedding, self).__init__()
        self._static = nn.Embedding.from_pretrained(
            torch.from_numpy(vocab.embedding.idx_to_vec.asnumpy()),
            freeze=True,
            padding_idx=vocab.to_indices(vocab.padding_token))
        self._non_static = nn.Embedding.from_pretrained(
            torch.from_numpy(vocab.embedding.idx_to_vec.asnumpy()),
            freeze=False,
            padding_idx=vocab.to_indices(vocab.padding_token))
        self._word_dropout_ratio = word_dropout_ratio
Example #9
def test_join_embedding():
    counter = data.Counter(["love", "走秀", "vacation"])
    vocab1 = Vocab(counter)
    vocab2 = Vocab(counter)
    chinese_embedding = gluonnlp.embedding.create('fasttext', source='wiki.zh')
    eng_embedding = gluonnlp.embedding.create('fasttext', source='wiki.simple')

    vocab1.set_embedding(chinese_embedding)
    vocab2.set_embedding(eng_embedding)

    print(vocab1.embedding['vacation'] + vocab2.embedding['vacation'])
Example #10
    def _create_squad_vocab(tokenization_fn, dataset):
        all_tokens = []

        for data_item in dataset:
            all_tokens.extend(tokenization_fn(data_item[1]))
            all_tokens.extend(tokenization_fn(data_item[2]))

        counter = data.count_tokens(all_tokens)
        vocab = Vocab(counter)
        return vocab
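A hedged usage sketch of _create_squad_vocab; the record layout (question text at index 1, context at index 2) matches the indexing above, while the whitespace tokenizer and toy records are assumptions for illustration:

def whitespace_tokenizer(text):
    # stand-in tokenization_fn for the sketch
    return text.split()

# toy records shaped like (record_id, question, context)
toy_dataset = [
    (0, 'Who wrote Hamlet?', 'Hamlet was written by William Shakespeare.'),
    (1, 'Where is Paris?', 'Paris is the capital of France.'),
]

# calling the helper directly here; in the source it is a (presumably static) method
vocab = _create_squad_vocab(whitespace_tokenizer, toy_dataset)
print(len(vocab))      # distinct tokens plus the special tokens
print(vocab['Paris'])  # index of a token seen in the toy data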
Example #11
    def __init__(self, lstm_hidden_dim: int, da: int, r: int, vocab: Vocab) -> None:
        """Instantiating SentenceEncoder class

        Args:
            lstm_hidden_dim (int): the number of features in the hidden states of the bi-directional lstm
            da (int): the number of features in hidden layer from self-attention
            r (int): the number of aspects of self-attention
            vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
        """
        super(SentenceEncoder, self).__init__()
        self._embedding = Embedding(vocab, padding_idx=vocab.to_indices(vocab.padding_token), freeze=False,
                                    permuting=False, tracking=True)
        self._pipe = Linker(permuting=False)
        self._bilstm = BiLSTM(self._embedding._ops.embedding_dim, lstm_hidden_dim, using_sequence=True)
        self._attention = SelfAttention(2 * lstm_hidden_dim, da, r)
Example #12
    def __init__(self, num_classes: int, embedding_dim: int, hidden_dim: int,
                 vocab: Vocab) -> None:
        """Instantiating ConvRec class

        Args:
            num_classes (int): the number of classes
            embedding_dim (int): the dimension of embedding vector for token
            hidden_dim (int): the number of channels in the convolution layers and hidden features in the bi-directional lstm
            vocab (gluonnlp.Vocab): the instance of gluonnlp.Vocab
        """
        super(ConvRec, self).__init__()
        self._ops = nn.Sequential(
            Embedding(len(vocab),
                      embedding_dim,
                      vocab.to_indices(vocab.padding_token),
                      permuting=True,
                      tracking=True),
            Conv1d(embedding_dim, hidden_dim, 5, 1, 1, F.relu, tracking=True),
            MaxPool1d(2, 2, tracking=True),
            Conv1d(hidden_dim, hidden_dim, 3, 1, 1, F.relu, tracking=True),
            MaxPool1d(2, 2, tracking=True), Linker(permuting=True),
            BiLSTM(hidden_dim, hidden_dim, using_sequence=False), nn.Dropout(),
            nn.Linear(in_features=2 * hidden_dim, out_features=num_classes))

        self.apply(self._init_weights)
Example #13
    def vocab(self):
        path = os.path.join(self._path, 'vocab.json')
        with io.open(path, 'r', encoding='utf-8') as in_file:
            return Vocab.from_json(in_file.read())
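The vocab.json read here can be written with Vocab.to_json(), which serializes the vocabulary to a JSON string; a minimal writer sketch with the hypothetical method name save_vocab and the same path layout:

    def save_vocab(self, vocab):
        # counterpart of the reader above: persist the vocabulary as vocab.json
        path = os.path.join(self._path, 'vocab.json')
        with io.open(path, 'w', encoding='utf-8') as out_file:
            out_file.write(vocab.to_json())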
Example #14
    def _get_vocabs(train_examples, dev_examples, emb_file_name,
                    is_cased_embedding, shrink_word_vocab, pool):
        """Create both word-level and character-level vocabularies. Vocabularies are built using
        data from both train and dev datasets.

        Parameters
        ----------
        train_examples : List[dict]
            Tokenized training examples
        dev_examples : List[dict]
            Tokenized dev examples
        emb_file_name : str
            GloVe embedding file name
        is_cased_embedding : bool
            When True, the provided embedding file is cased; otherwise it is uncased
        shrink_word_vocab : bool
            When True, only tokens that have embeddings in the embedding file are retained in the
            word_vocab. Otherwise, tokens without embeddings are kept as well
        pool : Pool
            Multiprocessing pool to use

        Returns
        -------
        word_vocab : Vocab
            Word-level vocabulary
        char_vocab : Vocab
            Char-level vocabulary
        """
        tic = time.time()
        print('Started receiving word counters.')

        word_mapper = SQuADAsyncVocabMapper()
        word_reducer = SQuADAsyncVocabReducer()
        word_mapped = list(
            tqdm.tqdm(word_mapper.run_async(
                itertools.chain(train_examples, dev_examples), pool),
                      total=len(train_examples) + len(dev_examples)))
        word_partitioned = tqdm.tqdm(SQuADDataPipeline._partition(
            itertools.chain(*word_mapped)),
                                     total=len(word_mapped))
        word_counts = list(
            tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
                      total=len(word_partitioned)))
        print('Word counters received in {:.3f} sec'.format(time.time() - tic))

        tic = time.time()
        print('Started receiving char counters.')
        char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
        char_reducer = SQuADAsyncVocabReducer()
        char_mapped = list(
            tqdm.tqdm(char_mapper.run_async(
                itertools.chain(train_examples, dev_examples), pool),
                      total=len(train_examples) + len(dev_examples)))
        char_partitioned = SQuADDataPipeline._partition(
            itertools.chain(*char_mapped))
        char_counts = list(
            tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
                      total=len(char_partitioned)))
        print('Char counters received in {:.3f} sec'.format(time.time() - tic))

        embedding = nlp.embedding.create('glove', source=emb_file_name)

        if is_cased_embedding:
            word_counts = itertools.chain(*[
                [(item[0], item[1]),
                 (item[0].lower(), item[1]),
                 (item[0].capitalize(), item[1]),
                 (item[0].upper(), item[1])]
                for item in word_counts])
        else:
            word_counts = [(item[0].lower(), item[1]) for item in word_counts]

        word_vocab = Vocab(
            {
                item[0]: item[1]
                for item in word_counts
                if not shrink_word_vocab or item[0] in embedding.token_to_idx
            },
            bos_token=None,
            eos_token=None)
        word_vocab.set_embedding(embedding)
        char_vocab = Vocab({item[0]: item[1]
                            for item in char_counts},
                           bos_token=None,
                           eos_token=None)

        return word_vocab, char_vocab
Example #15
def read_data(word_path, label_path, nature_path, max_seq_len, PAD, NOT, PAD_NATURE, UNK):
    '''
    Read, for every sentence in the data, its words, the part-of-speech of each word and the
    entity label attached to each word. Every sentence is truncated or padded to max_seq_len
    words: words are padded with PAD, part-of-speech tags with PAD_NATURE and labels with NOT.
    Build the word, label and part-of-speech vocabularies; each vocabulary reserves the
    unknown symbol UNK.
    Args:
        word_path: path to the data containing the words of every sentence
        label_path: path to the data containing the labels of the words of every sentence
        nature_path: path to the data containing the part-of-speech tags of the words of every sentence
        max_seq_len: maximum sentence length, measured in words
        PAD: padding symbol for words
        NOT: padding symbol for labels
        PAD_NATURE: padding symbol for part-of-speech tags
        UNK: unknown symbol
    Returns:
        word_vocab: vocabulary of the words
        label_vocab: vocabulary of the entity labels of the words
        nature_vocab: vocabulary of the part-of-speech tags of the words
        input_seqs: list of the input words of every sentence [[word1, word2, ...], [word1, word2, ...], ...]
        output_seqs: list of the labels of the words of every sentence [[label1, label2, ...], [label1, label2, ...], ...]
        nature_seqs: list of the part-of-speech tags of the words of every sentence [[nature1, nature2, ...], [nature1, nature2, ...], ...]
    '''
    input_tokens, output_tokens, nature_tokens = [], [], []
    input_seqs, output_seqs, nature_seqs = [], [], []

    with open(word_path, 'r', encoding='utf-8') as fx, open(label_path, 'r', encoding='utf-8') as fy, open(nature_path, 'r', encoding='utf-8') as fn:
        word_lines = fx.readlines()
        label_lines = fy.readlines()
        word_natures = fn.readlines()
        assert len(word_lines) == len(word_natures)
        assert len(word_natures) == len(label_lines)

        for word_line, label_line, word_nature in zip(word_lines, label_lines, word_natures):
            input_seq = word_line.strip()
            output_seq = label_line.strip()
            nature_seq = word_nature.strip()

            cur_input_tokens = input_seq.split(' ')
            cur_output_tokens = output_seq.split(' ')
            cur_nature_tokens = nature_seq.split(' ')
            assert len(cur_input_tokens) == len(cur_output_tokens)
            assert len(cur_output_tokens) == len(cur_nature_tokens)

            # skip sentences with malformed entity label annotations
            if '' in cur_output_tokens:
                continue

            # if-else: truncate long sequences, pad short ones
            if len(cur_input_tokens) < max_seq_len or len(cur_output_tokens) < max_seq_len or len(cur_nature_tokens) < max_seq_len:

                # append padding symbols until each sequence reaches max_seq_len
                while len(cur_input_tokens) < max_seq_len:
                    cur_input_tokens.append(PAD)
                    cur_output_tokens.append(NOT)
                    cur_nature_tokens.append(PAD_NATURE)
            else:
                cur_input_tokens = cur_input_tokens[0:max_seq_len]
                cur_output_tokens = cur_output_tokens[0:max_seq_len]
                cur_nature_tokens = cur_nature_tokens[0:max_seq_len]

            input_tokens.extend(cur_input_tokens)
            output_tokens.extend(cur_output_tokens)
            nature_tokens.extend(cur_nature_tokens)

            # record the padded/truncated sequences
            input_seqs.append(cur_input_tokens)
            output_seqs.append(cur_output_tokens)
            nature_seqs.append(cur_nature_tokens)

        # build the vocabularies
        word_vocab = Vocab(count_tokens(input_tokens), unknown_token=UNK, padding_token=PAD)
        label_vocab = Vocab(count_tokens(output_tokens), unknown_token=UNK, padding_token=NOT)
        nature_vocab = Vocab(count_tokens(nature_tokens), unknown_token=UNK, padding_token=PAD_NATURE)

    return word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs
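A hedged sketch of feeding the returned sequences into the vocabularies; the file paths and the symbol values passed to read_data are placeholders:

word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs = read_data(
    'data/words.txt', 'data/labels.txt', 'data/natures.txt',  # placeholder paths
    max_seq_len=50, PAD='<pad>', NOT='O', PAD_NATURE='<pad>', UNK='<unk>')

# map the padded/truncated token sequences to integer indices via each Vocab
word_ids = [word_vocab.to_indices(seq) for seq in input_seqs]
label_ids = [label_vocab.to_indices(seq) for seq in output_seqs]
nature_ids = [nature_vocab.to_indices(seq) for seq in nature_seqs]

print(len(word_ids[0]))  # every sequence has exactly max_seq_len entries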