def get_vocabulary(self):
    """Build a token vocabulary over the train, dev and test examples."""
    examples = (self.get_train_examples() + self.get_dev_examples()
                + self.get_test_examples())
    vocab = Vocabulary(start=1)  # id 0 is reserved (e.g. for padding)
    for e in examples:
        for t in e.tokens:
            vocab.add(t)
    return vocab
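The snippets in this section lean on a small `Vocabulary` helper from the surrounding library, whose implementation is not shown here. The following is a minimal sketch of the interface the snippets assume (a `start` offset for reserved ids, `add`, `add_vocab_offset`, `reverse_vocab`, a `vocab` property, and `len`); it is a reconstruction, not the library's actual code:

class Vocabulary:
    """Minimal sketch of the assumed interface; not the library's real class."""

    def __init__(self, start=0):
        self._start = start  # first id handed out; lower ids stay reserved (pad/unk)
        self._vocab = {}     # item -> integer id

    def add(self, item):
        """Return the id of item, assigning the next free id on first sight."""
        if item not in self._vocab:
            self._vocab[item] = self._start + len(self._vocab)
        return self._vocab[item]

    def add_vocab_offset(self, offset):
        """Shift all ids up by offset, reserving the low ids (e.g. for PAD/OOV)."""
        self._start += offset
        self._vocab = {k: v + offset for k, v in self._vocab.items()}

    def reverse_vocab(self):
        """Return an id -> item mapping (used when filling embedding matrices)."""
        return {v: k for k, v in self._vocab.items()}

    @property
    def vocab(self):
        """dict: the item -> id mapping"""
        return self._vocab

    def __len__(self):
        return len(self._vocab)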
def __init__(self, sentence_length=50, word_length=12):
    self.data_dict = {}
    self.vecs = {}
    self.sentence_len = sentence_length
    self.word_len = word_length
    # reserve low ids at construction time: 0=pad, 1=unk for tokens and chars, 0=pad for tags
    self._tokens_vocab = Vocabulary(2)
    self._chars_vocab = Vocabulary(2)
    self._tags_vocab = Vocabulary(1)
    self._intents_vocab = Vocabulary()
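To make the reserved-id layout concrete, a toy run against the `Vocabulary` sketch above (the tokens are hypothetical):

v = Vocabulary(2)      # ids 0 (pad) and 1 (unk) stay reserved
print(v.add('hello'))  # 2 -- first real token gets the first free id
print(v.add('world'))  # 3
print(v.add('hello'))  # 2 -- repeated tokens keep their id
print(len(v))          # 2 -- reserved ids are not counted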
def __init__(self, sentence_length, word_length=12, embedding_model=None,
             embedding_size=None):
    self.data_dict = {}
    self.vecs = {}
    self.sentence_len = sentence_length
    self.word_len = word_length
    self.embedding_model = embedding_model
    self.embedding_size = embedding_size
    # vocabularies start at 0 here; the pad/unk offsets are added later, in _load_data
    self._tokens_vocab = Vocabulary()
    self._chars_vocab = Vocabulary()
    self._tags_vocab = Vocabulary()
    self._intents_vocab = Vocabulary()
# Assumes: numpy as np, and pad_sequences from Keras (keras.preprocessing.sequence
# or tensorflow.keras.preprocessing.sequence, depending on the installed version).
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def __init__(self, train_file, test_file, max_sentence_length=30,
             max_word_length=20, tag_field_no=4):
    self.files = {'train': train_file, 'test': test_file}
    self.max_sent_len = max_sentence_length
    self.max_word_len = max_word_length
    self.tf = tag_field_no
    self.vocabs = {'token': Vocabulary(2),  # 0=pad, 1=unk
                   'char': Vocabulary(2),   # 0=pad, 1=unk
                   'tag': Vocabulary(1)}    # 0=pad
    self.data = {}
    for f in self.files:
        raw_sentences = self._read_file(self.files[f])
        word_vecs = []
        char_vecs = []
        tag_vecs = []
        for tokens, tags in raw_sentences:
            word_vecs.append(
                np.array([self.vocabs['token'].add(t) for t in tokens]))
            # per-token char ids, each word left-padded/clipped to max_word_len
            word_chars = []
            for t in tokens:
                word_chars.append(
                    np.array([self.vocabs['char'].add(c) for c in t]))
            word_chars = pad_sequences(word_chars, maxlen=self.max_word_len)
            # pad the sentence's char matrix with all-zero rows, or clip it,
            # so every sentence ends up with exactly max_sent_len rows
            if self.max_sent_len - len(word_chars) > 0:
                char_padding = self.max_sent_len - len(word_chars)
                char_vecs.append(
                    np.concatenate((np.zeros(
                        (char_padding, self.max_word_len)), word_chars),
                        axis=0))
            else:
                char_vecs.append(word_chars[-self.max_sent_len:])
            tag_vecs.append(
                np.array([self.vocabs['tag'].add(t) for t in tags]))
        word_vecs = pad_sequences(word_vecs, maxlen=self.max_sent_len)
        char_vecs = np.asarray(char_vecs)
        tag_vecs = pad_sequences(tag_vecs, maxlen=self.max_sent_len)
        self.data[f] = word_vecs, char_vecs, tag_vecs
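For illustration, a toy run of the char-padding step above on a hypothetical two-token sentence, assuming Keras' pad_sequences (which left-pads by default):

import numpy as np
from keras.preprocessing.sequence import pad_sequences

max_word_len, max_sent_len = 5, 4
word_chars = [np.array([1, 2, 3]), np.array([4, 5])]  # toy char ids for 2 tokens
word_chars = pad_sequences(word_chars, maxlen=max_word_len)
print(word_chars)
# [[0 0 1 2 3]
#  [0 0 0 4 5]]
char_padding = max_sent_len - len(word_chars)          # 2 missing token rows
char_mat = np.concatenate(
    (np.zeros((char_padding, max_word_len)), word_chars), axis=0)
print(char_mat.shape)  # (4, 5): one row per (padded) token position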
from typing import List

def get_vocabulary(self, examples: List[TokenClsInputExample] = None):
    """Build a token vocabulary from the given examples."""
    vocab = Vocabulary(start=1)  # id 0 is reserved (e.g. for padding)
    for e in examples or []:     # guard against the default None
        for t in e.tokens:
            vocab.add(t)
    return vocab
import numpy as np
from keras.preprocessing.sequence import pad_sequences  # or tensorflow.keras, depending on version
# Vocabulary, load_word_embeddings, fill_embedding_mat, one_hot and
# one_hot_sentence are helpers from the surrounding library.


class IntentDataset(object):
    """
    Intent extraction dataset base class

    Args:
        sentence_length (int): max sentence length
        word_length (int): max word length in characters
        embedding_model (str, optional): external embedding model path
        embedding_size (int): embedding vectors size
    """

    def __init__(self, sentence_length, word_length=12, embedding_model=None,
                 embedding_size=None):
        self.data_dict = {}
        self.vecs = {}
        self.sentence_len = sentence_length
        self.word_len = word_length
        self.embedding_model = embedding_model
        self.embedding_size = embedding_size
        self._tokens_vocab = Vocabulary()
        self._chars_vocab = Vocabulary()
        self._tags_vocab = Vocabulary()
        self._intents_vocab = Vocabulary()

    def _load_embedding(self, files):
        print('Loading external word embedding model ..')
        emb_vecs, _ = load_word_embeddings(self.embedding_model)
        for f in files:
            self.vecs[f][0] = fill_embedding_mat(self.vecs[f][0],
                                                 self._tokens_vocab.reverse_vocab(),
                                                 emb_vecs,
                                                 self.embedding_size)

    def _load_data(self, train_set, test_set):
        datasets = {'train': train_set, 'test': test_set}
        # vectorize; add an offset of 2 for PAD and OOV (1 for PAD in the tag vocab)
        self._tokens_vocab.add_vocab_offset(2)
        self._chars_vocab.add_vocab_offset(2)
        self._tags_vocab.add_vocab_offset(1)
        vec_data = {}
        for f in sorted(datasets.keys()):
            vec_data[f] = self._prepare_vectors(datasets[f])
        for f in sorted(datasets.keys()):
            tokens, words, intents, tags = vec_data[f]
            x = pad_sequences(tokens, maxlen=self.sentence_len)
            _w = []
            for s in words:
                _s = pad_sequences(s, maxlen=self.word_len)
                # clip/pad each sentence's char matrix to (sentence_len, word_len)
                sentence = np.asarray(_s)[-self.sentence_len:]
                if sentence.shape[0] < self.sentence_len:
                    sentence = np.vstack((np.zeros((self.sentence_len - sentence.shape[0],
                                                    self.word_len)),
                                          sentence))
                _w.append(sentence)
            w = np.asarray(_w)
            _y = pad_sequences(tags, maxlen=self.sentence_len)
            y = one_hot_sentence(_y, self.label_vocab_size)
            i = one_hot(intents, self.intent_size)
            self.vecs[f] = [x, w, i, y]

    def _prepare_vectors(self, dataset):
        tokens = []
        words = []
        tags = []
        intents = []
        for tok, tag, i in dataset:
            tokens.append(np.asarray([self._tokens_vocab.add(t) for t in tok]))
            words.append(np.asarray(self._extract_char_features(tok)))
            tags.append(np.asarray([self._tags_vocab.add(t) for t in tag]))
            intents.append(self._intents_vocab.add(i))
        return tokens, words, np.asarray(intents), tags

    def _extract_char_features(self, tokens):
        words = []
        for t in tokens:
            words.append(np.asarray([self._chars_vocab.add(c) for c in t]))
        return words

    @property
    def vocab_size(self):
        """int: vocabulary size"""
        return len(self._tokens_vocab) + 2

    @property
    def char_vocab_size(self):
        """int: char vocabulary size"""
        return len(self._chars_vocab) + 2

    @property
    def label_vocab_size(self):
        """int: label vocabulary size"""
        return len(self._tags_vocab) + 1

    @property
    def intent_size(self):
        """int: intent label vocabulary size"""
        return len(self._intents_vocab)

    @property
    def tokens_vocab(self):
        """dict: tokens vocabulary"""
        return self._tokens_vocab.vocab

    @property
    def labels_vocab(self):
        """dict: labels vocabulary"""
        return self._tags_vocab.vocab

    @property
    def intents_vocab(self):
        """dict: intent labels vocabulary"""
        return self._intents_vocab.vocab

    @property
    def train_set(self):
        """:obj:`tuple` of :obj:`numpy.ndarray`: train set"""
        return self.vecs['train']

    @property
    def test_set(self):
        """:obj:`tuple` of :obj:`numpy.ndarray`: test set"""
        return self.vecs['test']
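A hedged usage sketch of the class above: a concrete subclass would parse its corpus into (tokens, tags, intent) triples and hand them to _load_data. ToyIntentDataset, its in-memory corpus, and the printed shapes below are all hypothetical, and the sketch assumes the library helpers called by _load_data (Vocabulary, one_hot, one_hot_sentence) are importable:

class ToyIntentDataset(IntentDataset):
    """Hypothetical subclass feeding a tiny in-memory corpus."""

    def __init__(self, sentence_length=30):
        super().__init__(sentence_length=sentence_length)
        train = [(['book', 'a', 'flight'], ['O', 'O', 'B-obj'], 'book_flight')]
        test = [(['cancel', 'it'], ['O', 'O'], 'cancel')]
        self._load_data(train, test)


ds = ToyIntentDataset()
x, w, i, y = ds.train_set
print(x.shape, w.shape, i.shape, y.shape)
# expected under these assumptions:
# (1, 30) token ids, (1, 30, 12) char ids, (1, 2) intents, (1, 30, 3) tags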