Example 1
    def get_vocabulary(self):
        # Build a token vocabulary over all splits; ids start at 1, leaving 0 free
        # (used as the padding id elsewhere in these loaders).
        examples = self.get_train_examples() + self.get_dev_examples() + self.get_test_examples()
        vocab = Vocabulary(start=1)
        for e in examples:
            for t in e.tokens:
                vocab.add(t)
        return vocab
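All of these snippets share the same small Vocabulary helper (they appear to come from Intel's nlp-architect). The class below is a minimal sketch inferred only from how Vocabulary is used in these examples, not the library's actual implementation: the constructor argument reserves the low ids (e.g. 0=pad, 1=unk), add() returns a stable integer id per item, and add_vocab_offset(), reverse_vocab(), vocab and __len__ match the calls made in Examples 7 and 8.

class Vocabulary:
    # Hypothetical sketch of the helper assumed by these examples.
    def __init__(self, start=0):
        self.next_id = start  # ids below `start` stay reserved (e.g. 0=pad, 1=unk)
        self._vocab = {}      # item -> integer id

    def add(self, item):
        # Return the item's id, assigning the next free id on first sight.
        if item not in self._vocab:
            self._vocab[item] = self.next_id
            self.next_id += 1
        return self._vocab[item]

    def add_vocab_offset(self, offset):
        # Shift all ids up by `offset` to reserve extra low ids (see _load_data below).
        self._vocab = {k: v + offset for k, v in self._vocab.items()}
        self.next_id += offset

    def reverse_vocab(self):
        # id -> item mapping, used when filling an external embedding matrix.
        return {v: k for k, v in self._vocab.items()}

    @property
    def vocab(self):
        return self._vocab

    def __len__(self):
        return len(self._vocab)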
Example 2
    def __init__(self, sentence_length=50, word_length=12):
        self.data_dict = {}
        self.vecs = {}
        self.sentence_len = sentence_length
        self.word_len = word_length

        self._tokens_vocab = Vocabulary(2)  # ids 0=pad, 1=unk reserved
        self._chars_vocab = Vocabulary(2)  # ids 0=pad, 1=unk reserved
        self._tags_vocab = Vocabulary(1)  # id 0=pad reserved
        self._intents_vocab = Vocabulary()
Example 3
    def __init__(self, sentence_length, word_length=12, embedding_model=None,
                 embedding_size=None):
        self.data_dict = {}
        self.vecs = {}
        self.sentence_len = sentence_length
        self.word_len = word_length
        self.embedding_model = embedding_model
        self.embedding_size = embedding_size

        self._tokens_vocab = Vocabulary()
        self._chars_vocab = Vocabulary()
        self._tags_vocab = Vocabulary()
        self._intents_vocab = Vocabulary()
Example 4
    def __init__(self,
                 train_file,
                 test_file,
                 max_sentence_length=30,
                 max_word_length=20,
                 tag_field_no=4):
        self.files = {'train': train_file, 'test': test_file}
        self.max_sent_len = max_sentence_length
        self.max_word_len = max_word_length
        self.tf = tag_field_no  # column index of the tag field in the input file

        self.vocabs = {
            'token': Vocabulary(2),  # 0=pad, 1=unk
            'char': Vocabulary(2),  # 0=pad, 1=unk
            'tag': Vocabulary(1)  # 0=pad
        }

        self.data = {}
        for f in self.files:
            raw_sentences = self._read_file(self.files[f])
            word_vecs = []
            char_vecs = []
            tag_vecs = []
            for tokens, tags in raw_sentences:
                word_vecs.append(
                    np.array([self.vocabs['token'].add(t) for t in tokens]))
                word_chars = []
                for t in tokens:
                    word_chars.append(
                        np.array([self.vocabs['char'].add(c) for c in t]))
                word_chars = pad_sequences(word_chars,
                                           maxlen=self.max_word_len)
                if len(tokens) < self.max_sent_len:
                    # pre-pad the sentence's char matrix with all-zero rows
                    char_padding = self.max_sent_len - len(word_chars)
                    char_vecs.append(
                        np.concatenate((np.zeros(
                            (char_padding, self.max_word_len)), word_chars),
                                       axis=0))
                else:
                    char_vecs.append(word_chars[-self.max_sent_len:])
                tag_vecs.append(
                    np.array([self.vocabs['tag'].add(t) for t in tags]))
            word_vecs = pad_sequences(word_vecs, maxlen=self.max_sent_len)
            char_vecs = np.asarray(char_vecs)
            tag_vecs = pad_sequences(tag_vecs, maxlen=self.max_sent_len)
            self.data[f] = word_vecs, char_vecs, tag_vecs
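pad_sequences here looks like the Keras helper, which by default pads and truncates on the left, exactly mirroring the hand-rolled char padding above (zero rows prepended, overly long sentences keep their last max_sent_len items). Assuming the TensorFlow/Keras helper, a quick check of that behaviour:

from tensorflow.keras.preprocessing.sequence import pad_sequences

seqs = [[4, 7], [3, 5, 9, 2, 8]]
print(pad_sequences(seqs, maxlen=4))
# [[0 0 4 7]   <- short sequence is left-padded with the 0 (pad) id
#  [5 9 2 8]]  <- long sequence keeps only its last maxlen items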
Example 5
    def __init__(self, sentence_length, word_length=12, embedding_model=None,
                 embedding_size=None):
        self.data_dict = {}
        self.vecs = {}
        self.sentence_len = sentence_length
        self.word_len = word_length
        self.embedding_model = embedding_model
        self.embedding_size = embedding_size

        self._tokens_vocab = Vocabulary()
        self._chars_vocab = Vocabulary()
        self._tags_vocab = Vocabulary()
        self._intents_vocab = Vocabulary()
Example 6
    def get_vocabulary(self, examples: List[TokenClsInputExample] = None):
        # Build a token vocabulary from the given examples; ids start at 1.
        vocab = Vocabulary(start=1)
        for e in examples:
            for t in e.tokens:
                vocab.add(t)
        return vocab
Example 7
class IntentDataset(object):
    """
    Intent extraction dataset base class

    Args:
        sentence_length (int): max sentence length
        word_length (int, optional): max word length
        embedding_model (str, optional): external embedding model path
        embedding_size (int, optional): embedding vector size
    """

    def __init__(self, sentence_length, word_length=12, embedding_model=None,
                 embedding_size=None):
        self.data_dict = {}
        self.vecs = {}
        self.sentence_len = sentence_length
        self.word_len = word_length
        self.embedding_model = embedding_model
        self.embedding_size = embedding_size

        self._tokens_vocab = Vocabulary()
        self._chars_vocab = Vocabulary()
        self._tags_vocab = Vocabulary()
        self._intents_vocab = Vocabulary()

    def _load_embedding(self, files):
        print('Loading external word embedding model...')
        emb_vecs, _ = load_word_embeddings(self.embedding_model)
        for f in files:
            self.vecs[f][0] = fill_embedding_mat(self.vecs[f][0],
                                                 self._tokens_vocab.reverse_vocab(),
                                                 emb_vecs,
                                                 self.embedding_size)

    def _load_data(self, train_set, test_set):
        datasets = {'train': train_set, 'test': test_set}
        # vectorize
        # add offset of 2 for PAD and OOV
        self._tokens_vocab.add_vocab_offset(2)
        self._chars_vocab.add_vocab_offset(2)
        self._tags_vocab.add_vocab_offset(1)
        vec_data = {}
        for f in sorted(datasets.keys()):
            vec_data[f] = self._prepare_vectors(datasets[f])
        for f in sorted(datasets.keys()):
            tokens, words, intents, tags = vec_data[f]
            x = pad_sequences(tokens, maxlen=self.sentence_len)
            _w = []
            for s in words:
                _s = pad_sequences(s, maxlen=self.word_len)
                sentence = np.asarray(_s)[-self.sentence_len:]
                if sentence.shape[0] < self.sentence_len:
                    sentence = np.vstack((np.zeros((self.sentence_len - sentence.shape[0],
                                                    self.word_len)), sentence))
                _w.append(sentence)
            w = np.asarray(_w)
            _y = pad_sequences(tags, maxlen=self.sentence_len)
            y = one_hot_sentence(_y, self.label_vocab_size)
            i = one_hot(intents, self.intent_size)
            self.vecs[f] = [x, w, i, y]

    def _prepare_vectors(self, dataset):
        tokens = []
        words = []
        tags = []
        intents = []
        for tok, tag, i in dataset:
            tokens.append(np.asarray([self._tokens_vocab.add(t) for t in tok]))
            words.append(np.asarray(self._extract_char_features(tok)))
            tags.append(np.asarray([self._tags_vocab.add(t) for t in tag]))
            intents.append(self._intents_vocab.add(i))
        return tokens, words, np.asarray(intents), tags

    def _extract_char_features(self, tokens):
        words = []
        for t in tokens:
            words.append(np.asarray([self._chars_vocab.add(c) for c in t]))
        return words

    @property
    def vocab_size(self):
        """int: vocabulary size"""
        return len(self._tokens_vocab) + 2

    @property
    def char_vocab_size(self):
        """int: char vocabulary size"""
        return len(self._chars_vocab) + 2

    @property
    def label_vocab_size(self):
        """int: label vocabulary size"""
        return len(self._tags_vocab) + 1

    @property
    def intent_size(self):
        """int: intent label vocabulary size"""
        return len(self._intents_vocab)

    @property
    def tokens_vocab(self):
        """dict: tokens vocabulary"""
        return self._tokens_vocab.vocab

    @property
    def labels_vocab(self):
        """dict: labels vocabulary"""
        return self._tags_vocab.vocab

    @property
    def intents_vocab(self):
        """dict: intent labels vocabulary"""
        return self._intents_vocab.vocab

    @property
    def train_set(self):
        """:obj:`tuple` of :obj:`numpy.ndarray`: train set"""
        return self.vecs['train']

    @property
    def test_set(self):
        """:obj:`tuple` of :obj:`numpy.ndarray`: test set"""
        return self.vecs['test']
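A concrete dataset built on this base class only has to produce (tokens, tags, intent) triples, the shape _prepare_vectors iterates over, and hand them to _load_data. A hypothetical minimal subclass, with illustrative names and toy data (it also assumes the helpers used by the real module, e.g. pad_sequences, one_hot and one_hot_sentence, are importable):

class ToyIntentDataset(IntentDataset):
    # Hypothetical subclass for illustration; a real one would parse its files.
    def __init__(self, train_samples, test_samples, sentence_length=30):
        super().__init__(sentence_length=sentence_length)
        self._load_data(train_samples, test_samples)

train = [(['book', 'a', 'flight'], ['O', 'O', 'B-obj'], 'book_flight')]
test = [(['play', 'jazz'], ['O', 'B-obj'], 'play_music')]
ds = ToyIntentDataset(train, test)
x, w, i, y = ds.train_set  # padded token ids, char matrices, one-hot intents, one-hot tags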
Example 8
class IntentDataset(object):
    """
    Intent extraction dataset base class

    Args:
        sentence_length (int): max sentence length
        word_length (int, optional): max word length
        embedding_model (str, optional): external embedding model path
        embedding_size (int, optional): embedding vector size
    """

    def __init__(self, sentence_length, word_length=12, embedding_model=None,
                 embedding_size=None):
        self.data_dict = {}
        self.vecs = {}
        self.sentence_len = sentence_length
        self.word_len = word_length
        self.embedding_model = embedding_model
        self.embedding_size = embedding_size

        self._tokens_vocab = Vocabulary()
        self._chars_vocab = Vocabulary()
        self._tags_vocab = Vocabulary()
        self._intents_vocab = Vocabulary()

    def _load_embedding(self, files):
        print('Loading external word embedding model...')
        emb_vecs, _ = load_word_embeddings(self.embedding_model)
        for f in files:
            self.vecs[f][0] = fill_embedding_mat(self.vecs[f][0],
                                                 self._tokens_vocab.reverse_vocab(),
                                                 emb_vecs,
                                                 self.embedding_size)

    def _load_data(self, train_set, test_set):
        datasets = {'train': train_set, 'test': test_set}
        # vectorize
        # add offset of 2 for PAD and OOV
        self._tokens_vocab.add_vocab_offset(2)
        self._chars_vocab.add_vocab_offset(2)
        self._tags_vocab.add_vocab_offset(1)
        vec_data = {}
        for f in datasets.keys():
            vec_data[f] = self._prepare_vectors(datasets[f])
        for f in datasets.keys():
            tokens, words, intents, tags = vec_data[f]
            x = pad_sequences(tokens, maxlen=self.sentence_len)
            _w = []
            for s in words:
                _s = pad_sequences(s, maxlen=self.word_len)
                sentence = np.asarray(_s)[-self.sentence_len:]
                if sentence.shape[0] < self.sentence_len:
                    sentence = np.vstack((np.zeros((self.sentence_len - sentence.shape[0],
                                                    self.word_len)), sentence))
                _w.append(sentence)
            w = np.asarray(_w)
            _y = pad_sequences(tags, maxlen=self.sentence_len)
            y = one_hot_sentence(_y, self.label_vocab_size)
            i = one_hot(intents, self.intent_size)
            self.vecs[f] = [x, w, i, y]

    def _prepare_vectors(self, dataset):
        tokens = []
        words = []
        tags = []
        intents = []
        for tok, tag, i in dataset:
            tokens.append(np.asarray([self._tokens_vocab.add(t) for t in tok]))
            words.append(np.asarray(self._extract_char_features(tok)))
            tags.append(np.asarray([self._tags_vocab.add(t) for t in tag]))
            intents.append(self._intents_vocab.add(i))
        return tokens, words, np.asarray(intents), tags

    def _extract_char_features(self, tokens):
        words = []
        for t in tokens:
            words.append(np.asarray([self._chars_vocab.add(c) for c in t]))
        return words

    @property
    def vocab_size(self):
        """int: vocabulary size"""
        return len(self._tokens_vocab) + 2

    @property
    def char_vocab_size(self):
        """int: char vocabulary size"""
        return len(self._chars_vocab) + 2

    @property
    def label_vocab_size(self):
        """int: label vocabulary size"""
        return len(self._tags_vocab) + 1

    @property
    def intent_size(self):
        """int: intent label vocabulary size"""
        return len(self._intents_vocab)

    @property
    def tokens_vocab(self):
        """dict: tokens vocabulary"""
        return self._tokens_vocab.vocab

    @property
    def labels_vocab(self):
        """dict: labels vocabulary"""
        return self._tags_vocab.vocab

    @property
    def intents_vocab(self):
        """dict: intent labels vocabulary"""
        return self._intents_vocab.vocab

    @property
    def train_set(self):
        """:obj:`tuple` of :obj:`numpy.ndarray`: train set"""
        return self.vecs['train']

    @property
    def test_set(self):
        """:obj:`tuple` of :obj:`numpy.ndarray`: test set"""
        return self.vecs['test']