Example #1
    def __init__(
        self, train_file, test_file, max_sentence_length=30, max_word_length=20, tag_field_no=2
    ):
        """Read train/test tagging data and build padded token/char/tag matrices.

        Args:
            train_file: path to the training data file.
            test_file: path to the test data file.
            max_sentence_length: sentences are padded/truncated to this many tokens.
            max_word_length: words are padded/truncated to this many characters.
            tag_field_no: index of the tag field in the input files.
        """
        self.files = {"train": train_file, "test": test_file}
        self.max_sent_len = max_sentence_length
        self.max_word_len = max_word_length
        self.tf = tag_field_no

        # token vocab: 0=pad, 1=unk; char vocab: 0=pad, 1=unk; tag vocab: 0=pad
        self.vocabs = {"token": None, "char": None, "tag": None}

        self.data = {}

        # Read both splits into one list so the vocabularies are shared,
        # then split back by size below.
        sentences = self._read_file(self.files["train"])
        train_size = len(sentences)
        sentences += self._read_file(self.files["test"])
        test_size = len(sentences) - train_size
        texts, tags = list(zip(*sentences))

        # start=2 leaves ids 0 (pad) and 1 (unk) free; tags only reserve 0 (pad).
        texts_mat, self.vocabs["token"] = word_vector_generator(texts, lower=True, start=2)
        tags_mat, self.vocabs["tag"] = word_vector_generator(tags, start=1)
        chars_mat, self.vocabs["char"] = character_vector_generator(texts, start=2)

        texts_mat = pad_sentences(texts_mat, max_length=self.max_sent_len)
        tags_mat = pad_sentences(tags_mat, max_length=self.max_sent_len)

        # Pad each word to max_word_len, then left-align each sentence's char
        # rows into a fixed (n_sentences, max_sent_len, max_word_len) tensor.
        chars_mat = [pad_sentences(d, max_length=self.max_word_len) for d in chars_mat]
        zeros = np.zeros((len(chars_mat), self.max_sent_len, self.max_word_len))
        for idx, d in enumerate(chars_mat):
            d = d[: self.max_sent_len]
            zeros[idx, : d.shape[0]] = d
        chars_mat = zeros.astype(dtype=np.int32)

        # Slice with train_size (not -test_size): when the test file is empty,
        # [-0:] would return the WHOLE dataset instead of an empty slice.
        self.data["train"] = texts_mat[:train_size], chars_mat[:train_size], tags_mat[:train_size]
        self.data["test"] = texts_mat[train_size:], chars_mat[train_size:], tags_mat[train_size:]
 def _gen_data(self):
     """Vectorize the loaded train/test sets into padded word/POS/chunk
     (and optionally character) matrices stored in ``self._data_dict``.
     """
     train, test = self._load_data()
     train_size = len(train)
     test_size = len(test)
     # Per the target names, field 0 holds tokens, 1 POS tags, 2 chunk tags.
     sentences = self._extract(train, test, 0)
     pos_tags = self._extract(train, test, 1)
     chunk_tags = self._extract(train, test, 2)
     # start=2 reserves ids 0 (pad) and 1 (unk) for words; tag vocabularies
     # only reserve id 0 (pad).
     sentence_vecs, word_vocab = word_vector_generator(
         sentences, self.lower, 2)
     pos_vecs, pos_vocab = word_vector_generator(pos_tags, start=1)
     chunk_vecs, chunk_vocab = word_vector_generator(chunk_tags, start=1)
     self.vocabs = {
         "word": word_vocab,  # 0=pad, 1=unk
         "pos": pos_vocab,  # 0=pad, 1=unk
         "chunk": chunk_vocab,
     }  # 0=pad
     if self.sentence_length is not None:
         sentence_vecs = pad_sentences(sentence_vecs,
                                       max_length=self.sentence_length)
         chunk_vecs = pad_sentences(chunk_vecs,
                                    max_length=self.sentence_length)
         pos_vecs = pad_sentences(pos_vecs, max_length=self.sentence_length)
     # NOTE(review): with an empty test set, [-test_size:] == [-0:] selects
     # the entire dataset rather than an empty slice -- confirm callers never
     # pass an empty test split.
     self._data_dict["train"] = (
         sentence_vecs[:train_size],
         pos_vecs[:train_size],
         chunk_vecs[:train_size],
     )
     self._data_dict["test"] = (
         sentence_vecs[-test_size:],
         pos_vecs[-test_size:],
         chunk_vecs[-test_size:],
     )
     if self.use_chars:
         chars_vecs, char_vocab = character_vector_generator(sentences,
                                                             start=2)
         self.vocabs.update({"char": char_vocab})  # 0=pad, 1=unk
         if self.max_word_length is not None:
             # Pad each word to max_word_length, then copy each sentence's
             # char rows into a fixed-size tensor.
             chars_vecs = [
                 pad_sentences(d, max_length=self.max_word_length)
                 for d in chars_vecs
             ]
             # NOTE(review): this assumes self.sentence_length is not None
             # when use_chars and max_word_length are set -- np.zeros would
             # fail otherwise; the guard above only checks max_word_length.
             zeros = np.zeros((len(chars_vecs), self.sentence_length,
                               self.max_word_length))
             for idx, d in enumerate(chars_vecs):
                 d = d[:self.sentence_length]
                 # NOTE(review): -d.shape[0]: RIGHT-aligns sentences (pads at
                 # the front), unlike the left-aligned :d.shape[0] used by the
                 # sibling loaders -- confirm this is intentional.
                 zeros[idx, -d.shape[0]:] = d
             chars_vecs = zeros.astype(dtype=np.int32)
         self._data_dict["train"] += (chars_vecs[:train_size], )
         self._data_dict["test"] += (chars_vecs[-test_size:], )
    def _load_data(self, train_set, test_set):
        """Vectorize intent/slot data and store split matrices in ``self.vecs``.

        Builds token/tag/char/intent vocabularies over train+test jointly so
        both splits share the same ids, pads to fixed sentence/word lengths,
        and fills ``self.vecs["train"]`` / ``self.vecs["test"]`` with
        ``[texts, chars, intents, tags]`` lists.

        Args:
            train_set: sequence of (tokens, tags, intent) training examples.
            test_set: sequence of (tokens, tags, intent) test examples.
        """
        train_size = len(train_set)
        test_size = len(test_set)
        # NOTE(review): start=1 reserves only id 0, but the original comment
        # claimed an "offset of 2 for PAD and OOV" -- confirm whether pad and
        # unk are really meant to share id 0 here.
        texts, tags, intents = list(zip(*(list(train_set) + list(test_set))))
        text_vectors, self._tokens_vocab = word_vector_generator(texts,
                                                                 lower=True,
                                                                 start=1)
        tag_vectors, self._tags_vocab = word_vector_generator(tags,
                                                              lower=False,
                                                              start=1)
        chars_vectors, self._chars_vocab = character_vector_generator(texts,
                                                                      start=1)
        # Intents are one label per example; wrap the column in a list so the
        # generator treats it as a single "sentence", then unwrap row 0.
        intent_vectors, self._intents_vocab = word_vector_generator([intents])
        intent_vectors = np.asarray(intent_vectors[0])

        text_vectors = pad_sentences(text_vectors,
                                     max_length=self.sentence_len)
        tag_vectors = pad_sentences(tag_vectors, max_length=self.sentence_len)
        # Pad each word to word_len, then left-align each sentence's char rows
        # into a fixed (n_examples, sentence_len, word_len) tensor.
        chars_vectors = [
            pad_sentences(d, max_length=self.word_len) for d in chars_vectors
        ]
        zeros = np.zeros(
            (len(chars_vectors), self.sentence_len, self.word_len))
        for idx, d in enumerate(chars_vectors):
            d = d[:self.sentence_len]
            zeros[idx, :d.shape[0]] = d
        chars_vectors = zeros.astype(dtype=np.int32)

        # Slice with train_size (not -test_size): when the test set is empty,
        # [-0:] would return the WHOLE dataset instead of an empty slice.
        self.vecs["train"] = [
            text_vectors[:train_size],
            chars_vectors[:train_size],
            intent_vectors[:train_size],
            tag_vectors[:train_size],
        ]
        self.vecs["test"] = [
            text_vectors[train_size:],
            chars_vectors[train_size:],
            intent_vectors[train_size:],
            tag_vectors[train_size:],
        ]