def __init__(
    self,
    train_file,
    test_file,
    max_sentence_length=30,
    max_word_length=20,
    tag_field_no=2,
):
    """Read the train/test files and build padded token/char/tag id matrices.

    Args:
        train_file: path to the training file (parsed by ``self._read_file``).
        test_file: path to the test file.
        max_sentence_length: sentences are truncated/padded to this many tokens.
        max_word_length: words are truncated/padded to this many characters.
        tag_field_no: index of the tag field in each input record.
    """
    self.files = {"train": train_file, "test": test_file}
    self.max_sent_len = max_sentence_length
    self.max_word_len = max_word_length
    self.tf = tag_field_no
    self.vocabs = {"token": None, "char": None, "tag": None}
    self.data = {}

    sentences = self._read_file(self.files["train"])
    train_size = len(sentences)
    sentences += self._read_file(self.files["test"])
    texts, tags = list(zip(*sentences))

    # token/char vocab ids start at 2 (0 = pad, 1 = unk);
    # tag vocab ids start at 1 (0 = pad).
    texts_mat, self.vocabs["token"] = word_vector_generator(texts, lower=True, start=2)
    tags_mat, self.vocabs["tag"] = word_vector_generator(tags, start=1)
    chars_mat, self.vocabs["char"] = character_vector_generator(texts, start=2)

    texts_mat = pad_sentences(texts_mat, max_length=self.max_sent_len)
    tags_mat = pad_sentences(tags_mat, max_length=self.max_sent_len)

    # Pad each word's char ids to max_word_len, then left-align every
    # sentence's char matrix inside a fixed
    # (num_sentences, max_sent_len, max_word_len) int32 tensor.
    chars_mat = [pad_sentences(d, max_length=self.max_word_len) for d in chars_mat]
    zeros = np.zeros((len(chars_mat), self.max_sent_len, self.max_word_len))
    for idx, d in enumerate(chars_mat):
        d = d[: self.max_sent_len]
        zeros[idx, : d.shape[0]] = d
    chars_mat = zeros.astype(dtype=np.int32)

    # Slice the test split with [train_size:] rather than [-test_size:]:
    # when the test file is empty, test_size == 0 and x[-0:] would wrongly
    # return the ENTIRE matrix instead of an empty one.
    self.data["train"] = (
        texts_mat[:train_size],
        chars_mat[:train_size],
        tags_mat[:train_size],
    )
    self.data["test"] = (
        texts_mat[train_size:],
        chars_mat[train_size:],
        tags_mat[train_size:],
    )
def _gen_data(self):
    """Vectorize the loaded train/test sets into padded id matrices.

    Populates ``self.vocabs`` (word/pos/chunk, plus char when
    ``self.use_chars``) and stores (sentence, pos, chunk[, char]) vector
    tuples in ``self._data_dict['train'/'test']``.
    """
    train, test = self._load_data()
    train_size = len(train)

    sentences = self._extract(train, test, 0)
    pos_tags = self._extract(train, test, 1)
    chunk_tags = self._extract(train, test, 2)

    # word vocab ids start at 2 (0 = pad, 1 = unk);
    # pos/chunk tag vocab ids start at 1 (0 = pad).
    sentence_vecs, word_vocab = word_vector_generator(sentences, self.lower, 2)
    pos_vecs, pos_vocab = word_vector_generator(pos_tags, start=1)
    chunk_vecs, chunk_vocab = word_vector_generator(chunk_tags, start=1)
    self.vocabs = {
        "word": word_vocab,  # 0=pad, 1=unk
        "pos": pos_vocab,  # 0=pad
        "chunk": chunk_vocab,  # 0=pad
    }

    if self.sentence_length is not None:
        sentence_vecs = pad_sentences(sentence_vecs, max_length=self.sentence_length)
        chunk_vecs = pad_sentences(chunk_vecs, max_length=self.sentence_length)
        pos_vecs = pad_sentences(pos_vecs, max_length=self.sentence_length)

    # Slice the test split with [train_size:] rather than [-test_size:]:
    # with an empty test set, x[-0:] would wrongly alias the whole matrix.
    self._data_dict["train"] = (
        sentence_vecs[:train_size],
        pos_vecs[:train_size],
        chunk_vecs[:train_size],
    )
    self._data_dict["test"] = (
        sentence_vecs[train_size:],
        pos_vecs[train_size:],
        chunk_vecs[train_size:],
    )

    if self.use_chars:
        chars_vecs, char_vocab = character_vector_generator(sentences, start=2)
        self.vocabs.update({"char": char_vocab})  # 0=pad, 1=unk
        if self.max_word_length is not None:
            chars_vecs = [
                pad_sentences(d, max_length=self.max_word_length) for d in chars_vecs
            ]
            # NOTE(review): this branch needs self.sentence_length to be set;
            # np.zeros with a None dimension would raise — confirm callers
            # always provide it when use_chars is True.
            zeros = np.zeros(
                (len(chars_vecs), self.sentence_length, self.max_word_length)
            )
            for idx, d in enumerate(chars_vecs):
                d = d[: self.sentence_length]
                # Right-aligned along the sentence axis (unlike the
                # left-aligned word/tag padding above) — preserved as-is.
                zeros[idx, -d.shape[0]:] = d
            chars_vecs = zeros.astype(dtype=np.int32)
        self._data_dict["train"] += (chars_vecs[:train_size],)
        self._data_dict["test"] += (chars_vecs[train_size:],)
def _load_data(self, train_set, test_set):
    """Vectorize intent-tagging samples into padded id matrices.

    Each sample is a (tokens, tags, intent) triple. Token, char, intent
    and tag vectors for the two splits are stored in
    ``self.vecs['train'/'test']``; the fitted vocabularies are kept on
    ``self._tokens_vocab`` / ``self._tags_vocab`` / ``self._chars_vocab``
    / ``self._intents_vocab``.

    Args:
        train_set: list of (tokens, tags, intent) training samples.
        test_set: list of (tokens, tags, intent) test samples.
    """
    train_size = len(train_set)
    texts, tags, intents = list(zip(*train_set + test_set))

    # NOTE(review): vocab ids start at 1, reserving only id 0 (pad); the
    # original comment claimed an offset of 2 for PAD and OOV — confirm
    # which convention is intended before relying on id 1 as "unk".
    text_vectors, self._tokens_vocab = word_vector_generator(texts, lower=True, start=1)
    tag_vectors, self._tags_vocab = word_vector_generator(tags, lower=False, start=1)
    chars_vectors, self._chars_vocab = character_vector_generator(texts, start=1)

    # One intent label per sample: vectorize as a single "sentence" of
    # intents, then flatten to a 1-D array.
    i, self._intents_vocab = word_vector_generator([intents])
    i = np.asarray(i[0])

    text_vectors = pad_sentences(text_vectors, max_length=self.sentence_len)
    tag_vectors = pad_sentences(tag_vectors, max_length=self.sentence_len)

    # Pad each word's char ids to word_len, then left-align every
    # sentence's char matrix in a fixed
    # (num_samples, sentence_len, word_len) int32 tensor.
    chars_vectors = [
        pad_sentences(d, max_length=self.word_len) for d in chars_vectors
    ]
    zeros = np.zeros((len(chars_vectors), self.sentence_len, self.word_len))
    for idx, d in enumerate(chars_vectors):
        d = d[: self.sentence_len]
        zeros[idx, : d.shape[0]] = d
    chars_vectors = zeros.astype(dtype=np.int32)

    # Slice the test split with [train_size:] rather than [-test_size:]:
    # when test_set is empty, x[-0:] would wrongly return every sample.
    self.vecs["train"] = [
        text_vectors[:train_size],
        chars_vectors[:train_size],
        i[:train_size],
        tag_vectors[:train_size],
    ]
    self.vecs["test"] = [
        text_vectors[train_size:],
        chars_vectors[train_size:],
        i[train_size:],
        tag_vectors[train_size:],
    ]