def __create_xy_test(self, tag_file, embedding_file, data_size=1, look_back=5, suffix=None):
    """Build (x_test, y_test) timestep sequences for sequence-model evaluation.

    The corpus is cut into consecutive, non-overlapping windows of
    ``look_back`` tokens; each window becomes one row of word embeddings
    (x) and one row of one-hot tag vectors (y).

    Args:
        tag_file: path of the tagged corpus, read via DataUtils.load_corpus.
        embedding_file: path of the word-embedding file.
        data_size: fraction of the corpus to use, clamped to at most 1.
        look_back: number of tokens per timestep window.
        suffix: optional 2-element sequence forwarded to
            DataUtils.add_suffix_embeddings; None disables suffix handling.

    Returns:
        Tuple ``(x_test, y_test)`` of numpy arrays with one entry per
        window.  Out-of-vocabulary words fall back to the "UNK" embedding.
    """
    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))
    word_emb = DataUtils.load_embeddings(embedding_file)

    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words, tags = DataUtils.extract_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

    # Truncate to a whole number of windows so every timestep is complete.
    data_size = int(len(words) * min(data_size, 1))
    data_size -= data_size % look_back

    # Progress is reported ~10 times over the run.  max(..., 1) guards the
    # modulo against a zero step — the original raised ZeroDivisionError
    # whenever data_size < 10 * look_back.
    progress_step = max(int(data_size / (10 * look_back)), 1) * look_back

    # Accumulate in Python lists and convert once at the end; the original
    # np.append-per-window pattern reallocated the whole array each time (O(n^2)).
    x_test = []
    y_test = []

    for idx in range(0, data_size, look_back):
        x_timestep = []
        y_timestep = []
        for jdx in range(look_back):
            key = word_keys[idx + jdx]
            # Fall back to the shared "UNK" embedding for OOV words.
            x_timestep.append(word_emb[key] if key in word_emb else word_emb["UNK"])
            y_timestep.append(tag_emb[tags[idx + jdx]])
        x_test.append(np.array(x_timestep))
        y_test.append(np.array(y_timestep))

        if idx % progress_step == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    x_test = np.array(x_test)
    y_test = np.array(y_test)
    return x_test, y_test
def __create_xy(self, tag_file, embedding_file, data_size, window_size, available_tags, suffix):
    """Build (x, y) training pairs from a sliding window over the corpus.

    Each sample is the concatenation of the embeddings of ``window_size``
    consecutive words; its label is the one-hot vector of the tag of the
    window's center word.  Samples whose center tag is not listed in
    ``available_tags`` are skipped (an empty ``available_tags`` keeps all).

    Args:
        tag_file: path of the tagged corpus, read via DataUtils.load_corpus.
        embedding_file: path of the word-embedding file.
        data_size: fraction of the corpus to use.
        window_size: number of consecutive words per sample.
        available_tags: collection of tags to keep; empty means keep every tag.
        suffix: optional 2-element sequence forwarded to
            DataUtils.add_suffix_embeddings; None disables suffix handling.

    Returns:
        Tuple ``(x, y)`` of numpy arrays (always arrays, even for 0 or 1
        samples, matching the test-set builders in this file).
    """
    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))
    word_emb = DataUtils.load_embeddings(embedding_file)

    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words, tags = DataUtils.extract_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

    # Clamp so the last window stays inside the corpus.  The original did
    # not clamp and could raise IndexError on word_keys[idx + widx] /
    # tags[idx + window_size // 2]; the test-set builder already clamps
    # this way, so this also makes the two methods consistent.
    data_size = min(int(len(words) * data_size), len(words) - window_size)

    # max(..., 1) guards the progress modulo against ZeroDivisionError
    # for corpora with fewer than 10 usable positions.
    progress_step = max(int(data_size / 10), 1)

    # Accumulate in Python lists and convert once: the original used
    # np.append per sample (O(n^2)) and initialized on `idx == 0`, which
    # broke (ValueError on empty-list append) whenever the very first
    # sample was filtered out by available_tags.
    x = []
    y = []

    for idx in range(data_size):
        tag = tags[idx + int(window_size / 2)]
        if len(available_tags) == 0 or tag in available_tags:
            parts = []
            for widx in range(window_size):
                key = word_keys[idx + widx]
                # OOV words fall back to the shared "UNK" embedding.
                parts.append(word_emb[key] if key in word_emb else word_emb["UNK"])
            # axis=0 concatenation matches the original np.append(..., axis=0).
            x.append(np.concatenate(parts, axis=0))
            y.append(tag_emb[tag])

        if idx % progress_step == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    return np.array(x), np.array(y)
def __create_xy_test(self, tag_file, embedding_file, data_size, window_size, suffix):
    """Build (x_test, y_test) pairs from a sliding window over the corpus.

    Each sample is the concatenation of the embeddings of ``window_size``
    consecutive words; its label is the one-hot vector of the tag of the
    window's center word.

    Args:
        tag_file: path of the tagged corpus, read via DataUtils.load_corpus.
        embedding_file: path of the word-embedding file.
        data_size: fraction of the corpus to use.
        window_size: number of consecutive words per sample.
        suffix: optional 2-element sequence forwarded to
            DataUtils.add_suffix_embeddings; None disables suffix handling.

    Returns:
        Tuple ``(x_test, y_test)`` of numpy arrays, one entry per window.
    """
    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))
    word_emb = DataUtils.load_embeddings(embedding_file)

    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words, tags = DataUtils.extract_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

    # Clamp so the last window stays inside the corpus.
    data_size = min(int(len(words) * data_size), len(words) - window_size)

    # max(..., 1) guards the progress modulo against ZeroDivisionError
    # for corpora with fewer than 10 usable positions.
    progress_step = max(int(data_size / 10), 1)

    # Accumulate in Python lists and convert once at the end.  The original
    # called np.append per sample (O(n^2)) AND re-ran np.array(x_test) /
    # np.array(y_test) inside every loop iteration — pure rework that was
    # then repeated once more after the loop.
    x_test = []
    y_test = []

    for idx in range(data_size):
        parts = []
        for widx in range(window_size):
            key = word_keys[idx + widx]
            # OOV words fall back to the shared "UNK" embedding.
            parts.append(word_emb[key] if key in word_emb else word_emb["UNK"])
        # axis=0 concatenation matches the original np.append(..., axis=0).
        x_test.append(np.concatenate(parts, axis=0))
        y_test.append(tag_emb[tags[idx + int(window_size / 2)]])

        if idx % progress_step == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    return np.array(x_test), np.array(y_test)