# NOTE: these methods assume "import numpy as np" and the DataUtils helper
# module are available at module scope.

def __create_xy_train(self, tag_file, embedding_file, data_size=1,
                      look_back=5, threshold=0, suffix=None):
    x_train = []
    y_train = []

    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words = DataUtils.extract_word_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
    tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

    # Scale the corpus by data_size (a fraction in (0, 1]) and truncate to a
    # multiple of look_back so every sample is a full window.
    data_size = int(len(words) * min(data_size, 1))
    data_size -= data_size % look_back

    for idx in np.arange(0, data_size, look_back):
        # Candidate tags and embedding of the window's first word, falling
        # back to the "UNK" vector for out-of-vocabulary words.
        dict_tag_inputs = [tag_dict[words[idx]]]
        word_inputs = [word_emb[word_keys[idx]]] if word_keys[idx] in word_emb else [word_emb["UNK"]]

        # Extend the window with the remaining look_back - 1 words.
        for widx in range(1, look_back):
            word_inputs = np.append(
                word_inputs,
                [word_emb[word_keys[idx + widx]]] if word_keys[idx + widx] in word_emb else [word_emb["UNK"]],
                axis=0)
            dict_tag_inputs.append(tag_dict[words[idx + widx]])

        # Expand the per-word candidate tags into every possible tag sequence
        # for the window; each combination becomes one training sample.
        dict_tag_inputs = DataUtils.cartesian(np.array(dict_tag_inputs))

        for jdx in range(len(dict_tag_inputs)):
            tag_inputs = [tag_emb[tag] for tag in dict_tag_inputs[jdx]]

            if idx == 0 and jdx == 0:
                x_train = [word_inputs]
                y_train = [tag_inputs]
            else:
                x_train = np.append(x_train, [word_inputs], axis=0)
                y_train = np.append(y_train, [tag_inputs], axis=0)

        # Report progress roughly every 10%; guard against a zero divisor
        # on very small corpora.
        if idx % max(int(data_size / (10 * look_back)), 1) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    return x_train, y_train
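# Hedged sketch (not part of the original class): DataUtils.cartesian is
# expected to expand the per-word candidate tag lists into every tag sequence
# for the window, i.e. an itertools-style cartesian product. The names and
# toy tags below are illustrative only.
import itertools

def _cartesian_demo():
    # Two candidates for word 0, one for word 1, two for word 2.
    candidates = [["NOUN", "VERB"], ["DET"], ["ADJ", "NOUN"]]
    combos = [list(c) for c in itertools.product(*candidates)]
    assert len(combos) == 2 * 1 * 2  # one training sample per combination
    return combos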
def __create_xy_test(self, tag_file, embedding_file, data_size=1,
                     look_back=5, suffix=None):
    x_test = []
    y_test = []

    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words, tags = DataUtils.extract_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

    # Scale by data_size and truncate to a multiple of look_back.
    data_size = int(len(words) * min(data_size, 1))
    data_size -= data_size % look_back

    for idx in np.arange(0, data_size, look_back):
        x_timestep = []
        y_timestep = []

        # Build one window of look_back (word vector, one-hot tag) pairs,
        # falling back to the "UNK" vector for out-of-vocabulary words.
        for jdx in range(look_back):
            word_input = word_emb[word_keys[idx + jdx]] if word_keys[idx + jdx] in word_emb else word_emb["UNK"]
            tag_input = tag_emb[tags[idx + jdx]]

            if jdx == 0:
                x_timestep = [word_input]
                y_timestep = [tag_input]
            else:
                x_timestep = np.append(x_timestep, [word_input], axis=0)
                y_timestep = np.append(y_timestep, [tag_input], axis=0)

        x_timestep = np.array(x_timestep)
        y_timestep = np.array(y_timestep)

        if idx == 0:
            x_test = [x_timestep]
            y_test = [y_timestep]
        else:
            x_test = np.append(x_test, [x_timestep], axis=0)
            y_test = np.append(y_test, [y_timestep], axis=0)

        # Report progress roughly every 10%; guard against a zero divisor.
        if idx % max(int(data_size / (10 * look_back)), 1) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    x_test = np.array(x_test)
    y_test = np.array(y_test)

    return x_test, y_test
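# Hedged sketch (illustrative, not from the original code): the test loop above
# steps by look_back, so it partitions the token stream into non-overlapping
# windows. For fixed-size embeddings (ignoring the UNK lookups) this amounts to
# a single reshape; toy dimensions below are assumptions.
import numpy as np

def _windowing_demo(look_back=5, dim=8, n_tokens=23):
    seq = np.random.rand(n_tokens, dim)                 # embedded token stream
    usable = n_tokens - n_tokens % look_back            # truncate, as above
    windows = seq[:usable].reshape(-1, look_back, dim)  # (n_windows, look_back, dim)
    assert windows.shape == (usable // look_back, look_back, dim)
    return windows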
def __create_xy(self, tag_file, embedding_file, data_size, window_size,
                available_tags, suffix):
    x = []
    y = []

    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words, tags = DataUtils.extract_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)

    # Scale by data_size, leaving room for the last full window so the
    # lookahead below cannot index past the end of the corpus.
    data_size = min(int(len(words) * data_size), len(words) - window_size)

    for idx in range(data_size):
        # Each sample is labeled with the tag of the window's middle word.
        tag = tags[idx + int(window_size / 2)]

        if len(available_tags) == 0 or tag in available_tags:
            # Concatenate the window's word vectors into one flat input,
            # falling back to "UNK" for out-of-vocabulary words.
            word_input = word_emb[word_keys[idx]] if word_keys[idx] in word_emb else word_emb["UNK"]

            for widx in range(1, window_size):
                word_input = np.append(
                    word_input,
                    word_emb[word_keys[idx + widx]] if word_keys[idx + widx] in word_emb else word_emb["UNK"],
                    axis=0)

            tag_input = tag_emb[tag]

            # Test emptiness rather than idx == 0: the first windows may have
            # been skipped by the available_tags filter.
            if len(x) == 0:
                x = [word_input]
                y = [tag_input]
            else:
                x = np.append(x, [word_input], axis=0)
                y = np.append(y, [tag_input], axis=0)

        # Report progress roughly every 10%; guard against a zero divisor.
        if idx % max(int(data_size / 10), 1) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    return x, y
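# Hedged sketch (illustrative): each sample above is window_size consecutive
# word vectors concatenated into one flat input, labeled with the tag of the
# middle word. Toy data below; the real vectors come from the embedding file.
import numpy as np

def _window_center_demo(window_size=5, dim=4):
    emb = np.random.rand(10, dim)        # 10 embedded words
    tags = list("NVDANVDANV")            # one tag per word
    center = window_size // 2
    samples = [(emb[i:i + window_size].reshape(-1), tags[i + center])
               for i in range(len(emb) - window_size)]
    assert samples[0][0].shape == (window_size * dim,)  # concatenated context
    return samples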
def __create_xy_train(self, tag_file, embedding_file, data_size, window_size,
                      threshold, suffix):
    x_train = []
    y_train = []

    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words = DataUtils.extract_word_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
    tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

    # Scale by data_size, leaving room for the last full window.
    data_size = min(int(len(words) * data_size), len(words) - window_size)

    for idx in range(data_size):
        # Concatenate the window's word vectors, with "UNK" as fallback.
        word_input = word_emb[word_keys[idx]] if word_keys[idx] in word_emb else word_emb["UNK"]

        for widx in range(1, window_size):
            word_input = np.append(
                word_input,
                word_emb[word_keys[idx + widx]] if word_keys[idx + widx] in word_emb else word_emb["UNK"],
                axis=0)

        # The middle word may be ambiguous: emit one sample per candidate tag.
        tag_inputs = [tag_emb[tag] for tag in tag_dict[words[idx + int(window_size / 2)]]]

        for tidx in range(len(tag_inputs)):
            tag_input = tag_inputs[tidx]

            if idx == 0 and tidx == 0:
                x_train = [word_input]
                y_train = [tag_input]
            else:
                x_train = np.append(x_train, [word_input], axis=0)
                y_train = np.append(y_train, [tag_input], axis=0)

        # Report progress roughly every 1%; guard against a zero divisor.
        if idx % max(int(data_size / 100), 1) == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    return x_train, y_train
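# Hedged design note (not from the original code): the builders above grow
# x/y with np.append, which copies the whole array on every call and makes the
# build quadratic in the number of samples. A common alternative is to
# accumulate rows in a Python list and stack once at the end; the demo below
# is a minimal sketch with illustrative names.
import numpy as np

def _stack_rows_demo():
    rows = [np.random.rand(6) for _ in range(4)]  # O(1) amortized list appends
    matrix = np.stack(rows, axis=0)               # single allocation at the end
    assert matrix.shape == (4, 6)
    return matrix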