def __create_xy_train(self, tag_file, embedding_file, data_size=1, look_back=5, threshold=0, suffix=None):
    """Build training tensors for a tag-prediction model from a tagged corpus.

    Sentences are consumed in non-overlapping windows of ``look_back`` words.
    For each window, every combination of candidate tags (the cartesian
    product over each word's tag dictionary entries) yields one training
    sample: the same stacked word-embedding window paired with the one-hot
    tag sequence for that combination.

    Args:
        tag_file: Path to the tagged corpus loaded via ``DataUtils.load_corpus``.
        embedding_file: Path to the word-embedding file; must contain an
            ``"UNK"`` entry used for out-of-vocabulary words.
        data_size: Fraction (0..1] of the corpus to use; values above 1 are
            clamped to 1. The usable size is truncated down to a multiple of
            ``look_back``.
        look_back: Window length in words for each training sample.
        threshold: Frequency threshold forwarded to
            ``DataUtils.extract_tag_dict``.
        suffix: Optional pair ``(a, b)`` forwarded to
            ``DataUtils.add_suffix_embeddings``; ``None`` disables suffix
            embeddings.

    Returns:
        Tuple ``(x_train, y_train)`` of ``np.ndarray``:
        ``x_train`` has shape (samples, look_back, embedding_dim) and
        ``y_train`` has shape (samples, look_back, n_tags).
    """
    corpus = DataUtils.load_corpus(tag_file)
    tag_emb = DataUtils.create_onehot_vectors(DataUtils.extract_tag_list(corpus))

    word_emb = DataUtils.load_embeddings(embedding_file)
    if suffix is not None:
        word_emb = DataUtils.add_suffix_embeddings(word_emb, suffix[0], suffix[1])

    words = DataUtils.extract_word_data(corpus)
    word_keys = DataUtils.normalize_cases(word_emb.keys(), words)
    tag_dict = DataUtils.extract_tag_dict(corpus, threshold)

    # Clamp the fraction to at most 1, then truncate to a whole number of
    # look_back-sized windows so no partial window is indexed.
    data_size = int(len(words) * min(data_size, 1))
    data_size -= data_size % look_back
    # BUG FIX: removed the hard-coded `data_size = 53750` override that made
    # the data_size parameter dead and crashed on corpora shorter than 53750.

    # Hoisted out of the loop; max(1, ...) prevents ZeroDivisionError when
    # data_size < 10 * look_back.
    progress_step = max(1, int(data_size / (10 * look_back)))
    unk_vec = word_emb["UNK"]

    x_train = []
    y_train = []
    for idx in np.arange(0, data_size, look_back):
        # Stack the window's word embeddings; unknown words map to "UNK".
        window_words = []
        window_tag_candidates = []
        for widx in range(look_back):
            key = word_keys[idx + widx]
            window_words.append(word_emb[key] if key in word_emb else unk_vec)
            window_tag_candidates.append(tag_dict[words[idx + widx]])
        window_words = np.array(window_words)

        # One sample per tag combination; the word window is shared.
        for tag_seq in DataUtils.cartesian(np.array(window_tag_candidates)):
            x_train.append(window_words)
            y_train.append([tag_emb[tag] for tag in tag_seq])

        if idx % progress_step == 0:
            DataUtils.update_message(str(int(idx / data_size * 100)))

    # Single conversion at the end instead of O(n^2) np.append in the loop.
    return np.array(x_train), np.array(y_train)