def load_data():
    """Load the persisted models needed for training/generation.

    Reads the saved unique character set, restores the word2vec model and
    the k-means clustering model from their save files.

    Returns:
        tuple: (unique_char_set, w2v, kmeans)
    """
    lines = FileOperator.f_open(Const.UNIQ_SRC_FILE)
    # The unique character set is stored comma-separated on the last line.
    unique_char_set = lines[-1].split(",")
    w2v = Word2Vec(Const.W2V_SRC_FILE, Const.W2V_WEIGHT_FILE, Const.WORD_FEAT_LEN, "load")
    # Vectorize every known character so the k-means model can be restored
    # against the same feature space it was trained on.
    vectors = [w2v.str_to_vector(char) for char in unique_char_set]
    kmeans = MyKmeans(Const.NUM_OF_CLUSTER, Const.KMEANS_SAVE_FILE, vectors, "load")
    return unique_char_set, w2v, kmeans
def learn_word():
    """Count cluster-to-cluster transitions over the source corpus.

    For every pair of consecutive characters in each sentence, maps both
    characters to their k-means cluster and increments the transition count
    in the probability table, then saves the table to Const.PROB_FILE.
    """
    print("src file: ", Const.SRC_FILE)
    sentence_list = FileOperator.f_open(Const.SRC_FILE)
    sentence_list = StringOperator.split_sentence(sentence_list)
    # BUG FIX: load_data() returns exactly three values
    # (unique_char_set, w2v, kmeans); the original unpacked four, which
    # raises ValueError, and mislabeled the first value as prob_state.
    # Load the probability table explicitly instead.
    _, w2v, kmeans = load_data()
    # NOTE(review): "load" mode assumed by analogy with Word2Vec/MyKmeans;
    # init_data() constructs ProbabilityState with "init" — confirm.
    prob_state = ProbabilityState(Const.NUM_OF_CLUSTER, Const.PROB_FILE, "load")
    cnt = 0
    for sentence in sentence_list:
        sys.stdout.write("\r progress: %d / %d" % (cnt, len(sentence_list)))
        sys.stdout.flush()
        # BUG FIX: the loop reads indices i and i+1, so the last valid i is
        # len(sentence) - 2; the original range(len(sentence) - 2) skipped
        # the final consecutive pair (off-by-one).
        for i in range(len(sentence) - 1):
            vec = w2v.str_to_vector(sentence[i]).reshape(1, -1)
            cluster = kmeans.get_cluster(vec)
            next_vec = w2v.str_to_vector(sentence[i + 1]).reshape(1, -1)
            next_cluster = kmeans.get_cluster(next_vec)
            # Method name ("trainsition") is a typo in ProbabilityState's
            # API; kept as-is since the class is defined elsewhere.
            prob_state.count_up_trainsition(cluster, next_cluster)
        cnt += 1
    prob_state.save_prob(Const.PROB_FILE)
    print()
    print("end")
def init_data():
    """Build and persist all models from the raw source corpus.

    Extracts the unique character set from Const.SRC_FILE, saves it, then
    initializes and saves the probability table, the word2vec model and the
    k-means clustering model.
    """
    print("src file: ", Const.SRC_FILE)
    sentence_list = FileOperator.f_open(Const.SRC_FILE)
    sentence_list = StringOperator.split_sentence(sentence_list)
    flatten_word_list = StringOperator.array_string_to_flatten(sentence_list)
    unique_char_set = StringOperator.array_char_to_unique(flatten_word_list)
    print("unique char set len :", len(unique_char_set))
    # Persist the vocabulary so later runs (load_data / call_sse) can
    # rebuild the same feature space without re-reading the corpus.
    FileOperator.f_write(Const.UNIQ_SRC_FILE, unique_char_set)
    print("save unique file: ", Const.UNIQ_SRC_FILE)
    prob_state = ProbabilityState(Const.NUM_OF_CLUSTER, Const.PROB_FILE, "init")
    prob_state.save_prob(Const.PROB_FILE)
    w2v = Word2Vec(Const.W2V_SRC_FILE, Const.W2V_WEIGHT_FILE, Const.WORD_FEAT_LEN, "init")
    # Vector per unique character; "init" mode trains and saves the k-means
    # model as a side effect, so the instance itself is not kept.
    vectors = [w2v.str_to_vector(char) for char in unique_char_set]
    MyKmeans(Const.NUM_OF_CLUSTER, Const.KMEANS_SAVE_FILE, vectors, "init")
def call_sse():
    """Plot SSE (elbow curve) over a range of cluster counts.

    Trains a fresh k-means model for each cluster count in 100..1900 (step
    100) on the saved unique character set and saves an SSE-vs-k plot to
    Const.SSE_IMG for choosing NUM_OF_CLUSTER.
    """
    read = FileOperator.f_open(Const.UNIQ_SRC_FILE)
    w2v = Word2Vec(Const.W2V_SRC_FILE, Const.W2V_WEIGHT_FILE, Const.WORD_FEAT_LEN, "load")
    unique_char_set = read[-1].split(",")
    print("number of unique word:", len(unique_char_set))
    data_array = [w2v.str_to_vector(word) for word in unique_char_set]
    sse_list = []
    num_of_cluster_list = range(100, 2000, 100)
    for num_of_cluster in num_of_cluster_list:
        print(num_of_cluster)
        kmeans = MyKmeans(num_of_cluster, Const.KMEANS_SAVE_FILE, data_array, "init")
        # Hoist: the original called get_sse() twice per iteration; compute
        # it once in case it is expensive to evaluate.
        sse = kmeans.get_sse()
        print(sse)
        sse_list.append(sse)
    plt.plot(num_of_cluster_list, sse_list, marker='o')
    # plt.show()
    plt.savefig(Const.SSE_IMG)