def load_test_data(self):
    x_batch = []
    y_batch = []
    class_num_map = {}
    test_file_list = utils.find_all_files(
        self.parameters['test_corpus_dir'], [])
    # pair each line with its source file so the class label can still be
    # recovered after shuffling
    lines = []
    for file_name in test_file_list:
        lines += [(line, file_name)
                  for line in utils.read_lines_small_file(file_name)]
    random.shuffle(lines)
    for text, file_name in lines:
        text = preProcess.filtUrl(text)
        text = text.replace(' ', '').replace(
            '\n', '')[:self.parameters['max_text_length']]
        if len(text) == 0:
            continue
        id_list = self.trans_char2id(text)
        # the parent directory of each file is its class label
        class_label = file_name.split('/')[-2]
        class_num_map[class_label] = class_num_map.get(class_label, 0) + 1
        # optionally cap the test set per class:
        # if class_num_map[class_label] > 20: continue
        class_label_one_hot = self.class_label_one_hot[class_label]
        x_batch.append(id_list)
        y_batch.append(class_label_one_hot)
    return np.array(x_batch), np.array(y_batch)
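# trans_char2id is used throughout this file but not defined in it. A
# minimal sketch under stated assumptions: self.char_id_map is the map
# pickled by get_char_id_map below, unknown characters fall back to 'unk',
# and sequences are padded to max_text_length with 'pad_char'. The padding
# scheme is a guess, not something this file confirms.
def trans_char2id_sketch(self, text):
    ids = [self.char_id_map.get(ch, self.char_id_map['unk']) for ch in text]
    ids = ids[:self.parameters['max_text_length']]
    pad_id = self.char_id_map['pad_char']
    ids += [pad_id] * (self.parameters['max_text_length'] - len(ids))
    return ids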
def fit(self, if_static_embeding=True):
    file_list = utils.find_all_files(self.parameters['train_corpus_dir'], [])
    test_input, test_output = self.load_test_data()
    batch_size = 50
    for epoch in range(10000):
        x_batch = []
        y_batch = []
        # pair each line with its source file, then shuffle
        lines = []
        for file_name in file_list:
            lines += [(line, file_name)
                      for line in utils.read_lines_small_file(file_name)]
        random.shuffle(lines)
        for text, file_name in lines:
            text = preProcess.filtUrl(text)
            text = text.replace(' ', '').replace(
                '\n', '')[:self.parameters['max_text_length']]
            if len(text) == 0:
                continue
            id_list = self.trans_char2id(text)
            class_label = file_name.split('/')[-2]
            x_batch.append(id_list)
            y_batch.append(self.class_label_one_hot[class_label])
        x_batch = np.array(x_batch)
        y_batch = np.array(y_batch)
        # train on mini-batches; fetch only the train op and the metrics
        # (the original also fetched self.Y and self.prob_dist, unused)
        for i in range(0, y_batch.shape[0], batch_size):
            a_x_batch = x_batch[i:i + batch_size, :]
            a_y_batch = y_batch[i:i + batch_size, :]
            _, loss_1, accuracy = self.sess.run(
                [self.train, self.losses, self.accuracy],
                feed_dict={self.X: a_x_batch, self.Y: a_y_batch})
        # evaluate on the held-out test set once per epoch
        loss, accuracy = self.sess.run(
            [self.losses, self.accuracy],
            feed_dict={self.X: test_input, self.Y: test_output})
        merg = self.sess.run(
            self.merged,
            feed_dict={self.X: test_input, self.Y: test_output})
        self.writer.add_summary(merg, epoch)
        print(epoch, loss_1, 'test loss:', loss, 'test accuracy:', accuracy)
        self.saver.save(self.sess,
                        self.parameters['check_points_dir'] + '/model')
    tf.reset_default_graph()
    self.writer.close()
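# For context, a minimal TF1-style sketch of the graph attributes fit()
# relies on (self.X, self.Y, self.prob_dist, self.losses, self.accuracy,
# self.train, self.merged, self.saver). Only the attribute names come from
# the code above; the mean-pooled char-embedding model, the embedding size
# and the Adam optimizer are assumptions.
def build_graph_sketch(self, num_classes):
    self.X = tf.placeholder(tf.int32,
                            [None, self.parameters['max_text_length']])
    self.Y = tf.placeholder(tf.float32, [None, num_classes])
    emb = tf.get_variable('emb', [self.parameters['char_set_size'], 200])
    pooled = tf.reduce_mean(tf.nn.embedding_lookup(emb, self.X), axis=1)
    logits = tf.layers.dense(pooled, num_classes)
    self.prob_dist = tf.nn.softmax(logits)
    self.losses = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.Y,
                                                   logits=logits))
    correct = tf.equal(tf.argmax(logits, 1), tf.argmax(self.Y, 1))
    self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    self.train = tf.train.AdamOptimizer(1e-3).minimize(self.losses)
    tf.summary.scalar('loss', self.losses)
    self.merged = tf.summary.merge_all()
    self.saver = tf.train.Saver()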
def fit(self, if_static_embeding=True):
    model = None
    file_list = utils.find_all_files(
        self.parameters['train_corpus_for_embedding'], [])
    print('Number of training files:', len(file_list))
    count = 0
    step = 0
    x_batch = []
    random.shuffle(file_list)
    for file_name in file_list:
        lines = utils.read_lines_small_file(file_name)
        # each raw line is '#'-separated; field 6 holds the post body,
        # wrapped in a crawler delimiter and a leftover HTML class attribute
        lines = [x.split('#') for x in lines]
        lines = [x for x in lines if len(x) == 8 and len(x[6]) > 50]
        lines = [x[6].split('kabukabu')[1].replace(
            'd_post_content j_d_post_content clearfix"> ', '')
            for x in lines]
        text = ''.join(lines).replace(' ', '')
        if len(text) == 0:
            continue
        text = list(text)  # character-level tokens
        count += 1
        x_batch.append(text)
        if len(x_batch) == 10:
            # shuffle the 10 buffered documents before training
            random_index = list(range(10))
            random.shuffle(random_index)
            x_batch = np.array(x_batch)[random_index]
            print('training step', step)
            # size= and iter= follow the pre-4.0 gensim API
            if model is None:
                model = Word2Vec(x_batch, size=200, window=5,
                                 min_count=5, workers=8, iter=200)
            else:
                model.build_vocab(x_batch, update=True)
                model.train(x_batch, total_examples=x_batch.shape[0],
                            epochs=200)
            step += 1
            x_batch = []
            if count % 50 == 0:
                model.save('./model/word2vec.model')
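# Usage sketch: loading the incrementally trained character vectors back.
# These calls are the standard pre-4.0 gensim API, matching the size=/iter=
# parameters used above; the query character is only an example.
def load_char_vectors(path='./model/word2vec.model'):
    model = Word2Vec.load(path)
    vec = model.wv['好']                           # one 200-dim vector
    neighbors = model.wv.most_similar('好', topn=5)
    return vec, neighbors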
def fit(self, if_static_embeding=True):
    file_list = utils.find_all_files(self.parameters['train_corpus_dir'], [])
    test_input, test_output = self.load_test_data()
    count = 0
    x_batch = []
    y_batch = []
    for epoch in range(1000):
        random.shuffle(file_list)
        for file_name in file_list:
            lines = utils.read_lines_small_file(file_name)
            text = self.get_title_content(lines)
            if len(text) == 0:
                continue
            id_list = self.trans_char2id(text)
            class_label = file_name.split('/')[-2]
            class_label_one_hot = self.class_label_one_hot[class_label]
            count += 1
            x_batch.append(id_list)
            y_batch.append(class_label_one_hot)
            if len(x_batch) == 500:
                # shuffle the batch before a training step
                random_index = list(range(500))
                random.shuffle(random_index)
                x_batch = np.array(x_batch)[random_index]
                y_batch = np.array(y_batch)[random_index]
                _, loss, accuracy = self.sess.run(
                    [self.train, self.losses, self.accuracy],
                    feed_dict={self.X: x_batch, self.Y: y_batch})
                x_batch = []
                y_batch = []
                if count % 5000 == 0:
                    print('epoch', epoch, 'loss is', loss,
                          'accuracy is', accuracy)
                    loss, accuracy = self.sess.run(
                        [self.losses, self.accuracy],
                        feed_dict={self.X: test_input, self.Y: test_output})
                    print('test loss:', loss, 'test accuracy:', accuracy)
                    self.saver.save(
                        self.sess,
                        self.parameters['check_points_dir'] + '/model')
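# self.class_label_one_hot is assumed to map each class label to a one-hot
# numpy vector. A sketch of how it could be built from the
# class_label_id_map pickled by get_char_id_map below; the construction is
# an assumption, only the map's contents come from that method.
def build_one_hot_map_sketch(self):
    with open(self.parameters['class_label_id_map_file'], 'rb') as f:
        class_label_id_map = pickle.load(f)
    n = len(class_label_id_map)
    self.class_label_one_hot = {}
    for label, idx in class_label_id_map.items():
        one_hot = np.zeros(n, dtype=np.float32)
        one_hot[idx] = 1.0
        self.class_label_one_hot[label] = one_hot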
def get_char_id_map(self):
    class_label_id_map = {}
    char_freq_map = {}
    all_files = []
    print('Reading the corpus and counting character frequencies')
    utils.find_all_files(self.parameters['train_corpus_dir'], all_files)
    count = 0
    for file_name in all_files:
        # the parent directory of each file is its class label
        class_name = file_name.split('/')[-2]
        if class_name not in class_label_id_map:
            class_label_id_map[class_name] = len(class_label_id_map)
        this_class_sample_size = 0
        # iterate lines directly (the original nested loop iterated the
        # characters of each line, so no line ever passed the length check)
        for line in utils.read_lines_small_file(file_name):
            line = preProcess.filtUrl(line)
            line = line.replace(' ', '').replace('\n', '')
            if len(line) < 10:
                continue
            count += 1
            this_class_sample_size += 1
            # optionally cap samples per class:
            # if this_class_sample_size == 100: break
            if count % 10000 == 0:
                print('read', count, 'lines;',
                      'distinct characters:', len(char_freq_map))
            for char in line:
                char_freq_map[char] = char_freq_map.get(char, 0) + 1
    print('Assigning an id to each character')
    char_id_map = {'unk': 0, 'pad_char': 1, 'stop_char': 2}
    id_char_map = {0: 'unk', 1: 'pad_char', 2: 'stop_char'}
    init_char_id_map_size = len(char_id_map)
    # keep the most frequent characters, leaving room for the special tokens
    char_freq_list = sorted(char_freq_map.items(),
                            key=lambda x: x[1], reverse=True)
    char_freq_list = char_freq_list[:self.parameters['char_set_size']
                                    - len(id_char_map)]
    for i in range(len(char_freq_list)):
        char, _ = char_freq_list[i]
        if char not in stop_chars:  # skipped stop chars leave id gaps
            char_id_map[char] = i + init_char_id_map_size
            id_char_map[i + init_char_id_map_size] = char
    with open(self.parameters['char_id_map_file'], 'wb') as f:
        pickle.dump(char_id_map, f)
    with open(self.parameters['id_char_map_file'], 'wb') as f:
        pickle.dump(id_char_map, f)
    with open(self.parameters['class_label_id_map_file'], 'wb') as f:
        pickle.dump(class_label_id_map, f)
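# Usage sketch: reading the three pickled maps back at startup. The
# attribute names are assumptions; only the parameter keys come from
# get_char_id_map above.
def load_maps_sketch(self):
    with open(self.parameters['char_id_map_file'], 'rb') as f:
        self.char_id_map = pickle.load(f)
    with open(self.parameters['id_char_map_file'], 'rb') as f:
        self.id_char_map = pickle.load(f)
    with open(self.parameters['class_label_id_map_file'], 'rb') as f:
        self.class_label_id_map = pickle.load(f)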
def load_test_data(self):
    x_batch = []
    y_batch = []
    class_num_map = {}
    test_file_list = utils.find_all_files(
        self.parameters['test_corpus_dir'], [])
    for file_name in test_file_list:
        lines = utils.read_lines_small_file(file_name)
        text = self.get_title_content(lines)
        if len(text) == 0:
            continue
        id_list = self.trans_char2id(text)
        class_label = file_name.split('/')[-2]
        class_num_map[class_label] = class_num_map.get(class_label, 0) + 1
        if class_num_map[class_label] > 20:
            continue  # keep at most 20 test samples per class
        class_label_one_hot = self.class_label_one_hot[class_label]
        x_batch.append(id_list)
        y_batch.append(class_label_one_hot)
    return np.array(x_batch), np.array(y_batch)
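# get_title_content is called by fit() and load_test_data() above but not
# defined in this file. A purely hypothetical sketch: join the raw lines,
# strip URLs and whitespace, and truncate, mirroring the per-line cleanup
# done elsewhere in this file.
def get_title_content_sketch(self, lines):
    text = preProcess.filtUrl(''.join(lines))
    text = text.replace(' ', '').replace('\n', '')
    return text[:self.parameters['max_text_length']]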