def predict(sentence, num_words, model):
    """Predict whether ``sentence`` is a key-point sentence.

    The sentence is cleaned, segmented into words, truncated to
    ``num_words`` words and converted into a ``(1, num_words, size)``
    float64 array of word vectors (``size`` is the module-level vector
    dimension). Words without a vector are dropped, the remaining vectors
    are packed contiguously from row 0, and every unused trailing row is
    filled with -1.

    Args:
        sentence: Raw sentence text.
        num_words: Number of word slots the model was trained with.
        model: Object exposing ``predict(data)``.

    Returns:
        ``np.argmax`` of the model output; per the original author's note,
        1 means "key-point sentence" and 0 means "not".
    """
    data = np.empty((1, num_words, size), dtype="float64")

    sentence = u.remove_useless(sentence)
    # Trim to the sequence length used when the model was trained.
    word_list = u.seg2words_long(sentence)[:num_words]

    vector_model = wv.load_model()
    num = 0  # Number of rows filled so far; also the next free row.
    for word in word_list:
        vector = wv.get_vector(vector_model, word.encode('utf-8'))
        # NOTE(review): get_vector appears to signal "unknown word" with an
        # empty sequence -- confirm. A length check is used because
        # ``array == []`` is not a reliable emptiness test for NumPy arrays.
        if len(vector) == 0:
            continue
        # Store at ``num`` rather than at the word's original position:
        # otherwise a skipped word leaves an uninitialised row behind and
        # the padding below overwrites valid vectors at the tail.
        data[0, num, :] = vector
        num += 1

    # Pad every unused slot with the -1 sentinel.
    data[0, num:, :] = -1

    prediction = model.predict(data)
    return np.argmax(prediction)
def save_data(line_list, data_path, ignore):
    """Convert labelled text lines to word vectors and append them to a file.

    Each entry of ``line_list`` is cleaned in place with
    ``u.remove_useless``. A line containing "|" is labelled 1 and has the
    marker removed (also in place); any other line is labelled 0. When
    ``ignore`` is true, unmarked lines are kept only when
    ``random.randint(0, 9) < 3`` and skipped otherwise.

    Every kept line that yields at least one word vector is appended to
    ``data_path`` as one record:

        <label>
        <one row per word vector, each number followed by a space>
        %

    Finally two header lines are inserted at the very start of the file:
    the number of records written by this call and the largest number of
    word vectors found in a single line.

    Args:
        line_list: Mutable list of raw text lines; modified in place.
        data_path: Path of the output file. Records are appended to any
            existing content.
        ignore: Whether to randomly drop most of the unmarked lines.
    """
    num_lines = 0
    largest_num = 0
    vector_model = wv.load_model()
    total = len(line_list)

    # A single append-mode handle replaces the per-record open/close. It also
    # guarantees the file exists for the header step even when no record is
    # written, and ``with`` closes it if an exception is raised.
    with open(data_path, "a") as out:
        for i in range(total):
            if (i + 1) % 50 == 0:
                print("第" + str(i + 1) + "行 (" + str(i + 1) + "/" + str(total) + ")")

            # Strip irrelevant content.
            line_list[i] = u.remove_useless(line_list[i])

            # Work out the label.
            if "|" in line_list[i]:
                label = 1
                line_list[i] = line_list[i].replace("|", "")
            else:
                label = 0
                # Down-sample unmarked lines when requested.
                if ignore and random.randint(0, 9) >= 3:
                    continue

            # Convert the words to vectors, dropping words without one.
            total_vector = []
            for word in u.seg2words_long(line_list[i]):
                # The vector model is keyed by UTF-8 encoded words.
                vector = wv.get_vector(vector_model, word.encode('utf-8'))
                # NOTE(review): get_vector appears to return an empty
                # sequence for unknown words -- confirm. ``len`` is used
                # because ``array == []`` is unreliable for NumPy arrays.
                if len(vector) == 0:
                    continue
                total_vector.append(vector)

            # Skip lines that produced no vectors at all.
            if not total_vector:
                continue

            largest_num = max(largest_num, len(total_vector))
            num_lines += 1

            # Write the record.
            out.write(str(label) + "\n")
            for vector in total_vector:
                # Every number keeps its trailing space to match the
                # existing file format.
                out.write("".join(str(num) + " " for num in vector) + "\n")
            out.write("%\n")

    # Prepend the data dimensions as the first two lines so they can be read
    # back when training is initialised.
    with open(data_path, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        f.write(str(num_lines) + "\n")
        f.write(str(largest_num) + "\n")
        f.write(content)