import os

import jieba
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec

# Project-local modules (assumed importable from the repository root):
# defaultPath, data_convert, stop_word, sogou_classfication, tv_classfication,
# bi_lstm_model, and the FLAGS object used below.


def load_data(max_sentence_length=None):
    """Read the Sogou classification dataset from local files."""
    read_dir_path = os.path.join(defaultPath.PROJECT_DIRECTORY,
                                 sogou_classfication.data_path_jieba)
    label_dir_list = os.listdir(read_dir_path)
    x_raw = []
    y = []
    label2index_dict = {
        l.strip(): i for i, l in enumerate(sogou_classfication.label_list)
    }
    for label_dir in label_dir_list:
        label_dir_path = os.path.join(read_dir_path, label_dir)
        label_index = label2index_dict[label_dir]
        # One-hot label vector shared by every document in this directory.
        label_item = np.zeros(len(sogou_classfication.label_list), np.float32)
        label_item[label_index] = 1
        label_file_list = os.listdir(label_dir_path)
        for label_file in label_file_list:
            with open(os.path.join(label_dir_path, label_file), 'rb') as reader:
                text = reader.read().decode('utf-8').replace('\n', '').replace(
                    '\r', '').strip()
                x_raw.append(text)
                y.append(label_item)
    if not max_sentence_length:
        # Longest document, in tokens, across the corpus.
        max_sentence_length = max(len(item.split(" ")) for item in x_raw)
    x = []
    model_path = os.path.join(defaultPath.PROJECT_DIRECTORY,
                              sogou_classfication.word2Vect_path)
    model_save_path = os.path.join(model_path, sogou_classfication.model_name)
    word2vec_model = Word2Vec.load(model_save_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model,
                                                      max_sentence_length, None)
    for sentence, _ in text_converter.transform_to_ids(x_raw):
        x.append(sentence)
    return np.array(x), np.array(y), max_sentence_length
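
# Example usage (a minimal sketch; assumes the project config modules above are
# importable and the segmented corpus plus the trained word2vec model exist on
# disk). Passing max_sentence_length explicitly keeps training and inference
# padded to the same width:
#
#     x, y, max_sentence_length = load_data(max_sentence_length=500)
#     print(x.shape, y.shape)  # (num_docs, 500) and (num_docs, num_labels)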

def predict_doc(text):
    """Given a piece of text, predict its category."""
    stop_word_set = stop_word.get_stop_word()
    segment_list = jieba.cut(text)
    word_list = []
    for word in segment_list:
        word = word.strip()
        if word and word not in stop_word_set:
            word_list.append(word)
    word_segment = " ".join(word_list)
    # Find the most recently saved checkpoint file.
    checkpoint_dir = FLAGS.checkpoint_dir
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    index2label_dict = {
        i: l.strip() for i, l in enumerate(sogou_classfication.label_list)
    }
    model_path = os.path.join(defaultPath.PROJECT_DIRECTORY,
                              sogou_classfication.word2Vect_path)
    model_save_path = os.path.join(model_path, sogou_classfication.model_name)
    word2vec_model = Word2Vec.load(model_save_path)
    text_converter = data_convert.SimpleTextConverter(
        word2vec_model, FLAGS.max_sentence_length, None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([word_segment]):
        x_test.append(doc)
    x_test = np.array(x_test)
    with tf.Graph().as_default() as graph:
        with tf.Session() as sess:
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            input_x = graph.get_operation_by_name("model/input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "model/dropout_keep_prob").outputs[0]
            # Tensor to be evaluated.
            prediction = graph.get_operation_by_name(
                "model/output/prediction").outputs[0]
            predict_class = sess.run(prediction, {
                input_x: x_test,
                dropout_keep_prob: 1.0
            })[0]
            return index2label_dict.get(predict_class)
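
# predict_doc relies on a module-level FLAGS object. A minimal sketch of the
# two flags it reads, using the TF 1.x flags API (the defaults shown here are
# assumptions, not the project's actual values):
#
#     tf.flags.DEFINE_string("checkpoint_dir", "runs/checkpoints",
#                            "Directory holding the trained model checkpoints")
#     tf.flags.DEFINE_integer("max_sentence_length", 500,
#                             "Padded sentence length used at training time")
#     FLAGS = tf.flags.FLAGS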

def load_data(max_sentence_length=None):
    """Read the TV classification dataset from local files."""
    read_dir_path = os.path.join(defaultPath.PROJECT_DIRECTORY,
                                 tv_classfication.tv_data_path)
    label_dir_list = os.listdir(read_dir_path)
    x_raw = []
    y = []
    label2index_dict = {
        l.strip(): i for i, l in enumerate(tv_classfication.label_list)
    }
    for label_dir in label_dir_list:
        if label_dir == 'thu_jieba.txt':
            continue
        label_dir_path = os.path.join(read_dir_path, label_dir)
        label_index = label2index_dict[label_dir]
        label_item = np.zeros(len(tv_classfication.label_list), np.float32)
        label_item[label_index] = 1
        label_file_list = os.listdir(label_dir_path)
        for label_file in label_file_list:
            if label_file.endswith(".csv"):
                continue
            with open(os.path.join(label_dir_path, label_file), 'rb') as reader:
                i = 0
                for line in reader:
                    i += 1
                    text = line.decode('utf-8').replace('\n', '').replace(
                        '\r', '').strip()
                    x_raw.append(text)
                    y.append(label_item)
                    # Cap each file at roughly 3000 lines.
                    if i > 3000:
                        break
    if not max_sentence_length:
        max_sentence_length = max(len(item.split(" ")) for item in x_raw)
    x = []
    model_path = tv_classfication.word2vec_path
    word2vec_model = Word2Vec.load(model_path)
    text_converter = data_convert.SimpleTextConverter(
        word2vec_model, max_sentence_length, None)
    for sentence, _ in text_converter.transform_to_ids(x_raw):
        x.append(sentence)
    return np.array(x), np.array(y), max_sentence_length
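
# The project's data_convert.SimpleTextConverter is used above but not shown. A
# minimal sketch of the contract both load_data functions assume: tokens are
# looked up in the word2vec vocabulary, and every sentence is padded or
# truncated to max_sentence_length. The gensim pre-4.0 vocab layout
# (model.wv.vocab[word].index) and the unknown-token handling are assumptions.
class SimpleTextConverterSketch(object):
    def __init__(self, word2vec_model, max_sentence_length, unknown_id=None):
        self.vocab = word2vec_model.wv.vocab
        self.max_sentence_length = max_sentence_length
        self.unknown_id = 0 if unknown_id is None else unknown_id

    def transform_to_ids(self, sentences):
        for sentence in sentences:
            words = sentence.split(" ")
            ids = [self.vocab[w].index if w in self.vocab else self.unknown_id
                   for w in words]
            length = min(len(ids), self.max_sentence_length)
            # Truncate, then right-pad with zeros to a fixed width, so the
            # caller can stack the rows into one dense numpy array.
            ids = ids[:self.max_sentence_length]
            ids = ids + [0] * (self.max_sentence_length - len(ids))
            yield ids, length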

def predict(text):
    words = jieba.cut(text)
    words = " ".join(words)
    index2label = {
        i: l.strip() for i, l in enumerate(tv_classfication.label_list)
    }
    word2vec_model = Word2Vec.load(tv_classfication.word2vec_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, 80, None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([words]):
        x_test.append(doc)
    x_test = np.array(x_test)
    graph = tf.Graph()
    with graph.as_default(), tf.Session() as sess:
        # Rebuild the Bi-LSTM graph in code and restore its trained weights.
        model = bi_lstm_model.Bi_lstm()
        model.restore_model(sess)
        label = index2label.get(model.predict(sess, x_test)[0])
        print(label)
        return label
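
# For contrast with predict_label below: restore_model above is assumed to
# rebuild the Bi-LSTM graph in Python and restore its variables, roughly
#
#     saver = tf.train.Saver()
#     saver.restore(sess, tf.train.latest_checkpoint(save_dir))
#
# while predict_label skips the model class entirely, reloading the serialized
# graph from the checkpoint's .meta file and fetching tensors by name.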

def predict_label(text):
    words = jieba.cut(text)
    words = " ".join(words)
    word2vec_path = tv_classfication.word2vec_path
    model_save_path = "saveModel1/saveModel1/"
    word2vec_model = Word2Vec.load(word2vec_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, 500, None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([words]):
        x_test.append(doc)
    x_test = np.array(x_test)
    with tf.Graph().as_default() as graph:
        with tf.Session() as sess:
            checkpoint_file = tf.train.latest_checkpoint(model_save_path)
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            input_x = graph.get_operation_by_name("model/input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "model/dropout_keep_prob").outputs[0]
            # "model/softmax_layer/Add:0" is a tensor name, so it must be
            # fetched with get_tensor_by_name, not get_operation_by_name.
            logits = graph.get_tensor_by_name("model/softmax_layer/Add:0")
            prediction = graph.get_operation_by_name(
                "model/output/prediction").outputs[0]
            # sess.run returns one value per fetch; unpack both, then take the
            # first (and only) prediction in the batch. logits_out is kept for
            # inspection but unused here.
            predict_class, logits_out = sess.run([prediction, logits], {
                input_x: x_test,
                dropout_keep_prob: 1.0
            })
            return tv_classfication.index2label.get(predict_class[0])
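
# Example call (a sketch; assumes jieba's dictionaries, the word2vec model, and
# a checkpoint under saveModel1/saveModel1/ are all in place; the sample
# sentence is illustrative only):
#
#     label = predict_label("今晚八点播出一部新的电视剧")
#     print(label)  # whichever category the checkpoint assigns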