Example #1
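The examples on this page omit their imports. The sketch below lists what they appear to rely on; the project-local modules (defaultPath, sogou_classfication, tv_classfication, data_convert, stop_word, bi_lstm_model) are assumed to be importable from the surrounding project, so the flat import paths shown here are a guess. Note the examples use TensorFlow 1.x-style APIs (tf.Session, tf.train.import_meta_graph).

# Assumed imports for the examples below. The project-local modules are
# placeholders for wherever they actually live in this code base.
import os

import jieba
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec

import defaultPath            # project path configuration (assumed)
import data_convert           # provides SimpleTextConverter (assumed)
import stop_word              # stop-word list helper (assumed)
import bi_lstm_model          # Bi_lstm model wrapper (assumed)
import sogou_classfication    # Sogou dataset configuration (assumed)
import tv_classfication       # TV dataset configuration (assumed)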
def load_data(max_sentence_length = None):
    """
        Read the Sogou classification dataset from local files.
    """
    read_dir_path = os.path.join(defaultPath.PROJECT_DIRECTORY, sogou_classfication.data_path_jieba)
    label_dir_list = os.listdir(read_dir_path)
    x_raw = []
    y = []
    label2index_dict = {l.strip(): i for i, l in enumerate(sogou_classfication.label_list)}

    for label_dir in label_dir_list:
        label_dir_path = os.path.join(read_dir_path, label_dir)
        label_index = label2index_dict[label_dir]
        # Build a one-hot label vector shared by every document in this directory.
        label_item = np.zeros(len(sogou_classfication.label_list), np.float32)
        label_item[label_index] = 1
        label_file_list = os.listdir(label_dir_path)
        for label_file in label_file_list:
            with open(os.path.join(label_dir_path, label_file), 'rb') as reader:
                text = reader.read().decode('utf-8').replace('\n', '').replace('\r', '').strip()
                x_raw.append(text)
                y.append(label_item)
    # Determine the maximum sentence length from the corpus if none was given.
    if not max_sentence_length:
        max_sentence_length = max(len(item.split(" ")) for item in x_raw)
    x = []

    # Load the trained word2vec model and convert each sentence to word ids.
    model_path = os.path.join(defaultPath.PROJECT_DIRECTORY, sogou_classfication.word2Vect_path)
    model_save_path = os.path.join(model_path, sogou_classfication.model_name)
    word2vec_model = Word2Vec.load(model_save_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, max_sentence_length, None)

    for sentence, _ in text_converter.transform_to_ids(x_raw):
        x.append(sentence)
    return np.array(x), np.array(y), max_sentence_length
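
A minimal usage sketch for load_data; it assumes the Sogou data directory and the trained word2vec model referenced above already exist on disk, and the printed shapes are illustrative only.

# Illustrative only: load the full dataset and report its shape.
x, y, max_sentence_length = load_data()
print(x.shape, y.shape, max_sentence_length)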
Example #2
def predict_doc(text):
    """
    Given a piece of text, predict its class.
    """
    stop_word_set = stop_word.get_stop_word()
    segment_list = jieba.cut(text)
    word_list = []
    for word in segment_list:
        word = word.strip()
        if word and word not in stop_word_set:
            word_list.append(word)

    word_segment = " ".join(word_list)

    # Find the file name of the most recently saved checkpoint.
    checkpoint_dir = FLAGS.checkpoint_dir
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    index2label_dict = {
        i: l.strip()
        for i, l in enumerate(sogou_classfication.label_list)
    }

    model_path = os.path.join(defaultPath.PROJECT_DIRECTORY,
                              sogou_classfication.word2Vect_path)
    model_save_path = os.path.join(model_path, sogou_classfication.model_name)
    word2vec_model = Word2Vec.load(model_save_path)
    text_converter = data_convert.SimpleTextConverter(
        word2vec_model, FLAGS.max_sentence_length, None)

    x_test = []
    for doc, _ in text_converter.transform_to_ids([word_segment]):
        x_test.append(doc)
    x_test = np.array(x_test)

    with tf.Graph().as_default() as graph:
        with tf.Session() as sess:
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            input_x = graph.get_operation_by_name("model/input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "model/dropout_keep_prob").outputs[0]
            # Tensors to be evaluated.
            prediction = graph.get_operation_by_name(
                "model/output/prediction").outputs[0]
            predict_class = sess.run(prediction, {
                input_x: x_test,
                dropout_keep_prob: 1.0
            })[0]
            return index2label_dict.get(predict_class)
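
A hedged usage sketch for predict_doc. FLAGS.checkpoint_dir and FLAGS.max_sentence_length are assumed to be defined by the surrounding training/evaluation script, and the sample sentence is made up.

if __name__ == "__main__":
    # Illustrative call; FLAGS must already be configured by the caller.
    sample = "今天的体育新闻报道了一场足球比赛"
    print(predict_doc(sample))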
Example #3
def load_data(max_sentence_length=None):
    """
        Read the classification dataset from local files.
    """
    read_dir_path = os.path.join(defaultPath.PROJECT_DIRECTORY,
                                 tv_classfication.tv_data_path)
    label_dir_list = os.listdir(read_dir_path)
    x_raw = []
    y = []
    label2index_dict = {
        l.strip(): i
        for i, l in enumerate(tv_classfication.label_list)
    }

    for label_dir in label_dir_list:
        if label_dir == 'thu_jieba.txt':
            continue
        label_dir_path = os.path.join(read_dir_path, label_dir)
        label_index = label2index_dict[label_dir]
        label_item = np.zeros(len(tv_classfication.label_list), np.float32)
        label_item[label_index] = 1
        label_file_list = os.listdir(label_dir_path)
        for label_file in label_file_list:
            if label_file.endswith(".csv"):
                continue
            with open(os.path.join(label_dir_path, label_file),
                      'rb') as reader:
                # Cap the number of lines read from each file (first ~3000).
                i = 0
                for line in reader:
                    i += 1
                    text = line.decode('utf-8').replace('\n', '').replace(
                        '\r', '').strip()
                    x_raw.append(text)
                    y.append(label_item)

                    if i > 3000:
                        break
    # Determine the maximum sentence length from the corpus if none was given.
    if not max_sentence_length:
        max_sentence_length = max(len(item.split(" ")) for item in x_raw)
    x = []

    # Load the trained word2vec model and convert each sentence to word ids.
    model_path = tv_classfication.word2vec_path
    word2vec_model = Word2Vec.load(model_path)
    text_converter = data_convert.SimpleTextConverter(
        word2vec_model, max_sentence_length, None)

    for sentence, _ in text_converter.transform_to_ids(x_raw):
        x.append(sentence)
    return np.array(x), np.array(y), max_sentence_length
Example #4
def predict(text):
    words = jieba.cut(text)
    words = " ".join(words)
    index2label = {
        i: l.strip()
        for i, l in enumerate(tv_classfication.label_list)
    }

    word2vec_model = Word2Vec.load(tv_classfication.word2vec_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, 80, None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([words]):
        x_test.append(doc)

    x_test = np.array(x_test)

    graph = tf.Graph()
    with graph.as_default(), tf.Session() as sess:
        model = bi_lstm_model.Bi_lstm()
        model.restore_model(sess)

        # Map the predicted class index back to its label using the dict built above.
        print(index2label.get(model.predict(sess, x_test)[0]))
Example #5
def predict_label(text):

    words = jieba.cut(text)
    words = " ".join(words)

    word2vec_path = tv_classfication.word2vec_path
    model_save_path = "saveModel1/saveModel1/"
    word2vec_model = Word2Vec.load(word2vec_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, 500,
                                                      None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([words]):
        x_test.append(doc)

    x_test = np.array(x_test)

    with tf.Graph().as_default() as graph:
        with tf.Session() as sess:

            checkpoint_file = tf.train.latest_checkpoint(model_save_path)
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            input_x = graph.get_operation_by_name("model/input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "model/dropout_keep_prob").outputs[0]

            # get_operation_by_name takes the op name (no ":0" output suffix).
            logits = graph.get_operation_by_name(
                "model/softmax_layer/Add").outputs[0]

            prediction = graph.get_operation_by_name(
                "model/output/prediction").outputs[0]
            predict_class, logits_out = sess.run([prediction, logits], {
                input_x: x_test,
                dropout_keep_prob: 1.0
            })
            return tv_classfication.index2label.get(predict_class[0])
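
A note on the tensor lookup fixed above: in TensorFlow 1.x, graph.get_operation_by_name expects the operation name without the ":0" output suffix, while graph.get_tensor_by_name expects the full tensor name. Both lines below fetch the same tensor, assuming the restored graph from this example.

# Two equivalent lookups for the restored logits tensor:
logits = graph.get_operation_by_name("model/softmax_layer/Add").outputs[0]
logits = graph.get_tensor_by_name("model/softmax_layer/Add:0")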