Example #1
def predict():
    with open(map_path, "rb") as f:
        word_to_id, cat_to_id, seq_length, num_classes = pickle.load(f)
    id_to_cat = {v: k for k, v in cat_to_id.items()}
    config = TCNNConfig()
    config.num_classes = num_classes
    config.vocab_size = len(word_to_id)
    model = TextCNN(config)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model
    while True:
        line = str(input("Please enter a test sentence: "))
        data_id = [[
            word_to_id[x] for x in list(native_content(line))
            if x in word_to_id
        ]]
        x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
        y_pred_cls = session.run(model.y_pred_cls,
                                 feed_dict={
                                     model.input_x: x_pad,
                                     model.keep_prob: 1.0
                                 })
        print('sentence: {}, predicted intent: {}'.format(
            line, id_to_cat[y_pred_cls[0]]))
        a = 1
Example #2
 def __init__(self, LENGTH):
     self.config = TCNNConfig()
     self.config.seq_length = LENGTH
     self.model = TextCNN(self.config)
     self.session = tf.Session()
     self.session.run(tf.global_variables_initializer())
     saver = tf.train.Saver()
     saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #3
 def __init__(self):
     self.config = TCNNConfig()
     self.categories, self.cat_to_id = read_category()
     self.words, self.word_to_id = read_vocab(vocab_dir)
     self.config.vocab_size = len(self.words)
     self.model = TextCNN(self.config)
     self.session = tf.Session()
     self.session.run(tf.global_variables_initializer())
     saver = tf.train.Saver()
     saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #4
    def __init__(self):
        self.config = TCNNConfig()
        self.categories, self.cat_to_id = read_category()
        self.words, self.word_to_id = read_vocab(vocab_dir)
        self.config.vocab_size = len(self.words)
        self.config.pre_training = np.load(pre_training)
        self.model = TextCNN(self.config)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        if not load_checkpoint(save_dir, self.session):
            exit()
Example #5
def load_variable_pb():
    session = tf.Session(graph=tf.Graph())
    model_file_path = "pb/model"
    meta_graph = tf.saved_model.loader.load(
        session, [tf.saved_model.tag_constants.SERVING], model_file_path)

    model_graph_signature = list(meta_graph.signature_def.items())[0][1]
    output_feed = []
    output_op_names = []
    output_tensor_dict = {}

    output_op_names.append('y_pred_cls')
    output_op_names.append('y_pred_prob')

    for output_item in model_graph_signature.outputs.items():
        output_op_name = output_item[0]
        output_tensor_name = output_item[1].name
        output_tensor_dict[output_op_name] = output_tensor_name

    for name in output_op_names:
        output_feed.append(output_tensor_dict[name])
        print(output_tensor_dict[name])
    print("load model finish!")

    config = TCNNConfig()
    categories, cat_to_id = read_category()
    word_to_id = read_vocab(vocab_dir)

    while True:

        string = input("Please enter a test sentence: ").strip()

        input_x = [[word_to_id.get(x, word_to_id['<PAD>']) for x in string]]

        input_x = tf.keras.preprocessing.sequence.pad_sequences(
            sequences=input_x, maxlen=config.seq_length)

        inputs = {}
        inputs['input_x'] = input_x
        inputs['keep_prob'] = 1.0

        feed_dict = {}
        for input_item in model_graph_signature.inputs.items():
            input_op_name = input_item[0]
            input_tensor_name = input_item[1].name
            feed_dict[input_tensor_name] = inputs[input_op_name]

        outputs = session.run(output_feed, feed_dict=feed_dict)

        print(categories[outputs[0][0]])

        print(outputs[1][0])
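For context, here is a minimal, assumption-based sketch (not part of the example above) of the export side that would produce a SavedModel with matching signature names; the tensor attributes on `model` (input_x, keep_prob, y_pred_cls, y_pred_prob) are assumptions taken from the other examples in this listing.

# Hypothetical export-side sketch: writes a SavedModel whose signature inputs/outputs
# match the names read back by load_variable_pb() above. The attributes of `model`
# are assumed, not taken from the original source.
import tensorflow as tf

def export_saved_model(session, model, export_dir="pb/model"):
    tf.saved_model.simple_save(
        session,
        export_dir,
        inputs={"input_x": model.input_x, "keep_prob": model.keep_prob},
        outputs={"y_pred_cls": model.y_pred_cls, "y_pred_prob": model.y_pred_prob})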
Example #6
    def __init__(self):
        self.config = TCNNConfig()
        self.words, self.word_to_id = read_vocab(vocab_dir)
        self.config.vocab_size = len(self.words)
        self.config.pre_training = pd.read_csv(word_vector_dir, header=None, index_col=None).values
        self.model = TextCNN(self.config)
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        self.session = tf.Session(config=session_conf)
        self.session.run(tf.global_variables_initializer())

        # self.session.run(tf.initialize_local_variables())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #7
    def __init__(self):
        self.config = TCNNConfig()
        self.categories = categories
        self.cat_to_id = cat_to_id
        self.words = words
        self.word_to_id = word_to_id
        self.config.vocab_size = len(self.words)
        self.model = TextCNN(self.config)

        self.session = sess
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=cnn_save_path)  # restore the saved model
Example #8
 def __init__(self):
     print('Configuring CNN model...')
     if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
         build_vocab(train_dir, vocab_dir, config.vocab_size)
     self.categories, cat_to_id = read_category()
     words, self.word_to_id = read_vocab(vocab_dir)
     self.table = pd.read_excel('predict_check_data.xls')
     category_set = list(set(self.table['name'].tolist()))
     self.config = TCNNConfig(len(list(category_set)))
     self.config.vocab_size = len(words)
     self.model = TextCNN(self.config)
     self.categories = list(set(self.table['name'].tolist()))
     self.categories.sort(key=self.table['name'].tolist().index)
Example #9
 def __init__(self):
     self.map_path = './model/ids.map'
     self.save_path = './model/best_validation'
     with open(self.map_path, "rb") as f:
         self.word_to_id, self.cat_to_id, self.seq_length, self.num_classes = pickle.load(
             f)
     self.id_to_cat = {v: k for k, v in self.cat_to_id.items()}
     self.config = TCNNConfig()
     self.config.num_classes = self.num_classes
     self.config.vocab_size = len(self.word_to_id)
     self.model = TextCNN(self.config)
     self.session = tf.Session()
     self.session.run(tf.global_variables_initializer())
     saver = tf.train.Saver()
     saver.restore(sess=self.session, save_path=self.save_path)  # restore the saved model
Example #10
def model_convert(model_path,pb_path):
    config = TCNNConfig()
    model = TextCNN(config)
    save_path = model_path
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver_1 = tf.train.Saver()
        saver_1.restore(sess=session, save_path=save_path)  # restore the saved model
        print([n.name for n in session.graph.as_graph_def().node])
        frozen_graph_def= tf.graph_util.convert_variables_to_constants(
            session,
            session.graph_def,
            output_node_names=["keep_prob","input_x","score/predict"])

        with open(pb_path, 'wb') as f:
            f.write(frozen_graph_def.SerializeToString())
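As a companion to model_convert(), a hedged sketch (not from the original source) of loading the frozen .pb graph it writes and running one prediction; the tensor names (input_x, keep_prob, score/predict) follow output_node_names above, and the path and sequence length are placeholder assumptions.

# Hedged sketch: load the frozen graph written by model_convert() and run inference.
# pb_path and the assumed sequence length of 600 are placeholders.
import numpy as np
import tensorflow as tf

def predict_from_frozen_graph(pb_path, x_pad):
    graph_def = tf.GraphDef()
    with open(pb_path, "rb") as f:
        graph_def.ParseFromString(f.read())
    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def, name="")
    with tf.Session(graph=graph) as session:
        input_x = graph.get_tensor_by_name("input_x:0")
        keep_prob = graph.get_tensor_by_name("keep_prob:0")
        predict = graph.get_tensor_by_name("score/predict:0")
        return session.run(predict, feed_dict={input_x: x_pad, keep_prob: 1.0})

# usage with a dummy batch of padded word ids (assumed seq_length of 600):
# print(predict_from_frozen_graph("model.pb", np.zeros((1, 600), dtype=np.int32)))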
Example #11
    def __init__(self):
        embedding_model_file = os.path.join(
            'data', 'word_embedding', 'embeddings.bin')
        embedding_model = word2vec.load(embedding_model_file)  # type: word2vec.WordVectors
        self.segor = Train()

        self.config = TCNNConfig()
        self.categories, self.cat_to_id = read_category()
        words = list(embedding_model.vocab)
        self.word_to_id = embedding_model.vocab_hash
        self.config.vocab_size = len(words)
        self.model = TextCNN(self.config, embedding_model)

        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=self.session, save_path=save_path)  # restore the saved model
Example #12
def read_example(filename_queue):
    """Read one example from filename_queue"""
    reader = tf.TFRecordReader()
    config = TCNNConfig()
    key, value = reader.read(filename_queue)
    features = tf.parse_single_example(
        value,
        features={
            "text": tf.VarLenFeature(tf.string),
            "title": tf.FixedLenFeature([config.seq_length], tf.int64),
            "label": tf.FixedLenFeature([config.num_classes], tf.int64)
        })

    text = features["text"]
    title = tf.cast(features["title"], tf.int32)
    label = tf.cast(features["label"], tf.int32)
    return text, title, label
Example #13
    def __init__(self, stopwords_path, vocab_dir, categories_dir, save_path):

        self.thu = thulac.thulac(seg_only=True)
        self.stopwords = [
            line.strip() for line in open(stopwords_path).readlines()
        ]
        categories, cat_to_id = read_category(categories_dir)
        self.id_to_cat = {v: k for k, v in cat_to_id.items()}
        words, self.word_to_id = read_vocab(vocab_dir)
        g = tf.Graph()
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        self.sess = tf.Session(graph=g, config=tf_config)
        with self.sess.as_default():
            with g.as_default():
                self.config = TCNNConfig()
                self.config.num_classes = len(cat_to_id)
                self.config.vocab_size = len(words)
                self.model = TextCNN(self.config)
                saver = tf.train.Saver()
                self.sess.run(tf.global_variables_initializer())
                saver.restore(self.sess, save_path=save_path)
Example #14
  def load_model(self):
    sess = tf.Session()
    print('Configuring CNN model...')
    config = TCNNConfig()
    cnn_model = TextCNN(config)

    saver = tf.train.Saver()
    params_file = tf.train.latest_checkpoint(self.model_dir)
    saver.restore(sess, params_file)

    categories, cat_to_id = read_category()
    vocab_dir = 'cnews/cnews.vocab.txt'
    words, word_to_id = read_vocab(vocab_dir)

    self.words = words
    self.word_to_id = word_to_id
    self.categories = categories
    self.cat_to_id = cat_to_id

    self.cnn_model = cnn_model
    self.sess = sess
    print(self.cnn_model)
    print(self.sess)
Example #15
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()  # load the configuration
    if not os.path.exists(vocab_dir):
        # if the vocabulary does not exist, build it from the vocab_size-1 most frequent words in train_dir
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()  # read the category list and the category-to-id dict
    words, word_to_id = read_vocab(vocab_dir)  # read the vocabulary list and the word-to-id dict
    config.vocab_size = len(words)  # reset the vocabulary size
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    elif sys.argv[1] == 'test':
        test()
    else:
        raise ValueError("""usage: python run_cnn.py [train / test]""")
Example #16
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':

    config = TCNNConfig()  # 1. Load the configuration; instantiating the class on the right yields the config object on the left
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if cnews.vocab.txt does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()  # build the category list
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)
    option = 'test'
    if option == 'train':
        train()
    else:
        test()
Example #17
        for j in range(len(cm[i])):
            ws.write(i,j,cm[i][j])
    wb.save('../data/temp/cm.xls')
    '''
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()

    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
# w2v = get_word_embedding(w2v_path, vocab_dir, config.embedding_dim)
# config.w2v = w2v
# print(w2v)
# print(config.w2v)
    categories, cat_to_id = read_category(categories_dir)
    id_to_cat = {v: k for k, v in cat_to_id.items()}
    words, word_to_id = read_vocab(vocab_dir)
    #print('loading word embedding...')
    #embeddings = get_embeddings('./datasets/w2v.txt',vocab_dir,word_to_id)
    #embeddings = pickle.load(open('./datasets/embeddings.pkl','rb'))
    #config.embedding_dim = len(embeddings[0])
    config.num_classes = len(cat_to_id)
Example #18
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()                       # load the configuration file
    if not os.path.exists(vocab_dir):           # rebuild the vocabulary if it does not exist
        build_vocab_to_words(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()     # get the category labels
    words, word_to_id = read_vocab(vocab_dir)   # read the vocabulary
    config.vocab_size = len(words)              # update the vocabulary size
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #19
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':

    config = TCNNConfig()  # load configuration parameters
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)
    option = 'train'
    if option == 'train':
        train()
    else:
        test()
Example #20
        test_file = open(cnn.test_dir, 'w')
        for test_doc in test_docs:
            temp = test_doc.split(',')
            test_data_X.append(corpus[int(temp[0]) - 1])
            test_data_Y.append(temp[1])
            string = corpus[int(temp[0]) - 1]
            #string = expand_abbr(string)
            str_to_write = string

            test_file.write(temp[1] + '\t' + str_to_write + '\n')
            all_file.write(temp[1] + '\t' + str_to_write + '\n')

        print('Configuring CNN model...')
        test_file.close()
        all_file.close()
        cnn.config = TCNNConfig()
        #if not os.path.exists(cnn.vocab_dir): #if no vocab, build it
        build_vocab_words(cnn.all_dir, cnn.vocab_dir, cnn.config.vocab_size)
        cnn.words, cnn.word_to_id = read_vocab(cnn.vocab_dir)
        cnn.config.vocab_size = len(cnn.words)

        #select a subset of word vectors
        cnn.missing_dir = os.path.join(cnn.base_dir,
                                       key + '.' + sub_key + '.missing.txt')
        missing_words_file = open(cnn.missing_dir, 'w')
        sub_embeddings = np.random.uniform(
            -0.0, 0.0, (cnn.config.vocab_size, embedding_dim))
        count = 0
        for i in range(0, cnn.config.vocab_size):
            if (cnn.words[i] in word_vector_map
                ):  #word_vector_map.has_key(cnn.words[i])
Example #21
    if len(sys.argv) > 3 or len(sys.argv) < 2 or sys.argv[1] not in [
            'train', 'test'
    ]:
        raise ValueError(
            """usage: python run_cnn.py [train / test] [train/test file]""")
    base_dir = 'data/train_test_files'
    train_dir = os.path.join(base_dir, 'qttnews.train.csv')
    dev_sample_percentage = 0.1
    test_dir = os.path.join(base_dir, 'qttnews.test.csv')
    vocab_dir = "data/w2v/qttnews.vocab.txt"
    word_vector_dir = "data/w2v/qttnews.vector.txt"
    save_dir = 'checkpoints/textcnn'
    save_path = os.path.join(save_dir, 'best_validation')  # path for saving the best validation result
    # tf.reset_default_graph()
    print('Configuring CNN model...')
    config = TCNNConfig()

    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        print("no vocabulary file, need to generate it ")
        generate_w2v()
    # categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    # trans vector file to numpy file
    if not os.path.exists(word_vector_dir):
        print("no pretrained w2v exists, generate the w2v")
        generate_w2v()
    else:
        print("load w2v embeddings")
        config.pre_training = pd.read_csv(word_vector_dir,
                                          header=None,
Example #22
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':

    print('Configuring CNN model...')

    if len(sys.argv) == 6 and sys.argv[1] in ['train', 'test']:
        config = TCNNConfig()
        t_name = sys.argv[3]
        t_th = sys.argv[2]
        data_dir = sys.argv[4]
        base_dir = 'data/' + data_dir + '/' + t_name
        classes = sys.argv[5].split('-')

        train_dir = os.path.join(base_dir, 'train.csv')
        test_dir = os.path.join(base_dir, 'test.csv')
        val_dir = os.path.join(base_dir, 'dev.csv')
        vocab_dir = os.path.join('data/data_orginal/' + t_name, 'vocab.csv')

        if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
            print(' vocab_dir not exists: ', vocab_dir)
            build_vocab('data/data_orginal/' + t_name + '/whole.csv',
                        vocab_dir, config.vocab_size)
Example #23
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")
    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    config.pre_training = np.load(pre_training)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #24
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn_c.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        data_loader.build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = data_loader.read_category()
    words, word_to_id = data_loader.read_vocab(vocab_dir)
    config.vocab_size = len(words)
    max_train = data_loader.get_maxlength(train_dir)
    max_val = data_loader.get_maxlength(val_dir)
    # use the maximum sequence length across all splits
    temp_val = max(max_train, max_val)
    # if any split's sequence length exceeds 1014, still cap it at 1014
    print("Maximum length: %i" % temp_val)
    # config.seq_length = min(temp_val, 1500)
    config.seq_length = min(temp_val, 1500)
    # config.seq_length = min(temp_val, 3000)
    # temporarily set to 5000 characters, which covers most samples
Example #25
    f.close()
    return 0


if __name__ == '__main__':

    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test', 'predict']:
        raise ValueError(
            """usage: python run_cnn.py [train / test / predict]""")

    print('Configuring CNN model...')
    table = pd.read_excel('predict_check_data.xls')
    SubCategoryName_list = table['name'].tolist()
    category_set = list(set(SubCategoryName_list))
    category_set.sort(key=SubCategoryName_list.index)
    config = TCNNConfig(len(category_set))
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    #print(config.vocab_size)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    elif sys.argv[1] == 'predict':
        predict()
    else:
        test()
Example #26
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    # the arguments train and test select training or testing
    # run from the command line: python run_cnn.py <train>|<test>
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()  # get the TCNNConfig settings; TCNNConfig holds the CNN configuration parameters
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if missing; it holds the 5000 most frequent words in the training set
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()  # read_category() returns the category list; cat_to_id maps label to index
    words, word_to_id = read_vocab(vocab_dir)  # assign an id to each word in the vocabulary
    config.vocab_size = len(words)  # update the vocabulary size
    model = TextCNN(config)  # build the CNN model

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #27
import os
import tensorflow as tf
import scipy.io as sio
import numpy as np
from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab
from cnn_model import TCNNConfig, TextCNN

base_dir = 'data/cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
save_dir = base_dir

config = TCNNConfig()
if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)


def convert_to_tfrecord(data_dir, save_dir, save_name):
    x_data, y_data = process_file(data_dir, word_to_id, cat_to_id,
                                  config.seq_length)
    # y_data = np.argmax(y_onehot, 1)

    filename = os.path.join(save_dir, save_name + '.tfrecords')
    writer = tf.python_io.TFRecordWriter(filename)
    print('\nTransform start......')
Example #28
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #29
    print("Confusion Matrix...")
    print(y_test_cls)
    print(y_pred_cls)
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    # if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
    #     raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    dataNums = [16, 32, 64, 128, 256]
    for i in dataNums:
        if i == 0:
            continue
        g1 = tf.Graph()
        sess1 = tf.Session(graph=g1)
        with sess1.as_default():
            with g1.as_default():
                model = TextCNN(config, batch_size=i)
                train()
def train(model, data):
    print("Configuring TensorBoard and Saver...")
    # configure TensorBoard; delete the tensorboard folder before retraining, otherwise the graphs will overlap
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # configure the Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # load the training and validation sets
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch at which the last improvement occurred
    require_improvement = 1000  # stop training early if there is no improvement for more than 1000 batches

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # every few batches, write the training results to the TensorBoard scalars
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # every few batches, report performance on the training and validation sets
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # save the best result
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # run the optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # validation accuracy has not improved for a long time; stop training early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # break out of the batch loop
        if flag:  # same as above; stop training early
            break


def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # store the predictions
    for i in range(num_batch):  # process batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    #if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
    #    raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    #if sys.argv[1] == 'train':
    #    train()
    #else:
    #    test()
    train()