Example #1
0
    # Confusion matrix over the test set predictions
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    # Total wall-clock time since start_time (set earlier in this function)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    # 输入参数 train 和 test 表示训练与测试
    # 需要在命令行运行 python run_cnn.py <train>|<test>
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()  # 获得TCNNConfig设置,TCNNConfig表示CNN配置参数
    if not os.path.exists(
            vocab_dir):  # 如果不存在词汇表,重建 单词表长度5000,是train里面出现最频繁的5000个单词
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category(
    )  # read_category()获取目录,cat_to_id 标签:序号的字典
    words, word_to_id = read_vocab(vocab_dir)  # 将词汇表的各个单词编号
    config.vocab_size = len(words)  # 更新词汇表长度
    model = TextCNN(config)  # 构建CNN模型,很重要

    if sys.argv[1] == 'train':
        train()
    else:
        test()
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Evaluation: per-class precision / recall / F1 on the test set
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix of predicted vs. true class ids
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    # Total wall-clock time since start_time (set earlier in this function)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    # Validate the command line up front: the script requires exactly one
    # argument, 'train' or 'test'. Without this guard, invoking the script
    # with no arguments crashes later with an opaque IndexError on
    # sys.argv[1]; with it, the user gets a clear usage message.
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()  # CNN hyper-parameter container
    # Rebuild the vocabulary from the training data if it is missing.
    if not os.path.exists(vocab_dir):
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()   # label names and label->id map
    words, word_to_id = read_vocab(vocab_dir)  # word list and word->id map
    config.vocab_size = len(words)  # sync config with the actual vocab size
    model = TextCNN(config)  # construct the CNN model graph

    if sys.argv[1] == 'train':
        train()
    else:
        test()
Example #3
0
        # Look up a pretrained vector for every vocabulary word; words not
        # present in word_vector_map are logged to missing_words_file and
        # counted in `count` (the `count = count` branch is a no-op kept
        # for symmetry with the else branch).
        for i in range(0, cnn.config.vocab_size):
            if(cnn.words[i] in word_vector_map): # word_vector_map.has_key(cnn.words[i])
                count = count
                sub_embeddings[i]= word_vector_map.get(cnn.words[i])
            else:
                count = count + 1
                missing_words_file.write(cnn.words[i]+'\n')

        # Fraction of vocabulary words that have no pretrained embedding.
        print('no embedding: ' + str(1.0 * count/len(cnn.words)))
        print(str(len(sub_embeddings)) + '\t' + str(len(sub_embeddings[0])))
        missing_words_file.close()

        print(sub_embeddings[0])
        # Install the assembled embedding matrix and (re)build the model.
        cnn.embedding_matrix = sub_embeddings
        #print(cnn.embedding_matrix.shape)
        cnn.model = TextCNN(cnn.config)

        cnn.train()
        predict_y = cnn.test()  #predicting results
        print(predict_y)
        print(len(predict_y))
        print(len(test_data_Y))

        # Clear the default TF graph so the next run/fold starts fresh
        # (TF1-era API; removed in TF2's eager mode — TODO confirm version).
        tf.reset_default_graph() 

        # Count correct predictions; NOTE(review): the loop also builds an
        # XML <doc> node per test document — its body continues past this
        # excerpt, so the accounting here is incomplete from this view.
        correct_count = 0
        for i in range(len(test_data_Y)):
            if cnn.id_to_cat[predict_y[i]] == test_data_Y[i]:
                correct_count += 1
            doc_node = doc.createElement("doc")
            doc_node.setAttribute("id", test_docs[i].split(',')[0])