print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

# Confusion matrix
print('Confusion Matrix...')
cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
print(cm)

time_dif = get_time_dif(start_time)
print('Time usage:', time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError('usage: python run_rnn.py [train / test]')

    print('Configuring RNN model...')
    config = TRNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextRNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
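# Both run scripts call get_time_dif(start_time) without defining it here.
# A minimal sketch, assuming the helper only formats elapsed wall-clock time
# (the project's actual implementation may differ):
import time
from datetime import timedelta

def get_time_dif(start_time):
    """Return the time elapsed since start_time as a rounded timedelta."""
    end_time = time.time()
    return timedelta(seconds=int(round(end_time - start_time)))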
print("Confusion Matrix...") cm = metrics.confusion_matrix(y_test_cls, y_pred_cls) print(cm) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) if __name__ == '__main__': if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']: raise ValueError("""usage: python run_cnn.py [train / test]""") print('Configuring CNN model...') config = TCNNConfig() if not os.path.exists(vocab_dir_c): # 如果不存在字表,重建 data_loader.build_vocab(train_dir, vocab_dir_c, config.vocab_size_c) if not os.path.exists(vocab_dir_w): # 如果不存在词汇表,重建 data_loader_wordlevel.build_vocab(train_dir, vocab_dir_w, config.vocab_size_w) categories, cat_to_id = data_loader.read_category() characters, character_to_id = data_loader.read_vocab(vocab_dir_c) words, word_to_id = data_loader_wordlevel.read_vocab(vocab_dir_w) config.vocab_size_c = len(characters) config.vocab_size_w = len(words) # max_train = data_loader.get_maxlength(train_dir) # max_val = data_loader.get_maxlength(val_dir) # # 用所有集合中最大的序列长度 # temp_val = max(max_train, max_val) # # 如果有的集合中序列长度超过了1014,就还是用1014吧 # print("最长长度: %i" % temp_val) # # config.seq_length = min(temp_val, 1500)
def init_vocab(config):
    if not os.path.exists(config.vocab_dir):  # rebuild the vocabulary if it does not exist
        print('build vocabulary')
        build_vocab(config.train_dir, config.vocab_dir, config.vocab_size)
    config.words, config.word_to_id = read_vocab(config.vocab_dir)
    config.vocab_size = len(config.words)
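# Example use of init_vocab, assuming the config object carries the vocabulary
# paths and size (the paths below are illustrative, not taken from the project):
config = TRNNConfig()
config.train_dir = 'data/train.txt'
config.vocab_dir = 'data/vocab.txt'
config.vocab_size = 5000
init_vocab(config)  # builds data/vocab.txt on the first run, then loads it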
# save_file_stratified('data/qwdata/shuffle-try3/classified_table_ms.txt',
#                      'data/qwdata/ms-ygscplusssdqw-clean', categories)

# print(len(open('data/betadata2-711depart/train.txt', 'r', encoding='utf-8').readlines()))
# print(len(open('data/betadata2-711depart/val.txt', 'r', encoding='utf-8').readlines()))
# print(len(open('data/betadata2-711depart/test.txt', 'r', encoding='utf-8').readlines()))

# write_files_together(ms_file_path='data\\qwdata\\ms-ssd',
#                      ms_label='data\\qwdata\\classifier_ms.csv',
#                      xs_file_path='data\\qwdata\\new-xs-zkss-ajjbkq',
#                      xs_label='data\\qwdata\\classifier_xs.csv')

config = TCNNConfig()
config.vocab_size_w = 200000
config.vocab_size_c = 200000
if not os.path.exists(vocab_dir_c):  # rebuild the character vocabulary if it does not exist
    data_loader.build_vocab(all_data_dir, vocab_dir_c, config.vocab_size_c)
if not os.path.exists(vocab_dir_w):  # rebuild the word vocabulary if it does not exist
    data_loader_wordlevel.build_vocab(all_data_dir, vocab_dir_w, config.vocab_size_w)

# categories, cat_to_id = data_loader.read_category()
# characters, character_to_id = data_loader.read_vocab(vocab_dir_c)
# words, word_to_id = data_loader_wordlevel.read_vocab(vocab_dir_w)
# config.vocab_size_c = len(characters)
# config.vocab_size_w = len(words)
#
# config.seq_length_c = 1500
# config.seq_length_w = 800

# data_convert()
# stratified_cross('data\\10-fold-original-data\\data-convert',
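# stratified_cross above (the call is truncated in the source) points at a
# stratified k-fold split over the converted data. A minimal sketch with
# scikit-learn, assuming samples and labels are parallel sequences (the
# function name and signature here are illustrative, not the project's API):
from sklearn.model_selection import StratifiedKFold

def stratified_folds(samples, labels, n_splits=10, seed=42):
    """Yield (train_idx, test_idx) pairs that preserve the label distribution."""
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_idx, test_idx in skf.split(samples, labels):
        yield train_idx, test_idx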