# Experiment / model hyper-parameters.
keyword_num = 10  # number of keywords (feature words) per sample
test_rate = 0.1  # fraction of the data held out as the test set
hidden_dim = 200  # LSTM hidden-layer size (output dimension)
word_emb_dim = 100  # word-embedding dimension
feature_emb_dim = 50  # feature-name embedding dimension
keep_prob = 0.8  # dropout keep probability
num_layers = 1  # number of stacked LSTM layers
batch_size = 200  # samples per mini-batch
learning_rate = 0.0001  # optimizer learning rate
num_epochs = 10  # number of passes over the training data
add_feature = False  # whether to add feature-name embeddings
add_keyword_attention = False  # whether to add keyword attention

# --- Load the corpus, build vocabularies, and summarize the dataset ---
print('loading data ...')
# NOTE(review): `filter_num` is not defined in the config block above —
# presumably it is defined elsewhere in this file; confirm it is not a
# typo for `keyword_num`.
text_data = LoadData('diag_code_data.csv', filter_num, add_feature)

# Vocabularies for words and feature names, built from the loaded corpus.
word_vocab, feature_vocab = text_data.build_vocab()
vocab_size = len(word_vocab)
print('the vocabulary size is {}'.format(vocab_size))

label = text_data.label
features = text_data.feature_names
num_classes = len(set(label))
num_features = len(features)
print('num_classes = {}, num_features = {}'.format(num_classes, num_features))

# Map each sentence to token ids, then report sequence-length statistics.
sentences = text_data.creat_id_sentences(word_vocab, feature_vocab)
sentences_length = list(map(len, sentences))
mean_seq_length = np.mean(sentences_length)
max_seq_length = np.max(sentences_length)
print('mean_seq_length = {}, max_seq_length = {}'.format(
    mean_seq_length, max_seq_length))