import numpy as np
from itertools import chain

add_feature = False            # whether to add feature-name embeddings
add_keyword_attention = False  # whether to add keyword attention

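# Placeholder hyperparameters (assumed values; adjust for your data)
filter_num = 5      # assumed: minimum word frequency kept when building the vocabulary
keyword_num = 10    # assumed: number of keywords to extract
seq_length = 200    # assumed: length to which id sequences are padded or truncated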
print('loading data ...')
text_data = LoadData('diag_code_data.csv', filter_num, add_feature)
word_vocab, feature_vocab = text_data.build_vocab()
vocab_size = len(word_vocab)
print('the vocabulary size is {}'.format(vocab_size))
label = text_data.label

features = text_data.feature_names
num_features = len(features)
num_classes = len(set(label))
print('num_classes = {}, num_features = {}'.format(num_classes, num_features))

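# Convert each document into a sequence of word ids (plus feature ids when enabled)
# and report basic sequence-length statistics.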
sentences = text_data.creat_id_sentences(word_vocab, feature_vocab)
sentences_length = [len(sentence) for sentence in sentences]
mean_seq_length = np.mean(sentences_length)
max_seq_length = np.max(sentences_length)
print('mean_seq_length = {}, max_seq_length = {}'.format(
    mean_seq_length, max_seq_length))

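# With keyword attention enabled, extract the top keywords and flatten them
# (list(chain(*keywords))) into a single list of vocabulary ids for the attention layer.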
if add_keyword_attention:
    keywords = text_data.extrac_keywords(keyword_num)
    keywords_id = [word_vocab[word] for word in list(chain(*keywords))]
else:
    keywords_id = None

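# Split the id sequences into train / test sets, padding or truncating each to seq_length,
# and return the corresponding masks for each split (mask_train / mask_test).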
train_x, train_y, test_x, test_y, mask_train, mask_test = text_data.data_split(
    text_data=sentences,
    seq_length=seq_length,