def get_dic(path, t2i_path, l2i_path):
    """Build token->id and label->id mappings from a JSON-lines corpus.

    Reads the records at ``path`` with ``read_jsonline``, collects the
    distinct values found under each record's ``'word'`` and ``'tag'`` keys,
    and writes the two resulting mappings with ``write_json``.

    Ids are handed out in sorted order (by string form) so that the written
    files are identical from run to run. Enumerating the raw sets would make
    the ids depend on per-process string hash randomization.

    Args:
        path: Location of the JSON-lines corpus to read.
        t2i_path: Destination of the token-to-id mapping.
        l2i_path: Destination of the label-to-id mapping.

    Returns:
        None. The mappings are written out, not returned.
    """
    sentences = read_jsonline(path)

    # Token ids start at 1; id 0 is reserved for the "unk" entry below.
    words = {word for sentence in sentences for word in sentence['word']}
    token_dic = {
        str(word): index
        for index, word in enumerate(sorted(words, key=str), start=1)
    }
    # Assigned last on purpose: a literal "unk" token in the corpus also
    # ends up mapped to 0, as before.
    token_dic["unk"] = 0

    # Label ids start at 0 and are likewise assigned in sorted order.
    labels = {tag for sentence in sentences for tag in sentence['tag']}
    label2id = {
        str(tag): index
        for index, tag in enumerate(sorted(labels, key=str))
    }

    write_json(t2i_path, token_dic)
    write_json(l2i_path, label2id)
def sentences_len(path):
    """Print and plot the distribution of per-record ``'word'`` lengths.

    Loads the records at ``path`` with ``read_jsonline``, measures the length
    of each record's ``'word'`` entry, prints the list of lengths, and shows
    a 40-bin histogram of them.

    Args:
        path: Location of the JSON-lines file to read.

    Returns:
        None.
    """
    records = read_jsonline(path)

    lengths = []
    for record in records:
        lengths.append(len(record['word']))
    print(lengths)

    # NOTE(review): ``plt`` is presumably matplotlib.pyplot, imported at file
    # level outside this block -- confirm.
    hist_style = {
        "bins": 40,
        "facecolor": "blue",
        "edgecolor": "black",
        "alpha": 0.7,
    }
    plt.hist(lengths, **hist_style)

    # Label the horizontal axis.
    plt.xlabel("length")
    # Label the vertical axis.
    plt.ylabel("nums")
    # Set the figure title.
    plt.title("statistic")

    plt.show()
self.model.fit_generator(generator.__iter__(), steps_per_epoch=10000, epochs=3, callbacks=callbacks_list, validation_data=self.v_generator.__iter__(), nb_val_samples=200) def predict(self): pass if __name__ == '__main__': ROOT_PATH = '/Users/ouhon/PycharmProjects/keras_nlp_tutorial/NER/' path = ROOT_PATH + 'CCKS_2017/data/raw_data/data.jsonl' data = read_jsonline(path) tag2i_dict = {'O': 0, 'B-TREATMENT': 1, 'I-TREATMENT': 2, 'B-BODY': 3, 'I-BODY': 4, 'B-SIGNS': 5, 'I-SIGNS': 6, 'B-CHECK': 7, 'I-CHECK': 8, 'B-DISEASE': 9, 'I-DISEASE': 10} i2tag_dict = {str(v): str(i) for i, v in tag2i_dict.items()} max_len = max([len(i['content']) for i in data])
# -*- coding: utf-8 -*-
# @Time : 2020/6/5 2:24 PM
# @Author : Benqi
#
# Keep only the records whose 'tag' entry holds more than one distinct value,
# write them to a second JSON-lines file, and print the record counts before
# and after filtering.
from tools import read_jsonline, write_jsonline

SOURCE_PATH = '/Users/ouhon/PycharmProjects/keras_nlp_tutorial/NER/CCKS_2017/data/raw_data/data.jsonl'
TARGET_PATH = "/Users/ouhon/PycharmProjects/keras_nlp_tutorial/NER/CCKS_2017/data/raw_data/data2.jsonl"

records = read_jsonline(SOURCE_PATH)
kept = []

for record in records:
    # Records with at most one distinct tag value are dropped.
    if len(set(record['tag'])) <= 1:
        continue
    kept.append(record)
    # NOTE(review): the source's indentation was lost; this print is assumed
    # to run only for kept records -- confirm against the original file.
    print(record)

write_jsonline(TARGET_PATH, kept)

# Totals: all records read, then records retained.
print(len(records))
print(len(kept))