def train_data_generator(train_file):
    lines = load_json_file(train_file)
    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)

        # -------------------- Training data JSON format --------------------
        # {
        #     "text": "万通地产设计总监刘克峰;",
        #     "label": {
        #         "name": {"刘克峰": [[8, 10]]},
        #         "company": {"万通地产": [[0, 3]]},
        #         "position": {"设计总监": [[4, 7]]}
        #     }
        # }

        for c in x['label'].keys():
            c_labels = x['label'][c]
            # logger.debug(f"c_labels: {c_labels}")
            for label, span in c_labels.items():
                # span is a list of [start, end] pairs (inclusive end);
                # only the first occurrence is used here.
                x0, x1 = span[0]
                sl.add_entity(c, x0, x1)

        yield guid, text, None, sl.entities
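# The generators in this section lean on helpers from the surrounding
# project (`clean_text`, `LabeledText`). As a minimal, hypothetical sketch of
# what those helpers need to provide for the generators to run standalone --
# the real project classes likely do more (validation, tagging schemes,
# serialization) -- something like this would suffice:

from dataclasses import dataclass, field
from typing import List, Tuple

def clean_text(text: str) -> str:
    # Hypothetical stand-in: strip surrounding whitespace; the real helper
    # may also normalize unicode and drop control characters.
    return text.strip() if text else ""

@dataclass
class LabeledText:
    # Minimal stand-in for the project's LabeledText container: it only
    # needs to collect (category, start, end) entity triples.
    guid: str
    text: str
    entities: List[Tuple[str, int, int]] = field(default_factory=list)

    def add_entity(self, category: str, start: int, end: int):
        # start/end are inclusive character offsets into self.text.
        self.entities.append((category, start, end))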
def test_data_generator(test_file):
    lines = load_json_file(test_file)
    for i, s in enumerate(tqdm(lines)):
        guid = str(i)
        text_a = clean_text(s['originalText'])
        yield guid, text_a, None, None
def test_data_generator(test_file):
    test_data = load_json_file(test_file)
    for i, json_data in enumerate(tqdm(test_data, desc="test")):
        guid = str(json_data['id'])
        text = clean_text(json_data['sentence'])
        yield guid, text, None, None
def eval_data_generator(eval_file):
    eval_data = load_json_file(eval_file)
    for i, json_data in enumerate(tqdm(eval_data, desc="eval")):
        guid = str(i)
        text = clean_text(json_data['sentence'])
        label = json_data['label_desc']
        yield guid, text, None, label
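# A quick smoke test for the glue-style generators is to peek at the first
# couple of examples and verify the field mapping before wiring them into
# training. The paths match the data-exploration section below; the snippet
# itself is an illustrative sketch, not part of theta's API.

from itertools import islice

for example in islice(eval_data_generator('./data/rawdata/dev.json'), 2):
    print(example)  # (guid, text, None, label_desc)

for example in islice(test_data_generator('./data/rawdata/test.json'), 2):
    print(example)  # (guid, text, None, None)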
def train_data_generator(train_file):
    lines = load_json_file(train_file)
    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['originalText'])
        sl = LabeledText(guid, text)

        for entity in x['entities']:
            start_pos = entity['start_pos']
            # end_pos in the raw data is exclusive; convert it to the
            # inclusive offset that add_entity expects.
            end_pos = entity['end_pos'] - 1
            category = entity['label_type']
            sl.add_entity(category, start_pos, end_pos)

        yield guid, text, None, sl.entities
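# The off-by-one conversion above matters: with an exclusive end_pos, an
# entity covering characters 0..3 is stored as [0, 4) in the raw file. A tiny
# check with a made-up record (the text and offsets are illustrative, not
# taken from the real dataset):

text = "万通地产设计总监刘克峰;"
entity = {'start_pos': 0, 'end_pos': 4, 'label_type': 'company'}  # exclusive end

start, end = entity['start_pos'], entity['end_pos'] - 1  # inclusive end = 3
assert text[start:end + 1] == "万通地产"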
def train_data_generator(train_file):
    # Debug variant: parses and prints only the first labeled sample, then
    # stops, to sanity-check the annotation parsing before a full run.
    lines = load_json_file(train_file)
    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)

        for c in x['label'].keys():
            c_labels = x['label'][c]
            # logger.debug(f"c_labels: {c_labels}")
            for label, span in c_labels.items():
                x0, x1 = span[0]
                sl.add_entity(c, x0, x1)

        print("index:", guid, ", text:", text, ", entities:", sl.entities)
        break
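# Because this variant has no yield, it is a plain function rather than a
# generator: calling it runs the loop immediately. One call is enough to
# eyeball the first parsed sample (path as in the data section below):

train_data_generator('./data/rawdata/train.json')
# prints the first sample's index, cleaned text, and parsed entities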
import numpy as np
from collections import Counter
from loguru import logger  # `logger` is used below; loguru is assumed here

from theta.modeling import load_glue_examples
from theta.modeling.glue import GlueTrainer, load_model, get_args
from theta.utils import load_json_file

# -------------------- Data --------------------

# ## 1. Data exploration
train_file = './data/rawdata/train.json'
test_file = './data/rawdata/test.json'
eval_file = './data/rawdata/dev.json'
labels_file = './data/rawdata/labels.json'

# ### 1.1 Label distribution
train_data = load_json_file(train_file)
test_data = load_json_file(test_file)
eval_data = load_json_file(eval_file)
all_data = train_data + eval_data

descs = [x['label_desc'] for x in all_data]
logger.debug(f"{Counter(descs)}")

# ### 1.2 Text length distribution
lengths = [len(x['sentence']) for x in all_data]
logger.info(f"***** Text Lengths *****")
logger.info(f"mean: {np.mean(lengths):.2f}")
logger.info(f"std: {np.std(lengths):.2f}")
logger.info(f"max: {np.max(lengths)}")
logger.info(f"min: {np.min(lengths)}")
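# The length statistics feed directly into choosing a maximum sequence
# length: a common rule of thumb is to cover the bulk of samples rather than
# the longest one. A hedged sketch -- the 99th-percentile cutoff and the
# `max_seq_length` name are illustrative choices, not part of theta's API:

# Padding every batch to the longest text wastes compute for a tiny
# fraction of samples; cover ~99% of them instead.
p99 = int(np.percentile(lengths, 99))
max_seq_length = min(p99 + 2, 512)  # +2 leaves room for [CLS]/[SEP]
logger.info(f"p99: {p99}, suggested max_seq_length: {max_seq_length}")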