def load_train_val_examples(args):
    lines = []
    for guid, text, _, entities in train_data_generator(args.train_file,
                                                        args.eval_file):
        lines.append({'guid': guid, 'text': text, 'entities': entities})

    # Augmented examples may produce duplicated spans, so overlap is
    # disabled whenever augmentation is enabled.
    allow_overlap = args.allow_overlap
    if args.num_augements > 0:
        allow_overlap = False

    train_base_examples = load_ner_labeled_examples(
        lines,
        ner_labels,
        seg_len=args.seg_len,
        seg_backoff=args.seg_backoff,
        num_augements=args.num_augements,
        allow_overlap=allow_overlap)

    train_examples, val_examples = split_train_eval_examples(
        train_base_examples,
        train_rate=args.train_rate,
        fold=args.fold,
        shuffle=True)

    logger.info(f"Loaded {len(train_examples)} train examples, "
                f"{len(val_examples)} val examples.")
    return train_examples, val_examples
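# `LabeledText` is not defined in this section. Below is a minimal sketch,
# reconstructed from how the generators use it (an assumption, not the real
# class): it holds a guid, the raw text, and a list of entity spans, and
# `add_entity(category, start, end)` appends an inclusive character span.
from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class LabeledText:
    guid: str
    text: str
    entities: List[Tuple[str, int, int]] = field(default_factory=list)

    def add_entity(self, category: str, start: int, end: int):
        # start/end are inclusive character offsets into self.text
        self.entities.append((category, start, end))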
def train_data_generator(train_file):
    lines = load_json_file(train_file)
    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)

        # -------------------- Training data JSON format --------------------
        # {
        #     "text": "万通地产设计总监刘克峰;",
        #     "label": {
        #         "name": {"刘克峰": [[8, 10]]},
        #         "company": {"万通地产": [[0, 3]]},
        #         "position": {"设计总监": [[4, 7]]}
        #     }
        # }

        for c, c_labels in x['label'].items():
            # logger.debug(f"c_labels:{c_labels}")
            for label, span in c_labels.items():
                # Only the first span of each mention is used; offsets are
                # inclusive character positions.
                x0, x1 = span[0]
                sl.add_entity(c, x0, x1)

        yield guid, text, None, sl.entities
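# A quick sanity check of the span convention against the JSON sample
# documented above (inline data here; the real loader reads a file via
# load_json_file). Spans are [start, end] with an inclusive end offset:
sample = {
    "text": "万通地产设计总监刘克峰;",
    "label": {
        "name": {"刘克峰": [[8, 10]]},
        "company": {"万通地产": [[0, 3]]},
        "position": {"设计总监": [[4, 7]]},
    },
}
for c, c_labels in sample["label"].items():
    for mention, spans in c_labels.items():
        x0, x1 = spans[0]
        assert sample["text"][x0:x1 + 1] == mention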
def train_data_from_last_generator(train_file):
    # NOTE: the train_file argument is ignored; data comes from the
    # hard-coded file list below.
    train_files = [
        './data/event_element_sampling_0713.json',
    ]
    for train_file in train_files:
        with open(train_file, 'r') as f:
            tagged_train_json_data = json.load(f)

        all_labels = tagged_train_json_data['labelCategories']
        id2label = {x['id']: x['text'] for x in all_labels}

        all_entities = tagged_train_json_data['labels']
        content = tagged_train_json_data['content']

        # re_b = r'(\n[-]+ yanbao\d\d\d\.txt Begin [-]+\n\n)'
        # re_e = r'(\n[-]+ yanbao\d\d\d\.txt End [-]+\n\n)'
        re_b = r'(\n[-]+ \d+ Begin [-]+\n\n)'
        re_e = r'(\n[-]+ \d+ End [-]+\n\n)'

        b_list = [(x.start(), x.end()) for x in re.finditer(re_b, content)]
        e_list = [(x.start(), x.end()) for x in re.finditer(re_e, content)]

        pages = [(x_b[0], x_b[1], x_e[0], x_e[1])
                 for x_b, x_e in zip(b_list, e_list)]
        logger.warning(f"pages: {pages}")

        for i, page in enumerate(pages):
            head_x0, head_x1, tail_x0, tail_x1 = page
            guid = f"{i}"
            text = content[head_x1:tail_x0]
            sl = LabeledText(guid, text)
            for entity in all_entities:
                s = entity['startIndex']
                e = entity['endIndex'] - 1
                assert e >= s
                # Keep only entities that fall entirely inside this page,
                # rebased to page-local offsets.
                if s >= head_x1 and e < tail_x0:
                    sl.add_entity(id2label[entity['categoryId']],
                                  s - head_x1, e - head_x1)
            yield guid, text, None, sl.entities
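# The page-splitting regexes above assume documents are concatenated with
# "----- <id> Begin -----" / "----- <id> End -----" marker lines. A minimal
# synthetic check (hypothetical content, not from the real corpus):
import re

content = ("\n----- 001 Begin -----\n\n"
           "page one text"
           "\n----- 001 End -----\n\n")
re_b = r'(\n[-]+ \d+ Begin [-]+\n\n)'
re_e = r'(\n[-]+ \d+ End [-]+\n\n)'
b = next(re.finditer(re_b, content))
e = next(re.finditer(re_e, content))
assert content[b.end():e.start()] == "page one text"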
def train_data_generator(train_file):
    lines = load_json_file(train_file)
    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['originalText'])
        sl = LabeledText(guid, text)
        for entity in x['entities']:
            start_pos = entity['start_pos']
            end_pos = entity['end_pos'] - 1   # convert to inclusive end offset
            category = entity['label_type']
            sl.add_entity(category, start_pos, end_pos)
        yield guid, text, None, sl.entities
def train_data_generator(train_file):
    # Debug variant: inspects and yields the first record only.
    lines = load_json_file(train_file)
    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)
        for c, c_labels in x['label'].items():
            # logger.debug(f"c_labels:{c_labels}")
            for label, span in c_labels.items():
                x0, x1 = span[0]
                sl.add_entity(c, x0, x1)
        print("index: ", guid, ", text: ", text, ", entities: ", sl.entities)
        yield guid, text, None, sl.entities
        break   # debug: stop after the first record
def load_eval_examples(eval_text_file, eval_bio_file):
    # Relies on module-level `args` and `ner_labels`.
    lines = []
    for guid, text, _, entities in train_data_generator(eval_text_file,
                                                        eval_bio_file):
        lines.append({'guid': guid, 'text': text, 'entities': entities})

    train_base_examples = load_ner_labeled_examples(
        lines,
        ner_labels,
        seg_len=args.seg_len,
        seg_backoff=args.seg_backoff,
        num_augements=0,   # keyword spelling follows load_train_val_examples
        allow_overlap=False)

    eval_examples = train_base_examples
    logger.info(f"Loaded {len(eval_examples)} eval examples")
    return eval_examples
def train_data_generator(train_file):
    data = load_ner_train_data(train_file)
    for i, x in enumerate(tqdm(data)):
        guid = x[0]
        text = clean_text(x[1])
        sl = LabeledText(guid, text)
        for entity in x[2]:
            c = entity[0]
            x0 = int(entity[1])
            x1 = int(entity[2])
            sl.add_entity(c, x0, x1)
        yield guid, text, None, sl.entities
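# `load_ner_train_data` is external; judging from the indexing above it
# returns (guid, text, entities) records where each entity is a
# (category, start, end) triple stored as strings. A hypothetical record
# to illustrate the assumed layout:
record = ("0", "万通地产设计总监刘克峰", [("name", "8", "10")])
guid, text, entities = record[0], record[1], record[2]
for category, x0, x1 in entities:
    print(category, text[int(x0):int(x1) + 1])   # -> name 刘克峰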
def train_data_generator_0(train_file):
    with open(train_file, 'r') as fr:
        lines = fr.readlines()
    for line in tqdm(lines, desc="train & eval"):
        d = json.loads(line)
        guid = d['doc_id']
        text = clean_text(d['content'])
        seg_text = text
        seg_labels = []
        for e in d['events']:
            event_type = e['event_type']
            # Optionally restrict to specific event types, e.g.:
            # if event_type not in ['破产清算']:  # or ['股东减持', '股东增持']
            #     continue
            for k, v in e.items():
                if not v:
                    continue
                if k not in ['event_id', 'event_type']:
                    label = '_'.join((event_type, k))
                    # if label not in ner_labels:
                    #     ner_labels.append(label)
                    # Tag every occurrence of the argument string in the text.
                    i0 = seg_text.find(v)
                    while i0 >= 0:
                        if len(v) == 1:
                            # Single-character arguments are skipped.
                            pass
                        else:
                            seg_labels.append((label, i0, i0 + len(v) - 1))
                        i0 = seg_text.find(v, i0 + len(v))

        sl = LabeledText(guid, text)
        for category, start_char, end_char in seg_labels:
            sl.add_entity(category, start_char, end_char)
        yield guid, text, None, sl.entities
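# The `find` loop above tags every occurrence of an argument string, not
# just the first. The same pattern in isolation (hypothetical text/value):
text, v = "abcXXdefXX", "XX"
spans = []
i0 = text.find(v)
while i0 >= 0:
    spans.append((i0, i0 + len(v) - 1))      # inclusive end offset
    i0 = text.find(v, i0 + len(v))           # resume after this match
assert spans == [(3, 4), (8, 9)]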
def train_data_generator(train_text_file, train_bio_file):
    texts = load_texts(train_text_file)
    cond, labels = load_bioattr_labels(train_bio_file)
    for i, x in enumerate(tqdm(texts)):
        guid = str(i)
        text = clean_text(x)
        sl = LabeledText(guid, text)
        for entity in labels[i]:
            c = entity[0]
            x0 = int(entity[1])
            x1 = int(entity[2])
            sl.add_entity(c, x0, x1)
        yield guid, text, None, sl.entities
def train_data_generator(train_file):
    data, _ = load_data(train_file)
    for i, x in enumerate(tqdm(data)):
        guid = x[1]
        text = clean_text(x[0])
        level1 = x[2]
        level2 = x[3]
        level3 = x[4]
        sl = LabeledText(guid, text)
        for entity in x[5]:
            # Category combines the three taxonomy levels with the entity tag.
            c = level1 + "_" + level2 + "_" + level3 + "_" + entity[2]
            x0 = int(entity[0])
            x1 = int(entity[1])
            sl.add_entity(c, x0, x1)
        # Yield the per-record guid, matching the guid stored in `sl`.
        yield guid, text, None, sl.entities
def train_data_generator(train_file):
    data = load_data(train_file)
    for i, x in enumerate(tqdm(data)):
        guid = str(i)
        text = clean_text(x[0])
        arguments = x[1]
        sl = LabeledText(guid, text)
        for key, value in arguments.items():
            argument = key
            event_type = value[0]
            role = value[1]
            start_index = int(value[2])
            c = event_type + "_" + role
            x0 = start_index
            x1 = start_index + len(argument) - 1   # inclusive end offset
            sl.add_entity(c, x0, x1)
        yield guid, text, None, sl.entities
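# The `arguments` dict above maps each argument string to a
# (event_type, role, start_index) tuple; the span end is derived from the
# argument length. A hypothetical record to illustrate the offset arithmetic
# (placeholder labels, not from the real dataset):
arguments = {"Alice": ("hire", "person", 4)}
for argument, (event_type, role, start_index) in arguments.items():
    c = event_type + "_" + role
    x0 = int(start_index)
    x1 = x0 + len(argument) - 1              # inclusive end offset
    print(c, x0, x1)                         # -> hire_person 4 8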