Ejemplo n.º 1
0
def train_data_generator(train_file):
    """Yield one labeled NER sample per record of a JSON training file.

    Each record is expected to have this shape (offsets are character
    indices and, judging by the sample, the end offset is inclusive)::

        {
            "text": "万通地产设计总监刘克峰;",
            "label": {
                "name": {"刘克峰": [[8, 10]]},
                "company": {"万通地产": [[0, 3]]},
                "position": {"设计总监": [[4, 7]]}
            }
        }

    Args:
        train_file: Path handed to ``load_json_file``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the record's
        position in the file as a string, ``text`` is the output of
        ``clean_text`` and ``entities`` is ``LabeledText.entities`` after
        every span of the record has been added.
    """
    lines = load_json_file(train_file)

    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        # NOTE(review): offsets are applied to the cleaned text; this assumes
        # clean_text does not shift character positions - confirm.
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)

        for category, mentions in x['label'].items():
            for spans in mentions.values():
                # The same entity string can occur several times in a text,
                # so register every span instead of only the first one.
                for x0, x1 in spans:
                    sl.add_entity(category, x0, x1)

        yield guid, text, None, sl.entities
Ejemplo n.º 2
0
def load_train_val_examples(args):
    """Load the labeled data and split it into train and validation sets.

    Reads ``train_file``, ``eval_file``, ``allow_overlap``,
    ``num_augements``, ``seg_len``, ``seg_backoff``, ``train_rate`` and
    ``fold`` from ``args``; the label list comes from the free variable
    ``ner_labels``.

    Args:
        args: Namespace-like object carrying the attributes listed above.

    Returns:
        ``(train_examples, val_examples)`` as produced by
        ``split_train_eval_examples``.
    """
    lines = []
    samples = train_data_generator(args.train_file, args.eval_file)
    for guid, text, _, entities in samples:
        # Result is unused; the construction is kept in case the constructor
        # validates its input - TODO confirm whether it can be dropped.
        LabeledText(guid, text, entities)
        lines.append(dict(guid=guid, text=text, entities=entities))

    # Overlapping entities are switched off whenever augmentation is active.
    requested_overlap = args.allow_overlap
    augmenting = args.num_augements > 0
    allow_overlap = False if augmenting else requested_overlap

    train_base_examples = load_ner_labeled_examples(
        lines,
        ner_labels,
        seg_len=args.seg_len,
        seg_backoff=args.seg_backoff,
        num_augements=args.num_augements,
        allow_overlap=allow_overlap,
    )

    train_examples, val_examples = split_train_eval_examples(
        train_base_examples,
        train_rate=args.train_rate,
        fold=args.fold,
        shuffle=True,
    )

    logger.info(f"Loaded {len(train_examples)} train examples, "
                f"{len(val_examples)} val examples.")
    return train_examples, val_examples
Ejemplo n.º 3
0
def train_data_from_last_generator(train_file):
    """Yield labeled samples from annotation-tool JSON exports, one per page.

    Each export holds one long ``content`` string in which individual
    documents ("pages") are wrapped in ``--- <digits> Begin ---`` /
    ``--- <digits> End ---`` marker lines, plus a flat ``labels`` list whose
    offsets refer to ``content``. The content is cut into pages and every
    label lying fully inside a page is re-based to page-local offsets.

    Args:
        train_file: Currently ignored; the input paths are hard-coded in
            ``train_files`` below.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the page index
        within its file as a string, ``text`` is the page body and
        ``entities`` is ``LabeledText.entities`` for that page.
    """
    # NOTE(review): the ``train_file`` argument is not consulted; confirm
    # whether the hard-coded list is intentional.
    train_files = [
        './data/event_element_sampling_0713.json',
    ]

    # Raw strings keep the pattern text identical while avoiding the invalid
    # ``\d`` escape of a plain string literal.
    re_b = r'(\n[-]+ [\d]+ Begin [-]+\n\n)'
    re_e = r'(\n[-]+ [\d]+ End [-]+\n\n)'

    for path in train_files:
        # Close the file deterministically instead of leaking the handle.
        with open(path, 'r') as fr:
            tagged_train_json_data = json.load(fr)

        all_labels = tagged_train_json_data['labelCategories']
        id2label = {x['id']: x['text'] for x in all_labels}

        all_entities = tagged_train_json_data['labels']

        content = tagged_train_json_data['content']

        b_list = [(m.start(), m.end()) for m in re.finditer(re_b, content)]
        e_list = [(m.start(), m.end()) for m in re.finditer(re_e, content)]

        # Begin and End markers are paired by order of appearance.
        pages = [(x_b[0], x_b[1], x_e[0], x_e[1])
                 for x_b, x_e in zip(b_list, e_list)]

        logger.warning(f"pages: {pages}")

        for i, page in enumerate(pages):
            # The page body lies between the end of the Begin marker and the
            # start of the End marker.
            _, head_x1, tail_x0, _ = page

            guid = f"{i}"
            text = content[head_x1:tail_x0]
            sl = LabeledText(guid, text)

            for entity in all_entities:
                s = entity['startIndex']
                # ``endIndex`` has one subtracted to obtain the last
                # character's index (inclusive end).
                e = entity['endIndex'] - 1
                assert e >= s
                if s >= head_x1 and e < tail_x0:
                    sl.add_entity(id2label[entity['categoryId']], s - head_x1,
                                  e - head_x1)
            yield guid, text, None, sl.entities
Ejemplo n.º 4
0
def train_data_generator(train_file):
    """Yield labeled samples from records with ``originalText``/``entities``.

    Every record supplies its text under ``originalText`` and a list of
    entity dicts under ``entities``, each with ``start_pos``, ``end_pos``
    and ``label_type``.

    Args:
        train_file: Path handed to ``load_json_file``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the record index
        as a string and ``entities`` is ``LabeledText.entities``.
    """
    records = load_json_file(train_file)

    for idx, record in enumerate(tqdm(records)):
        guid = str(idx)
        text = clean_text(record['originalText'])
        labeled = LabeledText(guid, text)

        for ent in record['entities']:
            # ``end_pos`` is reduced by one before it is handed to
            # ``add_entity``.
            first, last, kind = (ent['start_pos'], ent['end_pos'] - 1,
                                 ent['label_type'])
            labeled.add_entity(kind, first, last)

        yield guid, text, None, labeled.entities
Ejemplo n.º 5
0
def train_data_generator(train_file):
    """Print the first record of a JSON training file and stop.

    Debug preview: despite its name this function does not yield anything
    and returns ``None``. Only the first record is loaded into a
    ``LabeledText`` and printed.

    Each record is read as ``{"text": ..., "label": {category:
    {entity_text: [[start, end], ...]}}}``.

    Args:
        train_file: Path handed to ``load_json_file``.
    """
    lines = load_json_file(train_file)

    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)
        for category, mentions in x['label'].items():
            for spans in mentions.values():
                # Register every listed span of an entity string, not just
                # the first one.
                for x0, x1 in spans:
                    sl.add_entity(category, x0, x1)
        print("index: ", guid, ", text: ", text, ", entities: ", sl.entities)
        # Only the first record is inspected.
        break
Ejemplo n.º 6
0
def train_data_generator(train_file):
    """Yield labeled samples from ``load_ner_train_data`` output.

    Each sample is indexed as ``(guid, text, entities)`` and each entity as
    ``(category, start, end)``; the offsets are converted with ``int``.

    Args:
        train_file: Path handed to ``load_ner_train_data``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the sample's own
        id and ``entities`` is ``LabeledText.entities``.
    """
    samples = load_ner_train_data(train_file)

    for sample in tqdm(samples):
        guid = sample[0]
        text = clean_text(sample[1])
        labeled = LabeledText(guid, text)

        for ent in sample[2]:
            kind, first, last = ent[0], int(ent[1]), int(ent[2])
            labeled.add_entity(kind, first, last)

        yield guid, text, None, labeled.entities
Ejemplo n.º 7
0
def train_data_generator_0(train_file):
    """Yield labeled samples from a JSON-lines file of event annotations.

    Each line is a JSON object with ``doc_id``, ``content`` and ``events``.
    For every event, each non-empty field other than ``event_id`` and
    ``event_type`` is searched for in the cleaned text. Every occurrence
    found (the search resumes right after the previous match) becomes an
    entity labelled ``"<event_type>_<field>"`` with an inclusive end offset.
    Single-character values are located but not recorded.

    Args:
        train_file: Path of the JSON-lines file.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the ``doc_id``
        and ``entities`` is ``LabeledText.entities``.
    """
    skipped_fields = ('event_id', 'event_type')

    with open(train_file, 'r') as fr:
        raw_lines = fr.readlines()
        for raw in tqdm(raw_lines, desc="train & eval"):
            doc = json.loads(raw)
            guid = doc['doc_id']
            text = clean_text(doc['content'])

            found = []
            for event in doc['events']:
                event_type = event['event_type']
                for field, value in event.items():
                    if not value or field in skipped_fields:
                        continue

                    label = '_'.join((event_type, field))

                    pos = text.find(value)
                    while pos >= 0:
                        width = len(value)
                        # One-character values are deliberately not labelled.
                        if width != 1:
                            found.append((label, pos, pos + width - 1))
                        pos = text.find(value, pos + width)

            labeled = LabeledText(guid, text)
            for category, start_char, end_char in found:
                labeled.add_entity(category, start_char, end_char)

            yield guid, text, None, labeled.entities
Ejemplo n.º 8
0
def train_data_generator(train_text_file, train_bio_file):
    """Yield labeled samples from a text file and a parallel label file.

    Texts come from ``load_texts`` and labels from the second value returned
    by ``load_bioattr_labels``; the labels for text ``i`` are ``labels[i]``.
    Each entity is indexed as ``(category, start, end)`` and the offsets are
    converted with ``int``.

    Args:
        train_text_file: Path handed to ``load_texts``.
        train_bio_file: Path handed to ``load_bioattr_labels``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the text index as
        a string and ``entities`` is ``LabeledText.entities``.
    """
    texts = load_texts(train_text_file)
    # The first returned value is not needed here.
    _, labels = load_bioattr_labels(train_bio_file)

    for idx, raw_text in enumerate(tqdm(texts)):
        guid = str(idx)
        text = clean_text(raw_text)
        labeled = LabeledText(guid, text)

        for ent in labels[idx]:
            kind, first, last = ent[0], int(ent[1]), int(ent[2])
            labeled.add_entity(kind, first, last)

        yield guid, text, None, labeled.entities
Ejemplo n.º 9
0
def train_data_generator(train_file):
    """Yield labeled samples whose categories carry a three-level prefix.

    Uses the first value returned by ``load_data``. Each row is indexed as
    ``(text, guid, level1, level2, level3, entities)`` and each entity as
    ``(start, end, type)``. The entity category is
    ``"<level1>_<level2>_<level3>_<type>"``.

    Args:
        train_file: Path handed to ``load_data``.

    Yields:
        ``(index, text, None, entities)`` where ``index`` is the row position
        as a string and ``entities`` is ``LabeledText.entities``.
    """
    rows, _ = load_data(train_file)

    for idx, row in enumerate(tqdm(rows)):
        guid = row[1]
        text = clean_text(row[0])
        prefix_parts = (row[2], row[3], row[4])
        labeled = LabeledText(guid, text)

        for ent in row[5]:
            kind = "_".join(prefix_parts + (ent[2],))
            first, last = int(ent[0]), int(ent[1])
            labeled.add_entity(kind, first, last)

        # NOTE(review): the row's own guid is given to LabeledText, but the
        # yielded id is the running index - confirm this is intended.
        yield str(idx), text, None, labeled.entities
Ejemplo n.º 10
0
def train_data_generator(train_file):
    """Yield labeled samples built from event-argument annotations.

    Each row from ``load_data`` is indexed as ``(text, arguments)``, where
    ``arguments`` maps an argument string to a sequence indexed as
    ``(event_type, role, start_index)``. Every argument becomes an entity
    labelled ``"<event_type>_<role>"`` spanning ``len(argument)`` characters
    from ``start_index`` (inclusive end).

    Args:
        train_file: Path handed to ``load_data``.

    Yields:
        ``(guid, text, None, entities)`` where ``guid`` is the row index as a
        string and ``entities`` is ``LabeledText.entities``.
    """
    rows = load_data(train_file)

    for idx, row in enumerate(tqdm(rows)):
        guid = str(idx)
        text = clean_text(row[0])
        arguments = row[1]
        labeled = LabeledText(guid, text)

        for argument, info in arguments.items():
            event_type, role, first = info[0], info[1], int(info[2])
            kind = event_type + "_" + role
            last = first + len(argument) - 1
            labeled.add_entity(kind, first, last)

        yield guid, text, None, labeled.entities
Ejemplo n.º 11
0
def load_eval_examples(eval_text_file, eval_bio_file):
    """Load evaluation examples from a text file and its label file.

    Samples are read through ``train_data_generator`` and converted with
    ``load_ner_labeled_examples``, with augmentation and overlapping
    entities disabled. ``args`` and ``ner_labels`` are free variables that
    must be defined in the enclosing scope.

    Args:
        eval_text_file: Path of the evaluation texts.
        eval_bio_file: Path of the evaluation labels.

    Returns:
        The examples returned by ``load_ner_labeled_examples``.
    """
    lines = []
    samples = train_data_generator(eval_text_file, eval_bio_file)
    for guid, text, _, entities in samples:
        # Result is unused; the construction is kept in case the constructor
        # validates its input - TODO confirm whether it can be dropped.
        LabeledText(guid, text, entities)
        lines.append(dict(guid=guid, text=text, entities=entities))

    eval_examples = load_ner_labeled_examples(
        lines,
        ner_labels,
        seg_len=args.seg_len,
        seg_backoff=args.seg_backoff,
        num_augments=0,
        allow_overlap=False,
    )

    logger.info(f"Loaded {len(eval_examples)} eval examples")
    return eval_examples