Exemple #1
0
def train_data_generator(train_file):

    lines = load_json_file(train_file)

    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)

        # -------------------- 训练数据json格式 --------------------
        #  {
        #      "text": "万通地产设计总监刘克峰;",
        #      "label": {
        #          "name": {
        #              "刘克峰": [[8, 10]]
        #          },
        #          "company": {
        #              "万通地产": [[0, 3]]
        #          },
        #          "position": {
        #              "设计总监": [[4, 7]]
        #          }
        #      }
        #  }

        entities = []
        classes = x['label'].keys()
        for c in classes:
            c_labels = x['label'][c]
            #  logger.debug(f"c_labels:{c_labels}")
            for label, span in c_labels.items():
                x0, x1 = span[0]
                sl.add_entity(c, x0, x1)

        yield str(i), text, None, sl.entities
def test_data_generator(test_file):

    lines = load_json_file(test_file)
    for i, s in enumerate(tqdm(lines)):
        guid = str(i)
        text_a = clean_text(s['originalText'])

        yield guid, text_a, None, None
Exemple #3
0
def test_data_generator(test_file):
    test_data = load_json_file(test_file)
    total_examples = len(test_data)
    for i, json_data in enumerate(tqdm(test_data, desc="test")):
        guid = str(json_data['id'])
        text = json_data['sentence']
        text = clean_text(text)

        yield guid, text, None, None
Exemple #4
0
def eval_data_generator(eval_file):
    eval_data = load_json_file(eval_file)
    for i, json_data in enumerate(tqdm(eval_data, desc="eval")):
        guid = str(i)
        text = json_data['sentence']
        text = clean_text(text)
        label = json_data['label_desc']

        yield guid, text, None, label
def train_data_generator(train_file):

    lines = load_json_file(train_file)

    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['originalText'])
        sl = LabeledText(guid, text)

        entities = x['entities']
        for entity in entities:
            start_pos = entity['start_pos']
            end_pos = entity['end_pos'] - 1
            category = entity['label_type']
            sl.add_entity(category, start_pos, end_pos)

        yield str(i), text, None, sl.entities
Exemple #6
0
def train_data_generator(train_file):

    lines = load_json_file(train_file)

    for i, x in enumerate(tqdm(lines)):
        guid = str(i)
        text = clean_text(x['text'])
        sl = LabeledText(guid, text)
        entities = []
        classes = x['label'].keys()
        for c in classes:
            c_labels = x['label'][c]
            #  logger.debug(f"c_labels:{c_labels}")
            for label, span in c_labels.items():
                x0, x1 = span[0]
                sl.add_entity(c, x0, x1)
        print("index: ", str(i), ", text: ", text, ", entities: ", sl.entities)
        break
Exemple #7
0
import numpy as np

from theta.modeling import load_glue_examples
from theta.modeling.glue import GlueTrainer, load_model, get_args
from theta.utils import load_json_file

# -------------------- Data --------------------

# ## 1. 数据观察
train_file = './data/rawdata/train.json'
test_file = './data/rawdata/test.json'
eval_file = './data/rawdata/dev.json'
labels_file = './data/rawdata/labels.json'

# ### 1.1 样本数量分布
train_data = load_json_file(train_file)
test_data = load_json_file(test_file)
eval_data = load_json_file(eval_file)

all_data = train_data + eval_data
descs = [x['label_desc'] for x in all_data]
from collections import Counter
logger.debug(f"{Counter(descs)}")

# ### 1.2 样本长度分布
lengths = [len(x['sentence']) for x in all_data]
logger.info(f"***** Text Lengths *****")
logger.info(f"mean: {np.mean(lengths):.2f}")
logger.info(f"std: {np.mean(lengths):.2f}")
logger.info(f"max: {np.max(lengths)}")
logger.info(f"min: {np.min(lengths)}")