Example #1
import json


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            for event in l['event_list']:
                arguments = {}
                for argument in event['arguments']:
                    text = argument['argument']
                    # Deduplicate overlapping argument spans, keeping the longer one.
                    skip = False
                    for seen in list(arguments):
                        if text in seen:  # contained in a kept span: drop the new one
                            skip = True
                            break
                        if seen in text:  # a kept span is contained in the new one: replace it
                            arguments.pop(seen)
                    if skip:
                        continue
                    # event type + argument role
                    arguments[text] = (event['event_type'], argument['role'])
                # Change 1: appending inside the event loop yields one sample
                # per event_type, which increases the amount of training data.
                D.append((event['event_type'] + "|" + l['text'], arguments))
    return D
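A quick smoke test for the loader above; the JSONL schema (text / event_list / event_type / arguments / role / argument) is inferred from the field accesses, and the sample record is made up:

import json
import tempfile

# Hypothetical DuEE-style record matching the fields the loader reads.
sample = {
    "text": "雷军发布了小米手机",
    "event_list": [{
        "event_type": "产品发布",
        "arguments": [
            {"role": "发布者", "argument": "雷军"},
            {"role": "产品", "argument": "小米手机"},
        ],
    }],
}
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False,
                                 encoding='utf-8') as tmp:
    tmp.write(json.dumps(sample, ensure_ascii=False) + '\n')
print(load_data(tmp.name))
# [('产品发布|雷军发布了小米手机',
#   {'雷军': ('产品发布', '发布者'), '小米手机': ('产品发布', '产品')})]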
def evaluate(data):
    """评估函数,计算f1、precision、recall
    """
    X, Y, Z = 1e-10, 1e-10, 1e-10
    f = open('dev_pred.json', 'w', encoding='utf-8')
    pbar = tqdm()
    for d in data:
        R = set([SPO(spo) for spo in extract_spoes(d['text'])])
        T = set([SPO(spo) for spo in d['spo_list']])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
        f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
        pbar.update()
        pbar.set_description('f1: %.5f, precision: %.5f, recall: %.5f' %
                             (f1, precision, recall))
        s = json.dumps(
            {
                'text': d['text'],
                'spo_list': list(T),
                'spo_list_pred': list(R),
                'new': list(R - T),
                'lack': list(T - R),
            },
            ensure_ascii=False,
            indent=4)
        f.write(s + '\n')
    pbar.close()
    f.close()
    return f1, precision, recall
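For reference, the running counters in evaluate implement micro-averaged metrics over all extracted triples; restated:

# X = true positives (predicted triples also in the gold set)
# Y = total predicted triples, Z = total gold triples
# precision = X / Y
# recall    = X / Z
# f1        = 2 * X / (Y + Z)   # identical to 2*P*R / (P + R)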
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            title, content = l.strip().split('\t')
            D.append((title, content))
    return D
Example #4
def load_vocab(dict_path, encoding='utf-8', simplified=False, startswith=None):
    """从bert的词典文件中读取词典
    """
    token_dict = {}
    with open(dict_path, encoding=encoding) as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)

    if simplified:  # filter redundant tokens
        new_token_dict, keep_tokens = {}, []
        startswith = startswith or []
        for t in startswith:
            new_token_dict[t] = len(new_token_dict)
            keep_tokens.append(token_dict[t])

        for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
            if t not in new_token_dict:
                keep = True
                if len(t) > 1:
                    for c in Tokenizer.stem(t):
                        if (Tokenizer._is_cjk_character(c)
                                or Tokenizer._is_punctuation(c)):
                            keep = False
                            break
                if keep:
                    new_token_dict[t] = len(new_token_dict)
                    keep_tokens.append(token_dict[t])
                else:
                    print(t, [t])  # log the token being dropped

        return new_token_dict, keep_tokens
    else:
        return token_dict
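A hedged usage sketch for the simplified mode: the vocabulary is compacted and keep_tokens records which rows of the original token-embedding matrix to retain; the dict path is hypothetical.

token_dict, keep_tokens = load_vocab(
    dict_path='chinese_bert/vocab.txt',  # hypothetical path
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
# keep_tokens is typically passed on to the model builder so that only the
# kept embedding rows are loaded from the checkpoint.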
Example #5
def load_data(filename):
    """读取训练数据,并做一些标准化,保证equation是可以eval的
    参考:https://kexue.fm/archives/7809
    """
    D = []
    for l in open(filename, encoding='utf-8'):
        l = json.loads(l)
        question, equation, answer = l['original_text'], l['equation'], l[
            'ans']
        # handle mixed numbers, e.g. 3(1/2) -> (3+1/2)
        question = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', question)
        equation = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', equation)
        answer = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', answer)
        equation = re.sub(r'(\d+)\(', r'\1+(', equation)
        answer = re.sub(r'(\d+)\(', r'\1+(', answer)
        # strip brackets around plain fractions
        question = re.sub(r'\((\d+/\d+)\)', r'\1', question)
        # handle percentages, e.g. 80% -> (80/100)
        equation = re.sub(r'([\.\d]+)%', r'(\1/100)', equation)
        answer = re.sub(r'([\.\d]+)%', r'(\1/100)', answer)
        # colon as division; handle any remaining percent signs
        equation = equation.replace(':', '/').replace('%', '/100')
        answer = answer.replace(':', '/').replace('%', '/100')
        if equation[:2] == 'x=':
            equation = equation[2:]
        try:
            if is_equal(eval(equation), eval(answer)):
                D.append((question, remove_bucket(equation), answer))
        except Exception:
            continue
    return D
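A minimal illustration of the normalization, on a made-up equation string:

import re

equation = '3(1/2)+80%'
equation = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', equation)  # mixed number
equation = re.sub(r'([\.\d]+)%', r'(\1/100)', equation)         # percentage
print(equation)        # (3+1/2)+(80/100)
print(eval(equation))  # 4.3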
Example #6
def load_data(filename):
    print(filename)
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            label, text = l.strip().split('\t')
            D.append((text, int(label)))
    return D
Example #7
    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator)
        if val_acc > self.best_val_acc:  # save the best model and record the best accuracy
            self.best_val_acc = val_acc
            model.save_weights(
                os.path.join(output_model_path,
                             'best_model_bert_ptuning.weights'))
        test_acc = evaluate(test_generator)
        with open(os.path.join(output_model_path, "eval_accuracy.txt"),
                  "a") as val_res:
            val_res.write(json.dumps({"eval_accuracy": val_acc}) + "\n")
        with open(os.path.join(output_model_path, "test_accuracy.txt"),
                  "a") as test_res:
            test_res.write(json.dumps({"test_accuracy": test_acc}) + "\n")
        print(  # print the accuracies
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc))
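A hedged sketch of how this callback is typically wired into training; Evaluator, model, train_generator, and epochs are assumed names from the surrounding script:

class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_acc = 0.
    # ... on_epoch_end as defined above ...

evaluator = Evaluator()
model.fit(
    train_generator.forfit(),  # bert4keras-style data generator
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    callbacks=[evaluator],
)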
def standard_label(df=None):
    key_list = df["new_label"].tolist()
    value_list = df["label"].tolist()
    with open("label.json", "w", encoding="utf-8") as f:
        f.write(
            json.dumps(dict(zip(key_list, value_list)),
                       ensure_ascii=False,
                       indent=2))
Example #9
def load_data_test(filename):
    D = []
    with open(filename) as f:
        for l in f:
            l = json.loads(l)

            D.append(l)
    return D
def load_data(filename):
    D = []
    with open(filename, 'r') as f:
        data = json.load(f)

    for item in data:
        D.append((item['rewrite_q'], item['new_q'], item['img_path']))
    return D
Example #11
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            D.append(l['text'])
    return D
Example #12
def load_data(filename):
    D = []
    with open(filename) as f:
        for i, l in enumerate(f):
            l = json.loads(l)
            text, label = l['sentence'], l['label']
            D.append((text[:128], int(label)))
    return D
Example #13
def corpus():
    """循环读取语料
    """
    while True:
        with open('LCCD-large-shuf.json') as f:
            for l in f:
                l = json.loads(l)
                yield l
Example #14
def corpus(data_path):
    """Read the corpus in an endless loop.
    """
    while True:
        ls = json.load(open(data_path, encoding='utf-8'))
        print(len(ls))
        for l in ls:
            yield l
Example #15
def predict_to_file(in_file, out_file):
    """预测结果到文件,便于用官方脚本评测
    使用示例:
    predict_to_file('/root/icwb2-data/testing/pku_test.utf8', 'myresult.txt')
    官方评测代码示例:
    data_dir="/root/icwb2-data"
    $data_dir/scripts/score $data_dir/gold/pku_training_words.utf8 $data_dir/gold/pku_test_gold.utf8 myresult.txt > myscore.txt
    (执行完毕后查看myscore.txt的内容末尾)
    """
    fw = open(out_file, 'w', encoding='utf-8')
    with open(in_file, encoding='utf-8') as fr:
        for l in tqdm(fr):
            l = l.strip()
            if l:
                l = ' '.join(word_segment(l))
            fw.write(l + '\n')
    fw.close()
Example #16
def load_sts_12_16_data(filename):
    """加载STS-12,13,14,15,16数据(带标签)
    单条格式:(文本1, 文本2, 标签)
    """
    D = []
    input_file = filename
    label_file = input_file.replace('STS.input', 'STS.gs')
    input_file = open(input_file, encoding='utf-8')
    label_file = open(label_file, encoding='utf-8')
    for i, l in zip(input_file, label_file):
        if l.strip():
            i = i.strip().split('\t')
            l = float(l.strip())
            D.append((i[0], i[1], l))
    input_file.close()
    label_file.close()
    return D
def predict_to_file(in_file, out_file):
    """预测到文件
    可以提交到 https://tianchi.aliyun.com/dataset/dataDetail?dataId=95414
    """
    data = json.load(open(in_file))
    for d in tqdm(data, ncols=100):
        d['entities'] = []
        entities = NER.recognize(d['text'])
        for e in entities:
            d['entities'].append({
                'start_idx': e[0],
                'end_idx': e[1],
                'type': e[2]
            })
    json.dump(data,
              open(out_file, 'w', encoding='utf-8'),
              indent=4,
              ensure_ascii=False)
Example #18
def load_data(filename):
    """加载数据
    返回:[(texts, labels, summary)]
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            D.append(json.loads(l))
    return D
Example #19
def load_snli_data(filename):
    """加载SNLI数据(带标签)
    单条格式:(文本1, 文本2, 标签)
    """
    D = []
    filename = filename.split('/')
    s1_file = '/'.join(filename[:-1]) + '/s1.' + filename[-1]
    s2_file = '/'.join(filename[:-1]) + '/s2.' + filename[-1]
    l_file = '/'.join(filename[:-1]) + '/labels.' + filename[-1]
    s1_file = open(s1_file, encoding='utf-8')
    s2_file = open(s2_file, encoding='utf-8')
    l_file = open(l_file, encoding='utf-8')
    for s1, s2, l in zip(s1_file, s2_file, l_file):
        D.append((s1.strip(), s2.strip(), l.strip()))
    s1_file.close()
    s2_file.close()
    l_file.close()
    return D
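The loader derives three parallel files from filename; with filename='SNLI/train' it expects an InferSent-style layout (an assumption based on the path construction above):

# SNLI/s1.train      one premise per line
# SNLI/s2.train      one hypothesis per line
# SNLI/labels.train  one label per line (entailment / neutral / contradiction)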
Example #20
def predict_to_file(data, filename, topk=1):
    """将预测结果输出到文件,方便评估
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)):
            q, a = qag.generate(d[0])
            s = '%s\t%s\t%s\n' % (q, a, d[0])
            f.write(s)
            f.flush()
Example #21
def load_data(filename):
    """加载数据
    单条格式:[词1, 词2, 词3, ...]
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            D.append(re.split(' +', l.strip()))
    return D
Example #22
def load_data(filename):
    D = []
    for d in json.load(open(filename))['data'][0]['paragraphs']:
        for qa in d['qas']:
            D.append([
                qa['id'], d['context'], qa['question'],
                [a['text'] for a in qa.get('answers', [])]
            ])
    return D
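The loader assumes a SQuAD-style JSON layout; a minimal made-up record it would accept:

sample = {
    "data": [{
        "paragraphs": [{
            "context": "Albert Einstein was born in Ulm.",
            "qas": [{
                "id": "q1",
                "question": "Where was Einstein born?",
                "answers": [{"text": "Ulm"}],
            }],
        }],
    }],
}
# load_data on a file holding this JSON returns:
# [['q1', 'Albert Einstein was born in Ulm.', 'Where was Einstein born?', ['Ulm']]]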
Example #23
def load_user_dict(filename):
    """加载用户词典
    """
    user_dict = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            w = l.split()[0]
            user_dict.append(w)
    return user_dict
Example #24
def load_data(filename):
    """加载数据
    返回:[{...}]
    """
    D = []
    with open(filename) as f:
        for l in f:
            D.append(json.loads(l))
    return D
Example #25
def predict(in_file, out_file, topk=1):
    """输出预测结果到文件
    该函数主要为比赛 https://www.datafountain.cn/competitions/467 所写,
    主要是读取该比赛的测试集,然后预测equation,并且根据不同的问题输出不同格式的答案,
    out_file可以直接提交到线上评测,线上准确率可以达到38%+。
    """
    fw = open(out_file, 'w', encoding='utf-8')
    raw_data = pd.read_csv(in_file, header=None, encoding='utf-8')
    for i, question in tqdm(raw_data.values):
        question = re.sub(r'(\d+)_(\d+/\d+)', r'(\1+\2)', question)
        pred_equation = autosolve.generate(question, topk)
        if '.' not in pred_equation:
            pred_equation = re.sub(r'([\d]+)', r'Integer(\1)', pred_equation)
        try:
            pred_answer = eval(pred_equation)
        except Exception:
            pred_answer = np.random.choice(21) + 1
        if '.' in pred_equation:
            if u'百分之几' in question:
                pred_answer = pred_answer * 100
            pred_answer = round(pred_answer, 2)
            if int(pred_answer) == pred_answer:
                pred_answer = int(pred_answer)
            if (
                re.findall(u'多少[辆人个只箱包本束头盒张]', question) or
                re.findall(u'几[辆人个只箱包本束头盒张]', question)
            ):
                if re.findall(u'至少|最少', question):
                    pred_answer = np.ceil(pred_answer)
                elif re.findall(u'至多|最多', question):
                    pred_answer = np.floor(pred_answer)
                else:
                    pred_answer = np.ceil(pred_answer)
                pred_answer = int(pred_answer)
            pred_answer = str(pred_answer)
            if u'百分之几' in question:
                pred_answer = pred_answer + '%'
        else:
            pred_answer = str(pred_answer)
            if '/' in pred_answer:
                if re.findall(r'\d+/\d+', question):
                    a, b = pred_answer.split('/')
                    a, b = int(a), int(b)
                    if a > b:
                        pred_answer = '%s_%s/%s' % (a // b, a % b, b)
                else:
                    if re.findall(u'至少|最少', question):
                        pred_answer = np.ceil(eval(pred_answer))
                    elif re.findall(u'至多|最多', question):
                        pred_answer = np.floor(eval(pred_answer))
                    else:
                        pred_answer = np.ceil(eval(pred_answer))
                    pred_answer = str(int(pred_answer))
        fw.write(str(i) + ',' + pred_answer + '\n')
        fw.flush()
    fw.close()
Example #26
def _batch_texts():
    texts = []
    for txt in txts:  # txts: list of text-file paths from the enclosing scope
        with open(txt, encoding='utf-8') as f:
            texts.append(f.read())
        if len(texts) == 100:  # yield batches of 100 documents
            yield texts
            texts = []
    if texts:  # flush the final partial batch
        yield texts
Example #27
def load_vocab(dict_path, encoding='utf-8'):
    """从bert的词典文件中读取词典
    """
    token_dict = {}
    with open(dict_path, encoding=encoding) as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)

    return token_dict
Example #28
def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            arguments = []
            for event in l['event_list']:
                arguments.append(label2id[event['event_type']])
            D.append((l['text'], arguments))
    return D
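A hedged sketch of the label2id mapping assumed above, built from the event-schema file that usually accompanies such event datasets; the path and per-line format are assumptions:

import json

label2id = {}
with open('event_schema.json', encoding='utf-8') as f:  # hypothetical path
    for l in f:
        event_type = json.loads(l)['event_type']
        if event_type not in label2id:
            label2id[event_type] = len(label2id)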
Example #29
def predict_to_file(in_file, out_file):
    """预测到文件
    可以提交到 https://www.cluebenchmarks.com/ner.html
    """
    fw = open(out_file, 'w', encoding='utf-8')
    with open(in_file) as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            l['label'] = {}
            for start, end, label in NER.recognize(l['text']):
                if label not in l['label']:
                    l['label'][label] = {}
                entity = l['text'][start:end + 1]
                if entity not in l['label'][label]:
                    l['label'][label][entity] = []
                l['label'][label][entity].append([start, end])
            l = json.dumps(l, ensure_ascii=False)
            fw.write(l + '\n')
    fw.close()
def load_data(filename):
    """加载数据
    单条格式:(文本1, 文本2, 标签id)
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text1, text2, label = l.strip().split('\t')
            D.append((text1, text2, int(label)))
    return D