def load_data(filename):
    """Load event-extraction samples from a JSON-lines file.

    Every line must be a JSON object with a ``text`` field and an
    ``event_list``; each event has an ``event_type`` and a list of
    ``arguments`` (each with an ``argument`` text and a ``role``).  One
    sample is emitted per event rather than per line, which increases the
    number of samples.

    Overlapping arguments inside one event are de-duplicated: when the text
    of a new argument occurs inside an argument kept earlier, the earlier
    one is dropped and the new one is kept.

    NOTE(review): the previous overlap check compared an int with a str and
    popped its bookkeeping dict by value instead of by index, so it raised
    as soon as two arguments overlapped; it also used the argument text as
    an unescaped regular expression.  The rule above is the literal reading
    of that code with plain substring matching -- confirm it is the
    intended policy.

    Args:
        filename: path of a UTF-8 encoded JSON-lines file.

    Returns:
        A list of ``(event_type + "|" + text, arguments)`` tuples where
        ``arguments`` maps argument text to ``(event_type, role)``.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            sample = json.loads(line)
            for event in sample['event_list']:
                arguments = {}
                for argument in event['arguments']:
                    text = argument['argument']
                    # Drop every previously kept argument that contains the
                    # new text (this also covers exact duplicates).
                    for kept in [k for k in arguments if text in k]:
                        del arguments[kept]
                    # event type + argument role
                    arguments[text] = (event['event_type'], argument['role'])
                # Appending inside the event loop yields one sample per event.
                D.append((event['event_type'] + "|" + sample['text'], arguments))
    return D
def evaluate(data):
    """Compute f1, precision and recall and dump per-sample predictions.

    For every sample the triples returned by ``extract_spoes`` for the text
    are compared with the sample's ``spo_list``; both are wrapped in ``SPO``
    and collected into sets.  The running scores are shown on a progress bar
    and one JSON record per sample (text, gold, predicted, extra and missing
    triples) is written to ``dev_pred.json``.

    Args:
        data: iterable of dicts with ``text`` and ``spo_list`` keys.

    Returns:
        ``(f1, precision, recall)`` accumulated over all samples, or
        ``(0.0, 0.0, 0.0)`` when ``data`` is empty (previously this case
        raised because the scores were never assigned).
    """
    # Small epsilons keep the divisions below well defined.
    X, Y, Z = 1e-10, 1e-10, 1e-10
    f1, precision, recall = 0.0, 0.0, 0.0
    with open('dev_pred.json', 'w', encoding='utf-8') as f:
        pbar = tqdm()
        try:
            for d in data:
                R = {SPO(spo) for spo in extract_spoes(d['text'])}
                T = {SPO(spo) for spo in d['spo_list']}
                X += len(R & T)
                Y += len(R)
                Z += len(T)
                f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
                pbar.update()
                pbar.set_description(
                    'f1: %.5f, precision: %.5f, recall: %.5f' %
                    (f1, precision, recall)
                )
                s = json.dumps(
                    {
                        'text': d['text'],
                        'spo_list': list(T),
                        'spo_list_pred': list(R),
                        'new': list(R - T),
                        'lack': list(T - R),
                    },
                    ensure_ascii=False,
                    indent=4
                )
                f.write(s + '\n')
        finally:
            # Close the progress bar even if prediction or dumping fails.
            pbar.close()
    return f1, precision, recall
def load_data(filename):
    """Read a UTF-8 TSV file of ``title<TAB>content`` lines.

    Args:
        filename: path of the file to read.

    Returns:
        A list of ``(title, content)`` tuples, one per line.

    Raises:
        ValueError: if a stripped line does not split into exactly two
            tab-separated fields.
    """
    pairs = []
    with open(filename, encoding='utf-8') as reader:
        for raw in reader:
            fields = raw.strip().split('\t')
            # Unpacking enforces exactly two fields per line.
            title, content = fields
            pairs.append((title, content))
    return pairs
def load_vocab(dict_path, encoding='utf-8', simplified=False, startswith=None):
    """Read the vocabulary from a BERT vocab file.

    Args:
        dict_path: path of the vocab file, one token per line.
        encoding: text encoding of the vocab file.
        simplified: when true, drop redundant tokens and also return the
            original ids of the tokens that were kept.
        startswith: tokens placed first, in this order, in the simplified
            vocabulary (only used when ``simplified`` is true).

    Returns:
        ``token_dict`` mapping token to id when ``simplified`` is false,
        otherwise ``(new_token_dict, keep_tokens)`` where ``keep_tokens``
        lists the original ids in the order of the new vocabulary.
    """
    token_dict = {}
    with open(dict_path, encoding=encoding) as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)

    if not simplified:
        return token_dict

    # Filter out the redundant part of the tokens.
    new_token_dict, keep_tokens = {}, []
    startswith = startswith or []
    for t in startswith:
        new_token_dict[t] = len(new_token_dict)
        keep_tokens.append(token_dict[t])

    for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
        if t in new_token_dict:
            continue
        keep = True
        if len(t) > 1:
            # A multi-character token is dropped as soon as its stem
            # contains a CJK character or a punctuation mark.
            for c in Tokenizer.stem(t):
                if (
                    Tokenizer._is_cjk_character(c) or
                    Tokenizer._is_punctuation(c)
                ):
                    keep = False
                    break
        if keep:
            new_token_dict[t] = len(new_token_dict)
            keep_tokens.append(token_dict[t])
        else:
            # Was a Python 2 print statement (a syntax error on Python 3).
            print(t, [t])

    return new_token_dict, keep_tokens
def load_data(filename):
    """Read training data and normalise it so the equation can be eval'ed.

    Reference: https://kexue.fm/archives/7809

    Args:
        filename: path of a UTF-8 JSON-lines file whose records have
            ``original_text``, ``equation`` and ``ans`` fields.

    Returns:
        A list of ``(question, equation, answer)`` tuples for the records
        whose evaluated equation equals the evaluated answer according to
        ``is_equal``; the equation is passed through ``remove_bucket``.
        Records that fail to evaluate are skipped.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            question = l['original_text']
            equation = l['equation']
            answer = l['ans']
            # Handle mixed numbers, e.g. 3(1/2) -> (3+1/2)
            question = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', question)
            equation = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', equation)
            answer = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', answer)
            equation = re.sub(r'(\d+)\(', r'\1+(', equation)
            answer = re.sub(r'(\d+)\(', r'\1+(', answer)
            # Remove the parentheses around plain fractions
            question = re.sub(r'\((\d+/\d+)\)', r'\1', question)
            # Handle percentages
            equation = re.sub(r'([\.\d]+)%', r'(\1/100)', equation)
            answer = re.sub(r'([\.\d]+)%', r'(\1/100)', answer)
            # Colon becomes division; handle any remaining percent signs
            equation = equation.replace(':', '/').replace('%', '/100')
            answer = answer.replace(':', '/').replace('%', '/100')
            if equation[:2] == 'x=':
                equation = equation[2:]
            try:
                # SECURITY: eval() executes text taken from the data file;
                # only use this loader on trusted datasets.
                if is_equal(eval(equation), eval(answer)):
                    D.append((question, remove_bucket(equation), answer))
            except Exception:
                # Best effort: skip records that cannot be evaluated, but
                # let KeyboardInterrupt / SystemExit propagate.
                continue
    return D
def load_data(filename):
    """Read a UTF-8 TSV file of ``label<TAB>text`` lines.

    The file name is printed before reading.

    Args:
        filename: path of the file to read.

    Returns:
        A list of ``(text, label)`` tuples with ``label`` converted to int.

    Raises:
        ValueError: if a stripped line does not have exactly two
            tab-separated fields or the label is not an integer.
    """
    print(filename)
    samples = []
    with open(filename, encoding='utf-8') as reader:
        for raw in reader:
            fields = raw.strip().split('\t')
            label, text = fields
            samples.append((text, int(label)))
    return samples
def on_epoch_end(self, epoch, logs=None):
    """Evaluate after an epoch, keep the best weights and log accuracies.

    The validation accuracy is computed first; when it beats
    ``self.best_val_acc`` the new best is stored and the model weights are
    saved under ``output_model_path``.  The test accuracy is then computed,
    both accuracies are appended as JSON lines to their log files, and a
    summary line is printed.

    Args:
        epoch: epoch index (unused here).
        logs: training logs (unused here).
    """
    accuracy_on_valid = evaluate(valid_generator)
    improved = accuracy_on_valid > self.best_val_acc
    if improved:
        # Save the best model and remember the best accuracy.
        self.best_val_acc = accuracy_on_valid
        weights_path = os.path.join(
            output_model_path, 'best_model_bert_ptuning.weights'
        )
        model.save_weights(weights_path)
    accuracy_on_test = evaluate(test_generator)

    # Append one JSON line per metric to its own log file.
    records = (
        ("eval_accuracy.txt", {"eval_accuracy": accuracy_on_valid}),
        ("test_accuracy.txt", {"test_accuracy": accuracy_on_test}),
    )
    for log_name, payload in records:
        with open(os.path.join(output_model_path, log_name), "a") as log_file:
            log_file.write(json.dumps(payload) + "\n")

    # Print the accuracies.
    summary = u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' % (
        accuracy_on_valid, self.best_val_acc, accuracy_on_test
    )
    print(summary)
def standard_label(df=None):
    """Write the ``new_label`` -> ``label`` mapping of *df* to ``label.json``.

    The two columns are paired positionally; the mapping is written to
    ``label.json`` in the current working directory as UTF-8 JSON with
    two-space indentation and non-ASCII characters kept as-is.

    Args:
        df: table-like object whose ``"new_label"`` and ``"label"`` columns
            provide ``tolist()``.
    """
    keys = list(df["new_label"].tolist())
    values = df["label"].tolist()
    with open("label.json", "w", encoding="utf-8") as out:
        mapping = dict(zip(keys, values))
        out.write(json.dumps(mapping, ensure_ascii=False, indent=2))
def load_data_test(filename):
    """Load a JSON-lines file into a list of decoded objects.

    The file is read as UTF-8 regardless of the platform locale, and blank
    lines (for example a trailing empty line) are skipped instead of
    raising a JSON decoding error.

    Args:
        filename: path of the JSON-lines file.

    Returns:
        A list with one decoded JSON value per non-blank line.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                D.append(json.loads(line))
    return D
def load_data(filename):
    """Load ``(rewrite_q, new_q, img_path)`` triples from a JSON file.

    Args:
        filename: path of a JSON file holding a list of objects with
            ``rewrite_q``, ``new_q`` and ``img_path`` keys.

    Returns:
        A list of ``(rewrite_q, new_q, img_path)`` tuples in file order.
    """
    with open(filename, 'r') as handle:
        records = json.load(handle)
    return [
        (record['rewrite_q'], record['new_q'], record['img_path'])
        for record in records
    ]
def load_data(filename):
    """Collect the ``text`` field of every record in a JSON-lines file.

    Args:
        filename: path of a UTF-8 JSON-lines file.

    Returns:
        A list with the ``text`` value of each line, in file order.
    """
    with open(filename, encoding='utf-8') as reader:
        return [json.loads(row)['text'] for row in reader]
def load_data(filename, maxlen=128):
    """Load ``(text, label)`` samples from a JSON-lines file.

    The file is read as UTF-8 regardless of the platform locale.

    Args:
        filename: path of a JSON-lines file whose records have
            ``sentence`` and ``label`` fields.
        maxlen: number of leading characters of ``sentence`` to keep
            (defaults to the previously hard-coded 128).

    Returns:
        A list of ``(truncated_sentence, int(label))`` tuples.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            sample = json.loads(line)
            text, label = sample['sentence'], sample['label']
            D.append((text[:maxlen], int(label)))
    return D
def corpus(filename='LCCD-large-shuf.json'):
    """Read the corpus in an endless loop.

    The JSON-lines file is re-opened and replayed from the start each time
    its end is reached, so the generator never finishes on its own.

    Args:
        filename: path of the UTF-8 JSON-lines corpus; defaults to the
            previously hard-coded ``LCCD-large-shuf.json``.

    Yields:
        The decoded JSON value of each line.

    Raises:
        ValueError: if a pass over the file yields nothing (previously an
            empty file made the generator spin forever).
    """
    while True:
        seen_any = False
        with open(filename, encoding='utf-8') as f:
            for line in f:
                seen_any = True
                yield json.loads(line)
        if not seen_any:
            raise ValueError('corpus file %r is empty' % (filename,))
def corpus(data_path):
    """Read the corpus in an endless loop.

    The JSON file is re-loaded on every pass, its number of items is
    printed, and the items are yielded in order.

    Args:
        data_path: path of a UTF-8 JSON file containing a list.  The
            argument is now honoured; previously it was ignored in favour
            of ``config.data_path``.

    Yields:
        The items of the loaded list, repeated forever.

    Raises:
        ValueError: if the loaded list is empty (previously the generator
            would spin forever without yielding).
    """
    while True:
        with open(data_path, encoding='utf-8') as f:
            ls = json.load(f)
        print(len(ls))
        if not ls:
            raise ValueError('corpus file %r contains no items' % (data_path,))
        for l in ls:
            yield l
def predict_to_file(in_file, out_file):
    """Write segmentation results to a file for the official scorer.

    Usage example:
        predict_to_file('/root/icwb2-data/testing/pku_test.utf8', 'myresult.txt')

    Official scoring example:
        data_dir="/root/icwb2-data"
        $data_dir/scripts/score $data_dir/gold/pku_training_words.utf8 $data_dir/gold/pku_test_gold.utf8 myresult.txt > myscore.txt
        (when it finishes, look at the end of myscore.txt)

    Args:
        in_file: UTF-8 text file with one sentence per line.
        out_file: destination file; each non-empty input line is replaced
            by the space-joined output of ``word_segment``; empty lines are
            written through unchanged.
    """
    # Both files are managed by ``with`` so the output is closed (and
    # flushed) even when segmentation raises part-way through.
    with open(out_file, 'w', encoding='utf-8') as fw, \
            open(in_file, encoding='utf-8') as fr:
        for l in tqdm(fr):
            l = l.strip()
            if l:
                l = ' '.join(word_segment(l))
            fw.write(l + '\n')
def load_sts_12_16_data(filename):
    """Load STS-12/13/14/15/16 data (with labels).

    The label file name is derived from ``filename`` by replacing
    ``STS.input`` with ``STS.gs``.  Input and label files are read in
    lockstep; pairs whose label line is blank are skipped.

    Args:
        filename: path of the ``STS.input...`` file with tab-separated
            sentence pairs.

    Returns:
        A list of ``(text1, text2, label)`` tuples, ``label`` being a float.
    """
    D = []
    label_filename = filename.replace('STS.input', 'STS.gs')
    # ``with`` guarantees both files are closed even if a line is malformed.
    with open(filename, encoding='utf-8') as input_file, \
            open(label_filename, encoding='utf-8') as label_file:
        for text_line, label_line in zip(input_file, label_file):
            label_line = label_line.strip()
            if label_line:
                texts = text_line.strip().split('\t')
                D.append((texts[0], texts[1], float(label_line)))
    return D
def predict_to_file(in_file, out_file):
    """Predict entities for a JSON dataset and write the result to a file.

    The output can be submitted to
    https://tianchi.aliyun.com/dataset/dataDetail?dataId=95414

    Args:
        in_file: UTF-8 JSON file holding a list of dicts with a ``text`` key.
        out_file: destination JSON file; every record gains an ``entities``
            list of ``{'start_idx', 'end_idx', 'type'}`` dicts built from
            the triples returned by ``NER.recognize``.
    """
    # Read with an explicit encoding and close the handle deterministically.
    with open(in_file, encoding='utf-8') as fr:
        data = json.load(fr)
    for d in tqdm(data, ncols=100):
        d['entities'] = []
        for e in NER.recognize(d['text']):
            d['entities'].append({
                'start_idx': e[0],
                'end_idx': e[1],
                'type': e[2]
            })
    # Closing the output handle guarantees the dump is flushed to disk.
    with open(out_file, 'w', encoding='utf-8') as fw:
        json.dump(data, fw, indent=4, ensure_ascii=False)
def load_data(filename):
    """Load data from a UTF-8 JSON-lines file.

    Returns one decoded JSON value per line; presumably each record has the
    shape ``(texts, labels, summary)`` -- verify against the data files.

    Args:
        filename: path of the JSON-lines file.

    Returns:
        A list of the decoded records in file order.
    """
    with open(filename, encoding='utf-8') as reader:
        return [json.loads(row) for row in reader]
def load_snli_data(filename):
    """Load SNLI data (with labels).

    For a path ``<dir>/<name>`` the three files ``<dir>/s1.<name>``,
    ``<dir>/s2.<name>`` and ``<dir>/labels.<name>`` are read in lockstep.
    A bare ``<name>`` without a directory now resolves to files in the
    current directory (previously it pointed at the filesystem root).

    Args:
        filename: path whose last component is the split name.

    Returns:
        A list of ``(text1, text2, label)`` tuples of stripped strings.
    """
    import os

    directory, name = os.path.split(filename)
    s1_path = os.path.join(directory, 's1.' + name)
    s2_path = os.path.join(directory, 's2.' + name)
    l_path = os.path.join(directory, 'labels.' + name)

    D = []
    # ``with`` closes all three files even if reading fails part-way.
    with open(s1_path, encoding='utf-8') as s1_file, \
            open(s2_path, encoding='utf-8') as s2_file, \
            open(l_path, encoding='utf-8') as l_file:
        for s1, s2, l in zip(s1_file, s2_file, l_file):
            D.append((s1.strip(), s2.strip(), l.strip()))
    return D
def predict_to_file(data, filename, topk=1):
    """Write predictions to a file so they can be evaluated easily.

    For each sample, ``qag.generate`` is called on its first element and
    one ``question<TAB>answer<TAB>passage`` line is written and flushed.

    Args:
        data: sized iterable of samples; only element ``0`` of each sample
            is used.
        filename: path of the UTF-8 output file.
        topk: accepted for interface compatibility; not used by this
            function.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        progress_title = u'正在预测(共%s条样本)' % len(data)
        for sample in tqdm(iter(data), desc=progress_title):
            passage = sample[0]
            question, answer = qag.generate(passage)
            out.write('%s\t%s\t%s\n' % (question, answer, passage))
            # Flush per sample so partial results are visible on disk.
            out.flush()
def load_data(filename):
    """Load pre-segmented text, one sentence per line.

    Each stripped line is split on runs of spaces.
    Single item format: [word1, word2, word3, ...]

    Args:
        filename: path of the UTF-8 text file.

    Returns:
        A list of word lists, one per line (a blank line gives ``['']``).
    """
    with open(filename, encoding='utf-8') as reader:
        return [re.split(' +', row.strip()) for row in reader]
def load_data(filename):
    """Load question-answering samples from a JSON file.

    Only the paragraphs of the first entry of the top-level ``data`` list
    are read.  The file is read as UTF-8 and the handle is closed as soon
    as the JSON is parsed.

    Args:
        filename: path of the JSON file.

    Returns:
        A list of ``[id, context, question, answers]`` lists, where
        ``answers`` holds the ``text`` of every answer (empty when the
        question has no ``answers`` key).
    """
    with open(filename, encoding='utf-8') as f:
        paragraphs = json.load(f)['data'][0]['paragraphs']
    D = []
    for d in paragraphs:
        for qa in d['qas']:
            D.append([
                qa['id'], d['context'], qa['question'],
                [a['text'] for a in qa.get('answers', [])]
            ])
    return D
def load_user_dict(filename):
    """Load a user dictionary.

    The first whitespace-separated field of every line is taken as the
    word.  Blank or whitespace-only lines are skipped (previously they
    raised ``IndexError``).

    Args:
        filename: path of the UTF-8 dictionary file.

    Returns:
        A list of words in file order.
    """
    user_dict = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            fields = l.split()
            if fields:
                user_dict.append(fields[0])
    return user_dict
def load_data(filename):
    """Load data from a JSON-lines file.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's default locale encoding.

    Args:
        filename: path of the JSON-lines file.

    Returns:
        A list of the decoded records: [{...}]
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            D.append(json.loads(l))
    return D
def predict(in_file, out_file, topk=1):
    """Write prediction results to a file.

    This function was mainly written for the competition
    https://www.datafountain.cn/competitions/467 : it reads that
    competition's test set, predicts the equation, and formats the answer
    differently depending on the question.  ``out_file`` can be submitted
    directly to the online evaluation, where accuracy can reach 38%+.

    Args:
        in_file: header-less UTF-8 CSV with ``(id, question)`` rows.
        out_file: destination file; one ``id,answer`` line per question.
        topk: passed through to ``autosolve.generate``.
    """
    # ``with`` closes the output even when a prediction raises.
    with open(out_file, 'w', encoding='utf-8') as fw:
        raw_data = pd.read_csv(in_file, header=None, encoding='utf-8')
        for i, question in tqdm(raw_data.values):
            # Rewrite mixed numbers such as 3_1/2 as (3+1/2).
            question = re.sub(r'(\d+)_(\d+/\d+)', r'(\1+\2)', question)
            pred_equation = autosolve.generate(question, topk)
            if '.' not in pred_equation:
                # Pure-integer equations: wrap each number in Integer(...)
                # (presumably sympy's, to keep exact fractions -- confirm).
                pred_equation = re.sub(
                    r'([\d]+)', r'Integer(\1)', pred_equation
                )
            try:
                # NOTE(security): eval() runs the model-generated equation.
                pred_answer = eval(pred_equation)
            except Exception:
                # Fall back to a random guess in 1..21 when the equation
                # cannot be evaluated; interrupts are no longer swallowed.
                pred_answer = np.random.choice(21) + 1
            if '.' in pred_equation:
                if u'百分之几' in question:
                    pred_answer = pred_answer * 100
                pred_answer = round(pred_answer, 2)
                if int(pred_answer) == pred_answer:
                    pred_answer = int(pred_answer)
                if (
                    re.findall(u'多少[辆|人|个|只|箱|包本|束|头|盒|张]', question) or
                    re.findall(u'几[辆|人|个|只|箱|包|本|束|头|盒|张]', question)
                ):
                    # Counting questions need a whole-number answer.
                    if re.findall(u'至少|最少', question):
                        pred_answer = np.ceil(pred_answer)
                    elif re.findall(u'至多|最多', question):
                        pred_answer = np.floor(pred_answer)
                    else:
                        pred_answer = np.ceil(pred_answer)
                    pred_answer = int(pred_answer)
                pred_answer = str(pred_answer)
                if u'百分之几' in question:
                    pred_answer = pred_answer + '%'
            else:
                pred_answer = str(pred_answer)
                if '/' in pred_answer:
                    if re.findall(r'\d+/\d+', question):
                        # The question itself uses fractions: answer with a
                        # mixed number when the fraction is improper.
                        a, b = pred_answer.split('/')
                        a, b = int(a), int(b)
                        if a > b:
                            pred_answer = '%s_%s/%s' % (a // b, a % b, b)
                    else:
                        if re.findall(u'至少|最少', question):
                            pred_answer = np.ceil(eval(pred_answer))
                        elif re.findall(u'至多|最多', question):
                            pred_answer = np.floor(eval(pred_answer))
                        else:
                            pred_answer = np.ceil(eval(pred_answer))
                        pred_answer = str(int(pred_answer))
            fw.write(str(i) + ',' + pred_answer + '\n')
            fw.flush()
def _batch_texts(batch_size=100):
    """Yield the contents of the files in ``txts`` in batches.

    ``txts`` is taken from the enclosing scope and is iterated as a
    sequence of file paths.  Each file is read as UTF-8 and closed
    immediately instead of being left for the garbage collector.

    Args:
        batch_size: number of texts per batch (defaults to the previously
            hard-coded 100).

    Yields:
        Lists of file contents; the last list may be shorter.
    """
    texts = []
    for txt in txts:
        with open(txt, encoding='utf-8') as f:
            texts.append(f.read())
        if len(texts) == batch_size:
            yield texts
            texts = []
    if texts:
        yield texts
def load_vocab(dict_path, encoding='utf-8'):
    """Read the vocabulary from a BERT vocab file.

    Each stripped line is a token; a token is mapped to the size of the
    mapping at the time it is read, so a repeated token is re-assigned the
    current size rather than keeping its first index.

    Args:
        dict_path: path of the vocab file, one token per line.
        encoding: text encoding of the vocab file.

    Returns:
        A dict mapping token to id.
    """
    vocab = {}
    with open(dict_path, encoding=encoding) as vocab_file:
        for raw in vocab_file:
            vocab[raw.strip()] = len(vocab)
    return vocab
def load_data(filename):
    """Load ``(text, event type ids)`` samples from a JSON-lines file.

    Every event in a record's ``event_list`` contributes the id that
    ``label2id`` assigns to its ``event_type``.

    Args:
        filename: path of the UTF-8 JSON-lines file.

    Returns:
        A list of ``(text, ids)`` tuples, ``ids`` being a list with one
        entry per event in file order.
    """
    samples = []
    with open(filename, encoding='utf-8') as reader:
        for raw in reader:
            record = json.loads(raw)
            event_ids = [
                label2id[event['event_type']]
                for event in record['event_list']
            ]
            samples.append((record['text'], event_ids))
    return samples
def predict_to_file(in_file, out_file):
    """Predict entities for a JSON-lines file and write them to a file.

    The output can be submitted to https://www.cluebenchmarks.com/ner.html

    Every input record gets a ``label`` dict of the form
    ``{label: {entity_text: [[start, end], ...]}}`` built from the
    ``(start, end, label)`` triples returned by ``NER.recognize``; ``end``
    is treated as inclusive when slicing the entity text.

    Args:
        in_file: UTF-8 JSON-lines file whose records have a ``text`` key.
        out_file: destination UTF-8 JSON-lines file.
    """
    # Both handles are closed by ``with`` even if recognition raises.
    with open(out_file, 'w', encoding='utf-8') as fw, \
            open(in_file, encoding='utf-8') as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            labels = {}
            l['label'] = labels
            for start, end, label in NER.recognize(l['text']):
                entity = l['text'][start:end + 1]
                spans = labels.setdefault(label, {}).setdefault(entity, [])
                spans.append([start, end])
            fw.write(json.dumps(l, ensure_ascii=False) + '\n')
def load_data(filename):
    """Load sentence-pair data.

    Single item format: (text1, text2, label id)

    Args:
        filename: path of a UTF-8 TSV file with ``text1<TAB>text2<TAB>label``
            lines.

    Returns:
        A list of ``(text1, text2, int(label))`` tuples.

    Raises:
        ValueError: if a stripped line does not have exactly three
            tab-separated fields or the label is not an integer.
    """
    samples = []
    with open(filename, encoding='utf-8') as reader:
        for raw in reader:
            fields = raw.strip().split('\t')
            first, second, tag = fields
            samples.append((first, second, int(tag)))
    return samples