# Module-level setup mirrored from this repo's sibling files: `tools` supplies
# the tokenizer and schema, `args` the CLI arguments.
from utils.arguments_parse import args
from data_preprocessing import tools

tokenizer = tools.get_tokenizer()


def encoder(sentence, argument):
    """Encode a sentence and tag every argument span with B/I labels."""
    label2id, id2label, num_labels = tools.load_schema()
    encode_dict = tokenizer.encode_plus(sentence,
                                        max_length=args.max_length,
                                        truncation=True,
                                        padding='max_length')  # pad_to_max_length is deprecated
    encode_sent = encode_dict['input_ids']
    token_type_ids = encode_dict['token_type_ids']
    attention_mask = encode_dict['attention_mask']
    label = [0] * args.max_length  # 0 = outside/padding
    for key, value in argument.items():
        # Drop [CLS]/[SEP] from the argument encoding, then find where the
        # span starts inside the encoded sentence.
        encode_arg = tokenizer.encode(value)
        start_idx = tools.search(encode_arg[1:-1], encode_sent)
        # B tag on the first wordpiece, I tags on the rest.
        label[start_idx] = label2id[key] * 2 + 1
        for i in range(1, len(encode_arg[1:-1])):
            label[start_idx + i] = label2id[key] * 2 + 2
    return encode_sent, token_type_ids, attention_mask, label
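# Usage sketch for encoder() with made-up inputs: the sentence, role name and
# argument value are hypothetical and assume the role exists in the schema's
# label2id. Each role gets a B tag (label2id[key] * 2 + 1) and an I tag
# (label2id[key] * 2 + 2).
if __name__ == '__main__':
    demo_sentence = '雷军创办了小米'    # "Lei Jun founded Xiaomi"
    demo_argument = {'创始人': '雷军'}  # role -> argument span ("founder" -> "Lei Jun")
    ids, types, mask, label = encoder(demo_sentence, demo_argument)
    # `label` is zero everywhere except over the wordpieces of '雷军', which
    # carry that role's B tag followed by its I tags.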
def load_data(file_path):
    """Build one query sentence per (event_type, role) pair for MRC-style tagging."""
    event_type_dict = tools.load_schema()
    sentences = []
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
    for line in lines:
        data = json.loads(line)
        text = data['text']
        title = data['title']  # unused in this excerpt
        if data.get('event_list'):  # present and non-empty
            for event in data['event_list']:
                event_type = event['event_type']
                if event_type != '无事件':  # '无事件' = "no event"
                    role_list = event_type_dict[event_type]
                    for role in role_list:
                        # Query format: <event_type>[unused1]<role>[SEP]<text>
                        sentences.append(event_type + '[unused1]' + role + '[SEP]' + text)
    return sentences
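# Worked example of the query construction above, with a made-up record and
# schema entry (the event type and role names are hypothetical, not the real
# schema):
_demo_line = {'text': '某公司今日上市', 'title': 't',
              'event_list': [{'event_type': '公司上市'}]}
_demo_schema = {'公司上市': ['上市公司', '时间']}  # "company IPO" -> ["listed company", "time"]
_queries = [e['event_type'] + '[unused1]' + role + '[SEP]' + _demo_line['text']
            for e in _demo_line['event_list']
            for role in _demo_schema[e['event_type']]]
# _queries == ['公司上市[unused1]上市公司[SEP]某公司今日上市',
#              '公司上市[unused1]时间[SEP]某公司今日上市']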
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from utils.arguments_parse import args
import json
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import unicodedata
import re
from data_preprocessing import tools
from tqdm import tqdm
from sklearn.utils import shuffle

tokenizer = tools.get_tokenizer()
predicate2id, id2predicate, s_entity_type, o_entity_type, _, _ = tools.load_schema()


def load_data(file_path):
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
    sentences = []
    result = []
    for line in tqdm(lines):
        data = json.loads(line)
        text = data['text']
        s_dict = {}    # subjects seen in this sentence
        o_dict = {}    # objects seen in this sentence
        spo_dict = {}  # (subject, predicate) -> object triples
        for spo in data['spo_list']:
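            # (The source file is truncated at this point. The lines below are a
            # loud assumption of how a DuIE-style spo record is usually unpacked,
            # added only so the excerpt parses; they are NOT the original body.)
            s = spo['subject']    # subject mention
            p = spo['predicate']  # relation label
            o = spo['object']     # object mention (shape depends on schema version)
            s_dict.setdefault(p, []).append(s)
            o_dict.setdefault(p, []).append(o)
            spo_dict[(s, p)] = o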
import os
import sys
from typing import Any
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
import pickle
from torch.utils.data import DataLoader, Dataset
from torch import optim
import numpy as np
from data_preprocessing import tools

label2id, id2label, num_labels = tools.load_schema()
num_label = num_labels + 1  # +1 for the null/"no span" class
tokenizer = tools.get_tokenizer()


class biaffine(nn.Module):
    """Biaffine attention: scores every (start, end) token pair against each label."""

    def __init__(self, in_size, out_size, bias_x=True, bias_y=True):
        super().__init__()
        self.bias_x = bias_x
        self.bias_y = bias_y
        self.out_size = out_size
        # U.shape = [in_size(+1), out_size, in_size(+1)]; the optional extra
        # row/column absorbs the bias feature appended in forward().
        self.U = torch.nn.Parameter(
            torch.Tensor(in_size + int(bias_x), out_size, in_size + int(bias_y)))
        # torch.Tensor allocates uninitialized memory, so give U a proper init.
        torch.nn.init.xavier_uniform_(self.U)

    def forward(self, x, y):
        if self.bias_x:
            # Append a constant 1 feature so U can learn a bias over x.
            x = torch.cat((x, torch.ones_like(x[..., :1])), dim=-1)
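        # (Truncated in the source. Below is a minimal sketch of how the standard
        # biaffine forward pass usually continues -- an assumption, not the
        # original code.)
        if self.bias_y:
            y = torch.cat((y, torch.ones_like(y[..., :1])), dim=-1)
        # Bilinear scoring over all token pairs:
        # x: [b, Lx, i], U: [i, o, j], y: [b, Ly, j] -> scores: [b, Lx, Ly, o]
        return torch.einsum('bxi,ioj,byj->bxyo', x, y, self.U)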
import json
import unicodedata
import re
import torch
from tqdm import tqdm
from transformers import BertTokenizer

from utils.arguments_parse import args
from model.loss_function import multilabel_cross_entropy
from model.metrics import metrics
from data_preprocessing import *
from data_preprocessing import predict_data_prepro
from data_preprocessing import tools
# Assumed import path for bertMRC -- the excerpt does not show where the model
# class is defined:
from model.bert_mrc import bertMRC

device = torch.device('cuda')

added_token = ['[unused1]', '[unused2]']
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_path, additional_special_tokens=added_token)
predicate2id, id2predicate = tools.load_schema()

model = bertMRC(pre_train_dir=args.pretrained_model_path,
                dropout_rate=0.5).to(device)
model.load_state_dict(torch.load(args.checkpoints))
model.eval()


def load_data(file_path):
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
    sentences = []
    for line in lines:
        data = json.loads(line)
        sentences.append(data['text'])
    return sentences
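# Minimal driver sketch. The file path is hypothetical, and how the encoded
# batch is fed to bertMRC depends on its real forward signature, which this
# excerpt does not show:
if __name__ == '__main__':
    texts = load_data('data/dev.json')  # one JSON object per line with a 'text' key
    for text in tqdm(texts):
        encoded = tokenizer(text, return_tensors='pt', max_length=args.max_length,
                            truncation=True, padding='max_length')
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            outputs = model(**encoded)  # assumption: bertMRC takes HF-style kwargs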