def create_and_check_model_as_decoder( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask, ): config.add_cross_attention = True model = BertModel(config) model.to(torch_device) model.eval() result = model( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) result = model( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, ) result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask): model = BertModel(config) model.eval() sequence_output, pooled_output = model( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask) sequence_output, pooled_output = model( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states) sequence_output, pooled_output = model( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) result = { "sequence_output": sequence_output, "pooled_output": pooled_output, } self.parent.assertListEqual( list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(model_file), strict=False)
    device = torch.device(ctx)
    bertmodel.to(device)
    bertmodel.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_json(open(vocab_file, 'rt').read())
    return bertmodel, vocab_b_obj
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(model_file))
    device = torch.device(ctx)
    bertmodel.to(device)
    bertmodel.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file, padding_token='[PAD]')
    return bertmodel, vocab_b_obj
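# Hedged usage sketch for either get_kobert_model variant above. The file names
# are placeholders (the first variant expects a JSON vocab file, the second a
# SentencePiece model), and `bert_config` plus the gluonnlp import (`nlp`) are
# assumed to be defined as in the original snippets.
import torch

bertmodel, vocab = get_kobert_model("kobert_model.params", "kobert_vocab_file", ctx="cpu")
dummy_ids = torch.tensor([[2, 517, 3]])  # arbitrary token ids, batch of one
with torch.no_grad():
    outputs = bertmodel(dummy_ids)
print(outputs[0].shape)  # (1, 3, hidden_size)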
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = BertModel(config=config) model.to(input_ids.device) model.eval() sequence_output, pooled_output = model( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) # failed because there is not loss output model_desc = ModelDescription([ self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc ], [self.last_hidden_state_desc, self.pooler_output_desc]) args_gradient_accumulation_steps = 8 args_local_rank = 0 args_world_size = 1 args_fp16 = True args_allreduce_post_accumulation = True model = ORTTrainer( model, None, model_desc, "LambOptimizer", map_optimizer_attributes=map_optimizer_attributes, learning_rate_description=IODescription( 'Learning_Rate', [ 1, ], torch.float32), device=self.device, postprocess_model=postprocess_model, gradient_accumulation_steps=args_gradient_accumulation_steps, world_rank=args_local_rank, world_size=args_world_size, use_mixed_precision=True if args_fp16 else False, allreduce_post_accumulation=True if args_allreduce_post_accumulation else False) sequence_output, pooled_output = model( input_ids, token_type_ids=token_type_ids) sequence_output, pooled_output = model(input_ids) result = { "sequence_output": sequence_output, "pooled_output": pooled_output, } self.parent.assertListEqual( list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def main():
    # Env
    use_gpu = False  # torch.cuda.is_available()
    # batchsize = [1,4,8,16,32,64,128,256]

    # Data preparation
    input_np = np.random.randint(1000, size=(1, 512))
    input_ids = torch.from_numpy(input_np).long()
    attention_mask = torch.zeros(1, 512).long()
    token_type_ids = torch.ones(1, 512).long()
    if use_gpu:
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        token_type_ids = token_type_ids.cuda()

    # Model preparation
    configuration = BertConfig(vocab_size=30522,
                               hidden_size=768,
                               num_hidden_layers=12,
                               num_attention_heads=12,
                               intermediate_size=3072,
                               hidden_act='gelu',
                               hidden_dropout_prob=0.1,
                               attention_probs_dropout_prob=0.1,
                               max_position_embeddings=512,
                               type_vocab_size=2,
                               initializer_range=0.02,
                               layer_norm_eps=1e-12,
                               pad_token_id=0,
                               gradient_checkpointing=False)
    if use_gpu:
        model = BertModel(configuration).cuda()
    else:
        model = BertModel(configuration)

    # Eval with speed record
    model.eval()
    t1 = time.time()
    with torch.no_grad():
        for i in range(512):
            print(i)
            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids)
    print('val_time = {:.6f}'.format(time.time() - t1))
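# If the benchmark above is run with use_gpu=True, CUDA kernels launch
# asynchronously, so bracketing the loop with torch.cuda.synchronize() gives a
# more faithful wall-clock number. A minimal sketch, assuming `model` and the
# input tensors from main() are already on the GPU and `time`/`torch` are
# imported as in the snippet:
def timed_forward(model, input_ids, attention_mask, token_type_ids, iters=100):
    torch.cuda.synchronize()
    start = time.time()
    with torch.no_grad():
        for _ in range(iters):
            model(input_ids=input_ids,
                  attention_mask=attention_mask,
                  token_type_ids=token_type_ids)
    torch.cuda.synchronize()
    return (time.time() - start) / iters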
def create_and_check_bert_model(self, config, input_ids, token_type_ids,
                                input_mask, sequence_labels, token_labels,
                                choice_labels):
    model = BertModel(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    result = model(input_ids, token_type_ids=token_type_ids)
    result = model(input_ids)
    self.parent.assertEqual(result.last_hidden_state.shape,
                            (self.batch_size, self.seq_length, self.hidden_size))
    self.parent.assertEqual(result.pooler_output.shape,
                            (self.batch_size, self.hidden_size))
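# Standalone sketch of what create_and_check_bert_model verifies, outside the
# tester harness. The tiny config values here are illustrative only, not the
# ones the test harness actually uses.
import torch
from transformers import BertConfig, BertModel

config = BertConfig(vocab_size=99, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=4, intermediate_size=37)
model = BertModel(config)
model.eval()
input_ids = torch.randint(0, config.vocab_size, (2, 7))
with torch.no_grad():
    result = model(input_ids)
assert result.last_hidden_state.shape == (2, 7, config.hidden_size)
assert result.pooler_output.shape == (2, config.hidden_size)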
def _load_bert(self, bert_config_path: str, bert_model_path: str): bert_config = BertConfig.from_json_file(bert_config_path) model = BertModel(bert_config) if self.cuda: model_states = torch.load(bert_model_path) else: model_states = torch.load(bert_model_path, map_location='cpu') # fix model_states for k in list(model_states.keys()): if k.startswith("bert."): model_states[k[5:]] = model_states.pop(k) elif k.startswith("cls"): _ = model_states.pop(k) if k[-4:] == "beta": model_states[k[:-4]+"bias"] = model_states.pop(k) if k[-5:] == "gamma": model_states[k[:-5]+"weight"] = model_states.pop(k) model.load_state_dict(model_states) if self.cuda: model.cuda() model.eval() return model
def main(args, _=None):
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)

    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=get_features,
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    poolings = {}

    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            features_ = model(**batch)

            # create storage based on network output
            if idx == 0:
                # class
                _, embedding_size = features_[1].shape
                features["class"] = np.memmap(
                    f"{args.out_prefix}.class.npy",
                    dtype=np.float32,
                    mode="w+",
                    shape=(num_samples, embedding_size),
                )

                if args.output_hidden_states:
                    # all embeddings
                    for i, feature_ in enumerate(features_[2]):
                        name_ = f"embeddings_{i + 1:02d}"
                        _, _, embedding_size = feature_.shape
                        poolings[name_] = LamaPooling(
                            features_in=embedding_size,
                            groups=pooling_groups,
                        )
                        features[name_] = np.memmap(
                            f"{args.out_prefix}.{name_}.npy",
                            dtype=np.float32,
                            mode="w+",
                            shape=(num_samples, embedding_size),
                        )
                else:
                    # last
                    _, _, embedding_size = features_[0].shape
                    poolings["last"] = LamaPooling(
                        features_in=embedding_size,
                        groups=pooling_groups,
                    )
                    features["last"] = np.memmap(
                        f"{args.out_prefix}.last.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            features["class"][indices] = _detach(features_[1])

            if args.output_hidden_states:
                # all embeddings
                for i, feature_ in enumerate(features_[2]):
                    name_ = f"embeddings_{i + 1:02d}"
                    feature_ = poolings[name_](feature_)
                    features[name_][indices] = _detach(feature_)
            else:
                # last
                feature_ = poolings["last"](features_[0])
                features["last"][indices] = _detach(feature_)
def main(args, _=None): """Run the ``catalyst-contrib text2embeddings`` script.""" batch_size = args.batch_size num_workers = args.num_workers max_length = args.max_length pooling_groups = args.pooling.split(",") bert_level = args.bert_level if bert_level is not None: assert (args.output_hidden_states ), "You need hidden states output for level specification" set_global_seed(args.seed) prepare_cudnn(args.deterministic, args.benchmark) if getattr(args, "in_huggingface", False): model_config = BertConfig.from_pretrained(args.in_huggingface) model_config.output_hidden_states = args.output_hidden_states model = BertModel.from_pretrained(args.in_huggingface, config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_huggingface) else: model_config = BertConfig.from_pretrained(args.in_config) model_config.output_hidden_states = args.output_hidden_states model = BertModel(config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_vocab) if getattr(args, "in_model", None) is not None: checkpoint = load_checkpoint(args.in_model) checkpoint = {"model_state_dict": checkpoint} unpack_checkpoint(checkpoint=checkpoint, model=model) model = model.eval() model, _, _, _, device = process_components(model=model) df = pd.read_csv(args.in_csv) df = df.dropna(subset=[args.txt_col]) df.to_csv(f"{args.out_prefix}.df.csv", index=False) df = df.reset_index().drop("index", axis=1) df = list(df.to_dict("index").values()) num_samples = len(df) open_fn = LambdaReader( input_key=args.txt_col, output_key=None, lambda_fn=partial( tokenize_text, strip=args.strip, lowercase=args.lowercase, remove_punctuation=args.remove_punctuation, ), tokenizer=tokenizer, max_length=max_length, ) dataloader = get_loader( df, open_fn, batch_size=batch_size, num_workers=num_workers, ) features = {} dataloader = tqdm(dataloader) if args.verbose else dataloader with torch.no_grad(): for idx, batch_input in enumerate(dataloader): batch_input = any2device(batch_input, device) batch_output = model(**batch_input) mask = (batch_input["attention_mask"].unsqueeze(-1) if args.mask_for_max_length else None) if check_ddp_wrapped(model): # using several gpu hidden_size = model.module.config.hidden_size hidden_states = model.module.config.output_hidden_states else: # using cpu or one gpu hidden_size = model.config.hidden_size hidden_states = model.config.output_hidden_states batch_features = process_bert_output( bert_output=batch_output, hidden_size=hidden_size, output_hidden_states=hidden_states, pooling_groups=pooling_groups, mask=mask, ) # create storage based on network output if idx == 0: for layer_name, layer_value in batch_features.items(): if bert_level is not None and bert_level != layer_name: continue layer_name = (layer_name if isinstance(layer_name, str) else f"{layer_name:02d}") _, embedding_size = layer_value.shape features[layer_name] = np.memmap( f"{args.out_prefix}.{layer_name}.npy", dtype=np.float32, mode="w+", shape=(num_samples, embedding_size), ) indices = np.arange(idx * batch_size, min((idx + 1) * batch_size, num_samples)) for layer_name2, layer_value2 in batch_features.items(): if bert_level is not None and bert_level != layer_name2: continue layer_name2 = (layer_name2 if isinstance(layer_name2, str) else f"{layer_name2:02d}") features[layer_name2][indices] = _detach(layer_value2) if args.force_save: for key, mmap in features.items(): mmap.flush() np.save(f"{args.out_prefix}.{key}.force.npy", mmap, allow_pickle=False)
class JointBERT(BertPreTrainedModel): def __init__(self, bert_config, model_config, device, slot_dim, intent_dim, intent_weight=None): super(JointBERT, self).__init__(bert_config) self.slot_num_labels = slot_dim self.intent_num_labels = intent_dim self.device = device self.intent_weight = intent_weight if intent_weight is not None else torch.tensor( [1.] * intent_dim) self.bert = BertModel(bert_config) self.dropout = nn.Dropout(model_config['dropout']) self.context = model_config['context'] self.finetune = model_config['finetune'] self.context_grad = model_config['context_grad'] if self.context: self.intent_classifier = nn.Linear(2 * bert_config.hidden_size, self.intent_num_labels) self.slot_classifier = nn.Linear(2 * bert_config.hidden_size, self.slot_num_labels) self.intent_hidden = nn.Linear(2 * bert_config.hidden_size, 2 * bert_config.hidden_size) self.slot_hidden = nn.Linear(2 * bert_config.hidden_size, 2 * bert_config.hidden_size) else: self.intent_classifier = nn.Linear(bert_config.hidden_size, self.intent_num_labels) self.slot_classifier = nn.Linear(bert_config.hidden_size, self.slot_num_labels) self.intent_hidden = nn.Linear(bert_config.hidden_size, bert_config.hidden_size) self.slot_hidden = nn.Linear(bert_config.hidden_size, bert_config.hidden_size) self.intent_loss_fct = torch.nn.BCEWithLogitsLoss( pos_weight=self.intent_weight) self.slot_loss_fct = torch.nn.CrossEntropyLoss() self.init_weights() def forward(self, word_seq_tensor, word_mask_tensor, tag_seq_tensor=None, tag_mask_tensor=None, intent_tensor=None, context_seq_tensor=None, context_mask_tensor=None): if not self.finetune: self.bert.eval() with torch.no_grad(): outputs = self.bert(input_ids=word_seq_tensor, attention_mask=word_mask_tensor) else: outputs = self.bert(input_ids=word_seq_tensor, attention_mask=word_mask_tensor) sequence_output = outputs[0] pooled_output = outputs[1] if self.context and context_seq_tensor is not None: if not self.finetune or not self.context_grad: with torch.no_grad(): context_output = self.bert( input_ids=context_seq_tensor, attention_mask=context_mask_tensor)[1] else: context_output = self.bert( input_ids=context_seq_tensor, attention_mask=context_mask_tensor)[1] sequence_output = torch.cat([ context_output.unsqueeze(1).repeat(1, sequence_output.size(1), 1), sequence_output ], dim=-1) pooled_output = torch.cat([context_output, pooled_output], dim=-1) sequence_output = nn.functional.relu( self.dropout(self.slot_hidden(sequence_output))) pooled_output = nn.functional.relu( self.dropout(self.intent_hidden(pooled_output))) sequence_output = self.dropout(sequence_output) slot_logits = self.slot_classifier(sequence_output) outputs = (slot_logits, ) pooled_output = self.dropout(pooled_output) intent_logits = self.intent_classifier(pooled_output) outputs = outputs + (intent_logits, ) if tag_seq_tensor is not None: active_tag_loss = tag_mask_tensor.view(-1) == 1 active_tag_logits = slot_logits.view( -1, self.slot_num_labels)[active_tag_loss] active_tag_labels = tag_seq_tensor.view(-1)[active_tag_loss] slot_loss = self.slot_loss_fct(active_tag_logits, active_tag_labels) outputs = outputs + (slot_loss, ) if intent_tensor is not None: intent_loss = self.intent_loss_fct(intent_logits, intent_tensor) outputs = outputs + (intent_loss, ) return outputs # slot_logits, intent_logits, (slot_loss), (intent_loss),
def main(): parser = argparse.ArgumentParser( description='Train the individual Transformer model') parser.add_argument('--dataset_folder', type=str, default='datasets') parser.add_argument('--dataset_name', type=str, default='zara1') parser.add_argument('--obs', type=int, default=8) parser.add_argument('--preds', type=int, default=12) parser.add_argument('--emb_size', type=int, default=1024) parser.add_argument('--heads', type=int, default=8) parser.add_argument('--layers', type=int, default=6) parser.add_argument('--dropout', type=float, default=0.1) parser.add_argument('--cpu', action='store_true') parser.add_argument('--output_folder', type=str, default='Output') parser.add_argument('--val_size', type=int, default=50) parser.add_argument('--gpu_device', type=str, default="0") parser.add_argument('--verbose', action='store_true') parser.add_argument('--max_epoch', type=int, default=100) parser.add_argument('--batch_size', type=int, default=256) parser.add_argument('--validation_epoch_start', type=int, default=30) parser.add_argument('--resume_train', action='store_true') parser.add_argument('--delim', type=str, default='\t') parser.add_argument('--name', type=str, default="zara1") args = parser.parse_args() model_name = args.name try: os.mkdir('models') except: pass try: os.mkdir('output') except: pass try: os.mkdir('output/BERT') except: pass try: os.mkdir(f'models/BERT') except: pass try: os.mkdir(f'output/BERT/{args.name}') except: pass try: os.mkdir(f'models/BERT/{args.name}') except: pass log = SummaryWriter('logs/BERT_%s' % model_name) log.add_scalar('eval/mad', 0, 0) log.add_scalar('eval/fad', 0, 0) try: os.mkdir(args.name) except: pass device = torch.device("cuda") if args.cpu or not torch.cuda.is_available(): device = torch.device("cpu") args.verbose = True ## creation of the dataloaders for train and validation train_dataset, _ = baselineUtils.create_dataset(args.dataset_folder, args.dataset_name, 0, args.obs, args.preds, delim=args.delim, train=True, verbose=args.verbose) val_dataset, _ = baselineUtils.create_dataset(args.dataset_folder, args.dataset_name, 0, args.obs, args.preds, delim=args.delim, train=False, verbose=args.verbose) test_dataset, _ = baselineUtils.create_dataset(args.dataset_folder, args.dataset_name, 0, args.obs, args.preds, delim=args.delim, train=False, eval=True, verbose=args.verbose) from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, AdamW config = BertConfig(vocab_size=30522, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act='relu', hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12) model = BertModel(config).to(device) from individual_TF import LinearEmbedding as NewEmbed, Generator as GeneratorTS a = NewEmbed(3, 768).to(device) model.set_input_embeddings(a) generator = GeneratorTS(768, 2).to(device) #model.set_output_embeddings(GeneratorTS(1024,2)) tr_dl = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) val_dl = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=0) test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=0) #optim = SGD(list(a.parameters())+list(model.parameters())+list(generator.parameters()),lr=0.01) #sched=torch.optim.lr_scheduler.StepLR(optim,0.0005) optim = NoamOpt( 768, 0.1, len(tr_dl), 
torch.optim.Adam(list(a.parameters()) + list(model.parameters()) + list(generator.parameters()), lr=0, betas=(0.9, 0.98), eps=1e-9)) #optim=Adagrad(list(a.parameters())+list(model.parameters())+list(generator.parameters()),lr=0.01,lr_decay=0.001) epoch = 0 mean = train_dataset[:]['src'][:, :, 2:4].mean((0, 1)) * 0 std = train_dataset[:]['src'][:, :, 2:4].std((0, 1)) * 0 + 1 while epoch < args.max_epoch: epoch_loss = 0 model.train() for id_b, batch in enumerate(tr_dl): optim.optimizer.zero_grad() r = 0 rot_mat = np.array([[np.cos(r), np.sin(r)], [-np.sin(r), np.cos(r)]]) inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device) inp = torch.matmul(inp, torch.from_numpy(rot_mat).float().to(device)) trg_masked = torch.zeros((inp.shape[0], args.preds, 2)).to(device) inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device) trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1], 1).to(device) inp_cat = torch.cat((inp, trg_masked), 1) cls_cat = torch.cat((inp_cls, trg_cls), 1) net_input = torch.cat((inp_cat, cls_cat), 2) position = torch.arange(0, net_input.shape[1]).repeat( inp.shape[0], 1).long().to(device) token = torch.zeros( (inp.shape[0], net_input.shape[1])).long().to(device) attention_mask = torch.ones( (inp.shape[0], net_input.shape[1])).long().to(device) out = model(input_ids=net_input, position_ids=position, token_type_ids=token, attention_mask=attention_mask) pred = generator(out[0]) loss = F.pairwise_distance( pred[:, :].contiguous().view(-1, 2), torch.matmul( torch.cat( (batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]), 1).contiguous().view(-1, 2).to(device), torch.from_numpy(rot_mat).float().to(device))).mean() loss.backward() optim.step() print("epoch %03i/%03i frame %04i / %04i loss: %7.4f" % (epoch, args.max_epoch, id_b, len(tr_dl), loss.item())) epoch_loss += loss.item() #sched.step() log.add_scalar('Loss/train', epoch_loss / len(tr_dl), epoch) with torch.no_grad(): model.eval() gt = [] pr = [] val_loss = 0 for batch in val_dl: inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device) trg_masked = torch.zeros( (inp.shape[0], args.preds, 2)).to(device) inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device) trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1], 1).to(device) inp_cat = torch.cat((inp, trg_masked), 1) cls_cat = torch.cat((inp_cls, trg_cls), 1) net_input = torch.cat((inp_cat, cls_cat), 2) position = torch.arange(0, net_input.shape[1]).repeat( inp.shape[0], 1).long().to(device) token = torch.zeros( (inp.shape[0], net_input.shape[1])).long().to(device) attention_mask = torch.zeros( (inp.shape[0], net_input.shape[1])).long().to(device) out = model(input_ids=net_input, position_ids=position, token_type_ids=token, attention_mask=attention_mask) pred = generator(out[0]) loss = F.pairwise_distance( pred[:, :].contiguous().view(-1, 2), torch.cat( (batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]), 1).contiguous().view(-1, 2).to(device)).mean() val_loss += loss.item() gt_b = batch['trg'][:, :, 0:2] preds_tr_b = pred[:, args.obs:].cumsum(1).to( 'cpu').detach() + batch['src'][:, -1:, 0:2] gt.append(gt_b) pr.append(preds_tr_b) gt = np.concatenate(gt, 0) pr = np.concatenate(pr, 0) mad, fad, errs = baselineUtils.distance_metrics(gt, pr) log.add_scalar('validation/loss', val_loss / len(val_dl), epoch) log.add_scalar('validation/mad', mad, epoch) log.add_scalar('validation/fad', fad, epoch) model.eval() gt = [] pr = [] for batch in test_dl: inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device) trg_masked = torch.zeros( (inp.shape[0], args.preds, 
2)).to(device) inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device) trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1], 1).to(device) inp_cat = torch.cat((inp, trg_masked), 1) cls_cat = torch.cat((inp_cls, trg_cls), 1) net_input = torch.cat((inp_cat, cls_cat), 2) position = torch.arange(0, net_input.shape[1]).repeat( inp.shape[0], 1).long().to(device) token = torch.zeros( (inp.shape[0], net_input.shape[1])).long().to(device) attention_mask = torch.zeros( (inp.shape[0], net_input.shape[1])).long().to(device) out = model(input_ids=net_input, position_ids=position, token_type_ids=token, attention_mask=attention_mask) pred = generator(out[0]) gt_b = batch['trg'][:, :, 0:2] preds_tr_b = pred[:, args.obs:].cumsum(1).to( 'cpu').detach() + batch['src'][:, -1:, 0:2] gt.append(gt_b) pr.append(preds_tr_b) gt = np.concatenate(gt, 0) pr = np.concatenate(pr, 0) mad, fad, errs = baselineUtils.distance_metrics(gt, pr) torch.save(model.state_dict(), "models/BERT/%s/ep_%03i.pth" % (args.name, epoch)) torch.save(generator.state_dict(), "models/BERT/%s/gen_%03i.pth" % (args.name, epoch)) torch.save(a.state_dict(), "models/BERT/%s/emb_%03i.pth" % (args.name, epoch)) log.add_scalar('eval/mad', mad, epoch) log.add_scalar('eval/fad', fad, epoch) epoch += 1 ab = 1
class UnStructuredModel:
    def __init__(self, model_name, max_length, stride):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.max_length = max_length
        self.stride = stride
        if model_name == 'bert-base-uncased':
            self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
            self.model = BertModel.from_pretrained(self.model_name)
            self.model.to(device)
            self.model.eval()
            for param in self.model.parameters():
                param.requires_grad = False
            # self.model.bert.embeddings.requires_grad = False

    def padTokens(self, tokens):
        if len(tokens) < self.max_length:
            tokens = tokens + ["[PAD]" for i in range(self.max_length - len(tokens))]
        return tokens

    def getEmbedding(self, text, if_pool=True, pooling_type="mean", batchsize=1):
        tokens = self.tokenizer.tokenize(text)
        tokenized_array = self.tokenizeText(tokens)
        embeddingTensorsList = []
        print(len(tokenized_array))
        tensor = torch.zeros([1, 768], device=device)
        count = 0
        if len(tokenized_array) > batchsize:
            for i in range(0, len(tokenized_array), batchsize):
                current_tokens = tokenized_array[i:min(i + batchsize, len(tokenized_array))]
                token_ids = torch.tensor(current_tokens).to(device)
                seg_ids = [[0 for _ in range(len(tokenized_array[0]))]
                           for _ in range(len(current_tokens))]
                seg_ids = torch.tensor(seg_ids).to(device)
                hidden_reps, cls_head = self.model(token_ids, token_type_ids=seg_ids)
                cls_head = cls_head.to(device)
                cls_head = cls_head.detach()
                if if_pool and pooling_type == "mean":
                    tensor = tensor.add(torch.sum(cls_head, dim=0))
                    count += cls_head.shape[0]
                else:
                    embeddingTensorsList.append(cls_head)
                del cls_head, hidden_reps
            if if_pool and pooling_type == "mean" and count > 0:
                embedding = torch.div(tensor, count)
            elif not if_pool:
                embedding = torch.cat(embeddingTensorsList, dim=0)
            else:
                raise NotImplementedError()
        else:
            token_ids = torch.tensor(tokenized_array).to(device)
            seg_ids = [[0 for _ in range(len(tokenized_array[0]))]
                       for _ in range(len(tokenized_array))]
            seg_ids = torch.tensor(seg_ids).to(device)
            hidden_reps, cls_head = self.model(token_ids, token_type_ids=seg_ids)
            cls_head = cls_head.to(device)
            cls_head.requires_grad = False
            if if_pool and pooling_type == "mean":
                embedding = torch.div(torch.sum(cls_head, dim=0), cls_head.shape[0])
            elif not if_pool:
                embedding = cls_head
            else:
                raise NotImplementedError()
            del cls_head, hidden_reps
        return embedding

    def tokenizeText(self, tokens):
        tokens_array = []
        # window_movement_tokens = max_length - stride
        for i in range(0, len(tokens), self.stride):
            if i + self.max_length < len(tokens):
                curr_tokens = ["[CLS]"] + tokens[i:i + self.max_length] + ["[SEP]"]
            else:
                padded_tokens = self.padTokens(tokens[i:i + self.max_length])
                curr_tokens = ["[CLS]"] + padded_tokens + ["[SEP]"]
            curr_tokens = self.tokenizer.convert_tokens_to_ids(curr_tokens)
            tokens_array.append(curr_tokens)
        return tokens_array
def main(config_path): config = Box.from_yaml(config_path.open()) torch.cuda.set_device(config.train.device) logger = create_logger(name="MAIN") logger.info(f"[-] Config loaded from {config_path}") data_dir = Path(config.data.data_dir) save_dir = Path(config.data.save_dir) if not save_dir.exists(): save_dir.mkdir() transfo_dir = Path(config.data.transfo_dir) device = create_device(config.train.device) tokenizer = BertTokenizer.from_pretrained( str(transfo_dir), do_lower_case=(not config.data.cased)) global CLS global SEP global PAD CLS, SEP, PAD = tokenizer.convert_tokens_to_ids( ["[CLS]", "[SEP]", "[PAD]"]) bert_config = BertConfig.from_pretrained(str(transfo_dir)) # To extract representations from other layers bert_config.output_hidden_states = True model = BertModel(bert_config) model.to(device) model.eval() train_file = data_dir / "schema_dstc8+m2.2.json" train_vocab_file = save_dir / "train_schema_vocab.pkl" train_embed_file = save_dir / "train_schema_embed.pkl" train_desc_file = save_dir / "train_schema_desc.pkl" valid_file = data_dir / "dev" / "schema.json" valid_vocab_file = save_dir / "valid_schema_vocab.pkl" valid_embed_file = save_dir / "valid_schema_embed.pkl" valid_desc_file = save_dir / "valid_schema_desc.pkl" if (data_dir / "test").exists(): test_file = data_dir / "test" / "schema.json" test_vocab_file = save_dir / "test_schema_vocab.pkl" test_embed_file = save_dir / "test_schema_embed.pkl" test_desc_file = save_dir / "test_schema_desc.pkl" else: test_file = None test_vocab_file = None test_embed_file = None test_desc_file = None train_schema_vocab, train_desc = extract(train_file, config.data.concat_name) valid_schema_vocab, valid_desc = extract(valid_file, config.data.concat_name) if test_file is not None: test_schema_vocab, test_desc = extract(test_file, config.data.concat_name) else: test_schema_vocab = test_desc = None pickle.dump(train_schema_vocab, open(train_vocab_file, "wb")) pickle.dump(valid_schema_vocab, open(valid_vocab_file, "wb")) if test_schema_vocab is not None: pickle.dump(test_schema_vocab, open(test_vocab_file, "wb")) layer = config.data.schema.layer pooling = config.data.schema.pooling train_embed = [] for desc in tqdm(train_desc, leave=False): embed = [] for sent in tqdm(desc, leave=False): embed.append( get_rep(sent, model, tokenizer, layer, pooling, device)) embed = torch.stack(embed) train_embed.append(embed) train_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)] for sent in desc] for desc in train_desc] pickle.dump(train_embed, open(train_embed_file, "wb")) pickle.dump(train_desc, open(train_desc_file, "wb")) valid_embed = [] for desc in tqdm(valid_desc, leave=False): embed = [] for sent in tqdm(desc, leave=False): embed.append( get_rep(sent, model, tokenizer, layer, pooling, device)) embed = torch.stack(embed) valid_embed.append(embed) valid_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)] for sent in desc] for desc in valid_desc] pickle.dump(valid_embed, open(valid_embed_file, "wb")) pickle.dump(valid_desc, open(valid_desc_file, "wb")) if test_desc is None: exit() test_embed = [] for desc in tqdm(test_desc, leave=False): embed = [] for sent in tqdm(desc, leave=False): embed.append( get_rep(sent, model, tokenizer, layer, pooling, device)) embed = torch.stack(embed) test_embed.append(embed) test_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)] for sent in desc] for desc in test_desc] pickle.dump(test_embed, open(test_embed_file, "wb")) pickle.dump(test_desc, open(test_desc_file, "wb"))
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
BertModel = BertModel.from_pretrained('bert-base-uncased').to(device)
BertModel.eval()


class Decoder(nn.Module):
    def __init__(self, vocab_size, use_glove, use_bert, glove_vectors, vocab):
        super(Decoder, self).__init__()
        self.vocab = vocab
        self.encoder_dim = 2048
        self.attention_dim = 512
        self.use_bert = use_bert
        if use_glove:
            self.embed_dim = 200
        elif use_bert:
            self.embed_dim = 768
        else:
            self.embed_dim = 512
        self.decoder_dim = 512
        self.vocab_size = vocab_size
        self.dropout = 0.5
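# Hedged sketch of how the frozen BERT loaded above (bound to the module-level
# name BertModel) can supply the 768-dim token embeddings that the use_bert
# branch of Decoder expects. The caption string and padding length are
# illustrative, and a transformers version with a callable tokenizer is assumed.
caption = "a dog runs across the field"
encoding = tokenizer(caption, return_tensors='pt', padding='max_length',
                     max_length=20, truncation=True)
with torch.no_grad():
    outputs = BertModel(input_ids=encoding['input_ids'].to(device),
                        attention_mask=encoding['attention_mask'].to(device))
token_embeddings = outputs[0]  # shape (1, 20, 768), matching embed_dim = 768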
class NERPredict(IPredict): ''' 构造函数, 初始化预测器 use_gpu: 使用GPU bert_config_file_name: Bert模型配置文件路径 vocab_file_name: 单词表文件路径 tags_file_name: Tag表文件路径 bert_model_path: Bert模型装载路径 lstm_crf_model_path: CRF模型装载路径 hidden_dim: CRF隐藏层 ''' def __init__(self, use_gpu, bert_config_file_name, vocab_file_name, tags_file_name, bert_model_path, lstm_crf_model_path, hidden_dim): self.use_gpu = use_gpu self.data_manager_init(vocab_file_name, tags_file_name) self.tokenizer = BertTokenizer.from_pretrained(vocab_file_name) self.model_init(hidden_dim, bert_config_file_name, bert_model_path, lstm_crf_model_path) def data_manager_init(self, vocab_file_name, tags_file_name): tags_list = BERTDataManager.ReadTagsList(tags_file_name) tags_list = [tags_list] self.dm = BERTDataManager(tags_list=tags_list, vocab_file_name=vocab_file_name) def model_init(self, hidden_dim, bert_config_file_name, bert_model_path, lstm_crf_model_path): config = BertConfig.from_json_file(bert_config_file_name) self.model = BertModel(config) bert_dict = torch.load(bert_model_path).module.state_dict() self.model.load_state_dict(bert_dict) self.birnncrf = torch.load(lstm_crf_model_path) self.model.eval() self.birnncrf.eval() def data_process(self, sentences): result = [] pad_tag = '[PAD]' if type(sentences) == str: sentences = [sentences] max_len = 0 for sentence in sentences: encode = self.tokenizer.encode(sentence, add_special_tokens=True) result.append(encode) if max_len < len(encode): max_len = len(encode) for i, sentence in enumerate(result): remain = max_len - len(sentence) for _ in range(remain): result[i].append(self.dm.wordToIdx(pad_tag)) return torch.tensor(result) def pred(self, sentences): sentences = self.data_process(sentences) if torch.cuda.is_available() and self.use_gpu: self.model.cuda() self.birnncrf.cuda() sentences = sentences.cuda() outputs = self.model(input_ids=sentences, attention_mask=sentences.gt(0)) hidden_states = outputs[0] scores, tags = self.birnncrf(hidden_states, sentences.gt(0)) final_tags = [] decode_sentences = [] for item in tags: final_tags.append([self.dm.idx_to_tag[tag] for tag in item]) for item in sentences.tolist(): decode_sentences.append(self.tokenizer.decode(item)) return (scores, tags, final_tags, decode_sentences) def __call__(self, sentences): return self.pred(sentences)
def main(args, _=None): """Run the ``catalyst-data text2embeddings`` script.""" batch_size = args.batch_size num_workers = args.num_workers max_length = args.max_length pooling_groups = args.pooling.split(",") utils.set_global_seed(args.seed) utils.prepare_cudnn(args.deterministic, args.benchmark) if hasattr(args, "in_huggingface"): model_config = BertConfig.from_pretrained(args.in_huggingface) model_config.output_hidden_states = args.output_hidden_states model = BertModel.from_pretrained(args.in_huggingface, config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_huggingface) else: model_config = BertConfig.from_pretrained(args.in_config) model_config.output_hidden_states = args.output_hidden_states model = BertModel(config=model_config) tokenizer = BertTokenizer.from_pretrained(args.in_vocab) if hasattr(args, "in_model"): checkpoint = utils.load_checkpoint(args.in_model) checkpoint = {"model_state_dict": checkpoint} utils.unpack_checkpoint(checkpoint=checkpoint, model=model) model = model.eval() model, _, _, _, device = utils.process_components(model=model) df = pd.read_csv(args.in_csv) df = df.dropna(subset=[args.txt_col]) df.to_csv(f"{args.out_prefix}.df.csv", index=False) df = df.reset_index().drop("index", axis=1) df = list(df.to_dict("index").values()) num_samples = len(df) open_fn = LambdaReader( input_key=args.txt_col, output_key=None, lambda_fn=partial( tokenize_text, strip=args.strip, lowercase=args.lowercase, remove_punctuation=args.remove_punctuation, ), tokenizer=tokenizer, max_length=max_length, ) dataloader = utils.get_loader( df, open_fn, batch_size=batch_size, num_workers=num_workers, ) features = {} dataloader = tqdm(dataloader) if args.verbose else dataloader with torch.no_grad(): for idx, batch in enumerate(dataloader): batch = utils.any2device(batch, device) bert_output = model(**batch) mask = (batch["attention_mask"].unsqueeze(-1) if args.mask_for_max_length else None) if utils.check_ddp_wrapped(model): # using several gpu hidden_size = model.module.config.hidden_size hidden_states = model.module.config.output_hidden_states else: # using cpu or one gpu hidden_size = model.config.hidden_size hidden_states = model.config.output_hidden_states features_ = process_bert_output( bert_output=bert_output, hidden_size=hidden_size, output_hidden_states=hidden_states, pooling_groups=pooling_groups, mask=mask, ) # create storage based on network output if idx == 0: for key, value in features_.items(): name_ = key if isinstance(key, str) else f"{key:02d}" _, embedding_size = value.shape features[name_] = np.memmap( f"{args.out_prefix}.{name_}.npy", dtype=np.float32, mode="w+", shape=(num_samples, embedding_size), ) indices = np.arange(idx * batch_size, min((idx + 1) * batch_size, num_samples)) for key, value in features_.items(): name_ = key if isinstance(key, str) else f"{key:02d}" features[name_][indices] = _detach(value)
class SentenceBert(BertPreTrainedModel): def __init__(self, config, max_len, tokenizer, device, task_type): super(SentenceBert, self).__init__(config) self.max_len = max_len self.task_type = task_type self._target_device = device self.tokenizer = tokenizer self.bert = BertModel(config=config) self.classifier = nn.Linear(3 * config.hidden_size, config.num_labels) def forward(self, inputs): input_a = inputs[0] input_b = inputs[1] output_a = self.bert(**input_a, return_dict=True, output_hidden_states=True) output_b = self.bert(**input_b, return_dict=True, output_hidden_states=True) #采用最后一层 embedding_a = output_a.hidden_states[-1] embedding_b = output_b.hidden_states[-1] embedding_a = self.pooling(embedding_a, input_a) embedding_b = self.pooling(embedding_b, input_b) if self.task_type == "classification": embedding_abs = torch.abs(embedding_a - embedding_b) vectors_concat = [] vectors_concat.append(embedding_a) vectors_concat.append(embedding_b) vectors_concat.append(embedding_abs) #列拼接3个768————>3*768 features = torch.cat(vectors_concat, 1) output = self.classifier(features) else: d = torch.mul(embedding_a, embedding_b) a_len = torch.norm(embedding_a, dim=1) b_len = torch.norm(embedding_b, dim=1) cos = torch.sum(d) / (a_len * b_len) output = cos return output def pooling(self, token_embeddings, input): output_vectors = [] #attention_mask attention_mask = input['attention_mask'] #[B,L]------>[B,L,1]------>[B,L,768],矩阵的值是0或者1 input_mask_expanded = attention_mask.unsqueeze(-1).expand( token_embeddings.size()).float() #这里做矩阵点积,就是对元素相乘(序列中padding字符,通过乘以0给去掉了)[B,L,768] t = token_embeddings * input_mask_expanded #[B,768] sum_embeddings = torch.sum(t, 1) # [B,768],最大值为seq_len sum_mask = input_mask_expanded.sum(1) #限定每个元素的最小值是1e-9,保证分母不为0 sum_mask = torch.clamp(sum_mask, min=1e-9) #得到最后的具体embedding的每一个维度的值——元素相除 output_vectors.append(sum_embeddings / sum_mask) #列拼接 output_vector = torch.cat(output_vectors, 1) return output_vector def encoding(self, inputs): self.bert.eval() with torch.no_grad(): output = self.bert(**inputs, return_dict=True, output_hidden_states=True) embedding = output.hidden_states[-1] embedding = self.pooling(embedding, inputs) return embedding # def class_infer(self,texts,batch_size=64): # """ # 推理输入文本list中第一条和剩余其他的是否是同一类别;可以传入无限长的list # :param texts: # :param tokenizer: # :param batch_size: # :return: # """ # result = [] # text_a = texts[0] # input_id_a, attention_mask_a = self.convert_text2ids(text_a) # input_ids_a = [] # attention_masks_a = [] # input_ids_b = [] # attention_masks_b = [] # for text in texts[1:]: # input_id_b, attention_mask_b = self.convert_text2ids(text) # # input_ids_a.append(input_id_a) # attention_masks_a.append(attention_mask_a) # # input_ids_b.append(input_id_b) # attention_masks_b.append(attention_mask_b) # if len(input_ids_a) >= batch_size: # inputs = [] # input_ids_a = torch.as_tensor(input_ids_a,dtype=torch.long,device=self.device) # attention_masks_a = torch.as_tensor(attention_masks_a,dtype=torch.long,device=self.device) # token_type_ids_a = torch.zeros_like(input_ids_a).to(self.device) # # inputs_a = {'input_ids': input_ids_a, 'attention_mask': attention_masks_a,'token_type_ids':token_type_ids_a} # # input_ids_b = torch.as_tensor(input_ids_b, dtype=torch.long, device=self.device) # attention_masks_b = torch.as_tensor(attention_masks_b, dtype=torch.long, device=self.device) # token_type_ids_b = torch.zeros_like(input_ids_b).to(self.device) # # inputs_b = {'input_ids': input_ids_b, 'attention_mask': attention_masks_b, # 'token_type_ids': 
token_type_ids_b} # # inputs.append(inputs_a) # inputs.append(inputs_b) # logits = self.forward(inputs) # # # lables = torch.argmax(logits) # result.append(lables) # # # input_ids_a = [] # attention_masks_a = [] # input_ids_b = [] # attention_masks_b = [] # # # # inputs = [] # input_ids_a = torch.as_tensor(input_ids_a, dtype=torch.long, device=self.device) # attention_masks_a = torch.as_tensor(attention_masks_a, dtype=torch.long, device=self.device) # token_type_ids_a = torch.zeros_like(input_ids_a).to(self.device) # # inputs_a = {'input_ids': input_ids_a, 'attention_mask': attention_masks_a, 'token_type_ids': token_type_ids_a} # # input_ids_b = torch.as_tensor(input_ids_b, dtype=torch.long, device=self.device) # attention_masks_b = torch.as_tensor(attention_masks_b, dtype=torch.long, device=self.device) # token_type_ids_b = torch.zeros_like(input_ids_b).to(self.device) # # inputs_b = {'input_ids': input_ids_b, 'attention_mask': attention_masks_b, # 'token_type_ids': token_type_ids_b} # # inputs.append(inputs_a) # inputs.append(inputs_b) # logits = self.forward(inputs) # # lables = torch.argmax(logits) # result.append(lables) # # # return result def class_infer(self, texts, batch_size=64): """ 推理输入文本list中第一条和剩余其他的是否是同一类别;传入长度<batch_size :param texts: :param tokenizer: :param batch_size: :return: """ assert len(texts) <= batch_size result = [] text_a = texts[0] input_id_a, attention_mask_a = self.convert_text2ids(text_a) input_ids_a = [] attention_masks_a = [] input_ids_b = [] attention_masks_b = [] for text in texts[1:]: input_id_b, attention_mask_b = self.convert_text2ids(text) input_ids_a.append(input_id_a) attention_masks_a.append(attention_mask_a) input_ids_b.append(input_id_b) attention_masks_b.append(attention_mask_b) inputs = [] input_ids_a = torch.as_tensor(input_ids_a, dtype=torch.long, device=self._target_device) attention_masks_a = torch.as_tensor(attention_masks_a, dtype=torch.long, device=self._target_device) token_type_ids_a = torch.zeros_like(input_ids_a).to( self._target_device) inputs_a = { 'input_ids': input_ids_a, 'attention_mask': attention_masks_a, 'token_type_ids': token_type_ids_a } input_ids_b = torch.as_tensor(input_ids_b, dtype=torch.long, device=self._target_device) attention_masks_b = torch.as_tensor(attention_masks_b, dtype=torch.long, device=self._target_device) token_type_ids_b = torch.zeros_like(input_ids_b).to( self._target_device) inputs_b = { 'input_ids': input_ids_b, 'attention_mask': attention_masks_b, 'token_type_ids': token_type_ids_b } inputs.append(inputs_a) inputs.append(inputs_b) logits = self.forward(inputs) lables = torch.argmax(logits) result.append(lables) return result def similarity_infer(self, texts, batch_size=64): """ 计算输入文本list中第一条和剩余其他文本的相似度 传入长度<batch_size :param texts: :param tokenizer: :param batch_size: :return: """ assert len(texts) <= batch_size input_ids = [] attention_masks = [] for text in texts: input_id, attention_mask = self.convert_text2ids(text) input_ids.append(input_id) attention_masks.append(attention_mask) input_ids_a = torch.as_tensor(input_ids, dtype=torch.long, device=self._target_device) attention_masks_a = torch.as_tensor(attention_masks, dtype=torch.long, device=self._target_device) token_type_ids_a = torch.zeros_like(input_ids_a).to( self._target_device) inputs = { 'input_ids': input_ids_a, 'attention_mask': attention_masks_a, 'token_type_ids': token_type_ids_a } embeddings = self.encoding(inputs) embedding_a = embeddings[0:1] embedding_b = embeddings[1:] d = torch.mul(embedding_a, embedding_b) # 计算对应元素相乘 a_len 
= torch.norm(embedding_a, dim=1) # 2范数,也就是模长 b_len = torch.norm(embedding_b, dim=1) cos = torch.sum(d, dim=1) / (a_len * b_len) # 得到相似度 simlaritys = cos return simlaritys def convert_text2ids(self, text): text = text[0:self.max_len - 2] inputs = self.tokenizer(text) input_ids = inputs['input_ids'] # lenght = len(input_ids) attention_mask = inputs['attention_mask'] paddings = [0] * (self.max_len - len(input_ids)) input_ids += paddings attention_mask += paddings return input_ids, attention_mask
import os

import torch
from transformers import BertConfig, BertModel

if args.size == 'tiny':
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    bert_name_or_path = os.path.join(os.path.join(cur_dir, 'bert'),
                                     'bert-tiny-uncased-config.json')
elif args.size == 'base':
    bert_name_or_path = "bert-base-uncased"
else:
    bert_name_or_path = "bert-large-uncased"

config = BertConfig.from_pretrained(bert_name_or_path)
model = BertModel(config)
model.eval()

device = torch.device("cpu")
model.to(device)

dummy_input0 = torch.LongTensor(1, 512).fill_(1).to(device)
dummy_input1 = torch.LongTensor(1, 512).fill_(1).to(device)
dummy_input2 = torch.LongTensor(1, 512).fill_(0).to(device)
dummy_input = (dummy_input0, dummy_input1, dummy_input2)

output_path = './bert/bert_{}.onnx'.format(args.size)
torch.onnx.export(model,
                  dummy_input,
                  output_path,
                  export_params=True,
                  opset_version=12,
                  do_constant_folding=True,
                  input_names=["input_ids", "input_mask", "segment_ids"],
                  output_names=["output"])
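# Hedged sketch of sanity-checking the exported graph with onnxruntime (assumed
# to be installed separately); the input names match the export call above.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(output_path, providers=["CPUExecutionProvider"])
ort_inputs = {
    "input_ids": np.ones((1, 512), dtype=np.int64),
    "input_mask": np.ones((1, 512), dtype=np.int64),
    "segment_ids": np.zeros((1, 512), dtype=np.int64),
}
ort_outputs = session.run(None, ort_inputs)
print(ort_outputs[0].shape)  # (1, 512, hidden_size)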
def train(config, bert_config, train_path, dev_path, rel2id, id2rel, tokenizer): if os.path.exists(config.output_dir) is False: os.makedirs(config.output_dir, exist_ok=True) if os.path.exists('./data/train_file.pkl'): train_data = pickle.load(open("./data/train_file.pkl", mode='rb')) else: train_data = data.load_data(train_path, tokenizer, rel2id, num_rels) pickle.dump(train_data, open("./data/train_file.pkl", mode='wb')) dev_data = json.load(open(dev_path)) for sent in dev_data: data.to_tuple(sent) data_manager = data.SPO(train_data) train_sampler = RandomSampler(data_manager) train_data_loader = DataLoader(data_manager, sampler=train_sampler, batch_size=config.batch_size, drop_last=True) num_train_steps = int( len(data_manager) / config.batch_size) * config.max_epoch if config.bert_pretrained_model is not None: logger.info('load bert weight') Bert_model = BertModel.from_pretrained(config.bert_pretrained_model, config=bert_config) else: logger.info('random initialize bert model') Bert_model = BertModel(config=bert_config).init_weights() Bert_model.to(device) submodel = sub_model(config).to(device) objmodel = obj_model(config).to(device) loss_fuc = nn.BCELoss(reduction='none') params = list(Bert_model.parameters()) + list( submodel.parameters()) + list(objmodel.parameters()) optimizer = AdamW(params, lr=config.lr) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(data_manager)) logger.info(" Num Epochs = %d", config.max_epoch) logger.info(" Total train batch size = %d", config.batch_size) logger.info(" Total optimization steps = %d", num_train_steps) logger.info(" Logging steps = %d", config.print_freq) logger.info(" Save steps = %d", config.save_freq) global_step = 0 Bert_model.train() submodel.train() objmodel.train() for _ in range(config.max_epoch): optimizer.zero_grad() epoch_itorator = tqdm(train_data_loader, disable=None) for step, batch in enumerate(epoch_itorator): batch = tuple(t.to(device) for t in batch) input_ids, segment_ids, input_masks, sub_positions, sub_heads, sub_tails, obj_heads, obj_tails = batch bert_output = Bert_model(input_ids, input_masks, segment_ids)[0] pred_sub_heads, pred_sub_tails = submodel( bert_output) # [batch_size, seq_len, 1] pred_obj_heads, pred_obj_tails = objmodel(bert_output, sub_positions) # 计算loss mask = input_masks.view(-1) # loss1 sub_heads = sub_heads.unsqueeze(-1) # [batch_szie, seq_len, 1] sub_tails = sub_tails.unsqueeze(-1) loss1_head = loss_fuc(pred_sub_heads, sub_heads).view(-1) loss1_head = torch.sum(loss1_head * mask) / torch.sum(mask) loss1_tail = loss_fuc(pred_sub_tails, sub_tails).view(-1) loss1_tail = torch.sum(loss1_tail * mask) / torch.sum(mask) loss1 = loss1_head + loss1_tail # loss2 loss2_head = loss_fuc(pred_obj_heads, obj_heads).view(-1, obj_heads.shape[-1]) loss2_head = torch.sum( loss2_head * mask.unsqueeze(-1)) / torch.sum(mask) loss2_tail = loss_fuc(pred_obj_tails, obj_tails).view(-1, obj_tails.shape[-1]) loss2_tail = torch.sum( loss2_tail * mask.unsqueeze(-1)) / torch.sum(mask) loss2 = loss2_head + loss2_tail # optimize loss = loss1 + loss2 loss.backward() optimizer.step() optimizer.zero_grad() global_step += 1 if (global_step + 1) % config.print_freq == 0: logger.info( "epoch : {} step: {} #### loss1: {} loss2: {}".format( _, global_step + 1, loss1.cpu().item(), loss2.cpu().item())) if (global_step + 1) % config.eval_freq == 0: logger.info("***** Running evaluating *****") with torch.no_grad(): Bert_model.eval() submodel.eval() objmodel.eval() P, R, F1 = utils.metric(Bert_model, 
submodel, objmodel, dev_data, id2rel, tokenizer) logger.info(f'precision:{P}\nrecall:{R}\nF1:{F1}') Bert_model.train() submodel.train() objmodel.train() if (global_step + 1) % config.save_freq == 0: # Save a trained model model_name = "pytorch_model_%d" % (global_step + 1) output_model_file = os.path.join(config.output_dir, model_name) state = { 'bert_state_dict': Bert_model.state_dict(), 'subject_state_dict': submodel.state_dict(), 'object_state_dict': objmodel.state_dict(), } torch.save(state, output_model_file) model_name = "pytorch_model_last" output_model_file = os.path.join(config.output_dir, model_name) state = { 'bert_state_dict': Bert_model.state_dict(), 'subject_state_dict': submodel.state_dict(), 'object_state_dict': objmodel.state_dict(), } torch.save(state, output_model_file)
class BertForQuestionAnsweringWithCRF(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = BertModel(config) self.hidden_size = self.bert.config.hidden_size self.CRF_fc1 = nn.Sequential( nn.Dropout(0.5), nn.Linear(self.hidden_size, config.num_labels + 2, bias=True), ) self.CRF = CRF(target_size=self.bert.config.num_labels, device=torch.device("cuda")) self.CrossEntropyLoss = nn.CrossEntropyLoss() self.fc2 = nn.Linear(self.hidden_size, 2, bias=True) def forward(self, tokens_id_l, token_type_ids_l, answer_offset_l, answer_seq_label_l, IsQA_l): ## 字符ID [batch_size, seq_length] tokens_x_2d = torch.LongTensor(tokens_id_l).to(self.device) token_type_ids_2d = torch.LongTensor(token_type_ids_l).to(self.device) # 计算sql_len 不包含[CLS] batch_size, seq_length = tokens_x_2d[:, 1:].size() ## CRF答案ID [batch_size, seq_length] y_2d = torch.LongTensor(answer_seq_label_l).to(self.device)[:, 1:] ## (batch_size,) y_IsQA_2d = torch.LongTensor(IsQA_l).to(self.device) if self.training: # self.training基层的外部类 self.bert.train() output = self.bert( input_ids=tokens_x_2d, token_type_ids=token_type_ids_2d, output_hidden_states=True, return_dict=True) #[batch_size, seq_len, hidden_size] else: self.bert.eval() with torch.no_grad(): output = self.bert(input_ids=tokens_x_2d, token_type_ids=token_type_ids_2d, output_hidden_states=True, return_dict=True) ## [CLS] for IsQA [batch_size, hidden_size] cls_emb = output.last_hidden_state[:, 0, :] IsQA_logits = self.fc2(cls_emb) ## [batch_size, 2] IsQA_loss = self.CrossEntropyLoss.forward(IsQA_logits, y_IsQA_2d) ## [batch_size, 1] IsQA_prediction = IsQA_logits.argmax(dim=-1).unsqueeze(dim=-1) # CRF mask mask = np.ones(shape=[batch_size, seq_length], dtype=np.uint8) mask = torch.ByteTensor(mask).to( self.device) # [batch_size, seq_len, 4] # No [CLS] crf_logits = self.CRF_fc1(output.last_hidden_state[:, 1:, :]) crf_loss = self.CRF.neg_log_likelihood_loss(feats=crf_logits, mask=mask, tags=y_2d) _, CRFprediction = self.CRF.forward(feats=crf_logits, mask=mask) return IsQA_prediction, CRFprediction, IsQA_loss, crf_loss, y_2d, y_IsQA_2d.unsqueeze( dim=-1) # (batch_size,) -> (batch_size, 1)