def create_and_check_model_as_decoder(
     self,
     config,
     input_ids,
     token_type_ids,
     input_mask,
     sequence_labels,
     token_labels,
     choice_labels,
     encoder_hidden_states,
     encoder_attention_mask,
 ):
     config.add_cross_attention = True
     model = BertModel(config)
     model.to(torch_device)
     model.eval()
     result = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         encoder_hidden_states=encoder_hidden_states,
         encoder_attention_mask=encoder_attention_mask,
     )
     result = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         encoder_hidden_states=encoder_hidden_states,
     )
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
     self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
     self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
        def create_and_check_bert_model_as_decoder(self, config, input_ids,
                                                   token_type_ids, input_mask,
                                                   sequence_labels,
                                                   token_labels, choice_labels,
                                                   encoder_hidden_states,
                                                   encoder_attention_mask):
            model = BertModel(config)
            model.eval()
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask)
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                encoder_hidden_states=encoder_hidden_states)
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()),
                                        [self.batch_size, self.hidden_size])
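
Both helpers above exercise BertModel in decoder mode, feeding it hidden states from an encoder. A rough, self-contained sketch of the same mechanics (tiny made-up config, random encoder states; assumes a transformers version whose outputs expose .last_hidden_state):

import torch
from transformers import BertConfig, BertModel

# tiny decoder-style BERT with cross-attention enabled
cfg = BertConfig(vocab_size=99, hidden_size=32, num_hidden_layers=2,
                 num_attention_heads=4, intermediate_size=37)
cfg.is_decoder = True
cfg.add_cross_attention = True
model = BertModel(cfg).eval()

input_ids = torch.randint(0, cfg.vocab_size, (2, 7))
encoder_hidden_states = torch.rand(2, 5, cfg.hidden_size)      # stand-in for a real encoder
encoder_attention_mask = torch.ones(2, 5, dtype=torch.long)
with torch.no_grad():
    out = model(input_ids,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask)
print(out.last_hidden_state.shape)   # torch.Size([2, 7, 32])
print(out.pooler_output.shape)       # torch.Size([2, 32])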
Example #3
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    device = torch.device(ctx)
    # map_location lets a GPU-trained checkpoint also load on CPU
    bertmodel.load_state_dict(torch.load(model_file, map_location=device), strict=False)
    bertmodel.to(device)
    bertmodel.eval()
    with open(vocab_file, 'rt') as f:
        vocab_b_obj = nlp.vocab.BERTVocab.from_json(f.read())
    return bertmodel, vocab_b_obj
Example #4
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    device = torch.device(ctx)
    # map_location lets a GPU-trained checkpoint also load on CPU
    bertmodel.load_state_dict(torch.load(model_file, map_location=device))
    bertmodel.to(device)
    bertmodel.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                         padding_token='[PAD]')
    return bertmodel, vocab_b_obj
        def create_and_check_bert_model(self, config, input_ids,
                                        token_type_ids, input_mask,
                                        sequence_labels, token_labels,
                                        choice_labels):
            model = BertModel(config=config)
            model.to(input_ids.device)
            model.eval()

            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids)

            # fails because there is no loss output
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc
            ], [self.last_hidden_state_desc, self.pooler_output_desc])
            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model,
                None,
                model_desc,
                "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription(
                    'Learning_Rate', [
                        1,
                    ], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=args_fp16,
                allreduce_post_accumulation=args_allreduce_post_accumulation)

            sequence_output, pooled_output = model(
                input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()),
                                        [self.batch_size, self.hidden_size])
Example #6
def main():

    # Env
    use_gpu = False
    # use_gpu = torch.cuda.is_available()

    # batchsize = [1,4,8,16,32,64,128,256]

    # Data: random token ids for a single 512-token sequence
    input_np = np.random.randint(1000, size=(1, 512))

    input_ids = torch.from_numpy(input_np).long()
    # attention_mask should be 1 for positions that are attended to
    attention_mask = torch.ones(1, 512).long()
    token_type_ids = torch.ones(1, 512).long()

    if use_gpu:
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()
        token_type_ids = token_type_ids.cuda()

    # Model Prepare
    configuration = BertConfig(vocab_size=30522, hidden_size=768, num_hidden_layers=12,
                               num_attention_heads=12, intermediate_size=3072,
                               hidden_act='gelu', hidden_dropout_prob=0.1,
                               attention_probs_dropout_prob=0.1,
                               max_position_embeddings=512, type_vocab_size=2,
                               initializer_range=0.02, layer_norm_eps=1e-12,
                               pad_token_id=0, gradient_checkpointing=False)
    
    if use_gpu:
        model = BertModel(configuration).cuda()
    else:
        model = BertModel(configuration)
    
# Eval with Speed Record
    
    model.eval()
    
    t1 = time.time()
    with torch.no_grad():
        for i in range(512):
            print(i)
            output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    print('val_time = {:.6f}'.format(time.time() - t1))
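
When use_gpu is True, CUDA kernels launch asynchronously, so wall-clock timing like the loop above should synchronize the device before reading the clock. A minimal sketch of that pattern (small made-up config so it also runs quickly on CPU):

import time
import torch
from transformers import BertConfig, BertModel

cfg = BertConfig(vocab_size=1000, hidden_size=64, num_hidden_layers=2,
                 num_attention_heads=4, intermediate_size=128,
                 max_position_embeddings=128)
model = BertModel(cfg).eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

ids = torch.randint(0, cfg.vocab_size, (1, 128), device=device)
mask = torch.ones(1, 128, dtype=torch.long, device=device)

with torch.no_grad():
    for _ in range(3):                       # warm-up iterations
        model(input_ids=ids, attention_mask=mask)
    if device.type == 'cuda':
        torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(10):
        model(input_ids=ids, attention_mask=mask)
    if device.type == 'cuda':
        torch.cuda.synchronize()             # wait for queued kernels before stopping the clock
print('mean latency = {:.6f}s'.format((time.time() - t0) / 10))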
 def create_and_check_bert_model(self, config, input_ids, token_type_ids,
                                 input_mask, sequence_labels, token_labels,
                                 choice_labels):
     model = BertModel(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids,
                    attention_mask=input_mask,
                    token_type_ids=token_type_ids)
     result = model(input_ids, token_type_ids=token_type_ids)
     result = model(input_ids)
     self.parent.assertEqual(
         result.last_hidden_state.shape,
         (self.batch_size, self.seq_length, self.hidden_size))
     self.parent.assertEqual(result.pooler_output.shape,
                             (self.batch_size, self.hidden_size))
Example #8
    def _load_bert(self, bert_config_path: str, bert_model_path: str):
        bert_config = BertConfig.from_json_file(bert_config_path)
        model = BertModel(bert_config)
        if self.cuda:
            model_states = torch.load(bert_model_path)
        else:
            model_states = torch.load(bert_model_path, map_location='cpu')
        # fix model_states: strip the "bert." prefix, drop the cls head, and
        # rename old-style LayerNorm parameters (gamma/beta -> weight/bias)
        for k in list(model_states.keys()):
            new_k = k
            if k.startswith("bert."):
                new_k = k[5:]
                model_states[new_k] = model_states.pop(k)
            elif k.startswith("cls"):
                _ = model_states.pop(k)
                continue

            if new_k.endswith("beta"):
                model_states[new_k[:-4] + "bias"] = model_states.pop(new_k)
            if new_k.endswith("gamma"):
                model_states[new_k[:-5] + "weight"] = model_states.pop(new_k)

        model.load_state_dict(model_states)
        if self.cuda:
            model.cuda()
        model.eval()
        return model
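
The key fix-up above converts an old-style BERT checkpoint (a "bert." prefix and LayerNorm parameters named gamma/beta) into the layout BertModel expects. The same renaming in isolation, on a toy state dict with made-up keys:

toy = {
    "bert.encoder.layer.0.attention.output.LayerNorm.gamma": 1,
    "bert.encoder.layer.0.attention.output.LayerNorm.beta": 2,
    "cls.predictions.bias": 3,
}
fixed = {}
for k, v in toy.items():
    if k.startswith("cls"):
        continue                        # drop the pre-training head
    if k.startswith("bert."):
        k = k[5:]                       # strip the "bert." prefix
    if k.endswith("gamma"):
        k = k[:-5] + "weight"           # old name for the LayerNorm scale
    if k.endswith("beta"):
        k = k[:-4] + "bias"             # old name for the LayerNorm bias
    fixed[k] = v
print(fixed)
# {'encoder.layer.0.attention.output.LayerNorm.weight': 1,
#  'encoder.layer.0.attention.output.LayerNorm.bias': 2}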
Example #9
def main(args, _=None):
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)

    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=get_features,
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    poolings = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            features_ = model(**batch)

            # create storage based on network output
            if idx == 0:
                # class
                _, embedding_size = features_[1].shape
                features["class"] = np.memmap(
                    f"{args.out_prefix}.class.npy",
                    dtype=np.float32,
                    mode="w+",
                    shape=(num_samples, embedding_size),
                )
                if args.output_hidden_states:
                    # all embeddings
                    for i, feature_ in enumerate(features_[2]):
                        name_ = f"embeddings_{i + 1:02d}"
                        _, _, embedding_size = feature_.shape
                        poolings[name_] = LamaPooling(
                            features_in=embedding_size,
                            groups=pooling_groups,
                        )
                        features[name_] = np.memmap(
                            f"{args.out_prefix}.{name_}.npy",
                            dtype=np.float32,
                            mode="w+",
                            shape=(num_samples, embedding_size),
                        )
                else:
                    # last
                    _, _, embedding_size = features_[0].shape
                    poolings["last"] = LamaPooling(
                        features_in=embedding_size,
                        groups=pooling_groups,
                    )
                    features["last"] = np.memmap(
                        f"{args.out_prefix}.last.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            features["class"][indices] = _detach(features_[1])
            if args.output_hidden_states:
                # all embeddings
                for i, feature_ in enumerate(features_[2]):
                    name_ = f"embeddings_{i + 1:02d}"
                    feature_ = poolings[name_](feature_)
                    features[name_][indices] = _detach(feature_)
            else:
                feature_ = poolings["last"](features_[0])
                features["last"][indices] = _detach(feature_)
Example #10
def main(args, _=None):
    """Run the ``catalyst-contrib text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")
    bert_level = args.bert_level

    if bert_level is not None:
        assert (args.output_hidden_states
                ), "You need hidden states output for level specification"

    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    if getattr(args, "in_huggingface", False):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface,
                                          config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if getattr(args, "in_model", None) is not None:
        checkpoint = load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch_input in enumerate(dataloader):
            batch_input = any2device(batch_input, device)
            batch_output = model(**batch_input)
            mask = (batch_input["attention_mask"].unsqueeze(-1)
                    if args.mask_for_max_length else None)

            if check_ddp_wrapped(model):
                # several GPUs: DDP-wrapped model, config lives under model.module
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states

            else:
                # CPU or a single GPU
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            batch_features = process_bert_output(
                bert_output=batch_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for layer_name, layer_value in batch_features.items():
                    if bert_level is not None and bert_level != layer_name:
                        continue
                    layer_name = (layer_name if isinstance(layer_name, str)
                                  else f"{layer_name:02d}")
                    _, embedding_size = layer_value.shape
                    features[layer_name] = np.memmap(
                        f"{args.out_prefix}.{layer_name}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            for layer_name2, layer_value2 in batch_features.items():
                if bert_level is not None and bert_level != layer_name2:
                    continue
                layer_name2 = (layer_name2 if isinstance(layer_name2, str) else
                               f"{layer_name2:02d}")
                features[layer_name2][indices] = _detach(layer_value2)

    if args.force_save:
        for key, mmap in features.items():
            mmap.flush()
            np.save(f"{args.out_prefix}.{key}.force.npy",
                    mmap,
                    allow_pickle=False)
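
process_bert_output is not shown here, but the mask passed via mask_for_max_length is used for the usual attention-mask-aware mean pooling. A standalone sketch of that idea (not the exact catalyst implementation):

import torch

token_embeddings = torch.rand(2, 6, 8)             # (batch, seq_len, hidden_size)
attention_mask = torch.tensor([[1, 1, 1, 0, 0, 0],
                               [1, 1, 1, 1, 1, 0]])
mask = attention_mask.unsqueeze(-1).float()        # (batch, seq_len, 1)
summed = (token_embeddings * mask).sum(dim=1)
counts = mask.sum(dim=1).clamp(min=1e-9)           # number of real tokens per sample
mean_pooled = summed / counts                      # (batch, hidden_size)
print(mean_pooled.shape)                           # torch.Size([2, 8])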
Example #11
class JointBERT(BertPreTrainedModel):
    def __init__(self,
                 bert_config,
                 model_config,
                 device,
                 slot_dim,
                 intent_dim,
                 intent_weight=None):
        super(JointBERT, self).__init__(bert_config)
        self.slot_num_labels = slot_dim
        self.intent_num_labels = intent_dim
        self.device = device
        self.intent_weight = intent_weight if intent_weight is not None else torch.tensor(
            [1.] * intent_dim)

        self.bert = BertModel(bert_config)
        self.dropout = nn.Dropout(model_config['dropout'])
        self.context = model_config['context']
        self.finetune = model_config['finetune']
        self.context_grad = model_config['context_grad']
        if self.context:
            self.intent_classifier = nn.Linear(2 * bert_config.hidden_size,
                                               self.intent_num_labels)
            self.slot_classifier = nn.Linear(2 * bert_config.hidden_size,
                                             self.slot_num_labels)
            self.intent_hidden = nn.Linear(2 * bert_config.hidden_size,
                                           2 * bert_config.hidden_size)
            self.slot_hidden = nn.Linear(2 * bert_config.hidden_size,
                                         2 * bert_config.hidden_size)
        else:
            self.intent_classifier = nn.Linear(bert_config.hidden_size,
                                               self.intent_num_labels)
            self.slot_classifier = nn.Linear(bert_config.hidden_size,
                                             self.slot_num_labels)
            self.intent_hidden = nn.Linear(bert_config.hidden_size,
                                           bert_config.hidden_size)
            self.slot_hidden = nn.Linear(bert_config.hidden_size,
                                         bert_config.hidden_size)
        self.intent_loss_fct = torch.nn.BCEWithLogitsLoss(
            pos_weight=self.intent_weight)
        self.slot_loss_fct = torch.nn.CrossEntropyLoss()

        self.init_weights()

    def forward(self,
                word_seq_tensor,
                word_mask_tensor,
                tag_seq_tensor=None,
                tag_mask_tensor=None,
                intent_tensor=None,
                context_seq_tensor=None,
                context_mask_tensor=None):
        if not self.finetune:
            self.bert.eval()
            with torch.no_grad():
                outputs = self.bert(input_ids=word_seq_tensor,
                                    attention_mask=word_mask_tensor)
        else:
            outputs = self.bert(input_ids=word_seq_tensor,
                                attention_mask=word_mask_tensor)

        sequence_output = outputs[0]
        pooled_output = outputs[1]

        if self.context and context_seq_tensor is not None:
            if not self.finetune or not self.context_grad:
                with torch.no_grad():
                    context_output = self.bert(
                        input_ids=context_seq_tensor,
                        attention_mask=context_mask_tensor)[1]
            else:
                context_output = self.bert(
                    input_ids=context_seq_tensor,
                    attention_mask=context_mask_tensor)[1]
            sequence_output = torch.cat([
                context_output.unsqueeze(1).repeat(1, sequence_output.size(1),
                                                   1), sequence_output
            ],
                                        dim=-1)
            pooled_output = torch.cat([context_output, pooled_output], dim=-1)

        sequence_output = nn.functional.relu(
            self.dropout(self.slot_hidden(sequence_output)))
        pooled_output = nn.functional.relu(
            self.dropout(self.intent_hidden(pooled_output)))

        sequence_output = self.dropout(sequence_output)
        slot_logits = self.slot_classifier(sequence_output)
        outputs = (slot_logits, )

        pooled_output = self.dropout(pooled_output)
        intent_logits = self.intent_classifier(pooled_output)
        outputs = outputs + (intent_logits, )

        if tag_seq_tensor is not None:
            active_tag_loss = tag_mask_tensor.view(-1) == 1
            active_tag_logits = slot_logits.view(
                -1, self.slot_num_labels)[active_tag_loss]
            active_tag_labels = tag_seq_tensor.view(-1)[active_tag_loss]
            slot_loss = self.slot_loss_fct(active_tag_logits,
                                           active_tag_labels)

            outputs = outputs + (slot_loss, )

        if intent_tensor is not None:
            intent_loss = self.intent_loss_fct(intent_logits, intent_tensor)
            outputs = outputs + (intent_loss, )

        return outputs  # slot_logits, intent_logits, (slot_loss), (intent_loss),
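
The slot loss above is computed only on positions where tag_mask_tensor is 1, so padding and special tokens do not contribute. The masked-selection pattern in isolation (toy tensors):

import torch

num_labels = 4
slot_logits = torch.rand(2, 5, num_labels)          # (batch, seq_len, num_labels)
tag_seq = torch.randint(0, num_labels, (2, 5))
tag_mask = torch.tensor([[1, 1, 1, 0, 0],
                         [1, 1, 1, 1, 0]])

active = tag_mask.view(-1) == 1                     # keep only real token positions
loss = torch.nn.CrossEntropyLoss()(slot_logits.view(-1, num_labels)[active],
                                   tag_seq.view(-1)[active])
print(loss.item())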
Example #12
def main():
    parser = argparse.ArgumentParser(
        description='Train the individual Transformer model')
    parser.add_argument('--dataset_folder', type=str, default='datasets')
    parser.add_argument('--dataset_name', type=str, default='zara1')
    parser.add_argument('--obs', type=int, default=8)
    parser.add_argument('--preds', type=int, default=12)
    parser.add_argument('--emb_size', type=int, default=1024)
    parser.add_argument('--heads', type=int, default=8)
    parser.add_argument('--layers', type=int, default=6)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--cpu', action='store_true')
    parser.add_argument('--output_folder', type=str, default='Output')
    parser.add_argument('--val_size', type=int, default=50)
    parser.add_argument('--gpu_device', type=str, default="0")
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--max_epoch', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--validation_epoch_start', type=int, default=30)
    parser.add_argument('--resume_train', action='store_true')
    parser.add_argument('--delim', type=str, default='\t')
    parser.add_argument('--name', type=str, default="zara1")

    args = parser.parse_args()
    model_name = args.name

    # create the output/model directories if they do not already exist
    for path in ('models', 'output', 'output/BERT', 'models/BERT',
                 f'output/BERT/{args.name}', f'models/BERT/{args.name}'):
        os.makedirs(path, exist_ok=True)

    log = SummaryWriter('logs/BERT_%s' % model_name)

    log.add_scalar('eval/mad', 0, 0)
    log.add_scalar('eval/fad', 0, 0)

    os.makedirs(args.name, exist_ok=True)

    device = torch.device("cuda")
    if args.cpu or not torch.cuda.is_available():
        device = torch.device("cpu")

    args.verbose = True

    ## creation of the dataloaders for train and validation
    train_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                    args.dataset_name,
                                                    0,
                                                    args.obs,
                                                    args.preds,
                                                    delim=args.delim,
                                                    train=True,
                                                    verbose=args.verbose)
    val_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                  args.dataset_name,
                                                  0,
                                                  args.obs,
                                                  args.preds,
                                                  delim=args.delim,
                                                  train=False,
                                                  verbose=args.verbose)
    test_dataset, _ = baselineUtils.create_dataset(args.dataset_folder,
                                                   args.dataset_name,
                                                   0,
                                                   args.obs,
                                                   args.preds,
                                                   delim=args.delim,
                                                   train=False,
                                                   eval=True,
                                                   verbose=args.verbose)

    from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, AdamW

    config = BertConfig(vocab_size=30522,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072,
                        hidden_act='relu',
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
                        max_position_embeddings=512,
                        type_vocab_size=2,
                        initializer_range=0.02,
                        layer_norm_eps=1e-12)
    model = BertModel(config).to(device)

    from individual_TF import LinearEmbedding as NewEmbed, Generator as GeneratorTS
    a = NewEmbed(3, 768).to(device)
    model.set_input_embeddings(a)
    generator = GeneratorTS(768, 2).to(device)
    #model.set_output_embeddings(GeneratorTS(1024,2))

    tr_dl = torch.utils.data.DataLoader(train_dataset,
                                        batch_size=args.batch_size,
                                        shuffle=True,
                                        num_workers=0)
    val_dl = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=0)
    test_dl = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          num_workers=0)

    #optim = SGD(list(a.parameters())+list(model.parameters())+list(generator.parameters()),lr=0.01)
    #sched=torch.optim.lr_scheduler.StepLR(optim,0.0005)
    optim = NoamOpt(
        768, 0.1, len(tr_dl),
        torch.optim.Adam(list(a.parameters()) + list(model.parameters()) +
                         list(generator.parameters()),
                         lr=0,
                         betas=(0.9, 0.98),
                         eps=1e-9))
    #optim=Adagrad(list(a.parameters())+list(model.parameters())+list(generator.parameters()),lr=0.01,lr_decay=0.001)
    epoch = 0

    # normalization disabled: mean forced to 0 and std forced to 1, so inputs stay unscaled
    mean = train_dataset[:]['src'][:, :, 2:4].mean((0, 1)) * 0
    std = train_dataset[:]['src'][:, :, 2:4].std((0, 1)) * 0 + 1

    while epoch < args.max_epoch:
        epoch_loss = 0
        model.train()

        for id_b, batch in enumerate(tr_dl):

            optim.optimizer.zero_grad()
            r = 0
            rot_mat = np.array([[np.cos(r), np.sin(r)],
                                [-np.sin(r), np.cos(r)]])

            inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
            inp = torch.matmul(inp,
                               torch.from_numpy(rot_mat).float().to(device))
            trg_masked = torch.zeros((inp.shape[0], args.preds, 2)).to(device)
            inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
            trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                  1).to(device)
            inp_cat = torch.cat((inp, trg_masked), 1)
            cls_cat = torch.cat((inp_cls, trg_cls), 1)
            net_input = torch.cat((inp_cat, cls_cat), 2)

            position = torch.arange(0, net_input.shape[1]).repeat(
                inp.shape[0], 1).long().to(device)
            token = torch.zeros(
                (inp.shape[0], net_input.shape[1])).long().to(device)
            attention_mask = torch.ones(
                (inp.shape[0], net_input.shape[1])).long().to(device)

            out = model(input_ids=net_input,
                        position_ids=position,
                        token_type_ids=token,
                        attention_mask=attention_mask)

            pred = generator(out[0])

            loss = F.pairwise_distance(
                pred[:, :].contiguous().view(-1, 2),
                torch.matmul(
                    torch.cat(
                        (batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]),
                        1).contiguous().view(-1, 2).to(device),
                    torch.from_numpy(rot_mat).float().to(device))).mean()
            loss.backward()
            optim.step()
            print("epoch %03i/%03i  frame %04i / %04i loss: %7.4f" %
                  (epoch, args.max_epoch, id_b, len(tr_dl), loss.item()))
            epoch_loss += loss.item()
        #sched.step()
        log.add_scalar('Loss/train', epoch_loss / len(tr_dl), epoch)
        with torch.no_grad():
            model.eval()

            gt = []
            pr = []
            val_loss = 0
            for batch in val_dl:
                inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
                trg_masked = torch.zeros(
                    (inp.shape[0], args.preds, 2)).to(device)
                inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
                trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                      1).to(device)
                inp_cat = torch.cat((inp, trg_masked), 1)
                cls_cat = torch.cat((inp_cls, trg_cls), 1)
                net_input = torch.cat((inp_cat, cls_cat), 2)

                position = torch.arange(0, net_input.shape[1]).repeat(
                    inp.shape[0], 1).long().to(device)
                token = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)
                attention_mask = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)

                out = model(input_ids=net_input,
                            position_ids=position,
                            token_type_ids=token,
                            attention_mask=attention_mask)

                pred = generator(out[0])

                loss = F.pairwise_distance(
                    pred[:, :].contiguous().view(-1, 2),
                    torch.cat(
                        (batch['src'][:, :, 2:4], batch['trg'][:, :, 2:4]),
                        1).contiguous().view(-1, 2).to(device)).mean()
                val_loss += loss.item()

                gt_b = batch['trg'][:, :, 0:2]
                preds_tr_b = pred[:, args.obs:].cumsum(1).to(
                    'cpu').detach() + batch['src'][:, -1:, 0:2]
                gt.append(gt_b)
                pr.append(preds_tr_b)

            gt = np.concatenate(gt, 0)
            pr = np.concatenate(pr, 0)
            mad, fad, errs = baselineUtils.distance_metrics(gt, pr)
            log.add_scalar('validation/loss', val_loss / len(val_dl), epoch)
            log.add_scalar('validation/mad', mad, epoch)
            log.add_scalar('validation/fad', fad, epoch)

            model.eval()

            gt = []
            pr = []
            for batch in test_dl:
                inp = ((batch['src'][:, :, 2:4] - mean) / std).to(device)
                trg_masked = torch.zeros(
                    (inp.shape[0], args.preds, 2)).to(device)
                inp_cls = torch.ones(inp.shape[0], inp.shape[1], 1).to(device)
                trg_cls = torch.zeros(trg_masked.shape[0], trg_masked.shape[1],
                                      1).to(device)
                inp_cat = torch.cat((inp, trg_masked), 1)
                cls_cat = torch.cat((inp_cls, trg_cls), 1)
                net_input = torch.cat((inp_cat, cls_cat), 2)

                position = torch.arange(0, net_input.shape[1]).repeat(
                    inp.shape[0], 1).long().to(device)
                token = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)
                attention_mask = torch.zeros(
                    (inp.shape[0], net_input.shape[1])).long().to(device)

                out = model(input_ids=net_input,
                            position_ids=position,
                            token_type_ids=token,
                            attention_mask=attention_mask)

                pred = generator(out[0])

                gt_b = batch['trg'][:, :, 0:2]
                preds_tr_b = pred[:, args.obs:].cumsum(1).to(
                    'cpu').detach() + batch['src'][:, -1:, 0:2]
                gt.append(gt_b)
                pr.append(preds_tr_b)

            gt = np.concatenate(gt, 0)
            pr = np.concatenate(pr, 0)
            mad, fad, errs = baselineUtils.distance_metrics(gt, pr)

            torch.save(model.state_dict(),
                       "models/BERT/%s/ep_%03i.pth" % (args.name, epoch))
            torch.save(generator.state_dict(),
                       "models/BERT/%s/gen_%03i.pth" % (args.name, epoch))
            torch.save(a.state_dict(),
                       "models/BERT/%s/emb_%03i.pth" % (args.name, epoch))

            log.add_scalar('eval/mad', mad, epoch)
            log.add_scalar('eval/fad', fad, epoch)

        epoch += 1

    ab = 1
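
The model predicts per-step displacements; preds_tr_b above turns them back into absolute positions by cumulatively summing from the last observed point. The same reconstruction in isolation (made-up numbers):

import torch

last_obs_pos = torch.tensor([[[2.0, 3.0]]])         # (batch, 1, 2): last observed position
pred_displacements = torch.tensor([[[1.0, 0.0],
                                    [0.0, 1.0],
                                    [1.0, 1.0]]])   # (batch, preds, 2)
abs_positions = pred_displacements.cumsum(dim=1) + last_obs_pos
print(abs_positions)
# tensor([[[3., 3.], [3., 4.], [4., 5.]]])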
class UnStructuredModel:

    def __init__(self, model_name, max_length, stride):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.max_length = max_length
        self.stride = stride
        if model_name == 'bert-base-uncased':
            configuration = BertConfig()
            self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
            # from_pretrained is a classmethod; calling it on a freshly built instance would throw that instance away
            self.model = BertModel.from_pretrained(self.model_name, config=configuration)
            self.model.to(device)
            self.model.eval()
            for param in self.model.parameters():
                param.requires_grad = False
            #self.model.bert.embeddings.requires_grad = False


    def padTokens(self, tokens):
        if len(tokens)<self.max_length:
            tokens = tokens + ["[PAD]" for i in range(self.max_length - len(tokens))]
        return tokens

    def getEmbedding(self, text, if_pool=True, pooling_type="mean", batchsize = 1):
        tokens = self.tokenizer.tokenize(text)
        tokenized_array = self.tokenizeText(tokens)
        embeddingTensorsList = []
        print(len(tokenized_array))
        tensor = torch.zeros([1, 768], device=device)
        count = 0
        if len(tokenized_array)>batchsize:
            for i in range(0, len(tokenized_array), batchsize):
                current_tokens = tokenized_array[i:min(i+batchsize,len(tokenized_array))]
                token_ids = torch.tensor(current_tokens).to(device)
                seg_ids = [[0 for _ in range(len(tokenized_array[0]))] for _ in range(len(current_tokens))]
                seg_ids = torch.tensor(seg_ids).to(device)
                hidden_reps, cls_head = self.model(token_ids, token_type_ids=seg_ids)
                cls_head = cls_head.to(device)
                cls_head = cls_head.detach()
                if if_pool and pooling_type=="mean":
                    tensor = tensor.add(torch.sum(cls_head, dim=0))
                    count +=cls_head.shape[0]
                else:
                    embeddingTensorsList.append(cls_head)
                del cls_head, hidden_reps
            if if_pool and pooling_type=="mean" and count>0:
                embedding = torch.div(tensor, count)
            elif not if_pool:
                embedding = torch.cat(embeddingTensorsList, dim=0)
            else:
                raise NotImplementedError()

        else:
            token_ids = torch.tensor(tokenized_array).to(device)
            seg_ids = [[0 for _ in range(len(tokenized_array[0]))] for _ in range(len(tokenized_array))]
            seg_ids = torch.tensor(seg_ids).to(device)
            hidden_reps, cls_head = self.model(token_ids, token_type_ids=seg_ids)
            cls_head = cls_head.to(device)
            cls_head.requires_grad = False
            if if_pool and pooling_type=="mean":
                embedding = torch.div(torch.sum(cls_head, dim=0), cls_head.shape[0])
            elif not if_pool:
                embedding = cls_head
            else:
                raise NotImplementedError()
            del cls_head, hidden_reps
        return embedding

    def tokenizeText(self, tokens):
        tokens_array = []
        #window_movement_tokens =  max_length - stride
        for i in range(0, len(tokens), self.stride):
            if i+self.max_length<len(tokens):
                curr_tokens = ["[CLS]"] + tokens[i:i+self.max_length] + ["[SEP]"]
            else:
                padded_tokens = self.padTokens(tokens[i:i+self.max_length])
                curr_tokens = ["[CLS]"] + padded_tokens + ["[SEP]"]
            curr_tokens = self.tokenizer.convert_tokens_to_ids(curr_tokens)
            tokens_array.append(curr_tokens)
        return tokens_array
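
tokenizeText chops a long input into overlapping windows of at most max_length tokens (plus [CLS] and [SEP]), stepping by stride. The windowing logic by itself, on toy tokens with no tokenizer involved:

tokens = [f"tok{i}" for i in range(12)]
max_length, stride = 5, 3
windows = []
for i in range(0, len(tokens), stride):
    chunk = tokens[i:i + max_length]
    chunk = chunk + ["[PAD]"] * (max_length - len(chunk))   # pad the shorter trailing windows
    windows.append(["[CLS]"] + chunk + ["[SEP]"])
print(len(windows), len(windows[0]))   # 4 windows, each 7 tokens long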
Example #14
def main(config_path):
    config = Box.from_yaml(config_path.open())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    if not save_dir.exists():
        save_dir.mkdir()
    transfo_dir = Path(config.data.transfo_dir)
    device = create_device(config.train.device)

    tokenizer = BertTokenizer.from_pretrained(
        str(transfo_dir), do_lower_case=(not config.data.cased))

    global CLS
    global SEP
    global PAD
    CLS, SEP, PAD = tokenizer.convert_tokens_to_ids(
        ["[CLS]", "[SEP]", "[PAD]"])

    bert_config = BertConfig.from_pretrained(str(transfo_dir))
    # To extract representations from other layers
    bert_config.output_hidden_states = True
    model = BertModel(bert_config)
    model.to(device)
    model.eval()

    train_file = data_dir / "schema_dstc8+m2.2.json"
    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    train_desc_file = save_dir / "train_schema_desc.pkl"
    valid_file = data_dir / "dev" / "schema.json"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"
    valid_desc_file = save_dir / "valid_schema_desc.pkl"
    if (data_dir / "test").exists():
        test_file = data_dir / "test" / "schema.json"
        test_vocab_file = save_dir / "test_schema_vocab.pkl"
        test_embed_file = save_dir / "test_schema_embed.pkl"
        test_desc_file = save_dir / "test_schema_desc.pkl"
    else:
        test_file = None
        test_vocab_file = None
        test_embed_file = None
        test_desc_file = None

    train_schema_vocab, train_desc = extract(train_file,
                                             config.data.concat_name)
    valid_schema_vocab, valid_desc = extract(valid_file,
                                             config.data.concat_name)
    if test_file is not None:
        test_schema_vocab, test_desc = extract(test_file,
                                               config.data.concat_name)
    else:
        test_schema_vocab = test_desc = None

    pickle.dump(train_schema_vocab, open(train_vocab_file, "wb"))
    pickle.dump(valid_schema_vocab, open(valid_vocab_file, "wb"))
    if test_schema_vocab is not None:
        pickle.dump(test_schema_vocab, open(test_vocab_file, "wb"))

    layer = config.data.schema.layer
    pooling = config.data.schema.pooling

    train_embed = []
    for desc in tqdm(train_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        train_embed.append(embed)

    train_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                   for sent in desc] for desc in train_desc]

    pickle.dump(train_embed, open(train_embed_file, "wb"))
    pickle.dump(train_desc, open(train_desc_file, "wb"))

    valid_embed = []
    for desc in tqdm(valid_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        valid_embed.append(embed)

    valid_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                   for sent in desc] for desc in valid_desc]

    pickle.dump(valid_embed, open(valid_embed_file, "wb"))
    pickle.dump(valid_desc, open(valid_desc_file, "wb"))

    if test_desc is None:
        exit()

    test_embed = []
    for desc in tqdm(test_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        test_embed.append(embed)

    test_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                  for sent in desc] for desc in test_desc]

    pickle.dump(test_embed, open(test_embed_file, "wb"))
    pickle.dump(test_desc, open(test_desc_file, "wb"))
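
get_rep is defined elsewhere; the mechanism it relies on is that output_hidden_states=True exposes every layer's hidden states, so a sentence representation can be taken from any chosen layer and pooled. A tiny self-contained sketch of that (random weights, hypothetical layer choice):

import torch
from transformers import BertConfig, BertModel

cfg = BertConfig(vocab_size=60, hidden_size=16, num_hidden_layers=3,
                 num_attention_heads=2, intermediate_size=20,
                 output_hidden_states=True)
model = BertModel(cfg).eval()

sentences = [torch.randint(0, cfg.vocab_size, (1, n)) for n in (4, 6, 5)]
embed = []
with torch.no_grad():
    for ids in sentences:
        hidden_states = model(ids).hidden_states   # tuple: embedding output plus one entry per layer
        embed.append(hidden_states[-2][0, 0])      # second-to-last layer, [CLS] position
embed = torch.stack(embed)                         # (num_sentences, hidden_size)
print(embed.shape)                                 # torch.Size([3, 16])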
Example #15
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
BertModel = BertModel.from_pretrained('bert-base-uncased').to(device)
BertModel.eval()


class Decoder(nn.Module):

    def __init__(self, vocab_size, use_glove, use_bert, glove_vectors, vocab):
        super(Decoder, self).__init__()
        self.vocab = vocab
        self.encoder_dim = 2048
        self.attention_dim = 512
        self.use_bert = use_bert

        if use_glove:
            self.embed_dim = 200
        elif use_bert:
            self.embed_dim = 768
        else:
            self.embed_dim = 512

        self.decoder_dim = 512
        self.vocab_size = vocab_size
        self.dropout = 0.5
Example #16
class NERPredict(IPredict):
    '''
    Constructor: initializes the predictor.
    use_gpu: whether to use the GPU
    bert_config_file_name: path to the BERT model config file
    vocab_file_name: path to the vocabulary file
    tags_file_name: path to the tag list file
    bert_model_path: path to load the BERT model from
    lstm_crf_model_path: path to load the LSTM-CRF model from
    hidden_dim: hidden size of the LSTM-CRF
    '''
    def __init__(self, use_gpu, bert_config_file_name, vocab_file_name,
                 tags_file_name, bert_model_path, lstm_crf_model_path,
                 hidden_dim):
        self.use_gpu = use_gpu
        self.data_manager_init(vocab_file_name, tags_file_name)
        self.tokenizer = BertTokenizer.from_pretrained(vocab_file_name)
        self.model_init(hidden_dim, bert_config_file_name, bert_model_path,
                        lstm_crf_model_path)

    def data_manager_init(self, vocab_file_name, tags_file_name):
        tags_list = BERTDataManager.ReadTagsList(tags_file_name)
        tags_list = [tags_list]
        self.dm = BERTDataManager(tags_list=tags_list,
                                  vocab_file_name=vocab_file_name)

    def model_init(self, hidden_dim, bert_config_file_name, bert_model_path,
                   lstm_crf_model_path):
        config = BertConfig.from_json_file(bert_config_file_name)

        self.model = BertModel(config)

        bert_dict = torch.load(bert_model_path).module.state_dict()

        self.model.load_state_dict(bert_dict)
        self.birnncrf = torch.load(lstm_crf_model_path)

        self.model.eval()
        self.birnncrf.eval()

    def data_process(self, sentences):
        result = []
        pad_tag = '[PAD]'
        if isinstance(sentences, str):
            sentences = [sentences]
        max_len = 0
        for sentence in sentences:
            encode = self.tokenizer.encode(sentence, add_special_tokens=True)
            result.append(encode)
            if max_len < len(encode):
                max_len = len(encode)

        for i, sentence in enumerate(result):
            remain = max_len - len(sentence)
            for _ in range(remain):
                result[i].append(self.dm.wordToIdx(pad_tag))
        return torch.tensor(result)

    def pred(self, sentences):
        sentences = self.data_process(sentences)

        if torch.cuda.is_available() and self.use_gpu:
            self.model.cuda()
            self.birnncrf.cuda()
            sentences = sentences.cuda()

        outputs = self.model(input_ids=sentences,
                             attention_mask=sentences.gt(0))
        hidden_states = outputs[0]
        scores, tags = self.birnncrf(hidden_states, sentences.gt(0))
        final_tags = []
        decode_sentences = []

        for item in tags:
            final_tags.append([self.dm.idx_to_tag[tag] for tag in item])

        for item in sentences.tolist():
            decode_sentences.append(self.tokenizer.decode(item))

        return (scores, tags, final_tags, decode_sentences)

    def __call__(self, sentences):
        return self.pred(sentences)
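
data_process right-pads every encoded sentence to the longest one in the batch, and pred then recovers the attention mask with sentences.gt(0). The padding step on toy id sequences (the real code pads with the vocabulary's [PAD] id, assumed to be 0 here):

import torch

encoded = [[101, 7, 8, 102], [101, 9, 102]]   # toy token-id sequences of unequal length
pad_id = 0
max_len = max(len(e) for e in encoded)
padded = [e + [pad_id] * (max_len - len(e)) for e in encoded]
batch = torch.tensor(padded)
attention_mask = batch.gt(0)                  # True for real tokens, False for padding
print(batch)
print(attention_mask)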
Example #17
def main(args, _=None):
    """Run the ``catalyst-data text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    if hasattr(args, "in_huggingface"):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface,
                                          config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if hasattr(args, "in_model"):
        checkpoint = utils.load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)
            mask = (batch["attention_mask"].unsqueeze(-1)
                    if args.mask_for_max_length else None)

            if utils.check_ddp_wrapped(model):
                # several GPUs: DDP-wrapped model, config lives under model.module
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states

            else:
                # CPU or a single GPU
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            features_ = process_bert_output(
                bert_output=bert_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for key, value in features_.items():
                    name_ = key if isinstance(key, str) else f"{key:02d}"
                    _, embedding_size = value.shape
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            for key, value in features_.items():
                name_ = key if isinstance(key, str) else f"{key:02d}"
                features[name_][indices] = _detach(value)
Example #18
class SentenceBert(BertPreTrainedModel):
    def __init__(self, config, max_len, tokenizer, device, task_type):
        super(SentenceBert, self).__init__(config)
        self.max_len = max_len
        self.task_type = task_type
        self._target_device = device
        self.tokenizer = tokenizer
        self.bert = BertModel(config=config)
        self.classifier = nn.Linear(3 * config.hidden_size, config.num_labels)

    def forward(self, inputs):
        input_a = inputs[0]
        input_b = inputs[1]
        output_a = self.bert(**input_a,
                             return_dict=True,
                             output_hidden_states=True)
        output_b = self.bert(**input_b,
                             return_dict=True,
                             output_hidden_states=True)
        # use the last hidden layer
        embedding_a = output_a.hidden_states[-1]
        embedding_b = output_b.hidden_states[-1]
        embedding_a = self.pooling(embedding_a, input_a)
        embedding_b = self.pooling(embedding_b, input_b)

        if self.task_type == "classification":
            embedding_abs = torch.abs(embedding_a - embedding_b)
            vectors_concat = []
            vectors_concat.append(embedding_a)
            vectors_concat.append(embedding_b)
            vectors_concat.append(embedding_abs)
            # concatenate the three 768-dim vectors along dim 1 -> 3*768
            features = torch.cat(vectors_concat, 1)
            output = self.classifier(features)
        else:
            # per-pair cosine similarity between the two sentence embeddings
            d = torch.mul(embedding_a, embedding_b)
            a_len = torch.norm(embedding_a, dim=1)
            b_len = torch.norm(embedding_b, dim=1)
            cos = torch.sum(d, dim=1) / (a_len * b_len)
            output = cos
        return output

    def pooling(self, token_embeddings, input):
        output_vectors = []
        # attention_mask
        attention_mask = input['attention_mask']
        # [B, L] -> [B, L, 1] -> [B, L, 768]; values are 0 or 1
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(
            token_embeddings.size()).float()
        # element-wise multiplication: padding positions are zeroed out, [B, L, 768]
        t = token_embeddings * input_mask_expanded
        # [B, 768]
        sum_embeddings = torch.sum(t, 1)

        # [B, 768]; each entry is at most seq_len
        sum_mask = input_mask_expanded.sum(1)
        # clamp each element to at least 1e-9 so the denominator is never zero
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        # masked mean: element-wise division gives the final embedding
        output_vectors.append(sum_embeddings / sum_mask)

        # concatenate along dim 1
        output_vector = torch.cat(output_vectors, 1)

        return output_vector

    def encoding(self, inputs):
        self.bert.eval()
        with torch.no_grad():
            output = self.bert(**inputs,
                               return_dict=True,
                               output_hidden_states=True)
            embedding = output.hidden_states[-1]
            embedding = self.pooling(embedding, inputs)
        return embedding

    # def class_infer(self,texts,batch_size=64):
    #     """
    #     Infer whether the first text in the list belongs to the same class as each of the remaining texts; the list can be arbitrarily long.
    #     :param texts:
    #     :param tokenizer:
    #     :param batch_size:
    #     :return:
    #     """
    #     result = []
    #     text_a = texts[0]
    #     input_id_a, attention_mask_a = self.convert_text2ids(text_a)
    #     input_ids_a = []
    #     attention_masks_a = []
    #     input_ids_b = []
    #     attention_masks_b = []
    #     for text in texts[1:]:
    #         input_id_b, attention_mask_b = self.convert_text2ids(text)
    #
    #         input_ids_a.append(input_id_a)
    #         attention_masks_a.append(attention_mask_a)
    #
    #         input_ids_b.append(input_id_b)
    #         attention_masks_b.append(attention_mask_b)
    #         if len(input_ids_a) >= batch_size:
    #             inputs = []
    #             input_ids_a = torch.as_tensor(input_ids_a,dtype=torch.long,device=self.device)
    #             attention_masks_a = torch.as_tensor(attention_masks_a,dtype=torch.long,device=self.device)
    #             token_type_ids_a = torch.zeros_like(input_ids_a).to(self.device)
    #
    #             inputs_a = {'input_ids': input_ids_a, 'attention_mask': attention_masks_a,'token_type_ids':token_type_ids_a}
    #
    #             input_ids_b = torch.as_tensor(input_ids_b, dtype=torch.long, device=self.device)
    #             attention_masks_b = torch.as_tensor(attention_masks_b, dtype=torch.long, device=self.device)
    #             token_type_ids_b = torch.zeros_like(input_ids_b).to(self.device)
    #
    #             inputs_b = {'input_ids': input_ids_b, 'attention_mask': attention_masks_b,
    #                         'token_type_ids': token_type_ids_b}
    #
    #             inputs.append(inputs_a)
    #             inputs.append(inputs_b)
    #             logits = self.forward(inputs)
    #
    #
    #             lables = torch.argmax(logits)
    #             result.append(lables)
    #
    #
    #             input_ids_a = []
    #             attention_masks_a = []
    #             input_ids_b = []
    #             attention_masks_b = []
    #
    #
    #
    #     inputs = []
    #     input_ids_a = torch.as_tensor(input_ids_a, dtype=torch.long, device=self.device)
    #     attention_masks_a = torch.as_tensor(attention_masks_a, dtype=torch.long, device=self.device)
    #     token_type_ids_a = torch.zeros_like(input_ids_a).to(self.device)
    #
    #     inputs_a = {'input_ids': input_ids_a, 'attention_mask': attention_masks_a, 'token_type_ids': token_type_ids_a}
    #
    #     input_ids_b = torch.as_tensor(input_ids_b, dtype=torch.long, device=self.device)
    #     attention_masks_b = torch.as_tensor(attention_masks_b, dtype=torch.long, device=self.device)
    #     token_type_ids_b = torch.zeros_like(input_ids_b).to(self.device)
    #
    #     inputs_b = {'input_ids': input_ids_b, 'attention_mask': attention_masks_b,
    #                 'token_type_ids': token_type_ids_b}
    #
    #     inputs.append(inputs_a)
    #     inputs.append(inputs_b)
    #     logits = self.forward(inputs)
    #
    #     labels = torch.argmax(logits)
    #     result.append(labels)
    #
    #
    #     return  result

    def class_infer(self, texts, batch_size=64):
        """
        推理输入文本list中第一条和剩余其他的是否是同一类别;传入长度<batch_size
        :param texts:
        :param tokenizer:
        :param batch_size:
        :return:
        """
        assert len(texts) <= batch_size
        result = []
        text_a = texts[0]
        input_id_a, attention_mask_a = self.convert_text2ids(text_a)
        input_ids_a = []
        attention_masks_a = []
        input_ids_b = []
        attention_masks_b = []
        for text in texts[1:]:
            input_id_b, attention_mask_b = self.convert_text2ids(text)

            input_ids_a.append(input_id_a)
            attention_masks_a.append(attention_mask_a)

            input_ids_b.append(input_id_b)
            attention_masks_b.append(attention_mask_b)

        inputs = []
        input_ids_a = torch.as_tensor(input_ids_a,
                                      dtype=torch.long,
                                      device=self._target_device)
        attention_masks_a = torch.as_tensor(attention_masks_a,
                                            dtype=torch.long,
                                            device=self._target_device)
        token_type_ids_a = torch.zeros_like(input_ids_a).to(
            self._target_device)
        inputs_a = {
            'input_ids': input_ids_a,
            'attention_mask': attention_masks_a,
            'token_type_ids': token_type_ids_a
        }

        input_ids_b = torch.as_tensor(input_ids_b,
                                      dtype=torch.long,
                                      device=self._target_device)
        attention_masks_b = torch.as_tensor(attention_masks_b,
                                            dtype=torch.long,
                                            device=self._target_device)
        token_type_ids_b = torch.zeros_like(input_ids_b).to(
            self._target_device)
        inputs_b = {
            'input_ids': input_ids_b,
            'attention_mask': attention_masks_b,
            'token_type_ids': token_type_ids_b
        }

        inputs.append(inputs_a)
        inputs.append(inputs_b)

        logits = self.forward(inputs)
        labels = torch.argmax(logits, dim=-1)  # one predicted label per (text_a, text_b) pair
        result.append(labels)

        return result

    def similarity_infer(self, texts, batch_size=64):
        """
        计算输入文本list中第一条和剩余其他文本的相似度 传入长度<batch_size
        :param texts:
        :param tokenizer:
        :param batch_size:
        :return:
        """
        assert len(texts) <= batch_size
        input_ids = []
        attention_masks = []
        for text in texts:
            input_id, attention_mask = self.convert_text2ids(text)
            input_ids.append(input_id)
            attention_masks.append(attention_mask)

        input_ids_a = torch.as_tensor(input_ids,
                                      dtype=torch.long,
                                      device=self._target_device)
        attention_masks_a = torch.as_tensor(attention_masks,
                                            dtype=torch.long,
                                            device=self._target_device)
        token_type_ids_a = torch.zeros_like(input_ids_a).to(
            self._target_device)
        inputs = {
            'input_ids': input_ids_a,
            'attention_mask': attention_masks_a,
            'token_type_ids': token_type_ids_a
        }

        embeddings = self.encoding(inputs)

        embedding_a = embeddings[0:1]
        embedding_b = embeddings[1:]

        d = torch.mul(embedding_a, embedding_b)  # element-wise product
        a_len = torch.norm(embedding_a, dim=1)  # L2 norm (vector length)
        b_len = torch.norm(embedding_b, dim=1)
        cos = torch.sum(d, dim=1) / (a_len * b_len)  # cosine similarity
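        # (added note) the four lines above are equivalent to
        #   torch.nn.functional.cosine_similarity(embedding_a, embedding_b, dim=1)
        # via broadcasting of the [1, H] anchor against the [N-1, H] candidates.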

        similarities = cos

        return similarities

    def convert_text2ids(self, text):
        # truncate by characters first, leaving room for [CLS] and [SEP]
        text = text[0:self.max_len - 2]
        # let the tokenizer enforce the token-level limit as well
        inputs = self.tokenizer(text, truncation=True, max_length=self.max_len)

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        # right-pad both sequences with zeros up to max_len
        paddings = [0] * (self.max_len - len(input_ids))
        input_ids += paddings
        attention_mask += paddings

        return input_ids, attention_mask
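
# ---------------------------------------------------------------------------
# Added sketch (not part of the original example): the same masked-mean-pooling
# plus cosine-similarity idea as encoding()/similarity_infer() above, written
# against plain transformers/torch so it is self-contained. The checkpoint name
# "bert-base-chinese" is only an assumption for illustration.
# ---------------------------------------------------------------------------
def _mean_pool_similarity_demo(texts, model_name="bert-base-chinese"):
    import torch
    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained(model_name)
    bert = BertModel.from_pretrained(model_name).eval()

    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = bert(**batch).last_hidden_state          # [B, L, H]
    mask = batch["attention_mask"].unsqueeze(-1).float()  # [B, L, 1]
    emb = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    # cosine similarity between the first text and every other text
    return torch.nn.functional.cosine_similarity(emb[0:1], emb[1:], dim=1)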
Beispiel #19
0
import os

import torch
from transformers import BertConfig, BertModel

# `args` (with a `.size` attribute) comes from an argument parser that is not
# shown in this snippet.

if args.size == 'tiny':
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    bert_name_or_path = os.path.join(os.path.join(cur_dir, 'bert'),
                                     'bert-tiny-uncased-config.json')
elif args.size == 'base':
    bert_name_or_path = "bert-base-uncased"
else:
    bert_name_or_path = "bert-large-uncased"

config = BertConfig.from_pretrained(bert_name_or_path)
model = BertModel(config)
model.eval()
device = torch.device("cpu")
model.to(device)
dummy_input0 = torch.LongTensor(1, 512).fill_(1).to(device)
dummy_input1 = torch.LongTensor(1, 512).fill_(1).to(device)
dummy_input2 = torch.LongTensor(1, 512).fill_(0).to(device)
dummy_input = (dummy_input0, dummy_input1, dummy_input2)
output_path = './bert/bert_{}.onnx'.format(args.size)
torch.onnx.export(model,
                  dummy_input,
                  output_path,
                  export_params=True,
                  opset_version=12,
                  do_constant_folding=True,
                  input_names=["input_ids", "input_mask", "segment_ids"],
                  output_names=["output"])
def train(config, bert_config, train_path, dev_path, rel2id, id2rel,
          tokenizer):
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir, exist_ok=True)
    if os.path.exists('./data/train_file.pkl'):
        train_data = pickle.load(open("./data/train_file.pkl", mode='rb'))
    else:
        train_data = data.load_data(train_path, tokenizer, rel2id, num_rels)
        pickle.dump(train_data, open("./data/train_file.pkl", mode='wb'))
    dev_data = json.load(open(dev_path))
    for sent in dev_data:
        data.to_tuple(sent)
    data_manager = data.SPO(train_data)
    train_sampler = RandomSampler(data_manager)
    train_data_loader = DataLoader(data_manager,
                                   sampler=train_sampler,
                                   batch_size=config.batch_size,
                                   drop_last=True)
    num_train_steps = int(
        len(data_manager) / config.batch_size) * config.max_epoch

    if config.bert_pretrained_model is not None:
        logger.info('load bert weight')
        Bert_model = BertModel.from_pretrained(config.bert_pretrained_model,
                                               config=bert_config)
    else:
        logger.info('random initialize bert model')
        Bert_model = BertModel(config=bert_config)  # weights are freshly initialized in __init__
    Bert_model.to(device)
    submodel = sub_model(config).to(device)
    objmodel = obj_model(config).to(device)

    loss_func = nn.BCELoss(reduction='none')
    params = list(Bert_model.parameters()) + list(
        submodel.parameters()) + list(objmodel.parameters())
    optimizer = AdamW(params, lr=config.lr)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(data_manager))
    logger.info("  Num Epochs = %d", config.max_epoch)
    logger.info("  Total train batch size = %d", config.batch_size)
    logger.info("  Total optimization steps = %d", num_train_steps)
    logger.info("  Logging steps = %d", config.print_freq)
    logger.info("  Save steps = %d", config.save_freq)

    global_step = 0
    Bert_model.train()
    submodel.train()
    objmodel.train()

    for epoch in range(config.max_epoch):
        optimizer.zero_grad()
        epoch_iterator = tqdm(train_data_loader, disable=None)
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device) for t in batch)
            input_ids, segment_ids, input_masks, sub_positions, sub_heads, sub_tails, obj_heads, obj_tails = batch

            bert_output = Bert_model(input_ids, input_masks, segment_ids)[0]
            pred_sub_heads, pred_sub_tails = submodel(
                bert_output)  # [batch_size, seq_len, 1]
            pred_obj_heads, pred_obj_tails = objmodel(bert_output,
                                                      sub_positions)

            # compute the losses
            mask = input_masks.view(-1)

            # loss1: subject head/tail prediction
            sub_heads = sub_heads.unsqueeze(-1)  # [batch_size, seq_len, 1]
            sub_tails = sub_tails.unsqueeze(-1)

            loss1_head = loss_func(pred_sub_heads, sub_heads).view(-1)
            loss1_head = torch.sum(loss1_head * mask) / torch.sum(mask)

            loss1_tail = loss_func(pred_sub_tails, sub_tails).view(-1)
            loss1_tail = torch.sum(loss1_tail * mask) / torch.sum(mask)

            loss1 = loss1_head + loss1_tail
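            # (added note) loss1 above and loss2 below are averaged over real tokens
            # only: the element-wise BCE is multiplied by the attention mask and the
            # sum is divided by the number of unmasked positions.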

            # loss2: object head/tail prediction
            loss2_head = loss_func(pred_obj_heads,
                                   obj_heads).view(-1, obj_heads.shape[-1])
            loss2_head = torch.sum(
                loss2_head * mask.unsqueeze(-1)) / torch.sum(mask)

            loss2_tail = loss_func(pred_obj_tails,
                                   obj_tails).view(-1, obj_tails.shape[-1])
            loss2_tail = torch.sum(
                loss2_tail * mask.unsqueeze(-1)) / torch.sum(mask)

            loss2 = loss2_head + loss2_tail

            # optimize
            loss = loss1 + loss2
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            if (global_step + 1) % config.print_freq == 0:
                logger.info(
                    "epoch: {} step: {} #### loss1: {}  loss2: {}".format(
                        epoch, global_step + 1,
                        loss1.cpu().item(),
                        loss2.cpu().item()))

            if (global_step + 1) % config.eval_freq == 0:
                logger.info("***** Running evaluating *****")
                with torch.no_grad():
                    Bert_model.eval()
                    submodel.eval()
                    objmodel.eval()
                    P, R, F1 = utils.metric(Bert_model, submodel, objmodel,
                                            dev_data, id2rel, tokenizer)
                    logger.info(f'precision:{P}\nrecall:{R}\nF1:{F1}')
                Bert_model.train()
                submodel.train()
                objmodel.train()

            if (global_step + 1) % config.save_freq == 0:
                # Save a trained model
                model_name = "pytorch_model_%d" % (global_step + 1)
                output_model_file = os.path.join(config.output_dir, model_name)
                state = {
                    'bert_state_dict': Bert_model.state_dict(),
                    'subject_state_dict': submodel.state_dict(),
                    'object_state_dict': objmodel.state_dict(),
                }
                torch.save(state, output_model_file)

    model_name = "pytorch_model_last"
    output_model_file = os.path.join(config.output_dir, model_name)
    state = {
        'bert_state_dict': Bert_model.state_dict(),
        'subject_state_dict': submodel.state_dict(),
        'object_state_dict': objmodel.state_dict(),
    }
    torch.save(state, output_model_file)
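
# Added helper (not part of the original example): restore a checkpoint written by
# train() above, which stores the three state dicts under fixed keys in one file.
# It assumes torch is already imported in this snippet.
def load_checkpoint(path, bert_model, submodel, objmodel, map_location="cpu"):
    state = torch.load(path, map_location=map_location)
    bert_model.load_state_dict(state['bert_state_dict'])
    submodel.load_state_dict(state['subject_state_dict'])
    objmodel.load_state_dict(state['object_state_dict'])
    return bert_model, submodel, objmodel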
Beispiel #21
0
class BertForQuestionAnsweringWithCRF(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.hidden_size = self.bert.config.hidden_size
        self.CRF_fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(self.hidden_size, config.num_labels + 2, bias=True),
        )

        self.CRF = CRF(target_size=self.bert.config.num_labels,
                       device=torch.device("cuda"))
        self.CrossEntropyLoss = nn.CrossEntropyLoss()
        self.fc2 = nn.Linear(self.hidden_size, 2, bias=True)
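
        # (added note) two heads share the BERT encoder: fc2 on the [CLS] vector
        # predicts whether the question is answerable (IsQA), while CRF_fc1 + CRF
        # tag the answer span over the remaining tokens.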

    def forward(self, tokens_id_l, token_type_ids_l, answer_offset_l,
                answer_seq_label_l, IsQA_l):

        ## token IDs [batch_size, seq_length]
        tokens_x_2d = torch.LongTensor(tokens_id_l).to(self.device)
        token_type_ids_2d = torch.LongTensor(token_type_ids_l).to(self.device)

        # seq_length excludes the leading [CLS] token
        batch_size, seq_length = tokens_x_2d[:, 1:].size()

        ## CRF answer label IDs [batch_size, seq_length], [CLS] excluded
        y_2d = torch.LongTensor(answer_seq_label_l).to(self.device)[:, 1:]
        ## (batch_size,)
        y_IsQA_2d = torch.LongTensor(IsQA_l).to(self.device)

        if self.training:  # self.training is inherited from the nn.Module base class
            self.bert.train()
            output = self.bert(
                input_ids=tokens_x_2d,
                token_type_ids=token_type_ids_2d,
                output_hidden_states=True,
                return_dict=True)  #[batch_size, seq_len, hidden_size]
        else:
            self.bert.eval()
            with torch.no_grad():
                output = self.bert(input_ids=tokens_x_2d,
                                   token_type_ids=token_type_ids_2d,
                                   output_hidden_states=True,
                                   return_dict=True)

        ## [CLS] for IsQA  [batch_size, hidden_size]
        cls_emb = output.last_hidden_state[:, 0, :]

        IsQA_logits = self.fc2(cls_emb)  ## [batch_size, 2]
        IsQA_loss = self.CrossEntropyLoss(IsQA_logits, y_IsQA_2d)

        ## [batch_size, 1]
        IsQA_prediction = IsQA_logits.argmax(dim=-1).unsqueeze(dim=-1)

        # CRF mask over the tokens after [CLS]: [batch_size, seq_len]
        mask = torch.ones(batch_size, seq_length,
                          dtype=torch.uint8, device=self.device)

        # No [CLS]
        crf_logits = self.CRF_fc1(output.last_hidden_state[:, 1:, :])
        crf_loss = self.CRF.neg_log_likelihood_loss(feats=crf_logits,
                                                    mask=mask,
                                                    tags=y_2d)
        _, CRFprediction = self.CRF.forward(feats=crf_logits, mask=mask)

        return IsQA_prediction, CRFprediction, IsQA_loss, crf_loss, y_2d, y_IsQA_2d.unsqueeze(
            dim=-1)  # (batch_size,) -> (batch_size, 1)
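
# Added usage note (not part of the original class): forward() takes plain Python
# lists of ids/labels and builds the tensors itself; it returns
#   (IsQA_prediction, CRFprediction, IsQA_loss, crf_loss, y_2d, y_IsQA_2d)
# where IsQA_prediction is [batch_size, 1], CRFprediction holds the CRF tag
# predictions for the tokens after [CLS], and the two losses are typically summed
# for training.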