Example #1
        def create_and_check_bert_model_as_decoder(self, config, input_ids,
                                                   token_type_ids, input_mask,
                                                   sequence_labels,
                                                   token_labels, choice_labels,
                                                   encoder_hidden_states,
                                                   encoder_attention_mask):
            model = BertModel(config)
            model.to(torch_device)
            model.eval()
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask)
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                encoder_hidden_states=encoder_hidden_states)
            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()),
                                        [self.batch_size, self.hidden_size])
Example #2
 def create_and_check_model_as_decoder(
     self,
     config,
     input_ids,
     token_type_ids,
     input_mask,
     sequence_labels,
     token_labels,
     choice_labels,
     encoder_hidden_states,
     encoder_attention_mask,
 ):
     config.add_cross_attention = True
     model = BertModel(config)
     model.to(torch_device)
     model.eval()
     result = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         encoder_hidden_states=encoder_hidden_states,
         encoder_attention_mask=encoder_attention_mask,
     )
     result = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         encoder_hidden_states=encoder_hidden_states,
     )
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
     self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
     self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
Example #3
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(model_file), strict=False)
    device = torch.device(ctx)
    bertmodel.to(device)
    bertmodel.eval()
    with open(vocab_file, 'rt') as f:
        vocab_b_obj = nlp.vocab.BERTVocab.from_json(f.read())
    return bertmodel, vocab_b_obj
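
A minimal usage sketch for the loader above (not from the original source): it assumes the module-level `bert_config` dict shown in Example #16 is defined, and the two paths, which are placeholders here, point to a valid KoBERT checkpoint and vocabulary file.

import torch

# Placeholder paths; substitute the real KoBERT checkpoint and vocab files.
model_file = "./kobert/pytorch_kobert.params"
vocab_file = "./kobert/kobert_vocab.json"

bertmodel, vocab = get_kobert_model(model_file, vocab_file, ctx="cpu")

# Dummy batch of token IDs; 8002 is the vocab_size from the config in Example #16.
input_ids = torch.randint(0, 8002, (1, 16))
attention_mask = torch.ones_like(input_ids)
with torch.no_grad():
    outputs = bertmodel(input_ids, attention_mask=attention_mask)
print(outputs[0].shape)  # sequence output: (1, 16, hidden_size)
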
Example #4
    def __init__(
        self,
        pretrained_model_name=None,
        config_filename=None,
        vocab_size=None,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        max_position_embeddings=512,
    ):
        super().__init__()

        # Check that only one of pretrained_model_name, config_filename, and
        # vocab_size was passed in
        total = 0
        if pretrained_model_name is not None:
            total += 1
        if config_filename is not None:
            total += 1
        if vocab_size is not None:
            total += 1

        if total != 1:
            raise ValueError(
                "Only one of pretrained_model_name, vocab_size, " +
                "or config_filename should be passed into the " +
                "BERT constructor.")

        # Exactly one of the three options is set at this point; the final else
        # branch below is therefore unreachable, but kept as a safeguard.
        if vocab_size is not None:
            config = BertConfig(
                vocab_size_or_config_json_file=vocab_size,
                vocab_size=vocab_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                max_position_embeddings=max_position_embeddings,
            )
            model = BertModel(config)
        elif pretrained_model_name is not None:
            model = BertModel.from_pretrained(pretrained_model_name)
        elif config_filename is not None:
            config = BertConfig.from_json_file(config_filename)
            model = BertModel(config)
        else:
            raise ValueError(
                "Either pretrained_model_name or vocab_size must" +
                " be passed into the BERT constructor")

        model.to(self._device)

        self.add_module("bert", model)
        self.config = model.config
        self._hidden_size = model.config.hidden_size
Example #5
    def __init__(self,
                 *,
                 pretrained_model_name=None,
                 config_filename=None,
                 vocab_size=None,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 max_position_embeddings=512,
                 **kwargs):
        TrainableNM.__init__(self, **kwargs)

        # Check that only one of pretrained_model_name, config_filename, and
        # vocab_size was passed in
        total = 0
        if pretrained_model_name is not None:
            total += 1
        if config_filename is not None:
            total += 1
        if vocab_size is not None:
            total += 1

        if total != 1:
            raise ValueError(
                "Only one of pretrained_model_name, vocab_size, " +
                "or config_filename should be passed into the " +
                "BERT constructor.")

        if vocab_size is not None:
            config = BertConfig(
                vocab_size_or_config_json_file=vocab_size,
                vocab_size=vocab_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                max_position_embeddings=max_position_embeddings,
            )
            model = BertModel(config)
        elif pretrained_model_name is not None:
            model = BertModel.from_pretrained(pretrained_model_name)
        elif config_filename is not None:
            config = BertConfig.from_json_file(config_filename)
            model = BertModel(config)
        else:
            raise ValueError(
                "Either pretrained_model_name or vocab_size must" +
                " be passed into the BERT constructor")

        model.to(self._device)

        self.add_module("bert", model)
        self.config = model.config
        for key, value in self.config.to_dict().items():
            self._local_parameters[key] = value
Example #6
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(model_file))
    device = torch.device(ctx)
    bertmodel.to(device)
    bertmodel.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                         padding_token='[PAD]')
    return bertmodel, vocab_b_obj
Example #7
        def create_and_check_bert_model(self, config, input_ids,
                                        token_type_ids, input_mask,
                                        sequence_labels, token_labels,
                                        choice_labels):
            model = BertModel(config=config)
            model.to(input_ids.device)
            model.eval()

            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids)

            # this fails because the model description defines no loss output
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc
            ], [self.last_hidden_state_desc, self.pooler_output_desc])
            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model,
                None,
                model_desc,
                "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription(
                    'Learning_Rate', [1], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=args_fp16,
                allreduce_post_accumulation=args_allreduce_post_accumulation)

            sequence_output, pooled_output = model(
                input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()),
                                        [self.batch_size, self.hidden_size])
Example #8
    def __init__(self,
                 bert: BertModel,
                 tokenizer: BertTokenizer,
                 hypernym_list: Union[str, Path, List[List[str]]],
                 embed_with_encoder_output: bool = True,
                 embed_wo_special_tokens: bool = True,
                 use_projection: bool = False,
                 batch_size: int = 128):
        super(HyBert, self).__init__()

        self.bert = bert.to(device)
        if not isinstance(hypernym_list, (list, dict)):
            hypernym_list = self._read_hypernym_list(hypernym_list)

        self.tokenizer = tokenizer
        self.hypernym_list = hypernym_list
        self.use_projection = use_projection

        print(f"Building matrix of hypernym embeddings.")
        self.hypernym_embeddings = \
            torch.nn.Parameter(self._build_hypernym_matrix(hypernym_list,
                                                           embed_with_encoder_output,
                                                           embed_wo_special_tokens,
                                                           batch_size))
        if self.use_projection:
            self.projection = nn.Linear(768, 768)
Example #9
 def create_and_check_bert_model(self, config, input_ids, token_type_ids,
                                 input_mask, sequence_labels, token_labels,
                                 choice_labels):
     model = BertModel(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids,
                    attention_mask=input_mask,
                    token_type_ids=token_type_ids)
     result = model(input_ids, token_type_ids=token_type_ids)
     result = model(input_ids)
     self.parent.assertEqual(
         result.last_hidden_state.shape,
         (self.batch_size, self.seq_length, self.hidden_size))
     self.parent.assertEqual(result.pooler_output.shape,
                             (self.batch_size, self.hidden_size))
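
The same shape check can be reproduced outside the test harness. A small standalone sketch (not from the source), using an arbitrary tiny, randomly initialized config and assuming a transformers version whose BertModel returns a ModelOutput, as the test above does:

import torch
from transformers import BertConfig, BertModel

# Deliberately tiny model: cheap to build and enough to verify output shapes.
config = BertConfig(vocab_size=99, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=4, intermediate_size=37)
model = BertModel(config)
model.eval()

batch_size, seq_length = 3, 7
input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_length))
with torch.no_grad():
    result = model(input_ids)

assert result.last_hidden_state.shape == (batch_size, seq_length, config.hidden_size)
assert result.pooler_output.shape == (batch_size, config.hidden_size)
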
Example #10
 def __init__(self,
              model: BertModel,
              tokenizer: TokenizerWrapper,
              device=None):
     self.device = torch.device(device if device is not None else "cuda"
                                if torch.cuda.is_available() else "cpu")
     self.model = model.to(self.device)
     self.tokenizer = tokenizer
Example #11
    def _calculate_token_embeddings(self, df: pd.DataFrame,
                                    embedder: BertModel):

        all_embeddings = []
        for id, sentence_df in tqdm(df.groupby(SENT_ID),
                                    desc='Creating Bert Embeddings',
                                    unit='sentence'):
            tokens_list = list(
                sentence_df.groupby(TOKEN_ID).first()[TOKEN_STR])
            sentence = ' '.join(tokens_list)

            input_ids = self.tokenizer.encode(sentence)
            input_ids = torch.tensor(input_ids).unsqueeze(0).to(
                GPU_ID)  # .to('cpu')
            token_embeddings = self._creat_embeddings(input_ids).to('cpu')
            sent_emb = self._untokenize(tokens_list, token_embeddings)
            # sent_emb = self._untokenize_bpe(tokens_list, token_embeddings)
            all_embeddings.append(sent_emb.data.numpy())

        embedder.to('cpu')
        return all_embeddings
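
`self._creat_embeddings` is not shown in this snippet. A plausible stand-in (an assumption, not the original implementation): it runs the `BertModel` embedder on one encoded sentence and returns the per-token hidden states.

import torch
from transformers import BertModel

def create_embeddings(embedder: BertModel, input_ids: torch.Tensor) -> torch.Tensor:
    """Return per-token hidden states of shape (seq_len, hidden_size) for one sentence."""
    embedder.eval()
    with torch.no_grad():
        outputs = embedder(input_ids)   # input_ids: (1, seq_len)
    return outputs[0].squeeze(0)        # drop the batch dimension
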
Example #12
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):

    bert_config_file = os.path.join(BERT_PT_PATH,
                                    f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH,
                                   f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if not no_pretraining:
        model_bert.load_state_dict(
            torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
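
A usage sketch for `get_bert` (not from the source): the directory path and `bert_type` below are placeholders, the directory is assumed to contain `bert_config_<type>.json`, `vocab_<type>.txt` and `pytorch_model_<type>.bin`, and the surrounding module is assumed to provide `tokenization.FullTokenizer`, a BertConfig with `print_status()`, and the global `device` the function uses.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Placeholder path/type; the directory must hold the three files named above.
model_bert, tokenizer, bert_config = get_bert(BERT_PT_PATH="./data_and_model",
                                              bert_type="uncased_L-12_H-768_A-12",
                                              do_lower_case=True,
                                              no_pretraining=False)
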
Example #13
def train(config, bert_config, train_path, dev_path, rel2id, id2rel,
          tokenizer):
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir, exist_ok=True)
    if os.path.exists('./data/train_file.pkl'):
        train_data = pickle.load(open("./data/train_file.pkl", mode='rb'))
    else:
        train_data = data.load_data(train_path, tokenizer, rel2id, num_rels)
        pickle.dump(train_data, open("./data/train_file.pkl", mode='wb'))
    dev_data = json.load(open(dev_path))
    for sent in dev_data:
        data.to_tuple(sent)
    data_manager = data.SPO(train_data)
    train_sampler = RandomSampler(data_manager)
    train_data_loader = DataLoader(data_manager,
                                   sampler=train_sampler,
                                   batch_size=config.batch_size,
                                   drop_last=True)
    num_train_steps = int(
        len(data_manager) / config.batch_size) * config.max_epoch

    if config.bert_pretrained_model is not None:
        logger.info('load bert weight')
        Bert_model = BertModel.from_pretrained(config.bert_pretrained_model,
                                               config=bert_config)
    else:
        logger.info('random initialize bert model')
        Bert_model = BertModel(config=bert_config)
        Bert_model.init_weights()  # init_weights() returns None, so it must not be chained
    Bert_model.to(device)
    submodel = sub_model(config).to(device)
    objmodel = obj_model(config).to(device)

    loss_fuc = nn.BCELoss(reduction='none')
    params = list(Bert_model.parameters()) + list(
        submodel.parameters()) + list(objmodel.parameters())
    optimizer = AdamW(params, lr=config.lr)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(data_manager))
    logger.info("  Num Epochs = %d", config.max_epoch)
    logger.info("  Total train batch size = %d", config.batch_size)
    logger.info("  Total optimization steps = %d", num_train_steps)
    logger.info("  Logging steps = %d", config.print_freq)
    logger.info("  Save steps = %d", config.save_freq)

    global_step = 0
    Bert_model.train()
    submodel.train()
    objmodel.train()

    for epoch in range(config.max_epoch):
        optimizer.zero_grad()
        epoch_iterator = tqdm(train_data_loader, disable=None)
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device) for t in batch)
            input_ids, segment_ids, input_masks, sub_positions, sub_heads, sub_tails, obj_heads, obj_tails = batch

            bert_output = Bert_model(input_ids, input_masks, segment_ids)[0]
            pred_sub_heads, pred_sub_tails = submodel(
                bert_output)  # [batch_size, seq_len, 1]
            pred_obj_heads, pred_obj_tails = objmodel(bert_output,
                                                      sub_positions)

            # compute the losses
            mask = input_masks.view(-1)

            # loss1
            sub_heads = sub_heads.unsqueeze(-1)  # [batch_size, seq_len, 1]
            sub_tails = sub_tails.unsqueeze(-1)

            loss1_head = loss_fuc(pred_sub_heads, sub_heads).view(-1)
            loss1_head = torch.sum(loss1_head * mask) / torch.sum(mask)

            loss1_tail = loss_fuc(pred_sub_tails, sub_tails).view(-1)
            loss1_tail = torch.sum(loss1_tail * mask) / torch.sum(mask)

            loss1 = loss1_head + loss1_tail

            # loss2
            loss2_head = loss_fuc(pred_obj_heads,
                                  obj_heads).view(-1, obj_heads.shape[-1])
            loss2_head = torch.sum(
                loss2_head * mask.unsqueeze(-1)) / torch.sum(mask)

            loss2_tail = loss_fuc(pred_obj_tails,
                                  obj_tails).view(-1, obj_tails.shape[-1])
            loss2_tail = torch.sum(
                loss2_tail * mask.unsqueeze(-1)) / torch.sum(mask)

            loss2 = loss2_head + loss2_tail

            # optimize
            loss = loss1 + loss2
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            if (global_step + 1) % config.print_freq == 0:
                logger.info(
                    "epoch : {} step: {} #### loss1: {}  loss2: {}".format(
                        epoch, global_step + 1,
                        loss1.cpu().item(),
                        loss2.cpu().item()))

            if (global_step + 1) % config.eval_freq == 0:
                logger.info("***** Running evaluating *****")
                with torch.no_grad():
                    Bert_model.eval()
                    submodel.eval()
                    objmodel.eval()
                    P, R, F1 = utils.metric(Bert_model, submodel, objmodel,
                                            dev_data, id2rel, tokenizer)
                    logger.info(f'precision:{P}\nrecall:{R}\nF1:{F1}')
                Bert_model.train()
                submodel.train()
                objmodel.train()

            if (global_step + 1) % config.save_freq == 0:
                # Save a trained model
                model_name = "pytorch_model_%d" % (global_step + 1)
                output_model_file = os.path.join(config.output_dir, model_name)
                state = {
                    'bert_state_dict': Bert_model.state_dict(),
                    'subject_state_dict': submodel.state_dict(),
                    'object_state_dict': objmodel.state_dict(),
                }
                torch.save(state, output_model_file)

    model_name = "pytorch_model_last"
    output_model_file = os.path.join(config.output_dir, model_name)
    state = {
        'bert_state_dict': Bert_model.state_dict(),
        'subject_state_dict': submodel.state_dict(),
        'object_state_dict': objmodel.state_dict(),
    }
    torch.save(state, output_model_file)
Example #14
class UnStructuredModel:

    def __init__(self, model_name, max_length, stride):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.max_length = max_length
        self.stride = stride
        if model_name == 'bert-base-uncased':
            self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
            # from_pretrained is a classmethod; building BertModel(BertConfig()) first is unnecessary
            self.model = BertModel.from_pretrained(self.model_name)
            self.model.to(device)
            self.model.eval()
            for param in self.model.parameters():
                param.requires_grad = False
            #self.model.bert.embeddings.requires_grad = False


    def padTokens(self, tokens):
        if len(tokens)<self.max_length:
            tokens = tokens + ["[PAD]" for i in range(self.max_length - len(tokens))]
        return tokens

    def getEmbedding(self, text, if_pool=True, pooling_type="mean", batchsize = 1):
        tokens = self.tokenizer.tokenize(text)
        tokenized_array = self.tokenizeText(tokens)
        embeddingTensorsList = []
        print(len(tokenized_array))
        tensor = torch.zeros([1, 768], device=device)
        count = 0
        if len(tokenized_array)>batchsize:
            for i in range(0, len(tokenized_array), batchsize):
                current_tokens = tokenized_array[i:min(i+batchsize,len(tokenized_array))]
                token_ids = torch.tensor(current_tokens).to(device)
                seg_ids=[[0 for _ in range(len(tokenized_array[0]))] for _ in range(len(current_tokens))]
                seg_ids   = torch.tensor(seg_ids).to(device)
                hidden_reps, cls_head = self.model(token_ids, token_type_ids=seg_ids)
                cls_head = cls_head.to(device)
                cls_head = cls_head.detach()  # reassign: detach() is not in-place
                if if_pool and pooling_type=="mean":
                    tensor = tensor.add(torch.sum(cls_head, dim=0))
                    count +=cls_head.shape[0]
                else:
                    embeddingTensorsList.append(cls_head)
                del cls_head, hidden_reps
            if if_pool and pooling_type=="mean" and count>0:
                embedding = torch.div(tensor, count)
            elif not if_pool:
                embedding = torch.cat(embeddingTensorsList, dim=0)
            else:
                raise NotImplementedError()

        else:
            token_ids = torch.tensor(tokenized_array).to(device)
            seg_ids=[[0 for _ in range(len(tokenized_array[0]))] for _ in range(len(tokenized_array))]
            seg_ids   = torch.tensor(seg_ids).to(device)
            hidden_reps, cls_head = self.model(token_ids, token_type_ids = seg_ids)
            cls_head = cls_head.to(device)
            cls_head.requires_grad = False
            if if_pool and pooling_type=="mean":
                embedding = torch.div(torch.sum(cls_head, dim=0), cls_head.shape[0])
            elif not if_pool:
                embedding = cls_head
            else:
                raise NotImplementedError()
            del cls_head, hidden_reps
        return embedding

    def tokenizeText(self, tokens):
        tokens_array = []
        #window_movement_tokens =  max_length - stride
        for i in range(0, len(tokens), self.stride):
            if i+self.max_length<len(tokens):
                curr_tokens = ["[CLS]"] + tokens[i:i+self.max_length] + ["[SEP]"]
            else:
                padded_tokens = self.padTokens(tokens[i:i+self.max_length])
                curr_tokens = ["[CLS]"] + padded_tokens + ["[SEP]"]
            curr_tokens = self.tokenizer.convert_tokens_to_ids(curr_tokens)
            tokens_array.append(curr_tokens)
        return tokens_array
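
A usage sketch for the class above (not part of the source): it assumes a module-level `device`, network access to download `bert-base-uncased`, and a transformers version in which BertModel returns a `(sequence_output, pooled_output)` tuple, which the tuple unpacking inside `getEmbedding` requires.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# max_length counts wordpiece tokens per window (before [CLS]/[SEP] are added);
# stride controls how far consecutive windows are shifted.
embedder = UnStructuredModel("bert-base-uncased", max_length=200, stride=100)
embedding = embedder.getEmbedding("A long document goes here ...")
print(embedding.shape)  # mean-pooled pooler output
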
Example #15
class BertVisdEmbedding(nn.Module):
    '''
      Layer that generates BERT contextual representations.
      '''
    def __init__(self, config=None, device=t.device("cpu")):
        '''
          Args:
            @config: configuration (BertConfig) of the internal Bert layer; if None, pretrained 'bert-base-uncased' weights are loaded
          '''
        super(BertVisdEmbedding, self).__init__()
        if config is None:
            self.bert = BertModel.from_pretrained('bert-base-uncased')
        else:
            self.bert = BertModel(config=config)  # transformers correspondence
        self.device = device
        self.bert_hidden_size = self.bert.config.hidden_size
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.CLS = tokenizer.convert_tokens_to_ids(
            ['[CLS]'])[0]  #ID of the Bert [CLS] token
        self.SEP = tokenizer.convert_tokens_to_ids(
            ['[SEP]'])[0]  #ID of the Bert [SEP] token
        self.PAD = tokenizer.convert_tokens_to_ids(
            ['[PAD]'])[0]  #ID of the Bert [PAD] token

    def make_bert_input(self, content_idxs, content_type, seg_ids):
        '''
          Args:
            @content_idxs (tensor): Bert IDs of the content. (batch_size, max_seq_len) Note that the max_seq_len is a fixed number due to padding/clamping policy.
            @content_type (str): whether the content is "question", "history" or "answer".
            @the initial segment ID: for "question" and "answer", this should be None; for 'history', this is should be well-initialized [0,..,0,1,...,1].
          Return:
            cmp_idx (tensor): [CLS] context_idxs [SEP]. (batch_size, max_seq_len+2)
            segment_ids (tensor): for "question" and "answer", this should be "1,1,...,1"; for "history", this should be "seg_ids[0], seg_ids, seg_ids[-1]". (batch_size, max_seq_len+2)
            input_mask (tensor): attention of the real token in content. Note [CLS] and [SEP] are count as real token. (batch_size, q_len + ctx_len + 2)
          '''
        mask = content_idxs != self.PAD  #get the mask indicating the non-padding tokens in the content
        if content_type == 'question' or content_type == 'answer':  #question/answer type
            seg_ids = t.zeros_like(content_idxs,
                                   dtype=content_idxs.dtype,
                                   device=content_idxs.device)

        seq_len = mask.sum(dim=1)  #(batch_size, ) length of each sequence
        batch_size, _ = content_idxs.size()
        content_idxs = t.cat(
            (content_idxs,
             t.tensor([[self.PAD]] * batch_size, device=content_idxs.device)),
            dim=1)  #(batch_size, max_seq_len+1)
        # append [SEP] to obtain "content_idxs [SEP]"
        content_idxs[t.arange(0, batch_size), seq_len] = self.SEP
        # last segment id of each sequence
        seg_last = seg_ids[t.arange(0, batch_size), seq_len - 1]
        seg_ids = t.cat(
            (seg_ids, t.tensor([[0]] * batch_size,
                               device=content_idxs.device)),
            dim=1)  # (batch_size, max_seq_len+1)
        # segment id of the newly appended [SEP]
        seg_ids[t.arange(0, batch_size), seq_len] = seg_last
        # prepend [CLS] to obtain "[CLS] content_idxs [SEP]"  (batch_size, max_seq_len+2)
        content_idxs = t.cat(
            (t.tensor([[self.CLS]] * batch_size,
                      device=content_idxs.device), content_idxs),
            dim=1)
        # extend the first column of the segment ids  (batch_size, max_seq_len+2)
        seg_ids = t.cat((seg_ids[:, 0].view(-1, 1), seg_ids), dim=1)
        input_mask = (content_idxs != self.PAD).long()  # (batch_size, max_seq_len+2)

        return content_idxs, seg_ids, input_mask

    def parse_bert_output(self, bert_output, orig_PAD_mask):
        '''
          Args:
            @bert_output (tensor): Bert output with [CLS] and [SEP] embeddings. (batch_size, 1+max_seq_len+1, bert_hidden_size) 
            @orig_PAD_mask (tensor): 1 for PAD token, 0 for non-PAD token. (batch_size, max_seq_len)
          Return:
            bert_enc (tensor): Bert output without [CLS] and [SEP] embeddings, and with zero-embedding for all PAD tokens. (batch_size, max_seq_len, bert_hidden_size)
          '''
        bert_enc = bert_output[:, 1:-1]  # (batch_size, max_seq_len, bert_hidden_size)
        # manually set the embedding of PAD tokens to zero
        pad_emb = t.zeros(self.bert_hidden_size, device=bert_output.device)
        #print(bert_enc.size(), orig_PAD_mask.size(), pad_emb.size(), bert_enc.device, orig_PAD_mask.device, pad_emb.device)
        bert_enc = bert_enc.contiguous()
        bert_enc[orig_PAD_mask] = pad_emb  # zero out the PAD token embeddings
        return bert_enc

    def forward(self, content_idxs, content_type, seg_ids=None):
        '''
          Args:
            @content_idxs (tensor): Bert IDs of the contents. (batch_size, max_seq_len) Note that the max_seq_len is a fixed number due to padding/clamping policy
            @content_type (str): whether the tensor is "question", "history" or "answer"
          Return:
            bert_ctx_emb (tensor): contextual embedding condition on question. (batch_size, max_seq_len, bert_hidden_size)
          '''
        orig_PAD_mask = content_idxs == self.PAD
        cmp_idxs, segment_ids, bert_att = self.make_bert_input(
            content_idxs, content_type, seg_ids)
        # keyword arguments: in transformers' BertModel, the second positional argument is attention_mask
        outputs = self.bert(cmp_idxs,
                            attention_mask=bert_att,
                            token_type_ids=segment_ids)
        bert_output = outputs[0]
        bert_enc = self.parse_bert_output(bert_output, orig_PAD_mask)
        return bert_enc

    def train(self, mode=True):
        '''
          Specifically set self.bert into training mode
          '''
        self.training = mode
        self.bert.train(mode)
        return self

    def eval(self):
        '''
          Specifically set self.bert into evaluation mode 
          '''
        return self.train(False)

    def to(self, *args, **kwargs):
        '''
          Override to() interface.
          '''
        print("bert emd to() called!")
        self = super().to(*args, **kwargs)
        self.bert = self.bert.to(*args, **kwargs)
        return self
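
A usage sketch for the embedding layer (not from the source): token IDs come from the matching `bert-base-uncased` tokenizer and are padded to a fixed length with the layer's own PAD id; downloading the pretrained weights is assumed to work.

import torch as t
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_emb = BertVisdEmbedding()  # config=None -> pretrained bert-base-uncased

max_seq_len = 12
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("where is the red cube ?"))
ids = ids + [bert_emb.PAD] * (max_seq_len - len(ids))  # pad; [CLS]/[SEP] are added inside
question_idxs = t.tensor([ids])                        # (1, max_seq_len)

with t.no_grad():
    enc = bert_emb(question_idxs, content_type='question')
print(enc.shape)  # (1, max_seq_len, bert_hidden_size)
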
Example #16
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 768,
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'max_position_embeddings': 512,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
    'type_vocab_size': 2,
    'vocab_size': 8002
}

if __name__ == "__main__":
    ctx = "cpu"
    # kobert
    kobert_model_file = "./kobert_resources/pytorch_kobert_2439f391a6.params"
    kobert_vocab_file = "./kobert_resources/kobert_news_wiki_ko_cased-ae5711deb3.spiece"

    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(kobert_model_file))
    device = torch.device(ctx)
    bertmodel.to(device)
    # bertmodel.eval()

    # for name, param in bertmodel.named_parameters():
    #     print(name, param.shape)

    for name, param in bertmodel.named_parameters():
        if param.requires_grad:
            print(name, param.shape)
Example #17
from transformers import BertConfig, BertModel

if args.size == 'tiny':
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    bert_name_or_path = os.path.join(os.path.join(cur_dir, 'bert'),
                                     'bert-tiny-uncased-config.json')
elif args.size == 'base':
    bert_name_or_path = "bert-base-uncased"
else:
    bert_name_or_path = "bert-large-uncased"

config = BertConfig.from_pretrained(bert_name_or_path)
model = BertModel(config)
model.eval()
device = torch.device("cpu")
model.to(device)
dummy_input0 = torch.LongTensor(1, 512).fill_(1).to(device)
dummy_input1 = torch.LongTensor(1, 512).fill_(1).to(device)
dummy_input2 = torch.LongTensor(1, 512).fill_(0).to(device)
dummy_input = (dummy_input0, dummy_input1, dummy_input2)
output_path = './bert/bert_{}.onnx'.format(args.size)
torch.onnx.export(model,
                  dummy_input,
                  output_path,
                  export_params=True,
                  opset_version=12,
                  do_constant_folding=True,
                  input_names=["input_ids", "input_mask", "segment_ids"],
                  output_names=["output"],
                  dynamic_axes={
                      'input_ids': {
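
The export call is cut off in the source; assuming it completes with dynamic axes over the batch dimension, the exported graph can be sanity-checked with onnxruntime. A sketch (the path must match `output_path` above; here the 'base' export is assumed):

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("./bert/bert_base.onnx")
feed = {
    "input_ids": np.ones((1, 512), dtype=np.int64),
    "input_mask": np.ones((1, 512), dtype=np.int64),
    "segment_ids": np.zeros((1, 512), dtype=np.int64),
}
outputs = session.run(None, feed)
print([o.shape for o in outputs])  # first output: (1, 512, hidden_size)
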
Example #18
class Embedder():
    def __init__(self, vectorizer=None, tokenizer=None, dim_embed=200):
        """
        :param tokenizer: KB 
        """
        self.vectorizer = vectorizer
        self.tokenizer = tokenizer
        self.pre_trained = pre_trained = vectorizer.pre_trained
        self.n_tag = self.vectorizer.n_tag

        if 'bert' in pre_trained.lower():
            self.tag2vec = None
            import sys
            if pre_trained == 'bert-multi':
                from transformers import BertModel, BertConfig
                bert_config = BertConfig.from_pretrained(
                    'bert-base-multilingual-cased', output_hidden_states=True)
                self.bert = BertModel(bert_config).to(device)
            elif pre_trained == 'sktkobert':
                from kobert.pytorch_kobert import get_pytorch_kobert_model
                #sys.path.append('/home/bwlee/work/codes/sentence_similarity/kobert')
                #from pytorch_kobert3 import get_pytorch_kobert_model
                self.bert, _ = get_pytorch_kobert_model()
                self.bert = self.bert.to(device)
            elif pre_trained == 'kbalbert':
                sys.path.append(
                    '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/')
                from transformers import AlbertModel
                kbalbert_path = '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/model'
                self.bert = AlbertModel.from_pretrained(
                    kbalbert_path, output_hidden_states=True)
                self.bert = self.bert.to(device)
            else:
                from transformers import BertModel, BertConfig
                bert_config = BertConfig.from_pretrained(
                    pre_trained, output_hidden_states=True)
                self.bert = BertModel(bert_config).to(device)
        else:
            self.tag2vec = self.vectorizer.tag2vec
            self.n_vocab = len(self.vectorizer.tag2vec)
            if pre_trained == '':
                self.embed = nn.Embedding(num_embeddings=self.n_tag,
                                          embedding_dim=dim_embed,
                                          padding_idx=self.tag2ix[PAD_TAG])

    def set_embed(self, weights=None, bias=None):
        if weights is not None:
            self.embed.weight.data = weights
        if bias is not None:
            self.embed.bias.data = bias

    def __call__(self, text_arr, flag_sent=True):
        """
        Note: type_ids=None gives a different result with bert-multi.
        :param text_arr: accepts text in iterable form like batch
        """
        if type(text_arr) is str:
            print('warning: text should be in batch form')
            text_arr = [text_arr]

        if self.pre_trained == '':
            return self._call_manual(text_arr)
        elif self.pre_trained == 'glove':
            return self._call_glove(text_arr)
        elif 'bert' in self.pre_trained:
            return self._call_bert(text_arr, flag_sent)

    def _call_manual(self, text_arr):
        """
        :param text_arr: accepts text in iterable form like batch
        """
        idss = []
        for text in text_arr:
            seq = self.tokenizer.tokenize(text)
            ids = self.vectorizer.get_ids(seq)
            idss.append(ids)
        idss = torch.LongTensor(idss)
        return self.embed(idss)

    def _call_glove(self, text_arr):
        """
        :param text_arr: accepts text in iterable form like batch
        """
        vecs = []
        dim_glove = len(self.vectorizer.tag2vec['the'])
        zero = [0] * dim_glove
        for text in text_arr:
            seq = self.tokenizer.tokenize(text)
            vec = [
                self.vectorizer.tag2vec[token]
                if token in self.vectorizer.tags else zero for token in seq
            ]
            vecs.append(vec)  # collect one vector list per sentence (inside the loop)
        return torch.tensor(vecs)

    def _call_bert(self, text_arr, flag_sent):
        idss, masks, type_ids = [], [], []
        for text in text_arr:
            seq = self.tokenizer.tokenize(text)
            ids, mask, type_id = self.vectorizer.get_ids_bert(seq)
            idss.append(ids)
            masks.append(mask)
            type_ids.append(type_id)

        with torch.no_grad():
            idss = torch.tensor(idss).to(device)
            masks = torch.tensor(masks).to(device)
            type_ids = torch.tensor(type_ids).to(device)
            #type_ids = None # bert-multi gives different values
            clss, last, hiddens = self.bert(idss,
                                            attention_mask=masks,
                                            token_type_ids=type_ids)  #kbalbert

            if flag_sent is True:
                length = torch.sum(masks,
                                   dim=1)  # lengths of words in each sentence
                length = torch.sqrt(length * 1.0).unsqueeze(1)
                masks2 = masks.unsqueeze(2)
                context = torch.sum(hiddens[-2] * masks2, dim=1) / length
            else:
                return clss, last, hiddens
        return context
Example #19
def main(config_path):
    config = Box.from_yaml(config_path.open())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    if not save_dir.exists():
        save_dir.mkdir()
    transfo_dir = Path(config.data.transfo_dir)
    device = create_device(config.train.device)

    tokenizer = BertTokenizer.from_pretrained(
        str(transfo_dir), do_lower_case=(not config.data.cased))

    global CLS
    global SEP
    global PAD
    CLS, SEP, PAD = tokenizer.convert_tokens_to_ids(
        ["[CLS]", "[SEP]", "[PAD]"])

    bert_config = BertConfig.from_pretrained(str(transfo_dir))
    # To extract representations from other layers
    bert_config.output_hidden_states = True
    model = BertModel(bert_config)
    model.to(device)
    model.eval()

    train_file = data_dir / "schema_dstc8+m2.2.json"
    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    train_desc_file = save_dir / "train_schema_desc.pkl"
    valid_file = data_dir / "dev" / "schema.json"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"
    valid_desc_file = save_dir / "valid_schema_desc.pkl"
    if (data_dir / "test").exists():
        test_file = data_dir / "test" / "schema.json"
        test_vocab_file = save_dir / "test_schema_vocab.pkl"
        test_embed_file = save_dir / "test_schema_embed.pkl"
        test_desc_file = save_dir / "test_schema_desc.pkl"
    else:
        test_file = None
        test_vocab_file = None
        test_embed_file = None
        test_desc_file = None

    train_schema_vocab, train_desc = extract(train_file,
                                             config.data.concat_name)
    valid_schema_vocab, valid_desc = extract(valid_file,
                                             config.data.concat_name)
    if test_file is not None:
        test_schema_vocab, test_desc = extract(test_file,
                                               config.data.concat_name)
    else:
        test_schema_vocab = test_desc = None

    pickle.dump(train_schema_vocab, open(train_vocab_file, "wb"))
    pickle.dump(valid_schema_vocab, open(valid_vocab_file, "wb"))
    if test_schema_vocab is not None:
        pickle.dump(test_schema_vocab, open(test_vocab_file, "wb"))

    layer = config.data.schema.layer
    pooling = config.data.schema.pooling

    train_embed = []
    for desc in tqdm(train_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        train_embed.append(embed)

    train_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                   for sent in desc] for desc in train_desc]

    pickle.dump(train_embed, open(train_embed_file, "wb"))
    pickle.dump(train_desc, open(train_desc_file, "wb"))

    valid_embed = []
    for desc in tqdm(valid_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        valid_embed.append(embed)

    valid_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                   for sent in desc] for desc in valid_desc]

    pickle.dump(valid_embed, open(valid_embed_file, "wb"))
    pickle.dump(valid_desc, open(valid_desc_file, "wb"))

    if test_desc is None:
        exit()

    test_embed = []
    for desc in tqdm(test_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        test_embed.append(embed)

    test_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                  for sent in desc] for desc in test_desc]

    pickle.dump(test_embed, open(test_embed_file, "wb"))
    pickle.dump(test_desc, open(test_desc_file, "wb"))