コード例 #1
0
ファイル: bert_vocab.py プロジェクト: seanmacavaney/OpenNIR
 def _forward(self, in_toks, lens=None, seg_id=0):
     """Encode one batch of token sequences with BERT.

     The input is split by ``util.subbatch`` into pieces that, together
     with a leading [CLS] and a trailing [SEP], fit within the model's
     ``max_position_embeddings``. Every piece is encoded as
     ``[CLS] piece [SEP]`` and the per-token outputs are joined back
     together with ``util.un_subbatch``.

     Args:
         in_toks: 2-D tensor of token ids (it is indexed as ``[:, 0]`` and
             ``.shape[1]``). Ids equal to -1 are treated as padding.
         lens: per-sequence lengths handed to ``util.lens2mask``. When
             None, every sequence is assumed to span the full width.
         seg_id: value used to fill the segment ids of every position.

     Returns:
         A ``(result, cls_result)`` pair. The BERT output is treated as a
         sequence of per-layer tensors. When
         ``self.vocab.config['last_layer']`` is set, both values are
         tensors taken from the last layer; otherwise both are lists with
         one entry per layer. ``result`` holds the token outputs with the
         [CLS]/[SEP] positions removed; ``cls_result`` holds the [CLS]
         output averaged over the sub-batches, so its leading dimension
         is ``in_toks.shape[0]`` in both modes.
     """
     if lens is None:
         # No lengths given: assume every sequence is full length, so
         # nothing gets masked out.
         lens = torch.full_like(in_toks[:, 0], in_toks.shape[1])
     BATCH = in_toks.shape[0]
     maxlen = self.bert.config.max_position_embeddings
     MAX_TOK_LEN = maxlen - 2  # -2 for [CLS] and [SEP]
     toks, _ = util.subbatch(in_toks, MAX_TOK_LEN)
     mask = util.lens2mask(lens, in_toks.shape[1])
     mask, _ = util.subbatch(mask, MAX_TOK_LEN)
     CLSS = torch.full_like(toks[:, :1], self.CLS)
     SEPS = torch.full_like(toks[:, :1], self.SEP)
     ONES = torch.ones_like(mask[:, :1])
     toks = torch.cat([CLSS, toks, SEPS], dim=1)
     mask = torch.cat([ONES, mask, ONES], dim=1)
     segment_ids = torch.full_like(toks, seg_id)
     # Change -1 padding to 0-padding (will be masked)
     toks = torch.where(toks == -1, torch.zeros_like(toks), toks)
     result = self.bert(toks, segment_ids, mask)

     def mean_cls(layer):
         # Rows are grouped sub-batch by sub-batch (BATCH rows each), so
         # slice them apart and average the [CLS] vector of every input
         # over its sub-batches.
         cls_output = layer[:, 0]
         pieces = [
             cls_output[i * BATCH:(i + 1) * BATCH]
             for i in range(cls_output.shape[0] // BATCH)
         ]
         return torch.stack(pieces, dim=2).mean(dim=2)

     if not self.vocab.config['last_layer']:
         # Average here as well: the sub-batch count is not returned, so
         # callers could not realign un-averaged [CLS] rows with the batch.
         cls_result = [mean_cls(r) for r in result]
         result = [
             util.un_subbatch(r[:, 1:-1, :], in_toks, MAX_TOK_LEN)
             for r in result
         ]
     else:
         result = result[-1]
         cls_result = mean_cls(result)
         result = util.un_subbatch(result[:, 1:-1, :], in_toks, MAX_TOK_LEN)
     return result, cls_result
コード例 #2
0
ファイル: bert_vocab.py プロジェクト: seanmacavaney/OpenNIR
    def enc_query_doc(self, **inputs):
        """Jointly encode query/document pairs with BERT.

        Each sequence fed to BERT is laid out as
        ``[CLS] query [SEP] doc [SEP]``. The document is split by
        ``util.subbatch`` into pieces that fit in the positions left over
        after the query and the three special tokens; the query (and its
        mask) is repeated once per document piece.

        Args:
            **inputs: must provide ``query_tok``, ``query_len``, ``doc_tok``
                and ``doc_len``. ``query_tok`` is 2-D (unpacked as
                ``BATCH, QLEN``). Token ids equal to -1 are treated as
                padding.

        Returns:
            dict with keys ``'query'``, ``'doc'`` and ``'cls'``:
            the query-position outputs of the first sub-batch, the
            document-position outputs re-joined with ``util.un_subbatch``,
            and the [CLS] output averaged over sub-batches. The BERT
            output is treated as a sequence of per-layer tensors: each
            value is a list with one entry per layer, or the last layer's
            tensor alone when ``self.vocab.config['last_layer']`` is set.
        """
        query_tok, query_len = inputs['query_tok'], inputs['query_len']
        doc_tok, doc_len = inputs['doc_tok'], inputs['doc_len']
        BATCH, QLEN = query_tok.shape
        maxlen = self.bert.config.max_position_embeddings
        MAX_DOC_TOK_LEN = maxlen - QLEN - 3  # -3 [CLS] and 2x[SEP]

        doc_toks, sbcount = util.subbatch(doc_tok, MAX_DOC_TOK_LEN)
        doc_mask = util.lens2mask(doc_len, doc_tok.shape[1])
        doc_mask, _ = util.subbatch(doc_mask, MAX_DOC_TOK_LEN)

        # Repeat the query once per document sub-batch.
        query_toks = torch.cat([query_tok] * sbcount, dim=0)
        query_mask = util.lens2mask(query_len, query_toks.shape[1])
        query_mask = torch.cat([query_mask] * sbcount, dim=0)

        CLSS = torch.full_like(query_toks[:, :1], self.CLS)
        SEPS = torch.full_like(query_toks[:, :1], self.SEP)
        ONES = torch.ones_like(query_mask[:, :1])
        NILS = torch.zeros_like(query_mask[:, :1])
        toks = torch.cat([CLSS, query_toks, SEPS, doc_toks, SEPS], dim=1)
        mask = torch.cat([ONES, query_mask, ONES, doc_mask, ONES], dim=1)
        # Segment 0 covers [CLS] + query + [SEP]; segment 1 covers
        # doc + [SEP]. Broadcast the single columns instead of
        # concatenating one column per position.
        segment_ids = torch.cat(
            [
                NILS.expand(-1, 2 + QLEN),
                ONES.expand(-1, 1 + doc_toks.shape[1]),
            ],
            dim=1,
        )

        # Change -1 padding to 0-padding (will be masked)
        toks = torch.where(toks == -1, torch.zeros_like(toks), toks)

        result = self.bert(toks, segment_ids, mask)

        last_only = self.vocab.config['last_layer']
        # When only the last layer is returned, skip post-processing the
        # other layers; their outputs would be thrown away anyway.
        layers = result[-1:] if last_only else result

        # extract relevant subsequences for query and doc
        query_results = [r[:BATCH, 1:QLEN + 1] for r in layers]
        doc_results = [
            util.un_subbatch(r[:, QLEN + 2:-1], doc_tok, MAX_DOC_TOK_LEN)
            for r in layers
        ]

        cls_results = []
        for r in layers:
            cls_output = r[:, 0]
            # Rows are grouped sub-batch by sub-batch (BATCH rows each);
            # average each input's [CLS] vector over its sub-batches.
            pieces = [
                cls_output[i * BATCH:(i + 1) * BATCH]
                for i in range(cls_output.shape[0] // BATCH)
            ]
            cls_results.append(torch.stack(pieces, dim=2).mean(dim=2))

        if last_only:
            query_results = query_results[-1]
            doc_results = doc_results[-1]
            cls_results = cls_results[-1]

        return {'query': query_results, 'doc': doc_results, 'cls': cls_results}