Example #1
    def tensorize_random_mask(self, batch_text, bsize=None):
        assert type(batch_text) in [list, tuple], (type(batch_text))

        # add a placeholder token for the [Q] marker
        batch_text = ['. ' + x for x in batch_text]

        batch_ids = []
        batch_masks = []
        for text in batch_text:
            tokens = self.tok.tokenize(text)
            pos = nltk.pos_tag(tokens)
            new_tokens = []
            new_masks = []
            for i in range(len(tokens)):
                p = pos[i]
                token = tokens[i]
                # if the token is a noun or adjective, not a stopword, and not a subword piece, follow it with a [MASK]
                if ('NN' in p[1] or 'JJ' in p[1]) and (
                        token not in self.nltk_stopwords) and '#' not in token:
                    new_tokens.append(token)
                    new_masks.append(1)
                    # mask value 1 lets BERT self-attend to the [MASK] position (mask_v2); 0 blocks it (mask_v1)
                    # not letting BERT self-attend to [MASK] may keep it from breaking the syntactic structure? the last layer still predicts it
                    new_tokens.append('[MASK]')
                    new_masks.append(0)
                else:
                    new_tokens.append(token)
                    new_masks.append(1)

            # pad or truncate to exactly query_maxlen tokens
            if len(new_tokens) >= self.query_maxlen - 2:
                new_tokens = ['[CLS]'] + new_tokens[:self.query_maxlen - 2] + ['[SEP]']
                new_masks = [1] + new_masks[:self.query_maxlen - 2] + [0]
            else:
                new_tokens = ['[CLS]'] + new_tokens + ['[MASK]'] * (
                    self.query_maxlen - 2 - len(new_tokens)) + ['[SEP]']
                # prepend the [CLS] mask value so the mask stays aligned with the tokens
                new_masks = [1] + new_masks + [0] * (
                    self.query_maxlen - 1 - len(new_masks))
            new_ids = self.tok.convert_tokens_to_ids(new_tokens)
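            # overwrite the '.' placeholder (position 1, right after [CLS]) with the [Q] marker token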
            new_ids[1] = self.Q_marker_token_id

            # print(text)
            # print(new_tokens , len(new_tokens))
            # print(new_ids , len(new_ids))
            # print(new_masks , len(new_masks))

            batch_ids.append(new_ids)
            batch_masks.append(new_masks)

        ids = torch.tensor(batch_ids)
        mask = torch.tensor(batch_masks)

        if bsize:
            batches = _split_into_batches(ids, mask, bsize)
            return batches

        return ids, mask
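The `_split_into_batches` helper called above is not shown in these examples. The sketch below is one minimal way it could be written, assuming it only needs to chunk the id/mask tensors into sub-batches of at most `bsize` rows; the name and signature come from the call sites, so the body is an assumption.

def _split_into_batches(ids, mask, bsize):
    # chunk the (ids, mask) tensors along the batch dimension into sub-batches of at most `bsize` rows
    batches = []
    for offset in range(0, ids.size(0), bsize):
        batches.append((ids[offset:offset + bsize], mask[offset:offset + bsize]))
    return batches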
Example #2
    def tensorize(self, batch_text, bsize=None):
        assert type(batch_text) in [list, tuple], (type(batch_text))

        # add a placeholder token for the [Q] marker
        batch_text = ['. ' + x for x in batch_text]

        obj = self.tok(batch_text,
                       padding='max_length',
                       truncation=True,
                       return_tensors='pt',
                       max_length=self.query_maxlen)

        ids, mask = obj['input_ids'], obj['attention_mask']

        # postprocess for the [Q] marker and the [MASK] augmentation
        ids[:, 1] = self.Q_marker_token_id
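        # padding positions (token id 0 for BERT) are turned into [MASK] tokens (query augmentation)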
        ids[ids == 0] = self.mask_token_id

        if bsize:
            batches = _split_into_batches(ids, mask, bsize)
            return batches

        return ids, mask
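The postprocessing in Example #2 can be reproduced outside the class with a plain HuggingFace tokenizer. The standalone sketch below is illustrative only: the model name, the choice of '[unused0]' as the [Q] marker, and query_maxlen = 32 are assumptions, not values taken from the examples.

import torch
from transformers import BertTokenizerFast

tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
query_maxlen = 32
Q_marker_token_id = tok.convert_tokens_to_ids('[unused0]')  # assumed choice of marker token

queries = ['. ' + q for q in ['what is late interaction retrieval?']]  # '.' placeholder for [Q]
obj = tok(queries, padding='max_length', truncation=True,
          return_tensors='pt', max_length=query_maxlen)

ids, mask = obj['input_ids'], obj['attention_mask']
ids[:, 1] = Q_marker_token_id      # replace the '.' placeholder with the [Q] marker
ids[ids == 0] = tok.mask_token_id  # pad positions (id 0 for BERT) become [MASK]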
Example #3
    def tensorize(self, batch_text, bsize=None):
        assert type(batch_text) in [list, tuple], (type(batch_text))

        # add a placeholder token for the [D] marker
        batch_text = ['. ' + x for x in batch_text]

        obj = self.tok(batch_text,
                       padding='longest',
                       truncation='longest_first',
                       return_tensors='pt',
                       max_length=self.doc_maxlen)

        ids, mask = obj['input_ids'], obj['attention_mask']

        # postprocess for the [D] marker
        ids[:, 1] = self.D_marker_token_id

        if bsize:
            ids, mask, reverse_indices = _sort_by_length(ids, mask, bsize)
            batches = _split_into_batches(ids, mask, bsize)
            return batches, reverse_indices

        return ids, mask
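Example #3 also calls a `_sort_by_length` helper that is not shown. A plausible sketch, assuming it orders documents by their non-padded length (so later sub-batches are more uniform) and returns the permutation needed to restore the original order:

import torch

def _sort_by_length(ids, mask, bsize):
    # nothing to gain from sorting when everything fits in a single sub-batch
    if ids.size(0) <= bsize:
        return ids, mask, torch.arange(ids.size(0))

    # sort documents by their number of real (non-padding) tokens
    indices = mask.sum(-1).sort().indices
    reverse_indices = indices.sort().indices  # permutation that undoes the sort

    return ids[indices], mask[indices], reverse_indices

Returning reverse_indices lets the caller restore the original document order after processing the length-sorted sub-batches.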