Example #1
def post_processor(self, tokenizer):
    # Note: seq_a/seq_b (with $0/$1) is an older pre-release TemplateProcessing
    # signature; current tokenizers releases expose single/pair instead.
    return TemplateProcessing(
        seq_a=["$0", "</s>"],
        seq_b=["$1", "</s>"],
        special_tokens=[
            ("</s>", tokenizer.get_vocab()["</s>"]),
        ],
    )
Example #2
def load_tokenizer(tokenizer_path):
    tokenizer = Tokenizer.from_file(tokenizer_path)
    tokenizer.post_processor = TemplateProcessing(
        single='[CLS] $A [SEP]',
        pair='[CLS] $A [SEP] $B:1 [SEP]:1',
        special_tokens=[('[CLS]', 1), ('[SEP]', 2)],
    )
    return tokenizer
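
A brief usage sketch for load_tokenizer above; the file name is a placeholder, and it assumes the saved vocabulary really maps [CLS] to 1 and [SEP] to 2 as the hard-coded template expects:

# Hypothetical usage; "tokenizer.json" stands in for a real trained tokenizer file.
tok = load_tokenizer("tokenizer.json")
enc = tok.encode("first sentence", "second sentence")
print(enc.tokens)    # [CLS] ... [SEP] ... [SEP]
print(enc.type_ids)  # 0s for the first segment, 1s for the $B:1 segment and its [SEP]:1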
Example #3
def post_processor(self, tokenizer):
    eos = self.original_tokenizer.eos_token
    return TemplateProcessing(
        seq_a=["$0", eos],
        seq_b=["$1", eos],
        special_tokens=[
            (eos, tokenizer.get_vocab()[eos]),
        ],
    )
Example #4
def tokenize(dt, df):
    import json
    from pathlib import Path

    import numpy as np

    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer
    # get_data_source and data_dir are assumed to be defined elsewhere in this module.

    #print(df.head())
    #print(df.query_text.head())
    #print(df.query_text.to_list())
    #exit(0)
    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    #print(df.query_text.to_list())
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))
    #bert_tokenizer.save_model(directory=data_dir,name='tokenizer')
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'],
                    as_index=False,
                    sort=False)['query_text'].agg(
                        ' '.join).drop(columns='separate_column')
    #df = df.groupby('session_id').agg({'query_text':' '.join}).reset_index()
    df.query_text.to_csv(corpus_file, header=False, index=False)
    with open(token_file) as token_f:
        jdata = json.load(token_f)
        with open(vocab_file, "w") as fd:
            for k in jdata['model']['vocab'].keys():
                print(k, file=fd)
Example #5
    def test_instantiate(self):
        bert = self.get_bert()
        assert bert is not None
        assert isinstance(bert, PostProcessor)
        assert isinstance(bert, TemplateProcessing)
        assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)

        # It is absolutely legal to have tokens with spaces in the name:
        processor = TemplateProcessing(
            single=["[ C L S ]", "Token with space"],
            special_tokens=[("[ C L S ]", 0), ("Token with space", 1)],
        )
        # Sequence identifiers must be well formed:
        with pytest.raises(Exception, match="Cannot build Piece"):
            processor = TemplateProcessing(single="[CLS] $$ [SEP]")
        with pytest.raises(Exception, match="Cannot build Piece"):
            processor = TemplateProcessing(single="[CLS] $A: [SEP]")
        # Special tokens must be provided when used in template:
        with pytest.raises(Exception, match="Missing SpecialToken\(s\) with id\(s\)"):
            processor = TemplateProcessing(single=["[CLS]"])
Example #6
    def __init__(
        self,
        replacement: str = "▁",
        add_prefix_space: bool = True,
        unk_token: Union[str, AddedToken] = "<unk>",
        eos_token: Union[str, AddedToken] = "</s>",
        pad_token: Union[str, AddedToken] = "<pad>",
    ):
        self.special_tokens = {
            "pad": {"id": 0, "token": pad_token},
            "eos": {"id": 1, "token": eos_token},
            "unk": {"id": 2, "token": unk_token},
        }

        self.special_tokens_list = [None] * len(self.special_tokens)
        for token_dict in self.special_tokens.values():
            self.special_tokens_list[token_dict["id"]] = token_dict["token"]

        tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = normalizers.Sequence(
            [
                normalizers.Nmt(),
                normalizers.NFKC(),
                normalizers.Replace(Regex(" {2,}"), " "),
                normalizers.Lowercase(),
            ]
        )
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
                pre_tokenizers.Digits(individual_digits=True),
                pre_tokenizers.Punctuation(),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)

        tokenizer.post_processor = TemplateProcessing(
            single=f"$A {self.special_tokens['eos']['token']}",
            special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
        )

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
Example #7
def _prepare_pipeline(self):
    self.tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    self.tokenizer.pre_tokenizer = Whitespace()
    self.tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    self.tokenizer.enable_padding(
        pad_id=self.__class__.SPECIAL_TOKENS.index("[PAD]"),
        pad_token="[PAD]")
Example #8
def wordpiece_tokenize(line):
    # wordpiece_dict3 is an external token-to-id vocabulary defined elsewhere;
    # its ids must match the hard-coded [CLS]=1 and [SEP]=2 below.
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return output.ids
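
Since wordpiece_tokenize rebuilds the tokenizer on every call, here is a minimal sketch of the same pipeline constructed once and reused (assuming, as above, that wordpiece_dict3 is a token-to-id dict in which [CLS] is 1 and [SEP] is 2):

# Sketch only: build the tokenizer a single time at module level and reuse it.
_tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
_tokenizer.enable_padding(length=200)
_tokenizer.enable_truncation(max_length=200)
_tokenizer.pre_tokenizer = WhitespaceSplit()
_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

def wordpiece_tokenize_reused(line):
    return _tokenizer.encode(line).ids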
Example #9
    def configure_tokenizers(self, padding, truncation, max_length, lower):
        # Settings
        pad_length = None
        if padding in {True, "longest"}:
            pass
        elif padding in {"max_length"}:
            pad_length = max_length
        elif padding in {False, "do_not_pad"}:
            pass
        else:
            raise ValueError("Unknown padding type")

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(
            tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (Needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD))
            ],
        )
        self.tokenizer.post_processor = basic_template

        if padding:
            self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id(
                self.PAD_WORD),
                                          pad_token=self.PAD_WORD,
                                          length=pad_length)
        if truncation:
            self.tokenizer.enable_truncation(max_length,
                                             stride=0,
                                             strategy='longest_first')
Example #10
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
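
A short usage sketch for train_tokenizer; the sentences and file name are placeholders. The hard-coded ids 1 and 2 in the template line up with the special_tokens order passed to WordPieceTrainer, which assigns [UNK]=0, [CLS]=1, [SEP]=2:

# Hypothetical call; any iterable of strings can serve as training data.
sentences = ["a tiny example corpus", "another training sentence"]
tok = train_tokenizer(sentences, serialize_path="bert_like_tokenizer.json", vocab_size=8000)
enc = tok.encode("a tiny test")
print(enc.tokens)  # wrapped as [CLS] ... [SEP] by the template above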
Example #11
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )

    from tokenizers import decoders
    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)
    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
Example #12
    def __init__(
        self,
        batch_size: int = 1,
        val_batch_size: int = None,
        dataset=None,
        languages=None,
        tokenizer: Tokenizer = None,
        device='cpu',
    ):
        super(WMT20DataModule, self).__init__()

        self.batch_size = batch_size
        self.val_batch_size = val_batch_size if val_batch_size is not None else batch_size

        if dataset is None:
            raise ValueError(f"dataset is required for {self}")
        self.dataset = dataset
        if languages is None:
            raise ValueError(f"languages is required for {self}")
        self.languages = languages
        self.tokenizer = tokenizer
        pad_token = "[PAD]"
        self.tokenizer.enable_padding(pad_id=tokenizer.token_to_id(pad_token),
                                      pad_token=pad_token)

        translate_postprocessor = TemplateProcessing(
            single="[TRANSLATE] $0 [SEP]",
            special_tokens=[("[TRANSLATE]",
                             tokenizer.token_to_id('[TRANSLATE]')),
                            ("[SEP]", tokenizer.token_to_id('[SEP]'))],
        )

        tokenizer.post_processor = translate_postprocessor

        self.device = device

        return
Example #13
    parser.add_argument(
        "--dataset",
        type=str,
        default="/data/nv419/VQG_DATA/processed/iq_dataset.hdf5")
    parser.add_argument(
        "--val_dataset",
        type=str,
        default="/data/nv419/VQG_DATA/processed/iq_val_dataset.hdf5")

    args = parser.parse_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.device = device

    # Note: assigning .post_processor on the slow BertTokenizer has no effect; to
    # actually apply the template it would have to be set on a fast tokenizer's
    # backend (e.g. BertTokenizerFast.from_pretrained(...).backend_tokenizer), and
    # bert-base-uncased maps [CLS]/[SEP] to 101/102 rather than 1/2.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
    )

    data_loader = get_loader(os.path.join(os.getcwd(), args.dataset),
                             tokenizer,
                             args.batch_size,
                             shuffle=True,
                             num_workers=8)
    val_data_loader = get_loader(os.path.join(os.getcwd(), args.val_dataset),
                                 tokenizer,
                                 args.batch_size,
                                 shuffle=False,
                                 num_workers=8)

    trainVQG = TrainVQG(args, tokenizer)  # .to(device)
Example #14
def get_bert(self):
    return TemplateProcessing(
        seq_a=["[CLS]", "$0", "[SEP]"],
        seq_b=["$1", "[SEP]"],
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )
Example #15
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing

t = Tokenizer(WordLevel(unk_token="[UNK]"))
t.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"])
t.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    # pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 2),
        ("[SEP]", 3),
    ])

files = ['tok-train-shuf-tgt.tsv']
t.train(files, trainer)

t.save("code_tokenizer.json")
Example #16
    def test_quicktour(self, doc_wiki_tokenizer):
        def print(*args, **kwargs):
            pass

        try:
            # START reload_tokenizer
            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
            # END reload_tokenizer
        except Exception:
            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)
        # START encode
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        # END encode
        # START print_tokens
        print(output.tokens)
        # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
        # END print_tokens
        assert output.tokens == [
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
        ]
        # START print_ids
        print(output.ids)
        # [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
        # END print_ids
        assert output.ids == [
            27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35
        ]
        # START print_offsets
        print(output.offsets[9])
        # (26, 27)
        # END print_offsets
        assert output.offsets[9] == (26, 27)
        # START use_offsets
        sentence = "Hello, y'all! How are you 😁 ?"
        sentence[26:27]
        # "😁"
        # END use_offsets
        assert sentence[26:27] == "😁"
        # START check_sep
        tokenizer.token_to_id("[SEP]")
        # 2
        # END check_sep
        assert tokenizer.token_to_id("[SEP]") == 2
        # START init_template_processing
        from tokenizers.processors import TemplateProcessing

        tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", tokenizer.token_to_id("[CLS]")),
                ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ],
        )
        # END init_template_processing
        # START print_special_tokens
        output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
        print(output.tokens)
        # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
        # END print_special_tokens
        assert output.tokens == [
            "[CLS]",
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
            "[SEP]",
        ]
        # START print_special_tokens_pair
        output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
        print(output.tokens)
        # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
        # END print_special_tokens_pair
        assert output.tokens == [
            "[CLS]",
            "Hello",
            ",",
            "y",
            "'",
            "all",
            "!",
            "[SEP]",
            "How",
            "are",
            "you",
            "[UNK]",
            "?",
            "[SEP]",
        ]
        # START print_type_ids
        print(output.type_ids)
        # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        # END print_type_ids
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
        # START encode_batch
        output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
        # END encode_batch
        # START encode_batch_pair
        output = tokenizer.encode_batch(
            [["Hello, y'all!", "How are you 😁 ?"],
             ["Hello to you too!", "I'm fine, thank you!"]])
        # END encode_batch_pair
        # START enable_padding
        tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
        # END enable_padding
        # START print_batch_tokens
        output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
        print(output[1].tokens)
        # ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
        # END print_batch_tokens
        assert output[1].tokens == [
            "[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"
        ]
        # START print_attention_mask
        print(output[1].attention_mask)
        # [1, 1, 1, 1, 1, 1, 1, 0]
        # END print_attention_mask
        assert output[1].attention_mask == [1, 1, 1, 1, 1, 1, 1, 0]
Example #17
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt'
paths = [str(x) for x in Path(".").glob('data/feature_sequence/*.txt')]

tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()
# trainer = trainers.BpeTrainer(
trainer = trainers.WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# Note: train(trainer, files) is the old (pre-0.9) argument order; recent tokenizers
# releases expect tokenizer.train(files, trainer=trainer). Also, a WordPieceTrainer is
# paired here with a WordLevel model; a WordLevelTrainer would match the model.
tokenizer.train(trainer, [uid_task_id_sequence_path])
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# tokenizer.save_model("tmp")
tokenizer.model.save('data/bert_and_tokenizer', 'uid_task_id')

# tokenizer = ByteLevelBPETokenizer(
#     "./tmp/vocab.json",
#     "./tmp/merges.txt",
# )

# vocabulary size of the task ids
task_id_vocab_size = 6033
config = BertConfig(
Example #18
def get_roberta(self):
    return TemplateProcessing(
        seq_a="<s> $0 </s>",
        seq_b="</s> $0 </s>",
        special_tokens=[("<s>", 0), ("</s>", 1)],
    )
Example #19
def get_bert(self):
    return TemplateProcessing(
        single=["[CLS]", "$0", "[SEP]"],
        pair=["[CLS]", "$A", "[SEP]", "$B:1", "[SEP]:1"],
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )
Example #20
    def __getitem__(self, item):
        """
		Args:
			item: int, idx
		Returns:
			tokens: tokens of query + context, [seq_len]
			token_type_ids: token type ids, 0 for query, 1 for context, [seq_len]
			start_labels: start labels of NER in tokens, [seq_len]
			end_labels: end labels of NER in tokens, [seq_len]
			label_mask: label mask, 1 for counting into loss, 0 for ignoring. [seq_len]
			match_labels: match labels, [seq_len, seq_len]
			sample_idx: sample id
			label_idx: label id

		"""
        cls_tok = "[CLS]"
        sep_tok = "[SEP]"
        if 'roberta' in self.args.bert_config_dir:
            cls_tok = "<s>"
            sep_tok = "</s>"

        # begin{get the label2idx dictionary}
        label2idx = {}
        label2idx_list = self.args.label2idx_list
        for labidx in label2idx_list:
            lab, idx = labidx
            label2idx[lab] = int(idx)
        # print('label2idx: ',label2idx)
        # end{get the label2idx dictionary}

        # begin{get the morph2idx dictionary}
        morph2idx = {}
        morph2idx_list = self.args.morph2idx_list
        for morphidx in morph2idx_list:
            morph, idx = morphidx
            morph2idx[morph] = int(idx)
        # end{get the morph2idx dictionary}

        data = self.all_data[item]
        tokenizer = self.tokenzier

        # AutoTokenizer(self.args.bert_config_dir)

        qas_id = data.get("qas_id", "0.0")
        sample_idx, label_idx = qas_id.split(".")

        sample_idx = torch.LongTensor([int(sample_idx)])
        label_idx = torch.LongTensor([int(label_idx)])

        query = data["query"]
        context = data["context"].strip()
        # use independent checks so every cleanup below can apply
        if '\u200b' in context:
            context = context.replace('\u200b', '')
        if '\ufeff' in context:
            context = context.replace('\ufeff', '')
        if '  ' in context:
            context = context.replace('  ', ' ')

        span_position_label = data["span_position_label"]
        # context = "Japan -DOCSTART- began the defence of their Asian Cup on Friday ."

        start_positions = []
        end_positions = []

        for seidx, label in span_position_label.items():
            sidx, eidx = seidx.split(';')
            start_positions.append(int(sidx))
            end_positions.append(int(eidx))

        # add space offsets
        words = context.split()

        # convert the span position into the character index, space is also a position.
        pos_start_positions = start_positions
        pos_end_positions = end_positions

        pos_span_idxs = []
        for sidx, eidx in zip(pos_start_positions, pos_end_positions):
            pos_span_idxs.append((sidx, eidx))

        # all span (sidx, eidx)
        all_span_idxs = enumerate_spans(context.split(),
                                        offset=0,
                                        max_span_width=self.args.max_span_len)
        # get the span-length of each span

        # begin{compute the span weight}
        all_span_weights = []

        for span_idx in all_span_idxs:
            weight = self.args.neg_span_weight
            if span_idx in pos_span_idxs:
                weight = 1.0
            all_span_weights.append(weight)
        # end{compute the span weight}

        all_span_lens = []
        for idxs in all_span_idxs:
            sid, eid = idxs
            slen = eid - sid + 1
            all_span_lens.append(slen)

        morph_idxs = self.case_feature_tokenLevel(morph2idx, all_span_idxs,
                                                  words,
                                                  self.args.max_span_len)

        if 'roberta' in self.args.bert_config_dir:

            tokenizer.post_processor = TemplateProcessing(
                single="<s> $A </s>",
                pair="<s> $A </s> $B:1 </s>:1",
                special_tokens=[
                    ("<s>", tokenizer.token_to_id("<s>")),
                    ("</s>", tokenizer.token_to_id("</s>")),
                ],
            )
            tokenizer._tokenizer.post_processor = BertProcessing(
                ("</s>", tokenizer.token_to_id("</s>")),
                ("<s>", tokenizer.token_to_id("<s>")),
            )
            p1 = tokenizer.token_to_id("<s>")
            p2 = tokenizer.token_to_id("</s>")
            print("p1", p1)
            print("p2", p2)

        query_context_tokens = tokenizer.encode(context,
                                                add_special_tokens=True)
        tokens = query_context_tokens.ids  # subword index
        type_ids = query_context_tokens.type_ids  # the split of two sentence on the subword-level, 0 for first sent, 1 for the second sent
        offsets = query_context_tokens.offsets  # the subword's start-index and end-idx of the character-level.

        # print("current sent: ", context)
        all_span_idxs_ltoken, all_span_word, all_span_idxs_new_label = self.convert2tokenIdx(
            words, tokens, type_ids, offsets, all_span_idxs,
            span_position_label)
        pos_span_idxs_ltoken, pos_span_word, pos_span_idxs_new_label = self.convert2tokenIdx(
            words, tokens, type_ids, offsets, pos_span_idxs,
            span_position_label)

        span_label_ltoken = []
        for seidx_str, label in all_span_idxs_new_label.items():
            span_label_ltoken.append(label2idx[label])
        '''
		an example of tokens, type_ids, and offsets value.
		inputs: 
			query = "you are beautiful ."
			context = 'i love you .'

		outputs:
			tokens:  [101, 2017, 2024, 3376, 1012, 102, 1045, 2293, 2017, 1012, 102]
			type_ids:  [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
			offsets:  [(0, 0), (0, 3), (4, 7), (8, 17), (18, 19), (0, 0), (0, 1), (2, 6), (7, 10), (11, 12), (0, 0)]
			query_context_tokens.tokens: ['[CLS]', 'you', 'are', 'beautiful', '.', '[SEP]', 'i', 'love', 'you', '.', '[SEP]']
			query_context_tokens.words:  [None, 0, 1, 2, 3, None, 0, 1, 2, 3, None]
		'''

        # # the max-end-index should not exceed the max-length.
        # all_span_idxs_ltoken

        # return  tokens, type_ids, all_span_idxs_ltoken, pos_span_mask_ltoken
        # truncate
        tokens = tokens[:self.max_length]
        type_ids = type_ids[:self.max_length]
        all_span_idxs_ltoken = all_span_idxs_ltoken[:self.max_num_span]
        # pos_span_mask_ltoken = pos_span_mask_ltoken[:self.max_num_span]
        span_label_ltoken = span_label_ltoken[:self.max_num_span]
        all_span_lens = all_span_lens[:self.max_num_span]
        morph_idxs = morph_idxs[:self.max_num_span]
        all_span_weights = all_span_weights[:self.max_num_span]

        # make sure last token is [SEP]
        sep_token = tokenizer.token_to_id(sep_tok)
        if tokens[-1] != sep_token:
            assert len(tokens) == self.max_length
            tokens = tokens[:-1] + [sep_token]

        # padding to the max length.
        import numpy as np
        real_span_mask_ltoken = np.ones_like(span_label_ltoken)
        if self.pad_to_maxlen:
            tokens = self.pad(tokens, 0)
            type_ids = self.pad(type_ids, 1)
            all_span_idxs_ltoken = self.pad(all_span_idxs_ltoken,
                                            value=(0, 0),
                                            max_length=self.max_num_span)
            # pos_span_mask_ltoken = self.pad(pos_span_mask_ltoken,value=0,max_length=self.max_num_span)
            real_span_mask_ltoken = self.pad(real_span_mask_ltoken,
                                             value=0,
                                             max_length=self.max_num_span)
            span_label_ltoken = self.pad(span_label_ltoken,
                                         value=0,
                                         max_length=self.max_num_span)
            all_span_lens = self.pad(all_span_lens,
                                     value=0,
                                     max_length=self.max_num_span)
            morph_idxs = self.pad(morph_idxs,
                                  value=0,
                                  max_length=self.max_num_span)
            all_span_weights = self.pad(all_span_weights,
                                        value=0,
                                        max_length=self.max_num_span)

        tokens = torch.LongTensor(tokens)
        type_ids = torch.LongTensor(
            type_ids)  # use to split the first and second sentence.
        all_span_idxs_ltoken = torch.LongTensor(all_span_idxs_ltoken)
        # pos_span_mask_ltoken = torch.LongTensor(pos_span_mask_ltoken)
        real_span_mask_ltoken = torch.LongTensor(real_span_mask_ltoken)
        span_label_ltoken = torch.LongTensor(span_label_ltoken)
        all_span_lens = torch.LongTensor(all_span_lens)
        morph_idxs = torch.LongTensor(morph_idxs)
        # print("all_span_weights: ",all_span_weights)
        all_span_weights = torch.Tensor(all_span_weights)

        min_idx = np.max(np.array(all_span_idxs_ltoken))

        return [
            tokens,
            type_ids,  # use to split the first and second sentence.
            all_span_idxs_ltoken,
            morph_idxs,
            span_label_ltoken,
            all_span_lens,
            all_span_weights,

            # pos_span_mask_ltoken,
            real_span_mask_ltoken,
            words,
            all_span_word,
            all_span_idxs,
        ]
Example #21
def get_roberta(self):
    return TemplateProcessing(
        single="<s> $0 </s>",
        pair="<s> $A </s> </s> $B </s>",
        special_tokens=[("<s>", 0), ("</s>", 1)],
    )