Code example #1
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     with PathManager.open(config.vocab_file) as file_path:
         vocab = build_fairseq_vocab(
             dictionary_class=MaskedLMDictionary,
             vocab_file=file_path,
             max_vocab=config.max_vocab,
             min_count=config.min_count,
             special_token_replacements={
                 "<unk>": UNK,
                 "<pad>": PAD,
                 "</s>": EOS,
                 "<mask>": MASK,
             },
         )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         language_column=config.language_column,
         lang2id=config.lang2id,
         use_language_embeddings=config.use_language_embeddings,
         has_language_in_data=config.has_language_in_data,
     )
Code example #2
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        base_tokenizer = None
        if config.base_tokenizer:
            base_tokenizer = create_component(
                ComponentType.TOKENIZER, config.base_tokenizer
            )

        # map to the real vocab_file
        config.vocab_file = (
            resources.roberta.RESOURCE_MAP[config.vocab_file]
            if config.vocab_file in resources.roberta.RESOURCE_MAP
            else config.vocab_file
        )
        with PathManager.open(config.vocab_file) as f:
            vocab = build_fairseq_vocab(
                vocab_file=f,
                special_token_replacements={
                    "<pad>": PAD,
                    "<s>": BOS,
                    "</s>": EOS,
                    "<unk>": UNK,
                    "<mask>": MASK,
                },
            )
        return cls(
            columns=config.columns,
            vocab=vocab,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            base_tokenizer=base_tokenizer,
        )
Code example #3
    def __init__(
        self,
        vocab_path: Optional[str] = None,
        vocab_list: Optional[List[str]] = None,
        special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
        add_bos: bool = False,
        add_eos: bool = False,
        max_seq_len: int = 2**30,
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (vocab_path and vocab_list
                    ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements)
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                    unk_token=vocab.unk_token,
                )
        # TODO T77728853 We need to combine truncate with BOS/EOS as they impact each other
        # Need to find a nicer way to do this, as this can't be chained.
        self.add_bos = add_bos
        self.add_eos = add_eos
        # Make room for bos and eos from max_seq_len if true
        self.truncate_transform = TruncateTransform(max_seq_len - add_bos -
                                                    add_eos)
Code example #4
File: transforms.py  Project: terrorizer1980/pytext
    def __init__(
        self, vocab_path: Optional[str] = None, vocab_list: Optional[List[str]] = None
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                special_token_replacements = {
                    "[UNK]": UNK,
                    "[PAD]": PAD,
                    "[CLS]": BOS,
                    "[MASK]": MASK,
                    "[SEP]": EOS,
                }
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements
                )
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                )
Code example #5
    def from_config(cls, config: Config):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)

        config.vocab_file = (resources.roberta.RESOURCE_MAP[config.vocab_file]
                             if config.vocab_file
                             in resources.roberta.RESOURCE_MAP else
                             config.vocab_file)
        with PathManager.open(config.vocab_file) as file_path:
            vocab = build_fairseq_vocab(
                vocab_file=file_path,
                special_token_replacements={
                    "<pad>": SpecialTokens.PAD,
                    "<s>": SpecialTokens.BOS,
                    "</s>": SpecialTokens.EOS,
                    "<unk>": SpecialTokens.UNK,
                    "<mask>": SpecialTokens.MASK,
                },
            )
        return cls(
            columns=config.columns,
            vocab=vocab,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            answers_column=config.answers_column,
            answer_starts_column=config.answer_starts_column,
        )
Code example #6
    def __init__(
        self,
        vocab_path: Optional[str] = None,
        vocab_list: Optional[List[str]] = None,
        special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements
                )
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                    unk_token=vocab.unk_token,
                )
Code example #7
File: roberta_tensorizer.py  Project: jessemin/pytext
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     vocab = build_fairseq_vocab(
         vocab_file=config.vocab_file,
         special_token_replacements={
             "<pad>": PAD,
             "<s>": BOS,
             "</s>": EOS,
             "<unk>": UNK,
         },
     )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
     )
Code example #8
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     vocab = build_fairseq_vocab(
         vocab_file=config.vocab_file,
         special_token_replacements={
             config.pad_token: PAD,
             config.bos_token: BOS,
             config.eos_token: EOS,
             config.unk_token: UNK,
         },
     )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         answers_column=config.answers_column,
         answer_starts_column=config.answer_starts_column,
     )
Code example #9
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     with PathManager.open(config.vocab_file) as file_path:
         vocab = build_fairseq_vocab(
             vocab_file=file_path,
             special_token_replacements={
                 "<pad>": SpecialTokens.PAD,
                 "<s>": SpecialTokens.BOS,
                 "</s>": SpecialTokens.EOS,
                 "<unk>": SpecialTokens.UNK,
                 "<mask>": SpecialTokens.MASK,
             },
         )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
     )
Code example #10
    def from_config(cls, config: Config, **kwargs):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        base_tokenizer = None
        if config.base_tokenizer:
            base_tokenizer = create_component(
                ComponentType.TOKENIZER, config.base_tokenizer
            )

        # map to the real vocab_file
        config.vocab_file = (
            resources.roberta.RESOURCE_MAP[config.vocab_file]
            if config.vocab_file in resources.roberta.RESOURCE_MAP
            else config.vocab_file
        )
        with PathManager.open(config.vocab_file) as f:
            vocab = build_fairseq_vocab(
                vocab_file=f,
                special_token_replacements={
                    "<pad>": SpecialTokens.PAD,
                    "<s>": SpecialTokens.BOS,
                    "</s>": SpecialTokens.EOS,
                    "<unk>": SpecialTokens.UNK,
                    "<mask>": SpecialTokens.MASK,
                },
                tokens_to_add=[SpecialTokens.SELFIE_RAW_IMAGE]
                if config.add_selfie_token
                else None,
            )
        return cls(
            columns=config.columns,
            vocab=vocab,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            base_tokenizer=base_tokenizer,
            **kwargs,
        )
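
Taken together, these examples share one recipe: open a fairseq-format dictionary file, call build_fairseq_vocab with a mapping from the file's special-token strings to PyText's special tokens, and, for the TorchScript transforms, wrap the result in a ScriptVocabulary. The sketch below is not taken from the repository; it strings that recipe together end to end on a throwaway two-token dictionary. The import paths are assumptions (these symbols have moved between PyText modules across versions), so adjust them to your installed version.

# Minimal sketch, not from the pytext repository.
# The import paths below are assumptions and may differ between PyText versions.
from pytext.data.utils import BOS, EOS, PAD, UNK, build_fairseq_vocab
from pytext.torchscript.vocab import ScriptVocabulary

# A fairseq dictionary file lists one "<token> <count>" pair per line;
# the special symbols (<pad>, <s>, </s>, <unk>) are added by fairseq itself.
with open("vocab.txt", "w") as f:
    f.write("hello 120\nworld 87\n")

with open("vocab.txt") as f:
    vocab = build_fairseq_vocab(
        vocab_file=f,
        special_token_replacements={
            "<pad>": PAD,
            "<s>": BOS,
            "</s>": EOS,
            "<unk>": UNK,
        },
    )

# Wrap the result in a TorchScript-friendly vocabulary, as examples #3, #4, and #6 do.
script_vocab = ScriptVocabulary(
    list(vocab),
    pad_idx=vocab.get_pad_index(-1),
    bos_idx=vocab.get_bos_index(-1),
    eos_idx=vocab.get_eos_index(-1),
    unk_idx=vocab.get_unk_index(-1),
)
# Out-of-vocabulary tokens map to the unk index.
print(script_vocab.lookup_indices_1d(["hello", "world", "never-seen"]))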