def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    with PathManager.open(config.vocab_file) as file_path:
        vocab = build_fairseq_vocab(
            dictionary_class=MaskedLMDictionary,
            vocab_file=file_path,
            max_vocab=config.max_vocab,
            min_count=config.min_count,
            special_token_replacements={
                "<unk>": UNK,
                "<pad>": PAD,
                "</s>": EOS,
                "<mask>": MASK,
            },
        )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        language_column=config.language_column,
        lang2id=config.lang2id,
        use_language_embeddings=config.use_language_embeddings,
        has_language_in_data=config.has_language_in_data,
    )
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    base_tokenizer = None
    if config.base_tokenizer:
        base_tokenizer = create_component(
            ComponentType.TOKENIZER, config.base_tokenizer
        )
    # map to the real vocab_file
    config.vocab_file = (
        resources.roberta.RESOURCE_MAP[config.vocab_file]
        if config.vocab_file in resources.roberta.RESOURCE_MAP
        else config.vocab_file
    )
    with PathManager.open(config.vocab_file) as f:
        vocab = build_fairseq_vocab(
            vocab_file=f,
            special_token_replacements={
                "<pad>": PAD,
                "<s>": BOS,
                "</s>": EOS,
                "<unk>": UNK,
                "<mask>": MASK,
            },
        )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        base_tokenizer=base_tokenizer,
    )
def __init__(
    self,
    vocab_path: Optional[str] = None,
    vocab_list: Optional[List[str]] = None,
    special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
    add_bos: bool = False,
    add_eos: bool = False,
    max_seq_len: int = 2**30,
):
    super().__init__()
    assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
    assert not (
        vocab_path and vocab_list
    ), "vocab_path and vocab_list are mutually exclusive"
    if vocab_list:
        self.vocab = ScriptVocabulary(vocab_list)
    else:
        with PathManager.open(vocab_path) as f:
            vocab = build_fairseq_vocab(
                f, special_token_replacements=special_token_replacements
            )
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(-1),
            bos_idx=vocab.get_bos_index(-1),
            eos_idx=vocab.get_eos_index(-1),
            unk_idx=vocab.get_unk_index(-1),
            unk_token=vocab.unk_token,
        )
    # TODO T77728853: truncation and BOS/EOS insertion impact each other, so
    # they must be combined here; find a nicer way, as this can't be chained.
    self.add_bos = add_bos
    self.add_eos = add_eos
    # Reserve room for the BOS/EOS tokens within max_seq_len when enabled
    self.truncate_transform = TruncateTransform(max_seq_len - add_bos - add_eos)
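# A minimal, self-contained sketch (not part of the original transform) of why
# the constructor above passes `max_seq_len - add_bos - add_eos` to
# TruncateTransform: truncation has to leave room for the markers appended
# afterwards. The helper name and token strings below are hypothetical.
def _sketch_truncate_then_wrap(tokens, max_seq_len, add_bos, add_eos,
                               bos="<s>", eos="</s>"):
    # Reserve one slot per enabled marker, mirroring the budget computed above
    budget = max_seq_len - int(add_bos) - int(add_eos)
    out = tokens[:budget]
    if add_bos:
        out = [bos] + out
    if add_eos:
        out = out + [eos]
    return out  # len(out) <= max_seq_len always holds


# Example: with max_seq_len=5 and both markers on, only 3 content tokens survive
assert _sketch_truncate_then_wrap(list("abcdef"), 5, True, True) == \
    ["<s>", "a", "b", "c", "</s>"]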
def __init__(
    self,
    vocab_path: Optional[str] = None,
    vocab_list: Optional[List[str]] = None,
):
    super().__init__()
    assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
    assert not (
        vocab_path and vocab_list
    ), "vocab_path and vocab_list are mutually exclusive"
    if vocab_list:
        self.vocab = ScriptVocabulary(vocab_list)
    else:
        with PathManager.open(vocab_path) as f:
            # Map BERT-style special tokens onto the canonical special tokens
            special_token_replacements = {
                "[UNK]": UNK,
                "[PAD]": PAD,
                "[CLS]": BOS,
                "[MASK]": MASK,
                "[SEP]": EOS,
            }
            vocab = build_fairseq_vocab(
                f, special_token_replacements=special_token_replacements
            )
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(-1),
            bos_idx=vocab.get_bos_index(-1),
            eos_idx=vocab.get_eos_index(-1),
            unk_idx=vocab.get_unk_index(-1),
        )
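# Hedged illustration (a hypothetical helper, not the real build_fairseq_vocab)
# of what special_token_replacements does in the constructors above: surface
# forms read from the vocab file (e.g. BERT's "[CLS]") are swapped for the
# canonical special-token names so downstream lookups are uniform across models.
def _sketch_apply_replacements(vocab_tokens, replacements):
    return [replacements.get(tok, tok) for tok in vocab_tokens]


# Example: a BERT-style vocab normalized to canonical names
print(_sketch_apply_replacements(
    ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "the", "dog"],
    {"[UNK]": "<unk>", "[PAD]": "<pad>", "[CLS]": "<s>", "[SEP]": "</s>"},
))  # ['<pad>', '<unk>', '<s>', '</s>', 'the', 'dog']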
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    config.vocab_file = (
        resources.roberta.RESOURCE_MAP[config.vocab_file]
        if config.vocab_file in resources.roberta.RESOURCE_MAP
        else config.vocab_file
    )
    with PathManager.open(config.vocab_file) as file_path:
        vocab = build_fairseq_vocab(
            vocab_file=file_path,
            special_token_replacements={
                "<pad>": SpecialTokens.PAD,
                "<s>": SpecialTokens.BOS,
                "</s>": SpecialTokens.EOS,
                "<unk>": SpecialTokens.UNK,
                "<mask>": SpecialTokens.MASK,
            },
        )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        answers_column=config.answers_column,
        answer_starts_column=config.answer_starts_column,
    )
def __init__(
    self,
    vocab_path: Optional[str] = None,
    vocab_list: Optional[List[str]] = None,
    special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
):
    super().__init__()
    assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
    assert not (
        vocab_path and vocab_list
    ), "vocab_path and vocab_list are mutually exclusive"
    if vocab_list:
        self.vocab = ScriptVocabulary(vocab_list)
    else:
        with PathManager.open(vocab_path) as f:
            vocab = build_fairseq_vocab(
                f, special_token_replacements=special_token_replacements
            )
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(-1),
            bos_idx=vocab.get_bos_index(-1),
            eos_idx=vocab.get_eos_index(-1),
            unk_idx=vocab.get_unk_index(-1),
            unk_token=vocab.unk_token,
        )
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    vocab = build_fairseq_vocab(
        vocab_file=config.vocab_file,
        special_token_replacements={
            "<pad>": PAD,
            "<s>": BOS,
            "</s>": EOS,
            "<unk>": UNK,
        },
    )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
    )
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    vocab = build_fairseq_vocab(
        vocab_file=config.vocab_file,
        special_token_replacements={
            config.pad_token: PAD,
            config.bos_token: BOS,
            config.eos_token: EOS,
            config.unk_token: UNK,
        },
    )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        answers_column=config.answers_column,
        answer_starts_column=config.answer_starts_column,
    )
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    with PathManager.open(config.vocab_file) as file_path:
        vocab = build_fairseq_vocab(
            vocab_file=file_path,
            special_token_replacements={
                "<pad>": SpecialTokens.PAD,
                "<s>": SpecialTokens.BOS,
                "</s>": SpecialTokens.EOS,
                "<unk>": SpecialTokens.UNK,
                "<mask>": SpecialTokens.MASK,
            },
        )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
    )
def from_config(cls, config: Config, **kwargs):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    base_tokenizer = None
    if config.base_tokenizer:
        base_tokenizer = create_component(
            ComponentType.TOKENIZER, config.base_tokenizer
        )
    # map to the real vocab_file
    config.vocab_file = (
        resources.roberta.RESOURCE_MAP[config.vocab_file]
        if config.vocab_file in resources.roberta.RESOURCE_MAP
        else config.vocab_file
    )
    with PathManager.open(config.vocab_file) as f:
        vocab = build_fairseq_vocab(
            vocab_file=f,
            special_token_replacements={
                "<pad>": SpecialTokens.PAD,
                "<s>": SpecialTokens.BOS,
                "</s>": SpecialTokens.EOS,
                "<unk>": SpecialTokens.UNK,
                "<mask>": SpecialTokens.MASK,
            },
            tokens_to_add=[SpecialTokens.SELFIE_RAW_IMAGE]
            if config.add_selfie_token
            else None,
        )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        base_tokenizer=base_tokenizer,
        **kwargs,
    )
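# Hedged sketch of the RESOURCE_MAP redirect used by several from_config
# methods above: a short symbolic vocab name is resolved to a concrete
# registered path if present, otherwise the value is treated as a literal
# file path. The map contents below are made up for illustration only.
_SKETCH_RESOURCE_MAP = {"roberta-base-vocab": "/checkpoints/roberta/vocab.bpe"}


def _sketch_resolve_vocab_file(vocab_file):
    return _SKETCH_RESOURCE_MAP.get(vocab_file, vocab_file)


assert _sketch_resolve_vocab_file("roberta-base-vocab") == \
    "/checkpoints/roberta/vocab.bpe"
assert _sketch_resolve_vocab_file("/my/own/vocab.txt") == "/my/own/vocab.txt"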