class Config(Tensorizer.Config):
    #: The name of the slot label column to parse from the data source.
    slot_column: str = "slots"
    #: The name of the text column to parse from the data source.
    #: We need this to be able to generate tensors which correspond to input text.
    text_column: str = "text"
    #: The tokenizer to use to split input text into tokens. This should be
    #: configured in a way which yields tokens consistent with the tokens input to
    #: or output by a model, so that the labels generated by this tensorizer
    #: will match the indices of the model's tokens.
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    #: Whether to allow for unknown labels at test/prediction time.
    allow_unknown: bool = False

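# Hedged usage sketch (not part of the listing above): SlotLabelTensorizer and
# its import path are assumptions based on the PyText from_config pattern shown
# in the SquadTensorizer test further below; only the Config fields come from
# this listing.
from pytext.data.tensorizers import SlotLabelTensorizer

slot_labels = SlotLabelTensorizer.from_config(
    SlotLabelTensorizer.Config(
        slot_column="slots",
        text_column="text",
        allow_unknown=True,  # tolerate labels unseen at training time
    )
)
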
class Config(Tensorizer.Config): column: str = "text_seq" max_seq_len: Optional[int] = None #: sentence markers add_bos_token: bool = False add_eos_token: bool = False use_eos_token_for_bos: bool = False #: list markers add_bol_token: bool = False add_eol_token: bool = False use_eol_token_for_bol: bool = False #: The tokenizer to use to split input text into tokens. tokenizer: Tokenizer.Config = Tokenizer.Config()
class Config(BERTTensorizer.Config):
    vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    is_fairseq: bool = False
    pretraining: bool = False
    max_seq_len: Optional[int] = 256
    max_vocab: int = 95000
    min_count: int = 0
    language_columns: List[str] = ["language"]
    lang2id: Dict[str, int] = DEFAULT_LANG2ID_DICT
    reset_positions: bool = False
    has_language_in_data: bool = False
    use_language_embeddings: bool = True

class Config(Tensorizer.Config):
    #: The name of the text column to parse from the data source.
    column: str = "text"
    #: The tokenizer to use to split input text into tokens.
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    #: The max token length for input text.
    max_seq_len: Optional[int] = None
    #: The max byte length for a token.
    max_byte_len: int = 15
    #: Offset to add to all non-padding bytes.
    offset_for_non_padding: int = 0
    add_bos_token: bool = False
    add_eos_token: bool = False
    use_eos_token_for_bos: bool = False

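# Hedged usage sketch: ByteTokenTensorizer is an assumed name for the class that
# owns the Config above; only the fields themselves come from this listing.
from pytext.data.tensorizers import ByteTokenTensorizer

byte_tokens = ByteTokenTensorizer.from_config(
    ByteTokenTensorizer.Config(
        column="text",
        max_byte_len=15,           # each token is padded/truncated to 15 bytes
        offset_for_non_padding=1,  # shift real byte values so 0 can mean padding
    )
)
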
class Config(BERTTensorizerBase.Config):
    vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    max_vocab: int = 95000
    min_count: int = 0
    # Name of the column from which to read a row's language during numberize.
    language_column: str = "language"
    # Language-to-id mapping used to obtain language embeddings.
    lang2id: Dict[str, int] = LANG2ID_15
    # Controls whether the language is read from the data file (which is what
    # happens for fine-tuning) or added during processing (which is what
    # happens during pretraining).
    has_language_in_data: bool = False
    # Controls whether we train with language embeddings.
    use_language_embeddings: bool = True

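# Hedged sketch of a fine-tuning setup in which every row carries its own
# language tag; XLMTensorizer and its import path are assumptions for the owner
# of the Config above, and the vocab path is hypothetical (it must exist locally).
from pytext.data.xlm_tensorizer import XLMTensorizer

xlm_config = XLMTensorizer.Config(
    vocab_file="xlm_vocab.txt",    # hypothetical local path
    language_column="language",
    has_language_in_data=True,     # read the language from the data file
    use_language_embeddings=True,
)
xlm = XLMTensorizer.from_config(xlm_config)
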
class Config(Tensorizer.Config):
    #: The name of the text column to parse from the data source.
    column: str = "text"
    #: The tokenizer to use to split input text into tokens.
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    add_bos_token: bool = False
    add_eos_token: bool = False
    use_eos_token_for_bos: bool = False
    max_seq_len: Optional[int] = None
    #: If False, will not create token vocab during initialization. The vocab
    #: will need to be set during model initialization (e.g. see WordEmbedding).
    build_vocab: bool = True
    vocab_file: str = ""
    #: The number of lines from the provided vocab_file to add to the
    #: overall vocab.
    vocab_file_size_limit: int = 0

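# Hedged sketch (TokenTensorizer is an assumed owner of the Config above):
# add tokens from a fixed vocab file on top of the vocab built from data; the
# file path and size limit are hypothetical values.
from pytext.data.tensorizers import TokenTensorizer

tokens = TokenTensorizer.from_config(
    TokenTensorizer.Config(
        column="text",
        add_bos_token=True,
        add_eos_token=True,
        vocab_file="vocab.txt",       # hypothetical path
        vocab_file_size_limit=50000,  # only the first 50k lines are added
    )
)
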
def setUp(self):
    self.data_source = SquadDataSource.from_config(
        SquadDataSource.Config(
            train_filename=tests_module.test_file("squad_tiny.json"),
            eval_filename=None,
            test_filename=None,
        )
    )
    self.tensorizer_with_wordpiece = SquadTensorizer.from_config(
        SquadTensorizer.Config(
            tokenizer=WordPieceTokenizer.Config(
                wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
            ),
            max_seq_len=250,
        )
    )
    self.tensorizer_with_alphanumeric = SquadTensorizer.from_config(
        SquadTensorizer.Config(
            tokenizer=Tokenizer.Config(split_regex=r"\W+"),
            max_seq_len=250,
        )
    )

class Config(Tensorizer.Config): text_column: str = "text" dict_column: str = "dict" #: tokenizer to split text and create dict tensors of the same size. tokenizer: Tokenizer.Config = Tokenizer.Config()
class Config(Tensorizer.Config):
    # BERT-style models support multiple text inputs.
    columns: List[str] = ["text"]
    tokenizer: Tokenizer.Config = Tokenizer.Config()
    vocab_file: str = ""
    max_seq_len: int = 256

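# Hedged sketch: BERTTensorizer is an assumed owner of the Config above; the
# import path, column names, and vocab path are assumptions, not taken from
# this listing. Only the Config is built here, since from_config would need a
# real vocab file on disk.
from pytext.data.bert_tensorizer import BERTTensorizer

bert_config = BERTTensorizer.Config(
    columns=["question", "context"],  # hypothetical column names
    vocab_file="bert_vocab.txt",      # hypothetical path
    max_seq_len=256,
)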