Example 1
 class Config(Tensorizer.Config):
     #: The name of the slot label column to parse from the data source.
     slot_column: str = "slots"
     #: The name of the text column to parse from the data source.
     #: We need this to be able to generate tensors which correspond to input text.
     text_column: str = "text"
     #: The tokenizer to use to split input text into tokens. This should be
     #: configured in a way which yields tokens consistent with the tokens input to
     #: or output by a model, so that the labels generated by this tensorizer
     #: will match the indices of the model's tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     #: Whether to allow for unknown labels at test/prediction time
     allow_unknown: bool = False
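A Config like this is normally consumed through the from_config pattern shown in Example 7 below. As a minimal sketch, assuming the owning class is a slot-label tensorizer along the lines of PyText's SlotLabelTensorizer (the class and column names below are illustrative, not taken from this listing), the defaults could be overridden like so:

 # Sketch only: class and column names are assumptions; from_config and
 # Tokenizer.Config(split_regex=...) follow the usage shown in Example 7.
 config = SlotLabelTensorizer.Config(
     slot_column="slot_labels",      # hypothetical column name
     text_column="utterance",        # hypothetical column name
     tokenizer=Tokenizer.Config(split_regex=r"\s+"),
     allow_unknown=True,
 )
 tensorizer = SlotLabelTensorizer.from_config(config)

As the doc comment above notes, the tokenizer config should match the tokenization the model sees, since the labels this tensorizer emits are aligned by token index.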
Example 2
 class Config(Tensorizer.Config):
     column: str = "text_seq"
     max_seq_len: Optional[int] = None
     #: Sentence boundary markers.
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
     #: List boundary markers.
     add_bol_token: bool = False
     add_eol_token: bool = False
     use_eol_token_for_bol: bool = False
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
Example 3
 class Config(BERTTensorizer.Config):
     vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     is_fairseq: bool = False
     pretraining: bool = False
     max_seq_len: Optional[int] = 256
     max_vocab: int = 95000
     min_count: int = 0
     # Columns holding the language identifier for a row of data
     language_columns: List[str] = ["language"]
     # Language-to-id mapping used to obtain language embeddings
     lang2id: Dict[str, int] = DEFAULT_LANG2ID_DICT
     reset_positions: bool = False
     # Whether the language is read from the data file (finetuning) or added
     # during processing (pretraining)
     has_language_in_data: bool = False
     # Whether to train with language embeddings
     use_language_embeddings: bool = True
Example 4
 class Config(Tensorizer.Config):
     #: The name of the text column to parse from the data source.
     column: str = "text"
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     #: The max token length for input text.
     max_seq_len: Optional[int] = None
     #: The max byte length for a token.
     max_byte_len: int = 15
     #: Offset to add to all non-padding bytes
     offset_for_non_padding: int = 0
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
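This config drives byte-level token features; presumably offset_for_non_padding exists so that real byte values can be shifted away from the padding value. A short sketch under that assumption, with the class name assumed (e.g. PyText's ByteTokenTensorizer):

 # Sketch only: class name assumed; the offset comment states an assumption,
 # not something confirmed by this listing.
 config = ByteTokenTensorizer.Config(
     column="text",
     max_seq_len=128,
     max_byte_len=15,             # max byte length for a token
     offset_for_non_padding=1,    # assumed: shifts real bytes so 0 stays free for padding
 )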
Example 5
 class Config(BERTTensorizerBase.Config):
     vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     max_vocab: int = 95000
     min_count: int = 0
     # language identifiers for extracting the language from a row of data
     # during numberize
     language_column: str = "language"
     # language-to-id mapping used to obtain language embeddings
     lang2id: Dict[str, int] = LANG2ID_15
     # Controls whether language is being read from the data file (which
     # is what happens for finetuning) or being added during processing
     # (which is what happens during pretraining)
     has_language_in_data: bool = False
     # controls whether we train with language embeddings or not
     use_language_embeddings: bool = True
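To make the fine-tuning vs. pretraining distinction in the comments concrete, here is a hedged sketch of a fine-tuning setup where the language is read from each data row. The class name is assumed (e.g. PyText's XLMTensorizer), and the mapping below is an illustrative stand-in, not the real LANG2ID_15 constant:

 # Sketch only: class name and lang2id contents are assumptions.
 config = XLMTensorizer.Config(
     language_column="language",
     lang2id={"en": 0, "fr": 1, "de": 2},   # stand-in; not the real LANG2ID_15
     has_language_in_data=True,             # fine-tuning: language comes from the data file
     use_language_embeddings=True,
 )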
Example 6
 class Config(Tensorizer.Config):
     #: The name of the text column to parse from the data source.
     column: str = "text"
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
     max_seq_len: Optional[int] = None
     #: If False, will not create token vocab during initialization. The vocab will
     #: need to be set during model initialization (e.g. see WordEmbedding)
     build_vocab: bool = True
     vocab_file: str = ""
     #: The number of lines in the above provided vocab_file to add to the
     #: overall vocab
     vocab_file_size_limit: int = 0
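Per the comments above, the vocab is either built during initialization (build_vocab=True, the default) or supplied later at model initialization (e.g. by WordEmbedding), and entries from a pre-built vocab_file can be added to the overall vocab, capped by vocab_file_size_limit. A sketch of the file-based variant, with the class name assumed (e.g. PyText's TokenTensorizer) and the path hypothetical:

 # Sketch only: class name and path are assumptions.
 config = TokenTensorizer.Config(
     column="text",
     vocab_file="/path/to/vocab.txt",   # hypothetical path
     vocab_file_size_limit=50000,       # only the first 50k lines are added to the vocab
 )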
Example 7
 def setUp(self):
     self.data_source = SquadDataSource.from_config(
         SquadDataSource.Config(
             train_filename=tests_module.test_file("squad_tiny.json"),
             eval_filename=None,
             test_filename=None,
         )
     )
     self.tensorizer_with_wordpiece = SquadTensorizer.from_config(
         SquadTensorizer.Config(
             tokenizer=WordPieceTokenizer.Config(
                 wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
             ),
             max_seq_len=250,
         )
     )
     self.tensorizer_with_alphanumeric = SquadTensorizer.from_config(
         SquadTensorizer.Config(
             tokenizer=Tokenizer.Config(split_regex=r"\W+"),
             max_seq_len=250,
         )
     )
Example 8
 class Config(Tensorizer.Config):
     text_column: str = "text"
     dict_column: str = "dict"
     #: The tokenizer to use to split the text and create dict tensors of the same size.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
Example 9
 class Config(Tensorizer.Config):
     # BERT style models support multiple text inputs
     columns: List[str] = ["text"]
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     vocab_file: str = ""
     max_seq_len: int = 256
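Since BERT-style models support multiple text inputs (per the comment above), a sentence-pair task only needs two entries in columns. A sketch with assumed class and column names (e.g. PyText's BERTTensorizer, whose Config Example 3 extends):

 # Sketch only: class name, column names, and path are assumptions.
 config = BERTTensorizer.Config(
     columns=["question", "context"],        # hypothetical column names for a pair task
     tokenizer=Tokenizer.Config(),
     vocab_file="/path/to/bert_vocab.txt",   # hypothetical path
     max_seq_len=256,
 )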