def __init__(
    self,
    train_filename: str,
    dev_filename: str,
    test_filename: str,
    tokenizers: Dict[str, BaseTokenizer] = None,
    namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
    namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
    batch_size: int = 10,
):
    self.train_filename = train_filename
    self.dev_filename = dev_filename
    self.test_filename = test_filename
    self.tokenizers = tokenizers or {
        "tokens": WordTokenizer(tokenizer="vanilla"),
        "char_tokens": CharacterTokenizer(),
    }
    self.namespace_vocab_options = namespace_vocab_options or {
        "char_tokens": {
            "start_token": " ",
            "end_token": " ",
            "pad_token": " ",
            "unk_token": " ",
        }
    }
    self.namespace_numericalizer_map = namespace_numericalizer_map or {
        "tokens": Numericalizer(),
        "char_tokens": Numericalizer(),
    }
    self.namespace_numericalizer_map["seq_label"] = Numericalizer()
    self.batch_size = batch_size

    self.train_dataset = SeqLabellingDataset(
        filename=self.train_filename, tokenizers=self.tokenizers
    )
    self.dev_dataset = SeqLabellingDataset(
        filename=self.dev_filename, tokenizers=self.tokenizers
    )
    self.test_dataset = SeqLabellingDataset(
        filename=self.test_filename, tokenizers=self.tokenizers
    )

    super(SeqLabellingDatasetManager, self).__init__(
        train_dataset=self.train_dataset,
        dev_dataset=self.dev_dataset,
        test_dataset=self.test_dataset,
        namespace_vocab_options=self.namespace_vocab_options,
        namespace_numericalizer_map=self.namespace_numericalizer_map,
        batch_size=batch_size,
    )
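# A minimal usage sketch for the sequence-labelling manager above. The file
# paths are placeholders and the import of SeqLabellingDatasetManager is
# assumed; the keyword arguments mirror the signature defined above.
seq_manager = SeqLabellingDatasetManager(
    train_filename="train.txt",
    dev_filename="dev.txt",
    test_filename="test.txt",
    batch_size=32,
)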
def get_numericalized_instances(get_preprocessed_instances):
    # pytest-style fixture: builds a vocabulary over the preprocessed
    # instances and numericalizes the first batch of them.
    instances, labels = get_preprocessed_instances
    MAX_NUM_WORDS = 3000
    MAX_LENGTH = 15
    vocab = Vocab(instances=instances, max_num_tokens=MAX_NUM_WORDS)
    vocab.build_vocab()
    numericalizer = Numericalizer(vocabulary=vocab)
    numericalized_instances = numericalizer.numericalize_batch_instances(
        instances[:32]
    )
    return {
        "numericalized_instances": numericalized_instances,
        "labels": labels,
        "max_length": MAX_LENGTH,
        "max_num_words": MAX_NUM_WORDS,
        "vocab": vocab,
    }
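# A standalone sketch of the Vocab -> Numericalizer pipeline exercised by the
# fixture above, using toy instances; Vocab and Numericalizer are assumed to
# be importable from the package under test.
toy_instances = [["hello", "world"], ["hello", "there"]]
toy_vocab = Vocab(instances=toy_instances, max_num_tokens=100)
toy_vocab.build_vocab()
toy_numericalizer = Numericalizer(vocabulary=toy_vocab)
toy_ids = toy_numericalizer.numericalize_batch_instances(toy_instances)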
def single_instance_setup(instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocabulary = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    numericalizer = Numericalizer(vocabulary=vocabulary)
    return single_instance, numericalizer, vocabulary
def __init__(
    self,
    train_filename: str,
    dev_filename: str,
    test_filename: str,
    tokenizers: Dict[str, BaseTokenizer] = None,
    namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
    namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
    batch_size: int = 10,
    column_names: List[str] = None,
    train_only: Optional[str] = None,
):
    self.train_filename = train_filename
    self.dev_filename = dev_filename
    self.test_filename = test_filename
    self.tokenizers = tokenizers or {
        "tokens": WordTokenizer(tokenizer="vanilla"),
        "char_tokens": CharacterTokenizer(),
    }

    # Merge user-passed vocab options over the per-namespace defaults, so a
    # partial override keeps the remaining default entries.
    if namespace_vocab_options is None:
        namespace_vocab_options = {}
    namespace_vocab_options_defaults = {
        "char_tokens": {
            "start_token": " ",
            "end_token": " ",
            "pad_token": " ",
            "unk_token": " ",
        }
    }
    self.namespace_vocab_options = {}
    vocab_namespaces = set(namespace_vocab_options.keys()).union(
        namespace_vocab_options_defaults.keys()
    )
    for namespace in vocab_namespaces:
        user_passed = namespace_vocab_options.get(namespace, {})
        defaults = namespace_vocab_options_defaults.get(namespace, {})
        self.namespace_vocab_options[namespace] = {**defaults, **user_passed}

    self.namespace_numericalizer_map = namespace_numericalizer_map or {
        "tokens": Numericalizer(),
        "char_tokens": Numericalizer(),
    }
    self.batch_size = batch_size

    if column_names is None:
        column_names = ["label_1"]
    # Only the first column is treated as a label namespace and given a
    # numericalizer; the full list is still forwarded to the datasets.
    valid_column_names = [column_names[0]]
    for column_name in valid_column_names:
        self.namespace_numericalizer_map[column_name] = Numericalizer()

    self.train_dataset = BioNerDataset(
        filename=self.train_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
        train_only=train_only,
    )
    self.dev_dataset = BioNerDataset(
        filename=self.dev_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
        train_only=train_only,
    )
    self.test_dataset = BioNerDataset(
        filename=self.test_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
        train_only=train_only,
    )

    super(BioNERDatasetManager, self).__init__(
        train_dataset=self.train_dataset,
        dev_dataset=self.dev_dataset,
        test_dataset=self.test_dataset,
        namespace_vocab_options=self.namespace_vocab_options,
        namespace_numericalizer_map=self.namespace_numericalizer_map,
        batch_size=batch_size,
    )
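# Hypothetical instantiation showing the per-namespace merge above: a partial
# char_tokens override is layered on top of the defaults rather than replacing
# them. The paths, column name, and max_num_tokens option are assumptions.
bio_manager = BioNERDatasetManager(
    train_filename="train.conll",
    dev_filename="dev.conll",
    test_filename="test.conll",
    namespace_vocab_options={"char_tokens": {"max_num_tokens": 1000}},
    column_names=["label_1"],
)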
def __init__(
    self,
    train_filename: str,
    dev_filename: str,
    test_filename: str,
    tokenizers: Dict[str, BaseTokenizer] = None,
    namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
    namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
    batch_size: int = 10,
    column_names: List[str] = None,
):
    self.train_filename = train_filename
    self.dev_filename = dev_filename
    self.test_filename = test_filename
    self.tokenizers = tokenizers or {
        "tokens": WordTokenizer(tokenizer="vanilla"),
        "char_tokens": CharacterTokenizer(),
    }

    namespace_vocab_options_defaults = {
        "char_tokens": {
            "start_token": " ",
            "end_token": " ",
            "pad_token": " ",
            "unk_token": " ",
        }
    }
    if namespace_vocab_options is None:
        namespace_vocab_options = {}
    # Start from a copy of the defaults, then overlay the user-passed options;
    # iterating over the user dict (rather than the defaults) ensures that
    # namespaces absent from the defaults are not silently dropped.
    self.namespace_vocab_options = copy.deepcopy(namespace_vocab_options_defaults)
    for namespace, user_passed in namespace_vocab_options.items():
        defaults = self.namespace_vocab_options.get(namespace, {})
        self.namespace_vocab_options[namespace] = {**defaults, **user_passed}

    self.namespace_numericalizer_map = namespace_numericalizer_map or {
        "tokens": Numericalizer(),
        "char_tokens": Numericalizer(),
    }
    self.batch_size = batch_size

    if column_names is None:
        column_names = ["NER"]
    for column_name in column_names:
        self.namespace_numericalizer_map[column_name] = Numericalizer()

    self.train_dataset = ConllYagoDataset(
        filename=self.train_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
    )
    self.dev_dataset = ConllYagoDataset(
        filename=self.dev_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
    )
    self.test_dataset = ConllYagoDataset(
        filename=self.test_filename,
        tokenizers=self.tokenizers,
        column_names=column_names,
    )

    super(ConllYagoDatasetsManager, self).__init__(
        train_dataset=self.train_dataset,
        dev_dataset=self.dev_dataset,
        test_dataset=self.test_dataset,
        namespace_vocab_options=self.namespace_vocab_options,
        namespace_numericalizer_map=self.namespace_numericalizer_map,
        batch_size=batch_size,
    )
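# Sketch of constructing the CoNLL-YAGO manager with an explicit label column;
# the filenames are placeholders and "NER" matches the default used above.
conll_manager = ConllYagoDatasetsManager(
    train_filename="train.txt",
    dev_filename="dev.txt",
    test_filename="test.txt",
    column_names=["NER"],
)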
def __init__(
    self,
    train_filename: str,
    dev_filename: str,
    test_filename: str,
    tokenizers: Dict[str, BaseTokenizer] = None,
    namespace_vocab_options: Dict[str, Dict[str, Any]] = None,
    namespace_numericalizer_map: Dict[str, BaseNumericalizer] = None,
    batch_size: int = 10,
):
    """
    Parameters
    ----------
    train_filename: str
        The path where the train file is stored
    dev_filename: str
        The path where the dev file is stored
    test_filename: str
        The path where the test file is stored
    tokenizers: Dict[str, BaseTokenizer]
        A mapping from namespace to its tokenizer
    namespace_vocab_options: Dict[str, Dict[str, Any]]
        A mapping from namespace to its vocabulary options
    namespace_numericalizer_map: Dict[str, BaseNumericalizer]
        Every namespace can have a different numericalizer specified
    batch_size: int
        The batch size of the data returned
    """
    self.train_filename = train_filename
    self.dev_filename = dev_filename
    self.test_filename = test_filename
    self.tokenizers = tokenizers or {
        "tokens": WordTokenizer(),
        "char_tokens": CharacterTokenizer(),
    }
    self.namespace_vocab_options = namespace_vocab_options or {
        "char_tokens": {
            "start_token": " ",
            "end_token": " ",
            "pad_token": " ",
            "unk_token": " ",
        },
        "label": {"include_special_vocab": False},
    }
    self.namespace_numericalizer_map = namespace_numericalizer_map or {
        "tokens": Numericalizer(),
        "char_tokens": Numericalizer(),
    }
    self.namespace_numericalizer_map["label"] = Numericalizer()
    self.batch_size = batch_size

    self.train_dataset = TextClassificationDataset(
        filename=self.train_filename, tokenizers=self.tokenizers
    )
    self.dev_dataset = TextClassificationDataset(
        filename=self.dev_filename, tokenizers=self.tokenizers
    )
    self.test_dataset = TextClassificationDataset(
        filename=self.test_filename, tokenizers=self.tokenizers
    )

    super(TextClassificationDatasetManager, self).__init__(
        train_dataset=self.train_dataset,
        dev_dataset=self.dev_dataset,
        test_dataset=self.test_dataset,
        namespace_vocab_options=self.namespace_vocab_options,
        namespace_numericalizer_map=self.namespace_numericalizer_map,
        batch_size=batch_size,
    )
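# Usage sketch for the text-classification manager documented above (file
# paths are placeholders). Note that, unlike the per-namespace merge in the
# NER managers, `namespace_vocab_options or {...}` replaces the defaults
# wholesale, so the char_tokens entries are repeated alongside the label
# override.
clf_manager = TextClassificationDatasetManager(
    train_filename="train.csv",
    dev_filename="dev.csv",
    test_filename="test.csv",
    namespace_vocab_options={
        "char_tokens": {
            "start_token": " ",
            "end_token": " ",
            "pad_token": " ",
            "unk_token": " ",
        },
        "label": {"include_special_vocab": False},
    },
)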