def _init_tensorizers(cls, config: Config, rank, world_size):
    tensorizers = {
        name: create_component(ComponentType.TENSORIZER, tensorizer_config)
        for name, tensorizer_config in config.model.inputs._asdict().items()
        if tensorizer_config
    }
    schema: Dict[str, Type] = {}
    for tensorizer in tensorizers.values():
        for name, type in tensorizer.column_schema:
            if name in schema and type != schema[name]:
                raise TypeError(f"Unexpected different types for column {name}")
            schema[name] = type
    # This initializes the tensorizers
    data = create_component(
        ComponentType.DATA_HANDLER,
        config.data,
        schema,
        tensorizers,
        rank=rank,
        world_size=world_size,
    )
    return tensorizers, data
def create_tensorizers(
    model_inputs: Union[BaseModel.Config.ModelInput, Dict[str, Tensorizer.Config]],
) -> Dict[str, Tensorizer]:
    if not isinstance(model_inputs, dict):
        model_inputs = model_inputs._asdict()
    tensorizers = {
        name: create_component(ComponentType.TENSORIZER, tensorizer_config)
        for name, tensorizer_config in model_inputs.items()
        if tensorizer_config
    }
    return tensorizers
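# For illustration, a minimal sketch of calling create_tensorizers with a plain
# dict of tensorizer configs. The config classes and their `column` fields are
# assumptions about the installed PyText version, not a verified API.
from pytext.data.tensorizers import LabelTensorizer, TokenTensorizer

model_inputs = {
    "tokens": TokenTensorizer.Config(column="text"),
    "labels": LabelTensorizer.Config(column="label"),
}
tensorizers = create_tensorizers(model_inputs)
# -> {"tokens": <TokenTensorizer>, "labels": <LabelTensorizer>}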
def batches(self, stage: Stage, rank=0, world_size=1, data_source=None):
    all_batches = {
        name: task.batches(stage, rank, world_size)
        for name, task in self.data_dict.items()
    }
    if stage == Stage.TRAIN:
        sampler = create_component(
            ComponentType.BATCH_SAMPLER, self.sampler_config, iterators=all_batches
        )
    else:
        sampler = EvalBatchSampler()
    for name, batch in sampler.batchify(all_batches):
        batch[BatchContext.TASK_NAME] = name
        yield batch
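# A self-contained sketch of the batchify() contract used above: given named
# batch iterators, yield (task_name, batch) pairs. This round-robin version is
# illustrative only, not PyText's actual RoundRobinBatchSampler implementation.
import itertools

def round_robin_batchify(iterators):
    # Drain the named iterators one batch at a time, round-robin, until all
    # are exhausted.
    its = {name: iter(it) for name, it in iterators.items()}
    order = itertools.cycle(list(its))
    while its:
        name = next(order)
        if name not in its:
            continue
        try:
            yield name, next(its[name])
        except StopIteration:
            del its[name]

batches = round_robin_batchify({"task_a": [1, 2, 3], "task_b": ["x"]})
assert list(batches) == [("task_a", 1), ("task_b", "x"), ("task_a", 2), ("task_a", 3)]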
def _init_model(cls, model_config, tensorizers, model_state=None):
    model_config.init_from_saved_state = model_state is not None
    model = create_component(
        ComponentType.MODEL, model_config, tensorizers=tensorizers
    )
    if model_state:
        print("Loading model from model state dict...")
        model.load_state_dict(model_state)
        print("Loaded!")
    if cuda.CUDA_ENABLED:
        model = model.cuda()
    return model
def from_config(
    cls,
    config: Config,
    schema: Dict[str, Type],
    tensorizers: Dict[str, Tensorizer],
    rank=0,
    world_size=1,
    **kwargs,
):
    data_source_cls = Registry.get(ComponentType.DATA_SOURCE, type(config.source))
    if issubclass(data_source_cls, ShardedDataSource):
        # The data source is already sharded; no need to wrap it in
        # RowShardedDataSource.
        data_source = create_component(
            ComponentType.DATA_SOURCE,
            config.source,
            schema,
            rank=rank,
            world_size=world_size,
        )
    else:
        unsharded_data_source = create_component(
            ComponentType.DATA_SOURCE, config.source, schema
        )
        data_source = RowShardedDataSource(
            data_source=unsharded_data_source, rank=rank, world_size=world_size
        )
    batcher = create_component(ComponentType.BATCHER, config.batcher)
    return cls(
        data_source,
        tensorizers,
        batcher=batcher,
        sort_key=config.sort_key,
        epoch_size=config.epoch_size,
        **kwargs,
    )
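# Plain-Python sketch of what row-level sharding means here, assuming
# RowShardedDataSource hands worker `rank` every world_size-th row (an
# assumption about its semantics, not verified against PyText's source).
def shard_rows(rows, rank, world_size):
    for i, row in enumerate(rows):
        if i % world_size == rank:
            yield row

assert list(shard_rows(range(10), rank=1, world_size=4)) == [1, 5, 9]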
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    base_tokenizer = None
    if config.base_tokenizer:
        base_tokenizer = create_component(
            ComponentType.TOKENIZER, config.base_tokenizer
        )
    with PathManager.open(config.vocab_file) as f:
        vocab = build_fairseq_vocab(
            vocab_file=f,
            special_token_replacements={
                "<pad>": PAD,
                "<s>": BOS,
                "</s>": EOS,
                "<unk>": UNK,
                "<mask>": MASK,
            },
        )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        base_tokenizer=base_tokenizer,
    )
def from_config(cls, config: Config, **kwargs):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    base_tokenizer = None
    if config.base_tokenizer:
        base_tokenizer = create_component(
            ComponentType.TOKENIZER, config.base_tokenizer
        )
    # Map to the real vocab_file
    config.vocab_file = (
        resources.roberta.RESOURCE_MAP[config.vocab_file]
        if config.vocab_file in resources.roberta.RESOURCE_MAP
        else config.vocab_file
    )
    with PathManager.open(config.vocab_file) as f:
        vocab = build_fairseq_vocab(
            vocab_file=f,
            special_token_replacements={
                "<pad>": SpecialTokens.PAD,
                "<s>": SpecialTokens.BOS,
                "</s>": SpecialTokens.EOS,
                "<unk>": SpecialTokens.UNK,
                "<mask>": SpecialTokens.MASK,
            },
            tokens_to_add=[SpecialTokens.SELFIE_RAW_IMAGE]
            if config.add_selfie_token
            else None,
        )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        base_tokenizer=base_tokenizer,
        **kwargs,
    )
def _get_data_source(test_path, source_config, field_names, task):
    if isinstance(task, NewDisjointMultitask):
        # Cannot easily specify a single data source for multitask
        assert not test_path
        data_source = None
    elif test_path and hasattr(source_config, "test_filename"):
        source_config.test_filename = test_path
        if field_names and hasattr(source_config, "field_names"):
            source_config.field_names = field_names
        data_source = create_component(
            ComponentType.DATA_SOURCE, source_config, task.data.data_source.schema
        )
    else:
        data_source = task.data.data_source
    return data_source
def from_config(
    cls,
    config: Config,
    data_dict: Dict[str, Data],
    task_key: str = BatchContext.TASK_NAME,
    rank=0,
    world_size=1,
    init_tensorizers=True,
):
    samplers = {
        Stage.TRAIN: create_component(ComponentType.BATCH_SAMPLER, config.sampler),
        Stage.EVAL: EvalBatchSampler(),
        Stage.TEST: EvalBatchSampler(),
    }
    return cls(data_dict, samplers, config.test_key, task_key)
def from_config(cls, config: Config, unused_metadata=None, model_state=None):
    tensorizers, data = NewTask._init_tensorizers(config)
    # Initialized tensorizers can be used to create the model
    model = NewTask._init_model(config.model, tensorizers, model_state)
    # This is the only place right now that the task actually cares about which
    # features and tensors are being used. This is a strong tie between
    # the implementation of the model and the metric reporter.
    metric_reporter = create_component(
        ComponentType.METRIC_REPORTER,
        config.metric_reporter,
        tensorizers=tensorizers,
    )
    trainer = create_trainer(config.trainer, model)
    return cls(data, model, metric_reporter, trainer)
def test_create_from_config(self):
    source_config = PandasDataSource.Config(
        train_df=pd.DataFrame({"c1": [10, 20, 30], "c2": [40, 50, 60]}),
        eval_df=pd.DataFrame({"c1": [11, 21, 31], "c2": [41, 51, 61]}),
        test_df=pd.DataFrame({"c1": [12, 22, 32], "c2": [42, 52, 62]}),
        column_mapping={"c1": "feature1", "c2": "feature2"},
    )
    ds = create_component(
        ComponentType.DATA_SOURCE,
        source_config,
        schema={"feature1": float, "feature2": float},
    )
    self.assertEqual({"feature1": 10, "feature2": 40}, next(iter(ds.train)))
    self.assertEqual({"feature1": 11, "feature2": 41}, next(iter(ds.eval)))
    self.assertEqual({"feature1": 12, "feature2": 42}, next(iter(ds.test)))
    self.assertEqual(3, len(list(ds.train)))
def create_task(
    task_config, metadata=None, model_state=None, tensorizers=None, rank=0, world_size=1
):
    """
    Create a task by finding the task class in the registry and invoking the
    from_config function of that class; see :meth:`~Task.from_config` for more
    details.
    """
    return create_component(
        ComponentType.TASK,
        task_config,
        metadata,
        model_state,
        tensorizers=tensorizers,
        rank=rank,
        world_size=world_size,
    )
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    vocab = build_fairseq_vocab(
        vocab_file=config.vocab_file,
        special_token_replacements={
            "<pad>": PAD,
            "<s>": BOS,
            "</s>": EOS,
            "<unk>": UNK,
        },
    )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
    )
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    return cls(
        columns=config.columns,
        tokenizer=tokenizer,
        vocab_file=config.vocab_file,
        is_fairseq=config.is_fairseq,
        pretraining=config.pretraining,
        max_seq_len=config.max_seq_len,
        max_vocab=config.max_vocab,
        min_count=config.min_count,
        language_columns=config.language_columns,
        lang2id=config.lang2id,
        reset_positions=config.reset_positions,
        has_language_in_data=config.has_language_in_data,
        use_language_embeddings=config.use_language_embeddings,
    )
def _create_tensorizers(cls, model_inputs_dict):
    if not isinstance(model_inputs_dict, dict):
        model_inputs_dict = model_inputs_dict._asdict()
    tensorizers = {
        name: create_component(ComponentType.TENSORIZER, tensorizer_config)
        for name, tensorizer_config in model_inputs_dict.items()
        if tensorizer_config
    }
    schema: Dict[str, Type] = {}
    for tensorizer in tensorizers.values():
        for name, type in tensorizer.column_schema:
            if name in schema and type != schema[name]:
                raise TypeError(
                    f"Unexpected different types for column {name}: "
                    f"{type} != {schema[name]}"
                )
            schema[name] = type
    return tensorizers, schema
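# Standalone sketch of the schema-merge rule used above (plain Python, no
# PyText): two tensorizers may declare the same column only if they agree on
# its type; a conflict raises TypeError.
schema = {}
column_schemas = [
    [("text", str)],                  # e.g. from a token tensorizer
    [("text", str), ("label", str)],  # e.g. a label tensorizer sharing "text"
]
for column_schema in column_schemas:
    for name, col_type in column_schema:
        if name in schema and col_type != schema[name]:
            raise TypeError(f"Unexpected different types for column {name}")
        schema[name] = col_type
assert schema == {"text": str, "label": str}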
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    vocab = build_fairseq_vocab(
        vocab_file=config.vocab_file,
        special_token_replacements={
            config.pad_token: PAD,
            config.bos_token: BOS,
            config.eos_token: EOS,
            config.unk_token: UNK,
        },
    )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        answers_column=config.answers_column,
        answer_starts_column=config.answer_starts_column,
    )
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    with PathManager.open(config.vocab_file) as file_path:
        vocab = build_fairseq_vocab(
            vocab_file=file_path,
            special_token_replacements={
                "<pad>": SpecialTokens.PAD,
                "<s>": SpecialTokens.BOS,
                "</s>": SpecialTokens.EOS,
                "<unk>": SpecialTokens.UNK,
                "<mask>": SpecialTokens.MASK,
            },
        )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
    )
def from_config(
    cls, task_config, metadata=None, model_state=None, rank=0, world_size=1
):
    data_dict = OrderedDict()
    models = OrderedDict()
    metric_reporters = OrderedDict()
    for name, task in task_config.tasks.items():
        tensorizers, data = NewTask._init_tensorizers(task, rank, world_size)
        data_dict[name] = data
        models[name] = NewTask._init_model(task.model, tensorizers)
        metric_reporters[name] = create_component(
            ComponentType.METRIC_REPORTER,
            task.metric_reporter,
            tensorizers=tensorizers,
        )
    task_weights = {
        task_name: task_config.task_weights.get(task_name, 1.0)
        for task_name in task_config.tasks.keys()
    }
    data = DisjointMultitaskData.from_config(
        task_config.data, data_dict=data_dict, rank=rank, world_size=world_size
    )
    model = NewDisjointMultitaskModel(models, loss_weights=task_weights)
    if model_state:
        model.load_state_dict(model_state)
    metric_reporter = DisjointMultitaskMetricReporter(
        metric_reporters,
        loss_weights=task_weights,
        target_task_name=task_config.target_task_name,
        use_subtask_select_metric=(
            task_config.metric_reporter.use_subtask_select_metric
        ),
    )
    trainer = create_trainer(task_config.trainer, model)
    return cls(data, model, metric_reporter, trainer)
def _init_tensorizers(cls, config: Config, tensorizers=None, rank=0, world_size=1):
    extra_schema = {}
    if hasattr(config.metric_reporter, "text_column_names"):
        extra_schema = {
            column: str for column in config.metric_reporter.text_column_names
        }
    if not tensorizers:
        tensorizers = create_tensorizers(config.model.inputs)
    schema = create_schema(tensorizers, extra_schema)
    # This initializes the tensorizers
    data = create_component(
        ComponentType.DATA_HANDLER,
        config.data,
        schema,
        tensorizers,
        rank=rank,
        world_size=world_size,
    )
    return tensorizers, data
def from_config(cls, config: Config, **kwargs):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    vocab = None
    if isinstance(tokenizer, WordPieceTokenizer):
        print("Using WordPieceTokenizer")
        replacements = {
            "[UNK]": UNK,
            "[PAD]": PAD,
            "[CLS]": BOS,
            "[SEP]": EOS,
            "[MASK]": MASK,
        }
        vocab = Vocabulary(
            [token for token, _ in tokenizer.vocab.items()],
            replacements=replacements,
        )
    doc_tensorizer = TokenTensorizer(
        text_column=config.doc_column,
        tokenizer=tokenizer,
        vocab=vocab,
        max_seq_len=config.max_doc_seq_len,
    )
    ques_tensorizer = TokenTensorizer(
        text_column=config.ques_column,
        tokenizer=tokenizer,
        vocab=vocab,
        max_seq_len=config.max_ques_seq_len,
    )
    return cls(
        doc_tensorizer=doc_tensorizer,
        ques_tensorizer=ques_tensorizer,
        doc_column=config.doc_column,
        ques_column=config.ques_column,
        answers_column=config.answers_column,
        answer_starts_column=config.answer_starts_column,
        tokenizer=tokenizer,
        vocab=vocab,
        **kwargs,
    )
def _init_tensorizers(cls, config: Config, rank, world_size):
    tensorizers, schema = NewTask._create_tensorizers(config.model.inputs)
    if hasattr(config.metric_reporter, "text_column_names"):
        for text_column in config.metric_reporter.text_column_names:
            if text_column in schema and schema[text_column] != str:
                raise TypeError(
                    f"Unexpected different types for column {text_column}: "
                    f"{str} != {schema[text_column]}"
                )
            schema[text_column] = str
    # This initializes the tensorizers
    data = create_component(
        ComponentType.DATA_HANDLER,
        config.data,
        schema,
        tensorizers,
        rank=rank,
        world_size=world_size,
    )
    return tensorizers, data
def from_config(cls, config: Config, **kwargs):
    """
    from_config parses the config associated with the tensorizer and creates
    both the tokenizer and the Vocabulary object. The extra arguments passed
    as kwargs allow us to reuse this function with a variable number of
    arguments (e.g. for classes that derive from this class).
    """
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    special_token_replacements = {
        "[UNK]": UNK,
        "[PAD]": PAD,
        "[CLS]": BOS,
        "[MASK]": MASK,
        "[SEP]": EOS,
    }
    if isinstance(tokenizer, WordPieceTokenizer):
        vocab = Vocabulary(
            [token for token, _ in tokenizer.vocab.items()],
            replacements=special_token_replacements,
        )
    else:
        # Map a named resource to the real vocab_file if one is given.
        config.vocab_file = (
            resources.roberta.RESOURCE_MAP[config.vocab_file]
            if config.vocab_file in resources.roberta.RESOURCE_MAP
            else config.vocab_file
        )
        with PathManager.open(config.vocab_file) as file_path:
            vocab = build_fairseq_vocab(
                dictionary_class=BertDictionary,
                vocab_file=file_path,
                special_token_replacements=special_token_replacements,
            )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        **kwargs,
    )
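# Self-contained toy of the kwargs-forwarding pattern the docstring above
# describes; Base/Derived and `extra_flag` are illustrative stand-ins, not
# PyText classes.
class Base:
    def __init__(self, columns, **kwargs):
        self.columns = columns
        self.extras = kwargs

    @classmethod
    def from_config(cls, config, **kwargs):
        # The shared parsing lives here; subclasses pass extra args through.
        return cls(columns=config["columns"], **kwargs)

class Derived(Base):
    @classmethod
    def from_config(cls, config, **kwargs):
        return super().from_config(config, extra_flag=config["extra_flag"], **kwargs)

d = Derived.from_config({"columns": ["text"], "extra_flag": True})
assert d.extras == {"extra_flag": True}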
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    vocab = build_fairseq_vocab(
        dictionary_class=MaskedLMDictionary,
        vocab_file=config.vocab_file,
        max_vocab=config.max_vocab,
        min_count=config.min_count,
        special_token_replacements={
            "<unk>": UNK,
            "<pad>": PAD,
            "</s>": EOS,
            "<mask>": MASK,
        },
    )
    return cls(
        columns=config.columns,
        vocab=vocab,
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len,
        language_column=config.language_column,
        lang2id=config.lang2id,
        use_language_embeddings=config.use_language_embeddings,
        has_language_in_data=config.has_language_in_data,
    )
def from_config(cls, config: Config):
    basic_tokenizer = create_component(
        ComponentType.TOKENIZER, config.basic_tokenizer
    )
    vocab = load_vocab(config.wordpiece_vocab_path)
    wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)
    return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
def from_config(
    cls, config: Config, schema: Dict[str, Type], tensorizers: Dict[str, Tensorizer]
):
    data_source = create_component(ComponentType.DATA_SOURCE, config.source, schema)
    batcher = create_component(ComponentType.BATCHER, config.batcher)
    return cls(data_source, tensorizers, batcher=batcher, sort_key=config.sort_key)
def from_config(cls, config: Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    return cls(config.text_column, config.dict_column, tokenizer)
def from_config(cls, config: Component.Config):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    return cls(
        config.slot_column, config.text_column, tokenizer, config.allow_unknown
    )
def create_task(task_config, metadata=None, model_state=None):
    """
    Create a task by finding the task class in the registry and invoking the
    from_config function of that class; see :meth:`~Task.from_config` for more
    details.
    """
    return create_component(ComponentType.TASK, task_config, metadata, model_state)
def create_metric_reporter(cls, config, tensorizers):
    return create_component(
        ComponentType.METRIC_REPORTER,
        config.metric_reporter,
        tensorizers=tensorizers,
    )
def _create_module_from_registry(module_config, *args, **kwargs):
    return create_component(ComponentType.MODULE, module_config, *args, **kwargs)
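# All of the helpers above funnel through the same registry-backed factory.
# Below is a minimal toy sketch of that pattern; it is illustrative only.
# PyText's real Registry keys components by (ComponentType, Config class)
# and is more involved than this.
_REGISTRY = {}

def register(config_cls):
    # Decorator mapping a config class to the component class it constructs.
    def decorator(component_cls):
        _REGISTRY[config_cls] = component_cls
        return component_cls
    return decorator

def create(config, *args, **kwargs):
    # Look up the component class by the config's type and let the component
    # build itself from its config via from_config.
    component_cls = _REGISTRY[type(config)]
    return component_cls.from_config(config, *args, **kwargs)

@register(type("ToyTokenizerConfig", (), {"lowercase": True}))
class ToyTokenizer:
    def __init__(self, lowercase):
        self.lowercase = lowercase

    @classmethod
    def from_config(cls, config):
        return cls(lowercase=config.lowercase)

config_cls = next(iter(_REGISTRY))
tok = create(config_cls())
assert tok.lowercase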