Example #1
    def _init_tensorizers(cls, config: Config, rank, world_size):
        tensorizers = {
            name: create_component(ComponentType.TENSORIZER, tensorizer_config)
            for name, tensorizer_config in config.model.inputs._asdict().items()
            if tensorizer_config
        }
        schema: Dict[str, Type] = {}
        for tensorizer in tensorizers.values():
            for name, col_type in tensorizer.column_schema:
                if name in schema and col_type != schema[name]:
                    raise TypeError(
                        f"Unexpected different types for column {name}: "
                        f"{col_type} != {schema[name]}"
                    )
                schema[name] = col_type

        # This initializes the tensorizers
        data = create_component(
            ComponentType.DATA_HANDLER,
            config.data,
            schema,
            tensorizers,
            rank=rank,
            world_size=world_size,
        )
        return tensorizers, data
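
A minimal standalone sketch of the schema-merging loop above. FakeTensorizer and merge_schemas are hypothetical stand-ins; only the merge/conflict logic is taken from the example:

from typing import Dict, List, Tuple, Type

class FakeTensorizer:
    """Hypothetical stand-in exposing only the column_schema attribute."""

    def __init__(self, column_schema: List[Tuple[str, Type]]):
        self.column_schema = column_schema

def merge_schemas(tensorizers: Dict[str, FakeTensorizer]) -> Dict[str, Type]:
    schema: Dict[str, Type] = {}
    for tensorizer in tensorizers.values():
        for name, col_type in tensorizer.column_schema:
            # Two tensorizers may share a column only if they agree on its type.
            if name in schema and col_type != schema[name]:
                raise TypeError(
                    f"Unexpected different types for column {name}: "
                    f"{col_type} != {schema[name]}"
                )
            schema[name] = col_type
    return schema

merged = merge_schemas({
    "tokens": FakeTensorizer([("text", str)]),
    "labels": FakeTensorizer([("text", str), ("label", str)]),
})
assert merged == {"text": str, "label": str}
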
Example #2
def create_tensorizers(
    model_inputs: Union[BaseModel.Config.ModelInput, Dict[str,
                                                          Tensorizer.Config]],
) -> Dict[str, Tensorizer]:
    if not isinstance(model_inputs, dict):
        model_inputs = model_inputs._asdict()

    tensorizers = {
        name: create_component(ComponentType.TENSORIZER, tensorizer_config)
        for name, tensorizer_config in model_inputs.items()
        if tensorizer_config
    }

    return tensorizers
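
A rough usage sketch of the filtering above: inputs whose config is None are skipped, and a NamedTuple-style ModelInput is converted via _asdict(). The ModelInput class and fake_create below are illustrative stand-ins, not PyText APIs:

from typing import NamedTuple, Optional

class ModelInput(NamedTuple):
    # Hypothetical inputs: one configured tensorizer and one disabled input.
    tokens: Optional[str] = "TokenTensorizer.Config"
    labels: Optional[str] = None

def fake_create(config):
    # Stands in for create_component(ComponentType.TENSORIZER, config).
    return f"tensorizer({config})"

model_inputs = ModelInput()
if not isinstance(model_inputs, dict):
    model_inputs = model_inputs._asdict()
tensorizers = {name: fake_create(cfg) for name, cfg in model_inputs.items() if cfg}
assert list(tensorizers) == ["tokens"]  # the None-configured input is skipped
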
Example #3
    def batches(self, stage: Stage, rank=0, world_size=1, data_source=None):
        all_batches = {
            name: task.batches(stage, rank, world_size)
            for name, task in self.data_dict.items()
        }
        if stage == Stage.TRAIN:
            sampler = create_component(
                ComponentType.BATCH_SAMPLER, self.sampler_config, iterators=all_batches
            )
        else:
            sampler = EvalBatchSampler(all_batches)

        for name, batch in sampler.batchify(all_batches):
            batch[BatchContext.TASK_NAME] = name
            yield batch
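
To illustrate the batchify contract the loop above relies on, a toy round-robin sampler that yields (task_name, batch) pairs from per-task iterators; PyText's actual samplers differ:

def round_robin_batchify(iterators):
    # Cycle through the per-task iterators, dropping each one as it is exhausted.
    iters = {name: iter(it) for name, it in iterators.items()}
    while iters:
        for name in list(iters):
            try:
                yield name, next(iters[name])
            except StopIteration:
                del iters[name]

out = list(round_robin_batchify({"a": [1, 2], "b": [10]}))
assert out == [("a", 1), ("b", 10), ("a", 2)]
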
Example #4
    def _init_model(cls, model_config, tensorizers, model_state=None):
        model_config.init_from_saved_state = model_state is not None
        model = create_component(
            ComponentType.MODEL, model_config, tensorizers=tensorizers
        )
        if model_state:
            print("Loading model from model state dict...")
            model.load_state_dict(model_state)
            print("Loaded!")

        if cuda.CUDA_ENABLED:
            model = model.cuda()

        return model
Example #5
    def from_config(
        cls,
        config: Config,
        schema: Dict[str, Type],
        tensorizers: Dict[str, Tensorizer],
        rank=0,
        world_size=1,
        **kwargs,
    ):
        data_source_cls = Registry.get(ComponentType.DATA_SOURCE,
                                       type(config.source))
        if issubclass(data_source_cls, ShardedDataSource):
            # the data source is already sharded; no need to wrap it in RowShardedDataSource
            data_source = create_component(
                ComponentType.DATA_SOURCE,
                config.source,
                schema,
                rank=rank,
                world_size=world_size,
            )
        else:
            unsharded_data_source = create_component(ComponentType.DATA_SOURCE,
                                                     config.source, schema)
            data_source = RowShardedDataSource(
                data_source=unsharded_data_source,
                rank=rank,
                world_size=world_size)

        batcher = create_component(ComponentType.BATCHER, config.batcher)
        return cls(
            data_source,
            tensorizers,
            batcher=batcher,
            sort_key=config.sort_key,
            epoch_size=config.epoch_size,
            **kwargs,
        )
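
A standalone sketch of the row-sharding fallback above: each worker keeps every world_size-th row starting at its own rank. RowShardedDataSource's real implementation may differ; this only shows the intended partitioning:

def shard_rows(rows, rank, world_size):
    for i, row in enumerate(rows):
        if i % world_size == rank:
            yield row

rows = list(range(6))
assert list(shard_rows(rows, rank=0, world_size=2)) == [0, 2, 4]
assert list(shard_rows(rows, rank=1, world_size=2)) == [1, 3, 5]
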
Example #6
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     base_tokenizer = None
     if config.base_tokenizer:
         base_tokenizer = create_component(ComponentType.TOKENIZER,
                                           config.base_tokenizer)
     with PathManager.open(config.vocab_file) as f:
         vocab = build_fairseq_vocab(
             vocab_file=f,
             special_token_replacements={
                 "<pad>": PAD,
                 "<s>": BOS,
                 "</s>": EOS,
                 "<unk>": UNK,
                 "<mask>": MASK,
             },
         )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         base_tokenizer=base_tokenizer,
     )
Example #7
    def from_config(cls, config: Config, **kwargs):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        base_tokenizer = None
        if config.base_tokenizer:
            base_tokenizer = create_component(
                ComponentType.TOKENIZER, config.base_tokenizer
            )

        # map to the real vocab_file
        config.vocab_file = (
            resources.roberta.RESOURCE_MAP[config.vocab_file]
            if config.vocab_file in resources.roberta.RESOURCE_MAP
            else config.vocab_file
        )
        with PathManager.open(config.vocab_file) as f:
            vocab = build_fairseq_vocab(
                vocab_file=f,
                special_token_replacements={
                    "<pad>": SpecialTokens.PAD,
                    "<s>": SpecialTokens.BOS,
                    "</s>": SpecialTokens.EOS,
                    "<unk>": SpecialTokens.UNK,
                    "<mask>": SpecialTokens.MASK,
                },
                tokens_to_add=[SpecialTokens.SELFIE_RAW_IMAGE]
                if config.add_selfie_token
                else None,
            )
        return cls(
            columns=config.columns,
            vocab=vocab,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            base_tokenizer=base_tokenizer,
            **kwargs,
        )
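
The RESOURCE_MAP lookup above is a plain alias-or-passthrough pattern. A tiny sketch with made-up mapping contents:

RESOURCE_MAP = {"roberta-base-vocab": "/fake/cache/roberta/vocab.bpe"}  # made up

def resolve(vocab_file: str) -> str:
    # Known aliases resolve to concrete paths; anything else passes through.
    return RESOURCE_MAP.get(vocab_file, vocab_file)

assert resolve("roberta-base-vocab") == "/fake/cache/roberta/vocab.bpe"
assert resolve("/my/own/vocab.bpe") == "/my/own/vocab.bpe"
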
Example #8
def _get_data_source(test_path, source_config, field_names, task):
    if isinstance(task, NewDisjointMultitask):
        # Cannot easily specify a single data source for multitask
        assert not test_path
        data_source = None
    elif test_path and hasattr(source_config, "test_filename"):
        source_config.test_filename = test_path
        if field_names and hasattr(source_config, "field_names"):
            source_config.field_names = field_names
        data_source = create_component(ComponentType.DATA_SOURCE,
                                       source_config,
                                       task.data.data_source.schema)
    else:
        data_source = task.data.data_source
    return data_source
Example #9
 def from_config(
     cls,
     config: Config,
     data_dict: Dict[str, Data],
     task_key: str = BatchContext.TASK_NAME,
     rank=0,
     world_size=1,
     init_tensorizers=True,
 ):
     samplers = {
         Stage.TRAIN: create_component(ComponentType.BATCH_SAMPLER, config.sampler),
         Stage.EVAL: EvalBatchSampler(),
         Stage.TEST: EvalBatchSampler(),
     }
     return cls(data_dict, samplers, config.test_key, task_key)
Example #10
    def from_config(cls, config: Config, unused_metadata=None, model_state=None):
        tensorizers, data = NewTask._init_tensorizers(config)

        # Initialized tensorizers can be used to create the model
        model = NewTask._init_model(config.model, tensorizers, model_state)

        # This is the only place right now that the task actually cares about which
        # features and tensors are being used. This is a strong tie between
        # the implementation of the model and the metric reporter.
        metric_reporter = create_component(
            ComponentType.METRIC_REPORTER,
            config.metric_reporter,
            tensorizers=tensorizers,
        )
        trainer = create_trainer(config.trainer, model)
        return cls(data, model, metric_reporter, trainer)
Example #11
 def test_create_from_config(self):
     source_config = PandasDataSource.Config(
         train_df=pd.DataFrame({"c1": [10, 20, 30], "c2": [40, 50, 60]}),
         eval_df=pd.DataFrame({"c1": [11, 21, 31], "c2": [41, 51, 61]}),
         test_df=pd.DataFrame({"c1": [12, 22, 32], "c2": [42, 52, 62]}),
         column_mapping={"c1": "feature1", "c2": "feature2"},
     )
     ds = create_component(
         ComponentType.DATA_SOURCE,
         source_config,
         schema={"feature1": float, "feature2": float},
     )
     self.assertEqual({"feature1": 10, "feature2": 40}, next(iter(ds.train)))
     self.assertEqual({"feature1": 11, "feature2": 41}, next(iter(ds.eval)))
     self.assertEqual({"feature1": 12, "feature2": 42}, next(iter(ds.test)))
     self.assertEqual(3, len(list(ds.train)))
Example #12
def create_task(
    task_config, metadata=None, model_state=None, tensorizers=None, rank=0, world_size=1
):
    """
    Create a task by finding task class in registry and invoking the from_config
    function of the class, see :meth:`~Task.from_config` for more details
    """
    return create_component(
        ComponentType.TASK,
        task_config,
        metadata,
        model_state,
        tensorizers=tensorizers,
        rank=rank,
        world_size=world_size,
    )
Example #13
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     vocab = build_fairseq_vocab(
         vocab_file=config.vocab_file,
         special_token_replacements={
             "<pad>": PAD,
             "<s>": BOS,
             "</s>": EOS,
             "<unk>": UNK,
         },
     )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
     )
Example #14
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     return cls(
         columns=config.columns,
         tokenizer=tokenizer,
         vocab_file=config.vocab_file,
         is_fairseq=config.is_fairseq,
         pretraining=config.pretraining,
         max_seq_len=config.max_seq_len,
         max_vocab=config.max_vocab,
         min_count=config.min_count,
         language_columns=config.language_columns,
         lang2id=config.lang2id,
         reset_positions=config.reset_positions,
         has_language_in_data=config.has_language_in_data,
         use_language_embeddings=config.use_language_embeddings,
     )
Example #15
    def _create_tensorizers(cls, model_inputs_dict):
        if not isinstance(model_inputs_dict, dict):
            model_inputs_dict = model_inputs_dict._asdict()
        tensorizers = {
            name: create_component(ComponentType.TENSORIZER, tensorizer_config)
            for name, tensorizer_config in model_inputs_dict.items()
            if tensorizer_config
        }
        schema: Dict[str, Type] = {}
        for tensorizer in tensorizers.values():
            for name, col_type in tensorizer.column_schema:
                if name in schema and col_type != schema[name]:
                    raise TypeError(
                        f"Unexpected different types for column {name}: "
                        f"{col_type} != {schema[name]}"
                    )
                schema[name] = col_type

        return tensorizers, schema
Example #16
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     vocab = build_fairseq_vocab(
         vocab_file=config.vocab_file,
         special_token_replacements={
             config.pad_token: PAD,
             config.bos_token: BOS,
             config.eos_token: EOS,
             config.unk_token: UNK,
         },
     )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         answers_column=config.answers_column,
         answer_starts_column=config.answer_starts_column,
     )
Example #17
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     with PathManager.open(config.vocab_file) as file_path:
         vocab = build_fairseq_vocab(
             vocab_file=file_path,
             special_token_replacements={
                 "<pad>": SpecialTokens.PAD,
                 "<s>": SpecialTokens.BOS,
                 "</s>": SpecialTokens.EOS,
                 "<unk>": SpecialTokens.UNK,
                 "<mask>": SpecialTokens.MASK,
             },
         )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
     )
Example #18
    def from_config(cls,
                    task_config,
                    metadata=None,
                    model_state=None,
                    rank=0,
                    world_size=1):
        data_dict = OrderedDict()
        models = OrderedDict()
        metric_reporters = OrderedDict()
        for name, task in task_config.tasks.items():
            tensorizers, data = NewTask._init_tensorizers(
                task, rank, world_size)
            data_dict[name] = data
            models[name] = NewTask._init_model(task.model, tensorizers)
            metric_reporters[name] = create_component(
                ComponentType.METRIC_REPORTER,
                task.metric_reporter,
                tensorizers=tensorizers,
            )

        task_weights = {
            task_name: task_config.task_weights.get(task_name, 1.0)
            for task_name in task_config.tasks.keys()
        }
        data = DisjointMultitaskData.from_config(task_config.data,
                                                 data_dict=data_dict,
                                                 rank=rank,
                                                 world_size=world_size)
        model = NewDisjointMultitaskModel(models, loss_weights=task_weights)
        if model_state:
            model.load_state_dict(model_state)
        metric_reporter = DisjointMultitaskMetricReporter(
            metric_reporters,
            loss_weights=task_weights,
            target_task_name=task_config.target_task_name,
            use_subtask_select_metric=(
                task_config.metric_reporter.use_subtask_select_metric),
        )
        trainer = create_trainer(task_config.trainer, model)

        return cls(data, model, metric_reporter, trainer)
Example #19
    def _init_tensorizers(cls, config: Config, tensorizers=None, rank=0, world_size=1):
        extra_schema = {}
        if hasattr(config.metric_reporter, "text_column_names"):
            extra_schema = {
                column: str for column in config.metric_reporter.text_column_names
            }

        if not tensorizers:
            tensorizers = create_tensorizers(config.model.inputs)
        schema = create_schema(tensorizers, extra_schema)

        # This initializes the tensorizers
        data = create_component(
            ComponentType.DATA_HANDLER,
            config.data,
            schema,
            tensorizers,
            rank=rank,
            world_size=world_size,
        )
        return tensorizers, data
Example #20
    def from_config(cls, config: Config, **kwargs):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        vocab = None
        if isinstance(tokenizer, WordPieceTokenizer):
            print("Using WordPieceTokenizer")
            replacements = {
                "[UNK]": UNK,
                "[PAD]": PAD,
                "[CLS]": BOS,
                "[SEP]": EOS,
                "[MASK]": MASK,
            }
            vocab = Vocabulary(
                [token for token, _ in tokenizer.vocab.items()],
                replacements=replacements,
            )

        doc_tensorizer = TokenTensorizer(
            text_column=config.doc_column,
            tokenizer=tokenizer,
            vocab=vocab,
            max_seq_len=config.max_doc_seq_len,
        )
        ques_tensorizer = TokenTensorizer(
            text_column=config.ques_column,
            tokenizer=tokenizer,
            vocab=vocab,
            max_seq_len=config.max_ques_seq_len,
        )
        return cls(
            doc_tensorizer=doc_tensorizer,
            ques_tensorizer=ques_tensorizer,
            doc_column=config.doc_column,
            ques_column=config.ques_column,
            answers_column=config.answers_column,
            answer_starts_column=config.answer_starts_column,
            tokenizer=tokenizer,
            vocab=vocab,
            **kwargs,
        )
Example #21
    def _init_tensorizers(cls, config: Config, rank, world_size):
        tensorizers, schema = NewTask._create_tensorizers(config.model.inputs)

        if hasattr(config.metric_reporter, "text_column_names"):
            for text_column in config.metric_reporter.text_column_names:
                if text_column in schema and schema[text_column] != str:
                    raise TypeError(f"""
                        Unexpected different types for column {text_column}:
                        {str} != {schema[text_column]}
                        """)
                schema[text_column] = str

        # This initializes the tensorizers
        data = create_component(
            ComponentType.DATA_HANDLER,
            config.data,
            schema,
            tensorizers,
            rank=rank,
            world_size=world_size,
        )
        return tensorizers, data
Example #22
 def from_config(cls, config: Config, **kwargs):
     """
     from_config parses the config associated with the tensorizer and
     creates both the tokenizer and the Vocabulary object. The extra arguments
     passed as kwargs allow us to reuse thie function with variable number
     of arguments (eg: for classes which derive from this class).
     """
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     special_token_replacements = {
         "[UNK]": UNK,
         "[PAD]": PAD,
         "[CLS]": BOS,
         "[MASK]": MASK,
         "[SEP]": EOS,
     }
     if isinstance(tokenizer, WordPieceTokenizer):
         vocab = Vocabulary(
             [token for token, _ in tokenizer.vocab.items()],
             replacements=special_token_replacements,
         )
     else:
         config.vocab_file = (
             resources.roberta.RESOURCE_MAP[config.vocab_file]
             if config.vocab_file in resources.roberta.RESOURCE_MAP else
             config.vocab_file)
         with PathManager.open(config.vocab_file) as file_path:
             vocab = build_fairseq_vocab(
                 dictionary_class=BertDictionary,
                 vocab_file=file_path,
                 special_token_replacements=special_token_replacements,
             )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         **kwargs,
     )
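
A toy sketch of the special-token replacement idea above: raw vocabulary entries matching the BERT token strings are exposed under canonical names. The canonical strings here are illustrative placeholders, not PyText's actual constants:

UNK, PAD, BOS, EOS, MASK = "<unk>", "<pad>", "<bos>", "<eos>", "<mask>"  # placeholders

replacements = {"[UNK]": UNK, "[PAD]": PAD, "[CLS]": BOS, "[SEP]": EOS, "[MASK]": MASK}
raw_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "the", "cat"]
canonical = [replacements.get(tok, tok) for tok in raw_vocab]
assert canonical == ["<pad>", "<unk>", "<bos>", "<eos>", "<mask>", "the", "cat"]
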
Example #23
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     vocab = build_fairseq_vocab(
         dictionary_class=MaskedLMDictionary,
         vocab_file=config.vocab_file,
         max_vocab=config.max_vocab,
         min_count=config.min_count,
         special_token_replacements={
             "<unk>": UNK,
             "<pad>": PAD,
             "</s>": EOS,
             "<mask>": MASK,
         },
     )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         language_column=config.language_column,
         lang2id=config.lang2id,
         use_language_embeddings=config.use_language_embeddings,
         has_language_in_data=config.has_language_in_data,
     )
Example #24
 def from_config(cls, config: Config):
     basic_tokenizer = create_component(ComponentType.TOKENIZER,
                                        config.basic_tokenizer)
     vocab = load_vocab(config.wordpiece_vocab_path)
     wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)
     return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
Example #25
 def from_config(
     cls, config: Config, schema: Dict[str, Type], tensorizers: Dict[str, Tensorizer]
 ):
     data_source = create_component(ComponentType.DATA_SOURCE, config.source, schema)
     batcher = create_component(ComponentType.BATCHER, config.batcher)
     return cls(data_source, tensorizers, batcher=batcher, sort_key=config.sort_key)
Example #26
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     return cls(config.text_column, config.dict_column, tokenizer)
Example #27
 def from_config(cls, config: Component.Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     return cls(config.slot_column, config.text_column, tokenizer,
                config.allow_unknown)
Example #28
def create_task(task_config, metadata=None, model_state=None):
    """
    Create a task by finding task class in registry and invoking the from_config
    function of the class, see :meth:`~Task.from_config` for more details
    """
    return create_component(ComponentType.TASK, task_config, metadata, model_state)
Example #29
 def create_metric_reporter(cls, config, tensorizers):
     return create_component(
         ComponentType.METRIC_REPORTER,
         config.metric_reporter,
         tensorizers=tensorizers,
     )
Example #30
def _create_module_from_registry(module_config, *args, **kwargs):
    return create_component(ComponentType.MODULE, module_config, *args,
                            **kwargs)
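
All of the examples above funnel through the same registry dispatch: look up the component class registered for the config's type, then delegate to its from_config classmethod. A rough self-contained sketch of that pattern, assuming illustrative Registry internals and enum values rather than PyText's actual code:

from enum import Enum
from typing import Any, Dict, Tuple, Type

class ComponentType(Enum):
    TOKENIZER = "tokenizer"

class Registry:
    _registry: Dict[Tuple[ComponentType, Type], Type] = {}

    @classmethod
    def add(cls, component_type, config_cls, component_cls):
        cls._registry[(component_type, config_cls)] = component_cls

    @classmethod
    def get(cls, component_type, config_cls):
        return cls._registry[(component_type, config_cls)]

def create_component(component_type, config, *args, **kwargs):
    # Dispatch on the config's class: each config type maps to one component class.
    component_cls = Registry.get(component_type, type(config))
    return component_cls.from_config(config, *args, **kwargs)

class WhitespaceTokenizer:
    class Config:
        pass

    @classmethod
    def from_config(cls, config):
        return cls()

    def tokenize(self, text):
        return text.split()

Registry.add(ComponentType.TOKENIZER, WhitespaceTokenizer.Config, WhitespaceTokenizer)
tokenizer = create_component(ComponentType.TOKENIZER, WhitespaceTokenizer.Config())
assert tokenizer.tokenize("a b c") == ["a", "b", "c"]
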