Example #1
    def _init_tensorizers(cls, config: Config, rank, world_size):
        tensorizers = {
            name: create_component(ComponentType.TENSORIZER, tensorizer_config)
            for name, tensorizer_config in config.model.inputs._asdict().items()
            if tensorizer_config
        }
        schema: Dict[str, Type] = {}
        for tensorizer in tensorizers.values():
            for name, col_type in tensorizer.column_schema:
                if name in schema and col_type != schema[name]:
                    raise TypeError(
                        f"Unexpected different types for column {name}: "
                        f"{col_type} != {schema[name]}"
                    )
                schema[name] = col_type

        # This initializes the tensorizers
        data = create_component(
            ComponentType.DATA_HANDLER,
            config.data,
            schema,
            tensorizers,
            rank=rank,
            world_size=world_size,
        )
        return tensorizers, data
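
A minimal standalone sketch of the schema-merging loop above. FakeTensorizer and merge_schemas are hypothetical stand-ins; only the merge/conflict logic is taken from the example:

from typing import Dict, List, Tuple, Type

class FakeTensorizer:
    """Hypothetical stand-in exposing only the column_schema attribute."""

    def __init__(self, column_schema: List[Tuple[str, Type]]):
        self.column_schema = column_schema

def merge_schemas(tensorizers: Dict[str, FakeTensorizer]) -> Dict[str, Type]:
    schema: Dict[str, Type] = {}
    for tensorizer in tensorizers.values():
        for name, col_type in tensorizer.column_schema:
            # Two tensorizers may share a column only if they agree on its type.
            if name in schema and col_type != schema[name]:
                raise TypeError(
                    f"Unexpected different types for column {name}: "
                    f"{col_type} != {schema[name]}"
                )
            schema[name] = col_type
    return schema

merged = merge_schemas({
    "tokens": FakeTensorizer([("text", str)]),
    "labels": FakeTensorizer([("text", str), ("label", str)]),
})
assert merged == {"text": str, "label": str}
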
Example #2
def create_tensorizers(
    model_inputs: Union[BaseModel.Config.ModelInput, Dict[str,
                                                          Tensorizer.Config]],
) -> Dict[str, Tensorizer]:
    if not isinstance(model_inputs, dict):
        model_inputs = model_inputs._asdict()

    tensorizers = {
        name: create_component(ComponentType.TENSORIZER, tensorizer_config)
        for name, tensorizer_config in model_inputs.items()
        if tensorizer_config
    }

    return tensorizers
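
A rough usage sketch of the filtering above: inputs whose config is None are skipped, and a NamedTuple-style ModelInput is converted via _asdict(). The ModelInput class and fake_create below are illustrative stand-ins, not PyText APIs:

from typing import NamedTuple, Optional

class ModelInput(NamedTuple):
    # Hypothetical inputs: one configured tensorizer and one disabled input.
    tokens: Optional[str] = "TokenTensorizer.Config"
    labels: Optional[str] = None

def fake_create(config):
    # Stands in for create_component(ComponentType.TENSORIZER, config).
    return f"tensorizer({config})"

model_inputs = ModelInput()
if not isinstance(model_inputs, dict):
    model_inputs = model_inputs._asdict()
tensorizers = {name: fake_create(cfg) for name, cfg in model_inputs.items() if cfg}
assert list(tensorizers) == ["tokens"]  # the None-configured input is skipped
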
Example #3
    def batches(self, stage: Stage, rank=0, world_size=1, data_source=None):
        all_batches = {
            name: task.batches(stage, rank, world_size)
            for name, task in self.data_dict.items()
        }
        if stage == Stage.TRAIN:
            sampler = create_component(
                ComponentType.BATCH_SAMPLER, self.sampler_config, iterators=all_batches
            )
        else:
            sampler = EvalBatchSampler(all_batches)

        for name, batch in sampler.batchify(all_batches):
            batch[BatchContext.TASK_NAME] = name
            yield batch
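
To illustrate the batchify contract the loop above relies on, a toy round-robin sampler that yields (task_name, batch) pairs from per-task iterators; PyText's actual samplers differ:

def round_robin_batchify(iterators):
    # Cycle through the per-task iterators, dropping each one as it is exhausted.
    iters = {name: iter(it) for name, it in iterators.items()}
    while iters:
        for name in list(iters):
            try:
                yield name, next(iters[name])
            except StopIteration:
                del iters[name]

out = list(round_robin_batchify({"a": [1, 2], "b": [10]}))
assert out == [("a", 1), ("b", 10), ("a", 2)]
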
Example #4
    def _init_model(cls, model_config, tensorizers, model_state=None):
        model_config.init_from_saved_state = model_state is not None
        model = create_component(
            ComponentType.MODEL, model_config, tensorizers=tensorizers
        )
        if model_state:
            print("Loading model from model state dict...")
            model.load_state_dict(model_state)
            print("Loaded!")

        if cuda.CUDA_ENABLED:
            model = model.cuda()

        return model
Example #5
    def from_config(
        cls,
        config: Config,
        schema: Dict[str, Type],
        tensorizers: Dict[str, Tensorizer],
        rank=0,
        world_size=1,
        **kwargs,
    ):
        data_source_cls = Registry.get(ComponentType.DATA_SOURCE,
                                       type(config.source))
        if issubclass(data_source_cls, ShardedDataSource):
            # the data source is already sharded; no need to wrap it in RowShardedDataSource
            data_source = create_component(
                ComponentType.DATA_SOURCE,
                config.source,
                schema,
                rank=rank,
                world_size=world_size,
            )
        else:
            unsharded_data_source = create_component(ComponentType.DATA_SOURCE,
                                                     config.source, schema)
            data_source = RowShardedDataSource(
                data_source=unsharded_data_source,
                rank=rank,
                world_size=world_size)

        batcher = create_component(ComponentType.BATCHER, config.batcher)
        return cls(
            data_source,
            tensorizers,
            batcher=batcher,
            sort_key=config.sort_key,
            epoch_size=config.epoch_size,
            **kwargs,
        )
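
A standalone sketch of the row-sharding fallback above: each worker keeps every world_size-th row starting at its own rank. RowShardedDataSource's real implementation may differ; this only shows the intended partitioning:

def shard_rows(rows, rank, world_size):
    for i, row in enumerate(rows):
        if i % world_size == rank:
            yield row

rows = list(range(6))
assert list(shard_rows(rows, rank=0, world_size=2)) == [0, 2, 4]
assert list(shard_rows(rows, rank=1, world_size=2)) == [1, 3, 5]
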
Example #6
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     base_tokenizer = None
     if config.base_tokenizer:
         base_tokenizer = create_component(ComponentType.TOKENIZER,
                                           config.base_tokenizer)
     with PathManager.open(config.vocab_file) as f:
         vocab = build_fairseq_vocab(
             vocab_file=f,
             special_token_replacements={
                 "<pad>": PAD,
                 "<s>": BOS,
                 "</s>": EOS,
                 "<unk>": UNK,
                 "<mask>": MASK,
             },
         )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         base_tokenizer=base_tokenizer,
     )
Example #7
    def from_config(cls, config: Config, **kwargs):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        base_tokenizer = None
        if config.base_tokenizer:
            base_tokenizer = create_component(
                ComponentType.TOKENIZER, config.base_tokenizer
            )

        # map to the real vocab_file
        config.vocab_file = (
            resources.roberta.RESOURCE_MAP[config.vocab_file]
            if config.vocab_file in resources.roberta.RESOURCE_MAP
            else config.vocab_file
        )
        with PathManager.open(config.vocab_file) as f:
            vocab = build_fairseq_vocab(
                vocab_file=f,
                special_token_replacements={
                    "<pad>": SpecialTokens.PAD,
                    "<s>": SpecialTokens.BOS,
                    "</s>": SpecialTokens.EOS,
                    "<unk>": SpecialTokens.UNK,
                    "<mask>": SpecialTokens.MASK,
                },
                tokens_to_add=[SpecialTokens.SELFIE_RAW_IMAGE]
                if config.add_selfie_token
                else None,
            )
        return cls(
            columns=config.columns,
            vocab=vocab,
            tokenizer=tokenizer,
            max_seq_len=config.max_seq_len,
            base_tokenizer=base_tokenizer,
            **kwargs,
        )
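
The RESOURCE_MAP lookup above is a plain alias-or-passthrough pattern. A tiny sketch with made-up mapping contents:

RESOURCE_MAP = {"roberta-base-vocab": "/fake/cache/roberta/vocab.bpe"}  # made up

def resolve(vocab_file: str) -> str:
    # Known aliases resolve to concrete paths; anything else passes through.
    return RESOURCE_MAP.get(vocab_file, vocab_file)

assert resolve("roberta-base-vocab") == "/fake/cache/roberta/vocab.bpe"
assert resolve("/my/own/vocab.bpe") == "/my/own/vocab.bpe"
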
Example #8
def _get_data_source(test_path, source_config, field_names, task):
    if isinstance(task, NewDisjointMultitask):
        # Cannot easily specify a single data source for multitask
        assert not test_path
        data_source = None
    elif test_path and hasattr(source_config, "test_filename"):
        source_config.test_filename = test_path
        if field_names and hasattr(source_config, "field_names"):
            source_config.field_names = field_names
        data_source = create_component(ComponentType.DATA_SOURCE,
                                       source_config,
                                       task.data.data_source.schema)
    else:
        data_source = task.data.data_source
    return data_source
Example #9
 def from_config(
     cls,
     config: Config,
     data_dict: Dict[str, Data],
     task_key: str = BatchContext.TASK_NAME,
     rank=0,
     world_size=1,
     init_tensorizers=True,
 ):
     samplers = {
         Stage.TRAIN: create_component(ComponentType.BATCH_SAMPLER, config.sampler),
         Stage.EVAL: EvalBatchSampler(),
         Stage.TEST: EvalBatchSampler(),
     }
     return cls(data_dict, samplers, config.test_key, task_key)
Example #10
    def from_config(cls, config: Config, unused_metadata=None, model_state=None):
        tensorizers, data = NewTask._init_tensorizers(config)

        # Initialized tensorizers can be used to create the model
        model = NewTask._init_model(config.model, tensorizers, model_state)

        # This is the only place right now that the task actually cares about which
        # features and tensors are being used. This is a strong tie between
        # the implementation of the model and the metric reporter.
        metric_reporter = create_component(
            ComponentType.METRIC_REPORTER,
            config.metric_reporter,
            tensorizers=tensorizers,
        )
        trainer = create_trainer(config.trainer, model)
        return cls(data, model, metric_reporter, trainer)
Example #11
 def test_create_from_config(self):
     source_config = PandasDataSource.Config(
         train_df=pd.DataFrame({"c1": [10, 20, 30], "c2": [40, 50, 60]}),
         eval_df=pd.DataFrame({"c1": [11, 21, 31], "c2": [41, 51, 61]}),
         test_df=pd.DataFrame({"c1": [12, 22, 32], "c2": [42, 52, 62]}),
         column_mapping={"c1": "feature1", "c2": "feature2"},
     )
     ds = create_component(
         ComponentType.DATA_SOURCE,
         source_config,
         schema={"feature1": float, "feature2": float},
     )
     self.assertEqual({"feature1": 10, "feature2": 40}, next(iter(ds.train)))
     self.assertEqual({"feature1": 11, "feature2": 41}, next(iter(ds.eval)))
     self.assertEqual({"feature1": 12, "feature2": 42}, next(iter(ds.test)))
     self.assertEqual(3, len(list(ds.train)))
Example #12
def create_task(
    task_config, metadata=None, model_state=None, tensorizers=None, rank=0, world_size=1
):
    """
    Create a task by finding task class in registry and invoking the from_config
    function of the class, see :meth:`~Task.from_config` for more details
    """
    return create_component(
        ComponentType.TASK,
        task_config,
        metadata,
        model_state,
        tensorizers=tensorizers,
        rank=rank,
        world_size=world_size,
    )
Example #13
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     vocab = build_fairseq_vocab(
         vocab_file=config.vocab_file,
         special_token_replacements={
             "<pad>": PAD,
             "<s>": BOS,
             "</s>": EOS,
             "<unk>": UNK,
         },
     )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
     )
Example #14
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     return cls(
         columns=config.columns,
         tokenizer=tokenizer,
         vocab_file=config.vocab_file,
         is_fairseq=config.is_fairseq,
         pretraining=config.pretraining,
         max_seq_len=config.max_seq_len,
         max_vocab=config.max_vocab,
         min_count=config.min_count,
         language_columns=config.language_columns,
         lang2id=config.lang2id,
         reset_positions=config.reset_positions,
         has_language_in_data=config.has_language_in_data,
         use_language_embeddings=config.use_language_embeddings,
     )
Example #15
    def _create_tensorizers(cls, model_inputs_dict):
        if not isinstance(model_inputs_dict, dict):
            model_inputs_dict = model_inputs_dict._asdict()
        tensorizers = {
            name: create_component(ComponentType.TENSORIZER, tensorizer_config)
            for name, tensorizer_config in model_inputs_dict.items()
            if tensorizer_config
        }
        schema: Dict[str, Type] = {}
        for tensorizer in tensorizers.values():
            for name, col_type in tensorizer.column_schema:
                if name in schema and col_type != schema[name]:
                    raise TypeError(
                        f"Unexpected different types for column {name}: "
                        f"{col_type} != {schema[name]}"
                    )
                schema[name] = col_type

        return tensorizers, schema
Example #16
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     vocab = build_fairseq_vocab(
         vocab_file=config.vocab_file,
         special_token_replacements={
             config.pad_token: PAD,
             config.bos_token: BOS,
             config.eos_token: EOS,
             config.unk_token: UNK,
         },
     )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         answers_column=config.answers_column,
         answer_starts_column=config.answer_starts_column,
     )
Example #17
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     with PathManager.open(config.vocab_file) as file_path:
         vocab = build_fairseq_vocab(
             vocab_file=file_path,
             special_token_replacements={
                 "<pad>": SpecialTokens.PAD,
                 "<s>": SpecialTokens.BOS,
                 "</s>": SpecialTokens.EOS,
                 "<unk>": SpecialTokens.UNK,
                 "<mask>": SpecialTokens.MASK,
             },
         )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
     )
Example #18
    def from_config(cls,
                    task_config,
                    metadata=None,
                    model_state=None,
                    rank=0,
                    world_size=1):
        data_dict = OrderedDict()
        models = OrderedDict()
        metric_reporters = OrderedDict()
        for name, task in task_config.tasks.items():
            tensorizers, data = NewTask._init_tensorizers(
                task, rank, world_size)
            data_dict[name] = data
            models[name] = NewTask._init_model(task.model, tensorizers)
            metric_reporters[name] = create_component(
                ComponentType.METRIC_REPORTER,
                task.metric_reporter,
                tensorizers=tensorizers,
            )

        task_weights = {
            task_name: task_config.task_weights.get(task_name, 1.0)
            for task_name in task_config.tasks.keys()
        }
        data = DisjointMultitaskData.from_config(task_config.data,
                                                 data_dict=data_dict,
                                                 rank=rank,
                                                 world_size=world_size)
        model = NewDisjointMultitaskModel(models, loss_weights=task_weights)
        if model_state:
            model.load_state_dict(model_state)
        metric_reporter = DisjointMultitaskMetricReporter(
            metric_reporters,
            loss_weights=task_weights,
            target_task_name=task_config.target_task_name,
            use_subtask_select_metric=(
                task_config.metric_reporter.use_subtask_select_metric),
        )
        trainer = create_trainer(task_config.trainer, model)

        return cls(data, model, metric_reporter, trainer)
Example #19
    def _init_tensorizers(cls, config: Config, tensorizers=None, rank=0, world_size=1):
        extra_schema = {}
        if hasattr(config.metric_reporter, "text_column_names"):
            extra_schema = {
                column: str for column in config.metric_reporter.text_column_names
            }

        if not tensorizers:
            tensorizers = create_tensorizers(config.model.inputs)
        schema = create_schema(tensorizers, extra_schema)

        # This initializes the tensorizers
        data = create_component(
            ComponentType.DATA_HANDLER,
            config.data,
            schema,
            tensorizers,
            rank=rank,
            world_size=world_size,
        )
        return tensorizers, data
Example #20
    def from_config(cls, config: Config, **kwargs):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        vocab = None
        if isinstance(tokenizer, WordPieceTokenizer):
            print("Using WordPieceTokenizer")
            replacements = {
                "[UNK]": UNK,
                "[PAD]": PAD,
                "[CLS]": BOS,
                "[SEP]": EOS,
                "[MASK]": MASK,
            }
            vocab = Vocabulary(
                [token for token, _ in tokenizer.vocab.items()],
                replacements=replacements,
            )

        doc_tensorizer = TokenTensorizer(
            text_column=config.doc_column,
            tokenizer=tokenizer,
            vocab=vocab,
            max_seq_len=config.max_doc_seq_len,
        )
        ques_tensorizer = TokenTensorizer(
            text_column=config.ques_column,
            tokenizer=tokenizer,
            vocab=vocab,
            max_seq_len=config.max_ques_seq_len,
        )
        return cls(
            doc_tensorizer=doc_tensorizer,
            ques_tensorizer=ques_tensorizer,
            doc_column=config.doc_column,
            ques_column=config.ques_column,
            answers_column=config.answers_column,
            answer_starts_column=config.answer_starts_column,
            tokenizer=tokenizer,
            vocab=vocab,
            **kwargs,
        )
Example #21
    def _init_tensorizers(cls, config: Config, rank, world_size):
        tensorizers, schema = NewTask._create_tensorizers(config.model.inputs)

        if hasattr(config.metric_reporter, "text_column_names"):
            for text_column in config.metric_reporter.text_column_names:
                if text_column in schema and schema[text_column] != str:
                    raise TypeError(f"""
                        Unexpected different types for column {text_column}:
                        {str} != {schema[text_column]}
                        """)
                schema[text_column] = str

        # This initializes the tensorizers
        data = create_component(
            ComponentType.DATA_HANDLER,
            config.data,
            schema,
            tensorizers,
            rank=rank,
            world_size=world_size,
        )
        return tensorizers, data
Example #22
 def from_config(cls, config: Config, **kwargs):
     """
     from_config parses the config associated with the tensorizer and
     creates both the tokenizer and the Vocabulary object. The extra arguments
     passed as kwargs allow us to reuse thie function with variable number
     of arguments (eg: for classes which derive from this class).
     """
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     special_token_replacements = {
         "[UNK]": UNK,
         "[PAD]": PAD,
         "[CLS]": BOS,
         "[MASK]": MASK,
         "[SEP]": EOS,
     }
     if isinstance(tokenizer, WordPieceTokenizer):
         vocab = Vocabulary(
             [token for token, _ in tokenizer.vocab.items()],
             replacements=special_token_replacements,
         )
     else:
         config.vocab_file = (
             resources.roberta.RESOURCE_MAP[config.vocab_file]
             if config.vocab_file in resources.roberta.RESOURCE_MAP else
             config.vocab_file)
         with PathManager.open(config.vocab_file) as file_path:
             vocab = build_fairseq_vocab(
                 dictionary_class=BertDictionary,
                 vocab_file=file_path,
                 special_token_replacements=special_token_replacements,
             )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         **kwargs,
     )
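
A toy sketch of the special-token replacement idea above: raw vocabulary entries matching the BERT token strings are exposed under canonical names. The canonical strings here are illustrative placeholders, not PyText's actual constants:

UNK, PAD, BOS, EOS, MASK = "<unk>", "<pad>", "<bos>", "<eos>", "<mask>"  # placeholders

replacements = {"[UNK]": UNK, "[PAD]": PAD, "[CLS]": BOS, "[SEP]": EOS, "[MASK]": MASK}
raw_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "the", "cat"]
canonical = [replacements.get(tok, tok) for tok in raw_vocab]
assert canonical == ["<pad>", "<unk>", "<bos>", "<eos>", "<mask>", "the", "cat"]
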
Example #23
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     vocab = build_fairseq_vocab(
         dictionary_class=MaskedLMDictionary,
         vocab_file=config.vocab_file,
         max_vocab=config.max_vocab,
         min_count=config.min_count,
         special_token_replacements={
             "<unk>": UNK,
             "<pad>": PAD,
             "</s>": EOS,
             "<mask>": MASK,
         },
     )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         language_column=config.language_column,
         lang2id=config.lang2id,
         use_language_embeddings=config.use_language_embeddings,
         has_language_in_data=config.has_language_in_data,
     )
Example #24
 def from_config(cls, config: Config):
     basic_tokenizer = create_component(ComponentType.TOKENIZER,
                                        config.basic_tokenizer)
     vocab = load_vocab(config.wordpiece_vocab_path)
     wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)
     return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
Example #25
 def from_config(
     cls, config: Config, schema: Dict[str, Type], tensorizers: Dict[str, Tensorizer]
 ):
     data_source = create_component(ComponentType.DATA_SOURCE, config.source, schema)
     batcher = create_component(ComponentType.BATCHER, config.batcher)
     return cls(data_source, tensorizers, batcher=batcher, sort_key=config.sort_key)
Example #26
 def from_config(cls, config: Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     return cls(config.text_column, config.dict_column, tokenizer)
Example #27
 def from_config(cls, config: Component.Config):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     return cls(config.slot_column, config.text_column, tokenizer,
                config.allow_unknown)
Example #28
def create_task(task_config, metadata=None, model_state=None):
    """
    Create a task by finding task class in registry and invoking the from_config
    function of the class, see :meth:`~Task.from_config` for more details
    """
    return create_component(ComponentType.TASK, task_config, metadata, model_state)
Example #29
 def create_metric_reporter(cls, config, tensorizers):
     return create_component(
         ComponentType.METRIC_REPORTER,
         config.metric_reporter,
         tensorizers=tensorizers,
     )
Example #30
def _create_module_from_registry(module_config, *args, **kwargs):
    return create_component(ComponentType.MODULE, module_config, *args,
                            **kwargs)
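
All of the examples above funnel through the same registry dispatch: look up the component class registered for the config's type, then delegate to its from_config classmethod. A rough self-contained sketch of that pattern, assuming illustrative Registry internals and enum values rather than PyText's actual code:

from enum import Enum
from typing import Any, Dict, Tuple, Type

class ComponentType(Enum):
    TOKENIZER = "tokenizer"

class Registry:
    _registry: Dict[Tuple[ComponentType, Type], Type] = {}

    @classmethod
    def add(cls, component_type, config_cls, component_cls):
        cls._registry[(component_type, config_cls)] = component_cls

    @classmethod
    def get(cls, component_type, config_cls):
        return cls._registry[(component_type, config_cls)]

def create_component(component_type, config, *args, **kwargs):
    # Dispatch on the config's class: each config type maps to one component class.
    component_cls = Registry.get(component_type, type(config))
    return component_cls.from_config(config, *args, **kwargs)

class WhitespaceTokenizer:
    class Config:
        pass

    @classmethod
    def from_config(cls, config):
        return cls()

    def tokenize(self, text):
        return text.split()

Registry.add(ComponentType.TOKENIZER, WhitespaceTokenizer.Config, WhitespaceTokenizer)
tokenizer = create_component(ComponentType.TOKENIZER, WhitespaceTokenizer.Config())
assert tokenizer.tokenize("a b c") == ["a", "b", "c"]
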