Example #1
    def _parse_configs(self, configs):
        parsed_configs = self.default_configs()
        parsed_configs["batch_size"] = configs.batch_size
        parsed_configs["scope"] = get_class(configs.scope)
        parsed_configs["do_eval"] = configs.do_eval
        parsed_configs["feature_scheme"] = {}
        for tag, scheme in configs.feature_scheme.items():
            parsed_configs["feature_scheme"][tag] = {}
            if scheme["type"] == "data_input":
                parsed_configs["feature_scheme"][tag][
                    "type"] = TrainPreprocessor.DATA_INPUT
            elif scheme["type"] == "data_output":
                parsed_configs["feature_scheme"][tag][
                    "type"] = TrainPreprocessor.DATA_OUTPUT

            extractor = get_class(scheme["extractor"]["class_name"])()
            extractor.initialize(config=scheme["extractor"]["config"])
            if "vocab_path" in scheme["extractor"]:
                vocab_file = open(scheme["extractor"]["vocab_path"], "rb")
                extractor.vocab = pickle.load(vocab_file)
                vocab_file.close()
            parsed_configs["feature_scheme"][tag]["extractor"] = extractor

            if "converter" not in scheme:
                parsed_configs["feature_scheme"][tag]["converter"] = Converter(
                    {})
            else:
                parsed_configs["feature_scheme"][tag]["converter"] = scheme[
                    "converter"]
        return Config(parsed_configs, default_hparams=self.default_configs())
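For reference, a minimal sketch of the `configs` object that `_parse_configs` consumes, assuming `Config` allows attribute access over a plain dict; the extractor class path and the vocab file name are hypothetical placeholders:

# Hypothetical input to _parse_configs; the extractor class path and the
# vocab file are placeholders, not real modules or files.
example_configs = Config(
    {
        "batch_size": 32,
        "scope": "ft.onto.base_ontology.Sentence",
        "do_eval": True,
        "feature_scheme": {
            "text_tag": {
                "type": "data_input",
                "extractor": {
                    "class_name": "my_project.extractors.TextExtractor",
                    "config": {"entry_type": "ft.onto.base_ontology.Token"},
                    # Optional: path to a pickled vocabulary on disk.
                    # "vocab_path": "vocab.pkl",
                },
                # "converter" is optional; Converter({}) is used by default.
            },
        },
    },
    default_hparams=None,
)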
Example #2
    def __init__(self, config: Optional[Union[Dict, Config]] = None):
        super().__init__()
        self._config = Config(hparams=config,
                              default_hparams=self.default_configs())
        self._meta_data: Dict[int, str] = {}

        index_type = self._config.index_type
        device = self._config.device
        dim = self._config.dim

        if device.lower().startswith("gpu"):
            if isinstance(index_type, str) and not index_type.startswith("Gpu"):
                index_type = "Gpu" + index_type

            index_class = utils.get_class(index_type, module_paths=["faiss"])
            gpu_resource = faiss.StandardGpuResources()
            gpu_id = int(device[3:])
            # GPU ids are zero-based, so gpu_id must be < the device count.
            if faiss.get_num_gpus() <= gpu_id:
                gpu_id = 0
                logging.warning("Cannot create the index on device %s. "
                                "Total number of GPUs on this machine is "
                                "%s. Using gpu0 for the index.",
                                self._config.device, faiss.get_num_gpus())
            config_class_name = \
                self.INDEX_TYPE_TO_CONFIG.get(index_class.__name__)
            config = utils.get_class(config_class_name,  # type: ignore
                                     module_paths=["faiss"])()
            config.device = gpu_id
            self._index = index_class(gpu_resource, dim, config)

        else:
            index_class = utils.get_class(index_type, module_paths=["faiss"])
            self._index = index_class(dim)
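A hedged usage sketch for this initializer: `index_type` names a faiss index class (automatically prefixed with "Gpu" on GPU devices), `device` is "cpu" or "gpu<N>", and `dim` is the vector dimensionality. The indexer class name below is a placeholder for whichever class defines this `__init__`:

# Illustrative only; FaissIndexer is a placeholder name. IndexFlatIP is a
# real faiss index class; on "gpu0" it is promoted to GpuIndexFlatIP.
indexer = FaissIndexer(config={
    "index_type": "IndexFlatIP",
    "device": "gpu0",  # parsed as gpu_id = int("gpu0"[3:]) == 0
    "dim": 768,
})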
Example #3
def parse_feature_extractors(scheme_configs: Config) -> Dict[str, Any]:
    feature_requests: Dict[str, Any] = {}

    for tag, scheme_config in scheme_configs.items():
        assert (
            "extractor" in scheme_config
        ), "Field not found for data request scheme: `extractor`"
        assert (
            "type" in scheme_config
        ), "Field not found for data request scheme: `type`"
        assert scheme_config["type"] in [
            "data_input",
            "data_output",
        ], "Type field must be either data_input or data_output."

        feature_requests[tag] = {}

        if scheme_config["type"] == "data_input":
            feature_requests[tag]["type"] = DATA_INPUT
        elif scheme_config["type"] == "data_output":
            feature_requests[tag]["type"] = DATA_OUTPUT

        extractor_class = get_class(scheme_config["extractor"]["class_name"])
        extractor: BaseExtractor = extractor_class()
        if not isinstance(extractor, BaseExtractor):
            raise RuntimeError(
                "Invalid extractor: ", scheme_config["extractor"]
            )

        extractor.initialize(config=scheme_config["extractor"]["config"])

        # Load vocab from disk if provided.
        if "vocab_path" in scheme_config["extractor"]:
            with open(
                scheme_config["extractor"]["vocab_path"], "rb"
            ) as vocab_file:
                extractor.vocab = pickle.load(vocab_file)

        feature_requests[tag]["extractor"] = extractor

        if "converter" not in scheme_config:
            # Create default converter if there is no given converter
            feature_requests[tag]["converter"] = Converter({})
        else:
            converter_class = get_class(
                scheme_config["converter"]["class_name"]
            )
            converter: Converter = converter_class()
            if not isinstance(converter, Converter):
                raise RuntimeError(
                    "Invalid converter: ", scheme_config["converter"]
                )
            feature_requests[tag]["converter"] = converter

    return feature_requests
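Unlike Example #1, this variant also instantiates the converter from a `class_name`. A sketch of one scheme entry that exercises that branch, with placeholder class paths:

# Placeholder class paths; the "converter" entry triggers the explicit
# instantiation branch instead of the default Converter({}).
scheme_configs = Config(
    {
        "ner_tag": {
            "type": "data_output",
            "extractor": {
                "class_name": "my_project.extractors.TagExtractor",
                "config": {},
            },
            "converter": {"class_name": "my_project.converters.MyConverter"},
        },
    },
    default_hparams=None,
)
feature_requests = parse_feature_extractors(scheme_configs)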
Example #4
    def initialize(self, config: Union[Dict, Config]):
        # pylint: disable=attribute-defined-outside-init
        super().initialize(config)

        if self.config.attribute is None:
            raise ProcessorConfigError(
                "'attribute' is required in this extractor.")
        if self.config.index_annotation is None:
            raise ProcessorConfigError(
                "'index_annotation' is required in this extractor.")
        if self.config.entry_type is None:
            raise ProcessorConfigError(
                "'entry_type' is required in this extractor.")
        else:
            self._entry_class: Type[Link] = get_class(self.config.entry_type)

            if not issubclass(self._entry_class, Link):
                raise ProcessorConfigError("The `entry_type` provided to "
                                           "this extractor must be a Link "
                                           "type.")

            self._parent_class: Type[Annotation] = self._entry_class.ParentType
            if not issubclass(self._parent_class, Annotation):
                raise ProcessorConfigError(
                    f"The parent class of the provided {self.config.entry_type}"
                    " must be an Annotation.")

            self._child_class: Type[Annotation] = self._entry_class.ChildType
            if not issubclass(self._child_class, Annotation):
                raise ProcessorConfigError(
                    f"The child class of the provided {self.config.entry_type}"
                    " must be an Annotation.")
Example #5
def create_class_with_kwargs(class_name: str,
                             class_args: Dict,
                             h_params: Optional[Dict] = None):
    cls = get_class(class_name)
    if not class_args:
        class_args = {}
    obj = cls(**class_args)

    p_params: Dict = {}

    if h_params is not None and \
            "config_path" in h_params and \
            h_params["config_path"] is not None:
        with open(h_params["config_path"]) as config_file:
            filebased_hparams = yaml.safe_load(config_file)
    else:
        filebased_hparams = {}
    p_params.update(filebased_hparams)

    if h_params is not None:
        p_params.update(h_params.get("overwrite_configs", {}))
    default_processor_hparams = cls.default_hparams()

    processor_hparams = HParams(p_params, default_processor_hparams)

    return obj, processor_hparams
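A usage sketch with a hypothetical processor class and config file. The merge order is: `cls.default_hparams()` first, then the YAML file at `config_path`, then `overwrite_configs`, so later sources win:

# Hypothetical class path and file name; precedence is lowest to highest:
# default_hparams() -> YAML at config_path -> overwrite_configs.
obj, hparams = create_class_with_kwargs(
    class_name="my_project.processors.MyProcessor",
    class_args={},
    h_params={
        "config_path": "processor_config.yml",
        "overwrite_configs": {"batch_size": 16},
    },
)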
Example #6
    def init_from_config(self, configs: Dict):
        """
        Parse the configuration sections from the input config,
            into a list of [processor, config]
        Initialize the pipeline with the configurations
        """
        if "Reader" not in configs or configs["Reader"] is None:
            raise KeyError('No reader in the configuration')

        reader_config = configs["Reader"]

        reader, reader_hparams = create_class_with_kwargs(
            class_name=reader_config["type"],
            class_args=reader_config.get("kwargs", {}),
            h_params=reader_config.get("hparams", {}))

        self.set_reader(reader, reader_hparams)

        # HParams cannot be built from the inner dicts of a list, so each
        # processor's configuration is parsed individually.
        if "Processors" in configs and configs["Processors"] is not None:
            for processor_configs in configs["Processors"]:
                p, processor_hparams = create_class_with_kwargs(
                    class_name=processor_configs["type"],
                    class_args=processor_configs.get("kwargs", {}),
                    h_params=processor_configs.get("hparams", {}))

                selector_hparams = processor_hparams.selector
                selector_class = get_class(selector_hparams['type'])
                selector_kwargs = selector_hparams["kwargs"]

                selector = selector_class(**selector_kwargs)

                self.add_processor(p, processor_hparams, selector)

            self.initialize()
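A sketch of a `configs` dict this method can consume, assuming `pipeline` is an instance of this class. All class paths are placeholders, and the selector spec is assumed to be resolvable from the processor's merged hparams (here supplied via `overwrite_configs`):

# Placeholder class paths throughout; the selector is read from
# processor_hparams.selector, so it must exist in the merged hparams.
configs = {
    "Reader": {
        "type": "my_project.readers.MyReader",
        "kwargs": {},
        "hparams": {},
    },
    "Processors": [
        {
            "type": "my_project.processors.MyProcessor",
            "kwargs": {},
            "hparams": {
                "overwrite_configs": {
                    "selector": {
                        "type": "my_project.selectors.MySelector",
                        "kwargs": {},
                    },
                },
            },
        },
    ],
}
pipeline.init_from_config(configs)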
Example #7
    def initialize(self, config: Config):
        super().initialize(config)

        if config["context_type"] is None:
            raise AttributeError("'context_type' cannot be None.")

        if config["batch_size"] is None:
            raise AttributeError("'batch_size' cannot be None.")

        if isinstance(config["context_type"], str):
            self._context_type = get_class(config["context_type"])
        else:
            self._context_type = config["context_type"]

        if not issubclass(self._context_type, Annotation):
            raise ValidationError(
                f"The provided context type {self._context_type} "
                f"is not an Annotation type."
            )

        self.batch_size = config["batch_size"]

        self.instance_pool.clear()
        self.feature_pool.clear()
        self.pool_size = 0
        self.batch_is_full = False
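A minimal sketch of the batcher configuration this initializer validates, assuming `batcher` is an instance of this class; `context_type` may be a class or its full path (resolved through `get_class`), and both keys must be non-None:

# Both keys are required; passing None for either raises AttributeError.
batcher.initialize(Config(
    {
        "context_type": "ft.onto.base_ontology.Sentence",
        "batch_size": 10,
    },
    default_hparams=None,
))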
Example #8
    def _process(self, input_pack: DataPack):
        for type_name in self.configs.removal_types:
            type_cls = get_class(type_name)

            # Note: we cannot delete entries during iteration, which may
            # cause unexpected behavior in the iterator, so we snapshot
            # the entries into a list first.
            for t in list(input_pack.get(type_cls)):
                input_pack.delete_entry(t)
Example #9
    def init_from_config(self, configs: Dict):
        """
        Initialize the pipeline with the given configurations.

        Args:
            configs: The configurations used to create the pipeline.

        """
        if "Reader" not in configs or configs["Reader"] is None:
            raise KeyError('No reader in the configuration')

        reader_config = configs["Reader"]

        reader, reader_hparams = create_class_with_kwargs(
            class_name=reader_config["type"],
            class_args=reader_config.get("kwargs", {}),
            h_params=reader_config.get("hparams", {}))

        self.set_reader(reader, reader_hparams)

        if "Processors" in configs and configs["Processors"] is not None:
            for processor_configs in configs["Processors"]:

                p_class = get_class(processor_configs["type"])
                if processor_configs.get("kwargs"):
                    processor_kwargs = processor_configs["kwargs"]
                else:
                    processor_kwargs = {}
                p = p_class(**processor_kwargs)

                hparams: Dict = {}

                if processor_configs.get("hparams"):
                    # Extract the hparams section and build hparams
                    processor_hparams = processor_configs["hparams"]

                    if processor_hparams.get("config_path"):
                        config_path = processor_hparams["config_path"]
                        with open(config_path) as config_file:
                            filebased_hparams = yaml.safe_load(config_file)
                    else:
                        filebased_hparams = {}
                    hparams.update(filebased_hparams)

                    if processor_hparams.get("overwrite_configs"):
                        overwrite_hparams = processor_hparams[
                            "overwrite_configs"]
                    else:
                        overwrite_hparams = {}
                    hparams.update(overwrite_hparams)
                default_processor_hparams = p_class.default_hparams()

                processor_hparams = HParams(hparams, default_processor_hparams)
                self.add_processor(p, processor_hparams)

            self.initialize()
Example #10
    def initialize(self, config: Union[Dict, Config]):
        # pylint: disable=attribute-defined-outside-init
        super().initialize(config=config)
        if self.config.attribute is None:
            raise AttributeError("attribute is required "
                                 "in BioSeqTaggingExtractor.")
        if not self.config.tagging_unit:
            raise AttributeError("tagging_unit is required in "
                                 "BioSeqTaggingExtractor.")
        self.attribute: str = self.config.attribute
        self.tagging_unit: Type[Annotation] = get_class(
            self.config.tagging_unit)
        self.is_bert: bool = self.config.is_bert
Example #11
    def initialize(self, config: Union[Dict, Config]):
        """
        Initialize the extractor based on the provided configuration.

        Args:
            config: The configuration of the extractor, it can be a `Dict` or
                :class:`~forte.common.configuration.Config`.
                See :meth:`default_configs` for available options and
                default values.
        """
        # pylint: disable=attribute-defined-outside-init
        super().initialize(config=config)
        if self.config.attribute is None:
            raise AttributeError("attribute is required "
                                 "in BioSeqTaggingExtractor.")
        if not self.config.tagging_unit:
            raise AttributeError("tagging_unit is required in "
                                 "BioSeqTaggingExtractor.")
        self._attribute: str = self.config.attribute
        self._tagging_unit: Type[Annotation] = get_class(
            self.config.tagging_unit)
        self._entry_type: Type[Annotation] = get_class(self.config.entry_type)
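A hedged configuration sketch for `BioSeqTaggingExtractor`, the class named in the error messages above; the ontology paths and attribute name are typical forte values but should be read as assumptions:

# Illustrative config; the class paths and attribute name are assumptions
# based on common forte usage, not taken from this snippet.
extractor = BioSeqTaggingExtractor()
extractor.initialize(config={
    "attribute": "ner_type",
    "tagging_unit": "ft.onto.base_ontology.Token",
    "entry_type": "ft.onto.base_ontology.EntityMention",
})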
Example #12
    def initialize(self, resources: Resources, configs: Config):
        # pylint: disable=attribute-defined-outside-init,unused-argument
        r"""Initialize the evaluator with `resources` and `configs`.
        This method is called by the pipeline during the initialization.

        Args:
            resources (Resources): An object of class
                :class:`~forte.common.Resources` that holds references to
                objects that can be shared throughout the pipeline.
            configs (Config): A configuration to initialize the
                evaluator. This evaluator is expected to hold the
                following (key, value) pairs:

                - `"entry_type"` (str): The entry type to be evaluated.
                - `"tagging_unit"` (str): The tagging unit that the
                  evaluation is performed on, e.g.
                  `"ft.onto.base_ontology.Sentence"`.
                - `"attribute"` (str): The attribute of the entry to be
                  evaluated.

        """
        super().initialize(resources, configs)
        self.entry_type = get_class(configs.entry_type)
        self.tagging_unit = get_class(configs.tagging_unit)
        self.attribute = configs.attribute
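A sketch of the evaluator configs described in the docstring, assuming `evaluator` is an instance of this class; `tagging_unit` reuses the docstring's own example, while the other values are illustrative assumptions:

# Keys follow the docstring above; entry_type/attribute values are assumed.
evaluator.initialize(
    Resources(),
    Config(
        {
            "entry_type": "ft.onto.base_ontology.EntityMention",
            "tagging_unit": "ft.onto.base_ontology.Sentence",
            "attribute": "ner_type",
        },
        default_hparams=None,
    ),
)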
Example #13
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        if not self.configs.entry_type:
            raise ProcessorConfigError("Please specify an entity mention type!")

        self.entry_type = get_class(self.configs.entry_type)

        if not issubclass(self.entry_type, Annotation):
            raise AttributeError(
                f"The entry type to delete [{self.entry_type}] "
                f"is not a sub-class of "
                f"'forte.data.ontology.top.Annotation' class."
            )
Example #14
    def init_from_config(self, configs: Dict):
        """
        Initialize the pipeline with the given configurations.

        Args:
            configs: The configurations used to create the pipeline.

        """
        # HParams cannot be built from the inner dicts of a list, so each
        # processor's configuration is parsed individually.

        if "Processors" in configs and configs["Processors"] is not None:
            for processor_configs in configs["Processors"]:

                p_class = get_class(processor_configs["type"])
                if processor_configs.get("kwargs"):
                    processor_kwargs = processor_configs["kwargs"]
                else:
                    processor_kwargs = {}
                p = p_class(**processor_kwargs)

                hparams: Dict = {}

                if processor_configs.get("hparams"):
                    # Extract the hparams section and build hparams
                    processor_hparams = processor_configs["hparams"]

                    if processor_hparams.get("config_path"):
                        config_path = processor_hparams["config_path"]
                        with open(config_path) as config_file:
                            filebased_hparams = yaml.safe_load(config_file)
                    else:
                        filebased_hparams = {}
                    hparams.update(filebased_hparams)

                    if processor_hparams.get("overwrite_configs"):
                        overwrite_hparams = processor_hparams[
                            "overwrite_configs"]
                    else:
                        overwrite_hparams = {}
                    hparams.update(overwrite_hparams)
                default_processor_hparams = p_class.default_hparams()

                processor_hparams = HParams(hparams,
                                            default_processor_hparams)
                self.add_processor(p, processor_hparams)

            self.initialize()
Example #15
    def initialize(self, config: Union[Dict, Config]):
        # pylint: disable=attribute-defined-outside-init
        self.config = Config(config, self.default_configs())
        if self.config.entry_type is None:
            raise AttributeError("`entry_type` needs to be specified in "
                                 "the configuration of an extractor.")
        self._entry_type = get_class(self.config.entry_type)

        if self.config.vocab_method != "custom":
            self._vocab = Vocabulary(
                method=self.config.vocab_method,
                use_pad=self.config.need_pad,
                use_unk=self.config.vocab_use_unk,
                pad_value=self.config.pad_value,
                unk_value=self.config.unk_value,
            )
        else:
            self._vocab = None
        self._vocab_method = self.config.vocab_method
Example #16
    def _process(self, input_pack: DataPack):
        """Perform HuggingFace NER Pipeline on the input data pack.

        Args:
            input_pack: Input pack to fill
        Returns:
        """
        if not self.configs.entry_type:
            raise ProcessorConfigError("Please specify an input entry type!")

        output_entry = get_class(self.configs.output_entry_type)

        for entry_specified in input_pack.get(self.configs.entry_type):
            result = self.classifier(entry_specified.text)

            if self.configs.tagging_scheme == "bio-merge":  # Merge BIO tagging
                result_types, result_indices = self._merge_bio_tokens(result)

            elif self.configs.tagging_scheme == "no-merge":
                result_indices = []
                result_types = []
                for token in result:
                    start, end = token["start"], token["end"]
                    result_types.append(token["entity"])
                    result_indices.append((start, end))
            else:
                raise ProcessorConfigError(
                    f"The tagging_scheme strategy "
                    f"{self.configs.tagging_scheme} was not defined. "
                    f"Please check your input config.")

            for entity_type, (start, end) in zip(
                    result_types, result_indices):
                entity = output_entry(
                    pack=input_pack,
                    begin=entry_specified.span.begin + int(start),
                    end=entry_specified.span.begin + int(end),
                )
                setattr(entity, self.configs.attribute_name, entity_type)
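For clarity, the `result` consumed above is a list of per-token dicts produced by the HuggingFace pipeline; this sketch shows the fields the code reads, with fabricated values, and how the two tagging schemes treat them:

# Shape of the classifier output as consumed above; values are fabricated.
result = [
    {"entity": "B-PER", "start": 0, "end": 4},
    {"entity": "I-PER", "start": 5, "end": 10},
]
# "no-merge" yields two spans tagged B-PER and I-PER; "bio-merge" is
# assumed to combine them (via _merge_bio_tokens) into one "PER" span.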
Example #17
    def get(  # type: ignore
        self,
        entry_type: Union[str, Type[EntryType]],
        components: Optional[Union[str, List[str]]] = None,
        include_sub_type=True,
    ) -> Iterator[EntryType]:
        """Get entries of `entry_type` from this multi pack.

        Example:

        .. code-block:: python

            for relation in pack.get(
                                CrossDocEntityRelation,
                                components="relation_creator"
                                ):
                print(relation.get_parent())

        In the above code snippet, we get entries of type
        ``CrossDocEntityRelation`` which were generated by a component
        named ``relation_creator``.

        Args:
            entry_type (type): The type of the entries requested.
            components (str or list, optional): The component generating the
                entries requested. If `None`, all valid entries generated by
                any component will be returned.
            include_sub_type (bool): whether to return the sub types of the
                queried `entry_type`. True by default.

        Returns: An iterator of the entries matching the arguments, following
        the order of entries (first sort by entry comparison, then by
        insertion)

        """
        entry_type_: Type[EntryType]
        if isinstance(entry_type, str):
            entry_type_ = get_class(entry_type)
            if not issubclass(entry_type_, Entry):
                raise AttributeError(
                    f"The specified entry type [{entry_type}] "
                    f"does not correspond to a "
                    f"'forte.data.ontology.core.Entry' class"
                )
        else:
            entry_type_ = entry_type

        entry_iter: Iterator[Entry]

        if not include_sub_type:
            entry_iter = self.get_entries_of(entry_type_)
        elif issubclass(entry_type_, MultiPackLink):
            entry_iter = self.links
        elif issubclass(entry_type_, MultiPackGroup):
            entry_iter = self.groups
        elif issubclass(entry_type_, MultiPackGeneric):
            entry_iter = self.generics
        else:
            raise ValueError(
                f"The entry type: {entry_type_} is not supported by MultiPack."
            )

        all_types: Set[Type]
        if include_sub_type:
            all_types = self._expand_to_sub_types(entry_type_)

        if components is not None:
            if isinstance(components, str):
                components = [components]

        for e in entry_iter:
            # Will check for the type matching if sub types are also requested.
            if include_sub_type and type(e) not in all_types:
                continue

            # Check for the component.
            if components is not None:
                if not self.is_created_by(e, components):
                    continue

            yield e  # type: ignore
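A short usage sketch of the string form of `entry_type`, which this method resolves through `get_class`; the full class path is an assumption extrapolated from the docstring's example:

# String entry types are resolved via get_class; this path is assumed.
for relation in pack.get(
    "ft.onto.base_ontology.CrossDocEntityRelation",
    components="relation_creator",
):
    print(relation.get_parent())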
Example #18
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        for entry_type, entry_attributes in self.configs.requests.items():
            entry_class = get_class(entry_type)
            self.fields[entry_class] = entry_attributes
Example #19
    def initialize(self, resources: Resources, configs: HParams):
        super().initialize(resources, configs)
        cls = utils.get_class(self.config.indexer.name,
                              module_paths=["forte.indexers"])
        self.indexer = cls(hparams=self.config.indexer.hparams)
Example #20
    def _parse_request(self, request: Dict):
        """
        This method has two responsibilities:
        1. parse the given data request and stored it internally
        2. validate if the given data request is valid
        """
        parsed_request: Dict[str, Any] = {}

        assert "scope" in request, "Field not found for data request: `scope`"
        assert (
            "feature_scheme" in request
        ), "Field not found for data request: `feature_scheme`"

        parsed_request["scope"] = get_class(request["scope"])
        parsed_request["schemes"] = {}

        # Used for check dependency between different extractors
        scheme_group: Dict[str, Dict] = {"dependent": {}, "dependee": {}}

        for tag, scheme in request["feature_scheme"].items():
            assert (
                "extractor" in scheme
            ), "Field not found for data request scheme: `extractor`"
            parsed_request["schemes"][tag] = {}

            assert (
                "type" in scheme
            ), "Field not found for data request scheme: `type`"
            assert scheme["type"] in [
                "data_input",
                "data_output",
            ], "Type field must be either data_input or data_output."
            if scheme["type"] == "data_input":
                parsed_request["schemes"][tag][
                    "type"
                ] = TrainPreprocessor.DATA_INPUT
            if scheme["type"] == "data_output":
                parsed_request["schemes"][tag][
                    "type"
                ] = TrainPreprocessor.DATA_OUTPUT

            extractor_class = get_class(scheme["extractor"]["class_name"])
            extractor: BaseExtractor = extractor_class()
            if not isinstance(extractor, BaseExtractor):
                raise RuntimeError("Invalid extractor: ", scheme["extractor"])

            extractor.initialize(config=scheme["extractor"]["config"])
            parsed_request["schemes"][tag]["extractor"] = extractor

            # Track dependency
            if hasattr(extractor, "based_on"):
                if extractor.entry_type not in scheme_group["dependent"]:
                    scheme_group["dependent"][extractor.entry_type] = set()
                scheme_group["dependent"][extractor.entry_type].add(extractor)
            else:
                if extractor.entry_type not in scheme_group["dependee"]:
                    scheme_group["dependee"][extractor.entry_type] = set()
                scheme_group["dependee"][extractor.entry_type].add(extractor)

            # Create default converter if there is no given converter
            if "converter" not in scheme:
                converter: Converter = Converter({})
                parsed_request["schemes"][tag]["converter"] = converter

        # Check dependency
        for _, dependent_extractors in scheme_group["dependent"].items():
            for dependent_extractor in dependent_extractors:
                based_on: Entry = dependent_extractor.based_on
                if based_on not in scheme_group["dependee"]:
                    raise ValueError(
                        "Extractor {} needs the entry {} to do extraction "
                        "processing but it is not extracted by any other "
                        "extractors given in request".format(
                            based_on, dependent_extractor.tag
                        )
                    )

        self._request = parsed_request
        self._request_ready = True
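A sketch of a request that exercises the dependency check, assuming `train_preprocessor` is an instance of this class: the second scheme's extractor is assumed to expose a `based_on` attribute pointing at the entry type the first scheme extracts. All class paths are placeholders:

# Placeholder request. The dependency check requires every extractor's
# based_on entry type to be produced by some other scheme in the request.
request = {
    "scope": "ft.onto.base_ontology.Sentence",
    "feature_scheme": {
        "token_tag": {
            "type": "data_input",
            "extractor": {
                "class_name": "my_project.extractors.TokenExtractor",
                "config": {"entry_type": "ft.onto.base_ontology.Token"},
            },
        },
        "label_tag": {
            "type": "data_output",
            "extractor": {
                # Assumed: exposes based_on == ft.onto.base_ontology.Token
                "class_name": "my_project.extractors.TokenTagExtractor",
                "config": {},
            },
        },
    },
}
train_preprocessor._parse_request(request)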
Example #21
    def _process(self, input_pack: DataPack):
        for type_name in self.configs.removal_types:
            type_cls = get_class(type_name)

            # Snapshot the entries first: deleting during iteration can
            # cause unexpected behavior in the iterator (see Example #8).
            for t in list(input_pack.get(type_cls)):
                input_pack.delete_entry(t)