Example #1
def add_extractor(
    request: Dict,
    name: str,
    extractor: BaseExtractor,
    is_input: bool,
    converter: Optional[Converter] = None,
):
    """
    Extractors can be added to the preprocessor directly via this
    method.

    Args:
        request: A request dictionary to be populated.
        name: The name/identifier of this extractor; it should be unique
          across different extractors.
        extractor: The extractor instance to be added.
        is_input: Whether this extractor will be used as input or output.
        converter: The converter instance to be applied after running
          the extractor.

    Returns:

    """
    request["schemes"][name]["extractor"] = extractor
    request["schemes"][name]["type"] = DATA_INPUT if is_input else DATA_OUTPUT
    request["schemes"][name]["converter"] = (
        Converter({}) if converter is None else converter
    )
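A minimal usage sketch, reusing only names from the snippet above. Note that `request["schemes"][name]` must already exist as a dictionary (or be backed by a `defaultdict(dict)`) before the assignments succeed; `some_extractor` stands in for any already-constructed `BaseExtractor` instance and is hypothetical here:

from collections import defaultdict

request = {"schemes": defaultdict(dict)}

add_extractor(
    request,
    name="text_tag",
    extractor=some_extractor,   # hypothetical BaseExtractor instance
    is_input=True,              # stored as DATA_INPUT
    converter=None,             # falls back to the default Converter({})
)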
Example #2
    def _parse_request(self, request: Dict):
        """
        This method has two responsibilities:
        1. Parse the given data request and store it internally.
        2. Validate the given data request.
        """

        assert "scope" in request, \
            "Field not found for data request: `scope`"
        assert "schemes" in request, \
            "Field not found for data request: `schemes`"

        resource_schemes: Dict[str, Dict] = {}
        # Used to check dependencies between different extractors
        scheme_group: Dict[str, Dict] = {
            "dependent": {}, "dependee": {}
        }

        for tag, scheme in request["schemes"].items():
            assert "extractor" in scheme, \
                "Field not found for data request scheme: `extractor`"
            assert "type" in scheme, \
                "Field not found for data request scheme: `type`"
            resource_schemes[tag] = {}

            if not isinstance(scheme["extractor"], BaseExtractor):
                raise RuntimeError("Invalid extractor: ", scheme["extractor"])

            extractor: BaseExtractor = scheme["extractor"]

            # Track dependency
            if hasattr(extractor, "based_on"):
                if extractor.entry_type not in scheme_group["dependent"]:
                    scheme_group["dependent"][extractor.entry_type] = set()
                scheme_group["dependent"][extractor.entry_type].add(
                    extractor)
            else:
                if extractor.entry_type not in scheme_group["dependee"]:
                    scheme_group["dependee"][extractor.entry_type] = set()
                scheme_group["dependee"][extractor.entry_type].add(
                    extractor)

            # Create default converter if there is no given converter
            if "converter" not in scheme:
                converter: Converter = Converter({})
                scheme["converter"] = converter

        # Check dependency
        for _, dependent_extractors in scheme_group["dependent"].items():
            for dependent_extractor in dependent_extractors:
                based_on: Entry = dependent_extractor.based_on
                if based_on not in scheme_group["dependee"]:
                    raise ValueError(
                        "Extractor {} needs the entry {} to do extraction "
                        "processing but it is not extracted by any other "
                        "extractors given in request".format(
                            dependent_extractor.tag, based_on))

        self._request = request
        self._request_ready = True
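The shape of a request that satisfies the checks above can be sketched as follows. The extractor objects are placeholders for already-constructed BaseExtractor instances, and DATA_INPUT/DATA_OUTPUT refer to the same constants used in Example #1:

request = {
    "scope": Sentence,
    "schemes": {
        "text_tag": {
            "extractor": text_extractor,   # plain extractor, no `based_on`
            "type": DATA_INPUT,
            # "converter" omitted: a default Converter({}) is created
        },
        "ner_tag": {
            "extractor": ner_extractor,    # if it declares `based_on`, that
            "type": DATA_OUTPUT,           # entry must be extracted by another
        },                                 # scheme, or a ValueError is raised
    },
}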
    def _parse_configs(self, configs):
        parsed_configs = self.default_configs()
        parsed_configs["batch_size"] = configs.batch_size
        parsed_configs["scope"] = get_class(configs.scope)
        parsed_configs["do_eval"] = configs.do_eval
        parsed_configs["feature_scheme"] = {}
        for tag, scheme in configs.feature_scheme.items():
            parsed_configs["feature_scheme"][tag] = {}
            if scheme["type"] == "data_input":
                parsed_configs["feature_scheme"][tag][
                    "type"] = TrainPreprocessor.DATA_INPUT
            elif scheme["type"] == "data_output":
                parsed_configs["feature_scheme"][tag][
                    "type"] = TrainPreprocessor.DATA_OUTPUT

            extractor = get_class(scheme["extractor"]["class_name"])()
            extractor.initialize(config=scheme["extractor"]["config"])
            if "vocab_path" in scheme["extractor"]:
                vocab_file = open(scheme["extractor"]["vocab_path"], "rb")
                extractor.vocab = pickle.load(vocab_file)
                vocab_file.close()
            parsed_configs["feature_scheme"][tag]["extractor"] = extractor

            if "converter" not in scheme:
                parsed_configs["feature_scheme"][tag]["converter"] = Converter(
                    {})
            else:
                parsed_configs["feature_scheme"][tag]["converter"] = scheme[
                    "converter"]
        return Config(parsed_configs, default_hparams=self.default_configs())
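For reference, the configuration consumed by _parse_configs above could look roughly like the dictionary below; it is accessed with attribute syntax (configs.batch_size, configs.scope, ...), so it is assumed to be wrapped in a Config-like object. All class paths and file names are placeholders:

raw_configs = {
    "batch_size": 32,
    "scope": "ft.onto.base_ontology.Sentence",       # resolved via get_class()
    "do_eval": True,
    "feature_scheme": {
        "text_tag": {
            "type": "data_input",                     # or "data_output"
            "extractor": {
                "class_name": "some.module.TextExtractor",  # placeholder path
                "config": {},
                "vocab_path": "vocab.pkl",            # optional, pickled vocab
            },
            # "converter" omitted: a default Converter({}) is used
        },
    },
}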
Example #4
    def test_convert_no_pad(self):
        features1: List[Feature] = self.create_features1(need_pad=False)

        converter: Converter = Converter({"to_numpy": False,
                                          "to_torch": False})
        data, _ = converter.convert(features1)
        self.assertTrue(
            np.array_equal(data,
                           [[7, 8, 9], [1, 2, 5, 6], [4]]))
Example #5
    def test_state(self):
        converter_states = {"to_numpy": True, "to_torch": False}
        converter: Converter = Converter(converter_states)

        # Test state.
        self.assertEqual(converter.state, converter_states)

        # Test save & load state.
        tmp_state_file = ".tmp_converter_state"
        torch.save(converter.state, tmp_state_file)
        self.assertTrue(os.path.exists(tmp_state_file))

        recover_converter: Converter = Converter({})
        recover_converter.load_state(torch.load(tmp_state_file))

        self.assertEqual(recover_converter.state, converter_states)

        os.remove(tmp_state_file)
        self.assertFalse(os.path.exists(tmp_state_file))
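The same pattern extends naturally to checkpointing: since converter.state compares equal to a plain dictionary, it can be stored next to other training state and restored with load_state. A minimal sketch; the model and file name are hypothetical:

checkpoint = {
    "model": model.state_dict(),        # hypothetical torch model
    "converter": converter.state,
}
torch.save(checkpoint, "checkpoint.pt")

restored = Converter({})
restored.load_state(torch.load("checkpoint.pt")["converter"])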
Example #6
    def test_convert_no_to_torch(self):
        features1: List[Feature] = self.create_features1()

        converter: Converter = Converter({"to_torch": False})
        data, _ = converter.convert(features1)
        self.assertNotEqual(type(data), torch.Tensor)
        self.assertTrue(
            np.array_equal(data,
                           np.array([[7, 8, 9, 0], [1, 2, 5, 6], [4, 0, 0, 0]],
                                    dtype=np.int64)))
    def test_convert_no_pad_but_to_torch(self):
        features1: List[Feature] = self.create_features1(data_list=[[7], [1],
                                                                    [4]],
                                                         need_pad=False)

        converter: Converter = Converter()
        data, _ = converter.convert(features1)
        self.assertTrue(
            torch.allclose(data, torch.tensor([[7], [1], [4]],
                                              dtype=torch.long)))
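Examples #4 through #6 exercise the main converter options; padding itself is controlled by the features' need_pad flag rather than by the converter. A compact summary of the constructor settings used in these tests:

Converter({"to_numpy": False, "to_torch": False})  # keep data as nested Python lists
Converter({"to_torch": False})                     # collate into a numpy array
Converter()                                        # default: produce a torch tensor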
Example #8
def parse_feature_extractors(scheme_configs: Config) -> Dict[str, Any]:
    feature_requests: Dict[str, Any] = {}

    for tag, scheme_config in scheme_configs.items():
        assert (
            "extractor" in scheme_config
        ), "Field not found for data request scheme: `extractor`"
        assert (
            "type" in scheme_config
        ), "Field not found for data request scheme: `type`"
        assert scheme_config["type"] in [
            "data_input",
            "data_output",
        ], "Type field must be either data_input or data_output."

        feature_requests[tag] = {}

        if scheme_config["type"] == "data_input":
            feature_requests[tag]["type"] = DATA_INPUT
        elif scheme_config["type"] == "data_output":
            feature_requests[tag]["type"] = DATA_OUTPUT

        extractor_class = get_class(scheme_config["extractor"]["class_name"])
        extractor: BaseExtractor = extractor_class()
        if not isinstance(extractor, BaseExtractor):
            raise RuntimeError(
                "Invalid extractor: ", scheme_config["extractor"]
            )

        extractor.initialize(config=scheme_config["extractor"]["config"])

        # Load vocab from disk if provided.
        if "vocab_path" in scheme_config["extractor"]:
            with open(
                scheme_config["extractor"]["vocab_path"], "rb"
            ) as vocab_file:
                extractor.vocab = pickle.load(vocab_file)

        feature_requests[tag]["extractor"] = extractor

        if "converter" not in scheme_config:
            # Create default converter if there is no given converter
            feature_requests[tag]["converter"] = Converter({})
        else:
            converter_class = get_class(
                scheme_config["converter"]["class_name"]
            )
            converter: Converter = converter_class()
            if not isinstance(converter, Converter):
                raise RuntimeError(
                    "Invalid converter: ", scheme_config["converter"]
                )
            feature_requests[tag]["converter"] = converter

    return feature_requests
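A single entry of the scheme_configs accepted by this function might look like the following; unlike _parse_configs above, the converter can also be given by class name and is instantiated here. Class paths are placeholders:

scheme_configs = {
    "text_tag": {
        "type": "data_input",                         # or "data_output"
        "extractor": {
            "class_name": "some.module.TextExtractor",    # placeholder path
            "config": {},
            "vocab_path": "vocab.pkl",                # optional, pickled vocab
        },
        # optional; defaults to Converter({}) when omitted
        "converter": {"class_name": "some.module.MyConverter"},
    },
}
feature_requests = parse_feature_extractors(scheme_configs)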
Example #9
    def test_pipeline1(self, batch_size):
        """Tests a chain of Batch->Pack->Batch with different batch sizes."""

        data_path = data_samples_root + "/random_texts/0.txt"
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(SentenceReader())
        pipeline.initialize()

        text_extractor = AttributeExtractor({
            "need_pad": True,
            "entry_type": Token,
            "attribute": "text",
        })
        for pack in pipeline.process_dataset(data_path):
            for instance in pack.get(Sentence):
                text_extractor.update_vocab(pack, instance)

        model = DummyModel()
        predictor = DummyPredictor()
        predictor_config = {
            "scope": Sentence,
            "batch_size": batch_size,
            "feature_scheme": {
                "text_tag": {
                    "extractor": text_extractor,
                    "converter": Converter(),
                    "type": TrainPreprocessor.DATA_INPUT
                },
            },
        }
        predictor.load(model)

        nlp = Pipeline[DataPack]()
        reader = SentenceReader()
        nlp.set_reader(reader)
        nlp.add(predictor, config=predictor_config)
        nlp.add(DummyEvaluator())
        nlp.initialize()
        num_packs = 0
        for _ in nlp.process_dataset(data_path):
            num_packs += 1

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
Example #10
    def test_FixedSizeDataPackBatcherWithExtractor(self):
        r"""This funciton tests the corectness of cross_pack."""
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(CoNLL03Reader())
        pipeline.initialize()

        text_extractor = AttributeExtractor()
        text_extractor.initialize({
            "need_pad": True,
            "entry_type": "ft.onto.base_ontology.Token",
            "attribute": "text",
        })

        pack_num = 0
        for pack in pipeline.process_dataset(self.dataset_path):
            pack_num += 1
            for instance in pack.get(Sentence):
                text_extractor.update_vocab(pack, instance)
        self.assertEqual(pack_num, 2)

        batch_size = 2
        batcher = FixedSizeDataPackBatcherWithExtractor()
        batcher.initialize({
            "context_type": Sentence,
            "batch_size": batch_size,
            "feature_scheme": {
                "text_tag": {
                    "extractor": text_extractor,
                    "converter": Converter(),
                    "type": TrainPreprocessor.DATA_INPUT,
                }
            },
        })

        batch_num = 0
        for pack in pipeline.process_dataset(self.dataset_path):
            for batch in batcher.get_batch(pack):
                batch_num += 1
                self.assertEqual(len(batch[0]), batch_size)
        for _ in batcher.flush():
            batch_num += 1
        self.assertEqual(batch_num, 1)
Example #11
    def setUp(self):
        self.converter: Converter = Converter({})
Example #12
    def test_Predictor(self):
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(CoNLL03Reader())
        pipeline.initialize()

        text_extractor = AttributeExtractor({
            "need_pad": True,
            "entry_type": Token,
            "attribute": "text",
        })
        for pack in pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                text_extractor.update_vocab(pack, instance)

        ner_extractor = BioSeqTaggingExtractor({
            "entry_type": EntityMention,
            "need_pad": True,
            "attribute": "ner_type",
            "tagging_unit": Token,
        })
        for pack in pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                ner_extractor.update_vocab(pack, instance)

        FAKEOUTPUT = 2
        expected_ners = [
            ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30)
        ]

        class Model:
            def __call__(self, batch):
                text_feature = batch["text_tag"]["data"]
                return {
                    "ner_tag":
                    [[FAKEOUTPUT for j in range(len(text_feature[0]))]
                     for i in range(len(text_feature))]
                }

        model = Model()

        class NERPredictor(Predictor):
            def predict(self, batch):
                return self.model(batch)

        predictor = NERPredictor()

        predictor_pipeline = Pipeline[DataPack]()
        predictor_pipeline.set_reader(CoNLL03Reader())

        predictor_config = {
            "scope": Sentence,
            "batch_size": 2,
            "feature_scheme": {
                "text_tag": {
                    "extractor": text_extractor,
                    "converter": Converter({}),
                    "type": TrainPreprocessor.DATA_INPUT
                },
                "ner_tag": {
                    "extractor": ner_extractor,
                    "converter": Converter({}),
                    "type": TrainPreprocessor.DATA_OUTPUT
                },
            },
        }
        predictor.load(model)
        predictor_pipeline.add(predictor, predictor_config)
        predictor_pipeline.initialize()

        for pack in predictor_pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                ners = [
                    e.ner_type for e in list(pack.get(EntityMention, instance))
                ]
                self.assertListEqual(ners, expected_ners)
Example #13
    def test_Predictor(self):
        pipeline = Pipeline[DataPack]()
        pipeline.set_reader(CoNLL03Reader())
        pipeline.initialize()

        text_extractor = AttributeExtractor({
            "need_pad": True,
            "entry_type": Token,
            "attribute": "text",
        })
        for pack in pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                text_extractor.update_vocab(pack, instance)

        ner_extractor = BioSeqTaggingExtractor({
            "entry_type": EntityMention,
            "need_pad": True,
            "attribute": "ner_type",
            "tagging_unit": Token,
        })
        for pack in pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                ner_extractor.update_vocab(pack, instance)

        expected_ners = [
            ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30)]

        model = DummyModel()

        predictor_pipeline = Pipeline[DataPack]()
        predictor_pipeline.set_reader(CoNLL03Reader())

        predictor_config = {
            "scope": Sentence,
            "batch_size": 2,
            "feature_scheme": {
                "text_tag": {
                    "extractor": text_extractor,
                    "converter": Converter(),
                    "type": TrainPreprocessor.DATA_INPUT
                },
                "ner_tag": {
                    "extractor": ner_extractor,
                    "converter": Converter(),
                    "type": TrainPreprocessor.DATA_OUTPUT
                },
            },
        }

        # dummy = DummyRelationExtractor()
        # config = {"batcher": {"batch_size": 5}}

        predictor = NERPredictor()
        predictor.load(model)
        predictor_pipeline.add(predictor, predictor_config)
        # predictor_pipeline.add(dummy, config)

        predictor_pipeline.add(CoNLLNEREvaluator())

        predictor_pipeline.initialize()
        for pack in predictor_pipeline.process_dataset(self.dataset_path):
            for instance in pack.get(Sentence):
                ners = [e.ner_type for e in
                        list(pack.get(EntityMention, instance))]
                self.assertListEqual(ners, expected_ners)
    def _parse_request(self, request: Dict):
        """
        This method has two responsibilities:
        1. Parse the given data request and store it internally.
        2. Validate the given data request.
        """
        parsed_request: Dict[str, Any] = {}

        assert "scope" in request, "Field not found for data request: `scope`"
        assert (
            "feature_scheme" in request
        ), "Field not found for data request: `schemes`"

        parsed_request["scope"] = get_class(request["scope"])
        parsed_request["schemes"] = {}

        # Used to check dependencies between different extractors
        scheme_group: Dict[str, Dict] = {"dependent": {}, "dependee": {}}

        for tag, scheme in request["feature_scheme"].items():
            assert (
                "extractor" in scheme
            ), "Field not found for data request scheme: `extractor`"
            parsed_request["schemes"][tag] = {}

            assert (
                "type" in scheme
            ), "Field not found for data request scheme: `type`"
            assert scheme["type"] in [
                "data_input",
                "data_output",
            ], "Type field must be either data_input or data_output."
            if scheme["type"] == "data_input":
                parsed_request["schemes"][tag][
                    "type"
                ] = TrainPreprocessor.DATA_INPUT
            if scheme["type"] == "data_output":
                parsed_request["schemes"][tag][
                    "type"
                ] = TrainPreprocessor.DATA_OUTPUT

            extractor_class = get_class(scheme["extractor"]["class_name"])
            extractor: BaseExtractor = extractor_class()
            if not isinstance(extractor, BaseExtractor):
                raise RuntimeError("Invalid extractor: ", scheme["extractor"])

            extractor.initialize(config=scheme["extractor"]["config"])
            parsed_request["schemes"][tag]["extractor"] = extractor

            # Track dependency
            if hasattr(extractor, "based_on"):
                if extractor.entry_type not in scheme_group["dependent"]:
                    scheme_group["dependent"][extractor.entry_type] = set()
                scheme_group["dependent"][extractor.entry_type].add(extractor)
            else:
                if extractor.entry_type not in scheme_group["dependee"]:
                    scheme_group["dependee"][extractor.entry_type] = set()
                scheme_group["dependee"][extractor.entry_type].add(extractor)

            # Create default converter if there is no given converter
            if "converter" not in scheme:
                converter: Converter = Converter({})
                parsed_request["schemes"][tag]["converter"] = converter

        # Check dependency
        for _, dependent_extractors in scheme_group["dependent"].items():
            for dependent_extractor in dependent_extractors:
                based_on: Entry = dependent_extractor.based_on
                if based_on not in scheme_group["dependee"]:
                    raise ValueError(
                        "Extractor {} needs the entry {} to do extraction "
                        "processing but it is not extracted by any other "
                        "extractors given in request".format(
                            dependent_extractor.tag, based_on
                        )
                    )

        self._request = parsed_request
        self._request_ready = True
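For contrast with the _parse_request in Example #2, the variant above expects class names rather than instances and string type values; a request it accepts could be sketched as follows (class paths are placeholders):

request = {
    "scope": "ft.onto.base_ontology.Sentence",        # resolved via get_class()
    "feature_scheme": {
        "text_tag": {
            "type": "data_input",                      # or "data_output"
            "extractor": {
                "class_name": "some.module.TextExtractor",   # placeholder path
                "config": {},
            },
            # "converter" omitted: a default Converter({}) is created
        },
    },
}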