def add_extractor(
    request: Dict,
    name: str,
    extractor: BaseExtractor,
    is_input: bool,
    converter: Optional[Converter] = None,
):
    """
    Extractors can be added to the preprocessor directly via this method.

    Args:
        request: A request dictionary to be populated.
        name: The name/identifier of this extractor. Each extractor
            added to the request must have a unique name.
        extractor: The extractor instance to be added.
        is_input: Whether this extractor extracts input features or
            output features.
        converter: The converter instance to be applied after running
            the extractor. If None, a default converter is created.
    """
    # Create the scheme entry if it does not exist yet, so that the key
    # accesses below cannot raise a KeyError.
    scheme = request["schemes"].setdefault(name, {})
    scheme["extractor"] = extractor
    scheme["type"] = DATA_INPUT if is_input else DATA_OUTPUT
    scheme["converter"] = Converter({}) if converter is None else converter
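# A minimal usage sketch for `add_extractor`, assuming the import paths
# below (hypothetical) and that `AttributeExtractor` takes its config in
# the constructor, as in the tests further down.
from typing import Dict

from ft.onto.base_ontology import Sentence, Token
from forte.data.extractors.attribute_extractor import AttributeExtractor

request: Dict = {"scope": Sentence, "schemes": {}}
add_extractor(
    request,
    name="text_tag",
    extractor=AttributeExtractor({
        "need_pad": True,
        "entry_type": Token,
        "attribute": "text",
    }),
    is_input=True,
)
# The scheme now holds the extractor, its type, and a default converter.
assert request["schemes"]["text_tag"]["type"] == DATA_INPUT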
def _parse_request(self, request: Dict):
    """
    This method has two responsibilities:
    1. parse the given data request and store it internally
    2. validate that the given data request is well-formed
    """
    assert "scope" in request, \
        "Field not found for data request: `scope`"
    assert "schemes" in request, \
        "Field not found for data request: `schemes`"

    # Used to check dependencies between different extractors.
    scheme_group: Dict[str, Dict] = {
        "dependent": {}, "dependee": {}
    }

    for tag, scheme in request["schemes"].items():
        assert "extractor" in scheme, \
            "Field not found for data request scheme: `extractor`"
        assert "type" in scheme, \
            "Field not found for data request scheme: `type`"

        if not isinstance(scheme["extractor"], BaseExtractor):
            raise RuntimeError("Invalid extractor: ", scheme["extractor"])

        extractor: BaseExtractor = scheme["extractor"]

        # Track dependency: an extractor exposing `based_on` depends on
        # the entries produced by some other extractor.
        if hasattr(extractor, "based_on"):
            if extractor.entry_type not in scheme_group["dependent"]:
                scheme_group["dependent"][extractor.entry_type] = set()
            scheme_group["dependent"][extractor.entry_type].add(
                extractor)
        else:
            if extractor.entry_type not in scheme_group["dependee"]:
                scheme_group["dependee"][extractor.entry_type] = set()
            scheme_group["dependee"][extractor.entry_type].add(
                extractor)

        # Create a default converter if none is given.
        if "converter" not in scheme:
            scheme["converter"] = Converter({})

    # Check dependency: every `based_on` entry must be produced by some
    # other extractor in the request.
    for _, dependent_extractors in scheme_group["dependent"].items():
        for dependent_extractor in dependent_extractors:
            based_on: Entry = dependent_extractor.based_on
            if based_on not in scheme_group["dependee"]:
                raise ValueError(
                    "Extractor {} needs the entry {} to do extraction "
                    "processing but it is not extracted by any other "
                    "extractors given in request".
                    format(dependent_extractor.tag, based_on))

    self._request = request
    self._request_ready = True
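# A minimal request that `_parse_request` accepts, reusing the
# constructor-config style of `AttributeExtractor` from the tests below;
# the tag name is hypothetical. If a scheme's extractor exposed a
# `based_on` entry that no other scheme extracts, parsing would raise
# the ValueError at the end of the method.
request = {
    "scope": Sentence,
    "schemes": {
        "text_tag": {
            "extractor": AttributeExtractor({
                "need_pad": True,
                "entry_type": Token,
                "attribute": "text",
            }),
            "type": DATA_INPUT,
            # "converter" omitted: a default Converter({}) is filled in.
        },
    },
}
preprocessor._parse_request(request)  # hypothetical preprocessor instance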
def _parse_configs(self, configs):
    parsed_configs = self.default_configs()
    parsed_configs["batch_size"] = configs.batch_size
    parsed_configs["scope"] = get_class(configs.scope)
    parsed_configs["do_eval"] = configs.do_eval
    parsed_configs["feature_scheme"] = {}
    for tag, scheme in configs.feature_scheme.items():
        parsed_configs["feature_scheme"][tag] = {}
        if scheme["type"] == "data_input":
            parsed_configs["feature_scheme"][tag][
                "type"] = TrainPreprocessor.DATA_INPUT
        elif scheme["type"] == "data_output":
            parsed_configs["feature_scheme"][tag][
                "type"] = TrainPreprocessor.DATA_OUTPUT
        else:
            raise ValueError(
                "Type field must be either data_input or data_output.")

        extractor = get_class(scheme["extractor"]["class_name"])()
        extractor.initialize(config=scheme["extractor"]["config"])

        # Load a pre-built vocabulary from disk if one is provided.
        if "vocab_path" in scheme["extractor"]:
            with open(scheme["extractor"]["vocab_path"], "rb") as vocab_file:
                extractor.vocab = pickle.load(vocab_file)
        parsed_configs["feature_scheme"][tag]["extractor"] = extractor

        # Create a default converter if none is given.
        if "converter" not in scheme:
            parsed_configs["feature_scheme"][tag]["converter"] = \
                Converter({})
        else:
            parsed_configs["feature_scheme"][tag]["converter"] = \
                scheme["converter"]
    return Config(parsed_configs, default_hparams=self.default_configs())
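# A sketch of the `configs` object `_parse_configs` consumes, assuming a
# dict-backed `Config` with attribute access as used elsewhere in this
# module; the class path, tag, and vocab path are hypothetical.
configs = Config(
    {
        "batch_size": 2,
        "scope": "ft.onto.base_ontology.Sentence",
        "do_eval": False,
        "feature_scheme": {
            "text_tag": {
                "type": "data_input",
                "extractor": {
                    "class_name":
                        "forte.data.extractors.attribute_extractor"
                        ".AttributeExtractor",
                    "config": {
                        "entry_type": "ft.onto.base_ontology.Token",
                        "attribute": "text",
                    },
                    # Optional: "vocab_path": "path/to/vocab.pkl"
                },
            },
        },
    },
    default_hparams=None,
)
parsed = preprocessor._parse_configs(configs)  # hypothetical instance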
def test_convert_no_pad(self):
    features1: List[Feature] = self.create_features1(need_pad=False)

    converter: Converter = Converter({"to_numpy": False,
                                      "to_torch": False})
    data, _ = converter.convert(features1)
    # With both conversions disabled the output stays a ragged Python
    # list, so compare lists directly (np.array_equal cannot build an
    # array from ragged input on recent NumPy).
    self.assertEqual(data, [[7, 8, 9], [1, 2, 5, 6], [4]])
def test_state(self): converter_states = {"to_numpy": True, "to_torch": False} converter: Converter = Converter(converter_states) # Test state. self.assertEqual(converter.state, converter_states) # Test save & load state. tmp_state_file = ".tmp_converter_state" torch.save(converter.state, tmp_state_file) self.assertTrue(os.path.exists(tmp_state_file)) recover_converter: Converter = Converter({}) recover_converter.load_state(torch.load(tmp_state_file)) self.assertEqual(recover_converter.state, converter_states) os.remove(tmp_state_file) self.assertFalse(os.path.exists(tmp_state_file))
def test_convert_no_to_torch(self):
    features1: List[Feature] = self.create_features1()

    converter: Converter = Converter({"to_torch": False})
    data, _ = converter.convert(features1)

    self.assertFalse(isinstance(data, torch.Tensor))

    self.assertTrue(
        np.array_equal(data,
                       np.array([[7, 8, 9, 0],
                                 [1, 2, 5, 6],
                                 [4, 0, 0, 0]], dtype=np.int64)))
def test_convert_no_pad_but_to_torch(self):
    # Padding is disabled, but the features are already equal-length,
    # so conversion to a torch tensor still succeeds.
    features1: List[Feature] = self.create_features1(data_list=[[7], [1], [4]],
                                                     need_pad=False)

    converter: Converter = Converter()
    data, _ = converter.convert(features1)
    self.assertTrue(
        torch.allclose(data,
                       torch.tensor([[7], [1], [4]], dtype=torch.long)))
def parse_feature_extractors(scheme_configs: Config) -> Dict[str, Any]: feature_requests: Dict[str, Any] = {} for tag, scheme_config in scheme_configs.items(): assert ( "extractor" in scheme_config ), "Field not found for data request scheme: `extractor`" assert ( "type" in scheme_config ), "Field not found for data request scheme: `type`" assert scheme_config["type"] in [ "data_input", "data_output", ], "Type field must be either data_input or data_output." feature_requests[tag] = {} if scheme_config["type"] == "data_input": feature_requests[tag]["type"] = DATA_INPUT elif scheme_config["type"] == "data_output": feature_requests[tag]["type"] = DATA_OUTPUT extractor_class = get_class(scheme_config["extractor"]["class_name"]) extractor: BaseExtractor = extractor_class() if not isinstance(extractor, BaseExtractor): raise RuntimeError( "Invalid extractor: ", scheme_config["extractor"] ) extractor.initialize(config=scheme_config["extractor"]["config"]) # Load vocab from disk if provided. if "vocab_path" in scheme_config["extractor"]: with open( scheme_config["extractor"]["vocab_path"], "rb" ) as vocab_file: extractor.vocab = pickle.load(vocab_file) feature_requests[tag]["extractor"] = extractor if "converter" not in scheme_config: # Create default converter if there is no given converter feature_requests[tag]["converter"] = Converter({}) else: converter_class = get_class( scheme_config["converter"]["class_name"] ) converter: Converter = converter_class() if not isinstance(converter, Converter): raise RuntimeError( "Invalid converter: ", scheme_config["converter"] ) feature_requests[tag]["converter"] = converter return feature_requests
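# A minimal sketch of calling `parse_feature_extractors`, assuming a
# dict-like `Config` for `scheme_configs`; the class path and tag are
# hypothetical.
scheme_configs = Config(
    {
        "text_tag": {
            "type": "data_input",
            "extractor": {
                "class_name":
                    "forte.data.extractors.attribute_extractor"
                    ".AttributeExtractor",
                "config": {
                    "entry_type": "ft.onto.base_ontology.Token",
                    "attribute": "text",
                },
            },
            # No "converter" given, so a default Converter({}) is built.
        },
    },
    default_hparams=None,
)
feature_requests = parse_feature_extractors(scheme_configs)
assert feature_requests["text_tag"]["type"] == DATA_INPUT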
def test_pipeline1(self, batch_size): """Tests a chain of Batch->Pack->Batch with different batch sizes.""" data_path = data_samples_root + "/random_texts/0.txt" pipeline = Pipeline[DataPack]() pipeline.set_reader(SentenceReader()) pipeline.initialize() text_extractor = AttributeExtractor({ "need_pad": True, "entry_type": Token, "attribute": "text", }) for pack in pipeline.process_dataset(data_path): for instance in pack.get(Sentence): text_extractor.update_vocab(pack, instance) model = DummyModel() predictor = DummyPredictor() predictor_config = { "scope": Sentence, "batch_size": batch_size, "feature_scheme": { "text_tag": { "extractor": text_extractor, "converter": Converter(), "type": TrainPreprocessor.DATA_INPUT }, }, } predictor.load(model) nlp = Pipeline[DataPack]() reader = SentenceReader() nlp.set_reader(reader) nlp.add(predictor, config=predictor_config) nlp.add(DummyEvaluator()) nlp.initialize() num_packs = 0 for _ in nlp.process_dataset(data_path): num_packs += 1 # check that all packs are yielded self.assertEqual(num_packs, reader.count)
def test_FixedSizeDataPackBatcherWithExtractor(self):
    r"""This function tests the correctness of cross_pack."""
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(CoNLL03Reader())
    pipeline.initialize()

    text_extractor = AttributeExtractor()
    text_extractor.initialize({
        "need_pad": True,
        "entry_type": "ft.onto.base_ontology.Token",
        "attribute": "text",
    })

    pack_num = 0
    for pack in pipeline.process_dataset(self.dataset_path):
        pack_num += 1
        for instance in pack.get(Sentence):
            text_extractor.update_vocab(pack, instance)
    self.assertEqual(pack_num, 2)

    batch_size = 2
    batcher = FixedSizeDataPackBatcherWithExtractor()
    batcher.initialize({
        "context_type": Sentence,
        "batch_size": batch_size,
        "feature_scheme": {
            "text_tag": {
                "extractor": text_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_INPUT,
            }
        },
    })

    batch_num = 0
    for pack in pipeline.process_dataset(self.dataset_path):
        for batch in batcher.get_batch(pack):
            batch_num += 1
            self.assertEqual(len(batch[0]), batch_size)
    for _ in batcher.flush():
        batch_num += 1
    self.assertEqual(batch_num, 1)
def setUp(self): self.converter: Converter = Converter({})
def test_Predictor(self): pipeline = Pipeline[DataPack]() pipeline.set_reader(CoNLL03Reader()) pipeline.initialize() text_extractor = AttributeExtractor({ "need_pad": True, "entry_type": Token, "attribute": "text", }) for pack in pipeline.process_dataset(self.dataset_path): for instance in pack.get(Sentence): text_extractor.update_vocab(pack, instance) ner_extractor = BioSeqTaggingExtractor({ "entry_type": EntityMention, "need_pad": True, "attribute": "ner_type", "tagging_unit": Token, }) for pack in pipeline.process_dataset(self.dataset_path): for instance in pack.get(Sentence): ner_extractor.update_vocab(pack, instance) FAKEOUTPUT = 2 expected_ners = [ ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30) ] class Model: def __call__(self, batch): text_feature = batch["text_tag"]["data"] return { "ner_tag": [[FAKEOUTPUT for j in range(len(text_feature[0]))] for i in range(len(text_feature))] } model = Model() class NERPredictor(Predictor): def predict(self, batch): return self.model(batch) predictor = NERPredictor() predictor_pipeline = Pipeline[DataPack]() predictor_pipeline.set_reader(CoNLL03Reader()) predictor_config = { "scope": Sentence, "batch_size": 2, "feature_scheme": { "text_tag": { "extractor": text_extractor, "converter": Converter({}), "type": TrainPreprocessor.DATA_INPUT }, "ner_tag": { "extractor": ner_extractor, "converter": Converter({}), "type": TrainPreprocessor.DATA_OUTPUT }, }, } predictor.load(model) predictor_pipeline.add(predictor, predictor_config) predictor_pipeline.initialize() for pack in predictor_pipeline.process_dataset(self.dataset_path): for instance in pack.get(Sentence): ners = [ e.ner_type for e in list(pack.get(EntityMention, instance)) ] self.assertListEqual(ners, expected_ners)
def test_Predictor(self):
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(CoNLL03Reader())
    pipeline.initialize()

    text_extractor = AttributeExtractor({
        "need_pad": True,
        "entry_type": Token,
        "attribute": "text",
    })
    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            text_extractor.update_vocab(pack, instance)

    ner_extractor = BioSeqTaggingExtractor({
        "entry_type": EntityMention,
        "need_pad": True,
        "attribute": "ner_type",
        "tagging_unit": Token,
    })
    for pack in pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            ner_extractor.update_vocab(pack, instance)

    expected_ners = [
        ner_extractor.id2element(FAKEOUTPUT)[0] for _ in range(30)]

    model = DummyModel()

    predictor_pipeline = Pipeline[DataPack]()
    predictor_pipeline.set_reader(CoNLL03Reader())

    predictor_config = {
        "scope": Sentence,
        "batch_size": 2,
        "feature_scheme": {
            "text_tag": {
                "extractor": text_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_INPUT
            },
            "ner_tag": {
                "extractor": ner_extractor,
                "converter": Converter(),
                "type": TrainPreprocessor.DATA_OUTPUT
            },
        },
    }

    predictor = NERPredictor()
    predictor.load(model)
    predictor_pipeline.add(predictor, predictor_config)
    predictor_pipeline.add(CoNLLNEREvaluator())
    predictor_pipeline.initialize()

    for pack in predictor_pipeline.process_dataset(self.dataset_path):
        for instance in pack.get(Sentence):
            ners = [e.ner_type
                    for e in list(pack.get(EntityMention, instance))]
            self.assertListEqual(ners, expected_ners)
def _parse_request(self, request: Dict):
    """
    This method has two responsibilities:
    1. parse the given data request and store it internally
    2. validate that the given data request is well-formed
    """
    parsed_request: Dict[str, Any] = {}

    assert "scope" in request, "Field not found for data request: `scope`"
    assert (
        "feature_scheme" in request
    ), "Field not found for data request: `feature_scheme`"

    parsed_request["scope"] = get_class(request["scope"])
    parsed_request["schemes"] = {}

    # Used to check dependencies between different extractors.
    scheme_group: Dict[str, Dict] = {"dependent": {}, "dependee": {}}

    for tag, scheme in request["feature_scheme"].items():
        assert (
            "extractor" in scheme
        ), "Field not found for data request scheme: `extractor`"
        parsed_request["schemes"][tag] = {}

        assert (
            "type" in scheme
        ), "Field not found for data request scheme: `type`"
        assert scheme["type"] in [
            "data_input",
            "data_output",
        ], "Type field must be either data_input or data_output."
        if scheme["type"] == "data_input":
            parsed_request["schemes"][tag][
                "type"
            ] = TrainPreprocessor.DATA_INPUT
        elif scheme["type"] == "data_output":
            parsed_request["schemes"][tag][
                "type"
            ] = TrainPreprocessor.DATA_OUTPUT

        # Instantiate the extractor once and validate its type.
        extractor_class = get_class(scheme["extractor"]["class_name"])
        extractor: BaseExtractor = extractor_class()
        if not isinstance(extractor, BaseExtractor):
            raise RuntimeError("Invalid extractor: ", scheme["extractor"])

        extractor.initialize(config=scheme["extractor"]["config"])
        parsed_request["schemes"][tag]["extractor"] = extractor

        # Track dependency: an extractor exposing `based_on` depends on
        # the entries produced by some other extractor.
        if hasattr(extractor, "based_on"):
            if extractor.entry_type not in scheme_group["dependent"]:
                scheme_group["dependent"][extractor.entry_type] = set()
            scheme_group["dependent"][extractor.entry_type].add(extractor)
        else:
            if extractor.entry_type not in scheme_group["dependee"]:
                scheme_group["dependee"][extractor.entry_type] = set()
            scheme_group["dependee"][extractor.entry_type].add(extractor)

        # Create a default converter if none is given.
        if "converter" not in scheme:
            parsed_request["schemes"][tag]["converter"] = Converter({})

    # Check dependency: every `based_on` entry must be produced by some
    # other extractor in the request.
    for _, dependent_extractors in scheme_group["dependent"].items():
        for dependent_extractor in dependent_extractors:
            based_on: Entry = dependent_extractor.based_on
            if based_on not in scheme_group["dependee"]:
                raise ValueError(
                    "Extractor {} needs the entry {} to do extraction "
                    "processing but it is not extracted by any other "
                    "extractors given in request".format(
                        dependent_extractor.tag, based_on
                    )
                )

    self._request = parsed_request
    self._request_ready = True
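# A sketch of the request this refactored `_parse_request` consumes:
# unlike the earlier variant, extractors are specified by class name
# plus config and are instantiated during parsing. The class path and
# tag are hypothetical.
request = {
    "scope": "ft.onto.base_ontology.Sentence",
    "feature_scheme": {
        "text_tag": {
            "type": "data_input",
            "extractor": {
                "class_name":
                    "forte.data.extractors.attribute_extractor"
                    ".AttributeExtractor",
                "config": {
                    "entry_type": "ft.onto.base_ontology.Token",
                    "attribute": "text",
                },
            },
        },
    },
}
train_preprocessor._parse_request(request)  # hypothetical instance
# After parsing, the scheme holds a live extractor plus a default
# Converter, and `scope` is the resolved Sentence class.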