def setUp(self) -> None:
    self.reader = OntonotesReader()
    data_path = os.path.join(
        os.path.dirname(os.path.dirname(forte.__file__)),
        "examples/data_samples/ontonotes/00/abc_0059.gold_conll")
    self.data_pack = list(self.reader.parse_pack(data_path))[0]
def pack_example(input_path, output_path):
    """
    This example reads data from the input path and serializes the results
    to the output path.

    Args:
        input_path: the directory containing the input dataset.
        output_path: the directory to write the serialized data packs to.
    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()
    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the results to the output
    # directory and uses the DocID field in the data pack as the file name.
    nlp.add(PackNameJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
        'overwrite': True,
    })

    nlp.run(input_path)
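# A minimal usage sketch for pack_example above. The concrete paths are
# assumptions for illustration only; any OntoNotes-formatted input directory
# and writable output directory would do.
if __name__ == "__main__":
    pack_example("data_samples/ontonotes/00", "output_packs")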
def test_process_multi_next(self):
    from forte.data.readers import OntonotesReader

    # Define and configure the Pipeline.
    nlp = Pipeline[DataPack]()
    nlp.set_reader(OntonotesReader())

    pack_name = 'test_pack'
    nlp.add(MultiPackBoxer(), {'pack_name': pack_name})
    nlp.add(
        DummyRelationExtractor(),
        config={"batcher": {"batch_size": 5}},
        selector=NameMatchSelector(select_name=pack_name))
    nlp.initialize()

    dataset_path = data_samples_root + "/ontonotes/00"

    # Get processed packs from the dataset.
    m_pack: MultiPack
    for m_pack in nlp.process_dataset(dataset_path):
        pack = m_pack.get_pack(pack_name)
        # Get each sentence from the pack.
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            # A second method to get entries within a sentence.
            tokens = [token.text for token in pack.get(Token, sentence)]
            self.assertEqual(sent_text, " ".join(tokens))
def create_pack_iterator(self) -> Iterator[DataPack]:
    srl_train_reader = OntonotesReader(cache_in_memory=True)
    train_pl: Pipeline = Pipeline()
    train_pl.set_reader(srl_train_reader)
    train_pl.initialize()
    pack_iterator = train_pl.process_dataset(self.train_path)

    return pack_iterator
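# A minimal sketch (not part of the original code) of consuming the iterator
# returned by create_pack_iterator above. It relies only on the
# process_dataset / DataPack.get(Sentence) usage shown in the other snippets;
# `trainer` is a hypothetical instance of the class defining the method.
for pack in trainer.create_pack_iterator():
    for sentence in pack.get(Sentence):
        print(sentence.text)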
def setUp(self) -> None:
    file_dir_path = os.path.dirname(__file__)
    data_path = os.path.join(file_dir_path, os.pardir, os.pardir,
                             'test_data', 'ontonotes')

    pipeline: Pipeline = Pipeline()
    pipeline.set_reader(OntonotesReader())
    pipeline.initialize()
    self.data_pack: DataPack = pipeline.process_one(data_path)
def setUp(self) -> None:
    self.nlp = Pipeline()
    self.nlp.set_reader(OntonotesReader())
    dummy = DummyRelationExtractor()
    config = {"batcher": {"batch_size": 5}}
    self.nlp.add_processor(dummy, config=config)
    self.nlp.initialize()

    self.data_path = "data_samples/ontonotes/00/"
def setUp(self):
    # Define and configure the Pipeline.
    self.dataset_path = "examples/data_samples/ontonotes/00"

    self.nlp = Pipeline()
    self.nlp.set_reader(OntonotesReader())
    self.nlp.add_processor(DummyPackProcessor())
    self.nlp.initialize()
def setUp(self) -> None:
    file_dir_path = os.path.dirname(__file__)
    data_path = os.path.abspath(
        os.path.join(file_dir_path, '../../../', 'data_samples',
                     'ontonotes/one_file'))

    pipeline: Pipeline = Pipeline()
    pipeline.set_reader(OntonotesReader())
    pipeline.initialize()
    self.data_pack: DataPack = pipeline.process_one(data_path)
def setUp(self) -> None:
    # Define and configure the Pipeline.
    self.dataset_path = "examples/ontonotes_sample_dataset/00"

    self.nlp = Pipeline()
    self.nlp.set_reader(OntonotesReader())
    self.processor = DummyRelationExtractor()
    self.nlp.add_processor(self.processor)
    self.nlp.initialize()
class DataPackTest(unittest.TestCase):

    def setUp(self) -> None:
        self.reader = OntonotesReader()
        data_path = os.path.join(
            os.path.dirname(os.path.dirname(forte.__file__)),
            "examples/data_samples/ontonotes/00/abc_0059.gold_conll")
        self.data_pack = list(self.reader.parse_pack(data_path))[0]

    def test_get_data(self):
        requests = {
            Sentence: ["speaker"],
            Token: ["pos", "sense"],
            EntityMention: [],
            PredicateMention: [],
            PredicateArgument: {
                "fields": [],
                "unit": "Token"
            },
            PredicateLink: {
                "component": self.reader.component_name,
                "fields": ["parent", "child", "arg_type"]
            }
        }

        # case 1: get sentence context from the beginning
        instances = list(self.data_pack.get_data(Sentence))
        self.assertEqual(len(instances), 2)
        self.assertEqual(instances[1]["offset"],
                         len(instances[0]["context"]) + 1)

        # case 2: get sentence context from the second instance
        instances = list(self.data_pack.get_data(Sentence, skip_k=1))
        self.assertEqual(len(instances), 1)
        self.assertEqual(instances[0]["offset"], 165)

        # case 3: get document context
        instances = list(self.data_pack.get_data(Document, skip_k=0))
        self.assertEqual(len(instances), 1)
        self.assertEqual(instances[0]["offset"], 0)

        # case 4: test offset out of index
        instances = list(self.data_pack.get_data(Sentence, skip_k=10))
        self.assertEqual(len(instances), 0)

        # case 5: get entries
        instances = list(self.data_pack.get_data(Sentence,
                                                 request=requests,
                                                 skip_k=1))
        self.assertEqual(len(instances[0].keys()), 9)
        self.assertEqual(len(instances[0]["PredicateLink"]), 4)
        self.assertEqual(len(instances[0]["Token"]), 5)
        self.assertEqual(len(instances[0]["EntityMention"]), 3)
def setUp(self) -> None:
    self.nlp = Pipeline()
    self.nlp.set_reader(OntonotesReader())
    dummy = DummyRelationExtractor()
    config = HParams({"batcher": {"batch_size": 5}},
                     dummy.default_hparams())
    self.nlp.add_processor(dummy, config=config)
    self.nlp.initialize()

    self.data_path = \
        "forte/processors/base/tests/data_samples/ontonotes/00/"
def test_serialize_deserialize_processor(self):
    pipe_serialize = Pipeline[DataPack]()
    pipe_serialize.set_reader(OntonotesReader())
    pipe_serialize.add(
        AnnotationRemover(),
        # Remove the tokens and sentences produced by the OntonotesReader.
        {
            "removal_types": [
                "ft.onto.base_ontology.Token",
                "ft.onto.base_ontology.Sentence",
            ]
        },
    )
    pipe_serialize.add(PeriodSentenceSplitter())
    pipe_serialize.add(WhiteSpaceTokenizer())

    with tempfile.TemporaryDirectory() as output_dir:
        pipe_serialize.add(
            PackNameJsonPackWriter(),
            {
                "output_dir": output_dir,
                "indent": 2,
            },
        )

        pipe_serialize.run(self.data_path)

        pipe_deserialize = Pipeline[DataPack]()
        pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
        pipe_deserialize.initialize()

        token_counts: Dict[str, int] = {}

        # This basically tests whether the deserialized data is
        # still the same as expected.
        pack: DataPack
        for pack in pipe_deserialize.process_dataset(output_dir):
            tokens: List[Token] = list(pack.get(Token))
            token_counts[pack.pack_name] = len(tokens)

        expected_count = {
            "bn/abc/00/abc_0039": 72,
            "bn/abc/00/abc_0019": 370,
            "bn/abc/00/abc_0059": 39,
            "bn/abc/00/abc_0009": 424,
            "bn/abc/00/abc_0029": 487,
            "bn/abc/00/abc_0069": 428,
            "bn/abc/00/abc_0049": 73,
        }

        assert token_counts == expected_count
def test_serialize_deserialize_processor(self):
    pipe_serialize = Pipeline[DataPack]()
    pipe_serialize.set_reader(OntonotesReader())
    pipe_serialize.add(
        AnnotationRemover(),
        # Remove the tokens and sentences produced by the OntonotesReader.
        {
            'removal_types': [
                'ft.onto.base_ontology.Token',
                'ft.onto.base_ontology.Sentence',
            ]
        })
    pipe_serialize.add(NLTKSentenceSegmenter())
    pipe_serialize.add(NLTKWordTokenizer())
    pipe_serialize.add(NLTKPOSTagger())

    output_path = tempfile.mkdtemp()
    pipe_serialize.add(DocIdJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
    })

    dataset_path = "data_samples/ontonotes/00"
    pipe_serialize.run(dataset_path)

    pipe_deserialize = Pipeline[DataPack]()
    pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
    pipe_deserialize.initialize()

    token_counts: Dict[str, int] = {}

    # This basically tests whether the deserialized data is still the same
    # as expected.
    pack: DataPack
    for pack in pipe_deserialize.process_dataset(output_path):
        tokens: List[Token] = list(pack.get(Token))
        token_counts[pack.pack_name] = len(tokens)

    expected_count = {
        'bn/abc/00/abc_0039': 72,
        'bn/abc/00/abc_0019': 370,
        'bn/abc/00/abc_0059': 39,
        'bn/abc/00/abc_0009': 424,
        'bn/abc/00/abc_0029': 487,
        'bn/abc/00/abc_0069': 428,
        'bn/abc/00/abc_0049': 73,
    }

    assert token_counts == expected_count

    shutil.rmtree(output_path)
def setUp(self):
    self._port: int = 8880
    _file_dir_path: str = os.path.dirname(__file__)
    self._project_name: str = "serialization_pipeline_test"
    self._dataset_dir: str = os.path.abspath(
        os.path.join(_file_dir_path, "../../../",
                     "data_samples/ontonotes/00/"))
    self._test_specs_dir: str = os.path.abspath(
        os.path.join(_file_dir_path, "../data/ontology/test_specs/"))
    self._stave_processor = StaveProcessor()

    self.pl = Pipeline[DataPack](
        ontology_file=os.path.join(
            self._test_specs_dir, "test_stave_ontology.json"))
    self.pl.set_reader(OntonotesReader())
def setUp(self):
    file_dir_path = os.path.dirname(__file__)
    data_path = os.path.join(file_dir_path, "../../../../",
                             'data_samples', 'ontonotes', '00')

    self.main_output = tempfile.TemporaryDirectory()

    nlp = Pipeline[DataPack]()
    nlp.set_reader(OntonotesReader())
    nlp.add(
        PackIdJsonPackWriter(), {
            'output_dir': os.path.join(self.main_output.name, 'packs'),
            'indent': 2,
            'overwrite': True,
        })
    nlp.run(data_path)
def setUp(self):
    file_dir_path = os.path.dirname(__file__)
    data_path = os.path.join(file_dir_path, "../../../../",
                             "data_samples", "ontonotes", "00")

    self.main_output = tempfile.TemporaryDirectory()

    nlp = Pipeline[DataPack]()
    nlp.set_reader(OntonotesReader())
    nlp.add(
        PackIdJsonPackWriter(),
        {
            "output_dir": os.path.join(self.main_output.name, "packs"),
            "indent": 2,
            "overwrite": True,
        },
    )
    nlp.run(data_path)
def testPackWriting(self, config_data):
    zip_pack, method = config_data

    with tempfile.TemporaryDirectory() as main_output:
        write_pipeline = Pipeline[DataPack]()
        write_pipeline.set_reader(OntonotesReader())
        write_pipeline.add(
            AutoNamePackWriter(),
            {
                "output_dir": os.path.join(main_output, "packs"),
                "overwrite": True,
                "zip_pack": zip_pack,
                "serialize_method": method,
            },
        )
        write_pipeline.run(self.data_path)

        read_pipeline = Pipeline[DataPack]()
        read_pipeline.set_reader(DirPackReader())
def test_process_next(self):
    # Define and configure the Pipeline.
    nlp = Pipeline[DataPack]()
    nlp.set_reader(OntonotesReader())
    dummy = DummyRelationExtractor()
    config = {"batcher": {"batch_size": 5}}
    nlp.add(dummy, config=config)
    nlp.initialize()

    dataset_path = os.path.join(data_samples_root, "ontonotes/00")

    # Get processed packs from the dataset.
    for pack in nlp.process_dataset(dataset_path):
        # Get each sentence from the pack.
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            # A second method to get entries within a sentence.
            tokens = [token.text for token in pack.get(Token, sentence)]
            self.assertEqual(sent_text, " ".join(tokens))
def train(self):
    schemes: Dict = self.train_preprocessor.request["schemes"]
    text_extractor: BaseExtractor = schemes["text_tag"]["extractor"]
    char_extractor: BaseExtractor = schemes["char_tag"]["extractor"]
    link_extractor: BaseExtractor = schemes["pred_link_tag"]["extractor"]

    self.model: LabeledSpanGraphNetwork = LabeledSpanGraphNetwork(
        word_vocab=text_extractor.vocab.to_dict(),
        char_vocab_size=len(char_extractor.vocab),
        label_vocab=link_extractor.vocab.to_dict(),
    )

    optim: Optimizer = SGD(
        self.model.parameters(),
        lr=self.lr,
        momentum=self.momentum,
        nesterov=self.nesterov,
    )

    srl_val_reader = OntonotesReader()
    predictor = RelationPredictor()
    val_pl: Pipeline = Pipeline()
    val_pl.set_reader(srl_val_reader)
    val_pl.add(predictor, {})
    # TODO: We need an evaluator here for the SRL task.

    logger.info("Start training.")
    epoch = 0
    train_loss: float = 0.0
    train_total: int = 0

    while epoch < self.num_epochs:
        epoch += 1

        # Get an iterator over preprocessed batches of training data.
        train_batch_iter: Iterator[Batch] = \
            self.train_preprocessor.get_train_batch_iterator()

        for batch in train_batch_iter:
            char_tensor: Tensor = batch["char_tag"]["data"]
            char_masks: List[Tensor] = batch["char_tag"]["masks"]
            text_tensor: Tensor = batch["text_tag"]["data"]
            text_mask: Tensor = batch["text_tag"]["masks"][0]
            text: List[List[str]] = batch["raw_text_tag"]["data"]
            pred_link_features: List[Feature] = \
                batch["pred_link_tag"]["features"]

            optim.zero_grad()

            output: LabeledSpanGraphNetwork.ReturnType = self.model(
                text=text,
                char_batch=char_tensor,
                char_masks=char_masks,
                text_batch=text_tensor,
                text_mask=text_mask,
                srl_features=pred_link_features,
            )

            output["loss"].backward()
            optim.step()

            train_loss += output["loss"].item()
            train_total += 1

        logger.info(
            "%dth Epoch training, loss: %f",
            epoch,
            train_loss / train_total,
        )

        train_loss = 0.0
        train_total = 0

        val_pl.run(self.val_path)
        logger.info("%dth Epoch evaluating", epoch)
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the result to the current directory
# and uses the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()
nlp.run(data_path)
def setUp(self) -> None:
    self.reader = OntonotesReader()
    data_path = "data_samples/ontonotes/00/abc_0059.gold_conll"
    self.data_pack = list(self.reader.parse_pack(data_path))[0]
def testMultiPackWriting(self, config_data):
    zip_pack, method = config_data

    # Use a different sub-directory per configuration to avoid conflicts.
    subdir = f"{zip_pack}_{method}"

    with tempfile.TemporaryDirectory() as main_output:
        # Prepare input data.
        prepared_input: str = os.path.join(
            main_output, subdir, "input_packs")
        data_output: str = os.path.join(main_output, subdir, "output")
        suffix = ".pickle" if method == "pickle" else ".json"
        if zip_pack:
            suffix = suffix + ".gz"

        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())
        nlp.add(
            PackIdJsonPackWriter(),
            {
                "output_dir": prepared_input,
                "overwrite": True,
                "serialize_method": method,
                "zip_pack": zip_pack,
            },
        )
        nlp.run(self.data_path)

        # Convert to multi pack.
        coref_pl = Pipeline()

        coref_pl.set_reader(
            DirPackReader(),
            {
                "serialize_method": method,
                "zip_pack": zip_pack,
                "suffix": suffix,
            },
        )
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())
        coref_pl.add(NaiveCoref())

        coref_pl.add(
            PackIdMultiPackWriter(),
            config={
                "output_dir": data_output,
                "overwrite": True,
                "serialize_method": method,
                "zip_pack": zip_pack,
            },
        )
        coref_pl.run(prepared_input)

        self.assertTrue(
            os.path.exists(os.path.join(data_output, "multi.idx")))
        self.assertTrue(
            os.path.exists(os.path.join(data_output, "pack.idx")))
        self.assertTrue(os.path.exists(os.path.join(data_output, "packs")))
        self.assertTrue(os.path.exists(os.path.join(data_output, "multi")))

        # Read the multi pack again.
        mp_pipeline = Pipeline()

        mp_pipeline.set_reader(
            MultiPackDirectoryReader(),
            config={
                "suffix": suffix,
                "zip_pack": zip_pack,
                "serialize_method": method,
                "data_pack_dir": os.path.join(data_output, "packs"),
                "multi_pack_dir": os.path.join(data_output, "multi"),
            },
        ).initialize()

        re: CrossDocEntityRelation
        for mp in mp_pipeline.process_dataset():
            for re in mp.get(CrossDocEntityRelation):
                self.assertEqual(re.get_parent().text, re.get_child().text)