def write_results(pl: Pipeline, output_path: str, input_data: str):
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    pl.run(input_data)
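# A minimal usage sketch for write_results above, added for illustration and
# not part of the original source. It assumes the caller builds the Pipeline
# and attaches a reader first; the reader choice (DirPackReader), the import
# paths, and the directory names below are assumptions for this sketch.
from forte.data.readers import DirPackReader
from forte.pipeline import Pipeline

pl = Pipeline()
pl.set_reader(DirPackReader())
# write_results attaches the WikiArticleWriter and runs the pipeline over the
# packs found under input_data, writing zipped packs to output_path.
write_results(pl, output_path="output/wiki_articles", input_data="input/packs")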
def prepare(self, *args, **kwargs):  # pylint: disable=unused-argument
    prepare_pl = Pipeline()
    prepare_pl.set_reader(self.train_reader)
    for p in self.preprocessors:
        prepare_pl.add_processor(p)
    prepare_pl.run(self.configs.config_data.train_path)

    for p in self.preprocessors:
        p.finish(resource=self.resource)
def multi_example(input_path, output_path):
    """
    This example reads data from the input path, adds multi packs to the
    data, and writes the multi pack output to the output path.

    Args:
        input_path: The directory containing the serialized data packs to read.
        output_path: The directory to write the multi pack output to.

    Returns:
        None
    """
    print("Multi Pack serialization example.")
    print(
        "We first read the data, add multi-packs to them, and then "
        "save the results."
    )

    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())
    coref_pl.add(
        MultiPackWriter(),
        config={
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
        },
    )
    coref_pl.run(input_path)

    print(
        "We can then load the saved results and see if everything is OK. "
        "We should see the same number of multi packs there."
    )

    reading_pl = Pipeline()
    reading_pl.set_reader(
        MultiPackDirectoryReader(),
        config={
            "multi_pack_dir": os.path.join(output_path, "multi"),
            "data_pack_dir": os.path.join(output_path, "packs"),
        },
    )
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
def testMultiPackWriting(self):
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(CopySentence())
    coref_pl.add(
        PackIdMultiPackWriter(),
        config={
            "output_dir": os.path.join(self.main_output.name, "multi"),
            "indent": 2,
            "overwrite": True,
        },
    )
    coref_pl.run(os.path.join(self.main_output.name, "packs"))

    # Check the index files and pack directories in the writer's output dir.
    multi_out = os.path.join(self.main_output.name, "multi")
    self.assertTrue(os.path.exists(os.path.join(multi_out, "multi.idx")))
    self.assertTrue(os.path.exists(os.path.join(multi_out, "pack.idx")))
    self.assertTrue(os.path.exists(os.path.join(multi_out, "packs")))
    self.assertTrue(os.path.exists(os.path.join(multi_out, "multi")))
def multi_example(input_path, output_path):
    """
    This example reads data from the input path, adds multi packs to the
    data, and writes the multi pack output to the output path.

    Args:
        input_path: The directory containing the serialized data packs to read.
        output_path: The directory to write the multi pack output to.

    Returns:
        None
    """
    print("Multi Pack serialization example.")
    print(
        "We first read the data, add multi-packs to them, and then "
        "save the results."
    )

    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())
    coref_pl.add(
        MultiPackWriter(),
        {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
        },
    )
    coref_pl.run(input_path)

    print(
        "We can then load the saved results and see if everything is OK. "
        "We should see the same number of multi packs there."
    )

    reading_pl = Pipeline()
    reading_pl.set_reader(MultiPackDiskReader(), {'data_path': output_path})
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
def prepare(self):
    prepare_pl = Pipeline()
    prepare_pl.set_reader(self.train_reader)
    for p in self.preprocessors:
        prepare_pl.add(p)
    prepare_pl.run(self.configs.config_data.train_path)
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs: str, output_path: str):
    # Load redirects.
    logging.info("Loading redirects")
    redirect_pickle = os.path.join(output_path, 'redirects.pickle')

    if os.path.exists(redirect_pickle):
        redirect_map: Dict[str, str] = pickle.load(
            open(redirect_pickle, 'rb'))
    else:
        redirect_map: Dict[str, str] = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)
    logging.info("Done loading.")

    # The datasets are read in two steps.
    raw_pack_dir = os.path.join(output_path, 'nif_raw')

    # First, we create the NIF reader that reads the NIF data in order.
    nif_pl = Pipeline()
    nif_pl.resource.update(redirects=redirect_map)
    nif_pl.set_reader(
        DBpediaWikiReader(),
        config=HParams(
            {
                'redirect_path': redirects,
                'nif_page_structure': nif_page_structure,
                'nif_text_links': nif_text_links,
            },
            DBpediaWikiReader.default_configs()))

    nif_pl.add_processor(
        WikiArticleWriter(),
        config=HParams(
            {
                'output_dir': raw_pack_dir,
                'zip_pack': True,
            },
            WikiArticleWriter.default_configs()))

    nif_pl.initialize()
    logging.info('Start running the DBpedia text pipeline.')
    nif_pl.run(nif_context)

    # Second, we add info boxes to the packs with NIF.
    ib_pl = Pipeline()
    ib_pl.resource.update(redirects=redirect_map)
    ib_pl.set_reader(
        DBpediaInfoBoxReader(),
        config=HParams(
            {
                'pack_index': os.path.join(raw_pack_dir, 'article.idx'),
                'pack_dir': raw_pack_dir,
                'mapping_literals': mapping_literals,
                'mapping_objects': mapping_objects,
                'reading_log': os.path.join(output_path, 'infobox.log'),
            },
            DBpediaInfoBoxReader.default_configs()))

    ib_pl.add_processor(
        WikiArticleWriter(),
        config=HParams(
            {
                'output_dir': os.path.join(output_path, 'nif_info_box'),
                'zip_pack': True,
            },
            WikiArticleWriter.default_configs()))

    # Now we run the info box pipeline.
    ib_pl.initialize()
    ib_pl.run(info_boxs)
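# A hypothetical invocation of main above, added for illustration only. The
# dump file names are placeholder names for the DBpedia NIF and mapping-based
# dataset dumps and are not taken from the original script; substitute the
# actual paths on your machine.
if __name__ == "__main__":
    base = "data/dbpedia"
    main(
        nif_context=os.path.join(base, "nif_context_en.tql.bz2"),
        nif_page_structure=os.path.join(base, "nif_page_structure_en.tql.bz2"),
        mapping_literals=os.path.join(base, "mappingbased_literals_en.tql.bz2"),
        mapping_objects=os.path.join(base, "mappingbased_objects_en.tql.bz2"),
        nif_text_links=os.path.join(base, "nif_text_links_en.tql.bz2"),
        redirects=os.path.join(base, "redirects_en.tql.bz2"),
        info_boxs=os.path.join(base, "infobox_properties_mapped_en.tql.bz2"),
        output_path="output/dbpedia_packs",
    )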
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import (
    NLTKWordTokenizer, NLTKPOSTagger, NLTKSentenceSegmenter)
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(OntonotesReader())
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the results to the current
# directory and uses the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()
nlp.run(data_path)
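# An optional follow-up sketch, not in the original example: the same pipeline
# can yield the processed DataPacks directly via process_dataset, so the
# annotations can be inspected in memory instead of (or in addition to) being
# written to disk. The ontology import below assumes Forte's built-in base
# ontology; attribute and import names may differ across Forte versions.
from ft.onto.base_ontology import Sentence, Token

for pack in nlp.process_dataset(data_path):
    for sentence in pack.get(Sentence):
        # Collect the tokens covered by this sentence span.
        tokens = [token.text for token in pack.get(Token, sentence)]
        print(sentence.text)
        print(tokens)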
def testMultiPackWriting(self, config_data):
    zip_pack, method = config_data

    # Use a different sub-directory for each configuration to avoid conflicts.
    subdir = f"{zip_pack}_{method}"

    with tempfile.TemporaryDirectory() as main_output:
        # Prepare input data.
        prepared_input: str = os.path.join(main_output, subdir, "input_packs")
        data_output: str = os.path.join(main_output, subdir, "output")
        suffix = ".pickle" if method == "pickle" else ".json"
        if zip_pack:
            suffix = suffix + ".gz"

        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())
        nlp.add(
            PackIdJsonPackWriter(),
            {
                "output_dir": prepared_input,
                "overwrite": True,
                "serialize_method": method,
                "zip_pack": zip_pack,
            },
        )
        nlp.run(self.data_path)

        # Convert to multi pack.
        coref_pl = Pipeline()
        coref_pl.set_reader(
            DirPackReader(),
            {
                "serialize_method": method,
                "zip_pack": zip_pack,
                "suffix": suffix,
            },
        )
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())
        coref_pl.add(NaiveCoref())
        coref_pl.add(
            PackIdMultiPackWriter(),
            config={
                "output_dir": data_output,
                "overwrite": True,
                "serialize_method": method,
                "zip_pack": zip_pack,
            },
        )
        coref_pl.run(prepared_input)

        self.assertTrue(
            os.path.exists(os.path.join(data_output, "multi.idx")))
        self.assertTrue(
            os.path.exists(os.path.join(data_output, "pack.idx")))
        self.assertTrue(os.path.exists(os.path.join(data_output, "packs")))
        self.assertTrue(os.path.exists(os.path.join(data_output, "multi")))

        # Read the multi pack again.
        mp_pipeline = Pipeline()
        mp_pipeline.set_reader(
            MultiPackDirectoryReader(),
            config={
                "suffix": suffix,
                "zip_pack": zip_pack,
                "serialize_method": method,
                "data_pack_dir": os.path.join(data_output, "packs"),
                "multi_pack_dir": os.path.join(data_output, "multi"),
            },
        ).initialize()

        re: CrossDocEntityRelation
        for mp in mp_pipeline.process_dataset():
            for re in mp.get(CrossDocEntityRelation):
                self.assertEqual(re.get_parent().text, re.get_child().text)