def pack_example(input_path, output_path):
    """
    This example reads data from `input_path`, runs a simple NLTK-based
    pipeline over it, and serializes the resulting data packs to
    `output_path`.

    Args:
        input_path: The directory containing the input data to read.
        output_path: The directory to which the serialized data packs are
            written.
    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()

    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the results to the given output
    # directory and uses the DocID field in the data pack as the file name.
    nlp.add(
        DocIdJsonPackWriter(),
        {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
        }
    )

    nlp.run(input_path)
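# A minimal usage sketch for the function above, assuming the same Forte
# imports used elsewhere in these examples. The paths in the commented calls
# are hypothetical placeholders; RecursiveDirectoryDeserializeReader (used in
# the test below as well) is shown here only to illustrate reading the
# serialized packs back in.
def read_back_example(output_path):
    reload_pipeline = Pipeline[DataPack]()
    reload_pipeline.set_reader(RecursiveDirectoryDeserializeReader())
    reload_pipeline.initialize()
    for pack in reload_pipeline.process_dataset(output_path):
        # Each serialized JSON file becomes a DataPack again; print its name
        # as a quick sanity check.
        print(pack.pack_name)


# pack_example("data_samples/ontonotes/00", "serialized_packs")  # hypothetical paths
# read_back_example("serialized_packs")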
def test_serialize_deserialize_processor(self):
    pipe_serialize = Pipeline[DataPack]()
    pipe_serialize.set_reader(OntonotesReader())
    pipe_serialize.add(
        AnnotationRemover(),
        # Remove the tokens and sentences produced by OntonotesReader so the
        # NLTK processors below can recreate them.
        {
            'removal_types': [
                'ft.onto.base_ontology.Token',
                'ft.onto.base_ontology.Sentence',
            ]
        })
    pipe_serialize.add(NLTKSentenceSegmenter())
    pipe_serialize.add(NLTKWordTokenizer())
    pipe_serialize.add(NLTKPOSTagger())

    output_path = tempfile.mkdtemp()
    pipe_serialize.add(DocIdJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
    })

    dataset_path = "data_samples/ontonotes/00"
    pipe_serialize.run(dataset_path)

    pipe_deserialize = Pipeline[DataPack]()
    pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
    pipe_deserialize.initialize()

    token_counts: Dict[str, int] = {}

    # Check that the deserialized data packs still contain the annotations
    # that were serialized.
    pack: DataPack
    for pack in pipe_deserialize.process_dataset(output_path):
        tokens: List[Token] = list(pack.get(Token))
        token_counts[pack.pack_name] = len(tokens)

    expected_count = {
        'bn/abc/00/abc_0039': 72,
        'bn/abc/00/abc_0019': 370,
        'bn/abc/00/abc_0059': 39,
        'bn/abc/00/abc_0009': 424,
        'bn/abc/00/abc_0029': 487,
        'bn/abc/00/abc_0069': 428,
        'bn/abc/00/abc_0049': 73,
    }

    assert token_counts == expected_count
    shutil.rmtree(output_path)
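# A sketch of an alternative cleanup strategy for the test above: using the
# standard-library tempfile.TemporaryDirectory instead of mkdtemp() plus an
# explicit shutil.rmtree(). Anything that reads the serialized files back
# (such as the deserialization check above) has to run inside the with block,
# because the directory is deleted when the block exits. This is only an
# illustrative variation, not part of the original test.
def serialize_with_auto_cleanup(pipe_serialize, dataset_path):
    with tempfile.TemporaryDirectory() as output_path:
        pipe_serialize.add(DocIdJsonPackWriter(), {
            'output_dir': output_path,
            'indent': 2,
        })
        pipe_serialize.run(dataset_path)
        # ... deserialize and check the packs here, before the directory is
        # removed.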
def main():
    import sys
    ner_dir, srl_dir = sys.argv[1:3]  # pylint: disable=unbalanced-tuple-unpacking

    output_config = HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    )

    eng_text = "The plain green Norway spruce is displayed in the gallery's " \
               "foyer. Wentworth worked as an assistant to sculptor Henry " \
               "Moore in the late 1960s. His reputation as a sculptor grew " \
               "in the 1980s."

    fr_text = "Van Gogh grandit au sein d'une famille de " \
              "l'ancienne bourgeoisie."

    stanford_nlp_example1('en', eng_text, output_config)
    stanford_nlp_example1('fr', fr_text, output_config)

    string_processor_example(ner_dir, srl_dir)
def stanford_nlp_example1(lang: str, text: str, output_config: HParams):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,  # Language code for the language to build the pipeline.
            'use_gpu': False,
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)

    pl.add_processor(processor=DocIdJsonPackWriter(), config=output_config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'),
                  "has relation",
                  colored(link.rel_type, 'green'),
                  "of parent",
                  colored(parent.text, 'cyan'))

    print("\n----------------------\n")
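# A hedged configuration sketch based on the function above: the same
# processor can be given a shorter processor chain when dependency parsing is
# not needed. Whether the underlying StanfordNLP models accept exactly this
# subset ('tokenize,pos,lemma') is an assumption here, not something the
# original example verifies.
def stanford_nlp_tokenize_only(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma',  # assumed subset of the chain above
            'lang': lang,
            'use_gpu': False,
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(os.getcwd()),
                     config=config)
    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(tokens)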
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the results to the current directory
# and uses the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()
nlp.run(data_path)
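# As an alternative to nlp.run(data_path) above, the same pipeline can hand
# each processed DataPack back to the caller via process_dataset, so the
# annotations can be inspected in memory in addition to being written out by
# the writer. The Token import path mirrors the type names used in the other
# examples, and pack.pack_name is used as in the deserialization test above;
# treat both as assumptions about this Forte version.
from ft.onto.base_ontology import Token

for pack in nlp.process_dataset(data_path):
    print(pack.pack_name, "tokens:", len(list(pack.get(Token))))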