def complete_and_tokens():
    """Enrich serialized wiki data packs with token-level annotations.

    Reads packs from ``<pack_dir>/nif_raw_struct_links``, adds titles,
    sentence/token annotations (spaCy) and BERT subword tokens, then
    writes the results to ``<pack_dir>/nif_raw_struct_links_token``.

    NOTE(review): depends on module-level ``pack_dir`` and
    ``loaded_resource`` being defined elsewhere in this file.
    """
    # Input and output pack directories sit side by side under pack_dir.
    pack_input = os.path.join(pack_dir, "nif_raw_struct_links")
    pack_output = os.path.join(pack_dir, "nif_raw_struct_links_token")

    # Index files record which documents are processed, so the output
    # directory structure mirrors the input.
    pack_input_index = os.path.join(pack_input, "article.idx")
    pack_output_index = os.path.join(pack_output, "article.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
        filename=os.path.join(pack_dir, "complete_tokenize.log"),
    )

    # Build the pipeline step by step; each call mutates and returns the
    # same Pipeline instance, so sequential calls are equivalent to the
    # fluent chained form.
    pipeline = Pipeline(loaded_resource)
    pipeline.set_reader(
        DirPackReader(),
        config={"suffix": ".json.gz", "zip_pack": True},
    )
    # NOTE(review): a WikiEntityCompletion() step was present here but
    # disabled (commented out) in the original pipeline.
    pipeline.add(WikiAddTitle())
    pipeline.add(
        SpacyProcessor(),
        config={"processors": ["sentence", "tokenize"]},
    )
    pipeline.add(
        SubwordTokenizer(),
        config={
            "tokenizer_configs": {
                "pretrained_model_name": "bert-base-uncased"
            },
            "token_source": "ft.onto.base_ontology.Token",
        },
    )
    pipeline.add(
        WikiArticleWriter(),
        config={
            "output_dir": pack_output,
            "zip_pack": True,
            "drop_record": True,
            "input_index_file": pack_input_index,
            "output_index_file": pack_output_index,
            "use_input_index": True,
            "serialize_method": "jsonpickle",
        },
    )
    pipeline.add(ProgressPrinter())
    pipeline.run(pack_input)
"""Interactive demo: read text typed at the terminal, annotate it with
StanfordNLP, attach zero-shot topic labels, and display in Stave."""
from forte.huggingface import ZeroShotClassifier
from forte.stanza import StandfordNLPProcessor
from forte import Pipeline
from forte.data.readers import TerminalReader
from forte.processors.stave import StaveProcessor

# Candidate topics the zero-shot classifier scores each input against.
TOPIC_LABELS = [
    "travel",
    "cooking",
    "dancing",
    "exploration",
]

pipeline = Pipeline()
pipeline.set_reader(TerminalReader())
pipeline.add(StandfordNLPProcessor())
pipeline.add(
    ZeroShotClassifier(),
    config={"candidate_labels": TOPIC_LABELS},
)
pipeline.add(StaveProcessor())

pipeline.initialize()
pipeline.run()