def pack_example(input_path, output_path):
    """
    This example reads data from the input path and serializes the processed
    data packs to the output path.

    Args:
        input_path: Directory containing the OntoNotes input files.
        output_path: Directory where the serialized data packs are written.

    Returns:
        None
    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()

    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the result to the output
    # directory and uses the DocID field in the data pack as the file name.
    nlp.add(
        PackNameJsonPackWriter(),
        {
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
        },
    )

    nlp.run(input_path)
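# A minimal, hypothetical entry point for pack_example above; the argument
# names and help strings are placeholders, not part of the original example.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("input_path", help="Directory of OntoNotes input files")
    parser.add_argument("output_path", help="Directory for serialized data packs")
    args = parser.parse_args()

    pack_example(args.input_path, args.output_path)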
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config_file", default="./config.yml", help="Config YAML filepath"
    )
    args = parser.parse_args()

    # loading config
    config = yaml.safe_load(open(args.config_file, "r"))

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"],
    )
    nlp.initialize()

    for m_pack in nlp.process_dataset():
        aug_pack = m_pack.get_pack("augmented_input")
        logging.info(aug_pack.text)
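# A hypothetical sketch of how main() above might be invoked; logging is set
# to INFO here only so the logging.info() calls in the loop produce output.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()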
def setUp(self):
    self.nltk = Pipeline[DataPack](enforce_consistency=True)
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())
    self.nltk.initialize()
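# A minimal sketch of a test that could follow the setUp above; the input
# sentence and expected token count are illustrative assumptions, and Token
# is assumed to be imported from the ontology module used elsewhere here.
def test_pos_tagging(self):
    pack = self.nltk.process_one("This is a test.")
    tokens = [(token.text, token.pos) for token in pack.get(Token)]
    # NLTK's word tokenizer should split the sentence into five tokens,
    # each of which should have received a POS tag.
    self.assertEqual(len(tokens), 5)
    self.assertTrue(all(pos is not None for _, pos in tokens))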
def setUp(self):
    self.nltk = Pipeline[DataPack](enforce_consistency=True)
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())

    config = {"pattern": "NP: {<DT>?<JJ>*<NN>}"}
    self.nltk.add(NLTKChunker(), config=config)
    self.nltk.initialize()
def main():
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())

    config = yaml.safe_load(open("config.yml", "r"))
    config = Config(config, default_hparams=None)

    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)
    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second)."
    )

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", "red"), sent_text, "\n")

        # First method to get entries in a sentence: query by type and range.
        tokens = [
            (token.text, token.pos) for token in pack.get(Token, sentence)
        ]
        entities = [
            (entity.text, entity.ner_type)
            for entity in pack.get(EntityMention, sentence)
        ]
        print(colored("Tokens:", "red"), tokens, "\n")
        print(colored("EntityMentions:", "red"), entities, "\n")

        # Second method to get entries in a sentence: follow links.
        print(colored("Semantic role labels:", "red"))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()
            child: PredicateArgument = link.get_child()
            print(
                f' - "{child.text}" is role {link.arg_type} of '
                f'predicate "{parent.text}"'
            )
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("    Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", "green"))
def main(dataset_dir: str): config = yaml.safe_load(open("config.yml", "r")) config = Config(config, default_hparams=None) pl = Pipeline[DataPack]() pl.set_reader(PlainTextReader()) pl.add(NLTKSentenceSegmenter()) pl.add(NLTKWordTokenizer()) pl.add(NLTKPOSTagger()) pl.add(CoNLLNERPredictor(), config=config.NER) pl.add(SRLPredictor(), config=config.SRL) pl.initialize() for pack in pl.process_dataset(dataset_dir): print(colored("Document", "red"), pack.pack_name) for sentence in pack.get(Sentence): sent_text = sentence.text print(colored("Sentence:", "red"), sent_text, "\n") # first method to get entry in a sentence tokens = [ (token.text, token.pos) for token in pack.get(Token, sentence) ] entities = [ (entity.text, entity.ner_type) for entity in pack.get(EntityMention, sentence) ] print(colored("Tokens:", "red"), tokens, "\n") print(colored("EntityMentions:", "red"), entities, "\n") # second method to get entry in a sentence print(colored("Semantic role labels:", "red")) for link in pack.get(PredicateLink, sentence): parent: PredicateMention = link.get_parent() # type: ignore child: PredicateArgument = link.get_child() # type: ignore print( f' - "{child.text}" is role {link.arg_type} of ' f'predicate "{parent.text}"' ) entities = [ entity.text for entity in pack.get(EntityMention, child) ] print(" Entities in predicate argument:", entities, "\n") print() input(colored("Press ENTER to continue...\n", "green"))
def setup(config: Config) -> Pipeline:
    resource = Resources()
    query_pipeline = Pipeline[MultiPack](resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader
    )
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.translator
    )
    query_pipeline.add(
        component=BertBasedQueryCreator(), config=config.query_creator
    )
    query_pipeline.add(component=SearchProcessor(), config=config.searcher)

    top_response_pack_name = config.indexer.response_pack_name + "_0"

    query_pipeline.add(
        component=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name=top_response_pack_name),
    )
    query_pipeline.add(
        component=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name=top_response_pack_name),
    )
    query_pipeline.add(
        component=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name=top_response_pack_name),
    )
    query_pipeline.add(
        component=SRLPredictor(),
        config=config.SRL,
        selector=NameMatchSelector(select_name=top_response_pack_name),
    )
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.back_translator
    )

    query_pipeline.initialize()

    return query_pipeline
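# A minimal, assumed usage sketch for setup(): it loads a YAML config in the
# same style as the other examples here, builds the pipeline, and consumes
# the multi packs it yields. The "config.yml" filename and the response pack
# name derivation are assumptions based on the selector names used above.
if __name__ == "__main__":
    config = Config(
        yaml.safe_load(open("config.yml", "r")), default_hparams=None
    )
    query_pipeline = setup(config)

    for m_pack in query_pipeline.process_dataset():
        # Each iteration corresponds to one query read from the terminal;
        # print the text of the top-ranked response pack.
        response_pack = m_pack.get_pack(
            config.indexer.response_pack_name + "_0"
        )
        print(response_pack.text)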