def test_reader_original_span_test(self, value):
    """Check that spans in the replaced text map back to the expected
    original spans.

    Args:
        value: A ``(input_span, expected_span, mode)`` triple; ``input_span``
            is a span in the replaced text, ``expected_span`` the span it
            should map to in the original text under ``mode``.
    """
    # Fixed replace operations and the text they should produce.
    span_ops, output = (
        [
            (Span(11, 19), "New"),
            (Span(19, 20), " Shiny "),
            (Span(25, 25), " Ends"),
        ],
        "<title>The New Shiny Title Ends </title>",
    )
    input_span, expected_span, mode = value

    pipeline = Pipeline()
    reader = PlainTextReader()
    # Force the reader to apply our fixed span operations.
    reader.text_replace_operation = lambda _: span_ops
    pipeline.set_reader(reader, {"file_ext": ".html"})
    pipeline.initialize()

    pack = pipeline.process_one(self.test_dir)
    self.assertEqual(pack.text, output)

    output_span = pack.get_original_span(input_span, mode)
    # BUG FIX: the message previously used `{span.begin, span.end}` (a tuple
    # display) inside literal parens, printing e.g. "Expected: ((11, 19))".
    # Format the begin/end values directly for a clean message.
    self.assertEqual(
        output_span,
        expected_span,
        f"Expected: ({expected_span.begin}, {expected_span.end})"
        f", Found: ({output_span.begin}, {output_span.end})"
        f" when Input: ({input_span.begin}, {input_span.end})"
        f" and Mode: {mode}",
    )
def test_reader_no_replace_test(self):
    """Reading without any replace operations must leave the text intact."""
    # Build a pipeline whose reader performs no text replacement.
    pipeline = Pipeline()
    pipeline.set_reader(PlainTextReader(), {"file_ext": ".html"})
    pipeline.initialize()

    pack = pipeline.process_one(self.test_dir)
    self.assertEqual(pack.text, self.orig_text)
def setUp(self) -> None:
    """Process the ontonotes test data once and cache the resulting pack."""
    base_dir = os.path.dirname(__file__)
    data_path = os.path.join(
        base_dir, os.pardir, os.pardir, 'test_data', 'ontonotes')

    pipeline = Pipeline()
    pipeline.set_reader(OntonotesReader())
    pipeline.initialize()
    self.data_pack: DataPack = pipeline.process_one(data_path)
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    """Run an NLTK + NER + SRL pipeline over a fixed example text and
    interactively print the annotations found in each sentence.

    Args:
        ner_model_dir: Directory containing the NER model resources.
        srl_model_dir: Directory containing the SRL model checkpoint.
    """
    pipeline = Pipeline()
    pipeline.set_reader(StringReader())
    pipeline.add_processor(NLTKSentenceSegmenter())
    pipeline.add_processor(NLTKWordTokenizer())
    pipeline.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    pipeline.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pipeline.add_processor(SRLPredictor(), srl_configs)

    pipeline.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")
    pack = pipeline.process_one(text)

    for sentence in pack.get(Sentence):
        print(colored("Sentence:", 'red'), sentence.text, "\n")

        # First way to get entries covered by a sentence: query the pack.
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second way: walk the semantic-role links anchored in the sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f" - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            arg_entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print(" Entities in predicate argument:", arg_entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_reader_replace_back_test(self, value):
    """Reading with replacements: the pack text reflects the replacement
    and ``get_original_text`` recovers the unmodified source."""
    span_ops, output = value

    pipeline = Pipeline()
    reader = PlainTextReader()
    # Force the reader to apply the parameterized span operations.
    reader.text_replace_operation = lambda _: span_ops
    pipeline.set_reader(reader, {"file_ext": ".html"})
    pipeline.initialize()

    pack: DataPack = pipeline.process_one(self.test_dir)
    self.assertEqual(pack.text, output)
    # Undoing the replacements must yield the original input text.
    self.assertEqual(self.orig_text, pack.get_original_text())
def test_parse_pack(self, text, annotation_length):
    """Write ``text`` to a file, read it with ``MultiPackSentenceReader``,
    and verify the structure of the resulting multipack.

    Args:
        text: Raw text to write into the temporary input file.
        annotation_length: Expected number of annotations in the input pack.
    """
    with open(os.path.join(self.test_dir, 'test.txt'), 'w') as f:
        f.write(text)

    pipeline = Pipeline()
    pipeline.set_reader(MultiPackSentenceReader())
    pipeline.initialize()

    multipack: MultiPack = pipeline.process_one(self.test_dir)
    input_pack = multipack.get_pack('input_src')

    self.assertEqual(len(multipack.packs), 2)
    self.assertEqual(multipack._pack_names, ['input_src', 'output_tgt'])
    self.assertEqual(len(input_pack.annotations), annotation_length)
    # The reader appends a trailing newline to the source text.
    self.assertEqual(input_pack.text, text + "\n")
def main():
    """Build an NLTK + NER + SRL pipeline, process a sample text, and
    interactively print the annotations per sentence."""
    pipeline = Pipeline()
    pipeline.set_reader(StringReader())
    for processor in (NLTKSentenceSegmenter(),
                      NLTKWordTokenizer(),
                      NLTKPOSTagger()):
        pipeline.add_processor(processor)
    pipeline.add_processor(CoNLLNERPredictor(), config=config.NER)
    pipeline.add_processor(SRLPredictor(), config=config.SRL)
    pipeline.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second).")
    pack = pipeline.process_one(text)

    for sentence in pack.get(Sentence):
        print(colored("Sentence:", 'red'), sentence.text, "\n")

        # First way to get entries covered by a sentence: query the pack.
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second way: walk the semantic-role links anchored in the sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f" - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            arg_entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print(" Entities in predicate argument:", arg_entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
"with the added feature of being able to read" \ " out the results. Scottish speech technology firm CEC " \ "Systems launched the site in November. But experts have" \ " questioned whether talking search engines are of any " \ "real benefit to people with visual impairments. The" \ " Edinburgh-based firm CEC has married speech " \ "technology with ever-popular internet search. The " \ "ability to search is becoming increasingly crucial to " \ "surfers baffled by the huge amount of information " \ "available on the web." win_medal_text = "British hurdler Sarah Claxton is confident she can win her " \ "first major medal at next month's European Indoor " \ "Championships in Madrid." pack = pl.process_one(win_medal_text) for sentence in pack.get(Sentence): sent_text = sentence.text print(colored("Sentence:", 'red'), sent_text, "\n") for sentence in pack.get(Sentence): tokens = [(token.text, token.pos_tag) for token in pack.get(Token, sentence)] print(colored("Tokens:", 'red'), tokens, "\n") break for sentence in pack.get(Sentence): for entity in pack.get(EntityMention, sentence): print(colored("EntityMention:", 'red'), entity.text, 'has type', colored(entity.ner_type, 'blue'), "\n")