def test_huggingface_pos_token_classification(self):
    nlp = Pipeline[DataPack]()
    nlp.set_reader(StringReader())
    nlp.add(PeriodSentenceSplitter())
    token_config = {
        "entry_type": "ft.onto.base_ontology.Sentence",
        "output_entry_type": "ft.onto.base_ontology.Token",
        "attribute_name": "pos",
        "tagging_scheme": "no-merge",
        "model_name": "vblagoje/bert-english-uncased-finetuned-pos",
        "tokenizer": "vblagoje/bert-english-uncased-finetuned-pos",
        "framework": "pt",
    }
    nlp.add(TokenClassification(), config=token_config)
    nlp.initialize()

    sentences = ["My name is Clara and I live in Berkeley, California."]
    pack = nlp.process(sentences)

    expected_type = [[
        "PRON", "NOUN", "AUX", "PROPN", "CCONJ", "PRON",
        "VERB", "ADP", "PROPN", "PUNCT", "PROPN", "PUNCT",
    ]]
    expected_index = [[
        (0, 2), (3, 7), (8, 10), (11, 16), (17, 20), (21, 22),
        (23, 27), (28, 30), (31, 39), (39, 40), (41, 51), (51, 52),
    ]]

    for entry_idx, entry in enumerate(
            pack.get(token_config["entry_type"])):
        for idx, token in enumerate(
                pack.get(
                    entry_type=token_config["output_entry_type"],
                    range_annotation=entry,
                )):
            token_type = getattr(token, token_config["attribute_name"])
            self.assertEqual(token_type, expected_type[entry_idx][idx])
            self.assertEqual(token.begin, expected_index[entry_idx][idx][0])
            self.assertEqual(token.end, expected_index[entry_idx][idx][1])
def setUp(self):
    self.nltk = Pipeline[DataPack](enforce_consistency=True)
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())
    self.nltk.initialize()
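# A minimal usage sketch for the pipeline assembled in setUp above. The test
# name and input string are illustrative assumptions, not from the original
# suite; it only relies on patterns used elsewhere in this file (iterating
# sentences with pack.get(Sentence) and tokens with pack.get(Token, sentence)).
def test_pos_tags_present(self):
    pack = self.nltk.process("This is a simple sentence.")
    for sentence in pack.get(Sentence):
        for token in pack.get(Token, sentence):
            # NLTKPOSTagger should have filled in the `pos` attribute.
            self.assertIsNotNone(token.pos)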
def main():
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(BERTTokenizer(), config=config.BERTTokenizer)
    pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor)
    pl.initialize()

    text = (
        "More than three-quarters of patients (77.5%) had comorbidities. "
        "Twenty-four isolates (60%) were associated with pneumonia, "
        "14 (35%) with upper respiratory tract infections, "
        "and 2 (5%) with bronchiolitis. "
        "The 3 patients who died of M pneumoniae pneumonia "
        "had other comorbidities. ")
    pack = pl.process(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # Collect the subwords and entity mentions within this sentence.
        subwords = [(subword.text, subword.ner)
                    for subword in pack.get(Subword, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Subwords:", 'red'), subwords, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")
        input(colored("Press ENTER to continue...\n", 'green'))
def test_pipeline(self, texts, expected_outputs, expected_tokens):
    nlp = Pipeline[MultiPack]()
    boxer_config = {"pack_name": "input"}
    replacer_op = TmpReplacer.__module__ + "." + TmpReplacer.__qualname__
    processor_config = {
        "augment_entry": "ft.onto.base_ontology.Token",
        "other_entry_policy": {
            "ft.onto.base_ontology.Document": "auto_align",
            "ft.onto.base_ontology.Sentence": "auto_align",
        },
        "type": "data_augmentation_op",
        "data_aug_op": replacer_op,
        "data_aug_op_config": {},
        "augment_pack_names": {},
    }
    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=processor_config,
    )
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack("augmented_input")
        self.assertEqual(aug_pack.text, expected_outputs[idx])
        for j, token in enumerate(aug_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])
def test_tokenizer_auto(self, input_data):
    tokenizer = SubwordTokenizer()
    self.pl = (
        Pipeline[DataPack]()
        .set_reader(StringReader())
        .add(
            tokenizer,
            config={"tokenizer_configs": {"do_lower_case": True}},
        )
        .initialize()
    )
    # Take the vocabulary used by the tokenizer (token string -> vocab id).
    self.vocab: Dict[str, int] = tokenizer.tokenizer.vocab

    for pack in self.pl.process_dataset(input_data):
        for subword in pack.get(Subword):
            if subword.is_unk:
                # Id 100 is [UNK] in the standard uncased BERT vocabulary.
                assert subword.vocab_id == 100
            else:
                # Continuation pieces are stored with the "##" prefix.
                subword_repr = (
                    subword.text
                    if subword.is_first_segment
                    else "##" + subword.text
                )
                assert (
                    subword_repr in self.vocab
                    or subword_repr.lower() in self.vocab
                )
def test_ir(self, input_output_pair):
    """Verify the intermediate representation of the pipeline."""
    i_str, o_str = input_output_pair
    pl_config_path: str = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "eliza_pl_ir.yaml")

    # Build the eliza pipeline and save its intermediate representation.
    eliza_pl: Pipeline[DataPack] = Pipeline[DataPack]()
    eliza_pl.set_reader(StringReader())
    eliza_pl.add(UserSimulator(), config={"user_input": i_str})
    eliza_pl.add(ElizaProcessor())
    eliza_pl.save(pl_config_path)

    # Build the test pipeline from the saved configuration.
    test_pl: Pipeline[DataPack] = Pipeline[DataPack]()
    test_pl.init_from_config_path(pl_config_path)
    test_pl.initialize()

    # Verify the output.
    res: DataPack = test_pl.process("")
    utterance = get_last_utterance(res, "ai")
    self.assertEqual(len([_ for _ in res.get(Utterance)]), 2)
    self.assertEqual(utterance.text, o_str)
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = Config(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            # Language code for the language to build the Pipeline.
            'lang': lang,
            'use_gpu': False,
        },
        StandfordNLPProcessor.default_configs(),
    )
    pl.add(component=StandfordNLPProcessor(models_path), config=config)
    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'),
                  "has relation",
                  colored(link.rel_type, 'green'),
                  "of parent",
                  colored(parent.text, 'cyan'))
        print("\n----------------------\n")
def setUp(self):
    # Define and configure the pipelines.
    self.dataset_path = "examples/"
    self._cache_directory = Path(os.path.join(os.getcwd(), "cache_data"))

    self.pl1 = Pipeline()
    self.pl1.set_reader(StringReader())

    self.pl2 = Pipeline()
    self.pl2.set_reader(StringReader())

    self.text = (
        "The plain green Norway spruce is displayed in the gallery's "
        "foyer. Wentworth worked as an assistant to sculptor Henry Moore "
        "in the late 1960s. His reputation as a sculptor grew in the "
        "1980s.")
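import shutil

# A hedged companion tearDown for the setUp above, assuming the cache
# directory created there should not persist between test runs; the cleanup
# itself is an assumption, not part of the original suite.
def tearDown(self):
    if self._cache_directory.exists():
        shutil.rmtree(self._cache_directory)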
def test_huggingface_ner_token_classification(self):
    nlp = Pipeline[DataPack]()
    nlp.set_reader(StringReader())
    nlp.add(PeriodSentenceSplitter())
    token_config = {
        "entry_type": "ft.onto.base_ontology.Sentence",
        "output_entry_type": "ft.onto.base_ontology.EntityMention",
        "attribute_name": "ner_type",
        "tagging_scheme": "no-merge",  # alternatively: "bio-merge"
        "model_name": "jplu/tf-xlm-r-ner-40-lang",
        "tokenizer": "jplu/tf-xlm-r-ner-40-lang",
        "framework": "tf",
    }
    nlp.add(TokenClassification(), config=token_config)
    nlp.initialize()

    sentences = ["Barack Obama was born in Hawaii."]
    pack = nlp.process(sentences)

    expected_type = [["PER", "PER", "LOC"]]
    expected_index = [[(0, 6), (7, 12), (25, 31)]]

    for entry_idx, entry in enumerate(
            pack.get(token_config["entry_type"])):
        for idx, token in enumerate(
                pack.get(
                    entry_type=token_config["output_entry_type"],
                    range_annotation=entry,
                )):
            token_type = getattr(token, token_config["attribute_name"])
            self.assertEqual(token_type, expected_type[entry_idx][idx])
            self.assertEqual(token.begin, expected_index[entry_idx][idx][0])
            self.assertEqual(token.end, expected_index[entry_idx][idx][1])
def setUp(self):
    self.nltk = Pipeline[DataPack]()
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())
    self.nltk.add(NLTKNER())
    self.nltk.initialize()
def _create_pipeline(config):
    nlp = Pipeline[DataPack]()
    nlp.set_reader(StringReader())
    # Using NLTKSentenceSegmenter to segment the sentences.
    nlp.add(component=NLTKSentenceSegmenter())
    nlp.add(component=AllenNLPProcessor(), config=config)
    nlp.initialize()
    return nlp
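# A hedged usage sketch for _create_pipeline; the "processors" value shown is
# an assumption about what AllenNLPProcessor accepts (the invalid-config test
# further below only demonstrates that the key exists).
def _example_usage():
    nlp = _create_pipeline(config={"processors": "tokenize, pos"})
    pack = nlp.process("Forte pipelines combine readers and processors.")
    for sentence in pack.get(Sentence):
        print([token.text for token in pack.get(Token, sentence)])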
def setUp(self):
    self.nltk = Pipeline[DataPack]()
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())
    config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
    self.nltk.add(NLTKChunker(), config=config)
    self.nltk.initialize()
def setUp(self):
    self.nltk = Pipeline[DataPack](enforce_consistency=True)
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())
    config = {"pattern": "NP: {<DT>?<JJ>*<NN>}"}
    self.nltk.add(NLTKChunker(), config=config)
    self.nltk.initialize()
def setUp(self):
    self.pl = Pipeline[DataPack]()
    self.pl.set_reader(StringReader())
    self.pl.add(DummyPackProcessor(0.9))
    self.pl.add(DummyPackProcessor(0.5))
    self.pl.add(DummyPackProcessor(1.2))
    self.pl.set_profiling()
    self.pl.initialize()
def setUp(self):
    self.nlp = Pipeline()
    self.nlp.set_reader(StringReader())
    self.nlp.add(NLTKSentenceSegmenter())
    boxer_config = {"pack_name": "question"}
    self.nlp.add(MultiPackBoxer(), boxer_config)
    self.nlp.add(MutliDocPackAdder())
    self.nlp.add(QuestionAnsweringMulti())
    self.nlp.initialize()
def setUp(self):
    random.seed(0)
    self.nlp = Pipeline[MultiPack]()
    boxer_config = {'pack_name': 'input_src'}
    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    self.nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
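# setUp above intentionally stops short of initialize(), so individual tests
# can append more components first. A hedged sketch of how a test might then
# drive the pipeline; the method name and input text are illustrative, and
# "input_src" is the pack name set in boxer_config.
def test_tokenize_and_tag(self):
    self.nlp.initialize()
    for m_pack in self.nlp.process_dataset(["Mary waits at the station."]):
        src_pack = m_pack.get_pack("input_src")
        tokens = [(token.text, token.pos) for token in src_pack.get(Token)]
        self.assertTrue(len(tokens) > 0)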
def setUp(self):
    self.nlp = Pipeline[DataPack](enforce_consistency=True)
    self.nlp.set_reader(StringReader())
    config = {
        "question": "What is the molecular function of"
                    " psoralen photobinding on DNA?"
    }
    self.nlp.add(QuestionAnsweringSingle(), config=config)
    self.nlp.initialize()
def main():
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())

    with open("config.yml", "r") as f:
        config = yaml.safe_load(f)
    config = Config(config, default_hparams=None)
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)
    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second)."
    )
    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", "red"), sent_text, "\n")

        # First method to get entries in a sentence.
        tokens = [
            (token.text, token.pos) for token in pack.get(Token, sentence)
        ]
        entities = [
            (entity.text, entity.ner_type)
            for entity in pack.get(EntityMention, sentence)
        ]
        print(colored("Tokens:", "red"), tokens, "\n")
        print(colored("EntityMentions:", "red"), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", "red"))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()
            child: PredicateArgument = link.get_child()
            print(
                f' - "{child.text}" is role {link.arg_type} of '
                f'predicate "{parent.text}"'
            )
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print(" Entities in predicate argument:", entities, "\n")
        print()
        input(colored("Press ENTER to continue...\n", "green"))
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    ner_predictor = CoNLLNERPredictor()
    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams(
        {'storage_path': srl_model_dir},
        SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")
    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")

        # First method to get entries in a sentence.
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f" - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print(" Entities in predicate argument:", entities, "\n")
        print()
        input(colored("Press ENTER to continue...\n", 'green'))
def setUp(self): random.seed(0) self.nlp = Pipeline[MultiPack]() boxer_config = {"pack_name": "input_src"} self.nlp.set_reader(reader=StringReader()) self.nlp.add(component=MultiPackBoxer(), config=boxer_config) self.nlp.add( component=WhiteSpaceTokenizer(), selector=AllPackSelector() )
def setUp(self):
    self.stanford_nlp = Pipeline[DataPack]()
    self.stanford_nlp.set_reader(StringReader())
    config = {
        "processors": "tokenize",
        # Language code for the language to build the Pipeline.
        "lang": "en",
        "use_gpu": False,
    }
    self.stanford_nlp.add(StandfordNLPProcessor(), config=config)
    self.stanford_nlp.initialize()
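# A minimal sketch of exercising the tokenizer-only pipeline from setUp; the
# method name and input are illustrative assumptions, not from the original
# suite.
def test_tokenize(self):
    pack = self.stanford_nlp.process("This is a test sentence.")
    tokens = [token.text for token in pack.get(Token)]
    self.assertGreater(len(tokens), 0)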
def test_ir_basic(self, input_output_pair):
    """Verify the intermediate representation of the pipeline."""
    i_str, o_str = input_output_pair

    # Build the eliza pipeline.
    eliza_pl: Pipeline[DataPack] = Pipeline[DataPack](
        ontology_file=self._onto_path,
        enforce_consistency=True,
        do_init_type_check=True,
    )
    eliza_pl.set_reader(StringReader())
    eliza_pl.add(UserSimulator(), config={"user_input": i_str})
    eliza_pl.add(ElizaProcessor())
    eliza_pl.set_profiling()
    eliza_pl.initialize()
    eliza_pl.save(self._pl_config_path)

    # Build the test pipeline from the saved configuration.
    test_pl: Pipeline[DataPack] = Pipeline[DataPack]()
    test_pl.init_from_config_path(self._pl_config_path)

    # Verify the pipeline states.
    self.assertListEqual(
        *map(
            lambda pl: [
                getattr(pl, attr)
                for attr in (
                    "_initialized",
                    "_enable_profiling",
                    "_check_type_consistency",
                    "_do_init_type_check",
                )
                if hasattr(pl, attr)
            ],
            (eliza_pl, test_pl),
        )
    )
    self.assertDictEqual(
        eliza_pl.resource.get("onto_specs_dict"),
        test_pl.resource.get("onto_specs_dict"),
    )
    self._assertEntryTreeEqual(
        eliza_pl.resource.get("merged_entry_tree").root,
        test_pl.resource.get("merged_entry_tree").root,
    )

    # Verify the output.
    test_pl.initialize()
    res: DataPack = test_pl.process("")
    utterance = get_last_utterance(res, "ai")
    self.assertEqual(len([_ for _ in res.get(Utterance)]), 2)
    self.assertEqual(utterance.text, o_str)
def setUp(self):
    random.seed(8)
    self.nlp = Pipeline[MultiPack]()
    boxer_config = {"pack_name": "input_src"}
    entity_config = {"entities_to_insert": ["Mary", "station"]}
    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=EntityMentionInserter(), config=entity_config)
    self.nlp.add(PeriodSentenceSplitter())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(
        component=WhiteSpaceTokenizer(), selector=AllPackSelector()
    )
def test_lowercase_with_substitution(self):
    document = "Yıldız İbrahimova"
    pack = (
        Pipeline[DataPack]()
        .set_reader(StringReader())
        .add(
            LowerCaserProcessor(),
            config={"custom_substitutions": {"İ": "i"}},
        )
        .initialize()
        .process(document)
    )
    self.assertNotEqual(pack.text, document)
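# Why the custom substitution is needed: Python's default str.lower() maps the
# Turkish dotted capital "İ" (U+0130) to "i" followed by U+0307 (combining dot
# above), not to a plain ASCII "i". A quick standalone check:
assert len("İ".lower()) == 2  # "i" + combining dot above
assert "İ".lower() != "i"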
def setUp(self): self.spacy = Pipeline() self.spacy.set_reader(StringReader()) config = { "processors": "tokenize", "lang": "en_core_web_sm", # Language code for the language to build the Pipeline "use_gpu": False } self.spacy.add_processor(SpacyProcessor(), config=config) self.spacy.initialize()
def setUp(self):
    self.stanford_nlp = Pipeline()
    self.stanford_nlp.set_reader(StringReader())
    models_path = os.getcwd()
    config = {
        "processors": "tokenize",
        # Language code for the language to build the Pipeline.
        "lang": "en",
        "use_gpu": False,
    }
    self.stanford_nlp.add_processor(
        StandfordNLPProcessor(models_path), config=config)
    self.stanford_nlp.initialize()
def test_allennlp_processor_with_invalid_config(self, processors):
    nlp = Pipeline[DataPack]()
    nlp.set_reader(StringReader())
    # Using NLTKSentenceSegmenter to segment the sentences.
    nlp.add(component=NLTKSentenceSegmenter())
    nlp.add(
        component=AllenNLPProcessor(), config={"processors": processors}
    )
    with self.assertRaises(ProcessorConfigError):
        nlp.initialize()
def setUp(self):
    self.spacy = Pipeline[DataPack]()
    self.spacy.set_reader(StringReader())
    config = {
        "processors": "sentence, tokenize",
        # Name of the spaCy model used to build the pipeline.
        "lang": "en_core_web_sm",
        "use_gpu": False,
    }
    self.spacy.add(SpacyProcessor(), config=config)
    self.spacy.initialize()
    self.nlp: Language = spacy.load(config["lang"])
def test_spacy_processor_with_invalid_config(self, processor):
    spacy = Pipeline[DataPack]()
    spacy.set_reader(StringReader())
    config = {
        "processors": processor,
        # Name of the spaCy model used to build the pipeline.
        "lang": "en_core_web_sm",
        "use_gpu": False,
    }
    spacy.add(SpacyProcessor(), config=config)
    with self.assertRaises(ProcessorConfigError):
        spacy.initialize()
def setUp(self):
    self.cliner = Pipeline[DataPack]()
    self.cliner.set_reader(StringReader())
    self.cliner.add(
        ClinicalNER(),
        config={
            'config_model':
                'examples/Cliner/CliNER/models/train_full.model',
            'config_data': 'examples/Cliner/CliNER/data/examples/test.txt',
            'config_output': 'examples/Cliner/CliNER/data/examples',
        },
    )
    self.cliner.initialize()