Example #1
    def test_huggingface_pos_token_classification(self):
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())
        nlp.add(PeriodSentenceSplitter())
        token_config = {
            "entry_type": "ft.onto.base_ontology.Sentence",
            "output_entry_type": "ft.onto.base_ontology.Token",
            "attribute_name": "pos",
            "tagging_scheme": "no-merge",
            "model_name": "vblagoje/bert-english-uncased-finetuned-pos",
            "tokenizer": "vblagoje/bert-english-uncased-finetuned-pos",
            "framework": "pt",
        }
        nlp.add(TokenClassification(), config=token_config)
        nlp.initialize()
        sentences = ["My name is Clara and I live in Berkeley, California."]

        pack = nlp.process(sentences)

        expected_type = [[
            "PRON",
            "NOUN",
            "AUX",
            "PROPN",
            "CCONJ",
            "PRON",
            "VERB",
            "ADP",
            "PROPN",
            "PUNCT",
            "PROPN",
            "PUNCT",
        ]]
        expected_index = [[
            (0, 2),
            (3, 7),
            (8, 10),
            (11, 16),
            (17, 20),
            (21, 22),
            (23, 27),
            (28, 30),
            (31, 39),
            (39, 40),
            (41, 51),
            (51, 52),
        ]]

        for entry_idx, entry in enumerate(pack.get(
                token_config["entry_type"])):
            for idx, token in enumerate(
                    pack.get(
                        entry_type=token_config["output_entry_type"],
                        range_annotation=entry,
                    )):
                token_type = getattr(token, token_config["attribute_name"])
                self.assertEqual(token_type, expected_type[entry_idx][idx])
                self.assertEqual(token.begin,
                                 expected_index[entry_idx][idx][0])
                self.assertEqual(token.end, expected_index[entry_idx][idx][1])
Example #2
 def setUp(self):
     self.nltk = Pipeline[DataPack](enforce_consistency=True)
     self.nltk.set_reader(StringReader())
     self.nltk.add(NLTKSentenceSegmenter())
     self.nltk.add(NLTKWordTokenizer())
     self.nltk.add(NLTKPOSTagger())
     self.nltk.initialize()
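A minimal usage sketch for the pipeline configured in this setUp, following the same process/get pattern as the other examples on this page; the test method name and input text are illustrative assumptions rather than part of the original snippet.

    # Assumed imports for this sketch:
    # from ft.onto.base_ontology import Sentence, Token
    def test_pos_tagging(self):
        pack = self.nltk.process(
            "Forte builds NLP pipelines. This sentence is a test.")
        for sentence in pack.get(Sentence):
            # NLTKWordTokenizer creates the Token entries and NLTKPOSTagger
            # fills in token.pos, as in the POS examples further down.
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            print(tokens)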
Example #3
def main():
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(BERTTokenizer(), config=config.BERTTokenizer)
    pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor)
    pl.initialize()

    text = ("More than three-quarters of patients (77.5%) had comorbidities. "
            "Twenty-four isolates (60%) were associated with pneumonia, "
            "14 (35%) with upper respiratory tract infections, "
            "and 2 (5%) with bronchiolitis. "
            "The 3 patients who died of M pneumoniae pneumonia "
            "had other comorbidities. ")
    pack = pl.process(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        subwords = [(subword.text, subword.ner)
                    for subword in pack.get(Subword, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Subwords:", 'red'), subwords, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        input(colored("Press ENTER to continue...\n", 'green'))
Example #4
    def test_pipeline(self, texts, expected_outputs, expected_tokens):
        nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input"}

        replacer_op = TmpReplacer.__module__ + "." + TmpReplacer.__qualname__

        processor_config = {
            "augment_entry": "ft.onto.base_ontology.Token",
            "other_entry_policy": {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align",
            },
            "type": "data_augmentation_op",
            "data_aug_op": replacer_op,
            "data_aug_op_config": {},
            "augment_pack_names": {},
        }

        nlp.set_reader(reader=StringReader())
        nlp.add(component=MultiPackBoxer(), config=boxer_config)
        nlp.add(component=WhiteSpaceTokenizer(), selector=AllPackSelector())
        nlp.add(
            component=ReplacementDataAugmentProcessor(), config=processor_config
        )
        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(texts)):
            aug_pack = m_pack.get_pack("augmented_input")

            self.assertEqual(aug_pack.text, expected_outputs[idx])

            for j, token in enumerate(aug_pack.get(Token)):
                self.assertEqual(token.text, expected_tokens[idx][j])
Example #5
    def test_tokenizer_auto(self, input_data):
        tokenizer = SubwordTokenizer()
        self.pl = (
            Pipeline[DataPack]()
            .set_reader(StringReader())
            .add(
                tokenizer, config={"tokenizer_configs": {"do_lower_case": True}}
            )
            .initialize()
        )

        # Take the vocabulary used by the tokenizer.
        self.vocab: Dict[str, int] = tokenizer.tokenizer.vocab
        for pack in self.pl.process_dataset(input_data):
            for subword in pack.get(Subword):
                if subword.is_unk:
                    assert subword.vocab_id == 100
                else:
                    subword_repr = (
                        subword.text
                        if subword.is_first_segment
                        else "##" + subword.text
                    )
                    if not (
                        subword_repr in self.vocab
                        or subword_repr.lower() in self.vocab
                    ):
                        assert False
Example #6
    def test_ir(self, input_output_pair):
        """
        Verify the intermediate representation of the pipeline.
        """
        i_str, o_str = input_output_pair
        pl_config_path: str = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "eliza_pl_ir.yaml")

        # Build eliza pipeline
        eliza_pl: Pipeline[DataPack] = Pipeline[DataPack]()
        eliza_pl.set_reader(StringReader())
        eliza_pl.add(UserSimulator(), config={"user_input": i_str})
        eliza_pl.add(ElizaProcessor())
        eliza_pl.save(pl_config_path)

        # Build test pipeline
        test_pl: Pipeline[DataPack] = Pipeline[DataPack]()
        test_pl.init_from_config_path(pl_config_path)
        test_pl.initialize()

        # Verify output
        res: DataPack = test_pl.process("")
        utterance = get_last_utterance(res, "ai")
        self.assertEqual(len([_ for _ in res.get(Utterance)]), 2)
        self.assertEqual(utterance.text, o_str)
Example #7
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = Config(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code for the language to build the Pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_configs())
    pl.add(component=StandfordNLPProcessor(models_path), config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
Example #8
    def setUp(self):
        # Define and config the Pipeline
        self.dataset_path = "examples/"

        self.pl1 = Pipeline()
        self._cache_directory = Path(os.path.join(os.getcwd(), "cache_data"))
        self.pl1.set_reader(StringReader())

        self.pl2 = Pipeline()
        self.pl2.set_reader(StringReader())

        self.text = (
            "The plain green Norway spruce is displayed in the gallery's "
            "foyer. Wentworth worked as an assistant to sculptor Henry Moore "
            "in the late 1960s. His reputation as a sculptor grew in the "
            "1980s.")
Example #9
    def test_huggingface_ner_token_classification(self):
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())
        nlp.add(PeriodSentenceSplitter())
        token_config = {
            "entry_type": "ft.onto.base_ontology.Sentence",
            "output_entry_type": "ft.onto.base_ontology.EntityMention",
            "attribute_name": "ner_type",
            "tagging_scheme": "no-merge",  # 'bio-merge'
            "model_name": "jplu/tf-xlm-r-ner-40-lang",
            "tokenizer": "jplu/tf-xlm-r-ner-40-lang",
            "framework": "tf",
        }
        nlp.add(TokenClassification(), config=token_config)
        nlp.initialize()
        sentences = ["Barack Obama was born in Hawaii."]

        pack = nlp.process(sentences)

        expected_type = [["PER", "PER", "LOC"]]
        expected_index = [[(0, 6), (7, 12), (25, 31)]]

        for entry_idx, entry in enumerate(pack.get(
                token_config["entry_type"])):
            for idx, token in enumerate(
                    pack.get(
                        entry_type=token_config["output_entry_type"],
                        range_annotation=entry,
                    )):

                token_type = getattr(token, token_config["attribute_name"])
                self.assertEqual(token_type, expected_type[entry_idx][idx])
                self.assertEqual(token.begin,
                                 expected_index[entry_idx][idx][0])
                self.assertEqual(token.end, expected_index[entry_idx][idx][1])
Example #10
 def setUp(self):
     self.nltk = Pipeline[DataPack]()
     self.nltk.set_reader(StringReader())
     self.nltk.add(NLTKSentenceSegmenter())
     self.nltk.add(NLTKWordTokenizer())
     self.nltk.add(NLTKPOSTagger())
     self.nltk.add(NLTKNER())
     self.nltk.initialize()
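A hedged sketch of inspecting this pipeline's output, assuming NLTKNER produces EntityMention annotations with an ner_type attribute, as the other NER examples on this page do; the method name and input text are illustrative.

    # Assumed imports for this sketch:
    # from ft.onto.base_ontology import Sentence, EntityMention
    def test_ner(self):
        pack = self.nltk.process("Barack Obama was born in Hawaii.")
        for sentence in pack.get(Sentence):
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(entities)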
Example #11
    def _create_pipeline(config):
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())

        # Using NLTKSentenceSegmenter to segment the sentences
        nlp.add(component=NLTKSentenceSegmenter())
        nlp.add(component=AllenNLPProcessor(), config=config)
        nlp.initialize()
        return nlp
Example #12
 def setUp(self):
     self.nltk = Pipeline[DataPack]()
     self.nltk.set_reader(StringReader())
     self.nltk.add(NLTKSentenceSegmenter())
     self.nltk.add(NLTKWordTokenizer())
     self.nltk.add(NLTKPOSTagger())
     config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
     self.nltk.add(NLTKChunker(), config=config)
     self.nltk.initialize()
Example #13
 def setUp(self):
     self.nltk = Pipeline[DataPack](enforce_consistency=True)
     self.nltk.set_reader(StringReader())
     self.nltk.add(NLTKSentenceSegmenter())
     self.nltk.add(NLTKWordTokenizer())
     self.nltk.add(NLTKPOSTagger())
     config = {"pattern": "NP: {<DT>?<JJ>*<NN>}"}
     self.nltk.add(NLTKChunker(), config=config)
     self.nltk.initialize()
Example #14
    def setUp(self):
        self.pl = Pipeline[DataPack]()
        self.pl.set_reader(StringReader())
        self.pl.add(DummyPackProcessor(0.9))
        self.pl.add(DummyPackProcessor(0.5))
        self.pl.add(DummyPackProcessor(1.2))

        self.pl.set_profiling()
        self.pl.initialize()
Example #15
 def setUp(self):
     self.nlp = Pipeline()
     self.nlp.set_reader(StringReader())
     self.nlp.add(NLTKSentenceSegmenter())
     boxer_config = {"pack_name": "question"}
     self.nlp.add(MultiPackBoxer(), boxer_config)
     self.nlp.add(MutliDocPackAdder())
     self.nlp.add(QuestionAnsweringMulti())
     self.nlp.initialize()
Example #16
    def setUp(self):
        random.seed(0)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {'pack_name': 'input_src'}

        self.nlp.set_reader(reader=StringReader())
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
        self.nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
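A minimal sketch of exercising the MultiPack pipeline above; it assumes MultiPackBoxer stores the source text in the pack named by boxer_config, mirroring how the data-augmentation example earlier reads a named pack back out of the MultiPack. The initialize call, method name, and input text are illustrative additions.

    # Assumed import for this sketch: from ft.onto.base_ontology import Token
    def test_multipack_tokens(self):
        self.nlp.initialize()
        m_pack = self.nlp.process("Mary waited for the bus at the station.")
        src_pack = m_pack.get_pack("input_src")
        # NLTKWordTokenizer and NLTKPOSTagger ran on every pack via
        # AllPackSelector, so the source pack carries Token entries with pos.
        tokens = [(token.text, token.pos) for token in src_pack.get(Token)]
        print(tokens)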
Example #17
 def setUp(self):
     self.nlp = Pipeline[DataPack](enforce_consistency=True)
     self.nlp.set_reader(StringReader())
     config = {
         "question":
         "What is the molecular function of"
         " psoralen photobinding on DNA?"
     }
     self.nlp.add(QuestionAnsweringSingle(), config=config)
     self.nlp.initialize()
Example #18
def main():
    pl = Pipeline[DataPack]()
    pl.set_reader(StringReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())

    config = yaml.safe_load(open("config.yml", "r"))

    config = Config(config, default_hparams=None)

    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second)."
    )

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", "red"), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [
            (token.text, token.pos) for token in pack.get(Token, sentence)
        ]
        entities = [
            (entity.text, entity.ner_type)
            for entity in pack.get(EntityMention, sentence)
        ]
        print(colored("Tokens:", "red"), tokens, "\n")
        print(colored("EntityMentions:", "red"), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", "red"))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()
            child: PredicateArgument = link.get_child()
            print(
                f'  - "{child.text}" is role {link.arg_type} of '
                f'predicate "{parent.text}"'
            )
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", "green"))
Example #19
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    ner_predictor = CoNLLNERPredictor()

    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
Example #20
    def setUp(self):
        random.seed(0)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input_src"}

        self.nlp.set_reader(reader=StringReader())
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(
            component=WhiteSpaceTokenizer(), selector=AllPackSelector()
        )
Example #21
 def setUp(self):
     self.stanford_nlp = Pipeline[DataPack]()
     self.stanford_nlp.set_reader(StringReader())
     config = {
         "processors": "tokenize",
         "lang": "en",
         # Language code for the language to build the Pipeline
         "use_gpu": False,
     }
     self.stanford_nlp.add(StandfordNLPProcessor(), config=config)
     self.stanford_nlp.initialize()
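A short sketch of using the tokenizer-only StanfordNLP pipeline above, assuming tokenization yields Token entries as in the full StanfordNLP example earlier; the method name and input text are illustrative.

    # Assumed import for this sketch: from ft.onto.base_ontology import Token
    def test_tokenize(self):
        pack = self.stanford_nlp.process("This is a test sentence.")
        tokens = [token.text for token in pack.get(Token)]
        print(tokens)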
Example #22
    def test_ir_basic(self, input_output_pair):
        """
        Verify the intermediate representation of the pipeline.
        """
        i_str, o_str = input_output_pair

        # Build eliza pipeline
        eliza_pl: Pipeline[DataPack] = Pipeline[DataPack](
            ontology_file=self._onto_path,
            enforce_consistency=True,
            do_init_type_check=True,
        )
        eliza_pl.set_reader(StringReader())
        eliza_pl.add(UserSimulator(), config={"user_input": i_str})
        eliza_pl.add(ElizaProcessor())
        eliza_pl.set_profiling()
        eliza_pl.initialize()
        eliza_pl.save(self._pl_config_path)

        # Build test pipeline
        test_pl: Pipeline[DataPack] = Pipeline[DataPack]()
        test_pl.init_from_config_path(self._pl_config_path)

        # Verify pipeline states
        self.assertListEqual(
            *map(
                lambda pl: [
                    getattr(pl, attr)
                    for attr in (
                        "_initialized",
                        "_enable_profiling",
                        "_check_type_consistency",
                        "_do_init_type_check",
                    )
                    if hasattr(pl, attr)
                ],
                (eliza_pl, test_pl),
            )
        )
        self.assertDictEqual(
            eliza_pl.resource.get("onto_specs_dict"),
            test_pl.resource.get("onto_specs_dict"),
        )
        self._assertEntryTreeEqual(
            eliza_pl.resource.get("merged_entry_tree").root,
            test_pl.resource.get("merged_entry_tree").root,
        )

        # Verify output
        test_pl.initialize()
        res: DataPack = test_pl.process("")
        utterance = get_last_utterance(res, "ai")
        self.assertEqual(len([_ for _ in res.get(Utterance)]), 2)
        self.assertEqual(utterance.text, o_str)
Example #23
    def setUp(self):
        random.seed(8)
        self.nlp = Pipeline[MultiPack]()

        boxer_config = {"pack_name": "input_src"}
        entity_config = {"entities_to_insert": ["Mary", "station"]}
        self.nlp.set_reader(reader=StringReader())
        self.nlp.add(component=EntityMentionInserter(), config=entity_config)
        self.nlp.add(PeriodSentenceSplitter())
        self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
        self.nlp.add(component=WhiteSpaceTokenizer(),
                     selector=AllPackSelector())
Example #24
    def test_lowercase_with_substitution(self):
        document = "Yıldız İbrahimova"
        pack = (Pipeline[DataPack]().set_reader(StringReader()).add(
            LowerCaserProcessor(),
            config={
                "custom_substitutions": {
                    "İ": "i"
                }
            },
        ).initialize().process(document))

        self.assertNotEqual(pack.text, document)
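The test above only asserts that the processed text differs from the input. A stricter check could pin down the exact output; assuming the substitution maps "İ" to a plain "i" before lowercasing (rather than the "i" plus combining dot that a plain str.lower() would yield), the expected value would be along these lines (an assumption, not taken from the original test):

        # assumed expected value, not from the original test:
        # self.assertEqual(pack.text, "yıldız ibrahimova")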
Example #25
    def setUp(self):
        self.spacy = Pipeline()
        self.spacy.set_reader(StringReader())

        config = {
            "processors": "tokenize",
            "lang": "en_core_web_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        self.spacy.add_processor(SpacyProcessor(), config=config)
        self.spacy.initialize()
Example #26
 def setUp(self):
     self.stanford_nlp = Pipeline()
     self.stanford_nlp.set_reader(StringReader())
     models_path = os.getcwd()
     config = {
         "processors": "tokenize",
         "lang": "en",
         # Language code for the language to build the Pipeline
         "use_gpu": False
     }
     self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                     config=config)
     self.stanford_nlp.initialize()
Example #27
    def test_allennlp_processor_with_invalid_config(self, processors):
        nlp = Pipeline[DataPack]()
        nlp.set_reader(StringReader())

        # Using NLTKSentenceSegmenter to segment the sentences
        nlp.add(component=NLTKSentenceSegmenter())

        nlp.add(
            component=AllenNLPProcessor(), config={"processors": processors}
        )

        with self.assertRaises(ProcessorConfigError):
            nlp.initialize()
Example #28
    def setUp(self):
        self.spacy = Pipeline[DataPack]()
        self.spacy.set_reader(StringReader())

        config = {
            "processors": "sentence, tokenize",
            "lang": "en_core_web_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False,
        }
        self.spacy.add(SpacyProcessor(), config=config)
        self.spacy.initialize()

        self.nlp: Language = spacy.load(config["lang"])
Example #29
    def test_spacy_processor_with_invalid_config(self, processor):
        spacy = Pipeline[DataPack]()
        spacy.set_reader(StringReader())

        config = {
            "processors": processor,
            "lang": "en_core_web_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False,
        }
        spacy.add(SpacyProcessor(), config=config)

        with self.assertRaises(ProcessorConfigError):
            spacy.initialize()
Example #30
 def setUp(self):
     self.cliner = Pipeline[DataPack]()
     self.cliner.set_reader(StringReader())
     self.cliner.add(ClinicalNER(),
                     config={
                         'config_model':
                         'examples/Cliner/CliNER/models/train_full'
                         '.model',
                         'config_data':
                         'examples/Cliner/CliNER/data/examples/test.txt',
                         'config_output':
                         'examples/Cliner/CliNER/data/examples',
                     })
     self.cliner.initialize()