    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

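        # Build a pipeline that reads each file into a MultiPack with "input" and
        # "output" packs, then translates the "input" pack from German to English
        # into a new "result" pack.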
        nlp = Pipeline()
        reader_config = HParams(
            {
                "input_pack_name": "input",
                "output_pack_name": "output"
            }, MultiPackSentenceReader.default_hparams())
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        translator_config = HParams(
            {
                "src_language": "de",
                "target_language": "en",
                "in_pack_name": "input",
                "out_pack_name": "result"
            }, None)

        nlp.add_processor(MicrosoftBingTranslator(), config=translator_config)
        nlp.initialize()

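        # Each MultiPack should now hold the reader's "input"/"output" packs plus
        # the translator's "result" pack.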
        english_results = ["Hey good morning", "This is Forte. A tool for NLP"]
        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            self.assertEqual(set(m_pack._pack_names),
                             set(["input", "output", "result"]))
            self.assertEqual(
                m_pack.get_pack("result").text, english_results[idx] + "\n")
Example #2
class TestStanfordNLPProcessor(unittest.TestCase):
    def setUp(self):
        self.stanford_nlp = Pipeline()
        self.stanford_nlp.set_reader(StringReader())
        models_path = os.getcwd()
        config = HParams(
            {
                "processors": "tokenize",
                "lang": "en",
                # Language code used to build the pipeline
                "use_gpu": False
            },
            StandfordNLPProcessor.default_hparams())
        self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                        config=config)
        self.stanford_nlp.initialize()

    # TODO
    @unittest.skip("We need to test this without needing to download models "
                   "everytime")
    def test_stanford_processor(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.stanford_nlp.process(document)
        print(pack)
class MSMarcoPassageReaderTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()

        self.pipeline.set_reader(MSMarcoPassageReader())
        self.pipeline.initialize()

        self.data_dir = 'data_samples/ms_marco_passage_retrieval'

        corpus_file = os.path.join(self.data_dir, 'collection.tsv')
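        # Build the expected {passage_id: passage_text} mapping from the raw
        # collection file.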
        self.expected_content = {}
        with open(corpus_file, 'r') as f:
            for line in f.readlines():
                key, value = tuple(line.split('\t', 1))
                self.expected_content[key] = value

    def test_ms_marco_passage_reader(self):
        actual_content: Dict[str, str] = {}
        for data_pack in self.pipeline.process_dataset(self.data_dir):
            self.assertIsInstance(data_pack, DataPack)
            doc_entries = list(data_pack.get(Document))
            self.assertTrue(len(doc_entries) == 1)
            doc_entry: Document = doc_entries[0]
            self.assertIsInstance(doc_entry, Document)
            actual_content[data_pack.pack_name] = doc_entry.text

        self.assertDictEqual(actual_content, self.expected_content)
Example #4
class PipelineTest(unittest.TestCase):
    def setUp(self) -> None:
        # Define and config the Pipeline
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
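        # Run the dummy relation extractor with a batch size of 5.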
        config = HParams({"batcher": {
            "batch_size": 5
        }}, dummy.default_hparams())
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.dataset_path = \
            "forte/tests/data_samples/ontonotes_sample_dataset/00"

    def test_process_next(self):
        # get processed pack from dataset
        for pack in self.nlp.process_dataset(self.dataset_path):
            # get sentence from pack
            for sentence in pack.get_entries(Sentence):
                sent_text = sentence.text

                # first method to get entry in a sentence
                for link in pack.get_entries(RelationLink, sentence):
                    parent = link.get_parent()
                    child = link.get_child()
                    print(f"{parent.text} is {link.rel_type} {child.text}")

                # second method to get entry in a sentence
                tokens = [
                    token.text for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))
Example #5
class AGNewsReaderTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()

        self.pipeline.set_reader(AGNewsReader())
        self.pipeline.initialize()

        self.sample_file: str = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         *([os.path.pardir] * 4),
                         "data_samples/ag_news/sample.csv"))

        self.expected_content: Dict[int, Tuple[int, str, str]] = {}
        with open(self.sample_file, "r") as file:
            for line_id, line in enumerate(file):
                data = line.strip().split(",")
                class_id, title, description = (
                    int(data[0].replace('"', "")),
                    data[1],
                    data[2],
                )
                self.expected_content[line_id] = (class_id, title, description)

        self.class_idx_to_name = {
            1: "World",
            2: "Sports",
            3: "Business",
            4: "Sci/Tech",
        }

    def test_ag_news_reader(self):
        for data_pack in self.pipeline.process_dataset(self.sample_file):
            (
                expected_class_id,
                expected_title,
                expected_desc,
            ) = self.expected_content[data_pack.pack_name]
            self.assertIsInstance(data_pack, DataPack)
            # Test Article
            doc_entries = list(data_pack.get(Document))
            self.assertTrue(len(doc_entries) == 1)
            article: Document = doc_entries[0]
            self.assertIsInstance(article, Document)
            self.assertEqual(article.text,
                             expected_title + "\n" + expected_desc)
            # Test Document Class
            doc_class = article.document_class
            self.assertTrue(len(doc_class) == 1)
            self.assertEqual(doc_class[0],
                             self.class_idx_to_name[expected_class_id])
            # Test Title
            title_entries = list(data_pack.get(Title))
            self.assertTrue(len(title_entries) == 1)
            title: Title = title_entries[0]
            self.assertEqual(title.text, expected_title)
            # Test Description
            desc_entries = list(data_pack.get(Description))
            self.assertTrue(len(desc_entries) == 1)
            description: Description = desc_entries[0]
            self.assertEqual(description.text, expected_desc)
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
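    # Run tokenization, POS tagging, lemmatization and dependency parsing for
    # the requested language.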
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code used to build the pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

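        # Read each file into a MultiPack whose source pack is named "query",
        # then let BertBasedQueryCreator attach a Query entry to it.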
        nlp = Pipeline()
        reader_config = {
            "input_pack_name": "query",
            "output_pack_name": "output"
        }
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        config = {
            "model": {
                "name": "bert-base-uncased"
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        }
        nlp.add_processor(BertBasedQueryCreator(), config=config)

        nlp.initialize()

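        # Each "query" pack should contain exactly one Query generic whose value
        # is a (1, 768) embedding.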
        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            query_pack = m_pack.get_pack("query")
            self.assertEqual(len(query_pack.generics), 1)
            self.assertIsInstance(query_pack.generics[0], Query)
            query = query_pack.generics[0].value
            self.assertEqual(query.shape, (1, 768))
Example #8
    def test_pipeline7(self, batch_size1, batch_size2, batch_size3):
        # Tests a chain of Batch->Batch->Batch->Pack with different batch sizes.

        nlp = Pipeline()
        reader = MultiPackSentenceReader()
        nlp.set_reader(reader)
        dummy1 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size1}}
        nlp.add_processor(processor=dummy1,
                          config=config,
                          selector=FirstPackSelector())
        dummy2 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size2}}
        nlp.add_processor(processor=dummy2,
                          config=config,
                          selector=FirstPackSelector())
        dummy3 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size3}}
        nlp.add_processor(processor=dummy3,
                          config=config,
                          selector=FirstPackSelector())
        dummy4 = DummyPackProcessor()
        nlp.add_processor(processor=dummy4, selector=FirstPackSelector())
        nlp.initialize()
        data_path = "data_samples/random_texts/0.txt"

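        # Each pack should end up with a single NewType entry whose value records
        # the processing order: three batch processors followed by a pack processor.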
        num_packs = 0
        for pack in nlp.process_dataset(data_path):
            types = list(pack.get_pack("pack").get_entries_by_type(NewType))
            num_packs += 1
            self.assertEqual(len(types), 1)
            self.assertEqual(types[0].value, "[BATCH][BATCH][BATCH][PACK]")

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
Example #9
    def test_pipeline4(self, batch_size):
        """Tests a chain of Pack->Batch->Pack."""

        nlp = Pipeline()
        reader = SentenceReader()
        nlp.set_reader(reader)
        dummy1 = DummyPackProcessor()
        nlp.add_processor(processor=dummy1)

        dummy2 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(processor=dummy2, config=config)

        dummy3 = DummyPackProcessor()
        nlp.add_processor(processor=dummy3)
        nlp.initialize()
        data_path = "data_samples/random_texts/0.txt"

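        # The NewType value should reflect the Pack -> Batch -> Pack order.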
        num_packs = 0
        for pack in nlp.process_dataset(data_path):
            types = list(pack.get_entries_by_type(NewType))
            num_packs += 1
            self.assertEqual(len(types), 1)
            self.assertEqual(types[0].value, "[PACK][BATCH][PACK]")

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
Example #10
class CoNLL03ReaderPipelineTest(unittest.TestCase):
    def setUp(self):
        # Define and config the Pipeline
        self.dataset_path = "data_samples/conll03"

        self.nlp = Pipeline()

        self.nlp.set_reader(CoNLL03Reader())
        self.nlp.add_processor(DummyPackProcessor())
        self.nlp.add_processor(DummyPackProcessor())

        self.nlp.initialize()

    def test_process_next(self):
        doc_exists = False
        # get processed pack from dataset
        for pack in self.nlp.process_dataset(self.dataset_path):
            # get sentence from pack
            for sentence in pack.get_entries(Sentence):
                doc_exists = True
                sent_text = sentence.text
                # second method to get entry in a sentence
                tokens = [
                    token.text for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))
        self.assertTrue(doc_exists)
    def test_reader_original_span_test(self, value):
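        # Apply replacement operations while reading, then map a span in the
        # modified text back to the corresponding span in the original text.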
        span_ops, output = (
            [
                (Span(11, 19), "New"),
                (Span(19, 20), " Shiny "),
                (Span(25, 25), " Ends"),
            ],
            "<title>The New Shiny Title Ends </title>",
        )
        input_span, expected_span, mode = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack = pipeline.process_one(self.test_dir)

        self.assertEqual(pack.text, output)

        output_span = pack.get_original_span(input_span, mode)
        self.assertEqual(
            output_span,
            expected_span,
            f"Expected: ({expected_span.begin, expected_span.end}"
            f"), Found: ({output_span.begin, output_span.end})"
            f" when Input: ({input_span.begin, input_span.end})"
            f" and Mode: {mode}",
        )
Example #12
    def setUp(self) -> None:
        file_dir_path = os.path.dirname(__file__)
        data_path = os.path.join(file_dir_path, os.pardir, os.pardir,
                                 'test_data', 'ontonotes')

        pipeline = Pipeline()
        pipeline.set_reader(OntonotesReader())
        pipeline.initialize()
        self.data_pack: DataPack = pipeline.process_one(data_path)
    def test_without_attribute_masker(self):
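        # Without the masker, the "ner" field of Tokens read from CoNLL03 stays
        # populated.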
        pl = Pipeline()
        pl.set_reader(CoNLL03Reader())
        pl.initialize()

        for pack in pl.process_dataset("data_samples/conll03/"):
            entries = pack.get_entries_by_type(Token)
            for entry in entries:
                self.assertIsNotNone(entry.ner)
    def test_reader_no_replace_test(self):
        # Read with no replacements
        pipeline = Pipeline()
        reader = PlainTextReader()
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack = pipeline.process_one(self.test_dir)
        self.assertEqual(pack.text, self.orig_text)
    def test_process_next(self):

        another_pipeline = Pipeline()
        another_pipeline.set_reader(DeserializeReader())
        another_pipeline.initialize()

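        # Serializing a pack and reading it back through DeserializeReader should
        # reproduce the same text.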
        data = ["Testing Reader", "Testing Deserializer"]

        for pack in self.nlp.process_dataset(data):
            for new_pack in another_pipeline.process_dataset([pack.serialize()]):
                self.assertEqual(pack.text, new_pack.text)
Example #16
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    ner_predictor = CoNLLNERPredictor()

    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
    def test_attribute_masker(self):
        pl = Pipeline()
        pl.set_reader(CoNLL03Reader())
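        # Mask out (set to None) the "ner" field of every Token.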
        config = {"kwargs": {Token: ["ner"]}}

        pl.add_processor(processor=AttributeMasker(), config=config)
        pl.initialize()

        for pack in pl.process_dataset("data_samples/conll03/"):
            entries = pack.get_entries_by_type(Token)
            for entry in entries:
                self.assertIsNone(entry.ner)
    def test_reader_replace_error_test(self, value):
        # Read with errors in span replacements
        span_ops, output = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        with self.assertRaises(ValueError):
            pipeline.process(self.test_dir)
    def test_encoder_sentence(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(NLTKSentenceSegmenter())
        pipeline.add_processor(PretrainedEncoder())
        pipeline.initialize()

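        # The pre-trained encoder should attach a (1, 512, 768) embedding to every
        # sentence.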
        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.embedding.shape, (1, 512, 768))
    def test_reader_replace_back_test(self, value):
        # Reading with replacements - replacing a span and changing it back
        span_ops, output = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack: DataPack = pipeline.process_one(self.test_dir)
        self.assertEqual(pack.text, output)

        orig_text_from_pack = pack.get_original_text()
        self.assertEqual(self.orig_text, orig_text_from_pack)
    def test_encoder_document(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(
            PretrainedEncoder(),
            config={'entry_type': 'ft.onto.base_ontology.Document'})
        pipeline.initialize()

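        # With entry_type set to Document, the embedding is attached at the
        # document level instead.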
        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, doc in enumerate(pack.get(Document)):
            self.assertEqual(doc.embedding.shape, (1, 512, 768))
Example #22
    def test_parse_pack(self, text, annotation_length):

        file_path = os.path.join(self.test_dir, 'test.txt')
        with open(file_path, 'w') as f:
            f.write(text)

        pl = Pipeline()
        pl.set_reader(MultiPackSentenceReader())
        pl.initialize()

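        # The reader should produce a MultiPack containing an "input_src" and an
        # "output_tgt" pack.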
        multipack: MultiPack = pl.process_one(self.test_dir)
        input_pack = multipack.get_pack('input_src')
        self.assertEqual(len(multipack.packs), 2)
        self.assertEqual(multipack._pack_names, ['input_src', 'output_tgt'])
        self.assertEqual(len(input_pack.annotations), annotation_length)
        self.assertEqual(input_pack.text, text + "\n")
    def test_one_batch_processor(self, batch_size):
        nlp = Pipeline()
        nlp.set_reader(StringReader())
        dummy = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(NLTKSentenceSegmenter())
        nlp.add_processor(dummy, config=config)
        nlp.initialize()
        sentences = ["This tool is called Forte. The goal of this project to "
                     "help you build NLP pipelines. NLP has never been made "
                     "this easy before."]
        pack = nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(
            dummy.counter, (sent_len // batch_size +
                            (sent_len % batch_size > 0)))
Example #24
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = HParams({"input_pack_name": "input",
                                 "output_pack_name": "output"},
                                MultiPackSentenceReader.default_hparams())
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        nlp.initialize()

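        # The reader should copy each text (plus a trailing newline) into the
        # "input" pack.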
        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            self.assertEqual(m_pack._pack_names, ["input", "output"])
            self.assertEqual(m_pack.get_pack("input").text, texts[idx] + "\n")
Example #25
    def test_reader_no_replace_test(self):
        # Read with no replacements
        pipeline = Pipeline()
        reader = SquadReader()
        pipeline.set_reader(reader)
        pipeline.initialize()

        data_packs: Iterable[DataPack] = pipeline.process_dataset(
            self.dataset_path)
        file_path: str = self.dataset_path
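        # Build the expected {title + paragraph_index: paragraph} mapping from the
        # raw SQuAD-style JSON.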
        expected_file_dict = {}
        with open(file_path, "r", encoding="utf8", errors="ignore") as file:
            expected_json = json.load(file)
            for dic in expected_json["data"]:
                title = dic["title"]
                cnt = 0
                for qa_dic in dic["paragraphs"]:
                    expected_file_dict[title +
                                       str(cnt)] = qa_dic  # qas, context
                    cnt += 1

        count_packs = 0
        for pack in data_packs:
            count_packs += 1
            expected_text: str = ""
            expected = expected_file_dict[pack.pack_name]

            passage = list(pack.get(Passage))
            self.assertEqual(len(passage), 1)
            expected_context = expected["context"]
            self.assertEqual(passage[0].text, expected_context)
            expected_text += expected_context

            for qid, question in enumerate(pack.get(MRCQuestion)):
                expected_qa = expected["qas"][qid]
                expected_question = expected_qa["question"]
                expected_answers = expected_qa["answers"]
                self.assertEqual(question.text, expected_question)
                if not isinstance(expected_answers, list):
                    expected_answers = [expected_answers]
                answers = question.answers

                for answer, expected_answer in zip(answers, expected_answers):
                    self.assertEqual(answer.text, expected_answer["text"])
                expected_text += "\n" + expected_question

            self.assertEqual(pack.text, expected_text)
Example #26
    def _create_pipeline(config):
        nlp = Pipeline()
        nlp.set_reader(StringReader())

        # Using SpacyProcessor to segment the sentences
        nlp.add_processor(
            processor=SpacyProcessor(),
            config={
                'processors': '',
                # spaCy language model used to build the pipeline
                'lang': "en_core_web_sm",
                'use_gpu': False
            })

        nlp.add_processor(processor=AllenNLPProcessor(), config=config)
        nlp.initialize()
        return nlp
Example #27
class TestLowerCaserProcessor(unittest.TestCase):
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add_processor(LowerCaserProcessor())
        self.nlp.initialize()

    def test_lowercaser_processor(self):
        document = "This tool is called Forte. The goal of this project to " \
                   "help you build NLP pipelines. NLP has never been made " \
                   "this easy before."
        pack = self.nlp.process(document)
        print(pack)
        print(pack.text)
        assert pack.text == "this tool is called forte. the goal of this " \
                            "project to help you build nlp pipelines. nlp " \
                            "has never been made this easy before."
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

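    # The NER model is loaded from "resources.pkl" under the given model
    # directory; the SRL model path is passed through directly.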
    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
Example #29
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = "data_samples/ontonotes/00/"

    def test_processor(self):
        pack = self.nlp.process(self.data_path)
        relations = list(pack.get_entries(RelationLink))
        assert (len(relations) > 0)
        for relation in relations:
            self.assertEqual(relation.get_field("rel_type"), "dummy_relation")
def main():
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

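    # NER and SRL configurations come from the accompanying config module.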
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second).")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))