Example #1
class TestNLTKWordTokenizer(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())

    def test_tokenizer(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        tokens = [
            ["This", "tool", "is", "called", "Forte", "."],
            ["The", "goal", "of", "this", "project", "to", "help", "you",
             "build", "NLP", "pipelines", "."],
            ["NLP", "has", "never", "been", "made", "this", "easy",
             "before", "."]
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.text, tokens[i][j])
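# Note: the snippets in this listing are shown without their import blocks.
# A sketch of the imports the NLTK-based examples appear to rely on, based
# on forte's package layout at the time (treat the exact paths as
# assumptions):
import os
import unittest

from forte.pipeline import Pipeline
from forte.data.readers import StringReader
from forte.processors.nltk_processors import (
    NLTKPOSTagger, NLTKSentenceSegmenter, NLTKWordTokenizer)
from ft.onto.base_ontology import Document, Sentence, Token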
Example #2
class TestStanfordNLPProcessor(unittest.TestCase):
    def setUp(self):
        self.stanford_nlp = Pipeline()
        self.stanford_nlp.set_reader(StringReader())
        models_path = os.getcwd()
        config = HParams(
            {
                "processors": "tokenize",
                "lang": "en",
                # Language code for the language to build the Pipeline
                "use_gpu": False
            },
            StandfordNLPProcessor.default_hparams())
        self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                        config=config)
        self.stanford_nlp.initialize()

    # TODO
    @unittest.skip("We need to test this without needing to download models "
                   "every time")
    def test_stanford_processor(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.stanford_nlp.process(document)
        print(pack)
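# The config pattern above, user values layered over default_hparams(),
# uses texar's HParams. A minimal sketch of the overlay behavior, assuming
# texar-pytorch's import path:
from texar.torch import HParams

defaults = {"processors": "tokenize", "lang": "en", "use_gpu": False}
hparams = HParams({"lang": "de"}, defaults)
assert hparams.lang == "de" and hparams.use_gpu is False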
Example #3
class PipelineTest(unittest.TestCase):
    def setUp(self) -> None:
        # Define and config the Pipeline
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = HParams({"batcher": {
            "batch_size": 5
        }}, dummy.default_hparams())
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.dataset_path = \
            "forte/tests/data_samples/ontonotes_sample_dataset/00"

    def test_process_next(self):
        # get processed pack from dataset
        for pack in self.nlp.process_dataset(self.dataset_path):
            # get sentence from pack
            for sentence in pack.get_entries(Sentence):
                sent_text = sentence.text

                # first method to get entry in a sentence
                for link in pack.get_entries(RelationLink, sentence):
                    parent = link.get_parent()
                    child = link.get_child()
                    print(f"{parent.text} is {link.rel_type} {child.text}")
                    pass  # some operation on link

                # second method to get entry in a sentence
                tokens = [
                    token.text for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))
Example #4
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = HParams(
            {
                "input_pack_name": "input",
                "output_pack_name": "output"
            }, MultiPackSentenceReader.default_hparams())
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        translator_config = HParams(
            {
                "src_language": "de",
                "target_language": "en",
                "in_pack_name": "input",
                "out_pack_name": "result"
            }, None)

        nlp.add_processor(MicrosoftBingTranslator(), config=translator_config)
        nlp.initialize()

        english_results = ["Hey good morning", "This is Forte. A tool for NLP"]
        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            self.assertEqual(set(m_pack._pack_names),
                             set(["input", "output", "result"]))
            self.assertEqual(
                m_pack.get_pack("result").text, english_results[idx] + "\n")
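# Where the three pack names in the assertion come from (inferred from the
# configs above, not verified against forte internals): the reader creates
# the "input"/"output" packs and the translator adds "result".
reader_packs = {"input", "output"}
translator_packs = {"result"}
assert reader_packs | translator_packs == {"input", "output", "result"}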
Example #5
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = {
            "input_pack_name": "query",
            "output_pack_name": "output"
        }
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        config = {
            "model": {
                "name": "bert-base-uncased"
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        }
        nlp.add_processor(BertBasedQueryCreator(), config=config)

        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            query_pack = m_pack.get_pack("query")
            self.assertEqual(len(query_pack.generics), 1)
            self.assertIsInstance(query_pack.generics[0], Query)
            query = query_pack.generics[0].value
            self.assertEqual(query.shape, (1, 768))
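# The (1, 768) query shape matches bert-base-uncased's hidden size. A quick
# way to confirm the 768 figure, assuming the transformers package is
# available (the processor itself may use a different backend):
from transformers import BertConfig

assert BertConfig.from_pretrained("bert-base-uncased").hidden_size == 768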
Example #6
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code for the language to build the Pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
Example #7
class CoNLL03ReaderPipelineTest(unittest.TestCase):
    def setUp(self):
        # Define and config the Pipeline
        self.dataset_path = "data_samples/conll03"

        self.nlp = Pipeline()

        self.nlp.set_reader(CoNLL03Reader())
        self.nlp.add_processor(DummyPackProcessor())
        self.nlp.add_processor(DummyPackProcessor())

        self.nlp.initialize()

    def test_process_next(self):
        doc_exists = False
        # get processed pack from dataset
        for pack in self.nlp.process_dataset(self.dataset_path):
            # get sentence from pack
            for sentence in pack.get_entries(Sentence):
                doc_exists = True
                sent_text = sentence.text
                # second method to get entry in a sentence
                tokens = [
                    token.text for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))
        self.assertTrue(doc_exists)
Example #8
    def prepare(self, *args, **kwargs):  # pylint: disable=unused-argument
        prepare_pl = Pipeline()
        prepare_pl.set_reader(self.train_reader)
        for p in self.preprocessors:
            prepare_pl.add_processor(p)

        prepare_pl.run(self.configs.config_data.train_path)

        for p in self.preprocessors:
            p.finish(resource=self.resource)
Example #9
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    ner_predictor = CoNLLNERPredictor()

    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
Example #10
    def test_attribute_masker(self):
        pl = Pipeline()
        pl.set_reader(CoNLL03Reader())
        config = {"kwargs": {Token: ["ner"]}}

        pl.add_processor(processor=AttributeMasker(), config=config)
        pl.initialize()

        for pack in pl.process_dataset("data_samples/conll03/"):
            entries = pack.get_entries_by_type(Token)
            for entry in entries:
                self.assertIsNone(entry.ner)
Example #11
    def test_pipeline7(self, batch_size1, batch_size2, batch_size3):
        # Tests a chain of Batch->Batch->Batch->Pack with different batch sizes.

        nlp = Pipeline()
        reader = MultiPackSentenceReader()
        nlp.set_reader(reader)
        dummy1 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size1}}
        nlp.add_processor(processor=dummy1,
                          config=config,
                          selector=FirstPackSelector())
        dummy2 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size2}}
        nlp.add_processor(processor=dummy2,
                          config=config,
                          selector=FirstPackSelector())
        dummy3 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size3}}
        nlp.add_processor(processor=dummy3,
                          config=config,
                          selector=FirstPackSelector())
        dummy4 = DummyPackProcessor()
        nlp.add_processor(processor=dummy4, selector=FirstPackSelector())
        nlp.initialize()
        data_path = "data_samples/random_texts/0.txt"

        num_packs = 0
        for pack in nlp.process_dataset(data_path):
            types = list(pack.get_pack("pack").get_entries_by_type(NewType))
            num_packs += 1
            self.assertEqual(len(types), 1)
            self.assertEqual(types[0].value, "[BATCH][BATCH][BATCH][PACK]")

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
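# Why the expected value reads "[BATCH][BATCH][BATCH][PACK]": each dummy
# processor appends its own tag to the NewType entry, so the final string
# records the processor order (illustrative logic, not forte's source):
tags = ["[BATCH]", "[BATCH]", "[BATCH]", "[PACK]"]
assert "".join(tags) == "[BATCH][BATCH][BATCH][PACK]"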
Example #12
    def test_encoder_sentence(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(NLTKSentenceSegmenter())
        pipeline.add_processor(PretrainedEncoder())
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.embedding.shape, (1, 512, 768))
Example #13
    def test_encoder_document(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(
            PretrainedEncoder(),
            config={'entry_type': 'ft.onto.base_ontology.Document'})
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, doc in enumerate(pack.get(Document)):
            self.assertEqual(doc.embedding.shape, (1, 512, 768))
Example #14
class TestNLTKSentenceSegmenter(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())

    def test_segmenter(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for idx, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.text, sentences[idx])
Example #15
    def test_one_batch_processor(self, batch_size):
        nlp = Pipeline()
        nlp.set_reader(StringReader())
        dummy = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(NLTKSentenceSegmenter())
        nlp.add_processor(dummy, config=config)
        nlp.initialize()
        sentences = ["This tool is called Forte. The goal of this project to "
                     "help you build NLP pipelines. NLP has never been made "
                     "this easy before."]
        pack = nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(
            dummy.counter,
            sent_len // batch_size + (sent_len % batch_size > 0))
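# The expected counter is ceiling division in disguise; a quick check of
# the arithmetic used in the assertion above:
import math

for n in range(1, 10):
    for b in range(1, 5):
        assert n // b + (n % b > 0) == math.ceil(n / b)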
Example #16
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = "data_samples/ontonotes/00/"

    def test_processor(self):
        pack = self.nlp.process(self.data_path)
        relations = list(pack.get_entries(RelationLink))
        self.assertGreater(len(relations), 0)
        for relation in relations:
            self.assertEqual(relation.get_field("rel_type"), "dummy_relation")
Example #17
class TestLowerCaserProcessor(unittest.TestCase):
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add_processor(LowerCaserProcessor())
        self.nlp.initialize()

    def test_lowercaser_processor(self):
        document = "This tool is called Forte. The goal of this project to " \
                   "help you build NLP pipelines. NLP has never been made " \
                   "this easy before."
        pack = self.nlp.process(document)
        print(pack)
        print(pack.text)
        self.assertEqual(
            pack.text,
            "this tool is called forte. the goal of this project to help "
            "you build nlp pipelines. nlp has never been made this easy "
            "before.")
Example #18
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())

    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
Example #19
    def _create_pipeline(config):
        nlp = Pipeline()
        nlp.set_reader(StringReader())

        # Using SpacyProcessor to segment the sentences
        nlp.add_processor(
            processor=SpacyProcessor(),
            config={
                'processors': '',
                # Language code to build the Pipeline
                'lang': "en_core_web_sm",
                'use_gpu': False
            })

        nlp.add_processor(processor=AllenNLPProcessor(), config=config)
        nlp.initialize()
        return nlp
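# Hypothetical usage of the helper above (shown as a plain call for
# brevity; the config keys for AllenNLPProcessor are an assumption):
nlp = _create_pipeline({'processors': 'tokenize,pos,depparse'})
pack = nlp.process("This tool is called Forte.")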
Example #20
def main():
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second).")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        # first method to get entry in a sentence
        tokens = [(token.text, token.pos)
                  for token in pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type)
                    for entity in pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # second method to get entry in a sentence
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
Example #21
    def test_pipeline1(self):
        """Tests a pack processor only."""

        nlp = Pipeline()
        reader = MultiPackSentenceReader()
        nlp.set_reader(reader)
        dummy = DummyPackProcessor()
        nlp.add_processor(dummy, selector=FirstPackSelector())
        nlp.initialize()
        data_path = "data_samples/random_texts/0.txt"
        num_packs = 0
        for pack in nlp.process_dataset(data_path):
            types = list(pack.get_pack("pack").get_entries_by_type(NewType))
            num_packs += 1
            self.assertEqual(len(types), 1)
            self.assertEqual(types[0].value, "[PACK]")

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
Example #22
class DummyProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.reader = OntonotesReader()

        self.data_path = "examples/data_samples/ontonotes/00/"

        self.nlp.set_reader(OntonotesReader())
        self.nlp.add_processor(DummyRelationExtractor())
        self.nlp.initialize()

    def test_processor(self):
        pack = self.nlp.process(self.data_path)

        relations = list(pack.get_entries(RelationLink))

        self.assertGreater(len(relations), 0)

        for relation in relations:
            self.assertEqual(relation.get_field("rel_type"), "dummy_relation")
Example #23
    def test_pipeline2(self):
        """Tests a batch processor only."""

        nlp = Pipeline()
        reader = SentenceReader()
        nlp.set_reader(reader)
        dummy = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": 4}}
        nlp.add_processor(processor=dummy, config=config)
        nlp.initialize()
        data_path = "data_samples/random_texts/0.txt"
        num_packs = 0
        for pack in nlp.process_dataset(data_path):
            types = list(pack.get_entries_by_type(NewType))
            num_packs += 1
            self.assertEqual(len(types), 1)
            self.assertEqual(types[0].value, "[BATCH]")

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
Example #24
def main(dataset_dir: str):
    with open("config.yml", "r") as config_file:
        config = yaml.safe_load(config_file)
    config = HParams(config, default_hparams=None)

    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            # first method to get entry in a sentence
            tokens = [(token.text, token.pos)
                      for token in pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type)
                        for entity in pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # second method to get entry in a sentence
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:", entities, "\n")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))
Example #25
    def test_neg_spacy_processor(self):
        spacy = Pipeline()
        spacy.set_reader(StringReader())

        config = {
            "processors": 'ner',
            "lang": "xx_ent_wiki_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        spacy.add_processor(SpacyProcessor(), config=config)
        spacy.initialize()

        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        with self.assertRaises(ValueError):
            _ = spacy.process(document)
Example #26
    def test_encoder_phrase(self):
        pipeline = Pipeline()
        pipeline.set_reader(StringReader())
        pipeline.add_processor(NLTKSentenceSegmenter())
        pipeline.add_processor(NLTKWordTokenizer())
        pipeline.add_processor(NLTKPOSTagger())
        config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
        pipeline.add_processor(NLTKChunker(), config=config)
        pipeline.add_processor(
            PretrainedEncoder(),
            config={'entry_type': 'ft.onto.base_ontology.Phrase'})
        pipeline.initialize()

        sentences = ["This tool is called Forte.",
                     "The goal of this project to help you build NLP "
                     "pipelines.",
                     "NLP has never been made this easy before."]
        document = ' '.join(sentences)
        pack = pipeline.process(document)
        for i, phrase in enumerate(pack.get(Phrase)):
            self.assertEqual(phrase.embedding.shape, (1, 512, 768))
Example #27
class TestNLTKPOSTagger(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())
        self.nltk.add_processor(NLTKPOSTagger())

    def test_pos_tagger(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        pos = [
            ["DT", "NN", "VBZ", "VBN", "NNP", "."],
            ["DT", "NN", "IN", "DT", "NN", "TO", "VB", "PRP", "VB", "NNP",
             "NNS", "."],
            ["NNP", "VBZ", "RB", "VBN", "VBN", "DT", "JJ", "RB", "."]
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.pos, pos[i][j])
Example #28
    def test_pipeline3(self, batch_size):
        """Tests a chain of Batch->Pack->Batch with different batch sizes."""

        nlp = Pipeline()
        reader = SentenceReader()
        nlp.set_reader(reader)
        dummy1 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(processor=dummy1, config=config)
        dummy2 = DummyPackProcessor()
        nlp.add_processor(processor=dummy2)
        dummy3 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": 2 * batch_size}}
        nlp.add_processor(processor=dummy3, config=config)
        nlp.initialize()
        data_path = "data_samples/random_texts/0.txt"

        num_packs = 0
        for pack in nlp.process_dataset(data_path):
            types = list(pack.get_entries_by_type(NewType))
            num_packs += 1
            self.assertEqual(len(types), 1)
            self.assertEqual(types[0].value, "[BATCH][PACK][BATCH]")

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
Example #29
    def test_process_next(self):

        # Define and config the Pipeline
        nlp = Pipeline()
        nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        nlp.add_processor(dummy, config=config)
        nlp.initialize()

        dataset_path = "data_samples/ontonotes/00"

        # get processed pack from dataset
        for pack in nlp.process_dataset(dataset_path):
            # get sentence from pack
            for sentence in pack.get_entries(Sentence):
                sent_text = sentence.text

                # second method to get entry in a sentence
                tokens = [
                    token.text for token in pack.get_entries(Token, sentence)
                ]
                self.assertEqual(sent_text, " ".join(tokens))
Example #30
class DummyFixedSizeBatchProcessorTest(unittest.TestCase):
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.dummy = DummmyFixedSizeBatchProcessor()

    @data(1, 2, 3)
    def test_processor(self, batch_size):
        config = HParams({"batcher": {
            "batch_size": batch_size
        }}, self.dummy.default_hparams())
        self.nlp.add_processor(NLTKSentenceSegmenter())
        self.nlp.add_processor(self.dummy, config=config)
        self.nlp.initialize()
        sentences = [
            "This tool is called Forte. The goal of this project to "
            "help you build NLP pipelines. NLP has never been made "
            "this easy before."
        ]
        pack = self.nlp.process(sentences)
        sent_len = len(list(pack.get(Sentence)))
        self.assertEqual(self.dummy.counter, (sent_len // batch_size +
                                              (sent_len % batch_size > 0)))
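# The @data decorator above presumably comes from ddt, which also needs a
# class-level decorator; the other parameterized tests in this listing
# (test_pipeline, test_pipeline3, test_pipeline7, test_one_batch_processor)
# would follow the same pattern (an assumption based on ddt's usual usage):
import unittest

from ddt import data, ddt


@ddt
class ParameterizedExample(unittest.TestCase):
    @data(1, 2, 3)
    def test_batch_size(self, batch_size):
        self.assertGreater(batch_size, 0)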