class MSMarcoPassageReaderTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()

        self.pipeline.set_reader(MSMarcoPassageReader())
        self.pipeline.initialize()

        self.data_dir = 'data_samples/ms_marco_passage_retrieval'

        corpus_file = os.path.join(self.data_dir, 'collection.tsv')
        self.expected_content = {}
        with open(corpus_file, 'r') as f:
            for line in f.readlines():
                key, value = tuple(line.split('\t', 1))
                self.expected_content[key] = value

    def test_ms_marco_passage_reader(self):
        actual_content: Dict[str, str] = {}
        for data_pack in self.pipeline.process_dataset(self.data_dir):
            self.assertIsInstance(data_pack, DataPack)
            doc_entries = list(data_pack.get(Document))
            self.assertTrue(len(doc_entries) == 1)
            doc_entry: Document = doc_entries[0]
            self.assertIsInstance(doc_entry, Document)
            actual_content[data_pack.pack_name] = doc_entry.text

        self.assertDictEqual(actual_content, self.expected_content)
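The examples on this page omit their import headers. A minimal sketch of the imports this first test appears to rely on; the module paths follow the usual Forte layout and are assumptions, not copied from the original file:
import os
import unittest
from typing import Dict

from forte.data.data_pack import DataPack          # assumed location of DataPack
from forte.data.readers import MSMarcoPassageReader  # assumed re-export path
from forte.pipeline import Pipeline
from ft.onto.base_ontology import Document          # assumed ontology module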
Example #2
    def setUp(self):
        self.pipeline = Pipeline()

        self.pipeline.set_reader(AGNewsReader())
        self.pipeline.initialize()

        self.sample_file: str = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         *([os.path.pardir] * 4),
                         "data_samples/ag_news/sample.csv"))

        self.expected_content: Dict[int, Tuple[int, str, str]] = {}
        with open(self.sample_file, "r") as file:
            for line_id, line in enumerate(file):
                data = line.strip().split(",")
                class_id, title, description = (
                    int(data[0].replace('"', "")),
                    data[1],
                    data[2],
                )
                self.expected_content[line_id] = (class_id, title, description)

        self.class_idx_to_name = {
            1: "World",
            2: "Sports",
            3: "Business",
            4: "Sci/Tech",
        }
Example #3
class AGNewsReaderTest(unittest.TestCase):
    def setUp(self):
        self.pipeline = Pipeline()

        self.pipeline.set_reader(AGNewsReader())
        self.pipeline.initialize()

        self.sample_file: str = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         *([os.path.pardir] * 4),
                         "data_samples/ag_news/sample.csv"))

        self.expected_content: Dict[int, Tuple[int, str, str]] = {}
        with open(self.sample_file, "r") as file:
            for line_id, line in enumerate(file):
                data = line.strip().split(",")
                class_id, title, description = (
                    int(data[0].replace('"', "")),
                    data[1],
                    data[2],
                )
                self.expected_content[line_id] = (class_id, title, description)

        self.class_idx_to_name = {
            1: "World",
            2: "Sports",
            3: "Business",
            4: "Sci/Tech",
        }

    def test_ag_news_reader(self):
        for data_pack in self.pipeline.process_dataset(self.sample_file):
            (
                expected_class_id,
                expected_title,
                expected_desc,
            ) = self.expected_content[data_pack.pack_name]
            self.assertIsInstance(data_pack, DataPack)
            # Test Article
            doc_entries = list(data_pack.get(Document))
            self.assertTrue(len(doc_entries) == 1)
            article: Document = doc_entries[0]
            self.assertIsInstance(article, Document)
            self.assertEqual(article.text,
                             expected_title + "\n" + expected_desc)
            # Test Document Class
            doc_class = article.document_class
            self.assertTrue(len(doc_class) == 1)
            self.assertEqual(doc_class[0],
                             self.class_idx_to_name[expected_class_id])
            # Test Title
            title_entries = list(data_pack.get(Title))
            self.assertTrue(len(title_entries) == 1)
            title: Title = title_entries[0]
            self.assertEqual(title.text, expected_title)
            # Test Description
            desc_entries = list(data_pack.get(Description))
            self.assertTrue(len(desc_entries) == 1)
            description: Description = desc_entries[0]
            self.assertEqual(description.text, expected_desc)
Example #4
    def setUp(self):
        # Define and config the Pipeline
        self.fp = tempfile.NamedTemporaryFile(mode='w',
                                              suffix='.jsonl',
                                              delete=False)
        self.nlp = Pipeline()
        self.nlp.set_reader(ProdigyReader())
        self.create_sample_file()
Example #5
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = "data_samples/ontonotes/00/"
Example #6
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add(NLTKSentenceSegmenter())
        boxer_config = {"pack_name": "question"}
        self.nlp.add(MultiPackBoxer(), boxer_config)
        self.nlp.add(MutliDocPackAdder())
        self.nlp.add(QuestionAnsweringMulti())
        self.nlp.initialize()
Example #7
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.reader = OntonotesReader()

        self.data_path = "examples/data_samples/ontonotes/00/"

        self.nlp.set_reader(OntonotesReader())
        self.nlp.add_processor(DummyRelationExtractor())
        self.nlp.initialize()
Example #8
def write_results(pl: Pipeline, output_path: str, input_data: str):
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    pl.run(input_data)
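For context, a sketch of how write_results could be driven; the PlainTextReader choice and the directory names are illustrative assumptions, not part of the original example:
# Sketch only: any reader that produces DataPacks works here.
pl = Pipeline()
pl.set_reader(PlainTextReader())  # hypothetical reader choice
write_results(pl, output_path="output_packs", input_data="input_docs")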
Example #9
    def setUp(self):
        # Define and config the Pipeline
        self.dataset_path = "examples/data_samples/ontonotes/00"

        self.nlp = Pipeline()

        self.nlp.set_reader(OntonotesReader())
        self.nlp.add_processor(DummyPackProcessor())

        self.nlp.initialize()
Example #10
class TestNLTKPOSTagger(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())
        self.nltk.add_processor(NLTKWordTokenizer())
        self.nltk.add_processor(NLTKPOSTagger())

    def test_pos_tagger(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        pos = [["DT", "NN", "VBZ", "VBN", "NNP", "."],
               [
                   "DT", "NN", "IN", "DT", "NN", "TO", "VB", "PRP", "VB",
                   "NNP", "NNS", "."
               ], ["NNP", "VBZ", "RB", "VBN", "VBN", "DT", "JJ", "RB", "."]]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for i, sentence in enumerate(pack.get(Sentence)):
            for j, token in enumerate(
                    pack.get(entry_type=Token, range_annotation=sentence)):
                self.assertEqual(token.pos, pos[i][j])
Example #11
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code for the language to build the Pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
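A possible driver for the function above; the language code and the sample text are placeholders, not taken from the original script:
if __name__ == '__main__':
    # 'en' and the sample sentence below are illustrative inputs.
    stanford_nlp_example('en', 'Forte helps you build NLP pipelines. '
                               'This sentence is parsed as well.')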
Example #12
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = {
            "input_pack_name": "query",
            "output_pack_name": "output"
        }
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        config = {
            "model": {
                "name": "bert-base-uncased"
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        }
        nlp.add_processor(BertBasedQueryCreator(), config=config)

        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            query_pack = m_pack.get_pack("query")
            self.assertEqual(len(query_pack.generics), 1)
            self.assertIsInstance(query_pack.generics[0], Query)
            query = query_pack.generics[0].value
            self.assertEqual(query.shape, (1, 768))
Example #13
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = HParams(
            {
                "input_pack_name": "input",
                "output_pack_name": "output"
            }, MultiPackSentenceReader.default_hparams())
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        translator_config = HParams(
            {
                "src_language": "de",
                "target_language": "en",
                "in_pack_name": "input",
                "out_pack_name": "result"
            }, None)

        nlp.add_processor(MicrosoftBingTranslator(), config=translator_config)
        nlp.initialize()

        english_results = ["Hey good morning", "This is Forte. A tool for NLP"]
        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            self.assertEqual(set(m_pack._pack_names),
                             set(["input", "output", "result"]))
            self.assertEqual(
                m_pack.get_pack("result").text, english_results[idx] + "\n")
Example #14
    def setUp(self):
        # Define and config the Pipeline
        self.dataset_path = "data_samples/conll03"

        self.nlp = Pipeline()

        self.nlp.set_reader(CoNLL03Reader())
        self.nlp.add_processor(DummyPackProcessor())
        self.nlp.add_processor(DummyPackProcessor())

        self.nlp.initialize()
Example #15
    def setUp(self) -> None:
        # Define and config the Pipeline
        self.dataset_path = "examples/ontonotes_sample_dataset/00"

        self.nlp = Pipeline()

        self.nlp.set_reader(OntonotesReader())
        self.processor = DummyRelationExtractor()
        self.nlp.add_processor(self.processor)

        self.nlp.initialize()
Example #16
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = HParams({"batcher": {
            "batch_size": 5
        }}, dummy.default_hparams())
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = \
            "forte/processors/base/tests/data_samples/ontonotes/00/"
Example #17
    def setUp(self):
        self.spacy = Pipeline()
        self.spacy.set_reader(StringReader())

        config = {
            "processors": "tokenize",
            "lang": "en_core_web_sm",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        self.spacy.add_processor(SpacyProcessor(), config=config)
        self.spacy.initialize()
Example #18
    def setUp(self):
        self._cache_directory = Path(os.path.join(os.getcwd(), "cache_html"))
        self.reader = HTMLReader(cache_directory=self._cache_directory,
                                 append_to_cache=True)

        self.pl1 = Pipeline()
        self.pl1.set_reader(self.reader)
        self.pl1.initialize()

        self.pl2 = Pipeline()
        self.pl2.set_reader(HTMLReader(from_cache=True,
                                       cache_directory=self._cache_directory))
        self.pl2.initialize()
Example #19
    def setUp(self):
        self.stanford_nlp = Pipeline()
        self.stanford_nlp.set_reader(StringReader())
        models_path = os.getcwd()
        config = {
            "processors": "tokenize",
            "lang": "en",
            # Language code for the language to build the Pipeline
            "use_gpu": False
        }
        self.stanford_nlp.add_processor(StandfordNLPProcessor(models_path),
                                        config=config)
        self.stanford_nlp.initialize()
Example #21
    def test_attribute_masker(self):
        pl = Pipeline()
        pl.set_reader(CoNLL03Reader())
        config = {"kwargs": {Token: ["ner"]}}

        pl.add_processor(processor=AttributeMasker(), config=config)
        pl.initialize()

        for pack in pl.process_dataset("data_samples/conll03/"):
            entries = pack.get_entries_by_type(Token)
            for entry in entries:
                self.assertIsNone(entry.ner)
Example #22
    def setUp(self):
        p: Pipeline = Pipeline()
        p.set_reader(EmptyReader())
        p.add(EntryAnnotator())
        p.initialize()

        self.pack: DataPack = p.process(['doc1', 'doc2'])
Example #23
    def setUp(self):
        # Define and config the Pipeline
        self.dataset_path = "examples/"

        self.pl1 = Pipeline()
        self._cache_directory = Path(os.path.join(os.getcwd(), "cache_data"))
        self.pl1.set_reader(StringReader())

        self.pl2 = Pipeline()
        self.pl2.set_reader(StringReader())

        self.text = (
            "The plain green Norway spruce is displayed in the gallery's "
            "foyer. Wentworth worked as an assistant to sculptor Henry Moore "
            "in the late 1960s. His reputation as a sculptor grew in the "
            "1980s.")
Example #24
    def test_caster_all_selector(self):
        """
        Test whether the caster and the all-pack selector work well together.
        The caster converts a single pack into a multi pack, and the pack
        copier then creates a new pack inside it. The all-pack selector
        selects every pack from the multi pack. This test makes sure the
        pipeline works as expected.
        """
        mp: MultiPack
        for mp in (
            Pipeline()
            .set_reader(SentenceReader())
            .add(MultiPackBoxer())
            .add(MultiPackCopier())
            .add(DummyPackProcessor(), selector=AllPackSelector())
            .initialize()
            .process_dataset(
                os.path.join(data_samples_root, "random_texts", "0.txt")
            )
        ):
            num_pack = 0
            for pack in mp.packs:
                num_pack += 1
                entries = list(pack.get(NewType))
                self.assertEqual(len(entries), 1)
                self.assertEqual(entries[0].value, "[PACK]")
            self.assertEqual(num_pack, 2)
Example #25
    def setUp(self):
        root_path = os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                os.pardir,
                os.pardir,
                os.pardir,
            ))

        file_path: str = os.path.join(root_path,
                                      "data_samples/data_pack_dataset_test")
        reader = CoNLL03Reader()
        context_type = Sentence
        request = {Sentence: []}
        skip_k = 0

        self.input_files = ["conll03_1.conll", "conll03_2.conll"]
        self.feature_schemes = {}

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = train_pl.process_dataset(file_path)

        self.data_source: DataPackIterator = DataPackIterator(
            pack_iterator, context_type, request, skip_k)
Example #26
    def setUp(self):
        # create indexer
        file_dir_path = os.path.dirname(__file__)
        data_dir = 'data_samples/ms_marco_passage_retrieval'
        self.abs_data_dir = os.path.abspath(
            os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
        self.index_name = "final"
        indexer_config = {
            "batch_size": 5,
            "fields": ["doc_id", "content", "pack_info"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": {
                    "index_name": self.index_name,
                    "hosts": "localhost:9200",
                    "algorithm": "bm25"
                },
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": True
                }
            }
        }
        self.indexer = ElasticSearchIndexer(
            config={"index_name": self.index_name})
        nlp: Pipeline[DataPack] = Pipeline()
        nlp.set_reader(MSMarcoPassageReader())
        nlp.add(DataSelectorIndexProcessor(), config=indexer_config)
        nlp.initialize()

        self.size = 0
        for _ in nlp.process_dataset(self.abs_data_dir):
            self.size += 1

        self.test_dir = tempfile.mkdtemp()
Example #27
class TestNLTKSentenceSegmenter(unittest.TestCase):
    def setUp(self):
        self.nltk = Pipeline()
        self.nltk.set_reader(StringReader())
        self.nltk.add_processor(NLTKSentenceSegmenter())

    def test_segmenter(self):
        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP "
            "pipelines.", "NLP has never been made this easy before."
        ]
        document = ' '.join(sentences)
        pack = self.nltk.process(document)
        for idx, sentence in enumerate(pack.get(Sentence)):
            self.assertEqual(sentence.text, sentences[idx])
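These unittest-based snippets assume the standard test entry point, e.g.:
if __name__ == "__main__":
    unittest.main()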
Example #28
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file",
                        default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    # loading config
    with open(args.config_file, "r") as config_file:
        config = yaml.safe_load(config_file)

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"],
    )

    nlp.initialize()

    for _, m_pack in enumerate(nlp.process_dataset()):
        aug_pack = m_pack.get_pack("augmented_input")
        logging.info(aug_pack.text)
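A likely entry point for this script; the logging setup is an assumption added so the logged augmented text is visible:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)  # assumed; main() logs the augmented text
    main()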
Example #29
    def _create_pipeline(config):
        nlp = Pipeline()
        nlp.set_reader(StringReader())

        # Using SpacyProcessor to segment the sentences
        nlp.add_processor(
            processor=SpacyProcessor(),
            config={
                'processors': '',
                'lang': "en_core_web_sm",  # Language code to build the Pipeline
                'use_gpu': False
            })

        nlp.add_processor(processor=AllenNLPProcessor(), config=config)
        nlp.initialize()
        return nlp
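A hedged usage sketch for the helper above; the AllenNLP config key shown is an assumption about that processor's options, so only the call pattern is the point here:
# Hypothetical configuration; consult AllenNLPProcessor's defaults for the real keys.
allennlp_config = {'processors': 'tokenize,pos,depparse'}
nlp = _create_pipeline(allennlp_config)
pack = nlp.process("Forte wraps AllenNLP models as pipeline processors.")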
Example #30
    def create_pack_iterator(self) -> Iterator[DataPack]:
        srl_train_reader = OntonotesReader(cache_in_memory=True)
        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(srl_train_reader)
        train_pl.initialize()
        pack_iterator = train_pl.process_dataset(self.train_path)

        return pack_iterator