Code Example #1
File: html_reader_test.py  Project: meelement/forte
    def setUp(self):
        self._cache_directory = Path(os.path.join(os.getcwd(), "cache_html"))
        self.reader = HTMLReader(cache_directory=self._cache_directory,
                                 append_to_cache=True)

        self.pl1 = Pipeline()
        self.pl1.set_reader(self.reader)
        self.pl1.initialize()

        self.pl2 = Pipeline()
        self.pl2.set_reader(HTMLReader(from_cache=True,
                                       cache_directory=self._cache_directory))
        self.pl2.initialize()
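
The two pipelines above exercise HTMLReader's cache: pl1 appends freshly parsed packs to cache_html, while pl2 is built with from_cache=True so it serves packs back from that directory. A minimal follow-up sketch, assuming process_one accepts a raw HTML string; the test method name and input_html value are hypothetical:

    def test_cache_roundtrip(self):
        # Hypothetical sketch: pl1 parses the HTML and appends the resulting
        # pack to cache_html; pl2 was built with from_cache=True, so it should
        # return the same content straight from that cache.
        input_html = "<html><body><p>Hello Forte</p></body></html>"

        pack = self.pl1.process_one(input_html)
        pack_from_cache = self.pl2.process_one(input_html)

        self.assertEqual(pack.text, pack_from_cache.text)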
Code Example #2
def multi_example(input_path, output_path):
    """
    This example reads data from input path, and write multi pack output
    to output path.

    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Multi Pack serialization example.")

    print(
        "We first read the data, and add multi-packs to them, and then "
        "save the results."
    )
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        config={
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
        },
    )

    coref_pl.run(input_path)

    print(
        "We can then load the saved results, and see if everything is OK. "
        "We should see the same number of multi packs there. "
    )
    reading_pl = Pipeline()
    reading_pl.set_reader(
        MultiPackDirectoryReader(),
        config={
            "multi_pack_dir": os.path.join(output_path, "multi"),
            "data_pack_dir": os.path.join(output_path, "packs"),
        },
    )
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
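
A hedged invocation sketch for multi_example; the directory names are placeholders only. input_path should point to a directory of serialized data packs readable by DirPackReader, and output_path will receive the multi/ and packs/ sub-directories that MultiPackDirectoryReader reads back above:

if __name__ == "__main__":
    # Placeholder paths; point them at real serialized packs before running.
    multi_example(
        input_path="sample_packs/",
        output_path="multi_pack_output/",
    )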
Code Example #3
    def test_pipeline4(self, batch_size):
        """Tests a chain of Pack->Batch->Pack."""

        nlp = Pipeline()
        reader = SentenceReader()
        nlp.set_reader(reader)
        dummy1 = DummyPackProcessor()
        nlp.add_processor(processor=dummy1)

        dummy2 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size}}
        nlp.add_processor(processor=dummy2, config=config)

        dummy3 = DummyPackProcessor()
        nlp.add_processor(processor=dummy3)
        nlp.initialize()
        data_path = "data_samples/random_texts/0.txt"

        num_packs = 0
        for pack in nlp.process_dataset(data_path):
            types = list(pack.get_entries_by_type(NewType))
            num_packs += 1
            self.assertEqual(len(types), 1)
            self.assertEqual(types[0].value, "[PACK][BATCH][PACK]")

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
Code Example #4
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = {
            "input_pack_name": "query",
            "output_pack_name": "output"
        }
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        config = {
            "model": {
                "name": "bert-base-uncased"
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        }
        nlp.add_processor(BertBasedQueryCreator(), config=config)

        nlp.initialize()

        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            query_pack = m_pack.get_pack("query")
            self.assertEqual(len(query_pack.generics), 1)
            self.assertIsInstance(query_pack.generics[0], Query)
            query = query_pack.generics[0].value
            self.assertEqual(query.shape, (1, 768))
Code Example #5
    def test_pipeline(self, texts):
        for idx, text in enumerate(texts):
            file_path = os.path.join(self.test_dir, f"{idx+1}.txt")
            with open(file_path, 'w') as f:
                f.write(text)

        nlp = Pipeline()
        reader_config = HParams(
            {
                "input_pack_name": "input",
                "output_pack_name": "output"
            }, MultiPackSentenceReader.default_hparams())
        nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)
        translator_config = HParams(
            {
                "src_language": "de",
                "target_language": "en",
                "in_pack_name": "input",
                "out_pack_name": "result"
            }, None)

        nlp.add_processor(MicrosoftBingTranslator(), config=translator_config)
        nlp.initialize()

        english_results = ["Hey good morning", "This is Forte. A tool for NLP"]
        for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
            self.assertEqual(set(m_pack._pack_names),
                             set(["input", "output", "result"]))
            self.assertEqual(
                m_pack.get_pack("result").text, english_results[idx] + "\n")
Code Example #6
    def setUp(self):
        # Define and configure the Pipeline
        self.dataset_path = "examples/"

        self.pl1 = Pipeline()
        self._cache_directory = Path(os.path.join(os.getcwd(), "cache_data"))
        self.pl1.set_reader(StringReader())

        self.pl2 = Pipeline()
        self.pl2.set_reader(StringReader())

        self.text = (
            "The plain green Norway spruce is displayed in the gallery's "
            "foyer. Wentworth worked as an assistant to sculptor Henry Moore "
            "in the late 1960s. His reputation as a sculptor grew in the "
            "1980s.")
Code Example #7
def stanford_nlp_example(lang: str, text: str):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': lang,
            # Language code for the language to build the Pipeline
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
Code Example #8
    def test_reader_original_span_test(self, value):
        span_ops, output = (
            [
                (Span(11, 19), "New"),
                (Span(19, 20), " Shiny "),
                (Span(25, 25), " Ends"),
            ],
            "<title>The New Shiny Title Ends </title>",
        )
        input_span, expected_span, mode = value

        pipeline = Pipeline()
        reader = PlainTextReader()
        reader.text_replace_operation = lambda _: span_ops
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack = pipeline.process_one(self.test_dir)

        self.assertEqual(pack.text, output)

        output_span = pack.get_original_span(input_span, mode)
        self.assertEqual(
            output_span,
            expected_span,
            f"Expected: ({expected_span.begin, expected_span.end}"
            f"), Found: ({output_span.begin, output_span.end})"
            f" when Input: ({input_span.begin, input_span.end})"
            f" and Mode: {mode}",
        )
Code Example #9
    def setUp(self):
        root_path = os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                os.pardir,
                os.pardir,
                os.pardir,
            ))

        file_path: str = os.path.join(root_path,
                                      "data_samples/data_pack_dataset_test")
        reader = CoNLL03Reader()
        context_type = Sentence
        request = {Sentence: []}
        skip_k = 0

        self.input_files = ["conll03_1.conll", "conll03_2.conll"]
        self.feature_schemes = {}

        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(reader)
        train_pl.initialize()
        pack_iterator: Iterator[PackType] = train_pl.process_dataset(file_path)

        self.data_source: DataPackIterator = DataPackIterator(
            pack_iterator, context_type, request, skip_k)
Code Example #10
    def setUp(self):
        p: Pipeline = Pipeline()
        p.set_reader(EmptyReader())
        p.add(EntryAnnotator())
        p.initialize()

        self.pack: DataPack = p.process(['doc1', 'doc2'])
Code Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file",
                        default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    # loading config
    with open(args.config_file, "r") as config_file:
        config = yaml.safe_load(config_file)

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"],
    )

    nlp.initialize()

    for _, m_pack in enumerate(nlp.process_dataset()):
        aug_pack = m_pack.get_pack("augmented_input")
        logging.info(aug_pack.text)
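
A hedged sketch of the dictionary shape that the loaded config.yml must provide for the pipeline above; only the three top-level keys are taken from the code, and the per-component settings are left as placeholders:

# Illustrative structure of the parsed config.yml; fill in the settings that
# RandomDataSelector, MultiPackBoxer and ReplacementDataAugmentProcessor expect.
config = {
    "data_selector_config": {},
    "boxer_config": {},
    "da_processor_config": {},
}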
Code Example #12
    def setUp(self):
        self.pipeline = Pipeline()

        self.pipeline.set_reader(AGNewsReader())
        self.pipeline.initialize()

        self.sample_file: str = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         *([os.path.pardir] * 4),
                         "data_samples/ag_news/sample.csv"))

        self.expected_content: Dict[int, Tuple[int, str, str]] = {}
        with open(self.sample_file, "r") as file:
            for line_id, line in enumerate(file):
                data = line.strip().split(",")
                class_id, title, description = (
                    int(data[0].replace('"', "")),
                    data[1],
                    data[2],
                )
                self.expected_content[line_id] = (class_id, title, description)

        self.class_idx_to_name = {
            1: "World",
            2: "Sports",
            3: "Business",
            4: "Sci/Tech",
        }
Code Example #13
    def test_caster_all_selector(self):
        """
        Test whether the caster and the all-pack selector work well together.
        The caster converts a single pack to a multi pack, and the pack
        copier creates a new pack. The all-pack selector then selects all
        packs from the multi pack. This test makes sure this pipeline
        works correctly.
        """
        mp: MultiPack
        for mp in (
            Pipeline()
            .set_reader(SentenceReader())
            .add(MultiPackBoxer())
            .add(MultiPackCopier())
            .add(DummyPackProcessor(), selector=AllPackSelector())
            .initialize()
            .process_dataset(
                os.path.join(data_samples_root, "random_texts", "0.txt")
            )
        ):
            num_pack = 0
            for pack in mp.packs:
                num_pack += 1
                entries = list(pack.get(NewType))
                self.assertEqual(len(entries), 1)
                self.assertEqual(entries[0].value, "[PACK]")
            self.assertEqual(num_pack, 2)
Code Example #14
    def test_pipeline7(self, batch_size1, batch_size2, batch_size3):
        # Tests a chain of Batch->Batch->Batch->Pack with different batch sizes.

        nlp = Pipeline()
        reader = MultiPackSentenceReader()
        nlp.set_reader(reader)
        dummy1 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size1}}
        nlp.add_processor(processor=dummy1,
                          config=config,
                          selector=FirstPackSelector())
        dummy2 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size2}}
        nlp.add_processor(processor=dummy2,
                          config=config,
                          selector=FirstPackSelector())
        dummy3 = DummmyFixedSizeBatchProcessor()
        config = {"batcher": {"batch_size": batch_size3}}
        nlp.add_processor(processor=dummy3,
                          config=config,
                          selector=FirstPackSelector())
        dummy4 = DummyPackProcessor()
        nlp.add_processor(processor=dummy4, selector=FirstPackSelector())
        nlp.initialize()
        data_path = "data_samples/random_texts/0.txt"

        num_packs = 0
        for pack in nlp.process_dataset(data_path):
            types = list(pack.get_pack("pack").get_entries_by_type(NewType))
            num_packs += 1
            self.assertEqual(len(types), 1)
            self.assertEqual(types[0].value, "[BATCH][BATCH][BATCH][PACK]")

        # check that all packs are yielded
        self.assertEqual(num_packs, reader.count)
Code Example #15
    def setUp(self):
        # create indexer
        file_dir_path = os.path.dirname(__file__)
        data_dir = 'data_samples/ms_marco_passage_retrieval'
        self.abs_data_dir = os.path.abspath(
            os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
        self.index_name = "final"
        indexer_config = {
            "batch_size": 5,
            "fields": ["doc_id", "content", "pack_info"],
            "indexer": {
                "name": "ElasticSearchIndexer",
                "hparams": {
                    "index_name": self.index_name,
                    "hosts": "localhost:9200",
                    "algorithm": "bm25"
                },
                "other_kwargs": {
                    "request_timeout": 10,
                    "refresh": True
                }
            }
        }
        self.indexer = ElasticSearchIndexer(
            config={"index_name": self.index_name})
        nlp: Pipeline[DataPack] = Pipeline()
        nlp.set_reader(MSMarcoPassageReader())
        nlp.add(DataSelectorIndexProcessor(), config=indexer_config)
        nlp.initialize()

        self.size = 0
        for _ in nlp.process_dataset(self.abs_data_dir):
            self.size += 1

        self.test_dir = tempfile.mkdtemp()
Code Example #16
    def create_pack_iterator(self) -> Iterator[DataPack]:
        srl_train_reader = OntonotesReader(cache_in_memory=True)
        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(srl_train_reader)
        train_pl.initialize()
        pack_iterator = train_pl.process_dataset(self.train_path)

        return pack_iterator
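
A brief sketch of how the lazily evaluated iterator returned above might be consumed; the counting method is illustrative only:

    def count_train_packs(self) -> int:
        # Illustrative only: drain the lazy iterator from create_pack_iterator()
        # and count the DataPacks it yields.
        num_packs = 0
        for _ in self.create_pack_iterator():
            num_packs += 1
        return num_packs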
Code Example #17
    def setUp(self):
        # Define and configure the Pipeline
        self.fp = tempfile.NamedTemporaryFile(
            mode='w', suffix='.jsonl', delete=False)
        self.nlp = Pipeline()
        self.nlp.set_reader(ProdigyReader())
        self.create_sample_file()
Code Example #18
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = "data_samples/ontonotes/00/"
Code Example #19
    def test_without_attribute_masker(self):
        pl = Pipeline()
        pl.set_reader(CoNLL03Reader())
        pl.initialize()

        for pack in pl.process_dataset("data_samples/conll03/"):
            entries = pack.get_entries_by_type(Token)
            for entry in entries:
                self.assertIsNotNone(entry.ner)
Code Example #20
    def test_reader_no_replace_test(self):
        # Read with no replacements
        pipeline = Pipeline()
        reader = PlainTextReader()
        pipeline.set_reader(reader, {"file_ext": ".html"})
        pipeline.initialize()

        pack = pipeline.process_one(self.test_dir)
        self.assertEqual(pack.text, self.orig_text)
Code Example #21
    def setUp(self):
        self.nlp = Pipeline()
        self.nlp.set_reader(StringReader())
        self.nlp.add(NLTKSentenceSegmenter())
        boxer_config = {"pack_name": "question"}
        self.nlp.add(MultiPackBoxer(), boxer_config)
        self.nlp.add(MutliDocPackAdder())
        self.nlp.add(QuestionAnsweringMulti())
        self.nlp.initialize()
Code Example #22
    def setUp(self) -> None:
        file_dir_path = os.path.dirname(__file__)
        data_path = os.path.join(file_dir_path, os.pardir, os.pardir,
                                 'test_data', 'ontonotes')

        pipeline: Pipeline = Pipeline()
        pipeline.set_reader(OntonotesReader())
        pipeline.initialize()
        self.data_pack: DataPack = pipeline.process_one(data_path)
Code Example #23
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.reader = OntonotesReader()

        self.data_path = "examples/data_samples/ontonotes/00/"

        self.nlp.set_reader(OntonotesReader())
        self.nlp.add_processor(DummyRelationExtractor())
        self.nlp.initialize()
Code Example #24
    def prepare(self, *args, **kwargs):  # pylint: disable=unused-argument
        prepare_pl = Pipeline()
        prepare_pl.set_reader(self.train_reader)
        for p in self.preprocessors:
            prepare_pl.add_processor(p)

        prepare_pl.run(self.configs.config_data.train_path)

        for p in self.preprocessors:
            p.finish(resource=self.resource)
Code Example #25
File: data_pack_test.py  Project: gaurav5590/forte
    def setUp(self) -> None:
        file_dir_path = os.path.dirname(__file__)
        data_path = os.path.abspath(
            os.path.join(file_dir_path, '../../../', 'data_samples',
                         'ontonotes/one_file'))

        pipeline: Pipeline = Pipeline()
        pipeline.set_reader(OntonotesReader())
        pipeline.initialize()
        self.data_pack: DataPack = pipeline.process_one(data_path)
Code Example #26
    def setUp(self):
        # Define and configure the Pipeline
        self.dataset_path = "examples/data_samples/ontonotes/00"

        self.nlp = Pipeline()

        self.nlp.set_reader(OntonotesReader())
        self.nlp.add_processor(DummyPackProcessor())

        self.nlp.initialize()
Code Example #27
    def setUp(self):
        # Define and configure the Pipeline
        self.dataset_path = "data_samples/conll03"

        self.nlp = Pipeline()

        self.nlp.set_reader(CoNLL03Reader())
        self.nlp.add_processor(DummyPackProcessor())
        self.nlp.add_processor(DummyPackProcessor())

        self.nlp.initialize()
Code Example #28
    def test_process_next(self):

        another_pipeline = Pipeline()
        another_pipeline.set_reader(DeserializeReader())
        another_pipeline.initialize()

        data = ["Testing Reader", "Testing Deserializer"]

        for pack in self.nlp.process_dataset(data):
            for new_pack in another_pipeline.process_dataset([pack.serialize()]):
                self.assertEqual(pack.text, new_pack.text)
Code Example #29
File: pipeline_test.py  Project: huzecong/forte
    def setUp(self) -> None:
        # Define and configure the Pipeline
        self.dataset_path = "examples/ontonotes_sample_dataset/00"

        self.nlp = Pipeline()

        self.nlp.set_reader(OntonotesReader())
        self.processor = DummyRelationExtractor()
        self.nlp.add_processor(self.processor)

        self.nlp.initialize()
Code Example #30
    def test_empty_selector(self):
        """
        Test that a selector which selects nothing still behaves correctly
        in the pipeline.
        """
        for pack in (
            Pipeline()
            .set_reader(MultiPackSentenceReader())
            .add(DummyPackProcessor(), selector=NothingSelector())
            .initialize()
            .process_dataset(
                os.path.join(data_samples_root, "random_texts", "0.txt")
            )
        ):
            # Because no packs are selected, no entries are added.
            self.assertEqual(pack.get_pack('pack').num_generics_entries, 0)