Example No. 1
def write_results(pl: Pipeline, output_path: str, input_data: str):
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": output_path,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    pl.run(input_data)
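A minimal invocation sketch for the function above, assuming the Forte imports shown in Example No. 9 of this listing; the reader choice and the paths are placeholders, not part of the original source:

# Hypothetical usage sketch: write_results expects a Pipeline whose reader
# matches the input data; swap in the reader and paths for your own setup.
from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader  # any suitable reader works

pl = Pipeline()
pl.set_reader(OntonotesReader())
write_results(pl, output_path="output/articles", input_data="data/input")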
Example No. 2
    def prepare(self, *args, **kwargs):  # pylint: disable=unused-argument
        prepare_pl = Pipeline()
        prepare_pl.set_reader(self.train_reader)
        for p in self.preprocessors:
            prepare_pl.add_processor(p)

        prepare_pl.run(self.configs.config_data.train_path)

        for p in self.preprocessors:
            p.finish(resource=self.resource)
Example No. 3
def multi_example(input_path, output_path):
    """
    This example reads data from input path, and write multi pack output
    to output path.

    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Multi Pack serialization example.")

    print(
        "We first read the data, add multi packs to them, and then "
        "save the results."
    )
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        config={
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
        },
    )

    coref_pl.run(input_path)

    print(
        "We can then load the saved results and check that everything is OK. "
        "We should see the same number of multi packs there."
    )
    reading_pl = Pipeline()
    reading_pl.set_reader(
        MultiPackDirectoryReader(),
        config={
            "multi_pack_dir": os.path.join(output_path, "multi"),
            "data_pack_dir": os.path.join(output_path, "packs"),
        },
    )
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
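Judging from the assertions in the writer tests later in this listing, the multi-pack writers create pack.idx, multi.idx, a packs/ directory, and a multi/ directory under output_dir. A small hypothetical helper, not part of Forte, to sanity-check that layout after running the example:

import os

def check_multipack_layout(output_dir: str) -> None:
    # Expected entries, based on the assertions in the writer tests below.
    for name in ("pack.idx", "multi.idx", "packs", "multi"):
        if not os.path.exists(os.path.join(output_dir, name)):
            raise FileNotFoundError("Missing expected output: " + name)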
Example No. 4
    def testMultiPackWriting(self):
        coref_pl = Pipeline()
        coref_pl.set_reader(DirPackReader())
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())

        # The writer places its index files and pack directories under the
        # configured output directory, so the assertions check that directory.
        multi_out = os.path.join(self.main_output.name, 'multi')
        coref_pl.add(PackIdMultiPackWriter(),
                     config={
                         'output_dir': multi_out,
                         'indent': 2,
                         'overwrite': True,
                     })
        coref_pl.run(os.path.join(self.main_output.name, 'packs'))
        self.assertTrue(os.path.exists(os.path.join(multi_out, 'multi.idx')))
        self.assertTrue(os.path.exists(os.path.join(multi_out, 'pack.idx')))
        self.assertTrue(os.path.exists(os.path.join(multi_out, 'packs')))
        self.assertTrue(os.path.exists(os.path.join(multi_out, 'multi')))
Example No. 5
    def testMultiPackWriting(self):
        coref_pl = Pipeline()
        coref_pl.set_reader(DirPackReader())
        coref_pl.add(MultiPackBoxer())
        coref_pl.add(CopySentence())

        # The writer places its index files and pack directories under the
        # configured output directory, so the assertions check that directory.
        multi_out = os.path.join(self.main_output.name, "multi")
        coref_pl.add(
            PackIdMultiPackWriter(),
            config={
                "output_dir": multi_out,
                "indent": 2,
                "overwrite": True,
            },
        )
        coref_pl.run(os.path.join(self.main_output.name, "packs"))
        self.assertTrue(os.path.exists(os.path.join(multi_out, "multi.idx")))
        self.assertTrue(os.path.exists(os.path.join(multi_out, "pack.idx")))
        self.assertTrue(os.path.exists(os.path.join(multi_out, "packs")))
        self.assertTrue(os.path.exists(os.path.join(multi_out, "multi")))
Example No. 6
def multi_example(input_path, output_path):
    """
    This example reads data from input path, and write multi pack output
    to output path.

    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Multi Pack serialization example.")

    print("We first read the data, and add multi-packs to them, and then "
          "save the results.")
    coref_pl = Pipeline()
    coref_pl.set_reader(DirPackReader())
    coref_pl.add(MultiPackBoxer())
    coref_pl.add(PackCopier())
    coref_pl.add(ExampleCoreferencer())
    coref_pl.add(ExampleCorefCounter())

    coref_pl.add(
        MultiPackWriter(),
        {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
        }
    )

    coref_pl.run(input_path)

    print("We can then load the saved results, and see if everything is OK. "
          "We should see the same number of multi packs there. ")
    reading_pl = Pipeline()
    reading_pl.set_reader(MultiPackDiskReader(), {'data_path': output_path})
    reading_pl.add(ExampleCorefCounter())
    reading_pl.run()
Example No. 7
    def prepare(self):
        prepare_pl = Pipeline()
        prepare_pl.set_reader(self.train_reader)
        for p in self.preprocessors:
            prepare_pl.add(p)
        prepare_pl.run(self.configs.config_data.train_path)
Example No. 8
def main(nif_context: str, nif_page_structure: str, mapping_literals: str,
         mapping_objects: str, nif_text_links: str, redirects: str,
         info_boxs: str, output_path: str):
    # Load redirects.
    logging.info("Loading redirects")
    redirect_pickle = os.path.join(output_path, 'redirects.pickle')
    if os.path.exists(redirect_pickle):
        # Reuse the cached redirect map if it has been pickled before.
        with open(redirect_pickle, 'rb') as pickle_f:
            redirect_map: Dict[str, str] = pickle.load(pickle_f)
    else:
        redirect_map: Dict[str, str] = load_redirects(redirects)
        with open(redirect_pickle, 'wb') as pickle_f:
            pickle.dump(redirect_map, pickle_f)
    logging.info("Done loading.")

    # The datasets are read in two steps.
    raw_pack_dir = os.path.join(output_path, 'nif_raw')

    # First, we create the NIF reader that reads the NIF data in order.
    nif_pl = Pipeline()
    nif_pl.resource.update(redirects=redirect_map)

    nif_pl.set_reader(DBpediaWikiReader(),
                      config=HParams(
                          {
                              'redirect_path': redirects,
                              'nif_page_structure': nif_page_structure,
                              'nif_text_links': nif_text_links,
                          }, DBpediaWikiReader.default_configs()))

    nif_pl.add_processor(WikiArticleWriter(),
                         config=HParams(
                             {
                                 'output_dir': raw_pack_dir,
                                 'zip_pack': True,
                             }, WikiArticleWriter.default_configs()))

    nif_pl.initialize()
    logging.info('Start running the DBpedia text pipeline.')
    nif_pl.run(nif_context)

    # Second, we add info boxes to the packs with NIF.
    ib_pl = Pipeline()
    ib_pl.resource.update(redirects=redirect_map)
    ib_pl.set_reader(
        DBpediaInfoBoxReader(),
        config=HParams(
            {
                'pack_index': os.path.join(raw_pack_dir, 'article.idx'),
                'pack_dir': raw_pack_dir,
                'mapping_literals': mapping_literals,
                'mapping_objects': mapping_objects,
                'reading_log': os.path.join(output_path, 'infobox.log')
            }, DBpediaInfoBoxReader.default_configs()))

    ib_pl.add_processor(
        WikiArticleWriter(),
        config=HParams(
            {
                'output_dir': os.path.join(output_path, 'nif_info_box'),
                'zip_pack': True,
            }, WikiArticleWriter.default_configs()))

    # Now we run the info box pipeline.
    ib_pl.initialize()
    ib_pl.run(info_boxs)
Example No. 9
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the results to the current directory
# and uses the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()

nlp.run(data_path)
Example No. 10
    def testMultiPackWriting(self, config_data):
        zip_pack, method = config_data

        # Use a different sub-directory to avoid conflicts.
        subdir = f"{zip_pack}_{method}"

        with tempfile.TemporaryDirectory() as main_output:
            # Prepare input data.
            prepared_input: str = os.path.join(main_output, subdir,
                                               "input_packs")
            data_output: str = os.path.join(main_output, subdir, "output")
            suffix = ".pickle" if method == "pickle" else ".json"
            if zip_pack:
                suffix = suffix + ".gz"

            nlp = Pipeline[DataPack]()
            nlp.set_reader(OntonotesReader())
            nlp.add(
                PackIdJsonPackWriter(),
                {
                    "output_dir": prepared_input,
                    "overwrite": True,
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                },
            )
            nlp.run(self.data_path)

            # Convert to multi pack.
            coref_pl = Pipeline()

            coref_pl.set_reader(
                DirPackReader(),
                {
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                    "suffix": suffix,
                },
            )
            coref_pl.add(MultiPackBoxer())
            coref_pl.add(CopySentence())
            coref_pl.add(NaiveCoref())

            coref_pl.add(
                PackIdMultiPackWriter(),
                config={
                    "output_dir": data_output,
                    "overwrite": True,
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                },
            )
            coref_pl.run(prepared_input)

            self.assertTrue(
                os.path.exists(os.path.join(data_output, "multi.idx")))
            self.assertTrue(
                os.path.exists(os.path.join(data_output, "pack.idx")))
            self.assertTrue(os.path.exists(os.path.join(data_output, "packs")))
            self.assertTrue(os.path.exists(os.path.join(data_output, "multi")))

            # Read the multi pack again.
            mp_pipeline = Pipeline()

            mp_pipeline.set_reader(
                MultiPackDirectoryReader(),
                config={
                    "suffix": suffix,
                    "zip_pack": zip_pack,
                    "serialize_method": method,
                    "data_pack_dir": os.path.join(data_output, "packs"),
                    "multi_pack_dir": os.path.join(data_output, "multi"),
                },
            ).initialize()

            re: CrossDocEntityRelation
            for mp in mp_pipeline.process_dataset():
                for re in mp.get(CrossDocEntityRelation):
                    self.assertEqual(re.get_parent().text, re.get_child().text)