Example #1
def pack_example(input_path, output_path):
    """
    This example read data from input path and serialize to output path.
    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()

    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the results to the output
    # directory and uses the DocID field in the data pack as the file name.
    nlp.add(PackNameJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
        'overwrite': True,
    })

    nlp.run(input_path)
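
A quick way to verify the output of the example above is to read the serialized packs back with the deserialize reader used in the later examples. A minimal round-trip sketch, assuming the paths below ("output_packs" is a placeholder) and the usual Forte import locations:

from forte.data.data_pack import DataPack
from forte.data.readers import RecursiveDirectoryDeserializeReader
from forte.pipeline import Pipeline
from ft.onto.base_ontology import Sentence

# Paths are placeholders for this sketch.
pack_example("data_samples/ontonotes/00", "output_packs")

check = Pipeline[DataPack]()
check.set_reader(RecursiveDirectoryDeserializeReader())
check.initialize()

for pack in check.process_dataset("output_packs"):
    # Each deserialized pack still carries the annotations added by the
    # NLTK processors above.
    print(pack.pack_name, len(list(pack.get(Sentence))))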
Example #2
def build_pipeline(result_dir: str, word_counter: Counter,
                   tag_counter: Counter):
    r"""Build the pipeline to parse IU Xray report with tokenizer, lowercase and
    non-alpha removal to generate forte json file with the same name with
    preprocessed content and information of impression, findings and path to the
    parent image.
    Args:
        result_dir: the directory to save the forte json files.
    Return:
        pipeline: built pipeline to process the xml files
    """

    pipeline = Pipeline[MultiPack]()
    pipeline.resource.update(word_counter=word_counter)
    pipeline.resource.update(tag_counter=tag_counter)
    pipeline.set_reader(IUXrayReportReader())
    pipeline.add(MultiPackBoxer())
    pipeline.add(PackNameJsonPackWriter(), {
        'indent': 2,
        'output_dir': result_dir,
        'overwrite': True
    }, NameMatchSelector(select_name='default'))
    pipeline.initialize()

    return pipeline
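
A hypothetical driver for build_pipeline, assuming the IU Xray XML reports live in a local directory (the paths and counters below are placeholders):

from collections import Counter

word_counter: Counter = Counter()
tag_counter: Counter = Counter()

# build_pipeline already calls initialize(), so process_dataset can be used
# directly; the writer saves one Forte JSON file per report in result_dir.
pipeline = build_pipeline("result_packs", word_counter, tag_counter)
for _ in pipeline.process_dataset("iu_xray_reports"):
    pass

# The counters were registered as pipeline resources, so every processor in
# the pipeline can read and update them during processing.
print(word_counter.most_common(10))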
Example #3
    def test_serialize_deserialize_processor(self):
        pipe_serialize = Pipeline[DataPack]()
        pipe_serialize.set_reader(OntonotesReader())
        pipe_serialize.add(
            AnnotationRemover(),
            # Remove tokens and sentences from OntonotesReader.
            {
                "removal_types": [
                    "ft.onto.base_ontology.Token",
                    "ft.onto.base_ontology.Sentence",
                ]
            },
        )
        pipe_serialize.add(PeriodSentenceSplitter())
        pipe_serialize.add(WhiteSpaceTokenizer())

        with tempfile.TemporaryDirectory() as output_dir:
            pipe_serialize.add(
                PackNameJsonPackWriter(),
                {
                    "output_dir": output_dir,
                    "indent": 2,
                },
            )

            pipe_serialize.run(self.data_path)

            pipe_deserialize = Pipeline[DataPack]()
            pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
            pipe_deserialize.initialize()

            token_counts: Dict[str, int] = {}

            # This basically tests whether the deserialized data is
            # still the same as expected.
            pack: DataPack
            for pack in pipe_deserialize.process_dataset(output_dir):
                tokens: List[Token] = list(pack.get(Token))
                token_counts[pack.pack_name] = len(tokens)

            expected_count = {
                "bn/abc/00/abc_0039": 72,
                "bn/abc/00/abc_0019": 370,
                "bn/abc/00/abc_0059": 39,
                "bn/abc/00/abc_0009": 424,
                "bn/abc/00/abc_0029": 487,
                "bn/abc/00/abc_0069": 428,
                "bn/abc/00/abc_0049": 73,
            }

            assert token_counts == expected_count
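
            # An extra sanity check one could add here (a sketch): the pack
            # names contain "/", so the writer lays the serialized packs out
            # as a nested directory tree under output_dir, one file per pack
            # whatever suffix the writer configuration uses.
            import os
            file_count = sum(
                len(files) for _, _, files in os.walk(output_dir)
            )
            assert file_count == len(expected_count)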
Example #4
    def test_serialize_deserialize_processor(self):
        pipe_serialize = Pipeline[DataPack]()
        pipe_serialize.set_reader(OntonotesReader())
        pipe_serialize.add(
            AnnotationRemover(),
            # Remove tokens and sentences from OntonotesReader.
            {
                'removal_types': [
                    'ft.onto.base_ontology.Token',
                    'ft.onto.base_ontology.Sentence',
                ]
            })
        pipe_serialize.add(NLTKSentenceSegmenter())
        pipe_serialize.add(NLTKWordTokenizer())
        pipe_serialize.add(NLTKPOSTagger())

        output_path = tempfile.mkdtemp()

        pipe_serialize.add(PackNameJsonPackWriter(), {
            'output_dir': output_path,
            'indent': 2,
        })

        dataset_path = "data_samples/ontonotes/00"
        pipe_serialize.run(dataset_path)

        pipe_deserialize = Pipeline[DataPack]()
        pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
        pipe_deserialize.initialize()

        token_counts: Dict[str, int] = {}

        # This basically tests whether the deserialized data is still the same
        # as expected.
        pack: DataPack
        for pack in pipe_deserialize.process_dataset(output_path):
            tokens: List[Token] = list(pack.get(Token))
            token_counts[pack.pack_name] = len(tokens)

        expected_count = {
            'bn/abc/00/abc_0039': 72,
            'bn/abc/00/abc_0019': 370,
            'bn/abc/00/abc_0059': 39,
            'bn/abc/00/abc_0009': 424,
            'bn/abc/00/abc_0029': 487,
            'bn/abc/00/abc_0069': 428,
            'bn/abc/00/abc_0049': 73
        }

        assert token_counts == expected_count
        shutil.rmtree(output_path)
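
Example #4 relies on an explicit shutil.rmtree for cleanup, which never runs if the assertion above fails; the tempfile.TemporaryDirectory context manager used in Example #3 removes the directory regardless. A minimal sketch of that variant:

        with tempfile.TemporaryDirectory() as output_path:
            pipe_serialize.add(PackNameJsonPackWriter(), {
                'output_dir': output_path,
                'indent': 2,
            })
            pipe_serialize.run(dataset_path)
            # ... deserialize and compare the token counts inside the
            # with-block, exactly as in Example #3; the directory is deleted
            # automatically when the block exits, even on failure.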
Example #5
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The data preparation pipeline for the clinical NER example.
"""
from examples.Cliner.reader import ClinerReader
from forte.data.data_pack import DataPack
from forte.pipeline import Pipeline

# Let's create a pipeline that reads the clinical document and writes it out
# as a JSON data pack.
from forte.processors.writers import PackNameJsonPackWriter

pipeline = Pipeline[DataPack]()
pipeline.set_reader(ClinerReader())
pipeline.add(PackNameJsonPackWriter(), {
    'indent': 2,
    'output_dir': '.',
    'overwrite': True
})
pipeline.run('CliNER/data/examples/ex_doc.txt')
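
The writer above drops the JSON pack into the working directory; for a quick round-trip check it is easier to point 'output_dir' at a dedicated folder and read it back. A sketch, where 'cliner_packs' is a placeholder directory and the reader import location is assumed:

from forte.data.readers import RecursiveDirectoryDeserializeReader

# Assumes the writer above was configured with 'output_dir': 'cliner_packs'.
check = Pipeline[DataPack]()
check.set_reader(RecursiveDirectoryDeserializeReader())
check.initialize()
for pack in check.process_dataset('cliner_packs'):
    # The deserialized pack keeps the text and pack name produced by
    # ClinerReader.
    print(pack.pack_name, len(pack.text))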
Example #6
        # Fragment from the Instructor processor used below: it marks the
        # last len(self.instruction) characters of the pack text as an
        # Utterance spoken by the "ai" side.
        u = Utterance(
            input_pack,
            len(input_pack.text) - len(self.instruction),
            len(input_pack.text),
        )
        u.speaker = "ai"


instruct_text = (
    "This is an example to use the chatbot interface with the "
    "content rewriter model. To run this example, follow the "
    'instructions here "https://github.com/asyml/forte'
    '/tree/master/examples/content_rewriter" to obtain '
    "the models and make sure Forte is in your Python Path."
)

pipeline = Pipeline[DataPack]()
pipeline.set_reader(TableReader())
pipeline.add(Instructor(instruct_text))
pipeline.add(
    PackNameJsonPackWriter(),
    {
        "indent": 2,
        "output_dir": "table_inputs",
        "overwrite": True,
        "drop_record": True,
    },
)

pipeline.run("table_samples.txt")