Code Example #1
File: serialize_example.py  Project: awoziji/forte
def pack_example(input_path, output_path):
    """
    This example reads data from the input path and serializes the
    processed results to the output path.

    Args:
        input_path: Path to the input OntoNotes data, read by OntonotesReader.
        output_path: Directory where the serialized data packs are written.

    Returns:
        None.
    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()

    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the results to the output
    # directory, using the DocID field in the data pack as the file name.
    nlp.add(
        DocIdJsonPackWriter(),
        {
            'output_dir': output_path,
            'indent': 2,
            'overwrite': True,
        }
    )

    nlp.run(input_path)
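
A minimal usage sketch (the paths below are placeholders, not taken from the project): pack_example only needs an OntoNotes-format input directory and an output directory to write to.

if __name__ == "__main__":
    # Hypothetical paths for illustration; adjust to the local data layout.
    pack_example(
        input_path="data_samples/ontonotes/00",
        output_path="serialized_packs",
    )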
Code Example #2
    def test_serialize_deserialize_processor(self):
        pipe_serialize = Pipeline[DataPack]()
        pipe_serialize.set_reader(OntonotesReader())
        pipe_serialize.add(
            AnnotationRemover(),
            # Remove the tokens and sentences produced by OntonotesReader.
            {
                'removal_types': [
                    'ft.onto.base_ontology.Token',
                    'ft.onto.base_ontology.Sentence',
                ]
            })
        pipe_serialize.add(NLTKSentenceSegmenter())
        pipe_serialize.add(NLTKWordTokenizer())
        pipe_serialize.add(NLTKPOSTagger())

        output_path = tempfile.mkdtemp()

        pipe_serialize.add(DocIdJsonPackWriter(), {
            'output_dir': output_path,
            'indent': 2,
        })

        dataset_path = "data_samples/ontonotes/00"
        pipe_serialize.run(dataset_path)

        pipe_deserialize = Pipeline[DataPack]()
        pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
        pipe_deserialize.initialize()

        token_counts: Dict[str, int] = {}

        # This basically tests that the deserialized data still matches what
        # we expect.
        pack: DataPack
        for pack in pipe_deserialize.process_dataset(output_path):
            tokens: List[Token] = list(pack.get(Token))
            token_counts[pack.pack_name] = len(tokens)

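        # Expected token counts, keyed by each pack's DocID (pack_name).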
        expected_count = {
            'bn/abc/00/abc_0039': 72,
            'bn/abc/00/abc_0019': 370,
            'bn/abc/00/abc_0059': 39,
            'bn/abc/00/abc_0009': 424,
            'bn/abc/00/abc_0029': 487,
            'bn/abc/00/abc_0069': 428,
            'bn/abc/00/abc_0049': 73
        }

        assert token_counts == expected_count
        shutil.rmtree(output_path)
Code Example #3
def main():
    import sys
    ner_dir, srl_dir = sys.argv[  # pylint: disable=unbalanced-tuple-unpacking
        1:3]

    output_config = HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    )

    eng_text = "The plain green Norway spruce is displayed in the gallery's " \
               "foyer. Wentworth worked as an assistant to sculptor Henry " \
               "Moore in the late 1960s. His reputation as a sculptor grew " \
               "in the 1980s."

    fr_text = "Van Gogh grandit au sein d'une famille de " \
              "l'ancienne bourgeoisie."

    stanford_nlp_example1('en', eng_text, output_config)
    stanford_nlp_example1('fr', fr_text, output_config)

    string_processor_example(ner_dir, srl_dir)
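
A hedged usage note (the script name below is hypothetical): main() pulls the NER and SRL model directories from the command line via sys.argv[1:3], so the script is expected to be invoked with those two arguments.

if __name__ == '__main__':
    # Expects: python stanfordnlp_example.py <ner_model_dir> <srl_model_dir>
    main()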
Code Example #4
def stanford_nlp_example1(lang: str, text: str, output_config: HParams):
    pl = Pipeline()
    pl.set_reader(StringReader())

    models_path = os.getcwd()
    config = HParams(
        {
            'processors': 'tokenize,pos,lemma,depparse',
            # Language code for the language used to build the pipeline.
            'lang': lang,
            'use_gpu': False
        },
        StandfordNLPProcessor.default_hparams())
    pl.add_processor(processor=StandfordNLPProcessor(models_path),
                     config=config)
    pl.add_processor(processor=DocIdJsonPackWriter(), config=output_config)

    pl.initialize()

    pack = pl.process(text)
    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")
        tokens = [(token.text, token.pos, token.lemma)
                  for token in pack.get(Token, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")

        print(colored("Dependency Relations:", 'red'))
        for link in pack.get(Dependency, sentence):
            parent: Token = link.get_parent()  # type: ignore
            child: Token = link.get_child()  # type: ignore
            print(colored(child.text, 'cyan'), "has relation",
                  colored(link.rel_type, 'green'), "of parent",
                  colored(parent.text, 'cyan'))

        print("\n----------------------\n")
Code Example #5
File: serialize_example.py  Project: huzecong/forte
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the results to the current directory,
# using the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()

nlp.run(data_path)
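
A hedged follow-up (assuming DocIdJsonPackWriter names each output file after the pack's DocID with a .json suffix; the OntoNotes DocIDs contain '/', so the files land in nested subdirectories under output_dir, here '.'): the serialized output can be inspected with the standard library alone.

import glob

for path in sorted(glob.glob('./**/*.json', recursive=True)):
    print(path)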
Code Example #6
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the results to the current directory,
# using the DocID field in the data pack as the file name.
nlp.add_processor(DocIdJsonPackWriter(), HParams(
    {
        'output_dir': '.'
    },
    DocIdJsonPackWriter.default_hparams(),
))

nlp.initialize()

nlp.run(data_path)
Code Example #7
File: serialize_example.py  Project: williamwhe/forte
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the results to the current directory,
# using the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_configs(),
    ))

nlp.initialize()

nlp.run(data_path)