def pack_example(input_path, output_path):
    """
    This example reads data from `input_path` and serializes the results
    to `output_path`.

    Args:
        input_path: The dataset directory to read from.
        output_path: The directory to write the serialized data packs to.

    Returns:
        None
    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()

    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the result to the given
    # output directory and uses the DocID field in the data pack as the
    # file name.
    nlp.add(PackNameJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
        'overwrite': True,
    })

    nlp.run(input_path)
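A minimal way to drive the example above; the paths are illustrative (the OntoNotes sample directory mirrors the one used in the tests later in this section, and "./serialized_packs" is a hypothetical output directory):

if __name__ == '__main__':
    # Hypothetical paths, chosen only for illustration.
    pack_example("data_samples/ontonotes/00", "./serialized_packs")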
def setup(config: Config) -> Pipeline:
    resource = Resources()
    query_pipeline = Pipeline[MultiPack](resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader)
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.translator)
    query_pipeline.add(
        component=BertBasedQueryCreator(), config=config.query_creator)
    query_pipeline.add(
        component=SearchProcessor(), config=config.searcher)

    top_response_pack_name = config.indexer.response_pack_name + '_0'

    query_pipeline.add(
        component=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name=top_response_pack_name))
    query_pipeline.add(
        component=MicrosoftBingTranslator(), config=config.back_translator)

    query_pipeline.initialize()

    return query_pipeline
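A sketch of how setup() might be driven, assuming a config.yml shaped like the ones used in the other examples here; the file name, the import path for Config, and the driver loop are assumptions, not part of setup() itself:

import yaml

from forte.common.configuration import Config  # import path assumed

config = Config(yaml.safe_load(open("config.yml", "r")),
                default_hparams=None)
query_pipeline = setup(config)
for m_pack in query_pipeline.process_dataset():
    # Pack names follow the other examples in this section.
    print(m_pack.get_pack("response").text)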
def setUp(self):
    self.nltk = Pipeline[DataPack]()
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())
    self.nltk.initialize()
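A small test method that could follow this setUp; the input string and the assertion are illustrative:

def test_pos_tagging(self):
    pack = self.nltk.process("Forte makes NLP pipelines easy.")
    for sentence in pack.get(Sentence):
        for token in pack.get(Token, sentence):
            # NLTKPOSTagger should have attached a POS tag to each token.
            self.assertIsNotNone(token.pos)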
def setUp(self):
    self.nltk = Pipeline[DataPack]()
    self.nltk.set_reader(StringReader())
    self.nltk.add(NLTKSentenceSegmenter())
    self.nltk.add(NLTKWordTokenizer())
    self.nltk.add(NLTKPOSTagger())

    config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
    self.nltk.add(NLTKChunker(), config=config)
    self.nltk.initialize()
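A sketch of a chunker check that could pair with this setUp; the sentence and the expected phrase are illustrative, assuming the NLTK tagger labels "The big dog" as DT JJ NN so that it matches the NP pattern:

def test_chunking(self):
    pack = self.nltk.process("The big dog barked.")
    phrases = [phrase.text for phrase in pack.get(Phrase)]
    # With the NP pattern above, "The big dog" should form one phrase.
    self.assertIn("The big dog", phrases)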
def setUp(self):
    random.seed(0)
    self.nlp = Pipeline[MultiPack]()

    boxer_config = {'pack_name': 'input_src'}

    self.nlp.set_reader(reader=StringReader())
    self.nlp.add(component=MultiPackBoxer(), config=boxer_config)
    self.nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    self.nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
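One way a test might exercise this MultiPack setup; note that initialize() is deferred to the test body here, and the input text is illustrative:

def test_tokens_on_all_packs(self):
    self.nlp.initialize()
    for m_pack in self.nlp.process_dataset(["Forte builds NLP pipelines."]):
        src_pack = m_pack.get_pack('input_src')
        for token in src_pack.get(Token):
            # The AllPackSelector runs should tag tokens in every pack.
            self.assertIsNotNone(token.pos)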
def string_processor_example(ner_model_dir: str, srl_model_dir: str):
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_dir, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    ner_predictor = CoNLLNERPredictor()
    pl.add_processor(ner_predictor, ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_dir,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    text = (
        "The plain green Norway spruce is displayed in the gallery's foyer. "
        "Wentworth worked as an assistant to sculptor Henry Moore in the "
        "late 1960s. His reputation as a sculptor grew in the 1980s.")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")

        # First method to get entries in a sentence.
        tokens = [(token.text, token.pos) for token in
                  pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type) for entity in
                    pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_serialize_deserialize_processor(self):
    pipe_serialize = Pipeline[DataPack]()
    pipe_serialize.set_reader(OntonotesReader())
    pipe_serialize.add(
        AnnotationRemover(),
        # Remove tokens and sentences from OntonotesReader.
        {
            'removal_types': [
                'ft.onto.base_ontology.Token',
                'ft.onto.base_ontology.Sentence',
            ]
        })
    pipe_serialize.add(NLTKSentenceSegmenter())
    pipe_serialize.add(NLTKWordTokenizer())
    pipe_serialize.add(NLTKPOSTagger())

    output_path = tempfile.mkdtemp()
    pipe_serialize.add(DocIdJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
    })

    dataset_path = "data_samples/ontonotes/00"
    pipe_serialize.run(dataset_path)

    pipe_deserialize = Pipeline[DataPack]()
    pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
    pipe_deserialize.initialize()

    token_counts: Dict[str, int] = {}

    # This basically tests whether the deserialized data is still the
    # same as expected.
    pack: DataPack
    for pack in pipe_deserialize.process_dataset(output_path):
        tokens: List[Token] = list(pack.get(Token))
        token_counts[pack.pack_name] = len(tokens)

    expected_count = {
        'bn/abc/00/abc_0039': 72,
        'bn/abc/00/abc_0019': 370,
        'bn/abc/00/abc_0059': 39,
        'bn/abc/00/abc_0009': 424,
        'bn/abc/00/abc_0029': 487,
        'bn/abc/00/abc_0069': 428,
        'bn/abc/00/abc_0049': 73
    }

    assert token_counts == expected_count
    shutil.rmtree(output_path)
def main(dataset_dir: str, ner_model_path: str, srl_model_path: str):
    pl = Pipeline()
    pl.set_reader(PlainTextReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())

    ner_configs = HParams(
        {'storage_path': os.path.join(ner_model_path, 'resources.pkl')},
        CoNLLNERPredictor.default_hparams())
    pl.add_processor(CoNLLNERPredictor(), ner_configs)

    srl_configs = HParams({
        'storage_path': srl_model_path,
    }, SRLPredictor.default_hparams())
    pl.add_processor(SRLPredictor(), srl_configs)

    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            # First method to get entries in a sentence.
            tokens = [(token.text, token.pos) for token in
                      pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type) for entity in
                        pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # Second method to get entries in a sentence.
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:",
                      entities, "\n")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def main():
    pl = Pipeline()
    pl.set_reader(StringReader())
    pl.add_processor(NLTKSentenceSegmenter())
    pl.add_processor(NLTKWordTokenizer())
    pl.add_processor(NLTKPOSTagger())
    pl.add_processor(CoNLLNERPredictor(), config=config.NER)
    pl.add_processor(SRLPredictor(), config=config.SRL)

    pl.initialize()

    text = (
        "So I was excited to see Journey to the Far Side of the Sun finally "
        "get released on an affordable DVD (the previous print had been "
        "fetching $100 on eBay - I'm sure those people wish they had their "
        "money back - but more about that in a second).")

    pack = pl.process_one(text)

    for sentence in pack.get(Sentence):
        sent_text = sentence.text
        print(colored("Sentence:", 'red'), sent_text, "\n")

        # First method to get entries in a sentence.
        tokens = [(token.text, token.pos) for token in
                  pack.get(Token, sentence)]
        entities = [(entity.text, entity.ner_type) for entity in
                    pack.get(EntityMention, sentence)]
        print(colored("Tokens:", 'red'), tokens, "\n")
        print(colored("EntityMentions:", 'red'), entities, "\n")

        # Second method to get entries in a sentence.
        print(colored("Semantic role labels:", 'red'))
        for link in pack.get(PredicateLink, sentence):
            parent: PredicateMention = link.get_parent()  # type: ignore
            child: PredicateArgument = link.get_child()  # type: ignore
            print(f"  - \"{child.text}\" is role {link.arg_type} of "
                  f"predicate \"{parent.text}\"")
            entities = [
                entity.text for entity in pack.get(EntityMention, child)
            ]
            print("      Entities in predicate argument:", entities, "\n")
        print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_pipeline(self, texts, expected_outputs, expected_tokens):
    nlp = Pipeline[MultiPack]()

    boxer_config = {
        'pack_name': 'input'
    }

    processor_config = {
        'augment_entry': "ft.onto.base_ontology.Token",
        'other_entry_policy': {
            'type': '',
            'kwargs': {
                "ft.onto.base_ontology.Document": "auto_align",
                "ft.onto.base_ontology.Sentence": "auto_align"
            }
        },
        'type': 'data_augmentation_op',
        'data_aug_op':
            'tests.forte.processors.base.'
            'data_augment_replacement_processor_test.TmpReplacer',
        'data_aug_op_config': {
            'type': '',
            'kwargs': {}
        },
        'augment_pack_names': {
            'kwargs': {
                'input': 'augmented_input'
            }
        }
    }

    nlp.set_reader(reader=StringReader())
    nlp.add(component=MultiPackBoxer(), config=boxer_config)
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=processor_config)
    nlp.initialize()

    for idx, m_pack in enumerate(nlp.process_dataset(texts)):
        aug_pack = m_pack.get_pack('augmented_input')
        self.assertEqual(aug_pack.text, expected_outputs[idx])
        for j, token in enumerate(aug_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])
def main(dataset_dir: str):
    config = yaml.safe_load(open("config.yml", "r"))
    config = Config(config, default_hparams=None)

    pl = Pipeline[DataPack]()
    pl.set_reader(PlainTextReader())
    pl.add(NLTKSentenceSegmenter())
    pl.add(NLTKWordTokenizer())
    pl.add(NLTKPOSTagger())
    pl.add(CoNLLNERPredictor(), config=config.NER)
    pl.add(SRLPredictor(), config=config.SRL)
    pl.initialize()

    for pack in pl.process_dataset(dataset_dir):
        print(colored("Document", 'red'), pack.meta.doc_id)
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            # First method to get entries in a sentence.
            tokens = [(token.text, token.pos) for token in
                      pack.get(Token, sentence)]
            entities = [(entity.text, entity.ner_type) for entity in
                        pack.get(EntityMention, sentence)]
            print(colored("Tokens:", 'red'), tokens, "\n")
            print(colored("EntityMentions:", 'red'), entities, "\n")

            # Second method to get entries in a sentence.
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent: PredicateMention = link.get_parent()  # type: ignore
                child: PredicateArgument = link.get_child()  # type: ignore
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
                entities = [
                    entity.text for entity in pack.get(EntityMention, child)
                ]
                print("      Entities in predicate argument:",
                      entities, "\n")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_encoder_phrase(self):
    pipeline = Pipeline[DataPack]()
    pipeline.set_reader(StringReader())
    pipeline.add(NLTKSentenceSegmenter())
    pipeline.add(NLTKWordTokenizer())
    pipeline.add(NLTKPOSTagger())
    config = {'pattern': 'NP: {<DT>?<JJ>*<NN>}'}
    pipeline.add(NLTKChunker(), config=config)
    pipeline.add(
        PretrainedEncoder(),
        config={'entry_type': 'ft.onto.base_ontology.Phrase'})
    pipeline.initialize()

    sentences = [
        "This tool is called Forte.",
        "The goal of this project is to help you build NLP "
        "pipelines.",
        "NLP has never been made this easy before."
    ]
    document = ' '.join(sentences)
    pack = pipeline.process(document)
    for phrase in pack.get(Phrase):
        self.assertEqual(phrase.embedding.shape, (1, 512, 768))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config_file",
        default="./config.yml",
        help="Config YAML filepath")
    args = parser.parse_args()

    # Load the config.
    config = yaml.safe_load(open(args.config_file, "r"))

    nlp: Pipeline[MultiPack] = Pipeline()
    nlp.set_reader(
        RandomDataSelector(), config=config["data_selector_config"])
    nlp.add(component=MultiPackBoxer(), config=config["boxer_config"])
    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.add(
        component=ReplacementDataAugmentProcessor(),
        config=config["da_processor_config"])

    nlp.initialize()

    for _, m_pack in enumerate(nlp.process_dataset()):
        aug_pack = m_pack.get_pack('augmented_input')
        logging.info(aug_pack.text)
def setUp(self):
    self.nltk = Pipeline()
    self.nltk.set_reader(StringReader())
    self.nltk.add_processor(NLTKSentenceSegmenter())
    self.nltk.add_processor(NLTKWordTokenizer())
    self.nltk.add_processor(NLTKPOSTagger())
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the result to the current
# directory and uses the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()
nlp.run(data_path)
def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print("Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print("Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)

        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx + 1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #     processor=CoNLLNERPredictor(), config=config.NER,
    #     selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(),
        config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # Update the resource to be used in the next conversation.
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")

        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(
        reader=MultiPackTerminalReader(), config=config.reader)
    query_pipeline.add_processor(
        processor=MicrosoftBingTranslator(), config=config.translator)
    query_pipeline.add_processor(
        processor=BertBasedQueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(
            select_name=config.indexer.response_pack_name[0]))
    query_pipeline.add_processor(
        processor=MicrosoftBingTranslator(), config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # Update the resource to be used in the next conversation.
        query_pack = m_pack.get_pack(config.translator.in_pack_name)
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack(config.back_translator.in_pack_name)
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")

        pack = m_pack.get_pack(config.indexer.response_pack_name[0])
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
def test_replace_token(self, texts, expected_outputs, expected_tokens,
                       expected_links):
    for idx, text in enumerate(texts):
        file_path = os.path.join(self.test_dir, f"{idx + 1}.txt")
        with open(file_path, 'w') as f:
            f.write(text)

    nlp = Pipeline[MultiPack]()
    reader_config = {
        "input_pack_name": "input_src",
        "output_pack_name": "output_tgt"
    }
    nlp.set_reader(reader=MultiPackSentenceReader(), config=reader_config)

    nlp.add(component=NLTKWordTokenizer(), selector=AllPackSelector())
    nlp.add(component=NLTKPOSTagger(), selector=AllPackSelector())
    nlp.initialize()

    processor_config = {
        'augment_entry': "ft.onto.base_ontology.Token",
        'other_entry_policy': {
            "kwargs": {
                "ft.onto.base_ontology.Sentence": "auto_align"
            }
        },
        'type': 'data_augmentation_op',
        'data_aug_op':
            'tests.forte.processors.base.'
            'data_augment_replacement_processor_test.TmpReplacer',
        "data_aug_op_config": {
            'kwargs': {}
        },
        'augment_pack_names': {
            'kwargs': {}
        }
    }

    processor = ReplacementDataAugmentProcessor()
    processor.initialize(resources=None, configs=processor_config)

    for idx, m_pack in enumerate(nlp.process_dataset(self.test_dir)):
        src_pack = m_pack.get_pack('input_src')
        tgt_pack = m_pack.get_pack('output_tgt')

        num_mpl_orig, num_mpg_orig = 0, 0
        # Copy the source pack to the target pack.
        tgt_pack.set_text(src_pack.text)

        src_pack.add_entry(Document(src_pack, 0, len(src_pack.text)))
        for anno in src_pack.get(Annotation):
            new_anno = type(anno)(tgt_pack, anno.begin, anno.end)
            tgt_pack.add_entry(new_anno)

            # Create a MultiPackLink.
            m_pack.add_entry(MultiPackLink(m_pack, anno, new_anno))

            # Create a MultiPackGroup.
            m_pack.add_entry(MultiPackGroup(m_pack, [anno, new_anno]))

            # Count the number of MultiPackLinks/MultiPackGroups.
            num_mpl_orig += 1
            num_mpg_orig += 1

        # Create Links in the source pack.
        # The Links should form a tree:
        #
        #                           Link 3
        #                  _________|_________
        #                 |                   |
        #               Link 2                |
        #          _______|________           |
        #         |                |          |
        #       Link 1             |          |
        #    ______|_____          |          |
        #   |            |         |          |
        # token 1     token 2   token 3    token 4  ...  ...
        prev_entry = None
        for i, token in enumerate(src_pack.get(Token)):
            # Avoid overlapping with deleted tokens.
            if i < 10:
                continue
            if prev_entry:
                link = Link(src_pack, prev_entry, token)
                src_pack.add_entry(link)
                prev_entry = link
            else:
                prev_entry = token

        # Create Groups in the target pack.
        # The Groups should form a tree like the Links.
        prev_entry = None
        for i, token in enumerate(tgt_pack.get(Token)):
            # Avoid overlapping with deleted tokens.
            if i < 10:
                continue
            if prev_entry:
                group = Group(tgt_pack, [prev_entry, token])
                tgt_pack.add_entry(group)
                prev_entry = group
            else:
                prev_entry = token

        doc_src = list(src_pack.get(Document))[0]
        doc_tgt = list(tgt_pack.get(Document))[0]

        sent_src = list(src_pack.get(Sentence))[0]
        sent_tgt = list(tgt_pack.get(Sentence))[0]

        # Insert two extra Links in the src_pack. They should not be
        # copied to new_src_pack, because the Document is not copied.
        link_src_low = src_pack.add_entry(Link(src_pack, doc_src, sent_src))
        src_pack.add_entry(Link(src_pack, link_src_low, sent_src))

        # Insert two extra Groups in the tgt_pack. They should not be
        # copied to new_tgt_pack, because the Document is not copied.
        group_tgt_low = tgt_pack.add_entry(
            Group(tgt_pack, [doc_tgt, sent_tgt]))
        tgt_pack.add_entry(Group(tgt_pack, [group_tgt_low, sent_tgt]))

        # Call the augment function explicitly for duplicate replacement
        # to test the False case of the _replace function.
        processor._augment(m_pack, ["input_src", "output_tgt"])

        # Test the insertion and deletion.
        for pack in (src_pack, tgt_pack):
            # Insert an "NLP" at the beginning.
            processor._insert(" NLP ", pack, 0)
            processor._insert(" NLP ", pack, 18)
            processor._insert(" NLP ", pack, len(pack.text) - 2)
            processor._insert("NLP", pack, len(pack.text) - 1)
            # Delete the second token "and".
            processor._delete(list(pack.get(Token))[1])

            # This duplicate insertion should be invalid.
            processor._insert(" NLP ", pack, 0)
            # This insertion overlaps with a replacement.
            # It should be invalid.
            processor._insert(" NLP ", pack, 2)

        processor._process(m_pack)

        new_src_pack = m_pack.get_pack('augmented_input_src')
        new_tgt_pack = m_pack.get_pack('augmented_output_tgt')

        self.assertEqual(new_src_pack.text, expected_outputs[idx] + "\n")

        for j, token in enumerate(new_src_pack.get(Token)):
            self.assertEqual(token.text, expected_tokens[idx][j])

        for sent in new_src_pack.get(Sentence):
            self.assertEqual(sent.text, expected_outputs[idx])

        # Test the copied Links.
        prev_link = None
        for i, link in enumerate(new_src_pack.get(Link)):
            if prev_link:
                self.assertEqual(link.get_parent().tid, prev_link.tid)
                self.assertEqual(link.get_child().text,
                                 expected_links[idx][i])
            prev_link = link

        # Test the copied Groups.
        prev_group = None
        for i, group in enumerate(new_tgt_pack.get(Group)):
            members = group.get_members()
            if isinstance(members[0], Token):
                member_token = members[0]
                member_group = members[1]
            else:
                member_token = members[1]
                member_group = members[0]
            if prev_group:
                self.assertEqual(isinstance(member_token, Token), True)
                self.assertEqual(isinstance(member_group, Group), True)
                self.assertEqual(member_group.tid, prev_group.tid)
                self.assertEqual(member_token.text, expected_links[idx][i])
            prev_group = group

        # The two extra Links should not be copied, because of the
        # missing Document.
        self.assertEqual(len(list(src_pack.get(Link))) - 2,
                         len(list(new_src_pack.get(Link))))
        # The two extra Groups should not be copied, because of the
        # missing Document.
        self.assertEqual(len(list(tgt_pack.get(Group))) - 2,
                         len(list(new_tgt_pack.get(Group))))

        # Test the MultiPackLinks/MultiPackGroups.
        num_mpl_aug, num_mpg_aug = 0, 0
        for mpl in m_pack.get(MultiPackLink):
            parent = mpl.get_parent()
            child = mpl.get_child()
            num_mpl_aug += 1
            self.assertEqual(parent.text, child.text)
            self.assertNotEqual(parent.pack.meta.pack_id,
                                child.pack.meta.pack_id)

        for mpg in m_pack.get(MultiPackGroup):
            members = mpg.get_members()
            num_mpg_aug += 1
            self.assertEqual(members[0].text, members[1].text)
            self.assertNotEqual(members[0].pack.meta.pack_id,
                                members[1].pack.meta.pack_id)

        # Test the number of MultiPackLinks/MultiPackGroups.
        # Reduce the aug and orig counters by 1, because the Document is
        # not copied, so we ignore the MPL and MPG between Documents.
        # The number should be doubled, except for one deletion.
        self.assertEqual(num_mpl_aug - 1, (num_mpl_orig - 1) * 2 - 1)
        self.assertEqual(num_mpg_aug - 1, (num_mpg_orig - 1) * 2 - 1)