Example #1
    def setUp(self) -> None:
        self.reader = OntonotesReader()
        data_path = os.path.join(os.path.dirname(
            os.path.dirname(forte.__file__)),
            "examples/data_samples/ontonotes/00/abc_0059.gold_conll"
        )
        self.data_pack = list(self.reader.parse_pack(data_path))[0]
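For reference, the pack parsed in setUp above can be walked with the same generic get() calls used in the pipeline examples further down the page. A minimal sketch, written as a hypothetical extra test method (it assumes Sentence and Token are imported from ft.onto.base_ontology, as in Example #10):

    def test_iterate_entries(self):  # hypothetical helper, not part of the original test
        # Walk sentences and their tokens in the parsed data pack; the
        # whitespace-join check mirrors the assertions in Examples #3 and #18.
        for sentence in self.data_pack.get(Sentence):
            tokens = [token.text for token in self.data_pack.get(Token, sentence)]
            self.assertEqual(sentence.text, " ".join(tokens))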
Example #2
def pack_example(input_path, output_path):
    """
    This example read data from input path and serialize to output path.
    Args:
        input_path:
        output_path:

    Returns:

    """
    print("Pack serialization example.")
    nlp = Pipeline[DataPack]()

    nlp.set_reader(OntonotesReader())
    nlp.add(NLTKSentenceSegmenter())
    nlp.add(NLTKWordTokenizer())
    nlp.add(NLTKPOSTagger())

    # This is a simple writer that serializes the result to the given output
    # directory, using the DocID field in the data pack as the file name.
    nlp.add(PackNameJsonPackWriter(), {
        'output_dir': output_path,
        'indent': 2,
        'overwrite': True,
    })

    nlp.run(input_path)
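The function above can be invoked directly as a small script. A minimal sketch, with placeholder paths (the input directory follows the sample-data layout used elsewhere on this page; the output directory name is arbitrary):

if __name__ == "__main__":
    # Placeholder paths: a directory of .gold_conll files as input and a
    # writable directory for the serialized JSON packs as output.
    pack_example(
        "examples/data_samples/ontonotes/00",
        "output_packs",
    )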
Example #3
    def test_process_multi_next(self):
        from forte.data.readers import OntonotesReader

        # Define and configure the Pipeline
        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())

        pack_name = 'test_pack'
        nlp.add(MultiPackBoxer(), {'pack_name': pack_name})
        nlp.add(DummyRelationExtractor(),
                config={"batcher": {
                    "batch_size": 5
                }},
                selector=NameMatchSelector(select_name=pack_name))
        nlp.initialize()

        dataset_path = data_samples_root + "/ontonotes/00"

        # get processed pack from dataset
        m_pack: MultiPack
        for m_pack in nlp.process_dataset(dataset_path):
            pack = m_pack.get_pack(pack_name)
            # get sentence from pack
            for sentence in pack.get(Sentence):
                sent_text = sentence.text

                # second method to get entry in a sentence
                tokens = [token.text for token in pack.get(Token, sentence)]
                self.assertEqual(sent_text, " ".join(tokens))
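As an aside, the "second method" comment above uses the range-based form of get(); the request-based get_data() form is shown at length in Example #10 below. A minimal sketch of that form, which would sit in the same loop over packs (the request contents here are illustrative):

            # Sketch only: request-based retrieval, mirroring Example #10.
            # Each yielded dict carries the sentence "context" and its "offset".
            for instance in pack.get_data(Sentence, request={Token: ["pos"]}):
                print(instance["offset"], instance["context"])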
Example #4
    def create_pack_iterator(self) -> Iterator[DataPack]:
        srl_train_reader = OntonotesReader(cache_in_memory=True)
        train_pl: Pipeline = Pipeline()
        train_pl.set_reader(srl_train_reader)
        train_pl.initialize()
        pack_iterator = train_pl.process_dataset(self.train_path)

        return pack_iterator
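A minimal sketch of consuming the iterator returned above, from elsewhere in the same trainer class (the print is a placeholder; self.train_path is whatever OntoNotes directory the trainer was configured with):

        # Sketch only: packs are produced lazily by the reader pipeline above;
        # cache_in_memory=True presumably allows them to be revisited across epochs.
        for pack in self.create_pack_iterator():
            print(pack.pack_name)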
Example #5
    def setUp(self) -> None:
        file_dir_path = os.path.dirname(__file__)
        data_path = os.path.join(file_dir_path, os.pardir, os.pardir,
                                 'test_data', 'ontonotes')

        pipeline: Pipeline = Pipeline()
        pipeline.set_reader(OntonotesReader())
        pipeline.initialize()
        self.data_pack: DataPack = pipeline.process_one(data_path)
Example #6
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = "data_samples/ontonotes/00/"
Example #7
    def setUp(self):
        # Define and configure the Pipeline
        self.dataset_path = "examples/data_samples/ontonotes/00"

        self.nlp = Pipeline()

        self.nlp.set_reader(OntonotesReader())
        self.nlp.add_processor(DummyPackProcessor())

        self.nlp.initialize()
Example #8
    def setUp(self) -> None:
        file_dir_path = os.path.dirname(__file__)
        data_path = os.path.abspath(
            os.path.join(file_dir_path, '../../../', 'data_samples',
                         'ontonotes/one_file'))

        pipeline: Pipeline = Pipeline()
        pipeline.set_reader(OntonotesReader())
        pipeline.initialize()
        self.data_pack: DataPack = pipeline.process_one(data_path)
Example #9
    def setUp(self) -> None:
        # Define and configure the Pipeline
        self.dataset_path = "examples/ontonotes_sample_dataset/00"

        self.nlp = Pipeline()

        self.nlp.set_reader(OntonotesReader())
        self.processor = DummyRelationExtractor()
        self.nlp.add_processor(self.processor)

        self.nlp.initialize()
Example #10
class DataPackTest(unittest.TestCase):

    def setUp(self) -> None:
        self.reader = OntonotesReader()
        data_path = os.path.join(os.path.dirname(
            os.path.dirname(forte.__file__)),
            "examples/data_samples/ontonotes/00/abc_0059.gold_conll"
        )
        self.data_pack = list(self.reader.parse_pack(data_path))[0]

    def test_get_data(self):
        requests = {
            Sentence: ["speaker"],
            Token: ["pos", "sense"],
            EntityMention: [],
            PredicateMention: [],
            PredicateArgument: {
                "fields": [],
                "unit": "Token"
            },
            PredicateLink: {
                "component": self.reader.component_name,
                "fields": ["parent", "child", "arg_type"]
            }
        }

        # case 1: get sentence context from the beginning
        instances = list(self.data_pack.get_data(Sentence))
        self.assertEqual(len(instances), 2)
        self.assertEqual(instances[1]["offset"],
                         len(instances[0]["context"]) + 1)

        # case 2: get sentence context from the second instance
        instances = list(self.data_pack.get_data(Sentence, skip_k=1))
        self.assertEqual(len(instances), 1)
        self.assertEqual(instances[0]["offset"], 165)

        # case 3: get document context
        instances = list(self.data_pack.get_data(Document, skip_k=0))
        self.assertEqual(len(instances), 1)
        self.assertEqual(instances[0]["offset"], 0)

        # case 4: test offset out of index
        instances = list(self.data_pack.get_data(Sentence, skip_k=10))
        self.assertEqual(len(instances), 0)

        # case 5: get entries
        instances = list(self.data_pack.get_data(Sentence,
                                                 request=requests,
                                                 skip_k=1))
        self.assertEqual(len(instances[0].keys()), 9)
        self.assertEqual(len(instances[0]["PredicateLink"]), 4)
        self.assertEqual(len(instances[0]["Token"]), 5)
        self.assertEqual(len(instances[0]["EntityMention"]), 3)
Example #11
    def setUp(self) -> None:
        self.nlp = Pipeline()
        self.nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = HParams({"batcher": {
            "batch_size": 5
        }}, dummy.default_hparams())
        self.nlp.add_processor(dummy, config=config)
        self.nlp.initialize()

        self.data_path = \
            "forte/processors/base/tests/data_samples/ontonotes/00/"
Example #12
    def test_serialize_deserialize_processor(self):
        pipe_serialize = Pipeline[DataPack]()
        pipe_serialize.set_reader(OntonotesReader())
        pipe_serialize.add(
            AnnotationRemover(),
            # Remove tokens and sentences from OntonotesReader.
            {
                "removal_types": [
                    "ft.onto.base_ontology.Token",
                    "ft.onto.base_ontology.Sentence",
                ]
            },
        )
        pipe_serialize.add(PeriodSentenceSplitter())
        pipe_serialize.add(WhiteSpaceTokenizer())

        with tempfile.TemporaryDirectory() as output_dir:
            pipe_serialize.add(
                PackNameJsonPackWriter(),
                {
                    "output_dir": output_dir,
                    "indent": 2,
                },
            )

            pipe_serialize.run(self.data_path)

            pipe_deserialize = Pipeline[DataPack]()
            pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
            pipe_deserialize.initialize()

            token_counts: Dict[str, int] = {}

            # This basically tests whether the deserialized data is
            # still the same as expected.
            pack: DataPack
            for pack in pipe_deserialize.process_dataset(output_dir):
                tokens: List[Token] = list(pack.get(Token))
                token_counts[pack.pack_name] = len(tokens)

            expected_count = {
                "bn/abc/00/abc_0039": 72,
                "bn/abc/00/abc_0019": 370,
                "bn/abc/00/abc_0059": 39,
                "bn/abc/00/abc_0009": 424,
                "bn/abc/00/abc_0029": 487,
                "bn/abc/00/abc_0069": 428,
                "bn/abc/00/abc_0049": 73,
            }

            assert token_counts == expected_count
Example #13
    def test_serialize_deserialize_processor(self):
        pipe_serialize = Pipeline[DataPack]()
        pipe_serialize.set_reader(OntonotesReader())
        pipe_serialize.add(
            AnnotationRemover(),
            # Remove tokens and sentences from OntonotesReader.
            {
                'removal_types': [
                    'ft.onto.base_ontology.Token',
                    'ft.onto.base_ontology.Sentence',
                ]
            })
        pipe_serialize.add(NLTKSentenceSegmenter())
        pipe_serialize.add(NLTKWordTokenizer())
        pipe_serialize.add(NLTKPOSTagger())

        output_path = tempfile.mkdtemp()

        pipe_serialize.add(DocIdJsonPackWriter(), {
            'output_dir': output_path,
            'indent': 2,
        })

        dataset_path = "data_samples/ontonotes/00"
        pipe_serialize.run(dataset_path)

        pipe_deserialize = Pipeline[DataPack]()
        pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
        pipe_deserialize.initialize()

        token_counts: Dict[str, int] = {}

        # This basically tests whether the deserialized data is still the same
        # as expected.
        pack: DataPack
        for pack in pipe_deserialize.process_dataset(output_path):
            tokens: List[Token] = list(pack.get(Token))
            token_counts[pack.pack_name] = len(tokens)

        expected_count = {
            'bn/abc/00/abc_0039': 72,
            'bn/abc/00/abc_0019': 370,
            'bn/abc/00/abc_0059': 39,
            'bn/abc/00/abc_0009': 424,
            'bn/abc/00/abc_0029': 487,
            'bn/abc/00/abc_0069': 428,
            'bn/abc/00/abc_0049': 73
        }

        assert token_counts == expected_count
        shutil.rmtree(output_path)
Example #14
    def setUp(self):

        self._port: int = 8880
        _file_dir_path: str = os.path.dirname(__file__)
        self._project_name: str = "serialization_pipeline_test"
        self._dataset_dir: str = os.path.abspath(
            os.path.join(_file_dir_path, "../../../",
                         "data_samples/ontonotes/00/"))
        self._test_specs_dir: str = os.path.abspath(
            os.path.join(_file_dir_path, "../data/ontology/test_specs/"))
        self._stave_processor = StaveProcessor()

        self.pl = Pipeline[DataPack](ontology_file=os.path.join(
            self._test_specs_dir, "test_stave_ontology.json"))
        self.pl.set_reader(OntonotesReader())
Example #15
    def setUp(self):
        file_dir_path = os.path.dirname(__file__)
        data_path = os.path.join(file_dir_path, "../../../../", 'data_samples',
                                 'ontonotes', '00')

        self.main_output = tempfile.TemporaryDirectory()

        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())
        nlp.add(
            PackIdJsonPackWriter(), {
                'output_dir': os.path.join(self.main_output.name, 'packs'),
                'indent': 2,
                'overwrite': True,
            })
        nlp.run(data_path)
Example #16
    def setUp(self):
        file_dir_path = os.path.dirname(__file__)
        data_path = os.path.join(file_dir_path, "../../../../", "data_samples",
                                 "ontonotes", "00")

        self.main_output = tempfile.TemporaryDirectory()

        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())
        nlp.add(
            PackIdJsonPackWriter(),
            {
                "output_dir": os.path.join(self.main_output.name, "packs"),
                "indent": 2,
                "overwrite": True,
            },
        )
        nlp.run(data_path)
Example #17
    def testPackWriting(self, config_data):
        zip_pack, method = config_data

        with tempfile.TemporaryDirectory() as main_output:
            write_pipeline = Pipeline[DataPack]()
            write_pipeline.set_reader(OntonotesReader())
            write_pipeline.add(
                AutoNamePackWriter(),
                {
                    "output_dir": os.path.join(main_output, "packs"),
                    "overwrite": True,
                    "zip_pack": zip_pack,
                    "serialize_method": method,
                },
            )
            write_pipeline.run(self.data_path)

            read_pipeline = Pipeline[DataPack]()
            read_pipeline.set_reader(DirPackReader())
Example #18
    def test_process_next(self):
        # Define and configure the Pipeline
        nlp = Pipeline[DataPack]()
        nlp.set_reader(OntonotesReader())
        dummy = DummyRelationExtractor()
        config = {"batcher": {"batch_size": 5}}
        nlp.add(dummy, config=config)
        nlp.initialize()

        dataset_path = os.path.join(data_samples_root, "ontonotes/00")

        # get processed pack from dataset
        for pack in nlp.process_dataset(dataset_path):
            # get sentence from pack
            for sentence in pack.get(Sentence):
                sent_text = sentence.text

                # second method to get entry in a sentence
                tokens = [token.text for token in pack.get(Token, sentence)]
                self.assertEqual(sent_text, " ".join(tokens))
Example #19
    def train(self):
        schemes: Dict = self.train_preprocessor.request["schemes"]
        text_extractor: BaseExtractor = schemes["text_tag"]["extractor"]
        char_extractor: BaseExtractor = schemes["char_tag"]["extractor"]
        link_extractor: BaseExtractor = schemes["pred_link_tag"]["extractor"]

        self.model: LabeledSpanGraphNetwork = LabeledSpanGraphNetwork(
            word_vocab=text_extractor.vocab.to_dict(),
            char_vocab_size=len(char_extractor.vocab),
            label_vocab=link_extractor.vocab.to_dict(),
        )

        optim: Optimizer = SGD(
            self.model.parameters(),
            lr=self.lr,
            momentum=self.momentum,
            nesterov=self.nesterov,
        )

        srl_val_reader = OntonotesReader()
        predictor = RelationPredictor()
        val_pl: Pipeline = Pipeline()
        val_pl.set_reader(srl_val_reader)
        val_pl.add(predictor, {})
        # TODO: We need an evaluator here for SRL task

        logger.info("Start training.")
        epoch = 0
        train_loss: float = 0.0
        train_total: int = 0

        while epoch < self.num_epochs:
            epoch += 1

            # Get iterator of preprocessed batch of train data
            train_batch_iter: Iterator[
                Batch
            ] = self.train_preprocessor.get_train_batch_iterator()

            for batch in train_batch_iter:
                char_tensor: Tensor = batch["char_tag"]["data"]
                char_masks: List[Tensor] = batch["char_tag"]["masks"]
                text_tensor: Tensor = batch["text_tag"]["data"]
                text_mask: Tensor = batch["text_tag"]["masks"][0]
                text: List[List[str]] = batch["raw_text_tag"]["data"]
                pred_link_features: List[Feature] = batch["pred_link_tag"][
                    "features"
                ]

                optim.zero_grad()

                output: LabeledSpanGraphNetwork.ReturnType = self.model(
                    text=text,
                    char_batch=char_tensor,
                    char_masks=char_masks,
                    text_batch=text_tensor,
                    text_mask=text_mask,
                    srl_features=pred_link_features,
                )

                output["loss"].backward()
                optim.step()

                train_loss += output["loss"].item()
                train_total += 1

            logger.info(
                "%dth Epoch training, " "loss: %f",
                epoch,
                train_loss / train_total,
            )

            train_loss = 0.0
            train_total = 0

            val_pl.run(self.val_path)

            logger.info("%dth Epoch evaluating", epoch)
Example #20
from texar.torch import HParams

from forte.pipeline import Pipeline
from forte.data.readers import OntonotesReader
from forte.processors.nltk_processors import NLTKWordTokenizer, \
    NLTKPOSTagger, NLTKSentenceSegmenter
from forte.processors.writers import DocIdJsonPackWriter

nlp = Pipeline()
reader = OntonotesReader()

data_path = "../data_samples/ontonotes/00/"

nlp.set_reader(reader)
nlp.add_processor(NLTKSentenceSegmenter())
nlp.add_processor(NLTKWordTokenizer())
nlp.add_processor(NLTKPOSTagger())

# This is a simple writer that serializes the result to the current directory and
# will use the DocID field in the data pack as the file name.
nlp.add_processor(
    DocIdJsonPackWriter(),
    HParams(
        {'output_dir': '.'},
        DocIdJsonPackWriter.default_hparams(),
    ))

nlp.initialize()

nlp.run(data_path)
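The JSON packs written above can be read back with the deserialization reader used in Example #13. A minimal sketch, assuming the writer's output directory ('.') is still in place; import paths and attribute names follow that example and may differ across Forte versions:

from forte.data.readers import RecursiveDirectoryDeserializeReader  # assumed import path
from ft.onto.base_ontology import Token

pipe_deserialize = Pipeline()
pipe_deserialize.set_reader(RecursiveDirectoryDeserializeReader())
pipe_deserialize.initialize()

for pack in pipe_deserialize.process_dataset('.'):
    # pack_name mirrors the DocID used by the writer above.
    print(pack.pack_name, len(list(pack.get(Token))))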
Example #21
    def setUp(self) -> None:
        self.reader = OntonotesReader()
        data_path = "data_samples/ontonotes/00/abc_0059.gold_conll"
        self.data_pack = list(self.reader.parse_pack(data_path))[0]
Example #22
    def testMultiPackWriting(self, config_data):
        zip_pack, method = config_data

        # Use a different sub-directory to avoid conflicts.
        subdir = f"{zip_pack}_{method}"

        with tempfile.TemporaryDirectory() as main_output:
            # Prepare input data.
            prepared_input: str = os.path.join(main_output, subdir,
                                               "input_packs")
            data_output: str = os.path.join(main_output, subdir, "output")
            suffix = ".pickle" if method == "pickle" else ".json"
            if zip_pack:
                suffix = suffix + ".gz"

            nlp = Pipeline[DataPack]()
            nlp.set_reader(OntonotesReader())
            nlp.add(
                PackIdJsonPackWriter(),
                {
                    "output_dir": prepared_input,
                    "overwrite": True,
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                },
            )
            nlp.run(self.data_path)

            # Convert to multi pack.
            coref_pl = Pipeline()

            coref_pl.set_reader(
                DirPackReader(),
                {
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                    "suffix": suffix,
                },
            )
            coref_pl.add(MultiPackBoxer())
            coref_pl.add(CopySentence())
            coref_pl.add(NaiveCoref())

            coref_pl.add(
                PackIdMultiPackWriter(),
                config={
                    "output_dir": data_output,
                    "overwrite": True,
                    "serialize_method": method,
                    "zip_pack": zip_pack,
                },
            )
            coref_pl.run(prepared_input)

            self.assertTrue(
                os.path.exists(os.path.join(data_output, "multi.idx")))
            self.assertTrue(
                os.path.exists(os.path.join(data_output, "pack.idx")))
            self.assertTrue(os.path.exists(os.path.join(data_output, "packs")))
            self.assertTrue(os.path.exists(os.path.join(data_output, "multi")))

            # Read the multi pack again.
            mp_pipeline = Pipeline()

            mp_pipeline.set_reader(
                MultiPackDirectoryReader(),
                config={
                    "suffix": suffix,
                    "zip_pack": zip_pack,
                    "serialize_method": method,
                    "data_pack_dir": os.path.join(data_output, "packs"),
                    "multi_pack_dir": os.path.join(data_output, "multi"),
                },
            ).initialize()

            re: CrossDocEntityRelation
            for mp in mp_pipeline.process_dataset():
                for re in mp.get(CrossDocEntityRelation):
                    self.assertEqual(re.get_parent().text, re.get_child().text)