Example 1
def complete_and_tokens():
    # Define paths
    pack_input = os.path.join(pack_dir, "nif_raw_struct_links")
    pack_output = os.path.join(pack_dir, "nif_raw_struct_links_token")
    # Record which documents are processed, keeping the output directory
    # structure similar to the input.
    pack_input_index = os.path.join(pack_input, "article.idx")
    pack_output_index = os.path.join(pack_output, "article.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
        filename=os.path.join(pack_dir, "complete_tokenize.log"),
    )
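    # Pipeline overview: read the linked Wikipedia packs, add article titles,
    # run spaCy sentence/token segmentation, add BERT ("bert-base-uncased")
    # subword tokens on top of the spaCy tokens, and write the packs out with
    # an updated index.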

    pipeline = (
        Pipeline(loaded_resource)
        .set_reader(
            DirPackReader(),
            config={
                "suffix": ".json.gz",
                "zip_pack": True,
            },
        )
        # .add(WikiEntityCompletion())
        .add(WikiAddTitle())
        .add(
            SpacyProcessor(),
            config={
                "processors": ["sentence", "tokenize"],
            },
        )
        .add(
            SubwordTokenizer(),
            config={
                "tokenizer_configs": {
                    "pretrained_model_name": "bert-base-uncased"
                },
                "token_source": "ft.onto.base_ontology.Token",
            },
        )
        .add(
            WikiArticleWriter(),
            config={
                "output_dir": pack_output,
                "zip_pack": True,
                "drop_record": True,
                "input_index_file": pack_input_index,
                "output_index_file": pack_output_index,
                "use_input_index": True,
                "serialize_method": "jsonpickle",
            },
        )
        .add(ProgressPrinter())
    )
    pipeline.run(pack_input)
Example 2
def main(input_path: str):
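    # Read serialized packs from `input_path` and hand them to TbfWriter;
    # the output path and system name are left unset in this snippet.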
    pipeline = Pipeline()

    pipeline.set_reader(DirPackReader()).add(
        TbfWriter(),
        config={
            "output_path": None,
            "system_name": None,
        },
    ).initialize()

    pipeline.process(input_path)
Example 3
Pipeline().set_reader(
    MultiNLIReader()
).add(
    # Call spaCy on a remote service (the server from Example 11, port 8008).
    RemoteProcessor(),
    config={
        "url": "http://localhost:8008"
    },
).add(
    # Call AllenNLP on a remote service (the server from Example 8, port 8009).
    RemoteProcessor(),
    config={
        "url": "http://localhost:8009"
    },
).add(
    MultiPackBoxer()
).add(
    TweakData()
).add(
    NLIProcessor(),
    selector=NameMatchSelector(),
    selector_config={
        "select_name": "default",
        "reverse_selection": True,
    }
).add(
    PackNameMultiPackWriter(),
    config={
        "output_dir": output_dir
    }
).add(
    ProgressPrinter(),
).run()
Example 4
    pack_output = os.path.join(pack_dir, "category")
    # Index of the input article packs.
    pack_input_index = os.path.join(pack_input, "article.idx")
    # Store which documents have a category.
    pack_output_index = os.path.join(pack_output, "category.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
        filename=os.path.join(pack_dir, "category.log"),
    )
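    # Read the article-categories dump and attach category information to the
    # previously serialized article packs (located through pack_input_index),
    # then write the updated packs along with a new index.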

    Pipeline(resources).set_reader(
        WikiCategoryReader(),
        config={
            "pack_index": pack_input_index,
            "pack_dir": pack_input,
        },
    ).add(
        WikiArticleWriter(),
        config={
            "output_dir": pack_output,
            "zip_pack": True,
            "drop_record": True,
            "input_index_file": pack_input_index,
            "output_index_file": pack_output_index,
            "use_input_index": True,
            "overwrite": True,
        },
    ).run(os.path.join(base_dir, "article_categories_en.tql.bz2"))
Example 5
from forte.huggingface import ZeroShotClassifier
from forte.stanza import StandfordNLPProcessor

from forte import Pipeline
from forte.data.readers import TerminalReader
from forte.processors.stave import StaveProcessor

nlp = Pipeline()
nlp.set_reader(TerminalReader())
nlp.add(StandfordNLPProcessor())
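# Zero-shot classification of each utterance read from the terminal,
# restricted to the candidate labels below.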
nlp.add(
    ZeroShotClassifier(),
    config={
        "candidate_labels": [
            "travel",
            "cooking",
            "dancing",
            "exploration",
        ],
    },
)
nlp.add(StaveProcessor())
nlp.initialize()
nlp.run()
Example 6
from facets.kbp_reader import EREReader
from facets.nli.analysis import DebugProcessor
from forte import Pipeline

import sys

kbp_dir = sys.argv[1]

Pipeline().set_reader(EREReader()).run([kbp_dir])
Example 7
from forte import Pipeline
from forte.data.readers import DirPackReader
from forte.processors.stave import StaveProcessor

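# Visualize the serialized packs in Stave on port 8880; use_pack_name
# presumably keys each document by its pack name.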
Pipeline(
    ontology_file="conf/full.json"
).set_reader(
    DirPackReader()
).add(
    StaveProcessor(),
    config={
        "port": 8880,
        "use_pack_name": True,
    }
).run(
    # "/home/hector/data/kbp/train"
    "/Users/hector.liu/Downloads/train"
)
Example 8
from fortex.allennlp import AllenNLPProcessor

from forte import Pipeline
from forte.data.readers import RawDataDeserializeReader

Pipeline().set_reader(RawDataDeserializeReader()).add(
    AllenNLPProcessor(),
    config={
        "processors": ["tokenize", "pos", "lemma", "depparse", "srl"],
        "infer_batch_size": 1,
    }
).serve(port=8009)
Example 9
import sys

from forte import Pipeline
from forte.data.readers import DirPackReader
from forte.processors.stave import StaveProcessor

if __name__ == '__main__':
    input_dir = sys.argv[1]
    onto_file = sys.argv[2]
    nlp = Pipeline(ontology_file=onto_file)

    nlp.set_reader(
        DirPackReader(), config={
            "suffix": ".json.gz"
        }
    ).add(
        StaveProcessor()
    ).run(input_dir)
Example 10
import sys

import IPython

from facets.utils import ProgressPrinter
from forte import Pipeline
from forte.data import DataPack
from forte.data.readers.deserialize_reader import DirPackReader
from forte.processors.base import PackProcessor


class PackExplorer(PackProcessor):
    """Drop into an interactive IPython shell to inspect each incoming pack."""

    def _process(self, pack: DataPack):
        IPython.embed()


if __name__ == "__main__":
    Pipeline().set_reader(
        DirPackReader(),
        config={
            "suffix": ".pickle.gz",
            "zip_pack": True,
            "serialize_method": "pickle"
        },
    ).add(ProgressPrinter()).run(sys.argv[1])
Example 11
from forte import Pipeline
from forte.data.readers import RawDataDeserializeReader
from fortex.spacy import SpacyProcessor

Pipeline().set_reader(RawDataDeserializeReader()).add(
    SpacyProcessor(),
    config={
        "processors": ["sentence"]
    }
).serve(port=8008)