Example #1
    def finish(self, resource: Resources):
        # Drop words that occur fewer than `min_frequency` times so they do
        # not enter the vocabulary; iterate over a copy of the keys because
        # entries are deleted during the loop.
        for word in list(self.word_cnt):
            if self.word_cnt[word] < self.min_frequency:
                del self.word_cnt[word]

        word_alphabet = Alphabet("word", self.word_cnt)
        char_alphabet = Alphabet("character", self.char_cnt)
        ner_alphabet = Alphabet("ner", self.ner_cnt)

        embedding_dict = load_glove_embedding(self.embedding_path)

        for word in embedding_dict:
            if word not in word_alphabet.instance2index:
                word_alphabet.add(word)

        word_embedding_table = construct_word_embedding_table(
            embedding_dict, word_alphabet)

        logging.info('word embedding table size: %s',
                     word_embedding_table.size())

        # Adding vocabulary information to resource.
        resource.update(word_alphabet=word_alphabet,
                        char_alphabet=char_alphabet,
                        ner_alphabet=ner_alphabet,
                        word_embedding_table=word_embedding_table)
Example #2
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        # self._model = AutoModel.from_pretrained(
        #     self.configs.model_path
        # )
        # self._tokenizer = AutoTokenizer.from_pretrained(
        #     'distilbert-base-uncased'
        # )
        self._model = resources.get("coref_model")
        self._tokenizer = resources.get("tokenizer")

        self.events2sents: Dict[int, Sentence] = {}
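A minimal sketch of how the "coref_model" and "tokenizer" resources read in the initialize() above could be populated before the pipeline runs. The checkpoint name follows the commented-out distilbert-base-uncased hint; everything else here is an assumption, not part of the original example.

from transformers import AutoModel, AutoTokenizer

from forte import Pipeline
from forte.common import Resources
from forte.data.data_pack import DataPack

# Load the pre-trained model and tokenizer once and share them through
# Resources; each processor can then fetch them in its initialize() via
# resources.get("coref_model") and resources.get("tokenizer").
resources = Resources()
resources.update(
    coref_model=AutoModel.from_pretrained("distilbert-base-uncased"),
    tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased"),
)

# The shared resources are handed to the pipeline at construction time,
# as in the test setUp further below.
pipeline = Pipeline[DataPack](resources)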
Example #3
    def setUp(self):
        self.resources: Resources = Resources()
        self.resources.update(redirects={})

        self.data_dir: str = os.path.abspath(
            os.path.join(
                os.path.dirname(os.path.realpath(__file__)),
                "../../../../../data_samples/dbpedia",
            ))

        self.output_dir = tempfile.TemporaryDirectory()

        self.raw_output: str = os.path.join(self.output_dir.name, "raw")

        pl = Pipeline[DataPack](self.resources)
        pl.set_reader(DBpediaWikiReader())
        pl.add(
            WikiArticleWriter(),
            config={
                "output_dir": self.raw_output,
                "zip_pack": True,
                "drop_record": True,
            },
        )
        pl.run(os.path.join(self.data_dir, "nif_context.tql"))
Example #4
    def test_document_and_passage_mode(self, doc_mode):
        resources: Resources = Resources()
        config: HParams = HParams({"doc_mode": doc_mode}, default_hparams=None)
        self.reader.initialize(resources, config)
        data_packs: List[DataPack] = list(
            self.reader.iter(self.data_dir, 'dev'))

        # get all queries and all documents
        queries: List[Query] = []
        documents: Dict[str, Document] = dict()
        for data_pack in data_packs:
            query_entries = list(data_pack.get_entries_by_type(Query))
            doc_entries = list(data_pack.get_entries_by_type(Document))

            self.assertTrue(len(query_entries) + len(doc_entries) == 1)

            if len(query_entries) > 0:
                query_entry: Query = query_entries[0]
                queries.append(query_entry)
            else:
                doc_entry: Document = doc_entries[0]
                documents[data_pack.meta.doc_id] = doc_entry

        # match text of documents relevant to the queries to the actual text
        for i, query in enumerate(queries):
            expected_query = self.expected_queries[i]
            expected_ids = self.expected_doc_ids[doc_mode][i]
            self.assertEqual(query.query, expected_query)
            self.assertCountEqual(query.doc_ids["relevant_docs"], expected_ids)
            for doc_id in expected_ids:
                expected_text = self.get_expected_text(doc_id, doc_mode)
                self.assertEqual(documents[doc_id].text, expected_text)
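Since the setUp above allocates a tempfile.TemporaryDirectory for the written packs, a matching tearDown would normally release it; the following is a sketch under that assumption, not something present in the original test.

    def tearDown(self):
        # Remove the temporary pack output created in setUp.
        self.output_dir.cleanup()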
Example #5
    def initialize(self, resources: Resources, configs: Config):
        self.redirects = resources.get('redirects')

        # These NIF readers organize the statements by their RDF context;
        # here each context corresponds to one wiki page, which lets us
        # read the information more systematically.
        self.struct_reader = NIFBufferedContextReader(
            configs.nif_page_structure)
        self.link_reader = NIFBufferedContextReader(configs.nif_text_links)
Example #6
    def initialize(self, resources: Resources, configs: Config):
        # pylint: disable=attribute-defined-outside-init
        self.pack_index = read_index(configs.pack_index)
        self.pack_dir = configs.pack_dir

        self.redirects = resources.get('redirects')

        self.literal_info_reader = NIFBufferedContextReader(
            configs.mapping_literals)
        self.object_info_reader = NIFBufferedContextReader(
            configs.mapping_objects)

        # Set up logging.
        f_handler = logging.FileHandler(configs.reading_log)
        f_format = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        f_handler.setFormatter(f_format)
        self.logger.handlers = [f_handler]
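A sketch of how the configuration keys consumed by the first initialize() above (nif_page_structure, nif_text_links) might be supplied, mirroring the reader.initialize(resources, config) call in the test example. The Config import path, the reader variable name, and the .tql file names are assumptions.

from forte.common import Resources
from forte.common.configuration import Config

resources = Resources()
resources.update(redirects={})  # or a pre-built redirect map

config = Config(
    {
        # Placeholder paths; point these at the actual NIF dump files.
        "nif_page_structure": "nif_page_structure_en.tql",
        "nif_text_links": "nif_text_links_en.tql",
    },
    default_hparams=None,
)

# `nif_reader` stands for an instance of the reader class whose
# initialize() is shown above.
nif_reader.initialize(resources, config)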
Example #7
import logging
import os
import pickle
import sys

from forte import Pipeline
from forte.common import Resources
from forte.data.data_pack import DataPack
from forte.datasets.wikipedia.dbpedia import WikiArticleWriter

from facets.wiki.processors.wiki import WikiCategoryReader

if __name__ == "__main__":
    base_dir = sys.argv[1]
    pack_dir = os.path.join(base_dir, "packs")

    with open(os.path.join(pack_dir, "redirects.pickle"), "rb") as f:
        redirect_map = pickle.load(f)
    resources = Resources()
    resources.update(redirects=redirect_map)

    # Define paths
    pack_input = os.path.join(pack_dir, "nif_raw_struct_links")
    # Should write to same files.
    # pack_output = pack_input
    pack_output = os.path.join(pack_dir, "category")
    # Index of the articles in the input pack directory.
    pack_input_index = os.path.join(pack_input, "article.idx")
    # Store which documents have category.
    pack_output_index = os.path.join(pack_output, "category.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
    )

    # Assemble the pipeline: WikiCategoryReader (imported above) reads the
    # existing packs, WikiArticleWriter writes them back out under the new
    # index, and ProgressPrinter (imported elsewhere in the original script)
    # reports progress. The original chain also adds a processor configured
    # with "token_source": "ft.onto.base_ontology.Token" before the writer.
    pipeline = Pipeline[DataPack](resources)
    pipeline.set_reader(WikiCategoryReader())
    pipeline.add(
        WikiArticleWriter(),
        config={
            "output_dir": pack_output,
            "zip_pack": True,
            "drop_record": True,
            "input_index_file": pack_input_index,
            "output_index_file": pack_output_index,
            "use_input_index": True,
            "serialize_method": "jsonpickle",
        },
    )
    pipeline.add(ProgressPrinter())
    pipeline.run(pack_input)


if __name__ == "__main__":
    base_dir = sys.argv[1]
    pack_dir = os.path.join(base_dir, "packs")

    with open(os.path.join(pack_dir, "redirects.pickle"), "rb") as f:
        redirect_map = pickle.load(f)
    loaded_resource = Resources()
    loaded_resource.update(redirects=redirect_map)

    complete_and_tokens()