def finish(self, resource: Resources):
    # If a singleton is in the pre-trained embedding dict,
    # set the count to min_occur + c.
    # Iterate over a copy of the keys so low-frequency words can be
    # deleted safely while iterating.
    for word in list(self.word_cnt):
        if self.word_cnt[word] < self.min_frequency:
            del self.word_cnt[word]

    word_alphabet = Alphabet("word", self.word_cnt)
    char_alphabet = Alphabet("character", self.char_cnt)
    ner_alphabet = Alphabet("ner", self.ner_cnt)

    embedding_dict = load_glove_embedding(self.embedding_path)

    for word in embedding_dict:
        if word not in word_alphabet.instance2index:
            word_alphabet.add(word)

    word_embedding_table = construct_word_embedding_table(
        embedding_dict, word_alphabet)
    logging.info('word embedding table size: %s',
                 word_embedding_table.size())

    # Adding vocabulary information to resource.
    resource.update(word_alphabet=word_alphabet,
                    char_alphabet=char_alphabet,
                    ner_alphabet=ner_alphabet,
                    word_embedding_table=word_embedding_table)
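# A minimal sketch (an assumption, not part of the original file) of how a
# downstream component could read back what finish() stored, using the same
# Resources.get interface seen in the initialize() methods below.
from forte.common import Resources


def load_vocab_from_resources(resource: Resources):
    # Retrieve the alphabets and embedding table shared through resources.
    word_alphabet = resource.get("word_alphabet")
    ner_alphabet = resource.get("ner_alphabet")
    embedding_table = resource.get("word_embedding_table")
    return word_alphabet, ner_alphabet, embedding_table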
def initialize(self, resources: Resources, configs: Config):
    super().initialize(resources, configs)
    # self._model = AutoModel.from_pretrained(
    #     self.configs.model_path
    # )
    # self._tokenizer = AutoTokenizer.from_pretrained(
    #     'distilbert-base-uncased'
    # )
    self._model = resources.get("coref_model")
    self._tokenizer = resources.get("tokenizer")
    self.events2sents: Dict[int, Sentence] = {}
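# For resources.get("coref_model") and resources.get("tokenizer") to succeed,
# the resources must be populated before the pipeline is initialized. A
# minimal sketch under assumptions: the checkpoint name is taken from the
# commented-out code above, and loading in the driver script is hypothetical;
# the real project may load a fine-tuned coreference model instead.
from forte.common import Resources
from transformers import AutoModel, AutoTokenizer

resources = Resources()
# Load the model and tokenizer once and share them with every processor.
resources.update(
    coref_model=AutoModel.from_pretrained("distilbert-base-uncased"),
    tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased"),
)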
def setUp(self):
    self.resources: Resources = Resources()
    self.resources.update(redirects={})

    self.data_dir: str = os.path.abspath(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "../../../../../data_samples/dbpedia",
        ))

    self.output_dir = tempfile.TemporaryDirectory()
    self.raw_output: str = os.path.join(self.output_dir.name, "raw")

    pl = Pipeline[DataPack](self.resources)
    pl.set_reader(DBpediaWikiReader())
    pl.add(
        WikiArticleWriter(),
        config={
            "output_dir": self.raw_output,
            "zip_pack": True,
            "drop_record": True,
        },
    )
    pl.run(os.path.join(self.data_dir, "nif_context.tql"))
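# Since setUp creates a tempfile.TemporaryDirectory, the test class would
# normally release it after each test. A minimal tearDown sketch, not part of
# the original snippet:
def tearDown(self):
    # Remove the temporary output directory created in setUp.
    self.output_dir.cleanup()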
def test_document_and_passage_mode(self, doc_mode):
    resources: Resources = Resources()
    config: HParams = HParams({"doc_mode": doc_mode}, default_hparams=None)
    self.reader.initialize(resources, config)

    data_packs: List[DataPack] = [
        data_pack for data_pack in self.reader.iter(self.data_dir, 'dev')
    ]

    # Collect all queries and all documents.
    queries: List[Query] = []
    documents: Dict[str, Document] = dict()

    for data_pack in data_packs:
        query_entries = list(data_pack.get_entries_by_type(Query))
        doc_entries = list(data_pack.get_entries_by_type(Document))

        # Each data pack should contain exactly one query or one document.
        self.assertTrue(len(query_entries) + len(doc_entries) == 1)

        if len(query_entries) > 0:
            query_entry: Query = query_entries[0]
            queries.append(query_entry)
        else:
            doc_entry: Document = doc_entries[0]
            documents[data_pack.meta.doc_id] = doc_entry

    # Match the text of the documents relevant to each query against the
    # expected text.
    for i, query in enumerate(queries):
        expected_query = self.expected_queries[i]
        expected_ids = self.expected_doc_ids[doc_mode][i]

        self.assertEqual(query.query, expected_query)
        self.assertCountEqual(query.doc_ids["relevant_docs"], expected_ids)

        for doc_id in expected_ids:
            expected_text = self.get_expected_text(doc_id, doc_mode)
            self.assertEqual(documents[doc_id].text, expected_text)
def initialize(self, resources: Resources, configs: Config):
    self.redirects = resources.get('redirects')

    # These NIF readers organize the statements by their RDF context. Here
    # each context corresponds to one wiki page, which lets us read the
    # information more systematically.
    self.struct_reader = NIFBufferedContextReader(
        configs.nif_page_structure)
    self.link_reader = NIFBufferedContextReader(configs.nif_text_links)
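# The two config entries above are paths to NIF dump files. A hypothetical
# configuration dict for illustration only; the file names follow the
# ".tql" naming used elsewhere in these snippets and are assumptions.
reader_config = {
    "nif_page_structure": "/data/dbpedia/nif_page_structure.tql",
    "nif_text_links": "/data/dbpedia/nif_text_links.tql",
}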
def initialize(self, resources: Resources, configs: Config):
    # pylint: disable=attribute-defined-outside-init
    self.pack_index = read_index(configs.pack_index)
    self.pack_dir = configs.pack_dir

    self.redirects = resources.get('redirects')

    self.literal_info_reader = NIFBufferedContextReader(
        configs.mapping_literals)
    self.object_info_reader = NIFBufferedContextReader(
        configs.mapping_objects)

    # Set up logging.
    f_handler = logging.FileHandler(configs.reading_log)
    f_format = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    f_handler.setFormatter(f_format)
    self.logger.handlers = [f_handler]
import logging
import os
import pickle
import sys

from forte import Pipeline
from forte.common import Resources
from forte.datasets.wikipedia.dbpedia import WikiArticleWriter

from facets.wiki.processors.wiki import WikiCategoryReader

if __name__ == "__main__":
    base_dir = sys.argv[1]
    pack_dir = os.path.join(base_dir, "packs")

    redirect_map = pickle.load(
        open(os.path.join(pack_dir, "redirects.pickle"), "rb"))

    resources = Resources()
    resources.update(redirects=redirect_map)

    # Define paths.
    pack_input = os.path.join(pack_dir, "nif_raw_struct_links")
    # Should write to same files.
    # pack_output = pack_input
    pack_output = os.path.join(pack_dir, "category")

    # Index file of the input packs.
    pack_input_index = os.path.join(pack_input, "article.idx")
    # Store which documents have a category.
    pack_output_index = os.path.join(pack_output, "category.idx")

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        level=logging.INFO,
    )
}, "token_source": "ft.onto.base_ontology.Token", }).add( WikiArticleWriter(), config={ "output_dir": pack_output, "zip_pack": True, "drop_record": True, "input_index_file": pack_input_index, "output_index_file": pack_output_index, "use_input_index": True, "serialize_method": "jsonpickle" }, ).add(ProgressPrinter()) pipeline.run(pack_input) if __name__ == "__main__": base_dir = sys.argv[1] pack_dir = os.path.join(base_dir, "packs") redirect_map = pickle.load( open(os.path.join(pack_dir, "redirects.pickle"), "rb")) loaded_resource = Resources() loaded_resource.update(redirects=redirect_map) complete_and_tokens()