def handle(self, *args, **kwargs):
    """Rebuild the NER source and its samples from the NERDPool ``RTA_1576`` files.

    Every ``NerSource`` whose title equals ``source_json['title']`` is
    deleted and a fresh one is created. Each entry listed under
    ``RTA_1576`` in the ``bleierr/NERDPool`` GitHub repository is then read
    with ``TeiReader`` (using the ``clean_markup.xsl`` stylesheet) and one
    ``NerSample`` is stored per item returned by ``extract_ne_offsets``.
    """
    title = source_json['title']
    for stale_source in NerSource.objects.filter(title=title):
        stale_source.delete()
    source = NerSource.objects.create(title=title, info=source_json['info'])

    # The stylesheet path is resolved relative to the acdh_tei_pyutils package.
    package_dir = acdh_tei_pyutils.__path__[0]
    clean_markup = os.path.join(package_dir, 'files', 'clean_markup.xsl')

    repo = Github().get_repo('bleierr/NERDPool')
    contents = repo.get_contents("RTA_1576")
    ne_xpath = ".//*[contains(name(), 'Name') or name()='date' or name()='time']"

    for entry in tqdm(contents, total=len(contents)):
        # NOTE(review): reads a private attribute of the GitHub content
        # object; presumably a public accessor exists -- confirm.
        dl_url = entry._rawData.get('download_url')
        doc = TeiReader(xml=dl_url, xsl=clean_markup)
        ne_list = doc.extract_ne_offsets(
            parent_nodes='.//tei:body//tei:p',
            ne_xpath=ne_xpath,
        )
        for offsets in ne_list:
            text = offsets[0]
            entities = offsets[1]['entities']
            NerSample.objects.create(
                ner_text=text,
                ner_sample={"text": text, "entities": entities},
                ner_ent_exist=bool(entities),
                ner_source=source,
            )
def test_004_check_ner_list(self):
    """Check the first named entity of every entry against the known names.

    Entries whose ``ner_dicts`` value is empty are skipped.
    """
    expected_texts = ["Prag", "Broschüre", "Böhmen"]
    doc = TeiReader(xml=FILES[0])
    for entry in doc.get_text_nes_list():
        ner_dicts = entry['ner_dicts']
        if ner_dicts:
            # assertIn reports both the value and the allowed names on failure.
            self.assertIn(ner_dicts[0]['text'], expected_texts)
def handle(self, *args, **kwargs):
    """Rebuild the NER source and its samples from the ``o:dipko.rb`` TEI document.

    Every ``NerSource`` whose title equals ``source_json['title']`` is
    deleted and a fresh one is created. The TEI source is then loaded and
    one ``NerSample`` is stored per item returned by ``extract_ne_offsets``
    for ``tei:rs`` elements whose ``@type`` is not ``"event"``.
    """
    title = source_json['title']
    for stale_source in NerSource.objects.filter(title=title):
        stale_source.delete()
    source = NerSource.objects.create(title=title, info=source_json['info'])

    doc = TeiReader('https://gams.uni-graz.at/o:dipko.rb/TEI_SOURCE')
    sample = doc.extract_ne_offsets(ne_xpath='.//tei:rs[not(@type="event")]')

    for item in tqdm(sample, total=len(sample)):
        text = item[0]
        entities = item[1]['entities']
        NerSample.objects.create(
            ner_text=text,
            ner_sample={"text": text, "entities": entities},
            ner_ent_exist=bool(entities),
            ner_source=source,
        )
def handle(self, *args, **kwargs):
    """Rebuild the NER source and its samples from the ``o:aled.1`` TEI document.

    Every ``NerSource`` whose title equals ``source_json['title']`` is
    deleted and a fresh one is created. The TEI source is then loaded and
    one ``NerSample`` is stored per item returned by ``extract_ne_offsets``
    for ``name`` elements carrying a ``@ref`` plus ``date`` and ``time``
    elements.
    """
    title = source_json['title']
    for stale_source in NerSource.objects.filter(title=title):
        stale_source.delete()
    source = NerSource.objects.create(title=title, info=source_json['info'])

    doc = TeiReader('http://gams.uni-graz.at/o:aled.1/TEI_SOURCE')
    ne_xpath = ".//*[(name()='name' and ./@ref) or name()='date' or name()='time']"
    sample = doc.extract_ne_offsets(ne_xpath=ne_xpath)

    for item in tqdm(sample, total=len(sample)):
        text = item[0]
        entities = item[1]['entities']
        NerSample.objects.create(
            ner_text=text,
            ner_sample={"text": text, "entities": entities},
            ner_ent_exist=bool(entities),
            ner_source=source,
        )
def test_006_markup_cleanup(self):
    """Check that applying ``XSL`` removes ``unclear`` from the element list.

    The same file is parsed twice: without a stylesheet the ``unclear`` tag
    must be listed by ``get_elements()``, with ``XSL`` it must not be.
    """
    unclear_tag = '{http://www.tei-c.org/ns/1.0}unclear'

    raw_doc = TeiReader(xml=FILES[0])
    self.assertIn(unclear_tag, raw_doc.get_elements())

    cleaned_doc = TeiReader(xml=FILES[0], xsl=XSL)
    self.assertNotIn(unclear_tag, cleaned_doc.get_elements())
def create_mans_from_folder(man_dir, frd_work, auth_items):
    """Create or update one ``FrdManifestation`` per ``*.xml`` file in a folder.

    Args:
        man_dir: Directory searched (non-recursively) for ``*.xml`` files.
        frd_work: Work object passed as ``work`` to ``get_or_create``.
        auth_items: Passed through to ``frd.FrdManifestation``.

    Returns:
        The saved ``FrdManifestation`` objects, in sorted file-path order.
        An empty list when the folder holds no ``*.xml`` files.
    """
    # Local import: the file's import block is outside this block.
    import logging

    logger = logging.getLogger(__name__)
    manifestations = []
    # glob order is arbitrary; sort so processing and return order are stable.
    for xml_path in sorted(glob.glob(f"{man_dir}/*.xml")):
        doc = TeiReader(xml_path)
        # The id is the part after the last '__' of the first xml:id found.
        # NOTE(review): a document without any xml:id presumably fails on
        # the [0] lookup -- confirm whether that should be tolerated.
        man_id = doc.any_xpath('.//@xml:id')[0].split('__')[-1]
        drupal_man_obj = frd.FrdManifestation(
            auth_items=auth_items,
            manifestation_id=man_id,
        )
        frd_man, _ = FrdManifestation.objects.get_or_create(
            title_slug=Path(xml_path).stem,
            work=frd_work,
        )
        frd_man.tei_doc = doc.return_string()
        frd_man.drupal_hash = man_id
        frd_man.save_path = xml_path
        try:
            frd_man.drupal_json = drupal_man_obj.manifestation
        except KeyError:
            # Still best-effort, but no longer silent.
            logger.warning(
                "No Drupal manifestation data for %s (id %s); "
                "drupal_json left unchanged",
                xml_path,
                man_id,
            )
        frd_man.save()
        manifestations.append(frd_man)
    return manifestations
def test_005_ner_offsets(self):
    """Check that ``extract_ne_offsets()`` returns a list with a third item."""
    doc = TeiReader(xml=FILES[0])
    ne_offsets = doc.extract_ne_offsets()
    # Check the type first so a wrong return type is reported as a failure.
    self.assertIsInstance(ne_offsets, list)
    # Replaces a debug print of ne_offsets[2]: the test still requires at
    # least three items, but reports a readable failure if they are missing.
    self.assertGreater(len(ne_offsets), 2)
def test_003_extract_ner_list(self):
    """Check that ``get_text_nes_list()`` returns a non-empty list."""
    doc = TeiReader(xml=FILES[0])
    ne_list = doc.get_text_nes_list()
    self.assertIsInstance(ne_list, list)
    # Only non-emptiness is asserted. An earlier form passed ``2`` as the
    # second argument of assertTrue, which unittest treats as the failure
    # message, so no length comparison ever took place.
    # NOTE(review): if exactly two entries are expected for FILES[0], use
    # self.assertEqual(len(ne_list), 2) instead -- confirm against the fixture.
    self.assertTrue(ne_list, "get_text_nes_list() returned an empty list")
def test_002_parsing_from_file(self):
    """Check that ``extract_ne_elements`` yields a list for every test file.

    For each path in ``FILES`` the first ``//tei:body`` node is passed to
    ``extract_ne_elements`` and the result type is asserted.
    """
    for tei_file in FILES:
        reader = TeiReader(xml=tei_file)
        body_nodes = reader.any_xpath(any_xpath='//tei:body')
        elements = reader.extract_ne_elements(body_nodes[0])
        self.assertIsInstance(elements, list)