コード例 #1
0
 def handle(self, *args, **kwargs):
     """Re-import the ``RTA_1576`` TEI files from GitHub as NER samples.

     Every existing ``NerSource`` whose title equals ``source_json['title']``
     is deleted and a fresh one is created.  Each entry of the ``RTA_1576``
     folder in the ``bleierr/NERDPool`` repository is then parsed with
     ``TeiReader`` (using the ``clean_markup.xsl`` stylesheet shipped with
     ``acdh_tei_pyutils``) and one ``NerSample`` is stored per item returned
     by ``extract_ne_offsets``.
     """
     for stale in NerSource.objects.filter(title=source_json['title']):
         stale.delete()
     source = NerSource.objects.create(title=source_json['title'],
                                       info=source_json['info'])
     clean_markup = os.path.join(acdh_tei_pyutils.__path__[0], 'files',
                                 'clean_markup.xsl')
     g = Github()
     repo = g.get_repo('bleierr/NERDPool')
     contents = repo.get_contents("RTA_1576")
     for entry in tqdm(contents, total=len(contents)):
         # Use the public attribute rather than the private ``_rawData`` dict.
         dl_url = entry.download_url
         if not dl_url:
             # Entries without a download URL (e.g. sub-directories) cannot
             # be fetched as a TEI document, so skip them instead of handing
             # ``None`` to TeiReader.
             continue
         doc = TeiReader(xml=dl_url, xsl=clean_markup)
         ne_list = doc.extract_ne_offsets(
             parent_nodes='.//tei:body//tei:p',
             ne_xpath=
             ".//*[contains(name(), 'Name') or name()='date' or name()='time']"
         )
         for y in ne_list:
             entities = y[1]['entities']
             ner_item = {"text": y[0], "entities": entities}
             # Flag whether the sample carries at least one entity.
             has_entities = bool(entities)
             NerSample.objects.create(ner_text=y[0],
                                      ner_sample=ner_item,
                                      ner_ent_exist=has_entities,
                                      ner_source=source)
コード例 #2
0
 def test_004_check_ner_list(self):
     """Check the first entity text of every item that has entities.

     For each item returned by ``get_text_nes_list`` with a non-empty
     ``ner_dicts`` list, the ``text`` of its first entry must be one of the
     expected strings.
     """
     expected_first_texts = ["Prag", "Broschüre", "Böhmen"]
     doc = TeiReader(xml=FILES[0])
     ne_list = doc.get_text_nes_list()
     for y in ne_list:
         print(y)
         if y['ner_dicts']:
             # assertIn reports the offending value on failure, unlike
             # assertTrue(x in seq) which only says "False is not true".
             self.assertIn(y['ner_dicts'][0]['text'], expected_first_texts)
コード例 #3
0
ファイル: import_DIPKO.py プロジェクト: acdh-oeaw/nerdpool
 def handle(self, *args, **kwargs):
     """Rebuild the NER samples of the DIPKO source.

     Removes every ``NerSource`` titled ``source_json['title']``, creates a
     new one, parses the remote TEI document and stores one ``NerSample``
     per item returned by ``extract_ne_offsets``.
     """
     title = source_json['title']
     for stale in NerSource.objects.filter(title=title):
         stale.delete()
     source = NerSource.objects.create(title=title, info=source_json['info'])

     tei_doc = TeiReader('https://gams.uni-graz.at/o:dipko.rb/TEI_SOURCE')
     offsets = tei_doc.extract_ne_offsets(
         ne_xpath='.//tei:rs[not(@type="event")]')

     for item in tqdm(offsets, total=len(offsets)):
         sample_text = item[0]
         entities = item[1]['entities']
         NerSample.objects.create(
             ner_text=sample_text,
             ner_sample={"text": sample_text, "entities": entities},
             # True when at least one entity was found in the text.
             ner_ent_exist=bool(entities),
             ner_source=source,
         )
コード例 #4
0
ファイル: import_ALED.py プロジェクト: acdh-oeaw/nerdpool
 def handle(self, *args, **kwargs):
     """Rebuild the NER samples of the ALED source.

     Removes every ``NerSource`` titled ``source_json['title']``, creates a
     new one, parses the remote TEI document and stores one ``NerSample``
     per item returned by ``extract_ne_offsets``.
     """
     title = source_json['title']
     for stale in NerSource.objects.filter(title=title):
         stale.delete()
     source = NerSource.objects.create(title=title, info=source_json['info'])

     # Entity elements: referenced names, dates and times.
     entity_xpath = (
         ".//*[(name()='name' and ./@ref) or name()='date' or name()='time']"
     )
     tei_doc = TeiReader('http://gams.uni-graz.at/o:aled.1/TEI_SOURCE')
     offsets = tei_doc.extract_ne_offsets(ne_xpath=entity_xpath)

     for item in tqdm(offsets, total=len(offsets)):
         sample_text = item[0]
         entities = item[1]['entities']
         NerSample.objects.create(
             ner_text=sample_text,
             ner_sample={"text": sample_text, "entities": entities},
             # True when at least one entity was found in the text.
             ner_ent_exist=bool(entities),
             ner_source=source,
         )
コード例 #5
0
 def test_006_markup_cleanup(self):
     """Check that parsing with ``xsl=XSL`` drops the ``tei:unclear`` element.

     ``get_elements`` must report ``tei:unclear`` for the plain document
     and must not report it once the document is read with ``xsl=XSL``.
     """
     unclear_tag = '{http://www.tei-c.org/ns/1.0}unclear'

     doc = TeiReader(xml=FILES[0])
     ent_list = doc.get_elements()
     # assertIn / assertNotIn give a failure message that shows the
     # container, unlike assertTrue / assertFalse on an ``in`` expression.
     self.assertIn(unclear_tag, ent_list)

     doc = TeiReader(xml=FILES[0], xsl=XSL)
     ent_list = doc.get_elements()
     self.assertNotIn(unclear_tag, ent_list)
コード例 #6
0
ファイル: utils.py プロジェクト: acdh-oeaw/freud-app
def create_mans_from_folder(man_dir, frd_work, auth_items):
    """Create or update one ``FrdManifestation`` per XML file in ``man_dir``.

    Args:
        man_dir: Directory whose ``*.xml`` files are processed.
        frd_work: Work object the manifestations are looked up / created for.
        auth_items: Passed through to ``frd.FrdManifestation``.

    Returns:
        list: The saved ``FrdManifestation`` objects, in the order the files
        were returned by ``glob``.
    """
    manifestations = []
    for xml_path in glob.glob(f"{man_dir}/*.xml"):
        tei = TeiReader(xml_path)
        # The manifestation id is the part after the last '__' of the first
        # xml:id found in the document.
        first_xml_id = tei.any_xpath('.//@xml:id')[0]
        man_id = first_xml_id.split('__')[-1]
        drupal_man = frd.FrdManifestation(auth_items=auth_items,
                                          manifestation_id=man_id)
        frd_man, _created = FrdManifestation.objects.get_or_create(
            title_slug=Path(xml_path).stem, work=frd_work)
        frd_man.tei_doc = tei.return_string()
        frd_man.drupal_hash = man_id
        frd_man.save_path = xml_path
        try:
            frd_man.drupal_json = drupal_man.manifestation
        except KeyError:
            # Leave drupal_json untouched when the lookup raises KeyError.
            pass
        frd_man.save()
        manifestations.append(frd_man)
    return manifestations
コード例 #7
0
 def test_005_ner_offsets(self):
     """``extract_ne_offsets`` with default arguments must return a list."""
     offsets = TeiReader(xml=FILES[0]).extract_ne_offsets()
     # Printing the third item also requires at least three items to exist.
     print(offsets[2])
     self.assertIsInstance(offsets, list)
コード例 #8
0
 def test_003_extract_ner_list(self):
     """``get_text_nes_list`` must return a non-empty list."""
     doc = TeiReader(xml=FILES[0])
     ne_list = doc.get_text_nes_list()
     self.assertIsInstance(ne_list, list)
     # The previous ``assertTrue(len(ne_list), 2)`` passed ``2`` as the
     # failure *message*, so only non-emptiness was ever verified.  State
     # that check explicitly instead of implying an equality test.
     # NOTE(review): if exactly two items are expected for this fixture,
     # tighten this to ``assertEqual(len(ne_list), 2)`` after confirming.
     self.assertGreater(len(ne_list), 0)
コード例 #9
0
 def test_002_parsing_from_file(self):
     """``extract_ne_elements`` on each file's first tei:body returns a list."""
     for tei_file in FILES:
         reader = TeiReader(xml=tei_file)
         bodies = reader.any_xpath(any_xpath='//tei:body')
         elements = reader.extract_ne_elements(bodies[0])
         self.assertIsInstance(elements, list)