def to_xml(self):
    """Build a BioDesDoc that summarises this person.

    The document receives, in this order:

    * the fixed publisher details of "Het Biografisch Portaal", plus this
      person's names, bioport id and sex;
    * one event element for each of the birth, death, funeral, baptism and
      floruit events that ``get_event`` returns a value for;
    * one figure per illustration;
    * one ``bibl`` element (publisher, ref and optional authors) for every
      biography whose source id is not 'bioport'.

    The bioport id is read from the first biography, so the person is
    expected to have at least one.

    Returns the populated BioDesDoc.
    """
    doc = BioDesDoc()
    first_biography = self.get_biographies()[0]
    bioport_id = first_biography.get_bioport_id()

    # Basic information about the person and the portal itself.
    doc.from_args(
        naam_publisher='Het Biografisch Portaal',
        url_biografie='http://www.biografischportaal.nl/persoon/%s' % bioport_id,
        url_publisher='http://www.biografischportaal.nl',
        namen=self.get_names(),
        bioport_id=bioport_id,
        sex=self.get_value('sex'),
    )

    # Life events; event types without a value are left out.
    for event_type in ('birth', 'death', 'funeral', 'baptism', 'floruit'):
        event = self.get_event(event_type)
        if event is None:
            continue
        doc._add_event_element(event)

    # Illustrations.
    for illustration in self.get_illustrations():
        doc._add_figure(url=illustration.source_url, head=illustration.caption)

    # Bibliographic links to every source except the portal's own.
    for biography in self.get_biographies():
        if biography.get_source().id == 'bioport':
            continue

        bibl = SubElement(doc.get_element_biography(), 'bibl')

        el_publisher = SubElement(bibl, 'publisher')
        el_publisher.text = biography.get_value('name_publisher')

        el_ref = SubElement(bibl, 'ref')
        el_ref.attrib['target'] = biography.get_value('url_biography')

        # One <author> element per item of the author value, if there is any.
        for author_name in biography.get_value('author') or ():
            el_author = SubElement(bibl, 'author')
            el_author.text = author_name

    return doc
def test_from_args(self):
    """The ``local_id`` passed to ``from_args`` is what ``get_idno`` returns."""
    doc = BioDesDoc()
    doc.from_args(
        # The biography URL carries a query string containing '&'.
        url_biografie='http://www.gerbrandy.com/bio?a&b',
        url_publisher='http://www.gerbrandy.com',
        naam_publisher='Website van Jelle',
        titel_biografie='Bio van Jelle',
        naam='Jelle Gerbrandy',
        local_id='123',
    )
    self.assertEqual(doc.get_idno(), '123')
def process(self):
    """Write one BioDes document per unique correspondent name.

    Every file in the ``in`` directory is parsed as XML. For each ``item``
    element the text of ``title/from`` and ``title/to`` is taken as a
    candidate name, and ``self.total`` is increased by 2 (one per
    candidate, whether or not it is present). A candidate is reported via
    ``self.skip`` and dropped when it is

    * missing, empty, or made up only of dots and whitespace
      ("null name"), or
    * already collected earlier in this run ("dupe name").

    The remaining names keep their first-seen order. For each one a
    BioDesDoc is created with a link into the retroboeken archive and
    handed to ``self.write_file`` together with its 1-based position.
    """
    input_dir = 'in'
    candidate_paths = ('title/from', 'title/to')

    names = []      # unique names in first-seen order
    seen = set()    # same names, for constant-time duplicate checks

    for filename in os.listdir(input_dir):
        tree = etree.parse(os.path.join(input_dir, filename))
        for person in tree.xpath("//item"):
            self.total += 2
            for path in candidate_paths:
                matches = person.xpath(path)
                name = matches[0].text if matches else None

                # Covers None, "", "...." and any other dots/whitespace-only
                # value.
                if not name or not name.replace('.', '').strip():
                    self.skip("null name")
                    continue
                if name in seen:
                    self.skip("dupe name")
                    continue
                seen.add(name)
                names.append(name)

    # Loop-invariant parts of the biography URL.
    base_production = "http://www.inghist.nl/retroboeken/archives/"
    anchor = "#accessor=toc&accessor_href=toc%3Fcorrespondent%253Austring%253Autf-8%3D"

    for index, name in enumerate(names, 1):
        # The name is UTF-8 encoded and percent-quoted twice, matching the
        # already double-escaped query embedded in ``anchor``.
        # NOTE: urllib.quote is the Python 2 API.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        url = base_production + anchor + encoded_name

        bdes = BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_publisher="http://www.inghist.nl/",
            url_biografie=url,
        )
        self.write_file(bdes, index)
def write(self, names):
    """Write one BioDes document for each name not processed before.

    ``names`` is sorted in place and its length is stored in
    ``self.total``. Each name is passed through ``sanitize_name``; when
    ``self.name_already_processed`` reports the result as known, it is
    reported via ``self.skip("dupe name")`` and left out. Otherwise a
    BioDesDoc linking into the Gachard retroboeken pages is built and
    passed to ``self.write_file`` with the name's 1-based position in the
    sorted list (skipped names still use up a position).
    """
    url_prefix = "http://www.inghist.nl/retroboeken/gachard/#accessor=toc&accessor_href=toc%3FSearchSource%253Austring%253Autf-8%3D%26van_aan%3D%26correspondent%253Austring%253Autf-8%3D"

    names.sort()
    self.total = len(names)

    for position, raw_name in enumerate(names, 1):
        name = sanitize_name(raw_name)
        if self.name_already_processed(name):
            self.skip("dupe name")
            continue

        # The name is UTF-8 encoded and percent-quoted twice before being
        # appended to the URL.
        quoted_once = urllib.quote(name.encode('utf8'))
        url = url_prefix + urllib.quote(quoted_once)

        bdes = BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=url,
        )
        self.write_file(bdes, position)
def test_create_some_samples(self, **args):
    """Write a minimal and a maximal sample BioDes file.

    The same BioDesDoc instance is used for both files: it is first filled
    with a handful of arguments and saved as ``biodes10_minimal.xml``, then
    filled again with the full argument set, given one extra event and four
    states, and saved as ``biodes10_maximal.xml``.
    """
    # Minimal sample: publisher details, a title and a single name.
    doc = BioDesDoc()
    doc.from_args(
        url_biografie='http://www.gerbrandy.com/bio',
        url_publisher='http://www.gerbrandy.com',
        naam_publisher='Website van Jelle',
        titel_biografie='Bio van Jelle',
        naam='Jelle Gerbrandy',
    )
    doc.to_file('biodes10_minimal.xml')

    # Maximal sample: the most complex case, using every argument except
    # 'naam' and 'beroep', which are not passed here.
    maximal_args = {
        'bioport_id': 'biodesid',
        'url_biografie': 'http://url_van_de_biografie',
        'url_publisher': 'http://url_van_de_publisher',
        'titel_biografie': 'titel van de biografie',
        'naam_publisher': 'naam van depublisher',
        'auteur': 'auteur',
        'prepositie': 'prepositie',
        'voornaam': 'voornaam',
        'intrapositie': 'intrapositie',
        'geslachtsnaam': 'geslachtsnaam',
        'postpositie': 'postpositie',
        'laatst_veranderd': '2009-11-11',
        'publicatiedatum': '2009-11-11',
        'geboortedatum': '2009-11-11',
        'geboortedatum_tekst': '2009-11-11 in tekst',
        'geboorteplaats': 'geboorteplaats',
        'sterfdatum': '2011-11-11',
        'sterfdatum_tekst': 'sterfdatum_tekst',
        'sterfplaats': 'sterfplaats',
        'geslacht': '1',
        'illustraties': [
            'http://illustratie1.jpg',
            'http://illustratie2.jpg',
        ],
        'namen': [
            'Naam1',
            ('mr.', 'Jan', 'van', 'Voorbeeld', 'Esq.'),
        ],
        'namen_en': ['John'],
        'tekst': 'tekst van de biografie kan <em>Markup</em> <p>bevatten</p>',
    }
    doc.from_args(**maximal_args)

    doc._add_event(
        type='marriage',
        when='1901-12-12',
        text='getrouwd met marietje',
    )

    # States are added in exactly this order; the last one repeats the
    # first occupation with a place added.
    states = [
        dict(type='occupation', frm='1940', to='1960', text='schilder'),
        dict(type='residence', frm='1940', to='1960', text='Amsterdam'),
        dict(type='claim_to_fame', text='Superbekende persoon!'),
        dict(type='occupation', frm='1940', to='1960', text='schilder',
             place="Amsterdam"),
    ]
    for state in states:
        doc.add_state(**state)

    doc.to_file('biodes10_maximal.xml')