def write(self, names):
    """Write one BioDes document per name, in sorted order.

    ``names`` is sorted in place. Every name is reported through
    ``self.print_progress`` with its 1-based position, and the same
    position is handed to ``self.write_file`` with the generated document.
    """
    site_root = "http://www.inghist.nl/retroboeken/oldenbarnevelt/"
    toc_fragment = "#accessor=toc&accessor_href=toc%3FSearchSource%3D"
    empty_filters = "%26correspondent%3D%26day1%3D%26month1%3D%26year1%3D%26day2%3D%26month2%3D%26year2%3D"

    names.sort()
    self.total = len(names)
    for position, name in enumerate(names, 1):
        self.print_progress(position, name)

        # The UTF-8 encoded name is percent-quoted twice before it is
        # appended to the URL fragment.
        twice_quoted = urllib.quote(urllib.quote(name.encode('utf8')))
        biography_url = site_root + toc_fragment + twice_quoted + empty_filters

        document = BioDesDoc()
        document.from_args(
            naam=name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=biography_url,
        )
        self.write_file(document, position)
def test_add_note(self):
    """A note can be added, looked up by type, and then updated."""
    note_type = 'sometype'
    first_text = 'text of the note'
    document = BioDesDoc().from_xml(self.create_element())

    document.add_note(first_text, type=note_type)
    self.assertEqual(len(document.get_notes()), 1)
    self.assertEqual(len(document.get_notes(type=note_type)), 1)
    self.assertEqual(document.get_notes(type=note_type)[0].text, first_text)

    # add_or_update_note on an existing type is expected to change the text
    # of the note found under that type.
    document.add_or_update_note('note2', type=note_type)
    self.assertEqual(document.get_notes(type=note_type)[0].text, 'note2')
def test_from_args(self):
    """``local_id`` passed to ``from_args`` is returned by ``get_idno``."""
    arguments = dict(
        url_biografie='http://www.gerbrandy.com/bio?a&b',
        url_publisher='http://www.gerbrandy.com',
        naam_publisher='Website van Jelle',
        titel_biografie='Bio van Jelle',
        naam='Jelle Gerbrandy',
        local_id='123',
    )
    document = BioDesDoc()
    document.from_args(**arguments)
    self.assertEqual(document.get_idno(), '123')
def to_xml(self):
    """Build and return a ``BioDesDoc`` describing this object.

    The document is filled with the portal's own publisher data, the names
    and sex of this object, its known events, its illustrations, and one
    ``bibl`` element for every biography whose source is not ``'bioport'``.

    The bioport id is read from the first biography, so at least one
    biography must be present.
    """
    document = BioDesDoc()
    bioport_id = self.get_biographies()[0].get_bioport_id()

    # Basic information about the person and the portal itself.
    document.from_args(
        naam_publisher='Het Biografisch Portaal',
        url_biografie='http://www.biografischportaal.nl/persoon/%s' % bioport_id,
        url_publisher='http://www.biografischportaal.nl',
        namen=self.get_names(),
        bioport_id=bioport_id,
        sex=self.get_value('sex'),
    )

    # Events: only the types for which an event is available.
    for kind in ('birth', 'death', 'funeral', 'baptism', 'floruit'):
        found = self.get_event(kind)
        if found is not None:
            document._add_event_element(found)

    # Illustrations.
    for illustration in self.get_illustrations():
        document._add_figure(url=illustration.source_url, head=illustration.caption)

    # Links to every source except the portal's own.
    for biography in self.get_biographies():
        if biography.get_source().id == 'bioport':
            continue
        bibl = SubElement(document.get_element_biography(), 'bibl')
        SubElement(bibl, 'publisher').text = biography.get_value('name_publisher')
        SubElement(bibl, 'ref').set('target', biography.get_value('url_biography'))
        # NOTE(review): 'author' is iterated item by item, so it is presumably
        # a list of strings -- confirm against get_value.
        for author_name in biography.get_value('author') or ():
            SubElement(bibl, 'author').text = author_name

    return document
def test_round_trip(k, o, **kwargs):
    """Test a round trip: build a biodes element, then parse it with ``to_dict``.

    k      -- the key expected in the parsed dictionary
    o      -- the expected output value for that key
    kwargs -- the data used to create the biodes document; when omitted,
              ``{k: o}`` is used

    NOTE(review): ``self`` is not a parameter here, so this helper presumably
    lives inside a test method and picks ``self`` up from the enclosing scope.
    """
    if not kwargs:
        kwargs = {k: o}
    el = self.create_element(**kwargs)
    doc = BioDesDoc().from_element(el)
    dct = doc.to_dict()
    # ``in`` replaces dict.has_key, which no longer exists in Python 3.
    assert k in dct, doc.to_string()
    assert dct[k] == o, '%s shoudl be "%s", not "%s"\n%s' % (k, o, dct[k], doc.to_string())
def get_illustrations(self, default=None):
    """Return an ``Illustration`` for every figure in this document.

    ``default`` is not used by this implementation; it is kept so existing
    callers that pass it continue to work.

    Figures without a caption get one derived from the source description.
    A figure URL that starts with neither ``http://`` nor ``file://`` is
    treated as relative and resolved against the directory of
    ``self.source_url``. Only when the resolved URL is still not absolute
    is ``file://`` put in front of it, so a relative URL under an
    ``http://`` source no longer ends up as ``file://http://...``.
    """
    absolute_prefixes = ('http://', 'file://')

    figures = BioDesDoc.get_illustrations(self)
    images_cache_local = ''
    images_cache_url = ''
    prefix = self.get_source().id
    if self.repository:
        images_cache_local = self.repository.images_cache_local
        images_cache_url = self.repository.images_cache_url

    result = []
    for url, caption in figures:
        if not caption:
            caption = 'illustratie uit %s' % self.get_source().description
        if not url.startswith(absolute_prefixes):
            # This is a relative url: resolve it next to the source document.
            url = '/'.join((os.path.dirname(self.source_url), url))
            if not url.startswith(absolute_prefixes):
                url = 'file://' + url
        result.append(Illustration(
            url=url,
            images_cache_local=images_cache_local,
            images_cache_url=images_cache_url,
            prefix=prefix,
            caption=caption,
            link_url=self.get_value('url_biografie'),
        ))
    return result
def write(self, people):
    """Write a BioDes document for every not-yet-processed entry of ``people``.

    ``people`` maps a name to a mapping that has a ``'url'`` key. Names for
    which ``self.name_already_processed`` is true are counted in
    ``self.skipped`` and not written. The index passed to
    ``self.write_file`` is the 0-based iteration position, skipped entries
    included.
    """
    self.total = len(people)
    for position, person_name in enumerate(people):
        if self.name_already_processed(person_name):
            self.skipped += 1
            continue

        biography_url = people[person_name]['url']
        document = BioDesDoc()
        document.from_args(
            naam=person_name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=biography_url,
        )
        self.write_file(document, position)
def test_replace_name(self):
    """``_replace_name`` swaps the name stored at the given index."""
    document = BioDesDoc().from_xml(self.create_element())
    for full_name in ('Pietje Een', 'Pietje Twee'):
        document._add_a_name(Name(full_name))
    self.assertEqual(len(document.get_names()), 3)

    replacement = Name('Newt Newman')
    self.assertEqual(replacement.to_string(), u'<persName>Newt Newman</persName>')

    document._replace_name(replacement, 1)
    self.assertEqual(document.get_names()[1].to_string(), replacement.to_string())
def test_read_write_events(self):
    """Events can be written, read back, and updated field by field.

    The element fetched after the first write is deliberately reused for all
    later assertions, so the updates are checked on that same element.
    """
    document = BioDesDoc().from_xml(self.create_element())
    document.add_or_update_event(
        type='a',
        when="2009",
        notBefore="2010",
        notAfter="2011",
        date_text="2012",
        place="asd",
        place_id="-12345",
    )

    matching = document.get_events(type='a')
    self.assertEqual(len(matching), 1)
    event = matching[0]

    self.assertEqual(event.get('when'), '2009')
    self.assertEqual(event.get('notBefore'), '2010')
    self.assertEqual(event.get('notAfter'), '2011')
    self.assertEqual(event.find('date').text, '2012')
    self.assertEqual(event.find('place').text, 'asd')
    self.assertEqual(event.find('place').get('key'), '-12345')

    # An empty date text removes the date child.
    document.add_or_update_event(type='a', date_text='')
    self.assertEqual(event.find('date'), None)

    # An empty place keeps the place child but blanks its text.
    document.add_or_update_event(type='a', place='')
    self.assertEqual(event.find('place').text, '')

    # An empty place id blanks the key attribute.
    document.add_or_update_event(type='a', place_id='', place='')
    self.assertEqual(event.find('place').get('key'), '')
def process(self):
    """Collect correspondent names from ``in/`` and write a document for each.

    Every file in the ``in`` directory is parsed as XML. For each ``item``
    element the text of ``title/from`` and ``title/to`` is read, and
    ``self.total`` grows by two. Empty names, names made up only of dots
    and whitespace, and names already collected are reported through
    ``self.skip`` and left out. The remaining names, in first-seen order,
    each get a BioDes document that is passed to ``self.write_file`` with a
    1-based index.
    """
    base_production = "http://www.inghist.nl/retroboeken/archives/"
    anchor = "#accessor=toc&accessor_href=toc%3Fcorrespondent%253Austring%253Autf-8%3D"

    names = []
    seen = set()  # constant-time duplicate check; ``names`` keeps the order
    for filename in os.listdir('in'):
        tree = etree.parse("in/" + filename)
        for person in tree.xpath("//item"):
            self.total += 2
            for path in ('title/from', 'title/to'):
                try:
                    name = person.xpath(path)[0].text
                except IndexError:
                    name = None
                # Covers None, "", "...." and any other dots-only name.
                if not name or not name.replace('.', '').strip():
                    self.skip("null name")
                    continue
                if name in seen:
                    self.skip("dupe name")
                    continue
                seen.add(name)
                names.append(name)

    for index, name in enumerate(names, 1):
        # The UTF-8 encoded name is percent-quoted twice for the URL fragment.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        bdes = BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_publisher="http://www.inghist.nl/",
            url_biografie=base_production + anchor + encoded_name,
        )
        self.write_file(bdes, index)
def test_to_dict(self):
    """``to_dict`` exposes ``geboortedatum`` and ``get_value`` returns it as text.

    The checks use TestCase assertions rather than bare ``assert`` so they
    still run when Python is started with ``-O``.
    """
    doc = BioDesDoc()
    doc.from_url(os.path.join(this_dir, 'bio.xml'))
    d = doc.to_dict()
    self.assertTrue('geboortedatum' in d, d)
    self.assertTrue(doc.get_value('geboortedatum'))
    # Exact type comparison: the value must be a text (unicode) string.
    self.assertEqual(type(doc.get_value('geboortedatum')), type(u''))
def write(self, names):
    """Write one BioDes document per sanitized, not-yet-processed name.

    ``names`` is sorted in place. Each name is passed through
    ``sanitize_name``; names that ``self.name_already_processed`` reports
    as known are handed to ``self.skip`` instead of being written. The index
    given to ``self.write_file`` is the 1-based position in the sorted
    list, skipped names included.
    """
    url_prefix = "http://www.inghist.nl/retroboeken/gachard/#accessor=toc&accessor_href=toc%3FSearchSource%253Austring%253Autf-8%3D%26van_aan%3D%26correspondent%253Austring%253Autf-8%3D"

    names.sort()
    self.total = len(names)
    for position, raw_name in enumerate(names, 1):
        name = sanitize_name(raw_name)
        if self.name_already_processed(name):
            self.skip("dupe name")
            continue

        # The UTF-8 encoded name is percent-quoted twice for the URL fragment.
        twice_quoted = urllib.quote(urllib.quote(name.encode('utf8')))
        biography_url = url_prefix + twice_quoted

        document = BioDesDoc()
        document.from_args(
            naam=name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=biography_url,
        )
        self.write_file(document, position)
def process(self, people_dict):
    """Write a BioDes document for every person in ``people_dict``.

    ``people_dict`` maps a name to a mapping with ``'born'`` and ``'dead'``
    entries. Names are handled in sorted order and a progress line is
    printed for each. A name is sanitized with ``sanitize_name``; if
    ``self.name_already_processed`` reports it as known, ``self.skipped``
    is incremented and nothing is written. Birth and death dates are only
    passed on when ``BioDesDoc.is_date`` accepts them. The index given to
    ``self.write_file`` is the 1-based position in the sorted list.
    """
    base_production = "http://www.inghist.nl/retroboeken/groen/"
    index_fragment = "#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D"

    # sorted() works on both Python 2 and 3; keys().sort() only on Python 2.
    people = sorted(people_dict)
    self.total = len(people)
    for x, original_name in enumerate(people, 1):
        info = people_dict[original_name]
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("processing: %s/%s - %s" % (x, len(people), original_name))

        name = sanitize_name(original_name)
        if self.name_already_processed(name):
            self.skipped += 1
            continue

        # The UTF-8 encoded name is percent-quoted twice for the URL fragment.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        url = base_production + index_fragment + encoded_name

        bdes = BioDesDoc()
        args = dict(
            naam=name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=url,
        )

        birth_date = info['born']
        death_date = info['dead']
        if bdes.is_date(birth_date):
            args['geboortedatum'] = birth_date
        if bdes.is_date(death_date):
            args['sterfdatum'] = death_date

        bdes.from_args(**args)
        self.write_file(bdes, x)
def test_add_delete_update_figure(self):
    """Figures can be added, removed by index, and updated by index."""
    document = BioDesDoc().from_xml(self.create_element())
    self.assertEqual(len(document.get_figures()), 0)

    document.add_figure(uri='http://someref', text='some text')
    self.assertEqual(len(document.get_figures()), 1)
    document.add_figure(uri='http://someref2', text='some text2')
    self.assertEqual(len(document.get_figures()), 2)

    # get_figures yields (index, element) pairs; keep only the indices.
    first_index = document.get_figures()[0][0]
    second_index = document.get_figures()[1][0]

    document.remove_figure(second_index)
    self.assertEqual(len(document.get_figures()), 1)

    document.update_figure(index=first_index, uri='http://somerefx', text='some textx')
    self.assertEqual(len(document.get_figures()), 1)

    remaining_index, figure = document.get_figures()[0]
    self.assertEqual(remaining_index, 0)
    self.assertEqual(figure.find('graphic').get('url'), 'http://somerefx')
    self.assertEqual(figure.find('head').text, 'some textx')
def test_add_delete_update_extrafield(self):
    """Extra fields can be added, removed, updated, and replaced wholesale."""
    document = BioDesDoc().from_xml(self.create_element())
    self.assertEqual(len(document.get_extrafields()), 0)

    document.add_extrafield(key='sleutel', value='some value')
    self.assertEqual(len(document.get_extrafields()), 1)
    document.add_extrafield(key='sleutel2', value='some value2')
    self.assertEqual(len(document.get_extrafields()), 2)

    first_index, second_index = 0, 1
    document.remove_extrafield(second_index)
    self.assertEqual(len(document.get_extrafields()), 1)

    updated = document.update_extrafield(
        index=first_index,
        key='sleuteldifferent',
        value='different value',
    )
    self.assertEqual(len(document.get_extrafields()), 1)
    self.assertEqual(updated.get('target'), 'sleuteldifferent')
    self.assertEqual(updated.text, 'different value')

    # This is what happens when saving from the UI.
    document._replace_extrafields([])
    self.assertEqual(len(document.get_extrafields()), 0)

    document.add_extrafield(key='key0', value='some value')
    document.add_extrafield(key='key1', value='some value2')
    self.assertEqual(document.get_extrafields()[0].get('target'), 'key0')
    self.assertEqual(document.get_extrafields()[1].get('target'), 'key1')

    # Replacing with the same pairs must leave the order untouched.
    document._replace_extrafields([('key0', 'some value'), ('key1', 'some value2')])
    self.assertEqual(document.get_extrafields()[0].get('target'), 'key0')
    self.assertEqual(document.get_extrafields()[1].get('target'), 'key1')
def test_add_delete_update_reference(self):
    """References can be added, removed by index, and updated by index."""
    document = BioDesDoc().from_xml(self.create_element())
    self.assertEqual(len(document.get_references()), 0)

    document.add_reference(uri='http://someref', text='some text')
    self.assertEqual(len(document.get_references()), 1)
    document.add_reference(uri='http://someref2', text='some text2')
    self.assertEqual(len(document.get_references()), 2)

    # get_references yields (index, element) pairs; keep only the indices.
    first_index = document.get_references()[0][0]
    second_index = document.get_references()[1][0]

    document.remove_reference(second_index)
    self.assertEqual(len(document.get_references()), 1)

    updated = document.update_reference(
        index=first_index,
        uri='http://somerefx',
        text='some textx',
    )
    self.assertEqual(len(document.get_references()), 1)

    remaining_index, _element = document.get_references()[0]
    self.assertEqual(remaining_index, 0)
    self.assertEqual(updated.get('target'), 'http://somerefx')
    self.assertEqual(updated.text, 'some textx')
def test_relations(self):
    """Relations can be added, queried per kind, removed, and re-added."""
    expected = (
        ("Kwik", "partner"),
        ("Kwek", "child"),
        ("Kwak", "father"),
        ("Donald", "mother"),
        ("Dagobert", "parent"),
    )
    document = BioDesDoc().from_xml(self.create_element())
    for person_name, kind in expected:
        document.add_relation(person=person_name, relation=kind)
    for person_name, kind in expected:
        self.assertEqual(document.get_relation(kind), [person_name])

    # Make sure the related persons are not read as names of the subject.
    self.assertEqual(len(document.get_names()), 1)
    self.assertEqual(len(document.get_relations()), 5)

    pairs = [
        (relation_el.get('name'), person_el[0].text)
        for (relation_el, person_el) in document.get_relations()
    ]
    self.assertTrue(('child', 'Kwek') in pairs, pairs)

    relation_el, person_el = document.get_relations()[1]
    relation_kind = relation_el.get('name')
    related_name = person_el[0].text
    position = relation_el.getparent().index(relation_el)

    # See if deleting and re-adding is sane.
    document.remove_relation(position)
    self.assertEqual(len(document.get_relations()), 4)
    document.add_relation(person=related_name, relation=relation_kind)
    self.assertEqual(len(document.get_relations()), 5)
def test_read_write_states(self):
    """States can be written, read back, updated, and removed.

    The ``floruit`` element and list fetched early on are deliberately
    reused in later assertions, so the checks run against those same objects.
    """
    document = BioDesDoc().from_xml(self.create_element())
    document.add_or_update_state(
        type='floruit',
        frm="1900",
        to="1910",
        place='Zohar',
        place_id='1',
    )
    floruit = document.get_state(type='floruit')
    self.assertEqual(floruit.get('from'), '1900')
    self.assertEqual(floruit.get('to'), '1910')
    self.assertEqual(floruit.get('type'), 'floruit')
    self.assertEqual(floruit.find('place').text, 'Zohar')
    self.assertEqual(floruit.find('place').get('key'), '1')

    document.add_or_update_state(type='floruit', place_id='', place='')
    self.assertEqual(floruit.find('place').get('key'), '')

    floruit_states = document.get_states(type='floruit')
    self.assertEqual(floruit_states, [floruit])

    # Adding an occupation must not change the list fetched for 'floruit'.
    document.add_state(type='occupation', idno="1")
    self.assertEqual(len(floruit_states), 1)
    document.add_state(type='occupation', idno="2")
    document.add_state(type='occupation', idno="3")
    occupations = document.get_states(type='occupation')
    self.assertEqual(len(occupations), 3)

    document.remove_state(type='occupation', idx=1)
    occupations = document.get_states(type='occupation')
    self.assertEqual(len(occupations), 2)
    self.assertEqual([item.get('idno') for item in occupations], ['1', '3'])

    # Remove states by index number.
    all_states = document.get_states()
    victim = all_states[1]
    victim_index = victim.getparent().index(victim)
    document.remove_state(idx=victim_index)
    self.assertEqual(len(all_states) - 1, len(document.get_states()))
def test_remove_name(self):
    """Removing the name at index 1 shifts the following name down."""
    document = BioDesDoc().from_xml(self.create_element())
    for full_name in ('Pietje Een', 'Pietje Twee'):
        document._add_a_name(Name(full_name))

    self.assertEqual(
        document.get_names()[2].volledige_naam(),
        'Pietje Twee',
        document.get_names(),
    )
    self.assertEqual(len(document.get_names()), 3)

    document.remove_name(1)

    self.assertEqual(len(document.get_names()), 2)
    self.assertEqual(
        document.get_names()[1].volledige_naam(),
        'Pietje Twee',
        document.get_names(),
    )
def test_create_some_samples(self, **args):
    """Write a minimal and a maximal sample document to disk.

    ``args`` is accepted but not used. The output goes to
    ``biodes10_minimal.xml`` and ``biodes10_maximal.xml``.
    """
    # A very simple file.
    minimal = {
        'url_biografie': 'http://www.gerbrandy.com/bio',
        'url_publisher': 'http://www.gerbrandy.com',
        'naam_publisher': 'Website van Jelle',
        'titel_biografie': 'Bio van Jelle',
        'naam': 'Jelle Gerbrandy',
    }
    doc = BioDesDoc()
    doc.from_args(**minimal)
    doc.to_file('biodes10_minimal.xml')

    # The most complex case includes everything.
    maximal = {
        'bioport_id': 'biodesid',
        'url_biografie': 'http://url_van_de_biografie',
        'url_publisher': 'http://url_van_de_publisher',
        'titel_biografie': 'titel van de biografie',
        'naam_publisher': 'naam van depublisher',
        'auteur': 'auteur',
        'prepositie': 'prepositie',
        'voornaam': 'voornaam',
        'intrapositie': 'intrapositie',
        'geslachtsnaam': 'geslachtsnaam',
        'postpositie': 'postpositie',
        'laatst_veranderd': '2009-11-11',
        'publicatiedatum': '2009-11-11',
        'geboortedatum': '2009-11-11',
        'geboortedatum_tekst': '2009-11-11 in tekst',
        'geboorteplaats': 'geboorteplaats',
        'sterfdatum': '2011-11-11',
        'sterfdatum_tekst': 'sterfdatum_tekst',
        'sterfplaats': 'sterfplaats',
        'geslacht': '1',
        'illustraties': ['http://illustratie1.jpg', 'http://illustratie2.jpg'],
        'namen': ['Naam1', ('mr.', 'Jan', 'van', 'Voorbeeld', 'Esq.')],
        'namen_en': ['John'],
        'tekst': 'tekst van de biografie kan <em>Markup</em> <p>bevatten</p>',
    }
    # The same document object is reused for the maximal sample.
    doc.from_args(**maximal)
    doc._add_event(
        type='marriage',
        when='1901-12-12',
        text='getrouwd met marietje',
    )

    states = (
        dict(type='occupation', frm='1940', to='1960', text='schilder'),
        dict(type='residence', frm='1940', to='1960', text='Amsterdam'),
        dict(type='claim_to_fame', text='Superbekende persoon!'),
        dict(type='occupation', frm='1940', to='1960', text='schilder', place="Amsterdam"),
    )
    for state in states:
        doc.add_state(**state)

    doc.to_file('biodes10_maximal.xml')
def test_from_dict(self):
    """``from_dict`` accepts the shared ``self.kw`` fixture without raising."""
    fixture = self.kw
    document = BioDesDoc()
    document.from_dict(fixture)
def from_url(self, url):
    """Load this document from ``url`` and return ``self``.

    ``url`` is stored as ``self.source_url`` before the ``BioDesDoc``
    loader runs; ``self.create_id()`` is called once loading is done.
    """
    # Remember the origin first, then delegate the loading to BioDesDoc.
    self.source_url = url
    BioDesDoc.from_url(self, url)

    self.create_id()
    return self
def test_get_names(self):
    """The sample ``bio.xml`` holds exactly one name, with the expected text."""
    sample_path = os.path.join(this_dir, 'bio.xml')
    document = BioDesDoc().from_url(sample_path)

    self.assertEqual(len(document.get_names()), 1)
    only_name = document.get_names()[0]
    self.assertEqual(u'C. van Heynsbergen', only_name.volledige_naam())
def test_get_value(self):
    """``get_value('namen')`` returns the names found in the sample ``bio.xml``."""
    doc = BioDesDoc()
    doc.from_url(os.path.join(this_dir, 'bio.xml'))
    value = [n.volledige_naam() for n in doc.get_value('namen')]
    should_be = ['C. van Heynsbergen']
    # assertEqual replaces the deprecated failUnlessEqual alias, which is
    # gone from recent Python versions.
    self.assertEqual(value, should_be)
def get_biodes_documents(self):
    """Yield a ``BioDesDoc`` for every ``href`` of an ``a`` element in ``self.root``.

    This is a generator: nothing is queried or loaded until it is iterated,
    and each document is loaded only when it is requested.
    """
    links = self.root.xpath('//a/@href')
    for link in links:
        document = BioDesDoc().from_url(link)
        yield document