def write(self, names):
    """Write one BioDes document per name, linking to the Oldenbarnevelt site.

    ``names`` is sorted in place (the caller's list is reordered) and
    ``self.total`` is set to its length.  For every name, progress is
    reported through ``self.print_progress`` and a ``BioDesDoc`` is built
    and handed to ``self.write_file`` together with its one-based position
    in the sorted list.

    :param names: mutable list of names; each item must support
        ``.encode('utf8')``.
    """
    # Production site only.  The development counterpart lives at
    # http://dev.inghist.nl/retrotest2010/oldenbarnevelt/ and is not used.
    base_production = "http://www.inghist.nl/retroboeken/oldenbarnevelt/"
    anchor = "#accessor=toc&accessor_href=toc%3FSearchSource%3D"
    # Remaining search fields of the query, all left empty.
    empty_filters = (
        "%26correspondent%3D%26day1%3D%26month1%3D%26year1%3D"
        "%26day2%3D%26month2%3D%26year2%3D"
    )
    url_prefix = base_production + anchor

    names.sort()
    self.total = len(names)

    for index, name in enumerate(names, 1):
        self.print_progress(index, name)

        # The name is percent-encoded twice; presumably because it is
        # embedded in an accessor_href value that is itself percent-encoded.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        url = url_prefix + encoded_name + empty_filters

        bdes = BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=url,
        )
        self.write_file(bdes, index)
def process(self, people_dict):
    """Write one BioDes document per person, linking to the Groen site.

    Names are handled in sorted order.  Each name is looked up in
    ``people_dict`` under its original spelling, then sanitized with
    ``sanitize_name``; the sanitized form is what is checked against
    ``self.name_already_processed`` and stored in the document.  Names
    that were already processed only increment ``self.skipped``.

    :param people_dict: mapping of name -> info mapping.  Each info
        mapping must provide the keys ``'born'`` and ``'dead'``; their
        values are stored as birth/death date only when
        ``BioDesDoc.is_date`` accepts them.
    """
    # Production site only.  The development counterpart lives at
    # http://dev.inghist.nl/retrotest2010/groen/ and is not used.
    base_production = "http://www.inghist.nl/retroboeken/groen/"
    anchor = (
        "#accessor=accessor_index&accessor_href=accessor_index"
        "%3FSearchSource%253Autf-8%253Austring%3D"
    )
    url_prefix = base_production + anchor

    people = sorted(people_dict)
    count = len(people)
    self.total = count

    for x, name in enumerate(people, 1):
        info = people_dict[name]
        # A single parenthesised argument prints the same text whether
        # ``print`` is a statement (Python 2) or a function.
        print("processing: %s/%s - %s" % (x, count, name))

        name = sanitize_name(name)
        if self.name_already_processed(name):
            self.skipped += 1
            continue

        # The name is percent-encoded twice; presumably because it is
        # embedded in an accessor_href value that is itself percent-encoded.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        url = url_prefix + encoded_name

        bdes = BioDesDoc()
        args = dict(
            naam=name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=url,
        )

        # Only pass dates on when the document class recognises them.
        birth_date = info['born']
        death_date = info['dead']
        if bdes.is_date(birth_date):
            args['geboortedatum'] = birth_date
        if bdes.is_date(death_date):
            args['sterfdatum'] = death_date

        bdes.from_args(**args)
        self.write_file(bdes, x)
def write(self, people):
    """Write one BioDes document for every not-yet-processed person.

    ``self.total`` is set to ``len(people)``.  ``people`` is iterated
    directly and indexed by name, so it is expected to be a mapping of
    name -> mapping with a ``'url'`` key.  Names for which
    ``self.name_already_processed`` is true only increment
    ``self.skipped``.

    The number passed to ``self.write_file`` is the zero-based position
    of the name in iteration order; skipped names still consume a
    position.
    """
    self.total = len(people)

    for position, person_name in enumerate(people):
        if not self.name_already_processed(person_name):
            biography_url = people[person_name]['url']
            document = BioDesDoc()
            document.from_args(
                naam=person_name,
                naam_publisher="XXX",
                url_publisher="http://XXX.nl",
                url_biografie=biography_url,
            )
            self.write_file(document, position)
        else:
            self.skipped += 1
def process(self):
    """Collect correspondent names from the XML files in ``in/`` and write them.

    Phase 1 parses every file in the ``in`` directory, visits each
    ``//item`` element and reads the text of its first ``title/from`` and
    ``title/to`` children.  ``self.total`` grows by two per item (one per
    potential name; it is not reset here).  Missing, empty or dots-only
    names are reported via ``self.skip("null name")``; names seen before
    via ``self.skip("dupe name")``.

    Phase 2 writes one ``BioDesDoc`` per unique name, in first-seen order,
    through ``self.write_file`` with a one-based index.
    """
    names = []      # unique names in first-seen order
    seen = set()    # same content as ``names``; gives O(1) duplicate checks

    for filename in os.listdir('in'):
        tree = etree.parse(os.path.join('in', filename))
        for person in tree.xpath("//item"):
            self.total += 2
            for path in ('title/from', 'title/to'):
                try:
                    name = person.xpath(path)[0].text
                except IndexError:
                    # The item has no such child element.
                    name = None

                # Covers None, "" and placeholders made of dots/whitespace
                # only (e.g. "....").
                if not name or name.replace('.', '').strip() == "":
                    self.skip("null name")
                    continue
                if name in seen:
                    self.skip("dupe name")
                    continue
                seen.add(name)
                names.append(name)

    base_production = "http://www.inghist.nl/retroboeken/archives/"
    anchor = (
        "#accessor=toc&accessor_href=toc"
        "%3Fcorrespondent%253Austring%253Autf-8%3D"
    )
    url_prefix = base_production + anchor

    for index, name in enumerate(names, 1):
        # The name is percent-encoded twice; presumably because it is
        # embedded in an accessor_href value that is itself percent-encoded.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))

        bdes = BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_publisher="http://www.inghist.nl/",
            url_biografie=url_prefix + encoded_name,
        )
        self.write_file(bdes, index)
def write(self, names):
    """Write one BioDes document per name, linking to the Gachard site.

    ``names`` is sorted in place and ``self.total`` is set to its length.
    Every name is passed through ``sanitize_name``; if the sanitized name
    was already processed it is reported via ``self.skip("dupe name")``.
    Otherwise a ``BioDesDoc`` is built and handed to ``self.write_file``
    with the one-based position of the name in the sorted list.
    """
    url_prefix = "http://www.inghist.nl/retroboeken/gachard/#accessor=toc&accessor_href=toc%3FSearchSource%253Austring%253Autf-8%3D%26van_aan%3D%26correspondent%253Austring%253Autf-8%3D"

    names.sort()
    self.total = len(names)

    for position, raw_name in enumerate(names, 1):
        clean_name = sanitize_name(raw_name)
        if self.name_already_processed(clean_name):
            self.skip("dupe name")
            continue

        # Percent-encode the UTF-8 bytes, then encode the result again.
        quoted_once = urllib.quote(clean_name.encode('utf8'))
        quoted_twice = urllib.quote(quoted_once)

        document = BioDesDoc()
        document.from_args(
            naam=clean_name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=url_prefix + quoted_twice,
        )
        self.write_file(document, position)
def get_biodes_documents(self):
    """Lazily yield one ``BioDesDoc().from_url(href)`` result per link.

    The links are the ``href`` attributes of all ``<a>`` elements found
    under ``self.root``.  What is yielded is whatever ``from_url``
    returns (presumably the loaded document; confirm against BioDesDoc).
    """
    hrefs = self.root.xpath('//a/@href')
    for href in hrefs:
        loaded = BioDesDoc().from_url(href)
        yield loaded