def process(self):
    """Write one BioDes document for every uniquely named input record.

    The string returned by ``self._get_input_data()`` is evaluated as a
    Python expression and is expected to yield a sequence of indexable
    records.  Records with an empty name, or with a name that
    ``self.name_already_processed`` reports as seen, are passed to
    ``self.skip`` and produce no file.
    """
    # NOTE(review): eval() executes whatever _get_input_data() returns, so
    # this is only safe while that input is fully trusted.
    records = eval(self._get_input_data())
    self.total = len(records)

    for position, person in enumerate(records):
        number = position + 1  # output files are numbered from 1

        # Fields 1, 2 and 0 (in that order) form the name; None is left out.
        parts = [person[1], person[2], person[0]]
        name = " ".join([part for part in parts if part is not None]).strip()
        if not name:
            self.skip("empty name")
            continue

        name = sanitize_name(name)
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        # The URL is built from the still-undecoded name; only afterwards is
        # the name turned into unicode for the document itself.
        biourl = (
            "http://www.inghist.nl/Onderzoek/Projecten/WVO/brieven?af_naam_vol="
            + urllib.quote(name)
        )
        name = name.decode('latin1')

        fields = {
            "naam": name,
            "naam_publisher": "Instituut voor Nederlandse Geschiedenis",
            "url_biografie": biourl,
            "url_publisher": "http://www.inghist.nl/",
        }
        bdes = biodes.BioDesDoc()
        bdes.from_args(**fields)
        self.write_file(bdes, number)
def process(self):
    """Write one BioDes document for every uniquely named input record.

    The string returned by ``self._get_input_data()`` is evaluated as a
    Python expression and is expected to yield a sequence of indexable
    records in which field 1 is the latin-1 encoded name and field 2 the
    latin-1 encoded text (or None).  Records whose sanitized name
    ``self.name_already_processed`` reports as seen are passed to
    ``self.skip`` and produce no file.
    """
    # NOTE(review): eval() executes whatever _get_input_data() returns, so
    # this is only safe while that input is fully trusted.
    data = eval(self._get_input_data())
    self.total = len(data)

    for index, person in enumerate(data):
        index += 1  # progress and output files are numbered from 1
        self.print_progress(index)

        name = sanitize_name(person[1].decode('latin1'))
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        # The name lands inside the already percent-encoded accessor_href
        # value, hence the double quoting.  The escaped form - not the raw
        # unicode name - is what must be spliced into the URL.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        # NOTE(review): "zoekveld=abdul" in the query string looks like a
        # leftover test value; it is kept unchanged here - confirm.
        biourl = (
            "http://www.inghist.nl/retroboeken/nib/?zoekveld=abdul"
            "&soort=persoon#accessor=cumulatieveindex&accessor_href=CumulatieveIndex%2FPersonenIndex%3Fzoekveld%3D"
            + encoded_name
            + "%26soort%3Dpersoon"
        )

        # A record without text is still written, just without a text field.
        text = person[2]
        if text is not None:
            text = text.decode('latin1')

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
            text=text,
        )
        self.write_file(bdes, index)
def read(self):
    """Collect names from the ``title/from`` and ``title/to`` XML elements.

    Parses ``in/1.xml``, ``in/2.xml`` and ``in/3.xml`` and looks at every
    ``item`` element directly below the document root.

    Returns:
        A list with the sanitized name of every ``title/from`` and
        ``title/to`` element for which ``self.name_already_processed``
        returned a false value.  Missing elements, elements without text
        and names that are empty after sanitizing are left out.
    """
    names = []
    for path in ('in/1.xml', 'in/2.xml', 'in/3.xml'):
        # Close the file as soon as it has been parsed.
        with open(path, 'r') as handle:
            tree = etree.parse(handle)

        for bio in tree.xpath("/*/item"):
            for field in ('title/from', 'title/to'):
                matches = bio.xpath(field)
                # An absent element or one without text gives no name;
                # never hand None on to the caller.
                if not matches or matches[0].text is None:
                    continue
                name = sanitize_name(matches[0].text)
                if not name:
                    continue
                if not self.name_already_processed(name):
                    names.append(name)
    return names
def process(self):
    """Write one BioDes document for every usable input record.

    The string returned by ``self._get_input_data()`` is evaluated as a
    Python expression and is expected to yield a sequence of indexable
    records.  A record is passed to ``self.skip`` (and produces no file)
    when field 0 is falsy, when its name is empty, or when
    ``self.name_already_processed`` reports the name as seen.
    """
    # NOTE(review): eval() executes whatever _get_input_data() returns, so
    # this is only safe while that input is fully trusted.
    records = eval(self._get_input_data())
    self.total = len(records)

    for number, person in enumerate(records, 1):
        person_id = person[0]
        if not person_id:
            self.skip("no id")
            continue

        # Fields 3, 4 and 1 (in that order) form the name; None is left out.
        parts = [person[3], person[4], person[1]]
        name = " ".join([part for part in parts if part is not None]).strip()
        if not name:
            self.skip("empty name")
            continue

        name = sanitize_name(name)
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        name = name.decode('latin1')
        # Echoes every accepted name to stdout.
        print(repr(name))

        # Falsy source values become None instead of being converted.
        place, born, died = person[9], person[10], person[13]
        bplace = (place.decode('latin1') or None) if place else None
        bdate = (str(born) or None) if born else None
        ddate = (str(died) or None) if died else None

        text = person[15]
        if text is not None:
            # NUL bytes are removed before decoding.
            text = text.replace("\x00", "").decode('latin1')

        biourl = "http://www.inghist.nl/Onderzoek/Projecten/Egodocumenten/persoon_detail/%s" % person_id

        fields = {
            "naam": name,
            "naam_publisher": "Instituut voor Nederlandse Geschiedenis",
            "url_biografie": biourl,
            "url_publisher": "http://www.inghist.nl/",
            "birth_place": bplace,
            "birth_date": bdate,
            "death_date": ddate,
            "text": text,
        }
        bdes = biodes.BioDesDoc()
        bdes.from_args(**fields)
        self.write_file(bdes, number)
def process(self, people_dict):
    """Write one BioDes document per unique name in ``people_dict``.

    Args:
        people_dict: mapping from a name to a dict that has at least the
            keys ``'born'`` and ``'dead'``.

    Names are handled in sorted order and a progress line is printed for
    each one.  When ``self.name_already_processed`` reports the sanitized
    name as seen, ``self.skipped`` is incremented and nothing is written.
    """
    ordered_names = sorted(people_dict.keys())
    count = len(ordered_names)
    self.total = count

    base_production = "http://www.inghist.nl/retroboeken/groen/"
    anchor = "#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D"

    for number, raw_name in enumerate(ordered_names, 1):
        info = people_dict[raw_name]
        print("processing: %s/%s - %s" % (number, count, raw_name))

        name = sanitize_name(raw_name)
        if self.name_already_processed(name):
            self.skipped += 1
            continue

        # The name is UTF-8 encoded and then percent-quoted twice.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        url = base_production + anchor + encoded_name

        bdes = BioDesDoc()
        args = {
            "naam": name,
            "naam_publisher": "XXX",
            "url_publisher": "http://XXX.nl",
            "url_biografie": url,
        }

        # A date is only passed on when the document accepts it as a date.
        candidates = (
            ("geboortedatum", info['born']),
            ("sterfdatum", info['dead']),
        )
        for key, value in candidates:
            if bdes.is_date(value):
                args[key] = value

        bdes.from_args(**args)
        self.write_file(bdes, number)
def process(self):
    """Write one BioDes document for every uniquely named input record.

    The string returned by ``self._get_input_data()`` is evaluated as a
    Python expression and is expected to yield a sequence of indexable
    records.  Records whose sanitized name ``self.name_already_processed``
    reports as seen are passed to ``self.skip`` and produce no file.
    """
    # NOTE(review): eval() executes whatever _get_input_data() returns, so
    # this is only safe while that input is fully trusted.
    records = eval(self._get_input_data())
    self.total = len(records)

    # Field 5 holds 'm' or 'v'; anything else maps to None.
    sex_codes = {'m': 1, 'v': 2}

    for number, person in enumerate(records, 1):
        person_id = person[0]

        # Fields 4, 2, 3 and 1 (in that order) form the name, each followed
        # by a space; None is left out.
        parts = (person[4], person[2], person[3], person[1])
        name = "".join([part + " " for part in parts if part is not None])
        name = sanitize_name(name.decode('latin1'))
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        biourl = "http://www.inghist.nl/Onderzoek/Projecten/KPP/PersoonDetail?Id=%s" % person_id
        sex = sex_codes.get(person[5])

        # The text is the non-empty parts of fields 16-18; field 14 is the
        # fallback when that yields nothing.
        pieces = [piece for piece in (person[16], person[17], person[18]) if piece]
        text = ' '.join(pieces).strip().decode('latin1')
        if not text:
            text = person[14]
            if text:
                text = text.decode('latin1')

        fields = {
            "naam": name,
            "naam_publisher": "Instituut voor Nederlandse Geschiedenis",
            "url_biografie": biourl,
            "url_publisher": "http://www.inghist.nl/",
            "sex": sex,
            "text": text,
        }
        bdes = biodes.BioDesDoc()
        bdes.from_args(**fields)
        self.write_file(bdes, number)
def process(self):
    """Write one BioDes document for every usable input record.

    The string returned by ``self._get_input_data()`` is evaluated as a
    Python expression and is expected to yield a sequence of indexable
    records.  A record is passed to ``self.skip`` (and produces no file)
    when field 0 is falsy, when field 1 is None, when
    ``self.name_already_processed`` reports the name as seen, or when the
    name contains "Berger, L.M., zie Morisset".
    """
    # eval() below resolves names against this function's locals, so this
    # stub must stay defined before the call.  Presumably the input contains
    # DateTime(...) calls, which then evaluate to None - confirm.
    def DateTime(x):
        return None

    # NOTE(review): eval() executes whatever _get_input_data() returns, so
    # this is only safe while that input is fully trusted.
    records = eval(self._get_input_data())
    self.total = len(records)

    for number, person in enumerate(records, 1):
        self.print_progress(number)

        person_id, name = person[0], person[1]
        if not person_id:
            self.skip("id is None")
            continue
        if name is None:
            self.skip("name is None")
            continue

        name = sanitize_name(name.decode('latin1'))
        if self.name_already_processed(name):
            self.skip("duplicate name: %s" % name)
            continue

        biourl = (
            "http://www.inghist.nl/Onderzoek/Projecten/RapportenCentraleInlichtingendienst1919-1940/data/GeavanceerdResult.html?batch_size=15&persoon="
            + urllib.quote(name.encode('utf8'))
        )

        text = person[7]
        if text is not None:
            text = text.strip().decode('latin1')

        # This one entry is excluded by name; it is checked only after the
        # duplicate test above has already seen the name.
        if "Berger, L.M., zie Morisset" in name:
            self.skip("name causing unknwon encoding error: %s" % name)
            continue

        fields = {
            "naam": name,
            "naam_publisher": "Instituut voor Nederlandse Geschiedenis",
            "url_biografie": biourl,
            "url_publisher": "http://www.inghist.nl/",
            "text": text,
        }
        bdes = biodes.BioDesDoc()
        bdes.from_args(**fields)
        self.write_file(bdes, number)
def read(self):
    """Build a name -> ``{"url": ...}`` mapping from ``in/input.xml``.

    Every ``item`` element directly below the document root is inspected.
    Only titles that contain the separator `` aan `` exactly once are used;
    the text on either side of it is sanitized and stored as a name.

    Returns:
        dict mapping each sanitized name to a dict with the single key
        ``"url"``.  A name seen more than once keeps the last URL built
        for it.
    """
    base_production = "http://www.inghist.nl/retroboeken/staatsregeling/"
    anchor = "#accessor=toc1&accessor_href=toc1%3FSearchSource%3D"
    separator = ' aan '

    # Close the file as soon as it has been parsed.
    with open('in/input.xml', 'r') as handle:
        tree = etree.parse(handle)

    people_dict = {}
    for bio in tree.xpath("/*/item"):
        title = bio.find('title')
        if title is None or title.text is None:
            continue
        text = title.text
        if text.count(separator) != 1:
            continue

        # Split on the same separator the guard counted.  Splitting on a
        # bare 'aan' would also cut names that merely contain those letters.
        for part in text.split(separator):
            name = sanitize_name(part)
            if not name:
                continue
            # The name is UTF-8 encoded and then percent-quoted twice.
            encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
            people_dict[name] = {"url": base_production + anchor + encoded_name}
    return people_dict
def write(self, names):
    """Write one BioDes document per unique name in ``names``.

    Args:
        names: list of names; it is sorted in place, so the caller's list
            is reordered as a side effect.

    Names whose sanitized form ``self.name_already_processed`` reports as
    seen are passed to ``self.skip`` and produce no file.
    """
    names.sort()
    self.total = len(names)

    url_prefix = "http://www.inghist.nl/retroboeken/gachard/#accessor=toc&accessor_href=toc%3FSearchSource%253Austring%253Autf-8%3D%26van_aan%3D%26correspondent%253Austring%253Autf-8%3D"

    for number, raw_name in enumerate(names, 1):
        name = sanitize_name(raw_name)
        if self.name_already_processed(name):
            self.skip("dupe name")
            continue

        # The name is UTF-8 encoded and then percent-quoted twice.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))

        bdes = BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="XXX",
            url_publisher="http://XXX.nl",
            url_biografie=url_prefix + encoded_name,
        )
        self.write_file(bdes, number)
def process(self):
    """Write one BioDes document for every uniquely named ``item`` in INPUT.

    ``INPUT`` is parsed as XML and every ``item`` element is visited.  The
    text of its first ``name`` child is sanitized and stripped of trailing
    ``(`` characters and spaces.  Names that end up empty, or that
    ``self.name_already_processed`` reports as seen, are passed to
    ``self.skip``.  An ``etree.XMLSyntaxError`` raised while writing is
    also turned into a skip rather than aborting the run.
    """
    tree = etree.parse(INPUT)
    entries = tree.xpath("//item")
    self.total = len(entries)

    for index, person in enumerate(entries):
        index += 1  # progress and output files are numbered from 1
        self.print_progress(index)

        name_nodes = person.xpath("name")
        raw_name = name_nodes[0].text if name_nodes else None
        if not raw_name:
            # NOTE(review): items without any name text are dropped without
            # going through self.skip(), as before - confirm this is meant.
            continue

        # Drop a dangling "(" together with the space that preceded it, so
        # the name carries no trailing whitespace.
        name = sanitize_name(raw_name).rstrip('( ')

        if not name:
            self.skip("empty name")
            continue
        if self.name_already_processed(name):
            self.skip("duplicate name")
            continue

        # Built only for records that are actually written.  The name is
        # UTF-8 encoded and then percent-quoted twice.
        encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
        biourl = "http://www.inghist.nl/retroboeken/schutte/#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D" + encoded_name

        bdes = biodes.BioDesDoc()
        bdes.from_args(
            naam=name,
            naam_publisher="Instituut voor Nederlandse Geschiedenis",
            url_biografie=biourl,
            url_publisher="http://www.inghist.nl/",
        )
        try:
            self.write_file(bdes, index)
        except etree.XMLSyntaxError as err:
            self.skip(str(err))