Example #1
0
    def process(self):
        """Write one BioDes document per person record in the input.

        The string returned by ``self._get_input_data()`` is evaluated as a
        Python expression yielding a sequence of records.  A display name is
        assembled from fields 1, 2 and 0 (in that order, ``None`` fields
        left out).  Records with an empty name, or with a name reported by
        ``self.name_already_processed`` as seen before, are skipped.
        """
        # NOTE(review): eval() executes whatever the input contains; this is
        # only acceptable while the input data is fully trusted.
        records = eval(self._get_input_data())
        self.total = len(records)
        for position, person in enumerate(records):
            index = position + 1  # 1-based index handed to write_file

            parts = [part for part in (person[1], person[2], person[0])
                     if part is not None]
            name = " ".join(parts).strip()
            if not name:
                self.skip("empty name")
                continue
            name = sanitize_name(name)
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            # The URL is built from the name before it is decoded; the
            # latin1 decoding only applies to the value stored in the document.
            biourl = ("http://www.inghist.nl/Onderzoek/Projecten/WVO/brieven?af_naam_vol="
                      + urllib.quote(name))
            name = name.decode('latin1')

            bdes = biodes.BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
            )
            self.write_file(bdes, index)
Example #2
0
    def process(self):
        """Write one BioDes document per person record in the input.

        The string returned by ``self._get_input_data()`` is evaluated as a
        Python expression yielding a sequence of records.  Field 1 is the
        name and field 2 the text; both are decoded from latin1.  Records
        whose sanitized name was already processed are skipped.
        """
        # NOTE(review): eval() executes whatever the input contains; this is
        # only acceptable while the input data is fully trusted.
        data = eval(self._get_input_data())
        self.total = len(data)
        for index, person in enumerate(data):
            index += 1
            self.print_progress(index)
            name = person[1]
            name = name.decode('latin1')
            name = sanitize_name(name)
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            # The name ends up inside an already percent-encoded
            # ``accessor_href`` value in the URL fragment, hence the double
            # quoting of its UTF-8 bytes.
            encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
            # NOTE(review): "zoekveld=abdul" looks like a leftover search
            # term -- confirm whether it is intentional.
            biourl = "http://www.inghist.nl/retroboeken/nib/?zoekveld=abdul"
            biourl += "&soort=persoon#accessor=cumulatieveindex&accessor_href=CumulatieveIndex%2FPersonenIndex%3Fzoekveld%3D"
            # Append the encoded form; the raw unicode name used previously
            # produced an invalid URL for names with spaces or non-ASCII
            # characters and left encoded_name unused.
            biourl += encoded_name
            biourl += "%26soort%3Dpersoon"

            text = person[2]
            text = text.decode('latin1')

            # ----
            bdes = biodes.BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                text=text,
            )
            self.write_file(bdes, index)
Example #3
0
    def read(self):
        """Collect the not-yet-processed names found in the input files.

        Parses ``in/1.xml``, ``in/2.xml`` and ``in/3.xml``.  For every
        ``/*/item`` element the text of ``title/from`` and ``title/to`` is
        passed through ``sanitize_name``.  Every resulting name for which
        ``self.name_already_processed`` returns a false value is added to
        the result.

        Returns:
            A list of names, in document order.
        """
        names = []
        for path in ('in/1.xml', 'in/2.xml', 'in/3.xml'):
            # Close the file explicitly instead of leaving it to the
            # garbage collector.
            source = open(path, 'r')
            try:
                tree = etree.parse(source)
            finally:
                source.close()

            for bio in tree.xpath("/*/item"):
                candidates = []
                for element_path in ('title/from', 'title/to'):
                    try:
                        candidates.append(
                            sanitize_name(bio.xpath(element_path)[0].text))
                    except IndexError:
                        # Element absent for this item: there is no name to
                        # record.  (Previously None was passed on and could
                        # end up in the returned list.)
                        continue

                for name in candidates:
                    if not self.name_already_processed(name):
                        names.append(name)
        return names
Example #4
0
    def process(self):
        """Write one BioDes document per person record in the input.

        The string returned by ``self._get_input_data()`` is evaluated as a
        Python expression yielding a sequence of records.  Field 0 is the
        id used in the biography URL; the name is assembled from fields 3,
        4 and 1; fields 9, 10, 13 and 15 supply birth place, birth date,
        death date and text.  Records without an id, with an empty name or
        with an already processed name are skipped.
        """
        # NOTE(review): eval() executes whatever the input contains; this is
        # only acceptable while the input data is fully trusted.
        records = eval(self._get_input_data())
        self.total = len(records)
        for position, person in enumerate(records):
            index = position + 1  # 1-based index handed to write_file

            # id
            person_id = person[0]
            if not person_id:
                self.skip("no id")
                continue

            # name
            parts = [part for part in (person[3], person[4], person[1])
                     if part is not None]
            name = " ".join(parts).strip()
            if not name:
                self.skip("empty name")
                continue
            name = sanitize_name(name)
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue
            name = name.decode('latin1')
            print(repr(name))

            # place and dates: falsy source values (and falsy conversion
            # results) are stored as None
            bplace = None
            if person[9]:
                bplace = person[9].decode('latin1') or None
            bdate = None
            if person[10]:
                bdate = str(person[10]) or None
            ddate = None
            if person[13]:
                ddate = str(person[13]) or None

            # text: NUL bytes are removed before decoding
            text = person[15]
            if text is not None:
                text = text.replace("\x00", "").decode('latin1')

            biourl = "http://www.inghist.nl/Onderzoek/Projecten/Egodocumenten/persoon_detail/%s" % person_id

            bdes = biodes.BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                birth_place=bplace,
                birth_date=bdate,
                death_date=ddate,
                text=text,
            )
            self.write_file(bdes, index)
Example #5
0
 def process(self, people_dict):
     """Write one BioDes document per entry of ``people_dict``.

     Names (the dict keys) are handled in sorted order.  Each value is
     expected to provide ``'born'`` and ``'dead'`` entries; they are only
     passed on when ``BioDesDoc.is_date`` accepts them.  Names already
     processed are counted in ``self.skipped`` and not written.
     """
     people = sorted(people_dict.keys())
     self.total = len(people)
     for position, name in enumerate(people):
         count = position + 1  # 1-based counter, also used as file index
         info = people_dict[name]
         print("processing: %s/%s - %s" % (count, len(people), name))
         name = sanitize_name(name)
         if self.name_already_processed(name):
             self.skipped += 1
             continue

         # URL: the name sits inside an already encoded accessor_href
         # value, hence the double quoting of its UTF-8 bytes.
         encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
         url = ("http://www.inghist.nl/retroboeken/groen/"
                + "#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D"
                + encoded_name)

         bdes = BioDesDoc()
         args = {
             'naam': name,
             'naam_publisher': "XXX",
             'url_publisher': "http://XXX.nl",
             'url_biografie': url,
         }

         # Only include the dates that the document class accepts.
         dated_fields = (
             ('geboortedatum', info['born']),
             ('sterfdatum', info['dead']),
         )
         for key, value in dated_fields:
             if bdes.is_date(value):
                 args[key] = value

         bdes.from_args(**args)
         self.write_file(bdes, count)
Example #6
0
    def process(self):
        """Write one BioDes document per person record in the input.

        The string returned by ``self._get_input_data()`` is evaluated as a
        Python expression yielding a sequence of records.  Field 0 is the
        id used in the biography URL; the name is assembled from fields 4,
        2, 3 and 1; field 5 holds the sex code; fields 16-18 (or, when they
        yield nothing, field 14) supply the text.
        """
        # NOTE(review): eval() executes whatever the input contains; this is
        # only acceptable while the input data is fully trusted.
        records = eval(self._get_input_data())
        self.total = len(records)
        sex_codes = {'m': 1, 'v': 2}  # any other value maps to None
        for position, person in enumerate(records):
            index = position + 1  # 1-based index handed to write_file
            person_id = person[0]

            # Every present part is followed by a single space; the result
            # is handed to sanitize_name without stripping first.
            pieces = [part + " "
                      for part in (person[4], person[2], person[3], person[1])
                      if part is not None]
            name = sanitize_name("".join(pieces).decode('latin1'))
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            biourl = "http://www.inghist.nl/Onderzoek/Projecten/KPP/PersoonDetail?Id=%s" % person_id
            sex = sex_codes.get(person[5])

            fragments = (person[16], person[17], person[18])
            text = ' '.join([fragment for fragment in fragments if fragment])
            text = text.strip().decode('latin1')
            if not text:
                # Fall back to field 14 when the three text fields are empty.
                text = person[14]
                if text:
                    text = text.decode('latin1')

            bdes = biodes.BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                sex=sex,
                text=text,
            )
            self.write_file(bdes, index)
Example #7
0
    def process(self):
        """Write one BioDes document per person record in the input.

        The string returned by ``self._get_input_data()`` is evaluated as a
        Python expression yielding a sequence of records.  Field 0 is the
        id, field 1 the name and field 7 the text.  Records without an id,
        without a name or with an already processed name are skipped, as is
        one specific name that is known to cause an encoding error.
        """
        # eval() below is called without explicit namespaces, so it can see
        # this local stub.  Presumably the input contains DateTime(...)
        # calls that must evaluate without the real class -- confirm.
        DateTime = lambda x: None
        # NOTE(review): eval() executes whatever the input contains; this is
        # only acceptable while the input data is fully trusted.
        records = eval(self._get_input_data())
        self.total = len(records)
        url_prefix = "http://www.inghist.nl/Onderzoek/Projecten/RapportenCentraleInlichtingendienst1919-1940/data/GeavanceerdResult.html?batch_size=15&persoon="
        for position, person in enumerate(records):
            index = position + 1  # 1-based index handed to write_file
            self.print_progress(index)

            person_id = person[0]
            name = person[1]
            if not person_id:
                self.skip("id is None")
                continue
            if name is None:
                self.skip("name is None")
                continue

            name = sanitize_name(name.decode('latin1'))
            if self.name_already_processed(name):
                self.skip("duplicate name: %s" % name)
                continue

            biourl = url_prefix + urllib.quote(name.encode('utf8'))

            text = person[7]
            if text is not None:
                text = text.strip().decode('latin1')

            # Checked only after the duplicate test, so the name has been
            # passed to name_already_processed before it is skipped here.
            if "Berger, L.M., zie Morisset" in name:
                self.skip("name causing unknwon encoding error: %s" % name)
                continue

            bdes = biodes.BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
                text=text,
            )

            self.write_file(bdes, index)
Example #8
0
 def read(self):
     """Map every correspondent name in ``in/input.xml`` to its index URL.

     Each ``/*/item`` element's ``title`` text is expected to have the form
     ``"<sender> aan <recipient>"``.  Titles that are missing or do not
     contain exactly one ``' aan '`` separator are ignored.

     Returns:
         A dict ``{sanitized name: {"url": url}}``.
     """
     base_production = "http://www.inghist.nl/retroboeken/staatsregeling/"
     anchor = "#accessor=toc1&accessor_href=toc1%3FSearchSource%3D"
     separator = ' aan '

     # Close the file explicitly instead of leaving it to the garbage
     # collector.
     source = open('in/input.xml', 'r')
     try:
         tree = etree.parse(source)
     finally:
         source.close()

     people_dict = {}
     for bio in tree.xpath("/*/item"):
         # Name
         text = bio.find('title').text
         if text is None or text.count(separator) != 1:
             continue
         # Split on the same space-delimited separator that was counted
         # above; splitting on a bare 'aan' would also cut names that merely
         # contain those letters (e.g. "Haan").
         for name in text.split(separator):
             name = sanitize_name(name)
             # URL: the name sits inside an already encoded accessor_href
             # value, hence the double quoting of its UTF-8 bytes.
             encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
             people_dict[name] = {"url": base_production + anchor + encoded_name}
     return people_dict
Example #9
0
    def write(self, names):
        """Write one BioDes document for every not-yet-processed name.

        ``names`` is sorted in place, so the caller's list is reordered.
        Each name is sanitized; names reported by
        ``self.name_already_processed`` as seen before are skipped.
        """
        names.sort()
        self.total = len(names)
        url_prefix = "http://www.inghist.nl/retroboeken/gachard/#accessor=toc&accessor_href=toc%3FSearchSource%253Austring%253Autf-8%3D%26van_aan%3D%26correspondent%253Austring%253Autf-8%3D"
        for position, raw_name in enumerate(names):
            index = position + 1  # 1-based index handed to write_file

            name = sanitize_name(raw_name)
            if self.name_already_processed(name):
                self.skip("dupe name")
                continue

            # URL: the name sits inside an already encoded accessor_href
            # value, hence the double quoting of its UTF-8 bytes.
            url = url_prefix + urllib.quote(urllib.quote(name.encode('utf8')))

            bdes = BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="XXX",
                url_publisher="http://XXX.nl",
                url_biografie=url,
            )
            self.write_file(bdes, index)
Example #10
0
    def process(self):
        tree = etree.parse(INPUT)
        entries = tree.xpath("//item")
        self.total = len(entries)
        for index, person in enumerate(entries):
            index += 1
            self.print_progress(index)
            name = person.xpath("name")[0].text
            if not name:
                continue
            name = sanitize_name(name)
            while name.endswith('('):
                name = name[:-1]

            encoded_name = urllib.quote(urllib.quote(name.encode('utf8')))
            biourl = "http://www.inghist.nl/retroboeken/schutte/#accessor=accessor_index&accessor_href=accessor_index%3FSearchSource%253Autf-8%253Austring%3D" + encoded_name

            # skip
            if not name:
                self.skip("empty name")
                continue
            if self.name_already_processed(name):
                self.skip("duplicate name")
                continue

            bdes = biodes.BioDesDoc()
            bdes.from_args(
                naam=name,
                naam_publisher="Instituut voor Nederlandse Geschiedenis",
                url_biografie=biourl,
                url_publisher="http://www.inghist.nl/",
            )
            try:
                self.write_file(bdes, index)
            except etree.XMLSyntaxError, err:
                self.skip(str(err))
                continue