def toDB(rec):
    db = {}
    db['city'] = 'badenwuerttemberg'  # Baden-Württemberg is not a city ?!
    db['source'] = 'd'
    db['costs'] = None

    db['categories'] = categoryToODM(rec['category'])
    db['url'] = rec['page-url']
    db['title'] = rec['title']
    db['description'] = rec['description']
    db['publisher'] = rec['herausgeber']
    db['filelist'] = [extractUrl(rec['url'])]
    db['formats'] = formatToODM(rec['format'])
    db['licenseshort'] = licenseToODM(rec['nutzungsbedingungen'])
    temps = [x for x in [rec['zeitraum'], rec['stichtag'], rec['publiziert am']]
             if x != ""]
    db['temporalextent'] = temps[0] if temps else None

    db['open'] = metautils.isopen(db['licenseshort'])
    db['spatial'] = False

    db['metadata'] = ''  # db.copy() - not worth it
    db['metadata_xml'] = None

    return db
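The temporal extent above is simply the first non-empty value among rec['zeitraum'], rec['stichtag'] and rec['publiziert am']. A minimal, standalone sketch of that selection (the sample record below is made up, not portal data):

# Hypothetical sample record: 'zeitraum' is empty, so 'stichtag' is used.
rec = {'zeitraum': '', 'stichtag': '31.12.2013', 'publiziert am': '15.01.2014'}
temps = [x for x in [rec['zeitraum'], rec['stichtag'], rec['publiziert am']] if x != ""]
temporalextent = temps[0] if temps else None
print(temporalextent)  # 31.12.2013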
Example #3
def import_data(rec):
    rec['originating_portal'] = portalname
    rec['city'] = city
    rec['source'] = 'd'
    rec['publisher'] = ''
    rec['description'] = None
    rec['costs'] = None
    rec['metadata_xml'] = None
    rec['spatial'] = False
    rec['categories'] = [category_to_odm_map[rec['categories']]]
    rec['filelist'] = []
    rec['metadata'] = ''

    # according to http://www.arnsberg.de/open-data/nutzungsbedingungen.php
    # nothing seems to be marked different
    rec['licenseshort'] = 'dl-de-zero-2.0'
    rec['open'] = metautils.isopen(rec['licenseshort'])

    # If a year of the 21st century is in the title, use it as the temporalextent
    # instead of the date the file was added.
    # This is inconsistent, but still better?
    t = re.search(r'20\d\d', rec['title'])
    if t: rec['temporalextent'] = t.group(0)

    return rec
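The year heuristic above can be tried in isolation. A small, standalone sketch (the sample titles are made up):

import re

# Hypothetical titles; re.search picks up the first 21st-century year (20xx), if any.
for title in ['Bevoelkerungsstand 2012', 'Haushaltsplan 2014/2015', 'Strassenverzeichnis']:
    t = re.search(r'20\d\d', title)
    print(title + ': ' + (t.group(0) if t else 'no year found'))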
Example #5
    vstellekey = 'author'
    catskey = 'tags'
    catssubkey = 'name'
# Generate URL for the catalog page
row[u'URL PARENT'] = url + '/dataset/' + package['name']
if 'notes' in package and package['notes'] is not None:
    row[u'Beschreibung'] = package['notes']
    if cityname == 'koeln':
        row[u'Beschreibung'] = metautils.unrenderhtml(row[u'Beschreibung'])
else:
    row[u'Beschreibung'] = ''
row[u'Zeitlicher Bezug'] = ''
if licensekey in package and package[licensekey] is not None:
    row[u'Lizenz'] = package[licensekey]
    # If not already short, try to convert
    if metautils.isopen(row[u'Lizenz'], quiet=True) is None:
        row[u'Lizenz'] = metautils.long_license_to_short(row[u'Lizenz'])
else:
    row[u'Lizenz'] = 'nicht bekannt'
if vstellekey in package and package[vstellekey] is not None:
    row[u'Veröffentlichende Stelle'] = package[vstellekey]
else:
    row[u'Veröffentlichende Stelle'] = ''
    if 'extras' in package:
        print 'WARNING: No author/maintainer/publisher, checking extras'
        for extra in package['extras']:
            if extra['key'] == 'contacts':
                print 'WARNING: No author, but amazingly there is possibly data in the contacts: ' + extra['value']
for group in metautils.setofvaluesasarray(package[catskey], catssubkey):
    if cityname != 'berlin':
        odm_cats = metautils.govDataLongToODM(group)
            if len(row.xpath('td[2]/text()')) != 0:
                val = row.xpath('td[2]/text()')[0]
            elif len(row.xpath('td[2]//a')) != 0: 
                val = row.xpath('td[2]//a/text()')[0]
            else:
                if (verbose): print 'ERROR: Missing value'
                print 'Exiting because of a serious error - turn on verbose in the code to find out what dataset is causing the problem'
                exit()
            if (verbose): print 'Parsing key ' + key.replace(':', '') + ' with value ' + val
            if u'veröffentlicht' in key:
                record['publisher'] = val
            elif u'geändert' in key:
                record['temporalextent'] = val.split(' ')[2]
            elif u'Lizenz' in key:
                record['licenseshort'] = metautils.long_license_to_short(val)
                record['open'] = metautils.isopen(record['licenseshort'])
            elif u'Webseite' in key:
                record['website'] = row.xpath('td[2]//a/@href')[0] #keep, as 'original' metadata
                if 'http://' not in record['website']:
                    record['website'] = rooturl + record['website']
            elif u'Kontakt' in key:
                record['contact'] = rooturl + row.xpath('td[2]//a/@href')[0]

        allrecords.append(record)
                
#Find things in multiple categories
recordsdict = {}
for record in allrecords:
    if record['title'] not in recordsdict:
        recordsdict[record['title']] = record
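The else branch of this loop is cut off in the listing; presumably records that share a title are merged rather than dropped. A minimal, hypothetical sketch of such a dedup-and-merge step (the merge rule here is an assumption, not the original code):

# Two hypothetical records with the same title; categories are combined under the first one.
allrecords = [
    {'title': 'Haushalt', 'categories': ['Finanzen']},
    {'title': 'Haushalt', 'categories': ['Transparenz']},
]
recordsdict = {}
for record in allrecords:
    if record['title'] not in recordsdict:
        recordsdict[record['title']] = record
    else:
        # Assumed merge rule: collect all categories on the first record seen.
        recordsdict[record['title']]['categories'].extend(record['categories'])
print(recordsdict['Haushalt']['categories'])  # ['Finanzen', 'Transparenz']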
Example #8
metautils.setsettings(settings)

print '\nMarking all Bonn Google data as rejected (needs to be changed if Google searches are ever resumed!)'
cur = metautils.getDBCursor(settings, dictCursor = True)
cur.execute('update data set accepted = %s where city = %s and source = %s', (False,'bonn','g'))
metautils.dbCommit()

print '\nResetting open...'
cur = metautils.getDBCursor(settings, dictCursor = True)
cur.execute('select url, licenseshort from data')
for ores in cur.fetchall():
    if ores['licenseshort'].strip() == '':
        license = 'nicht bekannt'
        open = None
    else:
        open = metautils.isopen(ores['licenseshort'].strip())
        license = ores['licenseshort'].strip()
    cur.execute('update data set licenseshort = %s, open = %s where url = %s', (license, open, ores['url']))
metautils.dbCommit()

print 'Finding cities with data...'
cities = metautils.getCitiesWithData()
print cities

print '\nRemoving search machine data that has been found with own crawler...'
for city in cities:
    cur = metautils.getDBCursor(settings, dictCursor = True)
    
    #Get all Google and Bing data to see if the files have also been found by crawling
    cur.execute('SELECT source, url FROM data WHERE city LIKE %s AND (source = %s OR source = %s) AND accepted = %s', (city,'b','g', True))
    gbres = cur.fetchall()
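The snippet stops right after fetching the search-engine rows; according to the comments, the goal is to drop Google/Bing entries whose URLs were also found by the project's own crawler. A standalone sketch of that URL matching with hypothetical data (not the original continuation of the script):

# Hypothetical URL sets; in the real script both sides come from database queries.
crawler_urls = set(['http://example.org/a.csv', 'http://example.org/b.csv'])
searchengine_rows = [
    {'source': 'g', 'url': 'http://example.org/a.csv'},
    {'source': 'b', 'url': 'http://example.org/c.csv'},
]
# Search-engine hits that the crawler also found are redundant and could be rejected.
redundant = [r for r in searchengine_rows if r['url'] in crawler_urls]
print(redundant)  # only the 'g' row matches here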