Example #1
def import_data(rec):
    rec['originating_portal'] = portalname
    rec['city'] = city
    rec['source'] = 'd'
    rec['publisher'] = ''
    rec['description'] = None
    rec['costs'] = None
    rec['metadata_xml'] = None
    rec['spatial'] = False
    rec['categories'] = [category_to_odm_map[rec['categories']]]
    rec['filelist'] = []
    rec['metadata'] = ''

    # according to http://www.arnsberg.de/open-data/nutzungsbedingungen.php
    # nothing seems to be marked differently
    rec['licenseshort'] = 'dl-de-zero-2.0'
    rec['open'] = metautils.isopen(rec['licenseshort'])

    # If a year of the 21st century appears in the title, use it as the
    # temporal extent instead of the date the file was added.
    # This is inconsistent, but still better?
    t = re.search(r'20\d\d', rec['title'])
    if t: rec['temporalextent'] = t.group(0)

    return rec
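The year extraction at the end is the only non-obvious step of this importer. A self-contained sketch of just that step, using the same regex as above with a made-up title:

import re

title = 'Einwohnerstatistik 2014 nach Stadtteilen'  # hypothetical example title
m = re.search(r'20\d\d', title)
temporalextent = m.group(0) if m else None  # -> '2014'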
Example #2
def recordToDB(rec):
    db = {}
    db['city'] = 'braunschweig'
    db['source'] = 'd'
    db['costs'] = None

    db['url'] = rec['url']
    db['title'] = rec['title']
    db['description'] = rec['abstract']
    db['temporalextent'] = rec['created']
    db['publisher'] = rec['organisation']
    db['filelist'] = rec['filelist']

    db['formats'] = formatsToODM(rec['formats'])
    db['categories'] = categoriesToODM(rec['topic category'])
    db['licenseshort'] = licenseToODM(rec['rights'])
    db['open'] = metautils.isopen(db['licenseshort'])
    db['spatial'] = isSpatialFormat(db['formats'])

    additionalMetadata = [
        'accessRights', 'modified', 'spatials', 'type', 'subjects',
        'categoriesB'
    ]
    db['metadata'] = dict(db.items() +
                          {key: rec[key]
                           for key in additionalMetadata}.items())

    # XML metadata only includes data from the catalog API?!
    db['metadata_xml'] = rec['xml']
    return db
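The metadata line merges two dicts with the Python 2 idiom dict(a.items() + b.items()); under Python 3, items() returns views that cannot be concatenated with +. A minimal version-independent sketch of the same merge (the field values are made up):

db = {'city': 'braunschweig', 'source': 'd'}
extra = {'modified': '2015-01-01', 'accessRights': 'public'}

# Equivalent to dict(db.items() + extra.items()) from the example above
merged = dict(db)
merged.update(extra)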
Example #3
    def import_data(self, d):
        d = metautils.gerToEngKeys(d)
        d['originating_portal'] = portalname
        d['accepted'] = True
        d['source'] = 'd'
        d['metadata_xml'] = None
        d['costs'] = None
        d['spatial'] = None
        d['open'] = metautils.isopen(d.get('licenseshort', '').strip())
        d['temporalextent'] = ''  # check whether it's there
        return d
Example #4
    def import_data(self, rec):
        d = imp_rec(rec)
        d = metautils.gerToEngKeys(d)
        d['open'] = metautils.isopen(d.get('licenseshort', '').strip())

        d['json'] = ''
        d['publisher'] = ''
        d['originating_portal'] = 'daten.ulm.de'
        d['accepted'] = True
        d['source'] = 'd'
        d[u'metadata_xml'] = rec.get('metadata_xml', '')
        return d
Example #5
    def import_data(self, d):
        d = import_package(d)
        d = metautils.gerToEngKeys(d)
        d = dict(d)
        d['originating_portal'] = portalname
        d['accepted'] = True
        d['source'] = 'd'
        d['metadata_xml'] = None
        d['costs'] = None
        d['spatial'] = None
        d['open'] = metautils.isopen(d['licenseshort'].strip())
        d['publisher'] = ''  # actually it's in the data
        d['filelist'] = d['files']
        return d
Example #6
    def import_data(self, d):
        d = import_package(d)
        d = metautils.gerToEngKeys(d)
        d['originating_portal'] = portalname
        d['accepted'] = True
        d['costs'] = None
        d['open'] = metautils.isopen(d['licenseshort'])
        d['publisher'] = None
        d['spatial'] = None
        d['source'] = 'd'
        d['metadata_xml'] = None
        d['filelist'] = d['files']

        return d
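Example #7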
    def import_data(self, d):
        d = importCity(self.city, self.url, d)
        if d != {}:
            d = metautils.gerToEngKeys(d)
            d = dict(d)
            d['originating_portal'] = self.portalname
            d['accepted'] = True
            d['costs'] = None
            d['spatial'] = None
            d['source'] = 'd'
            d['metadata_xml'] = None
            d['formats'] = list(d['formats'])
            d['open'] = metautils.isopen(d['licenseshort'].strip())
            if 'categories' not in d:
                d['categories'] = []
            d['filelist'] = d['files']
        return d
Example #8
    def import_data(self, d):
        d = importCity(self.city, self.url, d)
        if d != {}:
            d = metautils.gerToEngKeys(d)
            d = dict(d)
            d['city'] = self.city
            d['originating_portal'] = self.portalname
            d['accepted'] = True
            d['costs'] = None
            d['spatial'] = None
            d['source'] = 'd'
            d['metadata_xml'] = None
            d['formats'] = list(d['formats'])
            d['open'] = metautils.isopen(d['licenseshort'].strip())
            if 'categories' not in d:
                d['categories'] = []
            d['filelist'] = d['files']
        return d
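Example #9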
def toDB(rec):
    db = {}
    db['city'] = 'badenwuerttemberg'  # Baden-Württemberg is not a city?!
    db['source'] = 'd'
    db['costs'] = None

    db['categories'] = categoryToODM(rec['category'])
    db['url'] = rec['url']
    db['title'] = rec['title']
    db['description'] = rec['description']
    db['publisher'] = rec['herausgeber']
    db['filelist'] = [extractUrl(rec['file-url'])]
    db['formats'] = formatToODM(rec['format'])
    db['licenseshort'] = licenseToODM(rec['nutzungsbedingungen'])
    temps = filter(lambda x: x != "",
                   [rec['zeitraum'], rec['stichtag'], rec['publiziert am']])
    db['temporalextent'] = temps[0] if temps else None
    db['open'] = metautils.isopen(db['licenseshort'])
    db['spatial'] = False

    db['metadata'] = ''
    db['metadata_xml'] = None

    return db
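The temporalextent lookup above takes the first non-empty value from several candidate date fields. A standalone sketch of that fallback pattern (the record values are illustrative):

def first_nonempty(values):
    # Return the first non-empty entry, or None if all are empty
    for v in values:
        if v != "":
            return v
    return None

rec = {'zeitraum': '', 'stichtag': '2014-06-30', 'publiziert am': '2014-07-01'}
print(first_nonempty([rec['zeitraum'], rec['stichtag'], rec['publiziert am']]))
# -> 2014-06-30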
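Example #11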
def importCity(cityname, url, package):
    if cityname == 'hamburg':
        # Only take 'open data'
        if package['type'] != 'dataset' or 'forward-reference' in package['title']:
            return {}

    resources = []
    formats = set()
    files = []
    # Key for the file link in the resource
    urlkeys = ['url']
    formatkey = 'format'

    if ('resources' in package):
        resources = package['resources']

    for file in resources:
        for urlkey in urlkeys:
            if (file[urlkey] not in [None, '']):
                if '://' not in file[urlkey]:
                    files.append(url + file[urlkey])
                else:
                    files.append(file[urlkey])
                break
        if formatkey in file and file[formatkey] not in [None, '']:
            format = file[formatkey]
            formats.add(format.upper())

    row = {}

    row[u'Stadt'] = cityname
    row[u'Dateibezeichnung'] = package['title']
    row[u'URL PARENT'] = url + '/dataset/' + package['name']
    if cityname in ('hamburg', 'koeln', 'frankfurt', 'aachen', 'berlin', 'muenchen'):
        if cityname in ('hamburg', 'frankfurt', 'aachen'):
            licensekey = 'license_id'
            vstellekey = 'author'
            catskey = 'groups'
            catssubkey = 'title'
        elif cityname == 'muenchen':
            licensekey = 'license_id'
            vstellekey = 'maintainer'
            catskey = 'groups'
            catssubkey = 'title'
        elif cityname in ('koeln', 'berlin'):
            licensekey = 'license_title'
            vstellekey = 'maintainer'
            if cityname == 'koeln':
                catskey = 'tags'
            elif cityname == 'berlin':
                catskey = 'groups'
            catssubkey = 'name'
        # Generate URL for the catalog page
        if 'notes' in package and package['notes'] != None:
            row[u'Beschreibung'] = package['notes']
            if cityname == 'koeln':
                soup = BeautifulSoup(row[u'Beschreibung'])
                row[u'Beschreibung'] = soup.getText('\n')
        else:
            row[u'Beschreibung'] = ''
        row[u'Zeitlicher Bezug'] = ''
        if licensekey in package and package[licensekey] != None:
            row[u'Lizenz'] = package[licensekey]
            # if not already short, try to convert
            if metautils.isopen(row[u'Lizenz']) == 'Unbekannt':
                row[u'Lizenz'] = metautils.long_license_to_short(row[u'Lizenz'])
        else:
            row[u'Lizenz'] = 'nicht bekannt'
        if vstellekey in package and package[vstellekey] != None:
            row[u'Veröffentlichende Stelle'] = package[vstellekey]
        else:
            row[u'Veröffentlichende Stelle'] = ''
            if 'extras' in package:
                print 'WARNING: No author/maintainer/publisher, checking extras'
                for extra in package['extras']:
                    if extra['key'] == 'contacts':
                        print 'WARNING: No author, but amazingly there is possibly data in the contacts: ' + extra['value']
        for group in metautils.setofvaluesasarray(package[catskey], catssubkey):
            if cityname != 'berlin':
                odm_cats = metautils.govDataLongToODM(group)
            else:
                odm_cats = berlin_to_odm(group)
            row[u'categories'] = odm_cats

    # Bonn is just different enough to do it separately. TODO: Consider combining into above.
    elif cityname == 'bonn':
        row[u'Beschreibung'] = package.get('description', '')
        for timeattempt in ['temporal', 'modified']:
            if timeattempt in package and package[timeattempt] not in [None, '']:
                row[u'Zeitlicher Bezug'] = package[timeattempt]
                break
        row[u'Zeitlicher Bezug'] = row.get(u'Zeitlicher Bezug', '')

        row[u'Lizenz'] = package.get('license', False)
        if not row[u'Lizenz']:
            row[u'Lizenz'] = package['license_title']

        row[u'Veröffentlichende Stelle'] = package.get('publisher', '')

        cats = package.get('keyword', [])
        odm_cats = map(lambda x: metautils.govDataLongToODM(x, checkAll=True), cats)
        resources = package.get(u'distribution', [])
        for r in resources:
            files.append(r[u'accessURL'])
            formats.add(r[u'format'])

    row[u'Format'] = formats
    row[u'files'] = files

    row['metadata'] = package
    return row
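The resource loop prefixes relative file links with the portal URL and leaves absolute links untouched. A minimal standalone sketch of that check (the URLs are made up):

def absolutize(base, link):
    # Links without a scheme separator are treated as relative to the portal
    if '://' not in link:
        return base + link
    return link

print(absolutize('http://daten.beispielstadt.de', '/files/baumkataster.csv'))
# -> http://daten.beispielstadt.de/files/baumkataster.csv
print(absolutize('http://daten.beispielstadt.de', 'http://cdn.example.org/d.csv'))
# -> http://cdn.example.org/d.csv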
Example #12
def get_categorie_content(category_link):
    # Get the page
    allrecords = []

    parser = etree.HTMLParser(encoding='utf-8')
    data = etree.parse(rooturl + category_link, parser)
    # Get the category
    category = data.xpath('/html/body/div/div[5]/div/div[1]//h1/text()')[0].strip()
    # category = urllib.unquote(category).decode('utf8')
    if (verbose): print 'Category: ' + ascii_only(category)

    datasets = get_datasets(data)
    numdatasets = len(datasets)

    if (verbose): print 'There are ' + str(numdatasets) + ' datasets'

    # Now get the html for each one. This is painful.
    # The bit of html concerning the datasets:
    corehtml = data.xpath('//div[@id=\'ContentBlock\']')[0]
    # First try to split by the horizontal rules. This usually works, but not always
    datasetparts = etree.tostring(corehtml).split('<hr id="hr')
    if (verbose): print 'Found ' + str(len(datasetparts)) + ' datasets by splitting by hr elements with ids'
    if len(datasetparts) != numdatasets:
        if (verbose): print 'This doesn\'t match. Trying with links to TOC'
        # If there is a TOC, this works. There isn't always one.
        datasetparts = etree.tostring(corehtml).split('nach oben')
        del datasetparts[len(datasetparts) - 1]
        for index in range(0, len(datasetparts)):
            datasetparts[index] = datasetparts[index] + '</a>'
        if (verbose): print 'Found ' + str(len(datasetparts)) + ' datasets by splitting by links to TOC'
        if len(datasetparts) != numdatasets:
            if (verbose): print 'Well, that didn\'t work either. Giving up'
            print 'Exiting because of a serious error - turn on verbose in the code to find out which dataset is causing the problem'
            exit()
    else:
        if numdatasets > 1:
            for index in range(1, len(datasetparts)):
                # That split makes for bad HTML. Make it better.
                datasetparts[index] = '<hr id="hr' + datasetparts[index]

    count = 1

    for datasetpart in datasetparts:
        data = etree.HTML(datasetpart)
        record = {}
        record['city'] = 'bochum'
        record['categories'] = []
        record['categories'].append(category)

        datasets = get_datasets(data)
        record['title'] = datasets[0]

        if (verbose): print 'Parsing dataset ' + ascii_only(record['title'])
        if 'noch im Aufbau' in record['title']:
            # Nothing to see here
            if (verbose): print 'Empty category'
            continue
        record['url'] = rooturl + category_link + '#par' + str(count)
        count += 1
        datatables, filetables = findfilesanddata(data)

        if len(datatables) == 0:
            if (verbose): print 'This record contains no data... checking for link to another page...'
            checkforsubpage = data.xpath('//span//a')

            for link in checkforsubpage:
                if (verbose): print etree.tostring(link)
                if len(link.xpath('text()')) > 0 and u'zu den Daten' in link.xpath('text()')[0]:
                    testurl = link.xpath('@href')[0]
                    if (verbose): print 'Following/updating URL: ' + rooturl + testurl
                    record['url'] = rooturl + testurl
                    datatables, filetables = findfilesanddata(html.parse(rooturl + testurl))

        # get the data on the files, and get each link in it
        record['filelist'] = []
        for table in filetables:
            record['filelist'].extend([(rooturl + x) for x in etree.HTML(table).xpath('//a/@href')])

        record['formats'] = set()
        record['spatial'] = False
        for file in record['filelist']:
            formatarray = file.split('/')[-1].split('.')
            format = 'Unknown'
            if len(formatarray)>1:
                format = formatarray[1].upper().split('?')[0]
            elif 'WMS' in formatarray[0]:
                format = 'WMS'
            elif 'WFS' in formatarray[0]:
                format = 'WFS'
            record['formats'].add(format)
            if (format.upper() in metautils.geoformats):
                record['spatial'] = True
        record['formats'] = list(record['formats'])

        if len(datatables) > 1:
            if (verbose): print 'ERROR: More than one data table'
            print 'Exiting because of a serious error - turn on verbose in the code to find out which dataset is causing the problem'
            exit()
        elif len(datatables) == 0:
            if (verbose): print 'ERROR: No data table'
            print 'Exiting because of a serious error - turn on verbose in the code to find out which dataset is causing the problem'
            exit()

        # parse the data table by row
        if (verbose): print 'Reading datatable...'
        rowelements = etree.HTML(datatables[0]).xpath('//tr')
        for row in rowelements:
            if len(row.xpath('td[1]/text()')) == 0: continue
            key = row.xpath('td[1]/text()')[0]
            if (verbose): print ascii_only(key)
            if len(row.xpath('td[2]/text()')) != 0:
                val = row.xpath('td[2]/text()')[0]
            elif len(row.xpath('td[2]//a')) != 0:
                val = row.xpath('td[2]//a/text()')[0]
            else:
                if (verbose): print 'ERROR: Missing value'
                print 'Exiting because of a serious error - turn on verbose in the code to find out which dataset is causing the problem'
                exit()
            if (verbose): print ascii_only('Parsing key ' + key.replace(':', '') + ' with value ' + val)
            if u'veröffentlicht' in key:
                record['publisher'] = val
            elif u'geändert' in key:
                record['temporalextent'] = val.split(' ')[2]
            elif u'Lizenz' in key:
                record['licenseshort'] = metautils.long_license_to_short(val)
                record['open'] = metautils.isopen(record['licenseshort'])
            elif u'Webseite' in key:
                record['website'] = row.xpath('td[2]//a/@href')[0]  # keep, as 'original' metadata
                if 'http://' not in record['website']:
                    record['website'] = rooturl + record['website']
            elif u'Kontakt' in key:
                record['contact'] = rooturl + row.xpath('td[2]//a/@href')[0]

        allrecords.append(record)
    return allrecords
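The format sniffing in this scraper derives a label from the last path segment of each file URL, with special cases for WMS/WFS service endpoints. A condensed standalone sketch; GEOFORMATS is a stand-in for metautils.geoformats, whose real contents are not shown in this excerpt:

GEOFORMATS = ['WMS', 'WFS', 'SHP', 'GEOJSON']  # illustrative stand-in

def sniff_format(fileurl):
    # e.g. 'CSV' from '.../daten.csv?v=2', 'WMS' from a WMS endpoint
    name = fileurl.split('/')[-1]
    parts = name.split('.')
    if len(parts) > 1:
        return parts[1].upper().split('?')[0]
    if 'WMS' in parts[0]:
        return 'WMS'
    if 'WFS' in parts[0]:
        return 'WFS'
    return 'Unknown'

print(sniff_format('http://example.org/geo/daten.shp') in GEOFORMATS)  # True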
Example #13
    def test_is_open_uppercase(self):
        assert metautils.isopen('CC BY 3.0 DE') == 'Offen'
Example #14
    def test_isopen(self):
        assert metautils.isopen('dl-de-by-2.0') == 'Offen'
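Taken together, the two tests pin down the observable contract of metautils.isopen: it returns 'Offen' for known open licenses regardless of case, and the importers above compare its result against 'Unbekannt' for licenses it cannot classify. A plausible sketch consistent with that contract; the real implementation and its full license list live in metautils and are not shown here:

OPEN_LICENSES = ['cc by 3.0 de', 'dl-de-by-2.0', 'dl-de-zero-2.0']  # illustrative subset

def isopen(licenseshort):
    # Hypothetical re-implementation for illustration only
    if licenseshort.strip().lower() in OPEN_LICENSES:
        return 'Offen'
    return 'Unbekannt'

assert isopen('CC BY 3.0 DE') == 'Offen'
assert isopen('dl-de-by-2.0') == 'Offen'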
Example #15
def importCity(cityname, url, package):
    if cityname == 'hamburg':
        # Only take 'open data'
        if package['type'] != 'dataset' or 'forward-reference' in package[
                'title']:
            return {}

    # There is a version of CKAN that can output private datasets, but DKAN uses this field for different purposes
    if package['private'] and cityname not in dkanCities:
        return {}

    resources = []
    formats = set()
    files = []
    # Key for the file link in the resource
    urlkeys = ['url']
    formatkey = 'format'

    if ('resources' in package):
        resources = package['resources']

    for file in resources:
        for urlkey in urlkeys:
            if (file[urlkey] not in [None, '']):
                if '://' not in file[urlkey]:
                    files.append(url + file[urlkey])
                else:
                    files.append(file[urlkey])
                break
        if formatkey in file and file[formatkey] not in [None, '']:
            format = file[formatkey]
            formats.add(format.upper())

    row = {}

    row[u'Stadt'] = cityname
    row[u'Dateibezeichnung'] = package['title']
    if 'name' in package:
        row[u'URL PARENT'] = url + '/dataset/' + package['name']
    elif 'url' in package:
        row[u'URL PARENT'] = package['url']
    else:
        row[u'URL PARENT'] = ''
    if cityname in v3cities:
        licensekey = 'license_id'
        vstellekey = 'author'
        catskey = 'groups'
        catssubkey = 'title'
        if cityname == 'berlin':
            catssubkey = 'name'
    elif cityname == 'muenchen':
        licensekey = 'license_id'
        vstellekey = 'maintainer'
        catskey = 'groups'
        catssubkey = 'title'
    elif cityname in dkanCities:
        licensekey = 'license_title'
        vstellekey = 'maintainer'
        catskey = 'tags'
        catssubkey = 'name'
    # Generate URL for the catalog page
    if 'notes' in package and package['notes'] != None:
        row[u'Beschreibung'] = package['notes']
        if cityname == 'koeln':
            soup = BeautifulSoup(row[u'Beschreibung'])
            row[u'Beschreibung'] = soup.getText('\n')
    else:
        row[u'Beschreibung'] = ''
    row[u'Zeitlicher Bezug'] = ''
    if licensekey in package and package[licensekey] != None:
        row[u'Lizenz'] = package[licensekey]
        # if not already short, try to convert
        if metautils.isopen(row[u'Lizenz']) == 'Unbekannt':
            row[u'Lizenz'] = metautils.long_license_to_short(row[u'Lizenz'])
    else:
        row[u'Lizenz'] = 'nicht bekannt'
    if vstellekey in package and package[vstellekey] != None:
        row[u'Veröffentlichende Stelle'] = package[vstellekey]
    else:
        row[u'Veröffentlichende Stelle'] = ''
        if 'extras' in package:
            print 'WARNING: No author/maintainer/publisher, checking extras'
            for extra in package['extras']:
                if extra['key'] == 'contacts':
                    print 'WARNING: No author, but amazingly there is possibly data in the contacts: ' + extra[
                        'value']
    cat_groups = metautils.setofvaluesasarray(package[catskey], catssubkey)
    if cityname != 'berlin':
        odm_cats = metautils.matchCategories(cat_groups)
    else:
        for group in cat_groups:
            odm_cats = berlin_to_odm(group)
    row[u'categories'] = odm_cats

    row[u'Format'] = formats
    row[u'files'] = files

    row['metadata'] = package

    row[u'original_metadata'] = {
        u'metadata_created': package['metadata_created'],
        u'metadata_modified': package['metadata_modified']
    }

    return row
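A minimal sketch of feeding importCity a CKAN-style package, assuming 'hamburg' appears in v3cities and that the surrounding module provides metautils and the city lists; every field value below is made up, limited to the keys the function actually reads:

package = {
    'type': 'dataset',
    'private': False,
    'title': 'Baumkataster',
    'name': 'baumkataster',
    'notes': 'Standorte der Stadtbaeume',
    'license_id': 'dl-de-by-2.0',
    'author': 'Umweltamt',
    'groups': [{'title': 'Umwelt'}],
    'resources': [{'url': '/files/baumkataster.csv', 'format': 'csv'}],
    'metadata_created': '2015-01-01T00:00:00',
    'metadata_modified': '2015-06-01T00:00:00',
}
row = importCity('hamburg', 'http://daten.beispielstadt.de', package)
# row[u'files'] -> ['http://daten.beispielstadt.de/files/baumkataster.csv']
# row[u'Format'] -> set(['CSV'])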