Example #1
def arnsberg():
    g = gather()  #Collect the raw catalog entries
    dataForDB = map(import_data, g)  #Convert each entry into a DB row
    metautils.setsettings(settings)
    metautils.addSimpleDataToDB(dataForDB,
                                portalname,
                                checked=True,
                                accepted=True,
                                remove_data=True)
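Each snippet relies on module-level names defined elsewhere in its source file: settings (the database settings handed to metautils.setsettings) and portalname (the portal identifier), plus portal-specific helpers such as gather and import_data. A hypothetical stub of that context, with purely illustrative values:

#Illustrative stand-ins only; the real definitions live in the snippet's module
settings = {'dbname': 'opendata'}  #assumed shape of the DB settings
portalname = 'arnsberg.de'         #assumed portal identifier

def gather():
    return []  #placeholder: fetch the raw catalog entries

def import_data(entry):
    return entry  #placeholder: convert one entry into a metautils row dict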
Example #3
def bayern():
    ds = catalog_entry_urls()
    ds = map(fetch, ds)
    ds = map(import_data, ds)

    metautils.setsettings(settings)
    metautils.addSimpleDataToDB(ds,
                                portalname,
                                checked=True,
                                accepted=True,
                                remove_data=True)
    return ds
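These snippets are Python 2, where map returns a list and fetch/import_data run immediately. Under Python 3, map is lazy, so the work would only happen once addSimpleDataToDB iterates the result; a direct port would force evaluation, e.g.:

#Python 3 port of the two mapping steps: wrap map in list() to run eagerly
ds = list(map(fetch, ds))
ds = list(map(import_data, ds))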
Example #5
def badenWuerttenberg():
    print "Get Catalog Entries"
    catalogPages = getCatalogPages()
    catalogItemDicts = map(scrapeCatalogPageList, catalogPages)
    catalogItemDicts = list(itertools.chain(*catalogItemDicts))

    print "Scrape Catalog Entries"
    catalogDicts = map(scrapeCatalogEntryPage, catalogItemDicts)
    dataForDB = map(toDB, catalogDicts)

    print "Write to db"
    metautils.setsettings(settings)
    metautils.addSimpleDataToDB(dataForDB,
                                portalname,
                                checked=True,
                                accepted=True,
                                remove_data=True)
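scrapeCatalogPageList yields one list of entries per catalog page, so the map produces a list of lists; itertools.chain(*lists) then flattens it into a single list. A minimal illustration:

import itertools

pages = [['entry1', 'entry2'], ['entry3']]  #one entry list per catalog page
flat = list(itertools.chain(*pages))        #['entry1', 'entry2', 'entry3']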
Example #7
def main():
    metautils.setsettings(settings)

    cur = metautils.getDBCursor(settings)

    #Get cities to search
    cur.execute('SELECT city_shortname, url FROM cities WHERE binged = %s',
                (True, ))
    bres = cur.fetchall()

    print 'domain:incommon:new:binglost'

    for row in bres:
        citydata = find_data(row[0], row[1])
        citydict = {}
        for result in citydata:
            citydict[result['URL_Datei']] = result
        bingset = set(citydict.keys())
        allset = set(citydict.keys())
        cur = metautils.getDBCursor(settings)
        cur.execute('SELECT url FROM data WHERE source=%s AND city=%s',
                    ('b', row[0]))
        dbset = set()
        for dbres in cur.fetchall():
            dbset.add(dbres[0])
            allset.add(dbres[0])
        #Analysis
        intersection = dbset.intersection(bingset)
        dbnot = allset.difference(dbset)
        bingnot = allset.difference(bingset)

        records = []
        for urlkey in dbnot:
            therow = citydict[urlkey]
            #In this case, we can safely assign it directly
            therow['URL'] = therow['URL_Datei']
            #Likewise, there cannot be any filenames
            therow['filenames'] = []
            metautils.convert_crawl_row(therow, 'b')
            records.append(therow)

        print row[1] + ':' + str(len(intersection)) + ':' + str(
            len(dbnot)) + ':' + str(len(bingnot))
        #Write to DB
        metautils.addCrawlDataToDB(
            records)  #Checked and accepted are both false by default
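The three counts after the domain come from plain set arithmetic: allset is the union of the Bing results and the stored URLs, so allset.difference(dbset) is what Bing found but the DB lacks ("new"), and allset.difference(bingset) is what the DB has but Bing no longer finds ("binglost"). A small worked example:

bingset = set(['a.csv', 'b.csv'])  #URLs found by Bing this run
dbset = set(['b.csv', 'c.csv'])    #URLs already stored in the DB
allset = bingset | dbset
print(dbset.intersection(bingset))  #incommon: b.csv
print(allset.difference(dbset))     #new: a.csv
print(allset.difference(bingset))   #binglost: c.csv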
Example #9
def braunschweigGeoportal():
    print 'Get catalog records'
    catalog = getRecords()

    xmlString = etree.tostring(catalog, pretty_print=True)
    with open(braunschweigMetaDataFile, 'w') as f:
        f.write(xmlString.encode('utf8'))

    print 'Scrape catalog record entries'
    recsList = extractRecords(catalog)
    recDicts = map(extractData, recsList)
    recDicts = map(scrapeData, recDicts)
    dataForDB = map(recordToDB, recDicts)

    print 'Write to db'
    metautils.setsettings(settings)
    metautils.addSimpleDataToDB(dataForDB,
                                portalname,
                                checked=True,
                                accepted=True,
                                remove_data=True)
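The pretty_print keyword implies that etree here is lxml.etree; the standard library's ElementTree tostring has no such argument. Assuming lxml, an equivalent that lets tostring produce the UTF-8 bytes directly:

from lxml import etree

xmlBytes = etree.tostring(catalog, pretty_print=True, encoding='utf-8')
with open(braunschweigMetaDataFile, 'wb') as f:
    f.write(xmlBytes)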
Example #11
        row[u'files'] = files
        
    if 'temporal_coverage_from' in package['extras'] and len(package['extras']['temporal_coverage_from'])>3:
        row[u'Zeitlicher Bezug'] = package['extras']['temporal_coverage_from'][0:4]
    
    if ('terms_of_use' in package['extras'] and len(package['extras']['terms_of_use']) > 0):
        row[u'Lizenz'] = package['extras']['terms_of_use']['licence_id']

    groups = u''
    if ('groups' in package and len(package['groups']) > 0):
        for group in package['groups']:
            #Map the portal's group name to ODM categories. Note that
            #odm_cats is only assigned for these two cities.
            if city == 'moers':
                odm_cats = metautils.govDataLongToODM(group)
            elif city == 'bremen':
                odm_cats = metautils.govDataShortToODM(group)
            if len(odm_cats) > 0:
                for cat in odm_cats:
                    row[cat] = 'x'
                row[u'Noch nicht kategorisiert'] = ''

    datafordb.append(row)
#Write data to the DB
metautils.setsettings(settings)
#Remove this catalog's data
metautils.removeDataFromPortal(portalname)
#Add data
metautils.addDataToDB(datafordb=datafordb, originating_portal=portalname, checked=True, accepted=True, remove_data=True)
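The group loop marks one spreadsheet-style column per ODM category and clears the fallback column. A hypothetical illustration of the resulting row updates, assuming the mapping returned two categories:

row = {}
odm_cats = [u'Transport und Verkehr', u'Umwelt und Klima']  #assumed mapping result
for cat in odm_cats:
    row[cat] = 'x'
row[u'Noch nicht kategorisiert'] = ''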
Example #12
    print 'Of the total ' + str(beforefilter) + ' records, ' + str(afterfilter) + ' appear to be related to a city'
    print 'The rest (' + str(len(notcitydata)) + ') will be assigned to Rheinland-Pfalz as a Land'
else:
    data = data['results']

#Map and write the data. Still wondering how much of this can/should be pulled out to metautils
row = metautils.getBlankRow()
datafordb = []

[returnData, uniquecities] = mapData(data)
datafordb.extend(returnData)
if cityimport == 'rlp':
    [returnData, ignoreuniquecities] = mapData(notcitydata, nocity=True)
    datafordb.extend(returnData)

#Write data to the DB
metautils.setsettings(settings)
if cityimport == 'rlp':
    #Update city list
    metautils.addCities(uniquecities, 'Rheinland-Pfalz')
    #Remove this catalog's data
    metautils.removeDataFromPortal('daten.rlp.de')
    #Add data, checking that used cities are in RLP
    metautils.addDataToDB(datafordb=datafordb, bundesland='Rheinland-Pfalz', originating_portal='daten.rlp.de', checked=True, accepted=True)
elif cityimport == 'rostock':
    #Remove this catalog's data
    metautils.removeDataFromPortal('opendata-hro.de')
    #Add data
    metautils.addDataToDB(datafordb=datafordb, originating_portal='opendata-hro.de', checked=True, accepted=True, remove_data=True)
else:
    print datafordb
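mapData evidently returns a two-element list of mapped rows and the unique city names encountered, and the bracketed assignment is ordinary sequence unpacking. A hypothetical sketch of that contract:

def mapData(data, nocity=False):
    returnData = []       #placeholder: one row dict per dataset
    uniquecities = set()  #placeholder: cities seen while mapping
    return [returnData, uniquecities]

#Equivalent unpacking without the brackets
returnData, uniquecities = mapData([])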