def arnsberg():
    """Import the Arnsberg portal: gather raw entries, convert each one,
    and replace this portal's data in the DB (pre-checked and accepted)."""
    raw_entries = gather()
    db_rows = [import_data(entry) for entry in raw_entries]
    metautils.setsettings(settings)
    metautils.addSimpleDataToDB(db_rows, portalname, checked=True,
                                accepted=True, remove_data=True)
def bayern():
    """Import the Bavarian portal: resolve catalog entry URLs, fetch and
    convert every entry, store the rows in the DB, and return them."""
    entry_urls = catalog_entry_urls()
    fetched = [fetch(url) for url in entry_urls]
    rows = [import_data(page) for page in fetched]
    metautils.setsettings(settings)
    metautils.addSimpleDataToDB(rows, portalname, checked=True,
                                accepted=True, remove_data=True)
    return rows
def badenWuerttenberg(): print "Get Catalog Entries" catalogPages = getCatalogPages() catalogItemDicts = map(scrapeCatalogPageList, catalogPages) catalogItemDicts = list(itertools.chain(*catalogItemDicts)) print "Scrape Catalog Entries" catalogDicts = map(scrapeCatalogEntryPage, catalogItemDicts) dataForDB = map(toDB, catalogDicts) print "Write to db" metautils.setsettings(settings) metautils.addSimpleDataToDB(dataForDB, portalname, checked=True, accepted=True, remove_data=True)
def main(): metautils.setsettings(settings) cur = metautils.getDBCursor(settings) #Get cities to search cur.execute('SELECT city_shortname, url FROM cities WHERE binged = %s', (True, )) bres = cur.fetchall() print 'domain:incommon:new:binglost' for row in bres: citydata = find_data(row[0], row[1]) citydict = {} for result in citydata: citydict[result['URL_Datei']] = result bingset = set(citydict.keys()) allset = set(citydict.keys()) cur = metautils.getDBCursor(settings) cur.execute('SELECT url FROM data WHERE source=%s AND city=%s', ('b', row[0])) dbset = set() for dbres in cur.fetchall(): dbset.add(dbres[0]) allset.add(dbres[0]) #Analysis intersection = dbset.intersection(bingset) dbnot = allset.difference(dbset) bingnot = allset.difference(bingset) records = [] for urlkey in dbnot: therow = citydict[urlkey] #In this case, we can safely assign it directly therow['URL'] = therow['URL_Datei'] #Likewise, there cannot be any filenames therow['filenames'] = [] metautils.convert_crawl_row(therow, 'b') records.append(therow) print row[1] + ':' + str(len(intersection)) + ':' + str( len(dbnot)) + ':' + str(len(bingnot)) #Write to DB metautils.addCrawlDataToDB( records) #Checked and accepted are both false by default
def main(): metautils.setsettings(settings) cur = metautils.getDBCursor(settings) #Get cities to search cur.execute('SELECT city_shortname, url FROM cities WHERE binged = %s', (True,)) bres = cur.fetchall() print 'domain:incommon:new:binglost' for row in bres: citydata = find_data(row[0], row[1]) citydict = {} for result in citydata: citydict[result['URL_Datei']] = result bingset = set(citydict.keys()) allset = set(citydict.keys()) cur = metautils.getDBCursor(settings) cur.execute('SELECT url FROM data WHERE source=%s AND city=%s', ('b', row[0])) dbset = set() for dbres in cur.fetchall(): dbset.add(dbres[0]) allset.add(dbres[0]) #Analysis intersection = dbset.intersection(bingset) dbnot = allset.difference(dbset) bingnot = allset.difference(bingset) records = [] for urlkey in dbnot: therow = citydict[urlkey] #In this case, we can safely assign it directly therow['URL'] = therow['URL_Datei'] #Likewise, there cannot be any filenames therow['filenames'] = [] metautils.convert_crawl_row(therow, 'b') records.append(therow) print row[1] + ':' + str(len(intersection)) + ':' + str(len(dbnot)) + ':' + str(len(bingnot)) #Write to DB metautils.addCrawlDataToDB(records) #Checked and accepted are both false by default
def braunschweigGeoportal(): print 'Get catalog records' catalog = getRecords() xmlString = etree.tostring(catalog, pretty_print=True) with open(braunschweigMetaDataFile, 'w') as f: f.write(xmlString.encode('utf8')) print 'Scrape catalog record entries' recsList = extractRecords(catalog) recDicts = map(extractData, recsList) recDicts = map(scrapeData, recDicts) dataForDB = map(recordToDB, recDicts) print 'Write to db' metautils.setsettings(settings) metautils.addSimpleDataToDB(dataForDB, portalname, checked=True, accepted=True, remove_data=True)
row[u'files'] = files if 'temporal_coverage_from' in package['extras'] and len(package['extras']['temporal_coverage_from'])>3: row[u'Zeitlicher Bezug'] = package['extras']['temporal_coverage_from'][0:4] if ('terms_of_use' in package['extras'] and len(package['extras']['terms_of_use']) > 0): row[u'Lizenz'] = package['extras']['terms_of_use']['licence_id'] groups = u'' if ('groups' in package and len(package['groups']) > 0): for group in package['groups']: if city == 'moers': odm_cats = metautils.govDataLongToODM(group) elif city == 'bremen': odm_cats = metautils.govDataShortToODM(group) if len(odm_cats) > 0: for cat in odm_cats: row[cat] = 'x' row[u'Noch nicht kategorisiert'] = '' datafordb.append(row) #Write data to the DB metautils.setsettings(settings) #Remove this catalog's data metautils.removeDataFromPortal(portalname) #Add data metautils.addDataToDB(datafordb=datafordb, originating_portal=portalname, checked=True, accepted=True, remove_data=True)
print 'Of the total ' + str(beforefilter) + ' records, ' + str(afterfilter) + ' appear to be related to a city' print 'The rest (' + str(len(notcitydata)) + ') will be assigned to Rheinland-Pfalz as a Land' else: data = data['results'] #Map and write the data. Still wondering how much of this can/should be pulled out to metautils row = metautils.getBlankRow() datafordb = [] [returnData, uniquecities] = mapData(data) datafordb.extend(returnData) if cityimport == 'rlp': [returnData, ignoreuniquecities] = mapData(notcitydata, nocity=True) datafordb.extend(returnData) #Write data to the DB metautils.setsettings(settings) if cityimport == 'rlp': #Update city list metautils.addCities(uniquecities, 'Rheinland-Pfalz') #Remove this catalog's data metautils.removeDataFromPortal('daten.rlp.de') #Add data, checking that used cities are in RLP metautils.addDataToDB(datafordb=datafordb, bundesland='Rheinland-Pfalz', originating_portal='daten.rlp.de', checked=True, accepted=True) elif cityimport == 'rostock': #Remove this catalog's data metautils.removeDataFromPortal('opendata-hro.de') #Add data, checking that used cities are in RLP metautils.addDataToDB(datafordb=datafordb, originating_portal='opendata-hro.de', checked=True, accepted=True, remove_data=True) else: print datafordb