def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
        'resource_id': resource['resource_id'],
        'resource_hash': resource['extract_hash'],
        'sheet_id': sheet_id,
    }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection, spending_table,
                  resource_id=resource['resource_id'],
                  sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                 int((time.time() - begin) * 1000))
        return rows > 0
    finally:
        connection.close()
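# A minimal usage sketch for combine_sheet. The resource-dict values, the
# column mapping and the 'raw_abc123' staging-table name are illustrative
# assumptions, not part of this codebase; db_connect() is the engine
# factory used by connect() below.
def _example_combine_sheet():
    engine = db_connect()
    resource = {'resource_id': 'abc123',        # hypothetical resource id
                'extract_hash': 'deadbeef'}     # hypothetical extract hash
    # mapping translates raw sheet headers to canonical spending columns;
    # unmapped columns are dropped, and the raw 'id' becomes 'row_id'.
    mapping = {'Expense Area': 'expense_area',
               'Amount': 'amount',
               'Date': 'date'}
    raw_table = sl.get_table(engine, 'raw_abc123')  # assumed staging table
    return combine_sheet(engine, resource, sheet_id=0,
                         table=raw_table, mapping=mapping)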
def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            return False
        sl.delete(connection, spending_table,
                  resource_id=row['resource_id'],
                  sheet_id=sheet_id)
        for data_row in data:
            data_row = cleanup_dates.apply(data_row, date_formats)
            data_row = cleanup_numbers.apply(data_row)
            data_row = cleanup_gov.apply(data_row)
            #data_row = cleanup_supplier.apply(data_row, engine)
            del data_row['id']
            sl.add_row(connection, spending_table, data_row)
        trans.commit()
        return True
    finally:
        connection.close()
def clear_issues(engine, resource_id, stage):
    import sqlaload as sl  # this import is slow, so it is done inside this func
    table = sl.get_table(engine, 'issue')
    sl.delete(engine, table,
              resource_id=resource_id,
              stage=stage)
def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata
    to the database.'''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)
    tags = ['+tags:"%s"' % t for t in TAGS]
    q = " OR ".join(tags)
    publisher_dict_filter = {}
    if publisher_name:
        publisher_solr_filter = 'publisher:"%s"' % publisher_name
        q = '(%s) AND (%s)' % (q, publisher_solr_filter)
        publisher_dict_filter = {'publisher_name': publisher_name}
    log.info('SOLR Search q: %r', q)
    existing_packages = set(
        [res['package_name'] for res in
         sl.distinct(engine, table, 'package_name', **publisher_dict_filter)])
    log.info('Existing datasets: %i', len(existing_packages))
    processed_packages = set()
    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q, search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])
    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table,
                                      stats_resources)
        if num_resources == 0:
            stats.add('Dataset has no resources', package_name)
        else:
            stats.add('Dataset has resources', package_name)
    # Remove rows about deleted packages
    obsolete_packages = existing_packages - processed_packages
    log.info('Obsolete datasets: %s from %s', len(obsolete_packages),
             len(existing_packages))
    for package_name in obsolete_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)
    # Remove stray rows without a package_name
    stray_rows = list(sl.find(engine, table, package_name=None))
    if stray_rows:
        log.info('Stray rows without package_name: %i', len(stray_rows))
        sl.delete(engine, table, package_name=None)
        sl.delete(engine, 'issue', package_name=None)
        for row in stray_rows:
            stats.add('Stray row removed', row['resource_id'])
    print 'Datasets build_index summary:'
    print stats.report()
    print 'Resources build_index summary:'
    print stats_resources.report()
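# Sketch of the Solr query string build_index constructs, with illustrative
# TAGS values; the real TAGS list is defined elsewhere in this module.
def _example_search_query(publisher_name=None):
    tags = ['+tags:"%s"' % t for t in ('spend-transactions', '25k-spending')]
    q = " OR ".join(tags)
    if publisher_name:
        q = '(%s) AND (publisher:"%s")' % (q, publisher_name)
    return q

# _example_search_query('cabinet-office') evaluates to:
#   (+tags:"spend-transactions" OR +tags:"25k-spending") AND (publisher:"cabinet-office")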
def cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
                        resource_id=row['resource_id'],
                        sheet_id=sheet_id))
    if not data:
        log.info('Sheet has no rows')
        return False, None
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        # A basestring value means detection failed for that column and
        # carries the error message.
        for date_format in date_formats.values():
            if isinstance(date_format, basestring):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "Couldn't detect date formats because: %s" % date_format,
                      repr(date_formats))
                return True, date_format
        if not data_row_filter:
            sl.delete(connection, spending_table,
                      resource_id=row['resource_id'],
                      sheet_id=sheet_id)
        for data_row in data:
            if data_row_filter and data_row_filter != data_row['row_id']:
                continue
            data_row = cleanup_dates.apply(data_row, date_formats,
                                           stats_spending)
            data_row = cleanup_numbers.apply(data_row, stats_spending)
            data_row = cleanup_gov.apply(data_row, stats_spending)
            #data_row = cleanup_supplier.apply(data_row, engine)
            del data_row['id']
            sl.add_row(connection, spending_table, data_row)
        trans.commit()
        return True, None
    finally:
        connection.close()
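# Hedged sketch of the detect_formats contract as cleanup_sheet reads it:
# a basestring value means detection failed for that column and carries the
# error message; non-string values are usable format descriptors (their
# exact shape is not visible in this module, so it is left opaque here).
def _example_first_date_error(date_formats):
    # Returns the first error message, or None if every column was detected.
    for fmt in date_formats.values():
        if isinstance(fmt, basestring):
            return fmt
    return None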
def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')
    key = int(url.rsplit('/', 1)[-1].split('.')[0])
    a = sl.find_one(engine, Ablauf, source_url=url)
    if a is not None and a['abgeschlossen'] and not force:
        raise Unmodified()
    response = fetch(url)
    a = check_tags(a or {}, response, force)
    a.update({'key': key, 'source_url': url})
    doc = inline_xml_from_page(response.content, url)
    if doc is None:
        raise NoContentException()
    a['wahlperiode'] = int(doc.findtext("WAHLPERIODE"))
    a['typ'] = doc.findtext("VORGANGSTYP")
    a['titel'] = doc.findtext("TITEL")
    if not a['titel'] or not len(a['titel'].strip()):
        raise NoContentException()
    if '\n' in a['titel']:
        t, k = a['titel'].rsplit('\n', 1)
        k = k.strip()
        if k.startswith('KOM') or k.startswith('SEK'):
            a['titel'] = t
    a['initiative'] = doc.findtext("INITIATIVE")
    a['stand'] = doc.findtext("AKTUELLER_STAND")
    a['signatur'] = doc.findtext("SIGNATUR")
    a['gesta_id'] = doc.findtext("GESTA_ORDNUNGSNUMMER")
    a['eu_dok_nr'] = doc.findtext("EU_DOK_NR")
    a['abstrakt'] = doc.findtext("ABSTRAKT")
    a['sachgebiet'] = doc.findtext("SACHGEBIET")
    a['zustimmungsbeduerftig'] = doc.findtext("ZUSTIMMUNGSBEDUERFTIGKEIT")
    #a.schlagworte = []
    Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in doc.findall("SCHLAGWORT"):
        wort = {'wort': sw.text, 'source_url': url}
        sl.upsert(engine, Schlagwort, wort, unique=wort.keys())
    log.info("Ablauf %s: %s", url, a['titel'].encode('ascii', 'replace'))
    a['titel'] = a['titel'].strip().lstrip('.').strip()
    a = expand_dok_nr(a)
    a['abgeschlossen'] = DIP_ABLAUF_STATES_FINISHED.get(a['stand'], False)
    if a['wahlperiode'] != max(app.config.get('WAHLPERIODEN')):
        # An Ablauf from a past Wahlperiode can no longer change.
        a['abgeschlossen'] = True
    if a['abstrakt'] and 'Originaltext der Frage(n):' in a['abstrakt']:
        _, a['abstrakt'] = a['abstrakt'].split('Originaltext der Frage(n):', 1)
    sl.delete(engine, sl.get_table(engine, 'position'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beitrag'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'zuweisung'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beschluss'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'referenz'), source_url=url)
    for elem in doc.findall(".//VORGANGSPOSITION"):
        scrape_activity(engine, url, elem)
    Referenz = sl.get_table(engine, 'referenz')
    for elem in doc.findall("WICHTIGE_DRUCKSACHE"):
        link = elem.findtext("DRS_LINK")
        hash = None
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("DRS_HERAUSGEBER"), 'drs',
                                  elem.findtext("DRS_NUMMER"), link=link)
        dokument['text'] = elem.findtext("DRS_TYP")
        dokument['seiten'] = hash
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument,
                  unique=['link', 'source_url', 'seiten'])
    for elem in doc.findall("PLENUM"):
        link = elem.findtext("PLPR_LINK")
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("PLPR_HERAUSGEBER"), 'plpr',
                                  elem.findtext("PLPR_NUMMER"), link=link)
        dokument['text'] = elem.findtext("PLPR_KLARTEXT")
        dokument['seiten'] = elem.findtext("PLPR_SEITEN")
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument,
                  unique=['link', 'source_url', 'seiten'])
    sl.upsert(engine, Ablauf, a, unique=['source_url'])
    return a
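# Hedged usage sketch for scrape_ablauf. The database DSN and DIP source
# URL are placeholders, and sl.connect() as the engine factory is an
# assumption: this snippet set never shows how the DIP scraper builds
# its engine.
def _example_scrape_ablauf():
    engine = sl.connect('sqlite:///dip.db')  # placeholder DSN
    url = 'http://dipbt.bundestag.de/.../12345.html'  # placeholder URL
    try:
        return scrape_ablauf(engine, url)
    except Unmodified:
        # Ablauf already scraped and marked abgeschlossen; nothing to do.
        return None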
        processed_resource_ids.add(res['id'])
        if row and row['url'] != data['url']:
            # URL has changed, so force retrieval next time
            data['retrieve_status'] = False
            stats_resources.add_source('URL changed', data)
        elif row:
            stats_resources.add_source('URL unchanged', data)
        else:
            stats_resources.add_source('New resource', data)
        sl.upsert(engine, table, data, ['resource_id'])
    # Remove references to any deleted resources for this dataset
    obsolete_rows = [row for row in existing_rows
                     if row['resource_id'] not in processed_resource_ids]
    for row in obsolete_rows:
        sl.delete(engine, table, resource_id=row['resource_id'])
        sl.delete(engine, 'issue', resource_id=row['resource_id'])
        stats_resources.add_source('Deleted obsolete row', row)
    return len(resources)


def connect():
    engine = db_connect()
    src_table = sl.get_table(engine, 'source')
    return engine, src_table
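# Minimal sketch of the connect() pattern above: callers unpack the engine
# together with the handle for the 'source' table. The package_name lookup
# is illustrative only.
def _example_connect_usage():
    engine, table = connect()
    # Stray rows without a package_name are exactly what build_index prunes.
    return list(sl.find(engine, table, package_name=None))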