def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)
    if not engine.has_table('raw_%s' % table_suffix):
        return
    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table,
                                 resource_id=resource_id,
                                 table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()
    start = time.time()
    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table,
                                  normalised=normalised_headers)
        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k, n in normalise_map.iteritems():
            if n in column_mapping and column_mapping[n] is not None \
                    and len(column_mapping[n]) > 0:
                mapping[k] = column_mapping[n]

        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)
        sl.upsert(connection, condensed_table, {
            'resource_id': resource_id,
            'table_id': table_id,
            'condense_time': time.time() - start,
        }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()

def cleanup_resource(engine, source_table, row, force):
    if not row['combine_status']:
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 cleanup_status=True,
                                 cleanup_hash=row['combine_hash']) is not None:
        return

    log.info("Cleanup: %s, Resource %s", row['package_name'], row['resource_id'])

    status = True
    for sheet_id in range(0, row['sheets']):
        sheet_status = cleanup_sheet(engine, row, sheet_id)
        if status and not sheet_status:
            status = False
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'cleanup_hash': row['combine_hash'],
        'cleanup_status': status,
    }, unique=['resource_id'])

def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sample = sl.find_one(engine, table, source_url=url, matched=True)
    response, sio = fetch_stream(url)
    sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s", wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib,
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
            'matched': True,
            'source_url': url,
        }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data

def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sio = find_local(url)
    sample = {'source_etag': 'local'}
    if sio is None:
        sample = sl.find_one(engine, table, source_url=url, matched=True)
        response, sio = fetch_stream(url)
        sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s", wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib,
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
            'matched': True,
            'source_url': url,
        }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data

def extract_resource(engine, source_table, row, force, stats):
    if not row['retrieve_status']:
        stats.add_source('Previous step (retrieve) not complete', row)
        log.debug('Row has no retrieve status - skipping')
        return

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 extract_status=True,
                                 extract_hash=row['retrieve_hash']) is not None:
        stats.add_source('Already extracted', row)
        return

    log.info("Extract: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)
    status, sheets = extract_resource_core(engine, row, stats)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'extract_hash': row['retrieve_hash'],
        'extract_status': status,
        'sheets': sheets
    }, unique=['resource_id'])

def load_transcript(engine, wp, session, incremental=True):
    url = URL % (wp, session)
    Speech = sl.get_table(engine, 'speech')
    if incremental and sl.find_one(engine, Speech,
                                   source_url=url, matched=True):
        return True
    if '404 Seite nicht gefunden' in fetch(url):
        return False
    sio = fetch_stream(url)
    if sio is None:
        return False
    log.info("Loading transcript: %s/%s" % (wp, session))
    seq = 0
    parser = SpeechParser(engine, sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        contrib['matched'] = True
        sl.upsert(engine, Speech, contrib,
                  unique=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
    if parser.missing_recon:
        sl.upsert(engine, Speech, {
            'matched': False,
            'sitzung': session,
            'wahlperiode': wp
        }, unique=['sitzung', 'wahlperiode'])
    return True

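# Usage sketch (an assumption, not part of the original sources): walk the
# session numbers of one legislative period with itertools.count(), the same
# way load_sessions() below does, and stop once load_transcript() reports a
# missing session (it returns False for the 404 page or an empty stream).
def load_all_transcripts(engine, wp='17'):
    for session in count(1):
        if not load_transcript(engine, wp, str(session)):
            return
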
def combine_resource(engine, source_table, row, force, stats):
    if not row['extract_status']:
        stats.add_source('Previous step (extract) not complete', row)
        return

    # Skip over tables we have already combined
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 combine_hash=row['extract_hash'],
                                 combine_status=True) is not None:
        stats.add_source('Already combined', row)
        return

    log.info("Combine: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)
    status = combine_resource_core(engine, row, stats)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'combine_hash': row['extract_hash'],
        'combine_status': status,
    }, unique=['resource_id'])

def supplier_names_from_table(engine, resource_id, table_id):
    global tables_count, rows_count
    tables_count = tables_count + 1
    print "# checking table %d" % tables_count
    table_suffix = "%s_table%s" % (resource_id, table_id)
    table = sl.get_table(engine, "spending_%s" % table_suffix)
    supplier_table = sl.get_table(engine, "suppliers")
    for row in sl.all(engine, table):
        rows_count = rows_count + 1
        if not row.has_key("SupplierName"):
            # One of the junk tables that contain no real data, usually excel noise
            continue
        supplier = row["SupplierName"]
        if supplier is None or supplier == "":
            continue
        if suppliers_visited.has_key(supplier):
            continue
        suppliers_visited[supplier] = True
        if sl.find_one(engine, supplier_table, original=supplier) is not None:
            continue
        yield supplier

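# Usage sketch (an assumption, not part of the original sources): consume the
# generator above and record each previously unseen supplier string in the
# 'suppliers' table, keyed on its 'original' column. The real pipeline may run
# reconciliation here instead of a plain upsert.
def record_new_suppliers(engine, resource_id, table_id):
    supplier_table = sl.get_table(engine, "suppliers")
    for supplier in supplier_names_from_table(engine, resource_id, table_id):
        sl.upsert(engine, supplier_table, {"original": supplier},
                  unique=["original"])
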
def load_sessions(engine, wp='17'):
    table = sl.get_table(engine, 'webtv')
    for i in count(1):
        i = str(i)
        if sl.find_one(engine, table, wp=wp, session=i):
            continue
        if not load_agenda(engine, wp, i):
            return

def scrape_abstimmung(engine, url, force=False):
    abstimmung = sl.get_table(engine, 'abstimmung')
    sample = sl.find_one(engine, abstimmung, source_url=url)
    response = fetch(url)
    sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'source_etag': sample['source_etag']}
    handle_xml(engine, base_data, response.content)
    return base_data

def validate_resource(engine, source_table, row, force, data_row_filter,
                      stats, stats_spending):
    if not row['cleanup_status']:
        stats.add_source('Previous step (cleanup) not complete', row)
        return

    # Skip over tables we have already validated
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 validate_status=True,
                                 validate_hash=row['cleanup_hash']) is not None:
        stats.add_source('Already validated', row)
        return

    log.info("Validate: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)
    no_errors = True
    no_records = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_records, sheet_error_message = validate_sheet(
            engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_records and sheet_records:
            no_records = False
    if data_row_filter:
        stats.add_source(
            'Resource data filtered, not saving resource cleanup.', row)
    else:
        log.info("Result: records=%s errors=%s", not no_records, not no_errors)
        sl.upsert(engine, source_table, {
            'resource_id': row['resource_id'],
            'validate_hash': row['cleanup_hash'],
            'validate_status': no_errors,
        }, unique=['resource_id'])
        if no_errors:
            if no_records:
                stats.add_source('No records but no errors', row)
            else:
                stats.add_source('Validated ok', row)
        else:
            if no_records:
                stats.add_source(
                    'All transactions invalid: %s' % error_message, row)
            else:
                stats.add_source(
                    'Some transactions invalid: %s' % error_message, row)

def load_vote(url, engine, incremental=True):
    Vote = sl.get_table(engine, 'abstimmung')
    if incremental and sl.find_one(engine, Vote, source_url=url):
        log.info("%s is done, skipping.", url)
        return
    fh, path = tempfile.mkstemp('.pdf')
    fo = open(path, 'wb')
    fo.write(fetch(url))
    fo.close()
    xml = pdftoxml(path)
    handle_xml(xml, engine, url)

def match_beitrag(engine, beitrag, url):
    beitrag_print = make_long_name(beitrag)
    log.info("Matching: %s", beitrag_print.encode('ascii', 'replace'))
    try:
        value = resolve_person(beitrag_print)
        if sl.find_one(engine, sl.get_table(engine, 'person'),
                       fingerprint=value) is None:
            make_person(engine, beitrag, value, url)
        return value
    except BadReference:
        log.info("Beitrag person is unknown: %s",
                 beitrag_print.encode('ascii', 'replace'))

def match_beitrag(engine, beitrag):
    nkp = nk_persons()
    beitrag_print = make_long_name(beitrag)
    log.info("Matching: %s", beitrag_print)
    try:
        value = match_speaker(beitrag_print)
        if sl.find_one(engine, sl.get_table(engine, 'person'),
                       fingerprint=value) is None:
            make_person(beitrag, value, engine)
        return value
    except NKNoMatch, nm:
        log.info("Beitrag person is unknown: %s", beitrag_print)
        return None

def load_item(url, engine, gremium=None):
    table = sl.get_table(engine, 'news')
    item = sl.find_one(engine, table, source_url=url)
    if item is not None:
        if gremium is not None:
            item['gremium'] = gremium['key']
            sl.upsert(engine, table, item, unique=['source_url'])
        return
    try:
        doc = _xml(url)
    except Exception, e:
        log.exception(e)
        return

def lazyload_person(engine, indexer, fingerprint):
    if fingerprint is None or not len(fingerprint):
        return
    person = Person.query.filter_by(fingerprint=fingerprint).first()
    if person is not None:
        return person
    table = sl.get_table(engine, 'person')
    data = sl.find_one(engine, table, fingerprint=fingerprint)
    if data is None:
        log.error("Invalid fingerprint reference: %s", fingerprint)
        return
    person = load_person(engine, data)
    indexer.add(person)
    return person

def add_to_gremium(node, url, role, engine):
    key = node.get('id')
    table = sl.get_table(engine, 'gremium')
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {'key': key, 'type': 'sonstiges'}
        g['name'] = node.findtext('gremiumName')
        g['url'] = node.findtext('gremiumURL')
        sl.upsert(engine, table, g, unique=['key'])
    table = sl.get_table(engine, 'gremium_mitglieder')
    sl.upsert(engine, table, {
        'gremium_key': g['key'],
        'person_source_url': url,
        'role': role
    }, unique=['person_source_url', 'gremium_key', 'role'])

def cleanup_resource(engine, source_table, row, force, data_row_filter,
                     stats, stats_spending):
    if not row['combine_status']:
        stats.add_source('Previous step (combine) not complete', row)
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 cleanup_status=True,
                                 cleanup_hash=row['combine_hash']) is not None:
        stats.add_source('Already cleaned up', row)
        return

    log.info("Cleanup: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)
    no_rows = True
    no_errors = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_has_rows, sheet_error_message = cleanup_sheet(
            engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_rows and sheet_has_rows:
            no_rows = False
    if data_row_filter:
        stats.add_source(
            'Resource data filtered, not saving resource cleanup.', row)
    else:
        sl.upsert(engine, source_table, {
            'resource_id': row['resource_id'],
            'cleanup_hash': row['combine_hash'],
            'cleanup_status': no_errors,
        }, unique=['resource_id'])
        if no_rows:
            stats.add_source('Empty sheet', row)
        elif no_errors:
            stats.add_source('Cleaned up ok', row)
        else:
            stats.add_source(error_message, row)

def resolve_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    data = sl.find_one(engine, table, source_url=source_url)
    if data is None:
        log.error("No data: %s", source_url)
        return
    subject = data['subject']
    try:
        title = resolve_votes(subject)
    except BadReference:
        title = None
        log.info("No match for: %s", data['person'])
    sl.upsert(engine, table, {'subject': subject, 'title': title},
              unique=['subject'])

def scrape_activity(ablauf, elem, engine):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = sl.find_one(engine, Position, urheber=urheber,
                    fundstelle=fundstelle, ablauf_id=ablauf['ablauf_id'])
    if p is not None:
        return
    p = {'ablauf_id': ablauf['ablauf_id'],
         'urheber': urheber,
         'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")

    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.upsert(engine, Zuweisung, z, unique=[])

    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.upsert(engine, Beschluss, b, unique=[])

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        dokument['ablauf_key'] = ablauf['key']
        dokument['wahlperiode'] = ablauf['wahlperiode']
        sl.upsert(engine, Referenz, dokument, unique=[
            'link', 'wahlperiode', 'ablauf_key', 'seiten'
        ])
    except Exception, e:
        log.exception(e)

def scrape_gremium(engine, url, force=False):
    table = sl.get_table(engine, 'gremium')
    response, doc = _xml(url)
    a = sl.find_one(engine, table, source_url=url)
    if a is None:
        a = {'source_url': url}
    a = check_tags(a, response, force)
    a['key'] = doc.findtext('/ausschussId')
    a['name'] = doc.findtext('/ausschussName')
    log.info("Ausschuss (%s): %s" % (a['key'], a['name']))
    a['aufgabe'] = doc.findtext('/ausschussAufgabe')
    a['image_url'] = doc.findtext('/ausschussBildURL')
    a['image_copyright'] = doc.findtext('/ausschussCopyright')
    a['rss_url'] = GREMIUM_RSS_FEEDS.get(a['key'])
    a['url'] = URL_PATTERN % a['key']
    a['type'] = 'ausschuss'
    sl.upsert(engine, table, a, unique=['key'])
    return a

def combine_resource(engine, source_table, row, force):
    if not row['extract_status']:
        return

    # Skip over tables we have already combined
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 combine_hash=row['extract_hash'],
                                 combine_status=True) is not None:
        return

    log.info("Combine: %s, Resource %s", row['package_name'], row['resource_id'])
    status = combine_resource_core(engine, row)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'combine_hash': row['extract_hash'],
        'combine_status': status,
    }, unique=['resource_id'])

def format(engine, resource_id, table_id, force):
    connection = engine.connect()
    trans = connection.begin()

    condensed_table = sl.get_table(connection, 'condensed')
    condensed_row = sl.find_one(connection, condensed_table,
                                resource_id=resource_id, table_id=table_id)
    if condensed_row is None:
        condensed_row = {'resource_id': resource_id,
                         'table_id': table_id}

    start = time.time()
    try:
        # Skip rows that were already formatted (freshly created rows have
        # no 'format_time' yet, so use .get() to avoid a KeyError).
        if not force and condensed_row.get('format_time') is not None:
            return
        do_format(connection, resource_id, table_id)
        condensed_row['format_time'] = time.time() - start
        sl.upsert(connection, condensed_table, condensed_row,
                  ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()

def load_sitzung(engine, indexer, wp, session):
    table = sl.get_table(engine, 'webtv_speech')
    data = sl.find_one(engine, table, wp=wp, session=session)
    log.info("Loading Sitzung: %s/%s...", wp, session)
    sitzung = Sitzung.query.filter_by(wahlperiode=wp,
                                      nummer=session).first()
    if sitzung is None:
        sitzung = Sitzung()
        sitzung.wahlperiode = wp
        sitzung.nummer = session
    sitzung.titel = data.get('session_name')
    sitzung.date = to_date(data.get('session_date'))
    sitzung.source_url = data.get('session_url')
    db.session.add(sitzung)
    db.session.flush()
    indexer.add(sitzung)
    load_debatten(engine, indexer, sitzung)
    db.session.commit()
    return sitzung

def lookup(val, engine):
    supplier_table = sl.get_table(engine, 'supplier')
    data = sl.find_one(engine, supplier_table, name=val)
    if data is not None:
        return data['canonical'], data['uri'], data['score']
    try:
        query = json.dumps({'query': val, 'limit': 1})
        res = session.get('http://opencorporates.com/reconcile/gb',
                          params={'query': query})
        data = {'name': val, 'canonical': None, 'uri': None, 'score': 0}
        if res.ok and res.json and len(res.json.get('result')):
            r = res.json.get('result').pop()
            data['canonical'] = r['name']
            data['uri'] = r['uri']
            data['score'] = r['score']
        log.info('OpenCorporates Lookup: %s -> %s', val, data['canonical'])
        sl.upsert(engine, supplier_table, data, unique=['name'])
        return data['canonical'], data['uri'], data['score']
    except Exception, ex:
        log.exception(ex)
        return None, None, None

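# Usage sketch (an assumption, not part of the original sources): lookup()
# reconciles a raw supplier string against the OpenCorporates GB endpoint and
# caches the result in the 'supplier' table, so repeated calls for the same
# name are answered from the database. Fall back to the raw string when no
# canonical form is known.
def canonical_supplier_name(engine, raw_name):
    canonical, uri, score = lookup(raw_name, engine)
    return canonical if canonical else raw_name
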
def extract_resource(engine, source_table, row, force):
    if not row['retrieve_status']:
        log.debug('Row has no retrieve status - skipping')
        return

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 extract_status=True,
                                 extract_hash=row['retrieve_hash']) is not None:
        return

    log.info("Extracting: %s, File %s", row['package_name'], row['resource_id'])
    status, sheets = extract_resource_core(engine, row)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'extract_hash': row['retrieve_hash'],
        'extract_status': status,
        'sheets': sheets
    }, unique=['resource_id'])

def canonical_actor(grano, engine, title):
    entity_table = sl.get_table(engine, 'entity')
    res = sl.find_one(engine, entity_table, etlFingerPrint=title)
    if res is not None and \
            'canonicalName' in res and \
            res['canonicalName'] and \
            title != res['canonicalName']:
        if title not in DELETED:
            nonCanon = grano.findEntity(ACTOR['name'], title=title)
            if nonCanon:
                ent = grano.getEntity(nonCanon['id'], deep=True)
                for rel in ent.get('incoming', []):
                    grano.deleteRelation(rel)
                for rel in ent.get('outgoing', []):
                    grano.deleteRelation(rel)
                grano.deleteEntity(nonCanon)
            DELETED.add(title)
        title = res['canonicalName']
    act = grano.findEntity(ACTOR['name'], title=title) or {}
    print [title], act.get('id')
    act['title'] = title
    act['type'] = ACTOR['name']
    return act

def validate_resource(engine, source_table, row, force):
    if not row['cleanup_status']:
        return

    # Skip over tables we have already validated
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 validate_status=True,
                                 validate_hash=row['cleanup_hash']) is not None:
        return

    log.info("Validate: %s, Resource %s", row['package_name'], row['resource_id'])

    status = True
    for sheet_id in range(0, row['sheets']):
        sheet_status = validate_sheet(engine, row, sheet_id)
        if status and not sheet_status:
            status = False
    log.info("Result: %s", status)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'validate_hash': row['cleanup_hash'],
        'validate_status': status,
    }, unique=['resource_id'])

def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')
    key = int(url.rsplit('/', 1)[-1].split('.')[0])
    a = sl.find_one(engine, Ablauf, source_url=url)
    if a is not None and a['abgeschlossen'] and not force:
        raise Unmodified()
    response = fetch(url)
    a = check_tags(a or {}, response, force)
    a.update({'key': key, 'source_url': url})
    doc = inline_xml_from_page(response.content, url)
    if doc is None:
        raise NoContentException()

    a['wahlperiode'] = int(doc.findtext("WAHLPERIODE"))
    a['typ'] = doc.findtext("VORGANGSTYP")
    a['titel'] = doc.findtext("TITEL")
    if not a['titel'] or not len(a['titel'].strip()):
        raise NoContentException()

    if '\n' in a['titel']:
        t, k = a['titel'].rsplit('\n', 1)
        k = k.strip()
        if k.startswith('KOM') or k.startswith('SEK'):
            a['titel'] = t

    a['initiative'] = doc.findtext("INITIATIVE")
    a['stand'] = doc.findtext("AKTUELLER_STAND")
    a['signatur'] = doc.findtext("SIGNATUR")
    a['gesta_id'] = doc.findtext("GESTA_ORDNUNGSNUMMER")
    a['eu_dok_nr'] = doc.findtext("EU_DOK_NR")
    a['abstrakt'] = doc.findtext("ABSTRAKT")
    a['sachgebiet'] = doc.findtext("SACHGEBIET")
    a['zustimmungsbeduerftig'] = doc.findtext("ZUSTIMMUNGSBEDUERFTIGKEIT")
    #a.schlagworte = []
    Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in doc.findall("SCHLAGWORT"):
        wort = {'wort': sw.text, 'source_url': url}
        sl.upsert(engine, Schlagwort, wort, unique=wort.keys())
    log.info("Ablauf %s: %s", url, a['titel'].encode('ascii', 'replace'))
    a['titel'] = a['titel'].strip().lstrip('.').strip()
    a = expand_dok_nr(a)
    a['abgeschlossen'] = DIP_ABLAUF_STATES_FINISHED.get(a['stand'], False)
    if 'Originaltext der Frage(n):' in a['abstrakt']:
        _, a['abstrakt'] = a['abstrakt'].split('Originaltext der Frage(n):', 1)

    Referenz = sl.get_table(engine, 'referenz')
    for elem in doc.findall("WICHTIGE_DRUCKSACHE"):
        link = elem.findtext("DRS_LINK")
        hash = None
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("DRS_HERAUSGEBER"), 'drs',
                                  elem.findtext("DRS_NUMMER"), link=link)
        dokument['text'] = elem.findtext("DRS_TYP")
        dokument['seiten'] = hash
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument, unique=[
            'link', 'source_url', 'seiten'
        ])
    for elem in doc.findall("PLENUM"):
        link = elem.findtext("PLPR_LINK")
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("PLPR_HERAUSGEBER"), 'plpr',
                                  elem.findtext("PLPR_NUMMER"), link=link)
        dokument['text'] = elem.findtext("PLPR_KLARTEXT")
        dokument['seiten'] = elem.findtext("PLPR_SEITEN")
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument, unique=[
            'link', 'source_url', 'seiten'
        ])
    sl.upsert(engine, Ablauf, a, unique=['source_url'])
    for elem in doc.findall(".//VORGANGSPOSITION"):
        scrape_activity(engine, url, elem)
    return a

            'link', 'source_url', 'seiten'
        ])
    except Exception, e:
        log.exception(e)
    sl.upsert(engine, Position, p, unique=[])

    Person = sl.get_table(engine, 'person')
    Beitrag = sl.get_table(engine, 'beitrag')
    for belem in elem.findall("PERSOENLICHER_URHEBER"):
        b = pos_keys.copy()
        b['vorname'] = belem.findtext("VORNAME")
        b['nachname'] = belem.findtext("NACHNAME")
        b['funktion'] = belem.findtext("FUNKTION")
        b['ort'] = belem.findtext('WAHLKREISZUSATZ')
        p = sl.find_one(engine, Person,
                        vorname=b['vorname'],
                        nachname=b['nachname'],
                        ort=b['ort'])
        if p is not None:
            b['person_source_url'] = p['source_url']
        b['ressort'] = belem.findtext("RESSORT")
        b['land'] = belem.findtext("BUNDESLAND")
        b['fraktion'] = FACTION_MAPS.get(belem.findtext("FRAKTION"),
                                         belem.findtext("FRAKTION"))
        b['seite'] = belem.findtext("SEITE")
        b['art'] = belem.findtext("AKTIVITAETSART")
        sl.upsert(engine, Beitrag, b, unique=[])


class NoContentException(Exception):
    pass


def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')

def lazyload_gremium(engine, key):
    table = sl.get_table(engine, 'gremium')
    data = sl.find_one(engine, table, key=key)
    if data is None:
        return None
    return load_gremium(engine, data)

def check_done(engine, url, force=False):
    if force:
        return False
    table = sl.get_table(engine, 'speech')
    data = sl.find_one(engine, table, source_url=url, loaded=True)
    return data is not None

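# Usage sketch (an assumption, not part of the original sources): use
# check_done() as a guard so already-loaded transcript URLs are skipped
# unless force is set; scrape_transcript() is defined earlier in this
# collection.
def scrape_if_needed(engine, url, force=False):
    if check_done(engine, url, force=force):
        return None
    return scrape_transcript(engine, url, force=force)
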
def scrape_mdb(engine, url, force=False):
    response, doc = _xml(url)
    id = int(doc.findtext('//mdbID'))
    table_person = sl.get_table(engine, 'person')
    table_rolle = sl.get_table(engine, 'rolle')
    p = sl.find_one(engine, table_person, mdb_id=id)
    if p is None:
        p = {'source_url': url}
    else:
        r_ = sl.find_one(engine, table_rolle, mdb_id=id, funktion='MdB')
        if r_ is not None:
            r = r_
    p = check_tags(p, response, force)
    r = {'person_source_url': url, 'funktion': 'MdB'}
    r['mdb_id'] = p['mdb_id'] = id
    r['status'] = doc.find('//mdbID').get('status')
    if doc.findtext('//mdbAustrittsdatum'):
        r['austritt'] = datetime.strptime(doc.findtext('//mdbAustrittsdatum'),
                                          '%d.%m.%Y').isoformat()
    p['vorname'] = doc.findtext('//mdbVorname')
    p['nachname'] = doc.findtext('//mdbZuname')
    p['adelstitel'] = doc.findtext('//mdbAdelstitel')
    p['titel'] = doc.findtext('//mdbAkademischerTitel')
    p['ort'] = doc.findtext('//mdbOrtszusatz')
    message = 'MdB: %s %s (%s)' % (p['vorname'], p['nachname'], p['ort'])
    log.info(message.encode('ascii', 'replace'))
    p['geburtsdatum'] = doc.findtext('//mdbGeburtsdatum')
    p['religion'] = doc.findtext('//mdbReligionKonfession')
    p['hochschule'] = doc.findtext('//mdbHochschulbildung')
    p['beruf'] = doc.findtext('//mdbBeruf')
    p['berufsfeld'] = doc.find('//mdbBeruf').get('berufsfeld')
    p['geschlecht'] = doc.findtext('//mdbGeschlecht')
    p['familienstand'] = doc.findtext('//mdbFamilienstand')
    p['kinder'] = doc.findtext('//mdbAnzahlKinder')
    r['fraktion'] = doc.findtext('//mdbFraktion')
    p['fraktion'] = doc.findtext('//mdbFraktion')
    p['partei'] = doc.findtext('//mdbPartei')
    p['land'] = doc.findtext('//mdbLand')
    r['gewaehlt'] = doc.findtext('//mdbGewaehlt')
    p['bio_url'] = doc.findtext('//mdbBioURL')
    p['bio'] = doc.findtext('//mdbBiografischeInformationen')
    p['wissenswertes'] = doc.findtext('//mdbWissenswertes')
    p['homepage_url'] = doc.findtext('//mdbHomepageURL')
    p['telefon'] = doc.findtext('//mdbTelefon')
    p['angaben'] = doc.findtext('//mdbVeroeffentlichungspflichtigeAngaben')
    p['foto_url'] = doc.findtext('//mdbFotoURL')
    p['foto_copyright'] = doc.findtext('//mdbFotoCopyright')
    p['reden_plenum_url'] = doc.findtext('//mdbRedenVorPlenumURL')
    p['reden_plenum_rss_url'] = doc.findtext('//mdbRedenVorPlenumRSS')
    p['wk_nummer'] = doc.findtext('//mdbWahlkreisNummer')
    p['wk_name'] = doc.findtext('//mdbWahlkreisName')
    p['wk_url'] = doc.findtext('//mdbWahlkreisURL')

    for website in doc.findall('//mdbSonstigeWebsite'):
        type_ = website.findtext('mdbSonstigeWebsiteTitel')
        ws_url = website.findtext('mdbSonstigeWebsiteURL')
        if type_.lower() == 'twitter':
            p['twitter_url'] = ws_url
        if type_.lower() == 'facebook':
            p['facebook_url'] = ws_url

    if doc.findtext('//mdbBundestagspraesident'):
        sl.upsert(engine, table_rolle, {
            'person_source_url': url,
            'funktion': u'Bundestagspräsident',
        }, unique=['person_source_url', 'funktion'])
    if doc.findtext('//mdbBundestagsvizepraesident'):
        sl.upsert(engine, table_rolle, {
            'person_source_url': url,
            'funktion': u'Bundestagsvizepräsident',
        }, unique=['person_source_url', 'funktion'])

    for n in doc.findall('//mdbObleuteGremium'):
        add_to_gremium(n, url, 'obleute', engine)
    for n in doc.findall('//mdbVorsitzGremium'):
        add_to_gremium(n, url, 'vorsitz', engine)
    for n in doc.findall('//mdbStellvertretenderVorsitzGremium'):
        add_to_gremium(n, url, 'stellv_vorsitz', engine)
    for n in doc.findall('//mdbVorsitzSonstigesGremium'):
        add_to_gremium(n, url, 'vorsitz', engine)
    for n in doc.findall('//mdbStellvVorsitzSonstigesGremium'):
        add_to_gremium(n, url, 'stellv_vorsitz', engine)
    for n in doc.findall('//mdbOrdentlichesMitgliedGremium'):
        add_to_gremium(n, url, 'mitglied', engine)
    for n in doc.findall('//mdbStellvertretendesMitgliedGremium'):
        add_to_gremium(n, url, 'stellv_mitglied', engine)
    for n in doc.findall('//mdbOrdentlichesMitgliedSonstigesGremium'):
        add_to_gremium(n, url, 'mitglied', engine)
    for n in doc.findall('//mdbStellvertretendesMitgliedSonstigesGremium'):
        add_to_gremium(n, url, 'stellv_mitglied', engine)

    sl.upsert(engine, table_person, p, unique=['source_url'])
    sl.upsert(engine, table_rolle, r,
              unique=['person_source_url', 'funktion'])
    return p

    for res in pkg['resources']:
        log.info("  > Resource %s: %s", res['id'], res['url'])
        data = {
            'resource_id': res['id'],
            'package_id': pkg['id'],
            'package_name': pkg['name'],
            'package_title': pkg['title'],
            'last_modified': res.get('last_modified'),
            'url': res['url'],
            'publisher_name': group.get('name'),
            'publisher_title': group.get('title'),
            'publisher_type': group.get('type'),
            'format': res['format'],
            'description': res['description']
        }
        row = sl.find_one(engine, table, resource_id=res['id'])
        if row and row['url'] != res['url']:
            # url has changed, so force retrieval next time
            data['retrieve_status'] = False
        sl.upsert(engine, table, data, ['resource_id'])


def connect():
    engine = db_connect()
    src_table = sl.get_table(engine, 'source')
    return engine, src_table


def build_index(department_filter=None):
    '''Searches CKAN for spending resources and writes their
    metadata to the database.'''
    engine, table = connect()
    client = ckan_client()

        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)
    sl.add_row(engine, Position, p)

    Person = sl.get_table(engine, 'person')
    Beitrag = sl.get_table(engine, 'beitrag')
    for belem in elem.findall("PERSOENLICHER_URHEBER"):
        b = pos_keys.copy()
        b['vorname'] = belem.findtext("VORNAME")
        b['nachname'] = belem.findtext("NACHNAME")
        b['funktion'] = belem.findtext("FUNKTION")
        b['ort'] = belem.findtext('WAHLKREISZUSATZ')
        p = sl.find_one(engine, Person,
                        vorname=b['vorname'],
                        nachname=b['nachname'],
                        ort=b['ort'])
        if p is not None:
            b['person_source_url'] = p['source_url']
        b['ressort'] = belem.findtext("RESSORT")
        b['land'] = belem.findtext("BUNDESLAND")
        b['fraktion'] = FACTION_MAPS.get(belem.findtext("FRAKTION"),
                                         belem.findtext("FRAKTION"))
        b['seite'] = belem.findtext("SEITE")
        b['art'] = belem.findtext("AKTIVITAETSART")
        sl.add_row(engine, Beitrag, b)


class NoContentException(Exception):
    pass

def fetch_row(engine, table_name, **kw):
    table = sl.get_table(engine, table_name)
    return sl.find_one(engine, table, **kw)

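# Usage sketch (an assumption, not part of the original sources): fetch_row()
# wraps the sl.get_table() / sl.find_one() pair used throughout these snippets
# to pull a single row by keyword filters, here the speech row already matched
# for a given source URL.
def speech_is_matched(engine, url):
    return fetch_row(engine, 'speech', source_url=url, matched=True) is not None
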
def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')
    key = int(url.rsplit('/', 1)[-1].split('.')[0])
    a = sl.find_one(engine, Ablauf, source_url=url)
    if a is not None and a['abgeschlossen'] and not force:
        raise Unmodified()
    response = fetch(url)
    a = check_tags(a or {}, response, force)
    a.update({'key': key, 'source_url': url})
    doc = inline_xml_from_page(response.content, url)
    if doc is None:
        raise NoContentException()

    a['wahlperiode'] = int(doc.findtext("WAHLPERIODE"))
    a['typ'] = doc.findtext("VORGANGSTYP")
    a['titel'] = doc.findtext("TITEL")
    if not a['titel'] or not len(a['titel'].strip()):
        raise NoContentException()

    if '\n' in a['titel']:
        t, k = a['titel'].rsplit('\n', 1)
        k = k.strip()
        if k.startswith('KOM') or k.startswith('SEK'):
            a['titel'] = t

    a['initiative'] = doc.findtext("INITIATIVE")
    a['stand'] = doc.findtext("AKTUELLER_STAND")
    a['signatur'] = doc.findtext("SIGNATUR")
    a['gesta_id'] = doc.findtext("GESTA_ORDNUNGSNUMMER")
    a['eu_dok_nr'] = doc.findtext("EU_DOK_NR")
    a['abstrakt'] = doc.findtext("ABSTRAKT")
    a['sachgebiet'] = doc.findtext("SACHGEBIET")
    a['zustimmungsbeduerftig'] = doc.findtext("ZUSTIMMUNGSBEDUERFTIGKEIT")
    #a.schlagworte = []
    Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in doc.findall("SCHLAGWORT"):
        wort = {'wort': sw.text, 'source_url': url}
        sl.upsert(engine, Schlagwort, wort, unique=wort.keys())
    log.info("Ablauf %s: %s", url, a['titel'].encode('ascii', 'replace'))
    a['titel'] = a['titel'].strip().lstrip('.').strip()
    a = expand_dok_nr(a)
    a['abgeschlossen'] = DIP_ABLAUF_STATES_FINISHED.get(a['stand'], False)
    if a['wahlperiode'] != max(app.config.get('WAHLPERIODEN')):
        a['abgeschlossen'] = True
    if 'Originaltext der Frage(n):' in a['abstrakt']:
        _, a['abstrakt'] = a['abstrakt'].split('Originaltext der Frage(n):', 1)

    sl.delete(engine, sl.get_table(engine, 'position'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beitrag'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'zuweisung'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beschluss'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'referenz'), source_url=url)

    for elem in doc.findall(".//VORGANGSPOSITION"):
        scrape_activity(engine, url, elem)

    Referenz = sl.get_table(engine, 'referenz')
    for elem in doc.findall("WICHTIGE_DRUCKSACHE"):
        link = elem.findtext("DRS_LINK")
        hash = None
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("DRS_HERAUSGEBER"), 'drs',
                                  elem.findtext("DRS_NUMMER"), link=link)
        dokument['text'] = elem.findtext("DRS_TYP")
        dokument['seiten'] = hash
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument,
                  unique=['link', 'source_url', 'seiten'])
    for elem in doc.findall("PLENUM"):
        link = elem.findtext("PLPR_LINK")
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("PLPR_HERAUSGEBER"), 'plpr',
                                  elem.findtext("PLPR_NUMMER"), link=link)
        dokument['text'] = elem.findtext("PLPR_KLARTEXT")
        dokument['seiten'] = elem.findtext("PLPR_SEITEN")
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument,
                  unique=['link', 'source_url', 'seiten'])
    sl.upsert(engine, Ablauf, a, unique=['source_url'])
    return a