def merge_speeches(engine):
    # desired result: (position_id, debatte_id)
    referenzen = referenzen_index(engine)
    items = item_index(engine)
    log.info("Finding best matches.... ")
    matches = {}
    for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
        for (iwp, isession, item_id), idrs in items.items():
            if iwp != rwp or rsession != isession:
                continue
            ints = len(idrs.intersection(rdrs))
            if ints == 0:
                continue
            k = (ablauf_id, rwp, rsession)
            if k in matches and matches[k][1] > ints:
                continue
            matches[k] = (item_id, ints)
    log.info("Saving position associations....")
    pos_tbl = sl.get_table(engine, 'position')
    for (ablauf_id, wp, session), (item_id, n) in matches.items():
        for pos in sl.find(engine, pos_tbl,
                           ablauf_id="%s/%s" % (wp, ablauf_id)):
            if not pos['fundstelle_url']:
                continue
            if 'btp/%s/%s%03d.pdf' % (wp, wp, int(session)) in pos['fundstelle_url']:
                d = {'ablauf_id': pos['ablauf_id'],
                     'hash': pos['hash'],
                     'debatte_wp': wp,
                     'debatte_session': session,
                     'debatte_item_id': item_id}
                sl.upsert(engine, pos_tbl, d, unique=['ablauf_id', 'hash'])

def load_transcript(engine, wp, session, incremental=True):
    url = URL % (wp, session)
    Speech = sl.get_table(engine, 'speech')
    if incremental and sl.find_one(engine, Speech,
                                   source_url=url, matched=True):
        return True
    if '404 Seite nicht gefunden' in fetch(url):
        return False
    sio = fetch_stream(url)
    if sio is None:
        return False
    log.info("Loading transcript: %s/%s" % (wp, session))
    seq = 0
    parser = SpeechParser(engine, sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        contrib['matched'] = True
        sl.upsert(engine, Speech, contrib,
                  unique=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
    if parser.missing_recon:
        sl.upsert(engine, Speech, {
            'matched': False,
            'sitzung': session,
            'wahlperiode': wp
        }, unique=['sitzung', 'wahlperiode'])
    return True

def update_network_entities(engine, file_name):
    log.info("Updating network entities reference sheet: %s", file_name)
    network_entities = set()
    table = sl.get_table(engine, 'network_entity')
    if os.path.exists(file_name):
        fh = open(file_name, 'rb')
        reader = csv.DictReader(fh)
        for d in reader:
            e = [(k, v.decode('utf-8')) for (k, v) in d.items()]
            e = dict(e)
            network_entities.add((e['representativeEtlId'],
                                  e['etlFingerPrint']))
            sl.upsert(engine, table, e,
                      ['representativeEtlId', 'etlFingerPrint'])
        fh.close()
        reps = set([ne[0] for ne in network_entities])
        rep_table = sl.get_table(engine, 'representative')
        for rep in reps:
            sl.update(engine, rep_table, {'etlId': rep},
                      {'network_extracted': True})
    for row in sl.all(engine, table):
        network_entities.add((row['representativeEtlId'],
                              row['etlFingerPrint']))
    fh = open(file_name, 'wb')
    writer = None
    table = sl.get_table(engine, 'network_entity')
    for ic, fp in network_entities:
        row = {'representativeEtlId': ic, 'etlFingerPrint': fp}
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writerow(dict(zip(row.keys(), row.keys())))
        r = [(k, unicode(v).encode('utf-8')) for k, v in row.items()]
        writer.writerow(dict(r))
    fh.close()

def extract_resource(engine, source_table, row, force, stats):
    if not row['retrieve_status']:
        stats.add_source('Previous step (retrieve) not complete', row)
        log.debug('Row has no retrieve status - skipping')
        return
    # Skip over tables we have already extracted
    if not force and sl.find_one(
            engine, source_table,
            resource_id=row['resource_id'],
            extract_status=True,
            extract_hash=row['retrieve_hash']) is not None:
        stats.add_source('Already extracted', row)
        return
    log.info("Extract: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)
    status, sheets = extract_resource_core(engine, row, stats)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'extract_hash': row['retrieve_hash'],
        'extract_status': status,
        'sheets': sheets
    }, unique=['resource_id'])

def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sio = find_local(url)
    sample = {'source_etag': 'local'}
    if sio is None:
        sample = sl.find_one(engine, table, source_url=url, matched=True)
        response, sio = fetch_stream(url)
        sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s", wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib, unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
            'matched': True,
            'source_url': url,
        }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data

def extend_positions(engine):
    log.info("Amending positions ...")
    Position = sl.get_table(engine, 'position')
    for i, data in enumerate(sl.find(engine, Position)):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        dt, rest = data['fundstelle'].split("-", 1)
        data['date'] = datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat()
        if ',' in data['urheber']:
            typ, quelle = data['urheber'].split(',', 1)
            data['quelle'] = re.sub("^.*Urheber.*:", "", quelle).strip()
            data['typ'] = typ.strip()
        else:
            data['typ'] = data['urheber']
        br = 'Bundesregierung, '
        if data['urheber'].startswith(br):
            data['urheber'] = data['urheber'][len(br):]
        data['fundstelle_doc'] = None
        if data['fundstelle_url'] and \
                'btp' in data['fundstelle_url']:
            data['fundstelle_doc'] = data['fundstelle_url']\
                .rsplit('#', 1)[0]
        hash = sha1(data['fundstelle'].encode('utf-8')
                    + data['urheber'].encode('utf-8')
                    + data['ablauf_id'].encode('utf-8')).hexdigest()
        data['hash'] = hash[:10]
        sl.upsert(engine, Position, data, unique=UNIQUE)

def cleanup_resource(engine, source_table, row, force):
    if not row["combine_status"]:
        return
    # Skip over tables we have already cleaned up
    if (
        not force
        and sl.find_one(
            engine,
            source_table,
            resource_id=row["resource_id"],
            cleanup_status=True,
            cleanup_hash=row["combine_hash"],
        )
        is not None
    ):
        return
    log.info("Cleanup: %s, Resource %s",
             row["package_name"], row["resource_id"])
    status = True
    for sheet_id in range(0, row["sheets"]):
        sheet_status = cleanup_sheet(engine, row, sheet_id)
        if status and not sheet_status:
            status = False
    sl.upsert(
        engine,
        source_table,
        {"resource_id": row["resource_id"],
         "cleanup_hash": row["combine_hash"],
         "cleanup_status": status},
        unique=["resource_id"],
    )

def combine_resource(engine, source_table, row, force, stats):
    if not row['extract_status']:
        stats.add_source('Previous step (extract) not complete', row)
        return
    # Skip over tables we have already combined
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 combine_hash=row['extract_hash'],
                                 combine_status=True) is not None:
        stats.add_source('Already combined', row)
        return
    log.info("Combine: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)
    status = combine_resource_core(engine, row, stats)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'combine_hash': row['extract_hash'],
        'combine_status': status,
    }, unique=['resource_id'])

def make_fingerprint(engine, person):
    try:
        long_name = make_long_name(person)
        try:
            long_name = resolve_person(long_name)
            log.info(" -> %s" % long_name.strip())
        except:
            log.error("Resolve did not work")
            pass
        Person = sl.get_table(engine, 'person')
        sl.upsert(engine, Person, {
            'fingerprint': long_name,
            'slug': url_slug(long_name),
            'mdb_id': person['mdb_id']
        }, unique=['mdb_id'])
        Rolle = sl.get_table(engine, 'rolle')
        sl.upsert(engine, Rolle, {
            'mdb_id': person['mdb_id'],
            'fingerprint': long_name
        }, unique=['mdb_id'])
        person['fingerprint'] = long_name
    except BadReference:
        log.error("Bad Reference %s", person)
        pass

def extend_position(engine, table, data):
    dt, rest = data['fundstelle'].split("-", 1)
    data['date'] = datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat()
    if ',' in data['urheber']:
        typ, quelle = data['urheber'].split(',', 1)
        data['quelle'] = re.sub("^.*Urheber.*:", "", quelle).strip()
        data['typ'] = typ.strip()
    else:
        data['typ'] = data['urheber']
    br = 'Bundesregierung, '
    if data['urheber'].startswith(br):
        data['urheber'] = data['urheber'][len(br):]
    data['fundstelle_doc'] = None
    if data['fundstelle_url'] and \
            'btp' in data['fundstelle_url']:
        data['fundstelle_doc'] = data['fundstelle_url']\
            .rsplit('#', 1)[0]
    hash = sha1(data['fundstelle'].encode('utf-8')
                + data['urheber'].encode('utf-8')
                + data['source_url'].encode('utf-8')).hexdigest()
    data['hash'] = hash[:10]
    sl.upsert(engine, table, data, unique=['id'])

def extend_speeches(engine, wahlperiode=17):
    log.info("Amending speeches with DRS ...")
    drs_match = re.compile(DRS_MATCH % (wahlperiode, wahlperiode))
    Speech = sl.get_table(engine, 'speech')
    SpeechDocument = sl.get_table(engine, 'speech_document')
    for i, data in enumerate(sl.find(engine, Speech)):
        if data.get('type') != 'chair':
            continue
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        m = drs_match.search(data.get('text'))
        if m is None:
            continue
        for i, grp in enumerate(m.groups()):
            if grp and '/' in grp:
                wp, nummer = grp.split('/', 1)
                sl.upsert(engine, SpeechDocument, {
                    'group': i,
                    'sequence': data['sequence'],
                    'sitzung': data['sitzung'],
                    'wahlperiode': wahlperiode,
                    'dok_nummer': nummer
                }, unique=['sequence', 'sitzung', 'wahlperiode', 'group'])

def map_columns():
    engine, columns_table = connect()
    q = select([columns_table.c.normalised, columns_table.c.count,
                columns_table.c.valid],
               order_by=[columns_table.c.count.desc().nullslast()])
    for normalised, count, valid in engine.execute(q):
        if valid is not None:
            continue
        try:
            columns = map_column(engine, columns_table, normalised, count)
            if columns is not None:
                sl.upsert(engine, columns_table,
                          {'normalised': normalised,
                           'valid': True,
                           'column_map': json.dumps(columns)},
                          ['normalised'])
            else:
                sl.upsert(engine, columns_table,
                          {'normalised': normalised,
                           'valid': False},
                          ['normalised'])
        except SystemExit:
            raise
        except:
            traceback.print_exc()

def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sample = sl.find_one(engine, table, source_url=url, matched=True)
    response, sio = fetch_stream(url)
    sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s", wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib, unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
            'matched': True,
            'source_url': url,
        }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data

def generate_person_long_names(engine):
    log.info("Generating person fingerprints and slugs...")
    from offenesparlament.transform.namematch import match_speaker
    nkp = nk_persons()
    Person = sl.get_table(engine, 'person')
    for person in sl.find(engine, Person):
        long_name = make_long_name(person)
        try:
            long_name = match_speaker(long_name)
        except NKNoMatch:
            pass
        log.info(" -> %s" % long_name.strip())
        slug = url_slug(long_name)
        sl.upsert(engine, Person, {
            'fingerprint': long_name,
            'slug': slug,
            'id': person['id']
        }, unique=['id'])
        tries = 0
        while True:
            try:
                nkp.ensure_value(long_name, data=person)
            except ValueError, E:
                log.warn('Exception: %s' % str(E))
                tries = tries + 1
                if tries > 5:
                    raise
            else:
                break

def load_budget(base_url, year, engine, table):
    context = {'data_year': year}
    print "\nHaushalt: %s" % year
    i = 0
    for row in load_einzelplaene(base_url % year, context):
        row['titel_id'] = row['id']
        del row['id']
        row['remarks'] = "\n\n".join(row['remarks'])
        commitment_appropriations = row['commitment_appropriations'].copy()
        del row['commitment_appropriations']
        #if len(commitment_appropriations):
        #    #print len(commitment_appropriations)
        row['commitment_year'] = None
        row['source_id'] = str(year) + "." + str(i)
        sl.upsert(engine, table, row, UNIQUE_COLUMNS)
        i += 1
        for year, amount in commitment_appropriations.items():
            ca = row.copy()
            ca['commitment_year'] = context['data_year']
            ca['year'] = year
            ca['amount'] = amount
            ca['financial_type'] = 'VE'
            ca['source_id'] = str(year) + "." + str(i)
            sl.upsert(engine, table, ca, UNIQUE_COLUMNS)
            i += 1

def mark_done(engine, url):
    table = sl.get_table(engine, 'speech')
    sl.upsert(engine, table, {
        'loaded': True,
        'source_url': url,
    }, unique=['source_url'])

def process_rows(handlefunc, engine=None):
    if engine is None:
        engine = make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.all(engine, table):
        out = handlefunc(row)
        sl.upsert(engine, table, out, ['id'])
    return table

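# Illustrative sketch, not part of the source: process_rows() above is
# assumed to take a callable that receives one row dict from the 'fts'
# table and returns the fields to upsert, keyed by 'id'. The column names
# 'total_amount' and 'total_amount_clean' are hypothetical.
def normalise_amount(row):
    # strip thousands separators so the value can be cast downstream
    raw = (row.get('total_amount') or '').replace(',', '')
    return {'id': row['id'], 'total_amount_clean': raw}

# process_rows(normalise_amount) would then write the cleaned values back.
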
def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)
    if not engine.has_table('raw_%s' % table_suffix):
        return
    condensed_table = sl.get_table(engine, 'condensed')
    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table,
                                 resource_id=resource_id,
                                 table_id=table_id) is not None:
        return
    connection = engine.connect()
    trans = connection.begin()
    start = time.time()
    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')
        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table,
                                  normalised=normalised_headers)
        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return
        column_mapping = json.loads(mapping_row['column_map'])
        # Build the final mapping from input column to output column
        mapping = {}
        for k, n in normalise_map.iteritems():
            if n in column_mapping and column_mapping[n] is not None \
                    and len(column_mapping[n]) > 0:
                mapping[k] = column_mapping[n]
        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)
        sl.upsert(connection, condensed_table,
                  {'resource_id': resource_id,
                   'table_id': table_id,
                   'condense_time': time.time() - start,
                   }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()

def validate_resource(engine, source_table, row, force, data_row_filter,
                      stats, stats_spending):
    if not row['cleanup_status']:
        stats.add_source('Previous step (cleanup) not complete', row)
        return
    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 validate_status=True,
                                 validate_hash=row['cleanup_hash']) is not None:
        stats.add_source('Already validated', row)
        return
    log.info("Validate: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)
    no_errors = True
    no_records = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_records, sheet_error_message = validate_sheet(
            engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_records and sheet_records:
            no_records = False
    if data_row_filter:
        stats.add_source(
            'Resource data filtered, not saving resource cleanup.', row)
    else:
        log.info("Result: records=%s errors=%s",
                 not no_records, not no_errors)
        sl.upsert(engine, source_table, {
            'resource_id': row['resource_id'],
            'validate_hash': row['cleanup_hash'],
            'validate_status': no_errors,
        }, unique=['resource_id'])
        if no_errors:
            if no_records:
                stats.add_source('No records but no errors', row)
            else:
                stats.add_source('Validated ok', row)
        else:
            if no_records:
                stats.add_source(
                    'All transactions invalid: %s' % error_message, row)
            else:
                stats.add_source(
                    'Some transactions invalid: %s' % error_message, row)

def speechmatcher_alignment_post(wp, session):
    engine = etl_engine()
    table = sl.get_table(engine, 'alignments')
    data = dict(request.form.items())
    data['sequence'] = int(data['sequence'])
    data['wp'] = wp
    data['session'] = session
    sl.upsert(engine, table, data, ['wp', 'session', 'sequence'])
    return speechmatcher_alignment_get(wp, session)

def match_beitraege(engine, url):
    table = sl.get_table(engine, 'beitrag')
    for beitrag in sl.distinct(engine, table, *KEYS, source_url=url):
        match = match_beitrag(engine, beitrag, url)
        beitrag['fingerprint'] = match
        beitrag['matched'] = match is not None
        if match:
            ensure_rolle(beitrag, match, engine)
        sl.upsert(engine, table, beitrag, unique=KEYS)

def merge():
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.distinct(engine, table, 'beneficiary', 'country_code'):
        canonical, uri, score = lookup(row.get('beneficiary'),
                                       row.get('country_code'), engine)
        row['beneficiary_canonical'] = canonical
        row['beneficiary_uri'] = uri
        row['beneficiary_score'] = score
        sl.upsert(engine, table, row, ['beneficiary', 'country'])

def merge():
    read_countries()
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.distinct(engine, table, 'country'):
        country = row.get('country')
        data = match(country)
        row['country_code'] = data.get('iso_3166-1_2')
        row['country_common'] = data.get('common')
        sl.upsert(engine, table, row, ['country'])

def extend_ablaeufe(engine, master):
    log.info("Amending ablaeufe ...")
    Ablauf = sl.get_table(engine, 'ablauf')
    typen = [(t.get('typ'), t.get('class')) for t in master['ablauf_typ']]
    typen = dict(typen)
    for data in sl.distinct(engine, Ablauf, 'typ'):
        klass = typen.get(data.get('typ'))
        sl.upsert(engine, Ablauf, {
            'typ': data.get('typ'),
            'class': klass
        }, unique=['typ'])

def ensure_rolle(beitrag, fp, engine):
    rolle = {
        'fingerprint': fp,
        'ressort': beitrag.get('ressort'),
        'fraktion': beitrag.get('fraktion'),
        'funktion': beitrag.get('funktion')
    }
    Rolle = sl.get_table(engine, 'rolle')
    sl.upsert(engine, Rolle, rolle, unique=['fingerprint', 'funktion'])

def clean_ablauf(engine, data):
    try:
        table = sl.get_table(engine, 'ablauf')
        data['class'] = resolve_type(data.get('typ'))
        data['stage'] = resolve_stage(data.get('stand'))
        # store the resolved stage, not the raw 'stand' string
        d = {'class': data['class'],
             'stage': data['stage'],
             'source_url': data['source_url']}
        sl.upsert(engine, table, d, unique=['source_url'])
    except BadReference:
        pass

def save():
    etlId = request.form.get('representativeEtlId')
    matches = set(request.form.getlist('matches[]'))
    for match in matches:
        match = match.strip().strip(",").strip(";").strip(".").strip()
        sl.upsert(engine, network_entity, {
            'etlFingerPrint': match,
            'representativeEtlId': etlId
        }, ['etlFingerPrint', 'representativeEtlId'])
    sl.upsert(engine, representative, {
        'etlId': etlId,
        'network_extracted': True
    }, ['etlId'])
    return jsonify({'status': 'OK'})

def create_entities(engine):
    log.info("De-normalizing global entities collection...")
    table = sl.get_table(engine, 'entity')
    for tbl in ['representative', 'person', 'financialDataTurnover',
                'organisation', 'network_entity']:
        for row in sl.all(engine, sl.get_table(engine, tbl)):
            entity = {'etlFingerPrint': row.get('etlFingerPrint')}
            entity['legalStatus'] = row.get('legalStatus', '')
            entity['countryCode'] = row.get('contactCountryCode', '')
            entity['etlTable'] = tbl
            sl.upsert(engine, table, entity, ['etlFingerPrint', 'etlTable'])

def load_person(person, role, childBase, engine):
    table = sl.get_table(engine, 'person')
    person_ = childBase.copy()
    person_.update(person)
    person_['role'] = role
    person_['etlFingerPrint'] = '%s %s %s' % (person['title'] or '',
                                              person['firstName'],
                                              person['lastName'])
    person_['etlFingerPrint'] = person_['etlFingerPrint'].strip()
    sl.upsert(engine, table, person_,
              ['representativeEtlId', 'role', 'etlFingerPrint'])

def integrate_recon(engine, table, qfunc, src_col, dst_name_col,
                    dst_uri_col, min_score=None, limit=200,
                    memory_name=None):
    if memory_name is None:
        memory_name = "recon_%s_%s" % (table.name, src_col)
    memory = SQLALoadMemory(engine, table=memory_name)
    for row in sl.distinct(engine, table, src_col):
        res = interactive(qfunc, row[src_col], min_score=min_score,
                          memory=memory, limit=limit)
        if res is not None:
            #print row.get(src_col), " -> ", res.name.encode('utf-8'), res.score
            sl.upsert(engine, table, {
                src_col: row[src_col],
                dst_name_col: res.name,
                dst_uri_col: res.uri
            }, [src_col])

def resolve_stimmen(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    for data in sl.find(engine, table, source_url=source_url):
        try:
            fp = resolve_person(data['person'])
        except BadReference:
            fp = None
            log.info("No match for: %s", data['person'])
        sl.upsert(engine, table, {
            'person': data.get('person'),
            'matched': fp is not None,
            'fingerprint': fp
        }, unique=['person'])

def fetch_package(client, package_name, engine, table):
    print package_name
    pkg = client.package_entity_get(package_name)
    for res in pkg['resources']:
        sl.upsert(engine, table, {
            'resource_id': res['id'],
            'package_id': pkg['id'],
            'package_name': pkg['name'],
            'url': res['url'],
            'publisher': pkg.get('extras', {}).get('published_by'),
            'format': res['format'],
            'description': res['description']
        }, ['resource_id'])

def articles(engine):
    a_table = sl.get_table(engine, 'article')
    for data in sl.find(engine, a_table):
        up = {'number': data['number']}
        slug_parts = data['canonical_url'].split('/')[3:]
        if len(slug_parts) > 3:
            print slug_parts
        if len(slug_parts) == 3:
            up['ressort'], up['subressort'], _ = slug_parts
        elif len(slug_parts) == 2:
            up['ressort'], _ = slug_parts
        up['date'] = parse_date(data['date_text'])
        sl.upsert(engine, a_table, up, ['number'])

def scrape_speeches(engine, data):
    url = WEBTV_SPEECHES % (data['wp'], data['session'], data['item_id'])
    response, doc = _html(url)
    rows = doc.findall('//tr')
    table = sl.get_table(engine, 'webtv')
    for i, row in enumerate(rows):
        if i % 4 != 0:
            continue
        data['speaker'] = row.xpath('string()').strip()
        if isinstance(data['speaker'], str):
            data['speaker'] = data['speaker'].encode('latin-1').decode('utf-8')
        data['speech_id'] = rows[i + 2].find('.//a').get('href').split('=')[-1]
        sl.upsert(engine, table, data, ['speech_id'])

def match_beitraege(engine):
    Beitrag = sl.get_table(engine, 'beitrag')
    for i, beitrag in enumerate(sl.distinct(engine, Beitrag,
            'vorname', 'nachname', 'funktion', 'land', 'fraktion',
            'ressort', 'ort')):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        match = match_beitrag(engine, beitrag)
        ensure_rolle(beitrag, match, engine)
        beitrag['fingerprint'] = match
        beitrag['matched'] = match is not None
        sl.upsert(engine, Beitrag, beitrag,
                  unique=['vorname', 'nachname', 'funktion', 'land',
                          'fraktion', 'ressort', 'ort'])

def load_ausschuss(url, engine, table):
    doc = _xml(url)
    a = {'source_url': url}
    a['key'] = doc.findtext('/ausschussId')
    a['name'] = doc.findtext('/ausschussName')
    log.info("Ausschuss (%s): %s" % (a['key'], a['name']))
    a['aufgabe'] = doc.findtext('/ausschussAufgabe')
    a['image_url'] = doc.findtext('/ausschussBildURL')
    a['image_copyright'] = doc.findtext('/ausschussCopyright')
    a['rss_url'] = RSS_FEEDS.get(a['key'])
    a['url'] = URL_PATTERN % a['key']
    a['type'] = 'ausschuss'
    sl.upsert(engine, table, a, unique=['key'])

def cleanup_resource(engine, source_table, row, force, data_row_filter,
                     stats, stats_spending):
    if not row['combine_status']:
        stats.add_source('Previous step (combine) not complete', row)
        return
    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 cleanup_status=True,
                                 cleanup_hash=row['combine_hash']) is not None:
        stats.add_source('Already cleaned up', row)
        return
    log.info("Cleanup: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)
    no_rows = True
    no_errors = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_has_rows, sheet_error_message = cleanup_sheet(
            engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_rows and sheet_has_rows:
            no_rows = False
    if data_row_filter:
        stats.add_source(
            'Resource data filtered, not saving resource cleanup.', row)
    else:
        sl.upsert(engine, source_table, {
            'resource_id': row['resource_id'],
            'cleanup_hash': row['combine_hash'],
            'cleanup_status': no_errors,
        }, unique=['resource_id'])
        if no_rows:
            stats.add_source('Empty sheet', row)
        elif no_errors:
            stats.add_source('Cleaned up ok', row)
        else:
            stats.add_source(error_message, row)

def retrieve(row, engine, source_table, force, stats):
    content_id = None
    if not force and row.get('retrieve_status') is True \
            and row.get('retrieve_hash') and os.path.exists(source_path(row)):
        # cached file exists and url is unchanged
        stats.add_source('Already cached and in database', row)
        return
    # fetch the file
    log.info("Retrieve: /dataset/%s/resource/%s",
             row['package_name'], row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)
    url = row['url'].strip()  # no-one can disagree with doing .strip()
    log.info('Fetching: "%s"', url)
    success, content_or_error = get_url(url)
    if not success:
        # URL didn't work, so try 'fixed' versions of it
        original_error = content_or_error
        fixed_urls = fix_url(url)
        for fixed_url in fixed_urls:
            log.info('Fetching fixed url: "%s"', fixed_url)
            success, content_or_error = get_url(fixed_url)
            if success:
                break
    if success:
        stats.add_source('Downloaded', row)
    elif os.path.exists(source_path(row)):
        stats.add_source('Could not download but it was in the cache', row)
        with open(source_path(row), 'rb') as fh:
            content_or_error = fh.read()
        success = True
    if success:
        data = content_or_error
        content_id = calculate_hash(data)
        fh = open(source_path(row), 'wb')
        fh.write(data)
        fh.close()
    else:
        stats.add_source(original_error, row)
        issue(engine, row['resource_id'], None, STAGE, original_error,
              url.encode('utf8', 'ignore'))
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'retrieve_status': success,
        'retrieve_hash': content_id
    }, unique=['resource_id'])

def add_to_gremium(node, url, role, engine):
    key = node.get('id')
    table = sl.get_table(engine, 'gremium')
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {'key': key, 'type': 'sonstiges'}
        g['name'] = node.findtext('gremiumName')
        g['url'] = node.findtext('gremiumURL')
        sl.upsert(engine, table, g, unique=['key'])
    table = sl.get_table(engine, 'gremium_mitglieder')
    sl.upsert(engine, table, {
        'gremium_key': g['key'],
        'person_source_url': url,
        'role': role
    }, unique=['person_source_url', 'gremium_key', 'role'])

def resolve_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    data = sl.find_one(engine, table, source_url=source_url)
    if data is None:
        log.error("No data: %s", source_url)
        return
    subject = data['subject']
    try:
        title = resolve_votes(subject)
    except BadReference:
        title = None
        # log the unmatched subject rather than the unrelated 'person' field
        log.info("No match for: %s", subject)
    sl.upsert(engine, table, {
        'subject': subject,
        'title': title
    }, unique=['subject'])

def scrape_gremium(engine, url, force=False):
    table = sl.get_table(engine, 'gremium')
    response, doc = _xml(url)
    a = sl.find_one(engine, table, source_url=url)
    if a is None:
        a = {'source_url': url}
    a = check_tags(a, response, force)
    a['key'] = doc.findtext('/ausschussId')
    a['name'] = doc.findtext('/ausschussName')
    log.info("Ausschuss (%s): %s" % (a['key'], a['name']))
    a['aufgabe'] = doc.findtext('/ausschussAufgabe')
    a['image_url'] = doc.findtext('/ausschussBildURL')
    a['image_copyright'] = doc.findtext('/ausschussCopyright')
    a['rss_url'] = GREMIUM_RSS_FEEDS.get(a['key'])
    a['url'] = URL_PATTERN % a['key']
    a['type'] = 'ausschuss'
    sl.upsert(engine, table, a, unique=['key'])
    return a

def combine_resource(engine, source_table, row, force):
    if not row['extract_status']:
        return
    # Skip over tables we have already combined
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 combine_hash=row['extract_hash'],
                                 combine_status=True) is not None:
        return
    log.info("Combine: %s, Resource %s",
             row['package_name'], row['resource_id'])
    status = combine_resource_core(engine, row)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'combine_hash': row['extract_hash'],
        'combine_status': status,
    }, unique=['resource_id'])

def load_profiles(engine):
    doc = etree.parse(FEED_URL)
    Person = sl.get_table(engine, 'person')
    for profile in doc.findall('//PROFIL'):
        name = profile.findtext('.//VORNAME')
        if name is None:
            continue
        name += ' ' + profile.findtext('.//NACHNAME')
        partei = profile.findtext('.//PARTEI')
        name += ' ' + PARTEI_MAPPING.get(partei, partei)
        try:
            fp = resolve_person(name)
            sl.upsert(engine, Person, {
                'awatch_url': profile.get('url'),
                'fingerprint': fp
            }, unique=['fingerprint'])
        except BadReference:
            pass

def merge_speech(engine, wp, session):
    log.info("Merging media + transcript: %s/%s" % (wp, session))
    score, alignment = get_alignment(engine, wp, session)
    log.info("Matching score: %s", score)
    agenda = get_agenda(engine, wp, session)
    agenda = dict([(a['item_id'], a) for a in agenda])
    alignment = dict([(a['sequence'], a) for a in alignment])
    item = None
    table = sl.get_table(engine, 'webtv_speech')
    for speech in sl.find(engine, sl.get_table(engine, 'speech'),
                          order_by='sequence', wahlperiode=wp,
                          sitzung=session, matched=True):
        sequence = speech['sequence']
        item = alignment.get(sequence, item)
        data = agenda.get(item['item_id']).copy()
        del data['id']
        data['sequence'] = sequence
        sl.upsert(engine, table, data, unique=['wp', 'session', 'sequence'])

def extract_resource(engine, source_table, row, force):
    if not row['retrieve_status']:
        log.debug('Row has no retrieve status - skipping')
        return
    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, source_table,
                                 resource_id=row['resource_id'],
                                 extract_status=True,
                                 extract_hash=row['retrieve_hash']) is not None:
        return
    log.info("Extracting: %s, File %s",
             row['package_name'], row['resource_id'])
    status, sheets = extract_resource_core(engine, row)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'extract_hash': row['retrieve_hash'],
        'extract_status': status,
        'sheets': sheets
    }, unique=['resource_id'])

def lookup(val, engine):
    supplier_table = sl.get_table(engine, 'supplier')
    data = sl.find_one(engine, supplier_table, name=val)
    if data is not None:
        return data['canonical'], data['uri'], data['score']
    try:
        query = json.dumps({'query': val, 'limit': 1})
        res = session.get('http://opencorporates.com/reconcile/gb',
                          params={'query': query})
        data = {'name': val, 'canonical': None, 'uri': None, 'score': 0}
        if res.ok and res.json and len(res.json.get('result')):
            r = res.json.get('result').pop()
            data['canonical'] = r['name']
            data['uri'] = r['uri']
            data['score'] = r['score']
        log.info('OpenCorporates Lookup: %s -> %s', val, data['canonical'])
        sl.upsert(engine, supplier_table, data, unique=['name'])
        return data['canonical'], data['uri'], data['score']
    except Exception, ex:
        log.exception(ex)
        return None, None, None

def make_person(engine, beitrag, fp, source_url):
    try:
        fp = resolve_person(fp)
        person = {
            'fingerprint': fp,
            'slug': url_slug(fp),
            'source_url': source_url,
            'vorname': beitrag['vorname'],
            'nachname': beitrag['nachname'],
            'ort': beitrag.get('ort'),
            'ressort': beitrag.get('ressort'),
            'land': beitrag.get('land'),
            'fraktion': beitrag.get('fraktion')
        }
        sl.upsert(engine, sl.get_table(engine, 'person'), person,
                  unique=['fingerprint'])
    except BadReference:
        pass
    return fp

def handle_list(page):
    texts = page.findall('text')
    header = [c.xpath("string()") for c in texts[:20]]
    if header[1].strip() == 'Seite:':
        col_offset = 3
    else:
        for i, h in enumerate(header):
            if 'Name' in h:
                col_offset = i
                break
    fraktion = texts[col_offset - 1].xpath("string()")
    fraktion = fraktion.replace(u"ÜNDNIS`", "")
    fraktion = fraktion.replace(u"ÜNDNIS'", "")
    columns = [(int(c.get('left')), c.xpath("string()")) for c in
               texts[col_offset:col_offset + 6]]
    texts = texts[col_offset + 6:]
    name = u''
    #print columns
    for i, t in enumerate(texts):
        txt = t.xpath('string()').strip()
        if txt == 'Summe':
            break
        if not len(txt):
            continue
        left, field = min(columns,
                          key=lambda c: abs(int(t.get('left')) - c[0]))
        if 'Name' in field:
            name += ' ' + txt
        if txt == 'X':
            field = field.strip().strip('.').strip()
            data = {
                'subject': unicode(subject),
                'person': name.strip() + ' ' + fraktion,
                'date': unicode(date),
                'vote': unicode(field)
            }
            data.update(base_data)
            sl.upsert(engine, Vote, data, unique=['subject', 'person'])
            name = u''

def parse_angaben(engine, data):
    if not data.get('angaben'):
        return
    snippet = '<x>' + data['angaben'] + '</x>'
    doc = html.fragment_fromstring(snippet)
    table = sl.get_table(engine, 'angaben')
    data = {'source_url': data['source_url']}
    wrapped_name = False
    for el in doc:
        if el.tag == 'h3':
            wrapped_name = False
            data['section'] = el.text.split('. ', 1)[-1]
        elif el.tag == 'strong' or not el.text or not el.get('class'):
            continue
        elif 'voa_abstand' in el.get('class') or wrapped_name:
            client = el.text
            if wrapped_name:
                client = data['client'] + ' ' + client
            # keep the stripped value (the original discarded this result)
            client = client.strip().strip(',')
            data['client'] = client
            els = client.rsplit(',', 2)
            if len(els) == 3:
                wrapped_name = False
                data['client_name'] = els[0].strip()
                data['client_city'] = els[1].strip()
            else:
                wrapped_name = True
                continue
        else:
            data['service'] = el.text
            data['level'] = 'Stufe 0'
            for name in LEVELS:
                if name.lower() in data['service'].lower():
                    data['level'] = name
            sl.upsert(engine, table, data,
                      ['source_url', 'section', 'client', 'service'])

              res.status_code, url_printable)
        result = 'Download failed (status %s)' % res.status_code
    except requests.Timeout, re:
        result = 'Timeout accessing URL'
        issue(engine, row['resource_id'], None, result, url_printable)
        success = False
    except Exception, re:
        log.exception(re)
        issue(engine, row['resource_id'], None, 'Exception occurred',
              unicode(re))
        success = False
        result = 'Exception occurred'
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'retrieve_status': success,
        'retrieve_hash': content_id
    }, unique=['resource_id'])
    return result


def retrieve_some(force=False, **filters):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    result_counts = defaultdict(int)
    for row in sl.find(engine, source_table, **filters):
        result = retrieve(row, engine, source_table, force)
        result_counts['total'] += 1
        result_counts[result] += 1
    log.info('Total %i URLs', result_counts.pop('total'))
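
# Illustrative usage, not part of the source: retrieve_some() above passes
# its keyword arguments straight to sl.find() as filters on the 'source'
# table, so a single dataset could be re-fetched like this. The
# 'package_name' value is hypothetical.
if __name__ == '__main__':
    retrieve_some(force=True, package_name='example-spend-publication')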