def transform():
    """ Transform and clean up content """
    # Open the staging webstore database and load the static master data
    # used by most of the transform steps below.
    db, _ = WebStore(app.config["STAGING_URL"])
    master = master_data()
    # NOTE(review): each step is imported immediately before it runs, and the
    # steps presumably build on columns added by earlier ones -- confirm
    # ordering constraints before reordering or parallelising.
    from offenesparlament.transform import persons
    persons.generate_person_long_names(db)
    from offenesparlament.transform import ablaeufe
    ablaeufe.extend_ablaeufe(db, master)
    from offenesparlament.transform import positions
    positions.extend_positions(db)
    from offenesparlament.transform import namematch
    namematch.match_persons(db, master)
    from offenesparlament.transform import abstimmungen
    abstimmungen.extend_abstimmungen(db, master)
    # persons.generate_person_long_names(db)
    from offenesparlament.transform import mediathek
    mediathek.extend_speeches(db, master)
    from offenesparlament.transform import speechparser
    speechparser.load_transcripts(db, master)
    mediathek.merge_speeches(db, master)
    from offenesparlament.transform import speechmatch
    speechmatch.extend_speeches(db, master)
def load_transcript(db, master, wp, session):
    """Fetch the plenary transcript for (wp, session), parse it into
    individual speech contributions and write them to the 'speech' table.

    db      -- webstore database handle
    master  -- master data mapping (now actually used: previously this
               function ignored the parameter and re-fetched master_data())
    wp      -- wahlperiode (legislative period) number
    session -- sitzung (session) number

    Propagates HTTPError from urlopen when no transcript exists.
    """
    url = URL % (wp, session)
    # Log before the network fetch so failed sessions are still traceable.
    log.info("Loading transcript: %s/%s" % (wp, session))
    fh = urlopen(url)
    try:
        # Buffer the whole document in memory; presumably SpeechParser needs
        # a re-readable file-like object -- TODO confirm.
        sio = StringIO(fh.read())
    finally:
        # Close the HTTP response even if read() raises (was leaked before).
        fh.close()
    Speech = db['speech']
    seq = 0
    for contrib in SpeechParser(master, db, sio):
        # Skip contributions with no actual text.
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        Speech.writerow(contrib,
                        unique_columns=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
            # (overlap fragment: tail of load_transcript, whose header
            # precedes this chunk; kept verbatim)
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        Speech.writerow(contrib,
                        unique_columns=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1

def load_transcripts(db, master):
    # Load all transcripts of wahlperiode 17, probing session numbers from 33
    # upwards until the server reports a missing transcript (HTTPError).
    # NOTE(review): wahlperiode 17 and start session 33 are hard-coded.
    for i in count(33):
        try:
            load_transcript(db, master, 17, i)
        except HTTPError:
            break

if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    # Ad-hoc single-session runs kept around for debugging:
    #load_transcripts(db, master_data())
    #load_transcript(db, master_data(), 17, 72)
    #load_transcript(db, master_data(), 17, 93)
    #load_transcript(db, master_data(), 17, 101)
    #load_transcript(db, master_data(), 17, 103)
    load_transcript(db, master_data(), 17, 126)
    #sp = SpeechParser(master_data(), db, fp)
    #for l in sp:
    #    pprint(l)
    #pass
def cache_abstimmungen(engine): Abstimmung = sl.get_table(engine, 'abstimmung') data = defaultdict(dict) for e in sl.distinct(engine, Abstimmung, 'subject', 'date'): data[e['date']][e['subject']] = set(drucksachen(e['subject'])) return dict(data.items()) def extend_beschluesse(engine, master): log.info("Re-connecting beschluesse ...") abstimmungen = cache_abstimmungen(engine) #pprint(abstimmungen) Beschluss = sl.get_table(engine, 'beschluss') for data in sl.find(engine, Beschluss): date = data['fundstelle'].split(' ')[0] data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat() if not data['dokument_text']: continue if data['date'] in abstimmungen: abst = abstimmungen[data['date']] doks = set(data['dokument_text'].split(', ')) for subject, adoks in abst.items(): if len(doks & adoks): print "MATCH", data['date'], doks, adoks if __name__ == '__main__': engine = etl_engine() print "DESTINATION", engine extend_beschluesse(engine, master_data())
from webstore.client import URL as WebStore from offenesparlament.core import master_data from offenesparlament.transform.namematch import match_speaker, make_prints log = logging.getLogger(__name__) logging.basicConfig(level=logging.NOTSET) import re def drucksachen(text, wahlperiode=17): pat = r"(%s/\d{1,6}(\s*\(.{1,10}\))?)" % wahlperiode for drs, sufx in re.findall(pat, text): yield drs def _foo(db, master): for subj in db['abstimmung'].distinct('subject'): print list(drucksachen(subj['subject'])) if __name__ == '__main__': assert len(sys.argv)==2, "Need argument: webstore-url!" db, _ = WebStore(sys.argv[1]) print "DESTINATION", db _foo(db, master_data())
    # (continuation of a speech-scanning routine whose header precedes this
    # chunk; 'db', 'Speech', 'drs_match' and 'wahlperiode' are bound above)
    SpeechDocument = db['speech_document']
    for i, data in enumerate(Speech):
        # Only scan 'chair' contributions -- presumably the session chair
        # announces the drucksache numbers; confirm against the parser.
        if data.get('type') != 'chair':
            continue
        # Crude progress indicator: one dot per 1000 rows.
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        m = drs_match.search(data.get('text'))
        if m is None:
            continue
        # NOTE(review): the inner loop re-binds 'i', shadowing the outer
        # enumerate index; harmless because the outer loop reassigns it each
        # iteration, but worth renaming for clarity.
        for i, grp in enumerate(m.groups()):
            if grp and '/' in grp:
                # NOTE(review): the parsed 'wp' prefix is discarded and the
                # enclosing 'wahlperiode' is stored instead -- confirm intended.
                wp, nummer = grp.split('/', 1)
                SpeechDocument.writerow({
                    'group': i,
                    'sequence': data['sequence'],
                    'sitzung': data['sitzung'],
                    'wahlperiode': wahlperiode,
                    'dok_nummer': nummer},
                    unique_columns=['sequence', 'sitzung',
                                    'wahlperiode', 'group'],
                    bufferlen=5000)
    SpeechDocument.flush()

if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    extend_speeches(db, master_data())
from webstore.client import URL as WebStore from offenesparlament.core import master_data from offenesparlament.transform.namematch import match_speaker, make_prints log = logging.getLogger(__name__) logging.basicConfig(level=logging.NOTSET) def extend_abstimmungen(db, master): log.info("Amending votes ...") Abstimmung = db["abstimmung"] prints = make_prints(db) for data in Abstimmung.distinct("person"): try: fp = match_speaker(master, data["person"], prints) if fp is not None: Abstimmung.writerow( {"person": data.get("person"), "fingerprint": fp}, unique_columns=["person"], bufferlen=100 ) except ValueError, ve: log.exception(ve) Abstimmung.flush() if __name__ == "__main__": assert len(sys.argv) == 2, "Need argument: webstore-url!" db, _ = WebStore(sys.argv[1]) print "DESTINATION", db extend_abstimmungen(db, master_data())
def match_beitraege(db, master, prints): Beitrag = db['beitrag'] for i, beitrag in enumerate(db.query(QUERY)): if i % 1000 == 0: sys.stdout.write('.') sys.stdout.flush() match = match_beitrag(db, master, beitrag, prints) ensure_rolle(beitrag, match, db) beitrag['fingerprint'] = match Beitrag.writerow(beitrag, unique_columns=['vorname', 'nachname', 'funktion', 'land', 'fraktion', 'ressort', 'ort'], bufferlen=2000) Beitrag.flush() def make_prints(db): return [p.get('fingerprint') for p in db['person'].distinct('fingerprint') \ if p.get('fingerprint')] def match_persons(db, master): prints = make_prints(db) match_beitraege(db, master, prints) match_speakers(db, master, prints) if __name__ == '__main__': assert len(sys.argv)==2, "Need argument: webstore-url!" db, _ = WebStore(sys.argv[1]) print "DESTINATION", db match_persons(db, master_data())
import logging import sqlaload as sl from offenesparlament.core import etl_engine from offenesparlament.core import master_data log = logging.getLogger(__name__) def extend_ablaeufe(engine, master): log.info("Amending ablaeufe ...") Ablauf = sl.get_table(engine, 'ablauf') typen = [(t.get('typ'), t.get('class')) for t in master['ablauf_typ']] typen = dict(typen) for data in sl.distinct(engine, Ablauf, 'typ'): klass = typen.get(data.get('typ')) sl.upsert(engine, Ablauf, {'typ': data.get('typ'), 'class': klass}, unique=['typ']) if __name__ == '__main__': engine = etl_engine() print "DESTINATION", engine extend_ablaeufe(engine, master_data())
                # (continuation of a speech/mediathek alignment loop; the
                # enclosing function, 'speech_fp', 'med_fp', 'emit' and
                # 'speech_idx' are defined above this chunk; indentation of
                # this reconstruction is a best guess -- confirm nesting)
                if speech_fp == med_fp(speech_idx+1):
                    # 2. current matches, next also matches
                    # -> use current and increment
                    speech_idx += 1
                else:
                    # 1. current matches, next does not match
                    # -> use current
                    break
            else:
                if speech_fp == med_fp(speech_idx+1):
                    # 4. current does not match, next matches
                    # -> use next
                    speech_idx += 1
                    emit(speech, speech_idx)
                    break
                else:
                    # 3. current does not match, next does not match
                    # -> use current
                    emit(speech, speech_idx)
                    break
    SpeechMediathek.flush()

if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    extend_speeches(db, master_data())
    # NOTE(review): hard-coded debug run for wahlperiode 17, sitzung 121.
    merge_speech(db, master_data(), 17, 121)
    # (continuation of a cache_abstimmungen variant whose header precedes this
    # chunk: builds {date: {subject: set(drucksachen)}} for all votes)
    data = defaultdict(dict)
    q = db.query("SELECT DISTINCT subject, date FROM abstimmung;")
    for e in q:
        data[e['date']][e['subject']] = set(drucksachen(e['subject']))
    # Return a plain dict so lookups of unknown dates raise KeyError.
    return dict(data.items())

def extend_beschluesse(db, master):
    """Report beschluss rows sharing a drucksache with a same-day vote.

    NOTE(review): matches are only printed, never written back to the DB.
    """
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(db)
    pprint(abstimmungen)
    Beschluss = db['beschluss']
    for data in Beschluss:
        # The fundstelle presumably starts with the date, e.g.
        # "17.06.2011 ..." -- strptime raises on any other format; confirm.
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks

if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    extend_beschluesse(db, master_data())