Exemple #1
0
def transform():
    """Run all transform steps against the staging store, in a fixed order.

    The staging store is opened from ``app.config["STAGING_URL"]`` and the
    master data is loaded once; both are handed to the individual steps.
    Every step module is imported immediately before its first use.
    """
    staging, _ = WebStore(app.config["STAGING_URL"])
    reference = master_data()

    # Step 1: person long names.
    from offenesparlament.transform import persons
    persons.generate_person_long_names(staging)

    # Step 2: ablaeufe.
    from offenesparlament.transform import ablaeufe
    ablaeufe.extend_ablaeufe(staging, reference)

    # Step 3: positions.
    from offenesparlament.transform import positions
    positions.extend_positions(staging)

    # Step 4: match persons against the master data.
    from offenesparlament.transform import namematch
    namematch.match_persons(staging, reference)

    # Step 5: votes.
    from offenesparlament.transform import abstimmungen
    abstimmungen.extend_abstimmungen(staging, reference)

    # Step 6: mediathek speeches, then transcripts, then merge both.
    from offenesparlament.transform import mediathek
    mediathek.extend_speeches(staging, reference)
    from offenesparlament.transform import speechparser
    speechparser.load_transcripts(staging, reference)
    mediathek.merge_speeches(staging, reference)

    # Step 7: speech matching.
    from offenesparlament.transform import speechmatch
    speechmatch.extend_speeches(staging, reference)
Exemple #2
0
def load_transcript(db, master, wp, session):
    """Download one transcript and store its contributions in ``db['speech']``.

    The document at ``URL % (wp, session)`` is fetched, parsed with
    ``SpeechParser`` and every contribution whose text is not blank is
    written as one row, keyed by (sequence, sitzung, wahlperiode).

    :param db: store object; ``db['speech']`` must provide ``writerow``.
    :param master: master data passed on to ``SpeechParser``.
    :param wp: first value interpolated into ``URL``; stored as
        ``wahlperiode`` on every row.
    :param session: second value interpolated into ``URL``; stored as
        ``sitzung`` on every row.

    Exceptions raised by ``urlopen`` are not caught here and propagate to
    the caller.
    """
    url = URL % (wp, session)
    fh = urlopen(url)
    try:
        sio = StringIO(fh.read())
    finally:
        # Release the connection even when reading the response fails.
        fh.close()
    log.info("Loading transcript: %s/%s", wp, session)
    Speech = db['speech']
    seq = 0
    # Use the master data supplied by the caller instead of reloading it
    # for every single transcript.
    for contrib in SpeechParser(master, db, sio):
        if not contrib['text'].strip():
            # Skip contributions without any text; they do not consume a
            # sequence number.
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        Speech.writerow(contrib,
                unique_columns=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
Exemple #3
0
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        Speech.writerow(contrib, 
                unique_columns=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1

def load_transcripts(db, master):
    """Load transcripts for consecutive sessions, starting at session 33.

    Sessions of Wahlperiode 17 are loaded one after another via
    ``load_transcript``; the first ``HTTPError`` ends the run.
    """
    session = 33
    while True:
        try:
            load_transcript(db, master, 17, session)
        except HTTPError:
            # Stop at the first session that cannot be fetched.
            return
        session += 1

if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    #load_transcripts(db, master_data())
    #load_transcript(db, master_data(), 17, 72)
    #load_transcript(db, master_data(), 17, 93)
    #load_transcript(db, master_data(), 17, 101)
    #load_transcript(db, master_data(), 17, 103)
    load_transcript(db, master_data(), 17, 126)
    #sp = SpeechParser(master_data(), db, fp)
    #for l in sp:
    #    pprint(l)
        #pass
def cache_abstimmungen(engine):
    """Build a lookup of document numbers per vote subject, grouped by date.

    Reads the distinct (subject, date) pairs of the ``abstimmung`` table and
    returns a plain dict of the form ``{date: {subject: set_of_numbers}}``,
    where the numbers are whatever ``drucksachen`` extracts from the subject.
    """
    table = sl.get_table(engine, 'abstimmung')
    by_date = defaultdict(dict)
    for row in sl.distinct(engine, table, 'subject', 'date'):
        subject = row['subject']
        numbers = set(drucksachen(subject))
        by_date[row['date']][subject] = numbers
    # Hand back a regular dict so later lookups cannot create new keys.
    return dict(by_date)


def extend_beschluesse(engine, master):
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(engine)
    #pprint(abstimmungen)
    Beschluss = sl.get_table(engine, 'beschluss')
    for data in sl.find(engine, Beschluss):
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks


if __name__ == '__main__':
    # Script entry point: obtain the ETL engine, report which destination is
    # used, then run the beschluss/vote matching with freshly loaded master
    # data.
    engine = etl_engine()
    print "DESTINATION", engine
    extend_beschluesse(engine, master_data())
Exemple #5
0
from webstore.client import URL as WebStore

from offenesparlament.core import master_data
from offenesparlament.transform.namematch import match_speaker, make_prints

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Configures the root logger as a side effect of importing this module;
# with level NOTSET the root logger processes records of every level.
logging.basicConfig(level=logging.NOTSET)

import re


def drucksachen(text, wahlperiode=17):
    """Yield every document number of the form ``<wahlperiode>/<digits>``.

    A number consists of the given ``wahlperiode``, a slash and one to six
    digits. An immediately following parenthesised suffix of up to ten
    characters (optionally separated by whitespace) is included in the
    yielded string.
    """
    pattern = r"(%s/\d{1,6}(\s*\(.{1,10}\))?)" % wahlperiode
    for hit in re.finditer(pattern, text):
        # Group 1 is the full number including the optional suffix.
        yield hit.group(1)



def _foo(db, master):
    for subj in db['abstimmung'].distinct('subject'):
        print list(drucksachen(subj['subject']))

if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    _foo(db, master_data())


Exemple #6
0
    SpeechDocument = db['speech_document']
    for i, data in enumerate(Speech):
        if data.get('type') != 'chair':
            continue
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        m = drs_match.search(data.get('text'))
        if m is None:
            continue
        for i, grp in enumerate(m.groups()):
            if grp and '/' in grp:
                wp, nummer = grp.split('/', 1)
                SpeechDocument.writerow({
                    'group': i,
                    'sequence': data['sequence'],
                    'sitzung': data['sitzung'],
                    'wahlperiode': wahlperiode,
                    'dok_nummer': nummer},
                    unique_columns=['sequence', 'sitzung', 'wahlperiode',
                        'group'],
                    bufferlen=5000)
    SpeechDocument.flush()

if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    extend_speeches(db, master_data())

Exemple #7
0
from webstore.client import URL as WebStore

from offenesparlament.core import master_data
from offenesparlament.transform.namematch import match_speaker, make_prints

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Configures the root logger as a side effect of importing this module;
# with level NOTSET the root logger processes records of every level.
logging.basicConfig(level=logging.NOTSET)


def extend_abstimmungen(db, master):
    """Store a fingerprint for every distinct person of ``db['abstimmung']``.

    Each distinct ``person`` value is resolved with ``match_speaker`` against
    the fingerprints returned by ``make_prints``. When a fingerprint is
    found, it is written back to the rows of that person. A ``ValueError``
    raised while handling one person is logged and the loop continues with
    the next one.

    :param db: store object providing the ``abstimmung`` table.
    :param master: master data passed on to ``match_speaker``.
    """
    log.info("Amending votes ...")
    Abstimmung = db["abstimmung"]
    prints = make_prints(db)
    for data in Abstimmung.distinct("person"):
        person = data["person"]
        try:
            fp = match_speaker(master, person, prints)
            if fp is not None:
                # Buffered write; the final flush() below pushes the rest.
                Abstimmung.writerow(
                    {"person": person, "fingerprint": fp},
                    unique_columns=["person"],
                    bufferlen=100,
                )
        except ValueError as ve:
            # Best effort: record the failure and keep going.
            log.exception(ve)
    Abstimmung.flush()


if __name__ == "__main__":
    assert len(sys.argv) == 2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    extend_abstimmungen(db, master_data())
Exemple #8
0
def match_beitraege(db, master, prints):
    """Match every row returned by ``QUERY`` and write it to ``db['beitrag']``.

    For each row the result of ``match_beitrag`` is passed to
    ``ensure_rolle`` and stored on the row as ``fingerprint`` before the row
    is written back. A dot is printed to stdout on every 1000th row as a
    progress indicator.
    """
    key_columns = ('vorname', 'nachname', 'funktion', 'land', 'fraktion',
                   'ressort', 'ort')
    table = db['beitrag']
    for idx, row in enumerate(db.query(QUERY)):
        if not idx % 1000:
            sys.stdout.write('.')
            sys.stdout.flush()
        fingerprint = match_beitrag(db, master, row, prints)
        ensure_rolle(row, fingerprint, db)
        row['fingerprint'] = fingerprint
        # A fresh list per call, exactly as a list literal would provide.
        table.writerow(row, unique_columns=list(key_columns),
                       bufferlen=2000)
    table.flush()

def make_prints(db):
    """Return the non-empty fingerprints of ``db['person']`` as a list.

    Rows whose ``fingerprint`` is missing or falsy are left out; the order
    of ``distinct('fingerprint')`` is kept.
    """
    fingerprints = []
    for row in db['person'].distinct('fingerprint'):
        value = row.get('fingerprint')
        if value:
            fingerprints.append(value)
    return fingerprints


def match_persons(db, master):
    """Match beitraege first, then speakers, against the known fingerprints.

    The fingerprint list is built once via ``make_prints`` and shared by
    both matching passes.
    """
    known_prints = make_prints(db)
    match_beitraege(db, master, known_prints)
    match_speakers(db, master, known_prints)

if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    match_persons(db, master_data())
Exemple #9
0
import logging

import sqlaload as sl

from offenesparlament.core import etl_engine
from offenesparlament.core import master_data

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

def extend_ablaeufe(engine, master):
    """Write the ``class`` of every distinct ablauf ``typ`` to the table.

    The mapping from ``typ`` to ``class`` is taken from
    ``master['ablauf_typ']``. Each distinct ``typ`` of the ``ablauf`` table
    is upserted with its class; a ``typ`` that is not in the mapping gets
    ``None``.
    """
    log.info("Amending ablaeufe ...")
    table = sl.get_table(engine, 'ablauf')
    class_by_typ = dict(
        (entry.get('typ'), entry.get('class'))
        for entry in master['ablauf_typ']
    )
    for row in sl.distinct(engine, table, 'typ'):
        typ = row.get('typ')
        record = {'typ': typ, 'class': class_by_typ.get(typ)}
        sl.upsert(engine, table, record, unique=['typ'])

if __name__ == '__main__':
    # Script entry point: obtain the ETL engine, report which destination is
    # used, then amend the ablauf rows with freshly loaded master data.
    engine = etl_engine()
    print "DESTINATION", engine
    extend_ablaeufe(engine, master_data())
Exemple #10
0
                if speech_fp == med_fp(speech_idx+1):
                    # 2. curren matches, next also matches
                    # -> use current and increment
                    speech_idx += 1
                else:
                    # 1. current matches, next does not match
                    # -> use current
                    break
            else:
                if speech_fp == med_fp(speech_idx+1):
                    # 4. current does not match, next matches
                    # -> use next
                    speech_idx += 1
                    emit(speech, speech_idx)
                    break
                else:
                    # 3. current does not match, next does not match
                    # -> use current
                    emit(speech, speech_idx)
                    break
    
    SpeechMediathek.flush()


if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    extend_speeches(db, master_data())
    merge_speech(db, master_data(), 17, 121)
Exemple #11
0
    data = defaultdict(dict)
    q = db.query("SELECT DISTINCT subject, date FROM abstimmung;")
    for e in q:
        data[e['date']][e['subject']] = set(drucksachen(e['subject']))
    return dict(data.items())


def extend_beschluesse(db, master):
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(db)
    pprint(abstimmungen)
    Beschluss = db['beschluss']
    for data in Beschluss:
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks


if __name__ == '__main__':
    assert len(sys.argv)==2, "Need argument: webstore-url!"
    db, _ = WebStore(sys.argv[1])
    print "DESTINATION", db
    extend_beschluesse(db, master_data())