def func(url):
    # 'proc', 'indexer' and 'force' are expected to come from the enclosing scope
    from offenesparlament.core import etl_engine, db
    engine = etl_engine()
    proc['handler'](engine, indexer, url, force=force)
    engine.dispose()
    db.session.close()
Example #2
def index():
    engine = etl_engine()
    webtv_table = sl.get_table(engine, 'webtv')
    sessions = sl.distinct(engine, webtv_table,
        'wp', 'session', 'session_name')
    sessions = sorted(sessions, reverse=True)
    return render_template('backend/index.html',
        sessions=sessions)
Example #3
def load():
    """ Load and index staging DB into production """
    engine = etl_engine()
    from offenesparlament.load import load
    load.load(engine)
    load.aggregate()
    from offenesparlament.load import index
    index.index()
Example #4
def speechmatcher_alignment_post(wp, session):
    engine = etl_engine()
    table = sl.get_table(engine, 'alignments')
    data = dict(request.form.items())
    data['sequence'] = int(data['sequence'])
    data['wp'] = wp
    data['session'] = session
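    # upsert keyed on (wp, session, sequence): a repeated correction for the
    # same position replaces the earlier row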
    sl.upsert(engine, table, data, ['wp', 'session', 'sequence'])
    return speechmatcher_alignment_get(wp, session)
Example #5
def extract_base():
    """ Run the extract stage """
    engine = etl_engine()
    from offenesparlament.extract.xml import ausschuss
    ausschuss.load_index(engine)
    from offenesparlament.extract.xml import news
    #news.load_index(engine)
    from offenesparlament.extract.xml import mdb
    mdb.load_index(engine)
Example #6
def speechmatcher(wp, session):
    engine = etl_engine()
    speech_table = sl.get_table(engine, 'speech')
    speeches = sl.find(engine, speech_table, order_by='sequence', 
        wahlperiode=wp, sitzung=session, matched=True)
    webtv_table = sl.get_table(engine, 'webtv')
    agenda = sl.find(engine, webtv_table, wp=wp, session=session)
    agenda = list(agenda)
    return render_template('backend/speechmatcher.html',
            speeches=speeches, agenda=agenda, wp=wp, session=session)
Example #7
def update(force=False, threaded=False, preload=False):
    """ Update the entire database. """
    app.config["NOMENKLATURA_PRELOAD"] = not preload
    engine = etl_engine()
    indexer = get_indexer()
    try:
        for stage in [GREMIUM, PERSON, ABSTIMMUNG, ABLAUF, TRANSCRIPT]:
            process(engine, indexer, stage, force=force, threaded=threaded)
    finally:
        indexer.flush()
Example #8
def _stage(proc, url=None, force=False, threaded=False, preload=True):
    app.config['NOMENKLATURA_PRELOAD'] = preload
    indexer = get_indexer()
    try:
        if url is None:
            process(indexer, proc, force=force, threaded=threaded)
        else:
            engine = etl_engine()
            proc['handler'](engine, indexer, url, force=force)
    finally:
        indexer.flush()
Example #9
def _stage(proc, url=None, force=False, threaded=False, preload=True):
    app.config["NOMENKLATURA_PRELOAD"] = preload
    indexer = get_indexer()
    try:
        if url is None:
            process(indexer, proc, force=force, threaded=threaded)
        else:
            engine = etl_engine()
            proc["handler"](engine, indexer, url, force=force)
    finally:
        indexer.flush()
Example #10
def speechmatcher_alignment_get(wp, session):
    engine = etl_engine()
    score, alignment = get_alignment(engine, wp, session)
    align_data = {}
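    # a sequence counts as matched when transcript and agenda fingerprints agree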
    for align in alignment:
        seq = align.pop('sequence')
        align['matched'] = align['transcript_fp'] == align['agenda_fp']
        del align['transcript_fp']
        align_data[seq] = align
    return jsonify({
        'score': score,
        'alignment': align_data
        })
Example #11
def transform():
    """ Transform and clean up content """
    engine = etl_engine()
    from offenesparlament.transform import persons
    persons.generate_person_long_names(engine)
    from offenesparlament.transform import positions
    positions.extend_positions(engine)
    from offenesparlament.transform import namematch
    namematch.match_persons(engine)
    from offenesparlament.transform import abstimmungen
    abstimmungen.extend_abstimmungen(engine)
    persons.generate_person_long_names(engine)
    from offenesparlament.transform import speechparser
    speechparser.load_transcripts(engine)
    from offenesparlament.transform import webtv
    webtv.merge_speeches(engine)
    from offenesparlament.transform import awatch
    awatch.load_profiles(engine)
    from offenesparlament.transform import speechmatch
    persons.generate_person_long_names(engine)
    speechmatch.extend_speeches(engine)
    persons.generate_person_long_names(engine)
    from offenesparlament.transform import drs
    drs.merge_speeches(engine)
Example #12
def cache_abstimmungen(engine):
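    # build {date: {subject: set of Drucksache references}} over all distinct votes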
    Abstimmung = sl.get_table(engine, 'abstimmung')
    data = defaultdict(dict)
    for e in sl.distinct(engine, Abstimmung, 'subject', 'date'):
        data[e['date']][e['subject']] = set(drucksachen(e['subject']))
    return dict(data.items())


def extend_beschluesse(engine):
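    # match each Beschluss to same-day Abstimmungen that cite at least one common Drucksache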
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(engine)
    #pprint(abstimmungen)
    Beschluss = sl.get_table(engine, 'beschluss')
    for data in sl.find(engine, Beschluss):
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks


if __name__ == '__main__':
    engine = etl_engine()
    print "DESTINATION", engine
    extend_beschluesse(engine)
Example #13
def download_docs():
    """ Download all PDFs from DIP. """
    engine = etl_engine()
    from offenesparlament.extract import documents
    documents.load_documents(engine)
Example #14
def extract_votes():
    """ Run the extract stage """
    engine = etl_engine()
    from offenesparlament.extract import abstimmungen
    abstimmungen.load_index(engine)
Example #15
    # excerpt from load_transcript(engine, wp, session, ...): the earlier lines
    # that open the transcript as 'sio' and set up 'seq', 'url' and the
    # 'Speech' table are not shown
    parser = SpeechParser(engine, sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        contrib['matched'] = True
        sl.upsert(engine, Speech, contrib, 
                  unique=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
    if parser.missing_recon:
        sl.upsert(engine, Speech, {
                    'matched': False,
                    'sitzung': session,
                    'wahlperiode': wp
            }, unique=['sitzung', 'wahlperiode'])

    return True

def load_transcripts(engine, incremental=True):
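    # session numbers start at 33; stop once a session above 180 fails to load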
    for i in count(33):
        if not load_transcript(engine, 17, i, incremental=incremental) and i > 180:
            break

if __name__ == '__main__':
    engine = etl_engine()
    print "DESTINATION", engine
    load_transcripts(engine)
Example #16
def download_docs():
    """ Download all PDFs from DIP. """
    engine = etl_engine()
    from offenesparlament.extract import documents

    documents.load_documents(engine)
Example #17
def longextract():
    """ Run the extract stage, including long-running tasks """
    engine = etl_engine()
    from offenesparlament.extract import wahlkreise
    wahlkreise.load_wahlkreise(engine)
Example #18
def extract_media():
    """ Run the extract stage """
    engine = etl_engine()
    from offenesparlament.extract import webtv
    webtv.load_sessions(engine)
Example #19
def devtf():
    """ Transform and clean up content (dev bits) """
    engine = etl_engine()
    from offenesparlament.transform import drs
    drs.merge(engine)
Example #20
def extract_docs():
    """ Run the extract stage """
    engine = etl_engine()
    from offenesparlament.extract import dip
    dip.load_dip(engine)