Example #1
def load_debatten(engine, indexer, sitzung):
    WebTV_Speech = sl.get_table(engine, 'webtv_speech')
    zitate = list(sl.find(engine, WebTV_Speech, wp=str(sitzung.wahlperiode),
        session=str(sitzung.nummer)))
    debatten = dict([(z['item_id'], z) for z in zitate])
    speeches = list(sl.find(engine, sl.get_table(engine, 'speech'),
        wahlperiode=int(sitzung.wahlperiode), sitzung=int(sitzung.nummer)))
    for i, data in debatten.items():
        log.info("Loading  -> Debatte: %s..." % data.get('item_label'))
        debatte = Debatte.query.filter_by(
                sitzung=sitzung,
                nummer=data.get('item_id')
                ).first()
        if debatte is None:
            debatte = Debatte()
        debatte.sitzung = sitzung
        debatte.nummer = data.get('item_id')
        debatte.tops = data.get('item_key')
        debatte.titel = data.get('item_label')
        debatte.text = data.get('item_description')

        db.session.add(debatte)
        db.session.flush()
        indexer.add(debatte)
        
        dzitate = filter(lambda z: z['item_id'] == data['item_id'], zitate)
        reden = load_reden(engine, indexer, debatte, dzitate)
        load_zitate(engine, indexer, debatte, dzitate, speeches, reden)
        db.session.commit()
        indexer.add_many(reden.values())
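The loaders in these examples follow a common get-or-create idiom: read raw rows via the sl helpers, look up the matching ORM object, create it if absent, overwrite its fields, then add/flush/commit the session. Below is a minimal, self-contained sketch of that idiom using plain SQLAlchemy and a hypothetical Item model; the Debatte, db and indexer objects above belong to the surrounding application and are not reproduced here.

# --- illustrative sketch only: a toy Item model standing in for Debatte/Ablauf etc. ---
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base   # SQLAlchemy 1.4+

Base = declarative_base()

class Item(Base):
    __tablename__ = 'item'
    id = Column(Integer, primary_key=True)
    nummer = Column(String, unique=True)
    titel = Column(String)

def load_item(session, data):
    # get-or-create, then overwrite the fields from the source dict
    item = session.query(Item).filter_by(nummer=data.get('nummer')).first()
    if item is None:
        item = Item()
    item.nummer = data.get('nummer')
    item.titel = data.get('titel')
    session.add(item)
    session.flush()      # assigns a primary key without ending the transaction
    return item

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
session = Session(engine)
load_item(session, {'nummer': '1', 'titel': 'Tagesordnungspunkt 1'})
session.commit()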
Example #2
def load_debatten(engine, sitzung):
    WebTV_Speech = sl.get_table(engine, 'webtv_speech')
    zitate = list(sl.find(engine, WebTV_Speech, wp=str(sitzung.wahlperiode),
        session=str(sitzung.nummer)))
    debatten = dict([(z['item_id'], z) for z in zitate])
    speeches = list(sl.find(engine, sl.get_table(engine, 'speech'),
        wahlperiode=int(sitzung.wahlperiode), sitzung=int(sitzung.nummer)))
    for i, data in debatten.items():
        log.info("Loading  -> Debatte: %s..." % data.get('item_label'))
        debatte = Debatte.query.filter_by(
                sitzung=sitzung,
                nummer=data.get('item_id')
                ).first()
        if debatte is None:
            debatte = Debatte()
        debatte.sitzung = sitzung
        debatte.nummer = data.get('item_id')
        debatte.tops = data.get('item_key')
        debatte.titel = data.get('item_label')
        debatte.text = data.get('item_description')

        db.session.add(debatte)
        db.session.flush()

        dzitate = filter(lambda z: z['item_id'] == data['item_id'], zitate)
        load_zitate(engine, debatte, dzitate, speeches)
        db.session.commit()
Example #3
def load_ablauf(engine, indexer, data):
    ablauf = Ablauf.query.filter_by(source_url=data.get('source_url')).first()
    if ablauf is None:
        ablauf = Ablauf()

    ablauf.key = data.get('key')
    ablauf.source_url = data.get('source_url')
    ablauf.wahlperiode = data.get('wahlperiode')
    ablauf.typ = data.get('typ')
    ablauf.klasse = data.get('class')
    ablauf.titel = data.get('titel')
    if not len(ablauf.titel):
        log.error("No titel!")
        return

    ablauf.initiative = data.get('initiative')
    ablauf.stand = data.get('stand')
    ablauf.signatur = data.get('signatur')
    ablauf.gesta_id = data.get('gesta_id')
    ablauf.eu_dok_nr = data.get('eu_dok_nr')
    ablauf.eur_lex_url = data.get('eur_lex_url')
    ablauf.eur_lex_pdf = data.get('eur_lex_pdf')
    ablauf.consilium_url = data.get('consilium_url')
    ablauf.abstrakt = data.get('abstrakt')
    ablauf.zustimmungsbeduerftig = data.get('zustimmungsbeduerftig')
    ablauf.sachgebiet = data.get('sachgebiet')
    ablauf.abgeschlossen = True if str(data.get('abgeschlossen')) \
            == 'True' else False
    db.session.add(ablauf)
    db.session.flush()

    worte = []
    _Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in sl.find(engine, _Schlagwort, source_url=ablauf.source_url):
        wort = Schlagwort()
        wort.name = sw['wort']
        db.session.add(wort)
        worte.append(wort)
    ablauf.schlagworte = worte

    _Referenz = sl.get_table(engine, 'referenz')
    for ddata in sl.find(engine, _Referenz, source_url=ablauf.source_url):
        dokument = load_dokument(engine, indexer, ddata)
        referenz = Referenz.query.filter_by(
                dokument=dokument,
                seiten=ddata.get('seiten'),
                ).filter(Referenz.ablaeufe.any(id=ablauf.id)).first()
        if referenz is None:
            referenz = Referenz()
            referenz.ablaeufe.append(ablauf)
            referenz.dokument = dokument
        referenz.seiten = ddata.get('seiten')
        referenz.text = ddata.get('text')

    _Position = sl.get_table(engine, 'position')
    for position in sl.find(engine, _Position, source_url=ablauf.source_url):
        load_position(engine, indexer, ablauf, position)

    db.session.commit()
    indexer.add(ablauf)
Example #4
def load_ablauf(engine, indexer, data):
    ablauf = Ablauf.query.filter_by(source_url=data.get('source_url')).first()
    if ablauf is None:
        ablauf = Ablauf()

    ablauf.key = data.get('key')
    ablauf.source_url = data.get('source_url')
    ablauf.wahlperiode = data.get('wahlperiode')
    ablauf.typ = data.get('typ')
    ablauf.klasse = data.get('class')
    ablauf.titel = data.get('titel')
    if not len(ablauf.titel):
        log.error("No titel!")
        return

    ablauf.initiative = data.get('initiative')
    ablauf.stand = data.get('stand')
    ablauf.signatur = data.get('signatur')
    ablauf.gesta_id = data.get('gesta_id')
    ablauf.eu_dok_nr = data.get('eu_dok_nr')
    ablauf.eur_lex_url = data.get('eur_lex_url')
    ablauf.eur_lex_pdf = data.get('eur_lex_pdf')
    ablauf.consilium_url = data.get('consilium_url')
    ablauf.abstrakt = data.get('abstrakt')
    ablauf.zustimmungsbeduerftig = data.get('zustimmungsbeduerftig')
    ablauf.sachgebiet = data.get('sachgebiet')
    ablauf.abgeschlossen = True if str(data.get('abgeschlossen')) \
            == 'True' else False
    db.session.add(ablauf)
    db.session.flush()

    worte = []
    _Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in sl.find(engine, _Schlagwort, source_url=ablauf.source_url):
        wort = Schlagwort()
        wort.name = sw['wort']
        db.session.add(wort)
        worte.append(wort)
    ablauf.schlagworte = worte

    _Referenz = sl.get_table(engine, 'referenz')
    for ddata in sl.find(engine, _Referenz, source_url=ablauf.source_url):
        dokument = load_dokument(engine, indexer, ddata)
        referenz = Referenz.query.filter_by(
            dokument=dokument,
            seiten=ddata.get('seiten'),
        ).filter(Referenz.ablaeufe.any(id=ablauf.id)).first()
        if referenz is None:
            referenz = Referenz()
            referenz.ablaeufe.append(ablauf)
            referenz.dokument = dokument
        referenz.seiten = ddata.get('seiten')
        referenz.text = ddata.get('text')

    _Position = sl.get_table(engine, 'position')
    for position in sl.find(engine, _Position, source_url=ablauf.source_url):
        load_position(engine, indexer, ablauf, position)

    db.session.commit()
    indexer.add(ablauf)
Example #5
def speechmatcher(wp, session):
    engine = etl_engine()
    speech_table = sl.get_table(engine, 'speech')
    speeches = sl.find(engine, speech_table, order_by='sequence', 
        wahlperiode=wp, sitzung=session, matched=True)
    webtv_table = sl.get_table(engine, 'webtv')
    agenda = sl.find(engine, webtv_table, wp=wp, session=session)
    agenda = list(agenda)
    return render_template('backend/speechmatcher.html',
            speeches=speeches, agenda=agenda, wp=wp, session=session)
Example #6
def load_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    stimmen = list(sl.find(engine, table, source_url=source_url, matched=True))
    if not len(stimmen):
        log.error("No reconciled votes, signals deeper trouble?")
        return
    thema = stimmen[0].get('subject')
    abst = Abstimmung.query.filter_by(thema=thema).first()
    if abst is None:
        abst = Abstimmung()
        abst.thema = thema
        abst.datum = to_date(stimmen[0].get('date'))
    db.session.add(abst)
    db.session.flush()
    for stimme_ in stimmen:
        person = Person.query.filter_by(
            fingerprint=stimme_.get('fingerprint')).first()
        if person is None:
            continue
        stimme = Stimme.query.filter_by(abstimmung=abst).filter_by(
            person=person).first()
        if stimme is not None:
            continue
        stimme = Stimme()
        stimme.entscheidung = stimme_['vote']
        stimme.person = person
        stimme.abstimmung = abst
        db.session.add(stimme)
    db.session.commit()
Example #7
def extend_positions(engine):
    log.info("Amending positions ...")
    Position = sl.get_table(engine, 'position')
    for i, data in enumerate(sl.find(engine, Position)):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        dt, rest = data['fundstelle'].split("-", 1)
        data['date'] = datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat()
        if ',' in data['urheber']:
            typ, quelle = data['urheber'].split(',', 1)
            data['quelle'] = re.sub("^.*Urheber.*:", "", quelle).strip()
            data['typ'] = typ.strip()
        else:
            data['typ'] = data['urheber']

        br = 'Bundesregierung, '
        if data['urheber'].startswith(br):
            data['urheber'] = data['urheber'][len(br):]

        data['fundstelle_doc'] = None
        if data['fundstelle_url'] and \
                'btp' in data['fundstelle_url']:
            data['fundstelle_doc'] = data['fundstelle_url']\
                    .rsplit('#',1)[0]

        hash = sha1(data['fundstelle'].encode('utf-8') \
                + data['urheber'].encode('utf-8') + \
                data['ablauf_id'].encode('utf-8')).hexdigest()
        data['hash'] = hash[:10]
        sl.upsert(engine, Position, data, unique=UNIQUE)
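Several of the examples amend rows in place: iterate with sl.find, modify the row dict, and write it back with sl.upsert keyed on a unique column set. A minimal sketch of that round trip follows, assuming the sl helpers used throughout come from the sqlaload package and create tables and columns on demand; the table and column names here are made up for illustration.

import sqlaload as sl                      # assumption: 'sl' is the sqlaload package
from sqlalchemy import create_engine

engine = create_engine('sqlite://')
table = sl.get_table(engine, 'position_demo')   # hypothetical table name

# the first upsert inserts the row, the second (same unique key) updates it in place
sl.upsert(engine, table, {'ablauf_id': '17/1', 'hash': 'abc', 'typ': 'Antrag'},
          unique=['ablauf_id', 'hash'])
sl.upsert(engine, table, {'ablauf_id': '17/1', 'hash': 'abc', 'typ': 'Gesetzentwurf'},
          unique=['ablauf_id', 'hash'])

# rows come back from sl.find as plain dicts
rows = list(sl.find(engine, table, ablauf_id='17/1'))
assert len(rows) == 1 and rows[0]['typ'] == 'Gesetzentwurf'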
Example #8
def combine(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        combine_resource(engine, source_table, row, force, stats)
    log.info('Combine summary: \n%s' % stats.report())
Example #9
def generate_person_long_names(engine):
    log.info("Generating person fingerprints and slugs...")
    from offenesparlament.transform.namematch import match_speaker
    nkp = nk_persons()
    Person = sl.get_table(engine, 'person')
    for person in sl.find(engine, Person):
        long_name = make_long_name(person)
        try:
            long_name = match_speaker(long_name)
        except NKNoMatch:
            pass
        log.info(" -> %s" % long_name.strip())
        slug = url_slug(long_name)
        sl.upsert(engine, Person, {
                         'fingerprint': long_name,
                         'slug': slug,
                         'id': person['id']},
                         unique=['id'])
        tries = 0
        while True:
            try:
                nkp.ensure_value(long_name, data=person)
            except ValueError, E:
                log.warn('Exception: %s' % str(E))
                tries = tries + 1
                if tries > 5:
                    raise
            else:
                break
Example #10
def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, 'rolle')
    for rdata in sl.find(engine, _RolleSource,
                         fingerprint=data['fingerprint']):
        rolle = Rolle.query.filter_by(person=person,
                                      funktion=rdata.get('funktion'),
                                      ressort=rdata.get('ressort'),
                                      fraktion=rdata.get('fraktion'),
                                      land=rdata.get('land')).first()
        if rolle is None:
            rolle = Rolle()

        rolle.person = person
        rolle.mdb_id = rdata.get('mdb_id')
        rolle.status = rdata.get('status')
        rolle.funktion = rdata.get('funktion')
        rolle.fraktion = rdata.get('fraktion')
        rolle.gewaehlt = rdata.get('gewaehlt')
        rolle.ressort = rdata.get('ressort')
        rolle.land = rdata.get('land')
        rolle.austritt = to_date(rdata.get('austritt'))

        if rdata.get('mdb_id'):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
        db.session.add(rolle)
Example #11
def get_agenda(engine, wp, session):
    return list(
        sl.find(engine,
                sl.get_table(engine, 'webtv'),
                wp=wp,
                session=session,
                order_by='speech_id'))
Example #12
def merge_speeches(engine):
    # desired result: (position_id, debatte_id)
    referenzen = referenzen_index(engine)
    items = item_index(engine)
    
    log.info("Finding best matches.... ")
    matches = {}
    for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
        for (iwp, isession, item_id), idrs in items.items():
            if iwp != rwp or rsession != isession:
                continue
            ints = len(idrs.intersection(rdrs))
            if ints == 0:
                continue
            k = (ablauf_id, rwp, rsession)
            if k in matches and matches[k][1] > ints:
                continue
            matches[k] = (item_id, ints)

    log.info("Saving position associations....")
    pos_tbl = sl.get_table(engine, 'position')
    for (ablauf_id, wp, session), (item_id, n) in matches.items():
        for pos in sl.find(engine, pos_tbl, ablauf_id="%s/%s" % (wp, ablauf_id)):
            if not pos['fundstelle_url']:
                continue
            if 'btp/%s/%s%03d.pdf' % (wp, wp, int(session)) in pos['fundstelle_url']:
                d = {'ablauf_id': pos['ablauf_id'], 
                     'hash': pos['hash'],
                     'debatte_wp': wp,
                     'debatte_session': session,
                     'debatte_item_id': item_id}
                sl.upsert(engine, pos_tbl, d, unique=['ablauf_id', 'hash'])
Example #13
def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, "rolle")
    for rdata in sl.find(engine, _RolleSource, fingerprint=data["fingerprint"]):
        rolle = Rolle.query.filter_by(
            person=person,
            funktion=rdata.get("funktion"),
            ressort=rdata.get("ressort"),
            fraktion=rdata.get("fraktion"),
            land=rdata.get("land"),
        ).first()
        if rolle is None:
            rolle = Rolle()

        rolle.person = person
        rolle.mdb_id = rdata.get("mdb_id")
        rolle.status = rdata.get("status")
        rolle.funktion = rdata.get("funktion")
        rolle.fraktion = rdata.get("fraktion")
        rolle.gewaehlt = rdata.get("gewaehlt")
        rolle.ressort = rdata.get("ressort")
        rolle.land = rdata.get("land")
        rolle.austritt = to_date(rdata.get("austritt"))

        if rdata.get("mdb_id"):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
        db.session.add(rolle)
Example #14
def combine(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        combine_resource(engine, source_table, row, force, stats)
    log.info('Combine summary: \n%s' % stats.report())
Example #15
def load_abstimmungen(engine):
    _Abstimmung = sl.get_table(engine, 'abstimmung')
    i = 0
    for row in sl.distinct(engine, _Abstimmung, 'subject', 'date'):
        thema = row.get('subject')
        abst = Abstimmung.query.filter_by(thema=thema).first()
        if abst is None:
            abst = Abstimmung()
            abst.thema = thema
            abst.datum = date(row.get('date'))
        db.session.add(abst)
        for stimme_ in sl.find(engine, _Abstimmung, subject=thema,
            matched=True):
            if i % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            i += 1
            person = Person.query.filter_by(
                fingerprint=stimme_.get('fingerprint')).first()
            if person is None:
                continue
            stimme = Stimme.query.filter_by(
                abstimmung=abst).filter_by(
                person=person).first()
            if stimme is not None:
                continue
            stimme = Stimme()
            stimme.entscheidung = stimme_['vote']
            stimme.person = person
            stimme.abstimmung = abst
            db.session.add(stimme)
        db.session.commit()
Example #16
def extend_speeches(engine, wahlperiode=17):
    log.info("Amending speeches with DRS ...")
    drs_match = re.compile(DRS_MATCH % (wahlperiode, wahlperiode))
    Speech = sl.get_table(engine, 'speech')
    SpeechDocument = sl.get_table(engine, 'speech_document')
    for i, data in enumerate(sl.find(engine, Speech)):
        if data.get('type') != 'chair':
            continue
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        m = drs_match.search(data.get('text'))
        if m is None:
            continue
        for i, grp in enumerate(m.groups()):
            if grp and '/' in grp:
                wp, nummer = grp.split('/', 1)
                sl.upsert(
                    engine,
                    SpeechDocument, {
                        'group': i,
                        'sequence': data['sequence'],
                        'sitzung': data['sitzung'],
                        'wahlperiode': wahlperiode,
                        'dok_nummer': nummer
                    },
                    unique=['sequence', 'sitzung', 'wahlperiode', 'group'])
Example #17
def load_rollen(engine, person, data):
    _RolleSource = sl.get_table(engine, 'rolle')
    mdb_rolle = None
    for rdata in sl.find(engine, _RolleSource, fingerprint=data['fingerprint']):
        rolle = Rolle.query.filter_by(
                person=person,
                funktion=rdata.get('funktion'),
                ressort=rdata.get('ressort'),
                fraktion=rdata.get('fraktion'),
                land=rdata.get('land')).first()
        if rolle is None:
            rolle = Rolle()

        rolle.person = person
        rolle.mdb_id = rdata.get('mdb_id')
        rolle.status = rdata.get('status')
        rolle.funktion = rdata.get('funktion')
        rolle.fraktion = rdata.get('fraktion')
        rolle.gewaehlt = rdata.get('gewaehlt')
        rolle.ressort = rdata.get('ressort')
        rolle.land = rdata.get('land')
        rolle.austritt = date(rdata.get('austritt'))

        if rdata.get('mdb_id'):
            rolle.wahlkreis = load_wahlkreis(engine, rolle, data)
            mdb_rolle = rolle
        db.session.add(rolle)
    return mdb_rolle
Example #18
def validate_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    issue_noted_for_this_resource = False  # record first failure only
    error_message = None
    try:
        records = 0
        for row_ in data:
            if data_row_filter and data_row_filter != row_['row_id']:
                continue
            result = {'id': row_['id'], 'valid': True}
            result['signature'] = generate_signature(row_)

            if row_['DateFormatted'] is None:
                stats_spending['date'].add_spending('Date invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(
                        engine, row['resource_id'], row['retrieve_hash'],
                        STAGE,
                        'Date invalid (blank, inconsistent or unrecognised format)',
                        {
                            'row_id': row_.get('row_id'),
                            'row_number': row_.get('row_number'),
                            'Date': row_.get('Date')
                        })
                    error_message = 'Date invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['date'].add_spending('Date ok', row_)

            if row_['AmountFormatted'] is None:
                stats_spending['amount'].add_spending('Amount invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(
                        engine, row['resource_id'], row['retrieve_hash'],
                        STAGE, 'Amount invalid', {
                            'row_id': row_.get('row_id'),
                            'row_number': row_.get('row_number'),
                            'Amount': row_.get('Amount')
                        })
                    error_message = 'Amount invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['amount'].add_spending('Amount ok', row_)

            if result['valid']:
                records += 1
            sl.update(connection, spending_table, {'id': result['id']}, result)
        trans.commit()
        return records > 0, error_message
    finally:
        connection.close()
Example #19
def load_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    stimmen = list(sl.find(engine, table, source_url=source_url,
        matched=True))
    if not len(stimmen):
        log.error("No reconciled votes, signals deeper trouble?")
        return
    thema = stimmen[0].get('subject')
    abst = Abstimmung.query.filter_by(thema=thema).first()
    if abst is None:
        abst = Abstimmung()
        abst.thema = thema
        abst.datum = to_date(stimmen[0].get('date'))
    db.session.add(abst)
    db.session.flush()
    for stimme_ in stimmen:
        person = Person.query.filter_by(
            fingerprint=stimme_.get('fingerprint')).first()
        if person is None:
            continue
        stimme = Stimme.query.filter_by(
            abstimmung=abst).filter_by(
            person=person).first()
        if stimme is not None:
            continue
        stimme = Stimme()
        stimme.entscheidung = stimme_['vote']
        stimme.person = person
        stimme.abstimmung = abst
        db.session.add(stimme)
    db.session.commit()
Example #20
def cleanup_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        if None in date_formats.values():
            log.warn("Couldn't detect date formats: %r", date_formats)
            issue(engine, row['resource_id'], row['retrieve_hash'],
                  "Couldn't detect date formats", repr(date_formats))
            return False

        sl.delete(connection,
                  spending_table,
                  resource_id=row['resource_id'],
                  sheet_id=sheet_id)
        for row in data:
            row = cleanup_dates.apply(row, date_formats)
            row = cleanup_numbers.apply(row)
            row = cleanup_gov.apply(row)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True
    finally:
        connection.close()
Example #21
def load(engine, grano):
    for rep in sl.find(engine, sl.get_table(engine, 'representative')):
        del rep['id']
        rep_ent = canonical_actor(grano, engine, rep['originalName'])
        if 'id' in rep_ent:
            rep_ent = grano.getEntity(rep_ent['id'], deep=True)
        #if not SETTINGS.FULL and rep_ent['etlId'] == rep['etlId']:
        #    continue
        rep_ent.update(rep)
        rep_ent['actsAsRepresentative'] = True
        rep_ent['staffMembers'] = int(float(rep['members']))
        rep_ent['incoming'] = rep_ent.get('incoming', [])
        rep_ent['outgoing'] = rep_ent.get('outgoing', [])
        rep_ent['contactCountry'] = rep_ent['contactCountryNorm']
        rep_ent = load_clients(grano, engine, rep_ent)
        rep_ent = load_organisations(grano, engine, rep_ent)
        rep_ent = load_networking(grano, engine, rep_ent)
        rep_ent = load_persons(grano, engine, rep_ent)
        rep_ent = load_interests(grano, engine, rep_ent)
        rep_ent = load_action_fields(grano, engine, rep_ent)
        rep_ent = get_financial_data(engine, rep_ent)
        # TODO: other financial sources
        #from pprint import pprint
        #pprint(rep_ent)
        grano.updateEntity(rep_ent)
Example #22
def extract_some(force=False, filter=None):
    # kwargs = resource_id=x, package_name=y, publisher_title=z
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        extract_resource(engine, source_table, row, force, stats)
    log.info('Extract summary: \n%s' % stats.report())
Example #23
def extract_some(force=False, filter=None):
    # kwargs = resource_id=x, package_name=y, publisher_title=z
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        extract_resource(engine, source_table, row, force, stats)
    log.info('Extract summary: \n%s' % stats.report())
Example #24
def retrieve_some(force=False, filter=None):
    stats = OpenSpendingStats()
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        retrieve(row, engine, source_table, force, stats)
    print 'Retrieve summary:'
    print stats.report()
Example #25
def load_ablaeufe(engine):
    _Ablauf = sl.get_table(engine, 'ablauf')

    for i, data in enumerate(sl.find(engine, _Ablauf, wahlperiode=str(17))):
        log.info("Loading Ablauf: %s..." % data['titel'])
        load_ablauf(engine, data)
        if i % 500 == 0:
            db.session.commit()
    db.session.commit()
Example #26
def cleanup(force=False, resource_filter=None, data_row_filter=None):
    stats = OpenSpendingStats()
    stats_spending = defaultdict(OpenSpendingStats)
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(resource_filter or {})):
        cleanup_resource(engine, source_table, row, force, data_row_filter, stats, stats_spending)
    log.info('Cleanup summary: \n%s' % stats.report())
    for key in stats_spending:
        log.info('Cleanup %s: \n%s' % (key, stats_spending[key].report()))
Example #27
def load_ap(ap, engine):
    orgs = list(sl.find(engine, sl.get_table(engine, 'representative'),
                   identificationCode=ap['orgIdentificationCode']))
    if len(orgs):
        org = max(orgs, key=lambda o: o['lastUpdateDate'])
        childBase = {'representativeEtlId': org['etlId'],
                     'representativeUpdateDate': org['lastUpdateDate']}
        load_person(ap, 'accredited', childBase, engine)
    else:
        print ap
Example #28
def get_transcript(engine, wp, session):
    speeches = []
    for speech in sl.find(
        engine, sl.get_table(engine, "speech"), order_by="sequence", wahlperiode=wp, sitzung=session, matched=True
    ):
        if speech["type"] == "poi":
            continue
        seg = (speech["sequence"], speech["fingerprint"])
        speeches.append(seg)
    return speeches
Example #29
def retrieve_some(force=False, **filters):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    result_counts = defaultdict(int)
    for row in sl.find(engine, source_table, **filters):
        result = retrieve(row, engine, source_table, force)
        result_counts['total'] += 1
        result_counts[result] += 1
    log.info('Total %i URLs', result_counts.pop('total'))
    for result, count in result_counts.items():
        log.info('  %i %s', count, result)
Example #30
def cleanup(force=False, resource_filter=None, data_row_filter=None):
    stats = OpenSpendingStats()
    stats_spending = defaultdict(OpenSpendingStats)
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(resource_filter or {})):
        cleanup_resource(engine, source_table, row, force, data_row_filter,
                         stats, stats_spending)
    log.info('Cleanup summary: \n%s' % stats.report())
    for key in stats_spending:
        log.info('Cleanup %s: \n%s' % (key, stats_spending[key].report()))
Example #31
def retrieve_some(force=False, **filters):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    result_counts = defaultdict(int)
    for row in sl.find(engine, source_table, **filters):
        result = retrieve(row, engine, source_table, force)
        result_counts['total'] += 1
        result_counts[result] += 1
    log.info('Total %i URLs', result_counts.pop('total'))
    for result, count in result_counts.items():
        log.info('  %i %s', count, result)
Example #32
def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)
    tags = ['+tags:"%s"' % t for t in TAGS]
    q = " OR ".join(tags)
    publisher_dict_filter = {}
    if publisher_name:
        publisher_solr_filter = 'publisher:"%s"' % publisher_name
        q = '(%s) AND (%s)' % (q, publisher_solr_filter)
        publisher_dict_filter = {'publisher_name': publisher_name}
    log.info('SOLR Search q: %r', q)

    existing_packages = set([
        res['package_name'] for res in sl.distinct(
            engine, table, 'package_name', **publisher_dict_filter)
    ])
    log.info('Existing datasets: %i', len(existing_packages))
    processed_packages = set()
    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q, search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])
    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table,
                                      stats_resources)
        if num_resources == 0:
            stats.add('Dataset has no resources', package_name)
        else:
            stats.add('Dataset has resources', package_name)
    # Remove rows about deleted packages
    obsolete_packages = existing_packages - processed_packages
    log.info('Obsolete datasets: %s from %s', len(obsolete_packages),
             len(existing_packages))
    for package_name in obsolete_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)
    # Remove stray rows without package_name
    stray_rows = list(sl.find(engine, table, package_name=None))
    if stray_rows:
        log.info('Stray rows without package_name: %i', len(stray_rows))
        sl.delete(engine, table, package_name=None)
        sl.delete(engine, 'issue', package_name=None)
        for row in stray_rows:
            stats.add('Stray row removed', row['resource_id'])
    print 'Datasets build_index summary:'
    print stats.report()
    print 'Resources build_index summary:'
    print stats_resources.report()
Example #33
def validate(force=False, filter=None, data_row_filter=None):
    stats = OpenSpendingStats()
    stats_spending = {'date': OpenSpendingStats(),
                      'amount': OpenSpendingStats()}
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        validate_resource(engine, source_table, row, force, data_row_filter, stats, stats_spending)
    log.info('Validate summary: \n%s' % stats.report())
    for stat_type in stats_spending:
        log.info('Validate %s: \n%s' % (stat_type, stats_spending[stat_type].report()))
Example #34
def build_index(publisher_name=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()
    client = ckan_client()
    log.info('CKAN: %s', client.base_location)
    tags = ['+tags:"%s"' % t for t in TAGS]
    q = " OR ".join(tags)
    publisher_dict_filter = {}
    if publisher_name:
        publisher_solr_filter = 'publisher:"%s"' % publisher_name
        q = '(%s) AND (%s)' % (q, publisher_solr_filter)
        publisher_dict_filter = {'publisher_name': publisher_name}
    log.info('SOLR Search q: %r', q)

    existing_packages = set(
            [res['package_name']
             for res in sl.distinct(engine, table, 'package_name', **publisher_dict_filter)])
    log.info('Existing datasets: %i', len(existing_packages))
    processed_packages = set()
    log.info('Doing package search for: "%s"', q)
    res = client.package_search(q,
            search_options={'limit': 2000})
    log.info('Search returned %i dataset results', res['count'])
    stats = OpenSpendingStats()
    stats_resources = OpenSpendingStats()
    for package_name in res['results']:
        processed_packages.add(package_name)
        num_resources = fetch_package(client, package_name, engine, table, stats_resources)
        if num_resources == 0:
            stats.add('Dataset has no resources', package_name)
        else:
            stats.add('Dataset has resources', package_name)
    # Remove rows about deleted packages
    obsolete_packages = existing_packages - processed_packages
    log.info('Obsolete datasets: %s from %s',
             len(obsolete_packages), len(existing_packages))
    for package_name in obsolete_packages:
        sl.delete(engine, table, package_name=package_name)
        sl.delete(engine, 'issue', package_name=package_name)
        stats.add('Removed obsolete dataset', package_name)
    # Remove stray rows without package_name
    stray_rows = list(sl.find(engine, table, package_name=None))
    if stray_rows:
        log.info('Stray rows without package_name: %i',
                 len(stray_rows))
        sl.delete(engine, table, package_name=None)
        sl.delete(engine, 'issue', package_name=None)
        for row in stray_rows:
            stats.add('Stray row removed', row['resource_id'])
    print 'Datasets build_index summary:'
    print stats.report()
    print 'Resources build_index summary:'
    print stats_resources.report()
Example #35
def validate_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(sl.find(engine, spending_table,
            resource_id=row['resource_id'],
            sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    issue_noted_for_this_resource = False # record first failure only
    error_message = None
    try:
        records = 0
        for row_ in data:
            if data_row_filter and data_row_filter != row_['row_id']:
                continue
            result = {'id': row_['id'], 'valid': True}
            result['signature'] = generate_signature(row_)

            if row_['DateFormatted'] is None:
                stats_spending['date'].add_spending('Date invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                          'Date invalid (blank, inconsistent or unrecognised format)',
                          {'row_id': row_.get('row_id'),
                           'row_number': row_.get('row_number'),
                           'Date': row_.get('Date')})
                    error_message = 'Date invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['date'].add_spending('Date ok', row_)

            if row_['AmountFormatted'] is None:
                stats_spending['amount'].add_spending('Amount invalid', row_)
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                          'Amount invalid', {'row_id': row_.get('row_id'),
                                             'row_number': row_.get('row_number'),
                                             'Amount': row_.get('Amount')})
                    error_message = 'Amount invalid'
                    issue_noted_for_this_resource = True
            else:
                stats_spending['amount'].add_spending('Amount ok', row_)

            if result['valid']:
                records += 1
            sl.update(connection, spending_table,
                      {'id': result['id']}, result)
        trans.commit()
        return records > 0, error_message
    finally:
        connection.close()
Example #36
def get_transcript(engine, wp, session):
    speeches = []
    for speech in sl.find(engine,
                          sl.get_table(engine, 'speech'),
                          order_by='sequence',
                          wahlperiode=wp,
                          sitzung=session,
                          matched=True):
        if speech['type'] == 'poi':
            continue
        seg = (speech['sequence'], speech['fingerprint'])
        speeches.append(seg)
    return speeches
Example #37
def resolve_stimmen(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    for data in sl.find(engine, table, source_url=source_url):
        try:
            fp = resolve_person(data['person'])
        except BadReference:
            fp = None
            log.info("No match for: %s", data['person'])
        sl.upsert(engine, table,
                  {'person': data.get('person'),
                   'matched': fp is not None,
                   'fingerprint': fp},
                  unique=['person'])
Example #38
def articles(engine):
    a_table = sl.get_table(engine, 'article')
    for data in sl.find(engine, a_table):
        up = {'number': data['number']}
        slug_parts = data['canonical_url'].split('/')[3:]
        if len(slug_parts) > 3:
            print slug_parts
        if len(slug_parts) == 3:
            up['ressort'], up['subressort'], _ = slug_parts
        elif len(slug_parts) == 2:
            up['ressort'], _ = slug_parts
        up['date'] = parse_date(data['date_text'])
        sl.upsert(engine, a_table, up, ['number'])
Example #39
def articles(engine):
    a_table = sl.get_table(engine, 'article')
    for data in sl.find(engine, a_table):
        up = {'number': data['number']}
        slug_parts = data['canonical_url'].split('/')[3:]
        if len(slug_parts) > 3:
            print slug_parts
        if len(slug_parts) == 3:
            up['ressort'], up['subressort'], _ = slug_parts
        elif len(slug_parts) == 2:
            up['ressort'], _ = slug_parts
        up['date'] = parse_date(data['date_text'])
        sl.upsert(engine, a_table, up, ['number'])
Example #40
def merge_speech(engine, wp, session):
    log.info("Merging media + transcript: %s/%s" % (wp, session))
    WebTV = sl.get_table(engine, 'webtv')
    WebTV_Speeches = sl.get_table(engine, 'webtv_speech')
    changes, recordings = [], []
    for recd in sl.find(engine, WebTV, wp=wp, session=session, 
            order_by='speech_id'):
        recordings.append(recd)
        if not len(changes) or changes[-1]['fingerprint'] != recd['fingerprint']:
            changes.append(recd)
    #speakers = []
    changes_index = 0

    def emit(speech):
        data = changes[changes_index].copy()
        del data['id']
        data['sequence'] = speech['sequence']
        sl.upsert(engine, WebTV_Speeches, data,
                unique=['wp', 'session', 'sequence'])

    Speech = sl.get_table(engine, 'speech')
    for speech in sl.find(engine, Speech, order_by='sequence', 
        wahlperiode=wp, sitzung=session, matched=True):
        if speech['type'] == 'poi':
            emit(speech)
            continue

        if speech['type'] == 'chair':
            match_chair(speech, changes[changes_index])

        transition = changes[changes_index]
        if len(changes) > changes_index + 1:
            transition = changes[changes_index + 1]

            if speech['fingerprint'] == transition['fingerprint']:
                changes_index += 1
        recd = changes[changes_index]
        #print [speech['fingerprint'], recd['fingerprint'], recd['item_label']]
        emit(speech)
Example #41
def load_networking(grano, engine, rep):
    for org in sl.find(engine, sl.get_table(engine, 'network_entity'),
        representativeEtlId=rep['etlId']):
        ent = canonical_actor(grano, engine, org['etlFingerPrint'])
        ent = ensure_actor(grano, ent)

        rel = find_relation(rep['outgoing'], 'target', ent,
            {'type': ASSOCIATED['name']})
        rel['type'] = ASSOCIATED['name']
        rel['source'] = rep.get('id')
        rel['target'] = ent
        rep['outgoing'] = replace_relation(rep['outgoing'], 'target', rel)
    return rep
Example #42
def get_financial_data(engine, rep):
    fds = list(sl.find(engine, sl.get_table(engine, 'financialData'),
        representativeEtlId=rep['etlId']))
    fd = max(fds, key=lambda f: f.get('endDate'))
    for key, value in fd.items():
        if key in [u'totalBudget', u'turnoverMin', u'costAbsolute', u'publicFinancingNational',
            u'otherSourcesDonation', u'eurSourcesProcurement', u'costMax', u'eurSourcesGrants',
            u'otherSourcesContributions', u'publicFinancingTotal', u'turnoverAbsolute',
            u'turnoverMax', u'costMin', u'directRepCostsMin', u'directRepCostsMax',
            u'publicFinancingInfranational', u'otherSourcesTotal']:
            if value is not None:
                value = int(float(value))
        key = 'fd' + key[0].upper() + key[1:]
        rep[key] = value
    return rep
Example #43
def resolve_stimmen(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    for data in sl.find(engine, table, source_url=source_url):
        try:
            fp = resolve_person(data['person'])
        except BadReference:
            fp = None
            log.info("No match for: %s", data['person'])
        sl.upsert(engine,
                  table, {
                      'person': data.get('person'),
                      'matched': fp is not None,
                      'fingerprint': fp
                  },
                  unique=['person'])
Example #44
def validate(force=False, filter=None, data_row_filter=None):
    stats = OpenSpendingStats()
    stats_spending = {
        'date': OpenSpendingStats(),
        'amount': OpenSpendingStats()
    }
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        validate_resource(engine, source_table, row, force, data_row_filter,
                          stats, stats_spending)
    log.info('Validate summary: \n%s' % stats.report())
    for stat_type in stats_spending:
        log.info('Validate %s: \n%s' %
                 (stat_type, stats_spending[stat_type].report()))
Example #45
def load_organisations(grano, engine, rep):
    for org in sl.find(engine, sl.get_table(engine, 'organisation'),
        representativeEtlId=rep['etlId']):
        ent = canonical_actor(grano, engine, org['name'])
        ent['orgMembers'] = int(float(org['numberOfMembers'] or 0))
        ent['actsAsOrganisation'] = True

        ent = ensure_actor(grano, ent)

        rel = find_relation(rep['outgoing'], 'target', ent,
            {'type': MEMBERSHIP['name']})
        rel['type'] = MEMBERSHIP['name']
        rel['source'] = rep.get('id')
        rel['target'] = ent
        rep['outgoing'] = replace_relation(rep['outgoing'], 'target', rel)
    return rep
Example #46
def extend_beschluesse(engine):
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(engine)
    # pprint(abstimmungen)
    Beschluss = sl.get_table(engine, "beschluss")
    for data in sl.find(engine, Beschluss):
        date = data["fundstelle"].split(" ")[0]
        data["date"] = datetime.strptime(date, "%d.%m.%Y").isoformat()
        if not data["dokument_text"]:
            continue
        if data["date"] in abstimmungen:
            abst = abstimmungen[data["date"]]
            doks = set(data["dokument_text"].split(", "))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data["date"], doks, adoks
Example #47
def extend_beschluesse(engine):
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(engine)
    #pprint(abstimmungen)
    Beschluss = sl.get_table(engine, 'beschluss')
    for data in sl.find(engine, Beschluss):
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks
Example #48
def extend_beschluesse(engine, master):
    log.info("Re-connecting beschluesse ...")
    abstimmungen = cache_abstimmungen(engine)
    #pprint(abstimmungen)
    Beschluss = sl.get_table(engine, 'beschluss')
    for data in sl.find(engine, Beschluss):
        date = data['fundstelle'].split(' ')[0]
        data['date'] = datetime.strptime(date, '%d.%m.%Y').isoformat()
        if not data['dokument_text']:
            continue
        if data['date'] in abstimmungen:
            abst = abstimmungen[data['date']]
            doks = set(data['dokument_text'].split(', '))
            for subject, adoks in abst.items():
                if len(doks & adoks):
                    print "MATCH", data['date'], doks, adoks
Example #49
def validate_sheet(engine, row, sheet_id):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    connection = engine.connect()
    trans = connection.begin()
    issue_noted_for_this_resource = False  # record first failure only
    try:
        records = 0
        for row_ in data:
            result = {'id': row_['id'], 'valid': True}
            result['signature'] = generate_signature(row_)

            if row_['DateFormatted'] is None:
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(
                        engine, row['resource_id'], row['retrieve_hash'],
                        'Date invalid (or possibly the date format is inconsistent)',
                        {
                            'row_id': row_.get('row_id'),
                            'Date': row_.get('Date')
                        })
                    issue_noted_for_this_resource = True
            if row_['AmountFormatted'] is None:
                result['valid'] = False
                if not issue_noted_for_this_resource:
                    issue(engine, row['resource_id'], row['retrieve_hash'],
                          'Amount invalid', {
                              'row_id': row_.get('row_id'),
                              'Amount': row_.get('Amount')
                          })
                    issue_noted_for_this_resource = True

            if result['valid']:
                records += 1
            sl.update(connection, spending_table, {'id': result['id']}, result)
        trans.commit()
        return records > 0
    finally:
        connection.close()
Example #50
def generate_all():
    engine = db_connect()
    spending = sl.get_table(engine, 'spending')
    sources = sources_metadata(engine)
    signatures = set()
    for row in sl.find(engine, spending, valid=True):
        if row['signature'] in signatures:
            continue
        signatures.add(row['signature'])
        if not row['resource_id'] in sources:
            continue
        row.update(sources[row['resource_id']])
        row.pop('valid', True)
        row.pop('row_id', True)
        row.pop('resource_id', True)
        row.pop('resource_hash', True)
        row['RecordETLID'] = row.pop('id', None)
        row['RecordSignature'] = row.pop('signature', None)
        row['SourceSheetID'] = row.pop('sheet_id', None)
        yield row
Example #51
def get_alignment(engine, wp, session):
    agenda_speeches = get_agenda(engine, wp, session)
    transcript_speeches = get_transcript(engine, wp, session)

    try:
        cuts = list(
            sl.find(engine,
                    sl.get_table(engine, 'alignments'),
                    wp=str(wp),
                    session=str(session),
                    order_by='sequence'))
    except KeyError:
        cuts = []

    alignment = []
    tr_offset = 0
    ag_offset = 0
    for cut in cuts:
        tr_speeches = transcript_seek(transcript_speeches, cut, tr_offset)
        tr_current = len(tr_speeches) + 1
        tr_offset = tr_offset + tr_current

        ag_speeches = agenda_seek(agenda_speeches, cut, ag_offset)
        ag_offset = ag_offset + len(ag_speeches) - 1

        section = align_section(tr_speeches, ag_speeches)
        alignment.extend(section)

        data = {
            'item_id': cut.get('item_id'),
            'speech_id': cut.get('speech_id'),
            'sequence': cut.get('sequence'),
            'agenda_fp': ag_speeches[-1].get('fingerprint'),
            'transcript_fp': transcript_speeches[tr_current][1]
        }
        alignment.append(data)

    section = align_section(transcript_speeches[tr_offset:],
                            agenda_speeches[ag_offset:])
    alignment.extend(section)
    return score_alignment(alignment), alignment
Example #52
def load_gremium_mitglieder(engine, person):
    _GremiumMitglieder = sl.get_table(engine, 'gremium_mitglieder')
    for gmdata in sl.find(engine,
                          _GremiumMitglieder,
                          person_source_url=person.source_url):
        gremium = Gremium.query.filter_by(key=gmdata['gremium_key']).first()
        if gremium is None:
            gremium = lazyload_gremium(engine, gmdata['gremium_key'])
            if gremium is None:
                log.error("Gremium not found: %s" % gmdata['gremium_key'])
                continue
        role = gmdata['role']
        if role == 'obleute':
            gremium.obleute.append(person)
        elif role == 'vorsitz':
            gremium.vorsitz = person
        elif role == 'stellv_vorsitz':
            gremium.stellv_vorsitz = person
        elif role == 'mitglied':
            gremium.mitglieder.append(person)
        elif role == 'stellv_mitglied':
            gremium.stellvertreter.append(person)
Example #53
def merge_speech(engine, wp, session):
    log.info("Merging media + transcript: %s/%s" % (wp, session))
    score, alignment = get_alignment(engine, wp, session)
    log.info("Matching score: %s", score)
    agenda = get_agenda(engine, wp, session)
    agenda = dict([(a['item_id'], a) for a in agenda])
    alignment = dict([(a['sequence'], a) for a in alignment])
    item = None
    table = sl.get_table(engine, 'webtv_speech')
    for speech in sl.find(engine,
                          sl.get_table(engine, 'speech'),
                          order_by='sequence',
                          wahlperiode=wp,
                          sitzung=session,
                          matched=True):
        sequence = speech['sequence']
        item = alignment.get(sequence, item)
        data = agenda.get(item['item_id']).copy()
        del data['id']
        data['sequence'] = sequence
        sl.upsert(engine, table, data, unique=['wp', 'session', 'sequence'])
Example #54
def cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending):
    spending_table = sl.get_table(engine, 'spending')
    data = list(
        sl.find(engine,
                spending_table,
                resource_id=row['resource_id'],
                sheet_id=sheet_id))
    if not data:
        log.info('Sheet has no rows')
        return False, None
    connection = engine.connect()
    trans = connection.begin()
    date_formats = cleanup_dates.detect_formats(data)
    try:
        for date_format in date_formats.values():
            if isinstance(date_format, basestring):
                issue(engine, row['resource_id'], row['retrieve_hash'], STAGE,
                      "Couldn't detect date formats because: %s" % date_format,
                      repr(date_formats))
                return True, date_format

        if not data_row_filter:
            sl.delete(connection,
                      spending_table,
                      resource_id=row['resource_id'],
                      sheet_id=sheet_id)
        for row in data:
            if data_row_filter and data_row_filter != row['row_id']:
                continue
            row = cleanup_dates.apply(row, date_formats, stats_spending)
            row = cleanup_numbers.apply(row, stats_spending)
            row = cleanup_gov.apply(row, stats_spending)
            #row = cleanup_supplier.apply(row, engine)
            del row['id']
            sl.add_row(connection, spending_table, row)
        trans.commit()
        return True, None
    finally:
        connection.close()
Example #55
def merge_speeches(engine):
    # desired result: (position_id, debatte_id)
    referenzen = referenzen_index(engine)
    items = item_index(engine)

    log.info("Finding best matches.... ")
    matches = {}
    for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
        for (iwp, isession, item_id), idrs in items.items():
            if iwp != rwp or rsession != isession:
                continue
            ints = len(idrs.intersection(rdrs))
            if ints == 0:
                continue
            k = (ablauf_id, rwp, rsession)
            if k in matches and matches[k][1] > ints:
                continue
            matches[k] = (item_id, ints)

    log.info("Saving position associations....")
    pos_tbl = sl.get_table(engine, 'position')
    for (ablauf_id, wp, session), (item_id, n) in matches.items():
        for pos in sl.find(engine,
                           pos_tbl,
                           ablauf_id="%s/%s" % (wp, ablauf_id)):
            if not pos['fundstelle_url']:
                continue
            if 'btp/%s/%s%03d.pdf' % (wp, wp,
                                      int(session)) in pos['fundstelle_url']:
                d = {
                    'ablauf_id': pos['ablauf_id'],
                    'hash': pos['hash'],
                    'debatte_wp': wp,
                    'debatte_session': session,
                    'debatte_item_id': item_id
                }
                sl.upsert(engine, pos_tbl, d, unique=['ablauf_id', 'hash'])
Example #56
def combine(force=False, filter=None):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, **(filter or {})):
        combine_resource(engine, source_table, row, force)
Example #57
def combine_resource_id(resource_id, force=False):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table, resource_id=resource_id):
        combine_resource(engine, source_table, row, force)
Example #58
def cleanup_all(force=False):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    for row in sl.find(engine, source_table):
        cleanup_resource(engine, source_table, row, force)