Example #1
def merge_speeches(engine):
    # desired result: (position_id, debatte_id)
    referenzen = referenzen_index(engine)
    items = item_index(engine)
    
    log.info("Finding best matches.... ")
    matches = {}
    for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
        for (iwp, isession, item_id), idrs in items.items():
            if iwp != rwp or rsession != isession:
                continue
            ints = len(idrs.intersection(rdrs))
            if ints == 0:
                continue
            k = (ablauf_id, rwp, rsession)
            if k in matches and matches[k][1] > ints:
                continue
            matches[k] = (item_id, ints)

    log.info("Saving position associations....")
    pos_tbl = sl.get_table(engine, 'position')
    for (ablauf_id, wp, session), (item_id, n) in matches.items():
        for pos in sl.find(engine, pos_tbl, ablauf_id="%s/%s" % (wp, ablauf_id)):
            if not pos['fundstelle_url']:
                continue
            if 'btp/%s/%s%03d.pdf' % (wp, wp, int(session)) in pos['fundstelle_url']:
                d = {'ablauf_id': pos['ablauf_id'], 
                     'hash': pos['hash'],
                     'debatte_wp': wp,
                     'debatte_session': session,
                     'debatte_item_id': item_id}
                sl.upsert(engine, pos_tbl, d, unique=['ablauf_id', 'hash'])
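All of the snippets on this page call into the sqlaload (sl) helper module. As a rough reading aid: sl.upsert(engine, table, row, unique=[...]) inserts the row when no existing row matches the unique columns, and updates the matching row otherwise. The sketch below approximates that behaviour with SQLAlchemy; it mirrors the call signatures used in the examples but is not sqlaload's actual implementation (the real library, for instance, also creates missing tables and columns on the fly).

# Illustrative sketch of the upsert semantics assumed by these examples;
# not sqlaload's real code.
from sqlalchemy import MetaData, Table, and_, select

def get_table(engine, name):
    # Assumes the table already exists in the database.
    return Table(name, MetaData(), autoload_with=engine)

def upsert(engine, table, row, unique):
    # Match on the `unique` columns: update the existing row if one is
    # found, otherwise insert a new one.
    clause = and_(*[table.c[col] == row.get(col) for col in unique])
    with engine.begin() as conn:
        existing = conn.execute(select(table).where(clause)).first()
        if existing is None:
            conn.execute(table.insert().values(**row))
        else:
            conn.execute(table.update().where(clause).values(**row))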
Example #2
def load_transcript(engine, wp, session, incremental=True):
    url = URL % (wp, session)
    Speech = sl.get_table(engine, 'speech')
    if incremental and sl.find_one(engine, Speech,
        source_url=url, matched=True):
        return True
    if '404 Seite nicht gefunden' in fetch(url):
        return False
    sio = fetch_stream(url)
    if sio is None:
        return False
    log.info("Loading transcript: %s/%s" % (wp, session))
    seq = 0
    parser = SpeechParser(engine, sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        contrib['matched'] = True
        sl.upsert(engine, Speech, contrib, 
                  unique=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
    if parser.missing_recon:
        sl.upsert(engine, Speech, {
                    'matched': False,
                    'sitzung': session,
                    'wahlperiode': wp
            }, unique=['sitzung', 'wahlperiode'])

    return True
Example #3
def update_network_entities(engine, file_name):
    log.info("Updating network entities reference sheet: %s", file_name)
    network_entities = set()
    table = sl.get_table(engine, 'network_entity')
    if os.path.exists(file_name):
        fh = open(file_name, 'rb')
        reader = csv.DictReader(fh)
        for d in reader:
            e = [(k, v.decode('utf-8')) for (k, v) in d.items()]
            e = dict(e)
            network_entities.add((e['representativeEtlId'], e['etlFingerPrint']))
            sl.upsert(engine, table, e, ['representativeEtlId', 'etlFingerPrint'])
        fh.close()
        reps = set([ne[0] for ne in network_entities])
        rep_table = sl.get_table(engine, 'representative')
        for rep in reps:
            sl.update(engine, rep_table, {'etlId': rep}, {'network_extracted': True})

    for row in sl.all(engine, table):
        network_entities.add((row['representativeEtlId'], row['etlFingerPrint']))

    fh = open(file_name, 'wb')
    writer = None
    table = sl.get_table(engine, 'network_entity')
    for ic, fp in network_entities:
        row = {
            'representativeEtlId': ic,
            'etlFingerPrint': fp
        }
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writerow(dict(zip(row.keys(), row.keys())))
        r = [(k, unicode(v).encode('utf-8')) for k, v in row.items()]
        writer.writerow(dict(r))
    fh.close()
Example #4
def extract_resource(engine, source_table, row, force, stats):
    if not row['retrieve_status']:
        stats.add_source('Previous step (retrieve) not complete', row)
        log.debug('Row has no retrieve status - skipping')
        return

    # Skip over tables we have already extracted
    if not force and sl.find_one(
            engine,
            source_table,
            resource_id=row['resource_id'],
            extract_status=True,
            extract_hash=row['retrieve_hash']) is not None:
        stats.add_source('Already extracted', row)
        return

    log.info("Extract: /dataset/%s/resource/%s", row['package_name'],
             row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)

    status, sheets = extract_resource_core(engine, row, stats)
    sl.upsert(engine,
              source_table, {
                  'resource_id': row['resource_id'],
                  'extract_hash': row['retrieve_hash'],
                  'extract_status': status,
                  'sheets': sheets
              },
              unique=['resource_id'])
Example #5
def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sio = find_local(url)
    sample = {'source_etag': 'local'}
    if sio is None:
        sample = sl.find_one(engine, table, source_url=url, matched=True)
        response, sio = fetch_stream(url)
        sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s" , wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib, 
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
                    'matched': True,
                    'source_url': url,
            }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data
Example #6
def extend_positions(engine):
    log.info("Amending positions ...")
    Position = sl.get_table(engine, 'position')
    for i, data in enumerate(sl.find(engine, Position)):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        dt, rest = data['fundstelle'].split("-", 1)
        data['date'] = datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat()
        if ',' in data['urheber']:
            typ, quelle = data['urheber'].split(',', 1)
            data['quelle'] = re.sub("^.*Urheber.*:", "", quelle).strip()
            data['typ'] = typ.strip()
        else:
            data['typ'] = data['urheber']

        br = 'Bundesregierung, '
        if data['urheber'].startswith(br):
            data['urheber'] = data['urheber'][len(br):]

        data['fundstelle_doc'] = None
        if data['fundstelle_url'] and \
                'btp' in data['fundstelle_url']:
            data['fundstelle_doc'] = data['fundstelle_url']\
                    .rsplit('#',1)[0]

        hash = sha1(data['fundstelle'].encode('utf-8') \
                + data['urheber'].encode('utf-8') + \
                data['ablauf_id'].encode('utf-8')).hexdigest()
        data['hash'] = hash[:10]
        sl.upsert(engine, Position, data, unique=UNIQUE)
Example #7
def cleanup_resource(engine, source_table, row, force):
    if not row["combine_status"]:
        return

    # Skip over tables we have already cleaned up
    if (
        not force
        and sl.find_one(
            engine, source_table, resource_id=row["resource_id"], cleanup_status=True, cleanup_hash=row["combine_hash"]
        )
        is not None
    ):
        return

    log.info("Cleanup: %s, Resource %s", row["package_name"], row["resource_id"])

    status = True
    for sheet_id in range(0, row["sheets"]):
        sheet_status = cleanup_sheet(engine, row, sheet_id)
        if status and not sheet_status:
            status = False
    sl.upsert(
        engine,
        source_table,
        {"resource_id": row["resource_id"], "cleanup_hash": row["combine_hash"], "cleanup_status": status},
        unique=["resource_id"],
    )
Example #8
def combine_resource(engine, source_table, row, force, stats):
    if not row['extract_status']:
        stats.add_source('Previous step (extract) not complete', row)
        return

    # Skip over tables we have already combined
    if not force and sl.find_one(engine,
                                 source_table,
                                 resource_id=row['resource_id'],
                                 combine_hash=row['extract_hash'],
                                 combine_status=True) is not None:
        stats.add_source('Already combined', row)
        return

    log.info("Combine: /dataset/%s/resource/%s", row['package_name'],
             row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)

    status = combine_resource_core(engine, row, stats)
    sl.upsert(engine,
              source_table, {
                  'resource_id': row['resource_id'],
                  'combine_hash': row['extract_hash'],
                  'combine_status': status,
              },
              unique=['resource_id'])
Example #9
def make_fingerprint(engine, person):
    try:
        long_name = make_long_name(person)
        try:
            long_name = resolve_person(long_name)
            log.info(" -> %s" % long_name.strip())
        except:
            log.error("Resolve did not work")
            pass

        Person = sl.get_table(engine, 'person')
        sl.upsert(engine, Person, {
            'fingerprint': long_name,
            'slug': url_slug(long_name),
            'mdb_id': person['mdb_id']
            }, unique=['mdb_id'])
        Rolle = sl.get_table(engine, 'rolle')
        sl.upsert(engine, Rolle, {
            'mdb_id': person['mdb_id'],
            'fingerprint': long_name
            }, unique=['mdb_id'])
        person['fingerprint'] = long_name
    except BadReference:
        log.error("Bad Reference %s", person)
        pass
Example #10
def cleanup_resource(engine, source_table, row, force):
    if not row['combine_status']:
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine,
                                 source_table,
                                 resource_id=row['resource_id'],
                                 cleanup_status=True,
                                 cleanup_hash=row['combine_hash']) is not None:
        return

    log.info("Cleanup: %s, Resource %s", row['package_name'],
             row['resource_id'])

    status = True
    for sheet_id in range(0, row['sheets']):
        sheet_status = cleanup_sheet(engine, row, sheet_id)
        if status and not sheet_status:
            status = False
    sl.upsert(engine,
              source_table, {
                  'resource_id': row['resource_id'],
                  'cleanup_hash': row['combine_hash'],
                  'cleanup_status': status,
              },
              unique=['resource_id'])
Example #11
def extend_position(engine, table, data):
    dt, rest = data['fundstelle'].split("-", 1)
    data['date'] = datetime.strptime(dt.strip(), "%d.%m.%Y").isoformat()
    if ',' in data['urheber']:
        typ, quelle = data['urheber'].split(',', 1)
        data['quelle'] = re.sub("^.*Urheber.*:", "", quelle).strip()
        data['typ'] = typ.strip()
    else:
        data['typ'] = data['urheber']

    br = 'Bundesregierung, '
    if data['urheber'].startswith(br):
        data['urheber'] = data['urheber'][len(br):]

    data['fundstelle_doc'] = None
    if data['fundstelle_url'] and \
            'btp' in data['fundstelle_url']:
        data['fundstelle_doc'] = data['fundstelle_url']\
                .rsplit('#',1)[0]

    hash = sha1(data['fundstelle'].encode('utf-8') \
            + data['urheber'].encode('utf-8') + \
            data['source_url'].encode('utf-8')).hexdigest()
    data['hash'] = hash[:10]
    sl.upsert(engine, table, data, unique=['id'])
Example #12
def extend_speeches(engine, wahlperiode=17):
    log.info("Amending speeches with DRS ...")
    drs_match = re.compile(DRS_MATCH % (wahlperiode, wahlperiode))
    Speech = sl.get_table(engine, 'speech')
    SpeechDocument = sl.get_table(engine, 'speech_document')
    for i, data in enumerate(sl.find(engine, Speech)):
        if data.get('type') != 'chair':
            continue
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        m = drs_match.search(data.get('text'))
        if m is None:
            continue
        for i, grp in enumerate(m.groups()):
            if grp and '/' in grp:
                wp, nummer = grp.split('/', 1)
                sl.upsert(
                    engine,
                    SpeechDocument, {
                        'group': i,
                        'sequence': data['sequence'],
                        'sitzung': data['sitzung'],
                        'wahlperiode': wahlperiode,
                        'dok_nummer': nummer
                    },
                    unique=['sequence', 'sitzung', 'wahlperiode', 'group'])
Example #13
def map_columns():
    engine, columns_table = connect()

    q = select([columns_table.c.normalised, columns_table.c.count, columns_table.c.valid], order_by=[columns_table.c.count.desc().nullslast()])

    for normalised, count, valid in engine.execute(q):
        if valid is not None:
            continue
        try:
            columns = map_column(engine, columns_table, normalised, count)
            if columns is not None:
                sl.upsert(engine, columns_table, 
                          {'normalised': normalised,
                           'valid': True,
                           'column_map': json.dumps(columns)},
                          ['normalised'])
            else:
                sl.upsert(engine, columns_table, 
                          {'normalised': normalised,
                           'valid': False}, 
                          ['normalised'])
        except SystemExit:
            raise
        except:
            traceback.print_exc()
Example #14
def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sample = sl.find_one(engine, table, source_url=url, matched=True)
    response, sio = fetch_stream(url)
    sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url, 
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s" , wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib, 
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
                    'matched': True,
                    'source_url': url,
            }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data
Example #15
def generate_person_long_names(engine):
    log.info("Generating person fingerprints and slugs...")
    from offenesparlament.transform.namematch import match_speaker
    nkp = nk_persons()
    Person = sl.get_table(engine, 'person')
    for person in sl.find(engine, Person):
        long_name = make_long_name(person)
        try:
            long_name = match_speaker(long_name)
        except NKNoMatch:
            pass
        log.info(" -> %s" % long_name.strip())
        slug = url_slug(long_name)
        sl.upsert(engine, Person, {
                         'fingerprint': long_name,
                         'slug': slug,
                         'id': person['id']},
                         unique=['id'])
        tries = 0
        while True:
            try:
                nkp.ensure_value(long_name, data=person)
            except ValueError, E:
                log.warn('Exception: %s' % str(E))
                tries = tries + 1
                if tries > 5:
                    raise
            else:
                break
Example #16
def load_budget(base_url, year, engine, table):
    context = {'data_year': year}
    print "\nHaushalt: %s" % year
    i = 0
    for row in load_einzelplaene(base_url % year, context):
        row['titel_id'] = row['id']
        del row['id']
        row['remarks'] = "\n\n".join(row['remarks'])
        commitment_appropriations = row['commitment_appropriations'].copy()
        del row['commitment_appropriations']
        #if len(commitment_appropriations):
        #    #print len(commitment_appropriations)
        
        row['commitment_year'] = None
        row['source_id'] = str(year) + "." + str(i)
        sl.upsert(engine, table, row, UNIQUE_COLUMNS)
        i += 1

        for year, amount in commitment_appropriations.items():
            ca = row.copy()
            ca['commitment_year'] = context['data_year']
            ca['year'] = year
            ca['amount'] = amount
            ca['financial_type'] = 'VE'
            ca['source_id'] = str(year) + "." + str(i)
            sl.upsert(engine, table, ca, UNIQUE_COLUMNS)
            i += 1
Example #17
def mark_done(engine, url):
    table = sl.get_table(engine, 'speech')
    sl.upsert(engine,
              table, {
                  'loaded': True,
                  'source_url': url,
              },
              unique=['source_url'])
Example #18
def process_rows(handlefunc, engine=None):
    if engine is None:
        engine = make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.all(engine, table):
        out = handlefunc(row)
        sl.upsert(engine, table, out, ['id'])
    return table
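A hypothetical handler to go with process_rows() above, only to illustrate the contract the example assumes: the handler receives each row of the fts table as a dict and returns the dict that gets upserted back on 'id'. The column names used here are made up for illustration.

# Hypothetical handler; 'total' and 'total_num' are assumed column names.
def normalise_total(row):
    raw = (row.get('total') or '').replace(',', '').strip()
    row['total_num'] = float(raw) if raw else None
    return row

# process_rows(normalise_total)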
Example #19
def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)

    if not engine.has_table('raw_%s' % table_suffix):
        return

    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table, resource_id=resource_id, table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()

    start = time.time()

    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table, normalised=normalised_headers)

        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k,n in normalise_map.iteritems():
            if n in column_mapping and column_mapping[n] is not None and len(column_mapping[n]) > 0:
                mapping[k] = column_mapping[n]
        
        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)
        sl.upsert(connection, condensed_table, {'resource_id': resource_id,
                                                'table_id': table_id,
                                                'condense_time': time.time() - start,
                                                }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()
Example #20
def validate_resource(engine, source_table, row, force, data_row_filter, stats,
                      stats_spending):
    if not row['cleanup_status']:
        stats.add_source('Previous step (cleanup) not complete', row)
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(
            engine,
            source_table,
            resource_id=row['resource_id'],
            validate_status=True,
            validate_hash=row['cleanup_hash']) is not None:
        stats.add_source('Already validated', row)
        return

    log.info("Validate: /dataset/%s/resource/%s", row['package_name'],
             row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)

    no_errors = True
    no_records = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_records, sheet_error_message = validate_sheet(
            engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_records and sheet_records:
            no_records = False

    if data_row_filter:
        stats.add_source(
            'Resource data filtered, not saving resource cleanup.', row)
    else:
        log.info("Result: records=%s errors=%s", not no_records, not no_errors)
        sl.upsert(engine,
                  source_table, {
                      'resource_id': row['resource_id'],
                      'validate_hash': row['cleanup_hash'],
                      'validate_status': no_errors,
                  },
                  unique=['resource_id'])
        if no_errors:
            if no_records:
                stats.add_source('No records but no errors', row)
            else:
                stats.add_source('Validated ok', row)
        else:
            if no_records:
                stats.add_source(
                    'All transactions invalid: %s' % error_message, row)
            else:
                stats.add_source(
                    'Some transactions invalid: %s' % error_message, row)
Example #21
def speechmatcher_alignment_post(wp, session):
    engine = etl_engine()
    table = sl.get_table(engine, 'alignments')
    data = dict(request.form.items())
    data['sequence'] = int(data['sequence'])
    data['wp'] = wp
    data['session'] = session
    sl.upsert(engine, table, data, ['wp', 'session', 'sequence'])
    return speechmatcher_alignment_get(wp, session)
Example #22
def match_beitraege(engine, url):
    table = sl.get_table(engine, 'beitrag')
    for beitrag in sl.distinct(engine, table, *KEYS, source_url=url):
        match = match_beitrag(engine, beitrag, url)
        beitrag['fingerprint'] = match
        beitrag['matched'] = match is not None
        if match:
            ensure_rolle(beitrag, match, engine)
        sl.upsert(engine, table, beitrag, unique=KEYS)
Example #23
def merge():
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.distinct(engine, table, 'beneficiary', 'country_code'):
        canonical, uri, score = lookup(row.get('beneficiary'), row.get('country_code'), engine)
        row['beneficiary_canonical'] = canonical
        row['beneficiary_uri'] = uri
        row['beneficiary_score'] = score
        sl.upsert(engine, table, row, ['beneficiary', 'country'])
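Many of these examples iterate with sl.find and sl.distinct. As a reading aid only: find(engine, table, **filters) appears to yield row dicts matching the keyword filters, and distinct(engine, table, *columns, **filters) appears to yield one dict per distinct combination of the named columns. The sketch below approximates that behaviour (it ignores extras such as the order_by keyword seen in Example #47) and is not the library's actual implementation.

# Approximate behaviour of sl.find / sl.distinct as used on this page.
from sqlalchemy import select

def find(engine, table, **filters):
    stmt = select(table)
    for key, value in filters.items():
        stmt = stmt.where(table.c[key] == value)
    with engine.connect() as conn:
        for row in conn.execute(stmt):
            yield dict(row._mapping)

def distinct(engine, table, *columns, **filters):
    stmt = select(*[table.c[col] for col in columns]).distinct()
    for key, value in filters.items():
        stmt = stmt.where(table.c[key] == value)
    with engine.connect() as conn:
        for row in conn.execute(stmt):
            yield dict(zip(columns, row))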
Example #24
def merge():
    read_countries()
    engine = util.make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.distinct(engine, table, 'country'):
        country = row.get('country')
        data = match(country)
        row['country_code'] = data.get('iso_3166-1_2')
        row['country_common'] = data.get('common')
        sl.upsert(engine, table, row, ['country'])
Example #25
def extend_ablaeufe(engine, master):
    log.info("Amending ablaeufe ...")
    Ablauf = sl.get_table(engine, 'ablauf')
    typen = [(t.get('typ'), t.get('class')) for t in master['ablauf_typ']]
    typen = dict(typen)
    for data in sl.distinct(engine, Ablauf, 'typ'):
        klass = typen.get(data.get('typ'))
        sl.upsert(engine, Ablauf, {'typ': data.get('typ'),
                         'class': klass}, 
                         unique=['typ'])
Example #26
def ensure_rolle(beitrag, fp, engine):
    rolle = {
        'fingerprint': fp,
        'ressort': beitrag.get('ressort'),
        'fraktion': beitrag.get('fraktion'),
        'funktion': beitrag.get('funktion')
        }
    Rolle = sl.get_table(engine, 'rolle')
    sl.upsert(engine, Rolle, rolle,
            unique=['fingerprint', 'funktion'])
Example #27
def clean_ablauf(engine, data):
    try:
        table = sl.get_table(engine, 'ablauf')
        data['class'] = resolve_type(data.get('typ'))
        data['stage'] = resolve_stage(data.get('stand'))
        d = {'class': data['class'], 
             'stage': data['stand'],
             'source_url': data['source_url']}
        sl.upsert(engine, table, d, unique=['source_url'])
    except BadReference:
        pass
Example #28
def save():
    etlId = request.form.get('representativeEtlId')
    matches = set(request.form.getlist('matches[]'))
    for match in matches:
        match = match.strip().strip(",").strip(";").strip(".").strip()
        sl.upsert(engine, network_entity, {'etlFingerPrint': match,
                                           'representativeEtlId': etlId},
                ['etlFingerPrint', 'representativeEtlId'])
    sl.upsert(engine, representative, {'etlId': etlId,
        'network_extracted': True}, ['etlId'])
    return jsonify({'status': 'OK'})
Example #29
def create_entities(engine):
    log.info("De-normalizing global entities collection...")
    table = sl.get_table(engine, 'entity')
    for tbl in ['representative', 'person', 'financialDataTurnover',
        'organisation', 'network_entity']:
        for row in sl.all(engine, sl.get_table(engine, tbl)):
            entity = {'etlFingerPrint': row.get('etlFingerPrint')}
            entity['legalStatus'] = row.get('legalStatus', '')
            entity['countryCode'] = row.get('contactCountryCode', '')
            entity['etlTable'] = tbl
            sl.upsert(engine, table, entity, ['etlFingerPrint', 'etlTable'])
Example #30
def load_person(person, role, childBase, engine):
    table = sl.get_table(engine, 'person')
    person_ = childBase.copy()
    person_.update(person)
    person_['role'] = role
    person_['etlFingerPrint'] = '%s %s %s' % (person['title'] or '',
                                              person['firstName'],
                                              person['lastName'])
    person_['etlFingerPrint'] = person_['etlFingerPrint'].strip()
    sl.upsert(engine, table, person_, ['representativeEtlId',
                                       'role',
                                       'etlFingerPrint'])
Example #31
def integrate_recon(engine, table, qfunc, src_col, dst_name_col, dst_uri_col,
        min_score=None, limit=200, memory_name=None):
    if memory_name is None:
        memory_name = "recon_%s_%s" % (table.name, src_col)
    memory = SQLALoadMemory(engine, table=memory_name)
    for row in sl.distinct(engine, table, src_col):
        res = interactive(qfunc, row[src_col], min_score=min_score,
                memory=memory, limit=limit)
        if res is not None:
            #print row.get(src_col), " -> ", res.name.encode('utf-8'), res.score
            sl.upsert(engine, table, {src_col: row[src_col], dst_name_col: res.name, 
                      dst_uri_col: res.uri}, [src_col])
Example #32
def resolve_stimmen(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    for data in sl.find(engine, table, source_url=source_url):
        try:
            fp = resolve_person(data['person'])
        except BadReference:
            fp = None
            log.info("No match for: %s", data['person'])
        sl.upsert(engine, table,
                  {'person': data.get('person'),
                   'matched': fp is not None,
                   'fingerprint': fp},
                  unique=['person'])
Example #33
def fetch_package(client, package_name, engine, table):
    print package_name
    pkg = client.package_entity_get(package_name)
    for res in pkg['resources']:
        sl.upsert(engine, table, {
            'resource_id': res['id'],
            'package_id': pkg['id'],
            'package_name': pkg['name'],
            'url': res['url'],
            'publisher': pkg.get('extras', {}).get('published_by'),
            'format': res['format'],
            'description': res['description']
            }, ['resource_id'])
Example #34
def articles(engine):
    a_table = sl.get_table(engine, 'article')
    for data in sl.find(engine, a_table):
        up = {'number': data['number']}
        slug_parts = data['canonical_url'].split('/')[3:]
        if len(slug_parts) > 3:
            print slug_parts
        if len(slug_parts) == 3:
            up['ressort'], up['subressort'], _ = slug_parts
        elif len(slug_parts) == 2:
            up['ressort'], _ = slug_parts
        up['date'] = parse_date(data['date_text'])
        sl.upsert(engine, a_table, up, ['number'])
Example #35
def scrape_speeches(engine, data):
    url = WEBTV_SPEECHES % (data['wp'], data['session'], data['item_id'])
    response, doc = _html(url)
    rows = doc.findall('//tr')
    table = sl.get_table(engine, 'webtv')
    for i, row in enumerate(rows):
        if i % 4 != 0:
            continue
        data['speaker'] = row.xpath('string()').strip()
        if isinstance(data['speaker'], str):
            data['speaker'] = data['speaker'].encode('latin-1').decode('utf-8')
        data['speech_id'] = rows[i + 2].find('.//a').get('href').split('=')[-1]
        sl.upsert(engine, table, data, ['speech_id'])
Example #36
def articles(engine):
    a_table = sl.get_table(engine, 'article')
    for data in sl.find(engine, a_table):
        up = {'number': data['number']}
        slug_parts = data['canonical_url'].split('/')[3:]
        if len(slug_parts) > 3:
            print slug_parts
        if len(slug_parts) == 3:
            up['ressort'], up['subressort'], _ = slug_parts
        elif len(slug_parts) == 2:
            up['ressort'], _ = slug_parts
        up['date'] = parse_date(data['date_text'])
        sl.upsert(engine, a_table, up, ['number'])
Example #37
def match_beitraege(engine):
    Beitrag = sl.get_table(engine, 'beitrag')
    for i, beitrag in enumerate(sl.distinct(engine, Beitrag, 'vorname',
        'nachname', 'funktion', 'land', 'fraktion', 'ressort', 'ort')):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        match = match_beitrag(engine, beitrag)
        ensure_rolle(beitrag, match, engine)
        beitrag['fingerprint'] = match
        beitrag['matched'] = match is not None
        sl.upsert(engine, Beitrag, beitrag, unique=['vorname', 'nachname',
            'funktion', 'land', 'fraktion', 'ressort', 'ort'])
Example #38
def load_ausschuss(url, engine, table):
    doc = _xml(url)
    a = {'source_url': url}
    a['key'] = doc.findtext('/ausschussId')
    a['name'] = doc.findtext('/ausschussName')
    log.info("Ausschuss (%s): %s" % (a['key'], a['name']))
    a['aufgabe'] = doc.findtext('/ausschussAufgabe')
    a['image_url'] = doc.findtext('/ausschussBildURL')
    a['image_copyright'] = doc.findtext('/ausschussCopyright')
    a['rss_url'] = RSS_FEEDS.get(a['key'])
    a['url'] = URL_PATTERN % a['key']
    a['type'] = 'ausschuss'
    sl.upsert(engine, table, a, unique=['key'])
Example #39
def resolve_stimmen(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    for data in sl.find(engine, table, source_url=source_url):
        try:
            fp = resolve_person(data['person'])
        except BadReference:
            fp = None
            log.info("No match for: %s", data['person'])
        sl.upsert(engine,
                  table, {
                      'person': data.get('person'),
                      'matched': fp is not None,
                      'fingerprint': fp
                  },
                  unique=['person'])
Example #40
def cleanup_resource(engine, source_table, row, force, data_row_filter, stats,
                     stats_spending):
    if not row['combine_status']:
        stats.add_source('Previous step (combine) not complete', row)
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine,
                                 source_table,
                                 resource_id=row['resource_id'],
                                 cleanup_status=True,
                                 cleanup_hash=row['combine_hash']) is not None:
        stats.add_source('Already cleaned up', row)
        return

    log.info("Cleanup: /dataset/%s/resource/%s", row['package_name'],
             row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)

    no_rows = True
    no_errors = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_has_rows, sheet_error_message = cleanup_sheet(
            engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_rows and sheet_has_rows:
            no_rows = False
    if data_row_filter:
        stats.add_source(
            'Resource data filtered, not saving resource cleanup.', row)
    else:
        sl.upsert(engine,
                  source_table, {
                      'resource_id': row['resource_id'],
                      'cleanup_hash': row['combine_hash'],
                      'cleanup_status': no_errors,
                  },
                  unique=['resource_id'])
        if no_rows:
            stats.add_source('Empty sheet', row)
        elif no_errors:
            stats.add_source('Cleaned up ok', row)
        else:
            stats.add_source(error_message, row)
Example #41
def retrieve(row, engine, source_table, force, stats):
    content_id = None
    if not force and row.get('retrieve_status') is True \
           and row.get('retrieve_hash') and os.path.exists(source_path(row)):
        # cached file exists and url is unchanged
        stats.add_source('Already cached and in database', row)
        return

    # fetch the file
    log.info("Retrieve: /dataset/%s/resource/%s", row['package_name'], row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)
    url = row['url'].strip() # no-one can disagree with doing .strip()
    log.info('Fetching: "%s"', url)
    success, content_or_error = get_url(url)
    if not success:
        # URL didn't work, so try 'fixed' versions of it
        original_error = content_or_error
        fixed_urls = fix_url(url)
        for fixed_url in fixed_urls:
            log.info('Fetching fixed url: "%s"', fixed_url)
            success, content_or_error = get_url(fixed_url)
            if success:
                break
    if success:
        stats.add_source('Downloaded', row)
    elif os.path.exists(source_path(row)):
        stats.add_source('Could not download but it was in the cache', row)
        with open(source_path(row), 'rb') as fh:
            content_or_error = fh.read()
        success = True

    if success:
        data = content_or_error
        content_id = calculate_hash(data)
        fh = open(source_path(row), 'wb')
        fh.write(data)
        fh.close()
    else:
        stats.add_source(original_error, row)
        issue(engine, row['resource_id'], None, STAGE,
              original_error, url.encode('utf8', 'ignore'))
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'retrieve_status': success,
        'retrieve_hash': content_id},
        unique=['resource_id'])
Example #42
def add_to_gremium(node, url, role, engine):
    key = node.get('id')
    table = sl.get_table(engine, 'gremium')
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {'key': key, 'type': 'sonstiges'}
        g['name'] = node.findtext('gremiumName')
        g['url'] = node.findtext('gremiumURL')
        sl.upsert(engine, table, g, unique=['key'])
    table = sl.get_table(engine, 'gremium_mitglieder')
    sl.upsert(engine,
              table, {
                  'gremium_key': g['key'],
                  'person_source_url': url,
                  'role': role
              },
              unique=['person_source_url', 'gremium_key', 'role'])
Example #43
def resolve_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    data = sl.find_one(engine, table, source_url=source_url)
    if data is None:
        log.error("No data: %s", source_url)
        return
    subject = data['subject']
    try:
        title = resolve_votes(subject)
    except BadReference:
        title = None
        log.info("No match for: %s", data['person'])
    sl.upsert(engine,
              table, {
                  'subject': subject,
                  'title': title
              },
              unique=['subject'])
Example #44
def scrape_gremium(engine, url, force=False):
    table = sl.get_table(engine, 'gremium')
    response, doc = _xml(url)
    a = sl.find_one(engine, table, source_url=url)
    if a is None:
        a = {'source_url': url}
    a = check_tags(a, response, force)
    a['key'] = doc.findtext('/ausschussId')
    a['name'] = doc.findtext('/ausschussName')
    log.info("Ausschuss (%s): %s" % (a['key'], a['name']))
    a['aufgabe'] = doc.findtext('/ausschussAufgabe')
    a['image_url'] = doc.findtext('/ausschussBildURL')
    a['image_copyright'] = doc.findtext('/ausschussCopyright')
    a['rss_url'] = GREMIUM_RSS_FEEDS.get(a['key'])
    a['url'] = URL_PATTERN % a['key']
    a['type'] = 'ausschuss'
    sl.upsert(engine, table, a, unique=['key'])
    return a
Example #45
def combine_resource(engine, source_table, row, force):
    if not row['extract_status']:
        return

    # Skip over tables we have already combined
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            combine_hash=row['extract_hash'],
            combine_status=True) is not None:
        return

    log.info("Combine: %s, Resource %s", row['package_name'], row['resource_id'])

    status = combine_resource_core(engine, row)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'combine_hash': row['extract_hash'],
        'combine_status': status,
        }, unique=['resource_id'])
Example #46
def load_profiles(engine):
    doc = etree.parse(FEED_URL)
    Person = sl.get_table(engine, 'person')
    for profile in doc.findall('//PROFIL'):
        name = profile.findtext('.//VORNAME')
        if name is None:
            continue
        name += ' ' + profile.findtext('.//NACHNAME')
        partei = profile.findtext('.//PARTEI')
        name += ' ' + PARTEI_MAPPING.get(partei, partei)
        try:
            fp = resolve_person(name)
            sl.upsert(engine,
                      Person, {
                          'awatch_url': profile.get('url'),
                          'fingerprint': fp
                      },
                      unique=['fingerprint'])
        except BadReference:
            pass
Example #47
def merge_speech(engine, wp, session):
    log.info("Merging media + transcript: %s/%s" % (wp, session))
    score, alignment = get_alignment(engine, wp, session)
    log.info("Matching score: %s", score)
    agenda = get_agenda(engine, wp, session)
    agenda = dict([(a['item_id'], a) for a in agenda])
    alignment = dict([(a['sequence'], a) for a in alignment])
    item = None
    table = sl.get_table(engine, 'webtv_speech')
    for speech in sl.find(engine,
                          sl.get_table(engine, 'speech'),
                          order_by='sequence',
                          wahlperiode=wp,
                          sitzung=session,
                          matched=True):
        sequence = speech['sequence']
        item = alignment.get(sequence, item)
        data = agenda.get(item['item_id']).copy()
        del data['id']
        data['sequence'] = sequence
        sl.upsert(engine, table, data, unique=['wp', 'session', 'sequence'])
Example #48
def extract_resource(engine, source_table, row, force):
    if not row['retrieve_status']:
        log.debug('Row has no retrieve status - skipping')
        return

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            extract_status=True,
            extract_hash=row['retrieve_hash']) is not None:
        return

    log.info("Extracting: %s, File %s", row['package_name'], row['resource_id'])

    status, sheets = extract_resource_core(engine, row)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'extract_hash': row['retrieve_hash'],
        'extract_status': status,
        'sheets': sheets
        }, unique=['resource_id'])
Example #49
def lookup(val, engine):
    supplier_table = sl.get_table(engine, 'supplier')
    data = sl.find_one(engine, supplier_table, name=val)
    if data is not None:
        return data['canonical'], data['uri'], data['score']
    try:
        query = json.dumps({'query': val, 'limit': 1})
        res = session.get('http://opencorporates.com/reconcile/gb',
                          params={'query': query})
        data = {'name': val, 'canonical': None, 'uri': None, 'score': 0}
        if res.ok and res.json and len(res.json.get('result')):
            r = res.json.get('result').pop()
            data['canonical'] = r['name']
            data['uri'] = r['uri']
            data['score'] = r['score']
        log.info('OpenCorporates Lookup: %s -> %s', val, data['canonical'])
        sl.upsert(engine, supplier_table, data, unique=['name'])
        return data['canonical'], data['uri'], data['score']
    except Exception, ex:
        log.exception(ex)
        return None, None, None
Example #50
def make_person(engine, beitrag, fp, source_url):
    try:
        fp = resolve_person(fp)
        person = {
            'fingerprint': fp,
            'slug': url_slug(fp),
            'source_url': source_url,
            'vorname': beitrag['vorname'],
            'nachname': beitrag['nachname'],
            'ort': beitrag.get('ort'),
            'ressort': beitrag.get('ressort'),
            'land': beitrag.get('land'),
            'fraktion': beitrag.get('fraktion')
        }
        sl.upsert(engine,
                  sl.get_table(engine, 'person'),
                  person,
                  unique=['fingerprint'])
    except BadReference:
        pass
    return fp
Example #51
def handle_list(page):
    texts = page.findall('text')
    header = [c.xpath("string()") for c in texts[:20]]
    if header[1].strip() == 'Seite:':
        col_offset = 3
    else:
        for i, h in enumerate(header):
            if 'Name' in h:
                col_offset = i
                break
    fraktion = texts[col_offset - 1].xpath("string()")
    fraktion = fraktion.replace(u"ÜNDNIS`", "")
    fraktion = fraktion.replace(u"ÜNDNIS'", "")
    columns = [(int(c.get('left')), c.xpath("string()")) for c in \
               texts[col_offset:col_offset+6]]
    texts = texts[col_offset + 6:]
    name = u''
    #print columns
    for i, t in enumerate(texts):
        txt = t.xpath('string()').strip()
        if txt == 'Summe':
            break
        if not len(txt):
            continue
        left, field = min(columns,
                          key=lambda c: abs(int(t.get('left')) - c[0]))
        if 'Name' in field:
            name += ' ' + txt
        if txt == 'X':
            field = field.strip().strip('.').strip()
            data = {
                'subject': unicode(subject),
                'person': name.strip() + ' ' + fraktion,
                'date': unicode(date),
                'vote': unicode(field)
            }
            data.update(base_data)
            sl.upsert(engine, Vote, data, unique=['subject', 'person'])
            name = u''
Example #52
def merge_speeches(engine):
    # desired result: (position_id, debatte_id)
    referenzen = referenzen_index(engine)
    items = item_index(engine)

    log.info("Finding best matches.... ")
    matches = {}
    for (ablauf_id, rwp, rsession), rdrs in referenzen.items():
        for (iwp, isession, item_id), idrs in items.items():
            if iwp != rwp or rsession != isession:
                continue
            ints = len(idrs.intersection(rdrs))
            if ints == 0:
                continue
            k = (ablauf_id, rwp, rsession)
            if k in matches and matches[k][1] > ints:
                continue
            matches[k] = (item_id, ints)

    log.info("Saving position associations....")
    pos_tbl = sl.get_table(engine, 'position')
    for (ablauf_id, wp, session), (item_id, n) in matches.items():
        for pos in sl.find(engine,
                           pos_tbl,
                           ablauf_id="%s/%s" % (wp, ablauf_id)):
            if not pos['fundstelle_url']:
                continue
            if 'btp/%s/%s%03d.pdf' % (wp, wp,
                                      int(session)) in pos['fundstelle_url']:
                d = {
                    'ablauf_id': pos['ablauf_id'],
                    'hash': pos['hash'],
                    'debatte_wp': wp,
                    'debatte_session': session,
                    'debatte_item_id': item_id
                }
                sl.upsert(engine, pos_tbl, d, unique=['ablauf_id', 'hash'])
Example #53
def parse_angaben(engine, data):
    if not data.get('angaben'):
        return
    snippet = '<x>' + data['angaben'] + '</x>'
    doc = html.fragment_fromstring(snippet)
    table = sl.get_table(engine, 'angaben')
    data = {'source_url': data['source_url']}
    wrapped_name = False
    for el in doc:
        if el.tag == 'h3':
            wrapped_name = False
            data['section'] = el.text.split('. ', 1)[-1]
        elif el.tag == 'strong' or not el.text or not el.get('class'):
            continue
        elif 'voa_abstand' in el.get('class') or wrapped_name:
            client = el.text
            if wrapped_name:
                client = data['client'] + ' ' + client
            data['client'] = client
            client = client.strip().strip(',')
            els = client.rsplit(',', 2)
            if len(els) == 3:
                wrapped_name = False
                data['client_name'] = els[0].strip()
                data['client_city'] = els[1].strip()
            else:
                wrapped_name = True
                continue
        else:
            data['service'] = el.text
            data['level'] = 'Stufe 0'
            for name in LEVELS:
                if name.lower() in data['service'].lower():
                    data['level'] = name
            sl.upsert(engine, table, data,
                ['source_url', 'section', 'client', 'service'])
                    res.status_code, url_printable)
                result = 'Download failed (status %s)' % res.status_code
    except requests.Timeout, re:
        result = 'Timeout accessing URL'
        issue(engine, row['resource_id'], None, result, url_printable)
        success = False
    except Exception, re:
        log.exception(re)
        issue(engine, row['resource_id'], None, 'Exception occurred',
              unicode(re))
        success = False
        result = 'Exception occurred'
    sl.upsert(engine,
              source_table, {
                  'resource_id': row['resource_id'],
                  'retrieve_status': success,
                  'retrieve_hash': content_id
              },
              unique=['resource_id'])
    return result


def retrieve_some(force=False, **filters):
    engine = db_connect()
    source_table = sl.get_table(engine, 'source')
    result_counts = defaultdict(int)
    for row in sl.find(engine, source_table, **filters):
        result = retrieve(row, engine, source_table, force)
        result_counts['total'] += 1
        result_counts[result] += 1
    log.info('Total %i URLs', result_counts.pop('total'))