Example #1
def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)

    if not engine.has_table('raw_%s' % table_suffix):
        return

    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table, resource_id=resource_id, table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()

    start = time.time()

    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table, normalised=normalised_headers)

        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return

        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k,n in normalise_map.iteritems():
            if n in column_mapping and column_mapping[n] is not None and len(column_mapping[n]) > 0:
                mapping[k] = column_mapping[n]
        
        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)
        sl.upsert(connection, condensed_table, {'resource_id': resource_id,
                                                'table_id': table_id,
                                                'condense_time': time.time() - start,
                                                }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()
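
Nearly all of the snippets on this page share one pattern: look the record up with sl.find_one, skip it if it was already processed (unless force is set), do the work, then record the outcome with sl.upsert keyed on a unique column. The following is a minimal sketch of that pattern, assuming sl is the sqlaload helper module these projects use; the 'processed' table, the status column, and do_work are hypothetical stand-ins, not part of any example above.

import sqlaload as sl   # assumption: the 'sl' module used throughout is sqlaload

def do_work(resource_id):
    # Hypothetical processing step; stands in for condense/extract/cleanup/etc.
    return True

def process_resource(engine, resource_id, force=False):
    table = sl.get_table(engine, 'processed')
    # Skip resources we have already processed successfully.
    if not force and sl.find_one(engine, table,
                                 resource_id=resource_id,
                                 status=True) is not None:
        return
    status = do_work(resource_id)
    # Insert or update the bookkeeping row, keyed on resource_id.
    sl.upsert(engine, table,
              {'resource_id': resource_id, 'status': status},
              unique=['resource_id'])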
Example #2
def cleanup_resource(engine, source_table, row, force):
    if not row["combine_status"]:
        return

    # Skip over tables we have already cleaned up
    if (
        not force
        and sl.find_one(
            engine, source_table, resource_id=row["resource_id"], cleanup_status=True, cleanup_hash=row["combine_hash"]
        )
        is not None
    ):
        return

    log.info("Cleanup: %s, Resource %s", row["package_name"], row["resource_id"])

    status = True
    for sheet_id in range(0, row["sheets"]):
        sheet_status = cleanup_sheet(engine, row, sheet_id)
        if status and not sheet_status:
            status = False
    sl.upsert(
        engine,
        source_table,
        {"resource_id": row["resource_id"], "cleanup_hash": row["combine_hash"], "cleanup_status": status},
        unique=["resource_id"],
    )
Example #3
def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sample = sl.find_one(engine, table, source_url=url, matched=True)
    response, sio = fetch_stream(url)
    sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url, 
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s" , wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib, 
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
                    'matched': True,
                    'source_url': url,
            }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data
Example #4
def scrape_transcript(engine, url, force=False):
    wp, session = url_metadata(url)
    table = sl.get_table(engine, 'speech')
    sio = find_local(url)
    sample = {'source_etag': 'local'}
    if sio is None:
        sample = sl.find_one(engine, table, source_url=url, matched=True)
        response, sio = fetch_stream(url)
        sample = check_tags(sample or {}, response, force)
    base_data = {'source_url': url,
                 'sitzung': session,
                 'wahlperiode': wp,
                 'matched': False,
                 'loaded': False,
                 'source_etag': sample['source_etag']}
    log.info("Loading transcript: %s/%s, from %s" , wp, session, url)
    seq = 0
    parser = SpeechParser(sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib.update(base_data)
        contrib['sequence'] = seq
        sl.upsert(engine, table, contrib, 
                  unique=['source_url', 'sequence'])
        seq += 1
    if not parser.missing_recon:
        sl.upsert(engine, table, {
                    'matched': True,
                    'source_url': url,
            }, unique=['source_url'])
    else:
        raise InvalidReference()
    return base_data
Example #5
def extract_resource(engine, source_table, row, force, stats):
    if not row['retrieve_status']:
        stats.add_source('Previous step (retrieve) not complete', row)
        log.debug('Row has no retrieve status - skipping')
        return

    # Skip over tables we have already extracted
    if not force and sl.find_one(
            engine,
            source_table,
            resource_id=row['resource_id'],
            extract_status=True,
            extract_hash=row['retrieve_hash']) is not None:
        stats.add_source('Already extracted', row)
        return

    log.info("Extract: /dataset/%s/resource/%s", row['package_name'],
             row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)

    status, sheets = extract_resource_core(engine, row, stats)
    sl.upsert(engine,
              source_table, {
                  'resource_id': row['resource_id'],
                  'extract_hash': row['retrieve_hash'],
                  'extract_status': status,
                  'sheets': sheets
              },
              unique=['resource_id'])
Example #6
def load_transcript(engine, wp, session, incremental=True):
    url = URL % (wp, session)
    Speech = sl.get_table(engine, 'speech')
    if incremental and sl.find_one(engine, Speech,
        source_url=url, matched=True):
        return True
    if '404 Seite nicht gefunden' in fetch(url):
        return False
    sio = fetch_stream(url)
    if sio is None:
        return False
    log.info("Loading transcript: %s/%s" % (wp, session))
    seq = 0
    parser = SpeechParser(engine, sio)
    for contrib in parser:
        if not len(contrib['text'].strip()):
            continue
        contrib['sitzung'] = session
        contrib['sequence'] = seq
        contrib['wahlperiode'] = wp
        contrib['source_url'] = url
        contrib['matched'] = True
        sl.upsert(engine, Speech, contrib, 
                  unique=['sequence', 'sitzung', 'wahlperiode'])
        seq += 1
    if parser.missing_recon:
        sl.upsert(engine, Speech, {
                    'matched': False,
                    'sitzung': session,
                    'wahlperiode': wp
            }, unique=['sitzung', 'wahlperiode'])

    return True
Example #7
def combine_resource(engine, source_table, row, force, stats):
    if not row['extract_status']:
        stats.add_source('Previous step (extract) not complete', row)
        return

    # Skip over tables we have already combined
    if not force and sl.find_one(engine,
                                 source_table,
                                 resource_id=row['resource_id'],
                                 combine_hash=row['extract_hash'],
                                 combine_status=True) is not None:
        stats.add_source('Already combined', row)
        return

    log.info("Combine: /dataset/%s/resource/%s", row['package_name'],
             row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)

    status = combine_resource_core(engine, row, stats)
    sl.upsert(engine,
              source_table, {
                  'resource_id': row['resource_id'],
                  'combine_hash': row['extract_hash'],
                  'combine_status': status,
              },
              unique=['resource_id'])
Example #8
def cleanup_resource(engine, source_table, row, force):
    if not row['combine_status']:
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine,
                                 source_table,
                                 resource_id=row['resource_id'],
                                 cleanup_status=True,
                                 cleanup_hash=row['combine_hash']) is not None:
        return

    log.info("Cleanup: %s, Resource %s", row['package_name'],
             row['resource_id'])

    status = True
    for sheet_id in range(0, row['sheets']):
        sheet_status = cleanup_sheet(engine, row, sheet_id)
        if status and not sheet_status:
            status = False
    sl.upsert(engine,
              source_table, {
                  'resource_id': row['resource_id'],
                  'cleanup_hash': row['combine_hash'],
                  'cleanup_status': status,
              },
              unique=['resource_id'])
Example #9
def supplier_names_from_table(engine, resource_id, table_id):
    global tables_count, rows_count
    tables_count = tables_count + 1
    print "# checking table %d" % tables_count
    table_suffix = "%s_table%s" % (resource_id, table_id)

    table = sl.get_table(engine, "spending_%s" % table_suffix)
    supplier_table = sl.get_table(engine, "suppliers")

    for row in sl.all(engine, table):
        rows_count = rows_count + 1
        if not row.has_key("SupplierName"):
            # One of the junk tables that contain no real data, usually excel noise
            continue
        supplier = row["SupplierName"]

        if supplier is None or supplier == "":
            continue

        if suppliers_visited.has_key(supplier):
            continue

        suppliers_visited[supplier] = True

        if sl.find_one(engine, supplier_table, original=supplier) is not None:
            continue

        yield supplier
Example #10
def load_sessions(engine, wp='17'):
    table = sl.get_table(engine, 'webtv')
    for i in count(1):
        i = str(i)
        if sl.find_one(engine, table, wp=wp, session=i):
            continue
        if not load_agenda(engine, wp, i):
            return
Example #11
def scrape_abstimmung(engine, url, force=False):
    abstimmung = sl.get_table(engine, 'abstimmung')
    sample = sl.find_one(engine, abstimmung, source_url=url)
    response = fetch(url)
    sample = check_tags(sample or {}, response, force)

    base_data = {'source_url': url, 'source_etag': sample['source_etag']}
    handle_xml(engine, base_data, response.content)
    return base_data
Example #12
def validate_resource(engine, source_table, row, force, data_row_filter, stats,
                      stats_spending):
    if not row['cleanup_status']:
        stats.add_source('Previous step (cleanup) not complete', row)
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(
            engine,
            source_table,
            resource_id=row['resource_id'],
            validate_status=True,
            validate_hash=row['cleanup_hash']) is not None:
        stats.add_source('Already validated', row)
        return

    log.info("Validate: /dataset/%s/resource/%s", row['package_name'],
             row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)

    no_errors = True
    no_records = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_records, sheet_error_message = validate_sheet(
            engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_records and sheet_records:
            no_records = False

    if data_row_filter:
        stats.add_source(
            'Resource data filtered, not saving resource cleanup.', row)
    else:
        log.info("Result: records=%s errors=%s", not no_records, not no_errors)
        sl.upsert(engine,
                  source_table, {
                      'resource_id': row['resource_id'],
                      'validate_hash': row['cleanup_hash'],
                      'validate_status': no_errors,
                  },
                  unique=['resource_id'])
        if no_errors:
            if no_records:
                stats.add_source('No records but no errors', row)
            else:
                stats.add_source('Validated ok', row)
        else:
            if no_records:
                stats.add_source(
                    'All transactions invalid: %s' % error_message, row)
            else:
                stats.add_source(
                    'Some transactions invalid: %s' % error_message, row)
Example #13
def scrape_abstimmung(engine, url, force=False):
    abstimmung = sl.get_table(engine, 'abstimmung')
    sample = sl.find_one(engine, abstimmung, source_url=url)
    response = fetch(url)
    sample = check_tags(sample or {}, response, force)
    
    base_data = {'source_url': url, 
                 'source_etag': sample['source_etag']}
    handle_xml(engine, base_data, response.content)
    return base_data
Example #14
def load_vote(url, engine, incremental=True):
    Vote = sl.get_table(engine, 'abstimmung')
    if incremental and sl.find_one(engine, Vote, source_url=url):
        log.info("%s is done, skipping.", url)
        return
    fh, path = tempfile.mkstemp('.pdf')
    fo = open(path, 'wb')
    fo.write(fetch(url))
    fo.close()
    xml = pdftoxml(path)
    handle_xml(xml, engine, url)
Example #15
def match_beitrag(engine, beitrag, url):
    beitrag_print = make_long_name(beitrag)
    log.info("Matching: %s", beitrag_print.encode('ascii', 'replace'))
    try:
        value = resolve_person(beitrag_print)
        if sl.find_one(engine, sl.get_table(engine, 'person'),
                fingerprint=value) is None:
            make_person(engine, beitrag, value, url)
        return value
    except BadReference:
        log.info("Beitrag person is unknown: %s",
                beitrag_print.encode('ascii', 'replace'))
Example #16
def match_beitrag(engine, beitrag):
    nkp = nk_persons()
    beitrag_print = make_long_name(beitrag)
    log.info("Matching: %s", beitrag_print)
    try:
        value = match_speaker(beitrag_print)
        if sl.find_one(engine, sl.get_table(engine, 'person'),
                fingerprint=value) is None:
            make_person(beitrag, value, engine)
        return value
    except NKNoMatch, nm:
        log.info("Beitrag person is unknown: %s", beitrag_print)
        return None
Example #17
def load_item(url, engine, gremium=None):
    table = sl.get_table(engine, 'news')
    item = sl.find_one(engine, table, source_url=url)
    if item is not None:
        if gremium is not None:
            item['gremium'] = gremium['key']
            sl.upsert(engine, table, item, unique=['source_url'])
        return
    try:
        doc = _xml(url)
    except Exception, e:
        log.exception(e)
        return
Example #18
def lazyload_person(engine, indexer, fingerprint):
    if fingerprint is None or not len(fingerprint):
        return
    person = Person.query.filter_by(fingerprint=fingerprint).first()
    if person is not None:
        return person
    table = sl.get_table(engine, 'person')
    data = sl.find_one(engine, table, fingerprint=fingerprint)
    if data is None:
        log.error("Invalid fingerprint reference: %s", fingerprint)
        return
    person = load_person(engine, data)
    indexer.add(person)
    return person
Example #19
def lazyload_person(engine, indexer, fingerprint):
    if fingerprint is None or not len(fingerprint):
        return
    person = Person.query.filter_by(fingerprint=fingerprint).first()
    if person is not None:
        return person
    table = sl.get_table(engine, "person")
    data = sl.find_one(engine, table, fingerprint=fingerprint)
    if data is None:
        log.error("Invalid fingerprint reference: %s", fingerprint)
        return
    person = load_person(engine, data)
    indexer.add(person)
    return person
Example #20
def add_to_gremium(node, url, role, engine):
    key = node.get('id')
    table = sl.get_table(engine, 'gremium')
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {'key': key, 'type': 'sonstiges'}
        g['name'] = node.findtext('gremiumName')
        g['url'] = node.findtext('gremiumURL')
        sl.upsert(engine, table, g, unique=['key'])
    table = sl.get_table(engine, 'gremium_mitglieder')
    sl.upsert(engine, table, {
        'gremium_key': g['key'],
        'person_source_url': url,
        'role': role
        }, unique=['person_source_url', 'gremium_key', 'role'])
Example #21
def cleanup_resource(engine, source_table, row, force, data_row_filter, stats,
                     stats_spending):
    if not row['combine_status']:
        stats.add_source('Previous step (combine) not complete', row)
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine,
                                 source_table,
                                 resource_id=row['resource_id'],
                                 cleanup_status=True,
                                 cleanup_hash=row['combine_hash']) is not None:
        stats.add_source('Already cleaned up', row)
        return

    log.info("Cleanup: /dataset/%s/resource/%s", row['package_name'],
             row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)

    no_rows = True
    no_errors = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_has_rows, sheet_error_message = cleanup_sheet(
            engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_rows and sheet_has_rows:
            no_rows = False
    if data_row_filter:
        stats.add_source(
            'Resource data filtered, not saving resource cleanup.', row)
    else:
        sl.upsert(engine,
                  source_table, {
                      'resource_id': row['resource_id'],
                      'cleanup_hash': row['combine_hash'],
                      'cleanup_status': no_errors,
                  },
                  unique=['resource_id'])
        if no_rows:
            stats.add_source('Empty sheet', row)
        elif no_errors:
            stats.add_source('Cleaned up ok', row)
        else:
            stats.add_source(error_message, row)
Example #22
def add_to_gremium(node, url, role, engine):
    key = node.get("id")
    table = sl.get_table(engine, "gremium")
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {"key": key, "type": "sonstiges"}
        g["name"] = node.findtext("gremiumName")
        g["url"] = node.findtext("gremiumURL")
        sl.upsert(engine, table, g, unique=["key"])
    table = sl.get_table(engine, "gremium_mitglieder")
    sl.upsert(
        engine,
        table,
        {"gremium_key": g["key"], "person_source_url": url, "role": role},
        unique=["person_source_url", "gremium_key", "role"],
    )
Example #23
def resolve_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    data = sl.find_one(engine, table, source_url=source_url)
    if data is None:
        log.error("No data: %s", source_url)
        return
    subject = data['subject']
    try:
        title = resolve_votes(subject)
    except BadReference:
        title = None
        log.info("No match for: %s", data['person'])
    sl.upsert(engine, table,
              {'subject': subject,
               'title': title},
              unique=['subject'])
Example #24
def scrape_activity(ablauf, elem, engine):
    urheber = elem.findtext("URHEBER")
    fundstelle = elem.findtext("FUNDSTELLE")
    Position = sl.get_table(engine, 'position')
    p = sl.find_one(engine, Position, 
                    urheber=urheber, 
                    fundstelle=fundstelle, 
                    ablauf_id=ablauf['ablauf_id'])
    if p is not None:
        return 
    p = {'ablauf_id': ablauf['ablauf_id'], 
         'urheber': urheber,
         'fundstelle': fundstelle}
    pos_keys = p.copy()
    p['zuordnung'] = elem.findtext("ZUORDNUNG")
    p['abstrakt'] = elem.findtext("VP_ABSTRAKT")
    p['fundstelle_url'] = elem.findtext("FUNDSTELLE_LINK")
    
    Zuweisung = sl.get_table(engine, 'zuweisung')
    for zelem in elem.findall("ZUWEISUNG"):
        z = pos_keys.copy()
        z['text'] = zelem.findtext("AUSSCHUSS_KLARTEXT")
        z['federfuehrung'] = zelem.find("FEDERFUEHRUNG") is not None
        z['gremium_key'] = DIP_GREMIUM_TO_KEY.get(z['text'])
        sl.upsert(engine, Zuweisung, z, unique=[])
        
    Beschluss = sl.get_table(engine, 'beschluss')
    for belem in elem.findall("BESCHLUSS"):
        b = pos_keys.copy()
        b['seite'] = belem.findtext("BESCHLUSSSEITE")
        b['dokument_text'] = belem.findtext("BEZUGSDOKUMENT")
        b['tenor'] = belem.findtext("BESCHLUSSTENOR")
        b['grundlage'] = belem.findtext("GRUNDLAGE")
        sl.upsert(engine, Beschluss, b, unique=[])

    Referenz = sl.get_table(engine, 'referenz')
    try:
        dokument = dokument_by_url(p['fundstelle_url']) or \
            dokument_by_name(p['fundstelle'])
        dokument.update(pos_keys)
        dokument['ablauf_key'] = ablauf['key']
        dokument['wahlperiode'] = ablauf['wahlperiode']
        sl.upsert(engine, Referenz, dokument, unique=[
                'link', 'wahlperiode', 'ablauf_key', 'seiten'
                ])
    except Exception, e:
        log.exception(e)
Example #25
def validate_resource(engine, source_table, row, force, data_row_filter, stats, stats_spending):
    if not row['cleanup_status']:
        stats.add_source('Previous step (cleanup) not complete', row)
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            validate_status=True,
            validate_hash=row['cleanup_hash']) is not None:
        stats.add_source('Already validated', row)
        return

    log.info("Validate: /dataset/%s/resource/%s", row['package_name'], row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)

    no_errors = True
    no_records = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_records, sheet_error_message = validate_sheet(engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_records and sheet_records:
            no_records = False
    
    if data_row_filter:
        stats.add_source('Resource data filtered, not saving resource cleanup.', row)
    else:
        log.info("Result: records=%s errors=%s", not no_records, not no_errors)
        sl.upsert(engine, source_table, {
            'resource_id': row['resource_id'],
            'validate_hash': row['cleanup_hash'],
            'validate_status': no_errors,
            }, unique=['resource_id'])
        if no_errors:
            if no_records:
                stats.add_source('No records but no errors', row)
            else:
                stats.add_source('Validated ok', row)
        else:
            if no_records:
                stats.add_source('All transactions invalid: %s' % error_message, row)
            else:
                stats.add_source('Some transactions invalid: %s' % error_message, row)
Example #26
def add_to_gremium(node, url, role, engine):
    key = node.get('id')
    table = sl.get_table(engine, 'gremium')
    g = sl.find_one(engine, table, key=key)
    if g is None:
        g = {'key': key, 'type': 'sonstiges'}
        g['name'] = node.findtext('gremiumName')
        g['url'] = node.findtext('gremiumURL')
        sl.upsert(engine, table, g, unique=['key'])
    table = sl.get_table(engine, 'gremium_mitglieder')
    sl.upsert(engine,
              table, {
                  'gremium_key': g['key'],
                  'person_source_url': url,
                  'role': role
              },
              unique=['person_source_url', 'gremium_key', 'role'])
Example #27
def scrape_gremium(engine, url, force=False):
    table = sl.get_table(engine, 'gremium')
    response, doc = _xml(url)
    a = sl.find_one(engine, table, source_url=url)
    if a is None:
        a = {'source_url': url}
    a = check_tags(a, response, force)
    a['key'] = doc.findtext('/ausschussId')
    a['name'] = doc.findtext('/ausschussName')
    log.info("Ausschuss (%s): %s" % (a['key'], a['name']))
    a['aufgabe'] = doc.findtext('/ausschussAufgabe')
    a['image_url'] = doc.findtext('/ausschussBildURL')
    a['image_copyright'] = doc.findtext('/ausschussCopyright')
    a['rss_url'] = GREMIUM_RSS_FEEDS.get(a['key'])
    a['url'] = URL_PATTERN % a['key']
    a['type'] = 'ausschuss'
    sl.upsert(engine, table, a, unique=['key'])
    return a
Example #28
def resolve_abstimmung(engine, source_url):
    table = sl.get_table(engine, 'abstimmung')
    data = sl.find_one(engine, table, source_url=source_url)
    if data is None:
        log.error("No data: %s", source_url)
        return
    subject = data['subject']
    try:
        title = resolve_votes(subject)
    except BadReference:
        title = None
        log.info("No match for: %s", data['person'])
    sl.upsert(engine,
              table, {
                  'subject': subject,
                  'title': title
              },
              unique=['subject'])
Example #29
def combine_resource(engine, source_table, row, force):
    if not row['extract_status']:
        return

    # Skip over tables we have already combined
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            combine_hash=row['extract_hash'],
            combine_status=True) is not None:
        return

    log.info("Combine: %s, Resource %s", row['package_name'], row['resource_id'])

    status = combine_resource_core(engine, row)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'combine_hash': row['extract_hash'],
        'combine_status': status,
        }, unique=['resource_id'])
Example #30
def format(engine, resource_id, table_id, force):
    connection = engine.connect()
    trans = connection.begin()
    condensed_table = sl.get_table(connection, 'condensed')
    condensed_row = sl.find_one(connection, condensed_table, resource_id=resource_id, table_id=table_id)
    if condensed_row is None:
        condensed_row = {'resource_id': resource_id,
                         'table_id': table_id,
                         }
    start = time.time()
    try:
        if not force and condensed_row['format_time'] is not None:
            return
        do_format(connection, resource_id, table_id)
        condensed_row['format_time'] = time.time() - start
        sl.upsert(connection, condensed_table, condensed_row, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()
Example #31
def load_sitzung(engine, indexer, wp, session):
    table = sl.get_table(engine, 'webtv_speech')
    data = sl.find_one(engine, table, wp=wp, session=session)
    log.info("Loading Sitzung: %s/%s...", wp, session)
    sitzung = Sitzung.query.filter_by(wahlperiode=wp, nummer=session).first()
    if sitzung is None:
        sitzung = Sitzung()
        sitzung.wahlperiode = wp
        sitzung.nummer = session
    sitzung.titel = data.get('session_name')
    sitzung.date = to_date(data.get('session_date'))
    sitzung.source_url = data.get('session_url')

    db.session.add(sitzung)
    db.session.flush()
    indexer.add(sitzung)

    load_debatten(engine, indexer, sitzung)
    db.session.commit()
    return sitzung
Example #32
def cleanup_resource(engine, source_table, row, force, data_row_filter, stats, stats_spending):
    if not row['combine_status']:
        stats.add_source('Previous step (combine) not complete', row)
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            cleanup_status=True,
            cleanup_hash=row['combine_hash']) is not None:
        stats.add_source('Already cleaned up', row)
        return

    log.info("Cleanup: /dataset/%s/resource/%s", row['package_name'], row['resource_id'])
    if not data_row_filter:
        clear_issues(engine, row['resource_id'], STAGE)

    no_rows = True
    no_errors = True
    error_message = None
    for sheet_id in range(0, row['sheets']):
        sheet_has_rows, sheet_error_message = cleanup_sheet(engine, row, sheet_id, data_row_filter, stats_spending)
        if no_errors and sheet_error_message:
            no_errors = False
            error_message = sheet_error_message
        if no_rows and sheet_has_rows:
            no_rows = False
    if data_row_filter:
        stats.add_source('Resource data filtered, not saving resource cleanup.', row)
    else:
        sl.upsert(engine, source_table, {
            'resource_id': row['resource_id'],
            'cleanup_hash': row['combine_hash'],
            'cleanup_status': no_errors,
            }, unique=['resource_id'])
        if no_rows:
            stats.add_source('Empty sheet', row)
        elif no_errors:
            stats.add_source('Cleaned up ok', row)
        else:
            stats.add_source(error_message, row)
Example #33
def lookup(val, engine):
    supplier_table = sl.get_table(engine, 'supplier')
    data = sl.find_one(engine, supplier_table, name=val)
    if data is not None:
        return data['canonical'], data['uri'], data['score']
    try:
        query = json.dumps({'query': val, 'limit': 1})
        res = session.get('http://opencorporates.com/reconcile/gb',
                          params={'query': query})
        data = {'name': val, 'canonical': None, 'uri': None, 'score': 0}
        if res.ok and res.json and len(res.json.get('result')):
            r = res.json.get('result').pop()
            data['canonical'] = r['name']
            data['uri'] = r['uri']
            data['score'] = r['score']
        log.info('OpenCorporates Lookup: %s -> %s', val, data['canonical'])
        sl.upsert(engine, supplier_table, data, unique=['name'])
        return data['canonical'], data['uri'], data['score']
    except Exception, ex:
        log.exception(ex)
        return None, None, None
Example #34
def lookup(val, engine):
    supplier_table = sl.get_table(engine, 'supplier')
    data = sl.find_one(engine, supplier_table, name=val)
    if data is not None:
        return data['canonical'], data['uri'], data['score']
    try:
        query = json.dumps({'query': val, 'limit': 1})
        res = session.get('http://opencorporates.com/reconcile/gb',
                params={'query': query})
        data = {'name': val, 'canonical': None, 'uri': None, 'score': 0}
        if res.ok and res.json and len(res.json.get('result')):
            r = res.json.get('result').pop()
            data['canonical'] = r['name']
            data['uri'] = r['uri']
            data['score'] = r['score']
        log.info('OpenCorporates Lookup: %s -> %s', val, data['canonical'])
        sl.upsert(engine, supplier_table, data, unique=['name'])
        return data['canonical'], data['uri'], data['score']
    except Exception, ex:
        log.exception(ex)
        return None, None, None
Example #35
def extract_resource(engine, source_table, row, force):
    if not row['retrieve_status']:
        log.debug('Row has no retrieve status - skipping')
        return

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            extract_status=True,
            extract_hash=row['retrieve_hash']) is not None:
        return

    log.info("Extracting: %s, File %s", row['package_name'], row['resource_id'])

    status, sheets = extract_resource_core(engine, row)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'extract_hash': row['retrieve_hash'],
        'extract_status': status,
        'sheets': sheets
        }, unique=['resource_id'])
Example #36
def combine_resource(engine, source_table, row, force, stats):
    if not row['extract_status']:
        stats.add_source('Previous step (extract) not complete', row)
        return

    # Skip over tables we have already combined
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            combine_hash=row['extract_hash'],
            combine_status=True) is not None:
        stats.add_source('Already combined', row)
        return

    log.info("Combine: /dataset/%s/resource/%s", row['package_name'], row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)

    status = combine_resource_core(engine, row, stats)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'combine_hash': row['extract_hash'],
        'combine_status': status,
        }, unique=['resource_id'])
Example #37
def canonical_actor(grano, engine, title):
    entity_table = sl.get_table(engine, 'entity')
    res = sl.find_one(engine, entity_table, etlFingerPrint=title)
    if res is not None and \
        'canonicalName' in res and \
        res['canonicalName'] and \
        title != res['canonicalName']:
        if title not in DELETED:
            nonCanon = grano.findEntity(ACTOR['name'], title=title)
            if nonCanon:
                ent = grano.getEntity(nonCanon['id'], deep=True)
                for rel in ent.get('incoming', []):
                    grano.deleteRelation(rel)
                for rel in ent.get('outgoing', []):
                    grano.deleteRelation(rel)
                grano.deleteEntity(nonCanon)
                DELETED.add(title)
        title = res['canonicalName']
    act = grano.findEntity(ACTOR['name'], title=title) or {}
    print [title], act.get('id')
    act['title'] = title
    act['type'] = ACTOR['name']
    return act
Example #38
def validate_resource(engine, source_table, row, force):
    if not row['cleanup_status']:
        return

    # Skip over tables we have already cleaned up
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            validate_status=True,
            validate_hash=row['cleanup_hash']) is not None:
        return

    log.info("Validate: %s, Resource %s", row['package_name'], row['resource_id'])

    status = True
    for sheet_id in range(0, row['sheets']):
        sheet_status = validate_sheet(engine, row, sheet_id)
        if status and not sheet_status:
            status = False
    log.info("Result: %s", status)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'validate_hash': row['cleanup_hash'],
        'validate_status': status,
        }, unique=['resource_id'])
Example #39
def extract_resource(engine, source_table, row, force, stats):
    if not row['retrieve_status']:
        stats.add_source('Previous step (retrieve) not complete', row)
        log.debug('Row has no retrieve status - skipping')
        return

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, source_table,
            resource_id=row['resource_id'],
            extract_status=True,
            extract_hash=row['retrieve_hash']) is not None:
        stats.add_source('Already extracted', row)
        return

    log.info("Extract: /dataset/%s/resource/%s", row['package_name'], row['resource_id'])
    clear_issues(engine, row['resource_id'], STAGE)

    status, sheets = extract_resource_core(engine, row, stats)
    sl.upsert(engine, source_table, {
        'resource_id': row['resource_id'],
        'extract_hash': row['retrieve_hash'],
        'extract_status': status,
        'sheets': sheets
        }, unique=['resource_id'])
Example #40
def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')

    key = int(url.rsplit('/', 1)[-1].split('.')[0])
    a = sl.find_one(engine, Ablauf, source_url=url)
    if a is not None and a['abgeschlossen'] and not force:
        raise Unmodified()
    response = fetch(url)
    a = check_tags(a or {}, response, force)
    a.update({'key': key, 
              'source_url': url})
    doc = inline_xml_from_page(response.content, url)
    if doc is None: 
        raise NoContentException()
    
    a['wahlperiode'] = int(doc.findtext("WAHLPERIODE"))
    a['typ'] = doc.findtext("VORGANGSTYP")
    a['titel'] = doc.findtext("TITEL")

    if not a['titel'] or not len(a['titel'].strip()):
        raise NoContentException()

    if '\n' in a['titel']:
        t, k = a['titel'].rsplit('\n', 1)
        k = k.strip()
        if k.startswith('KOM') or k.startswith('SEK'):
            a['titel'] = t

    a['initiative'] = doc.findtext("INITIATIVE")
    a['stand'] = doc.findtext("AKTUELLER_STAND")
    a['signatur'] = doc.findtext("SIGNATUR")
    a['gesta_id'] = doc.findtext("GESTA_ORDNUNGSNUMMER")
    a['eu_dok_nr'] = doc.findtext("EU_DOK_NR")
    a['abstrakt'] = doc.findtext("ABSTRAKT")
    a['sachgebiet'] = doc.findtext("SACHGEBIET")
    a['zustimmungsbeduerftig'] = doc.findtext("ZUSTIMMUNGSBEDUERFTIGKEIT")
    #a.schlagworte = []
    Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in doc.findall("SCHLAGWORT"):
        wort = {'wort': sw.text, 'source_url': url}
        sl.upsert(engine, Schlagwort, wort, unique=wort.keys())
    log.info("Ablauf %s: %s", url, a['titel'].encode('ascii', 'replace'))
    a['titel'] = a['titel'].strip().lstrip('.').strip()
    a = expand_dok_nr(a)
    a['abgeschlossen'] = DIP_ABLAUF_STATES_FINISHED.get(a['stand'], False)
    if 'Originaltext der Frage(n):' in a['abstrakt']:
        _, a['abstrakt'] = a['abstrakt'].split('Originaltext der Frage(n):', 1)

    Referenz = sl.get_table(engine, 'referenz')
    for elem in doc.findall("WICHTIGE_DRUCKSACHE"):
        link = elem.findtext("DRS_LINK")
        hash = None
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("DRS_HERAUSGEBER"), 
                'drs', elem.findtext("DRS_NUMMER"), link=link)
        dokument['text'] = elem.findtext("DRS_TYP")
        dokument['seiten'] = hash
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument, unique=[
            'link', 'source_url', 'seiten'
            ])

    for elem in doc.findall("PLENUM"):
        link = elem.findtext("PLPR_LINK")
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("PLPR_HERAUSGEBER"), 
                'plpr', elem.findtext("PLPR_NUMMER"), link=link)
        dokument['text'] = elem.findtext("PLPR_KLARTEXT")
        dokument['seiten'] = elem.findtext("PLPR_SEITEN")
        dokument['source_url'] = url
        sl.upsert(engine, Referenz, dokument, unique=[
            'link', 'source_url', 'seiten'
            ])

    sl.upsert(engine, Ablauf, a, unique=['source_url'])
    for elem in doc.findall(".//VORGANGSPOSITION"):
        scrape_activity(engine, url, elem)
    return a
Example #41
                'link', 'source_url', 'seiten'
                ])
    except Exception, e:
        log.exception(e)

    sl.upsert(engine, Position, p, unique=[])
    Person = sl.get_table(engine, 'person')
    Beitrag = sl.get_table(engine, 'beitrag')
    for belem in elem.findall("PERSOENLICHER_URHEBER"):
        b = pos_keys.copy()
        b['vorname'] = belem.findtext("VORNAME")
        b['nachname'] = belem.findtext("NACHNAME")
        b['funktion'] = belem.findtext("FUNKTION")
        b['ort'] = belem.findtext('WAHLKREISZUSATZ')
        p = sl.find_one(engine, Person, 
                vorname=b['vorname'],
                nachname=b['nachname'],
                ort=b['ort'])
        if p is not None:
            b['person_source_url'] = p['source_url']
        b['ressort'] = belem.findtext("RESSORT")
        b['land'] = belem.findtext("BUNDESLAND")
        b['fraktion'] = FACTION_MAPS.get(belem.findtext("FRAKTION"), 
            belem.findtext("FRAKTION"))
        b['seite'] = belem.findtext("SEITE")
        b['art'] = belem.findtext("AKTIVITAETSART")
        sl.upsert(engine, Beitrag, b, unique=[])

class NoContentException(Exception): pass

def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')
Example #42
def lazyload_gremium(engine, key):
    table = sl.get_table(engine, 'gremium')
    data = sl.find_one(engine, table, key=key)
    if data is None:
        return None
    return load_gremium(engine, data)
Example #43
def check_done(engine, url, force=False):
    if force:
        return False
    table = sl.get_table(engine, 'speech')
    data = sl.find_one(engine, table, source_url=url, loaded=True)
    return data is not None
Example #44
def scrape_mdb(engine, url, force=False):
    response, doc = _xml(url)
    id = int(doc.findtext('//mdbID'))
    table_person = sl.get_table(engine, 'person')
    table_rolle = sl.get_table(engine, 'rolle')
    p = sl.find_one(engine, table_person, mdb_id=id)
    if p is None:
        p = {'source_url': url}
    else:
        r_ = sl.find_one(engine, table_rolle, mdb_id=id, funktion='MdB')
        if r_ is not None:
            r = r_
    p = check_tags(p, response, force)
    r = {'person_source_url': url, 'funktion': 'MdB'}

    r['mdb_id'] = p['mdb_id'] = id
    r['status'] = doc.find('//mdbID').get('status')
    if doc.findtext('//mdbAustrittsdatum'):
        r['austritt'] = datetime.strptime(doc.findtext('//mdbAustrittsdatum'),
                                          '%d.%m.%Y').isoformat()
    p['vorname'] = doc.findtext('//mdbVorname')
    p['nachname'] = doc.findtext('//mdbZuname')
    p['adelstitel'] = doc.findtext('//mdbAdelstitel')
    p['titel'] = doc.findtext('//mdbAkademischerTitel')
    p['ort'] = doc.findtext('//mdbOrtszusatz')
    message = 'MdB: %s %s (%s)' % (p['vorname'], p['nachname'], p['ort'])
    log.info(message.encode('ascii', 'replace'))
    p['geburtsdatum'] = doc.findtext('//mdbGeburtsdatum')
    p['religion'] = doc.findtext('//mdbReligionKonfession')
    p['hochschule'] = doc.findtext('//mdbHochschulbildung')
    p['beruf'] = doc.findtext('//mdbBeruf')
    p['berufsfeld'] = doc.find('//mdbBeruf').get('berufsfeld')
    p['geschlecht'] = doc.findtext('//mdbGeschlecht')
    p['familienstand'] = doc.findtext('//mdbFamilienstand')
    p['kinder'] = doc.findtext('//mdbAnzahlKinder')
    r['fraktion'] = doc.findtext('//mdbFraktion')
    p['fraktion'] = doc.findtext('//mdbFraktion')
    p['partei'] = doc.findtext('//mdbPartei')
    p['land'] = doc.findtext('//mdbLand')
    r['gewaehlt'] = doc.findtext('//mdbGewaehlt')
    p['bio_url'] = doc.findtext('//mdbBioURL')
    p['bio'] = doc.findtext('//mdbBiografischeInformationen')
    p['wissenswertes'] = doc.findtext('//mdbWissenswertes')
    p['homepage_url'] = doc.findtext('//mdbHomepageURL')
    p['telefon'] = doc.findtext('//mdbTelefon')
    p['angaben'] = doc.findtext('//mdbVeroeffentlichungspflichtigeAngaben')
    p['foto_url'] = doc.findtext('//mdbFotoURL')
    p['foto_copyright'] = doc.findtext('//mdbFotoCopyright')
    p['reden_plenum_url'] = doc.findtext('//mdbRedenVorPlenumURL')
    p['reden_plenum_rss_url'] = doc.findtext('//mdbRedenVorPlenumRSS')

    p['wk_nummer'] = doc.findtext('//mdbWahlkreisNummer')
    p['wk_name'] = doc.findtext('//mdbWahlkreisName')
    p['wk_url'] = doc.findtext('//mdbWahlkreisURL')

    for website in doc.findall('//mdbSonstigeWebsite'):
        type_ = website.findtext('mdbSonstigeWebsiteTitel')
        ws_url = website.findtext('mdbSonstigeWebsiteURL')
        if type_.lower() == 'twitter':
            p['twitter_url'] = ws_url
        if type_.lower() == 'facebook':
            p['facebook_url'] = ws_url

    if doc.findtext('//mdbBundestagspraesident'):
        sl.upsert(engine,
                  table_rolle, {
                      'person_source_url': url,
                      'funktion': u'Bundestagspräsident',
                  },
                  unique=['person_source_url', 'funktion'])
    if doc.findtext('//mdbBundestagsvizepraesident'):
        sl.upsert(engine,
                  table_rolle, {
                      'person_source_url': url,
                      'funktion': u'Bundestagsvizepräsident',
                  },
                  unique=['person_source_url', 'funktion'])

    for n in doc.findall('//mdbObleuteGremium'):
        add_to_gremium(n, url, 'obleute', engine)

    for n in doc.findall('//mdbVorsitzGremium'):
        add_to_gremium(n, url, 'vorsitz', engine)

    for n in doc.findall('//mdbStellvertretenderVorsitzGremium'):
        add_to_gremium(n, url, 'stellv_vorsitz', engine)

    for n in doc.findall('//mdbVorsitzSonstigesGremium'):
        add_to_gremium(n, url, 'vorsitz', engine)

    for n in doc.findall('//mdbStellvVorsitzSonstigesGremium'):
        add_to_gremium(n, url, 'stellv_vorsitz', engine)

    for n in doc.findall('//mdbOrdentlichesMitgliedGremium'):
        add_to_gremium(n, url, 'mitglied', engine)

    for n in doc.findall('//mdbStellvertretendesMitgliedGremium'):
        add_to_gremium(n, url, 'stellv_mitglied', engine)

    for n in doc.findall('//mdbOrdentlichesMitgliedSonstigesGremium'):
        add_to_gremium(n, url, 'mitglied', engine)

    for n in doc.findall('//mdbStellvertretendesMitgliedSonstigesGremium'):
        add_to_gremium(n, url, 'stellv_mitglied', engine)

    sl.upsert(engine, table_person, p, unique=['source_url'])
    sl.upsert(engine, table_rolle, r, unique=['person_source_url', 'funktion'])
    return p
Example #45
    for res in pkg['resources']:
        log.info(" > Resource %s: %s", res['id'], res['url'])
        data = {
            'resource_id': res['id'],
            'package_id': pkg['id'],
            'package_name': pkg['name'],
            'package_title': pkg['title'],
            'last_modified': res.get('last_modified'),
            'url': res['url'],
            'publisher_name': group.get('name'),
            'publisher_title': group.get('title'),
            'publisher_type': group.get('type'),
            'format': res['format'],
            'description': res['description']
            }
        row = sl.find_one(engine, table, resource_id=pkg['id'])
        if row and row['url'] != pkg['url']:
            # url has changed, so force retrieval next time
            data['retrieve_status'] = False
        sl.upsert(engine, table, data, ['resource_id'])

def connect():
    engine = db_connect()
    src_table = sl.get_table(engine, 'source')
    return engine, src_table

def build_index(department_filter=None):
    '''Searches CKAN for spending resources and writes their metadata to
    the database.'''
    engine, table = connect()
    client = ckan_client()
Example #46
        sl.add_row(engine, Referenz, dokument)
    except Exception, e:
        log.exception(e)

    sl.add_row(engine, Position, p)
    Person = sl.get_table(engine, 'person')
    Beitrag = sl.get_table(engine, 'beitrag')
    for belem in elem.findall("PERSOENLICHER_URHEBER"):
        b = pos_keys.copy()
        b['vorname'] = belem.findtext("VORNAME")
        b['nachname'] = belem.findtext("NACHNAME")
        b['funktion'] = belem.findtext("FUNKTION")
        b['ort'] = belem.findtext('WAHLKREISZUSATZ')
        p = sl.find_one(engine,
                        Person,
                        vorname=b['vorname'],
                        nachname=b['nachname'],
                        ort=b['ort'])
        if p is not None:
            b['person_source_url'] = p['source_url']
        b['ressort'] = belem.findtext("RESSORT")
        b['land'] = belem.findtext("BUNDESLAND")
        b['fraktion'] = FACTION_MAPS.get(belem.findtext("FRAKTION"),
                                         belem.findtext("FRAKTION"))
        b['seite'] = belem.findtext("SEITE")
        b['art'] = belem.findtext("AKTIVITAETSART")
        sl.add_row(engine, Beitrag, b)


class NoContentException(Exception):
    pass
Example #47
def fetch_row(engine, table_name, **kw):
    table = sl.get_table(engine, table_name)
    return sl.find_one(engine, table, **kw)
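
A hypothetical call to this wrapper, using a table and column that appear in the other examples on this page, might look like:

row = fetch_row(engine, 'person', fingerprint=fingerprint)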
Example #48
def scrape_ablauf(engine, url, force=False):
    Ablauf = sl.get_table(engine, 'ablauf')

    key = int(url.rsplit('/', 1)[-1].split('.')[0])
    a = sl.find_one(engine, Ablauf, source_url=url)
    if a is not None and a['abgeschlossen'] and not force:
        raise Unmodified()
    response = fetch(url)
    a = check_tags(a or {}, response, force)
    a.update({'key': key, 'source_url': url})
    doc = inline_xml_from_page(response.content, url)
    if doc is None:
        raise NoContentException()

    a['wahlperiode'] = int(doc.findtext("WAHLPERIODE"))
    a['typ'] = doc.findtext("VORGANGSTYP")
    a['titel'] = doc.findtext("TITEL")

    if not a['titel'] or not len(a['titel'].strip()):
        raise NoContentException()

    if '\n' in a['titel']:
        t, k = a['titel'].rsplit('\n', 1)
        k = k.strip()
        if k.startswith('KOM') or k.startswith('SEK'):
            a['titel'] = t

    a['initiative'] = doc.findtext("INITIATIVE")
    a['stand'] = doc.findtext("AKTUELLER_STAND")
    a['signatur'] = doc.findtext("SIGNATUR")
    a['gesta_id'] = doc.findtext("GESTA_ORDNUNGSNUMMER")
    a['eu_dok_nr'] = doc.findtext("EU_DOK_NR")
    a['abstrakt'] = doc.findtext("ABSTRAKT")
    a['sachgebiet'] = doc.findtext("SACHGEBIET")
    a['zustimmungsbeduerftig'] = doc.findtext("ZUSTIMMUNGSBEDUERFTIGKEIT")
    #a.schlagworte = []
    Schlagwort = sl.get_table(engine, 'schlagwort')
    for sw in doc.findall("SCHLAGWORT"):
        wort = {'wort': sw.text, 'source_url': url}
        sl.upsert(engine, Schlagwort, wort, unique=wort.keys())
    log.info("Ablauf %s: %s", url, a['titel'].encode('ascii', 'replace'))
    a['titel'] = a['titel'].strip().lstrip('.').strip()
    a = expand_dok_nr(a)
    a['abgeschlossen'] = DIP_ABLAUF_STATES_FINISHED.get(a['stand'], False)

    if a['wahlperiode'] != max(app.config.get('WAHLPERIODEN')):
        a['abgeschlossen'] = True

    if 'Originaltext der Frage(n):' in a['abstrakt']:
        _, a['abstrakt'] = a['abstrakt'].split('Originaltext der Frage(n):', 1)

    sl.delete(engine, sl.get_table(engine, 'position'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beitrag'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'zuweisung'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'beschluss'), source_url=url)
    sl.delete(engine, sl.get_table(engine, 'referenz'), source_url=url)

    for elem in doc.findall(".//VORGANGSPOSITION"):
        scrape_activity(engine, url, elem)

    Referenz = sl.get_table(engine, 'referenz')
    for elem in doc.findall("WICHTIGE_DRUCKSACHE"):
        link = elem.findtext("DRS_LINK")
        hash = None
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("DRS_HERAUSGEBER"),
                                  'drs',
                                  elem.findtext("DRS_NUMMER"),
                                  link=link)
        dokument['text'] = elem.findtext("DRS_TYP")
        dokument['seiten'] = hash
        dokument['source_url'] = url
        sl.upsert(engine,
                  Referenz,
                  dokument,
                  unique=['link', 'source_url', 'seiten'])

    for elem in doc.findall("PLENUM"):
        link = elem.findtext("PLPR_LINK")
        if link is not None and '#' in link:
            link, hash = link.rsplit('#', 1)
        dokument = dokument_by_id(elem.findtext("PLPR_HERAUSGEBER"),
                                  'plpr',
                                  elem.findtext("PLPR_NUMMER"),
                                  link=link)
        dokument['text'] = elem.findtext("PLPR_KLARTEXT")
        dokument['seiten'] = elem.findtext("PLPR_SEITEN")
        dokument['source_url'] = url
        sl.upsert(engine,
                  Referenz,
                  dokument,
                  unique=['link', 'source_url', 'seiten'])

    sl.upsert(engine, Ablauf, a, unique=['source_url'])
    return a