コード例 #1
0
ファイル: categories.py プロジェクト: tttp/lobbyfacts
def code_categories(engine):
    """Fill in ``main_category_id`` for each distinct main category.

    Every distinct, non-empty ``main_category`` of the ``representative``
    table is translated through ``newcats`` (values without an entry there
    are used unchanged) and then looked up in ``CATEGORIES``. The resulting
    id is written back with an upsert keyed on ``main_category``.

    A translated name that is missing from ``CATEGORIES`` propagates the
    lookup error.
    """
    representatives = sl.get_table(engine, 'representative')
    for row in sl.distinct(engine, representatives, 'main_category'):
        label = row['main_category']
        if label:
            canonical = newcats.get(label, label)
            row['main_category_id'] = CATEGORIES[canonical]
            sl.upsert(engine, representatives, row, ['main_category'])
コード例 #2
0
ファイル: geocode.py プロジェクト: tttp/lobbyfacts
def transform(engine):
    """Geo-code representatives that do not have a longitude yet.

    For every row of the ``representative`` table without a ``contact_lon``
    value, the contact address fields are sent as a query to ``URL``. The
    first result (if any) is stored in ``contact_geoname``, ``contact_lon``
    and ``contact_lat`` via an upsert keyed on ``id``.

    Responses whose body cannot be decoded as JSON are skipped.
    """
    log.info("Geo-coding representatives...")
    table = sl.get_table(engine, 'representative')
    for row in sl.all(engine, table):
        if row.get('contact_lon'):
            # Already has coordinates; nothing to do.
            continue
        query = {
            'format': 'json',
            'limit': 1,
            'city': row.get('contact_town'),
            'street': row.get('contact_street'),
            'country': row.get('contact_country'),
            'postalcode': row.get('contact_post_code'),
        }
        response = requests.get(URL, params=query)
        try:
            results = response.json()
        except ValueError:
            # Only an undecodable body is skipped. A bare ``except`` here
            # would also hide KeyboardInterrupt and genuine programming
            # errors.
            continue
        if not results:
            continue
        geo = results[0]
        log.info("%s @ %s", row.get('name'), geo.get('display_name'))
        out = {
            'id': row['id'],
            'contact_geoname': geo.get('display_name'),
            'contact_lon': geo.get('lon'),
            'contact_lat': geo.get('lat'),
        }
        sl.upsert(engine, table, out, ['id'])
コード例 #3
0
ファイル: reginterests.py プロジェクト: stef/lobbyfacts
def load_person(person, role, childBase, engine):
    """Upsert one person record linked to a representative.

    The stored record is a copy of ``childBase`` overlaid with ``person``,
    plus the given ``role`` and a ``name`` built by joining title, first
    name and last name with single spaces (missing parts become empty
    strings). The upsert is keyed on representative ETL id, role and name.
    """
    people = sl.get_table(engine, "person")
    record = childBase.copy()
    record.update(person)
    record["role"] = role
    name_parts = [person[key] or "" for key in ("title", "first_name", "last_name")]
    record["name"] = " ".join(name_parts)
    sl.upsert(engine, people, record, ["representative_etl_id", "role", "name"])
コード例 #4
0
ファイル: reginterests.py プロジェクト: stef/lobbyfacts
def load_contact(contact, childBase, engine):
    """Upsert a contact record for a representative.

    Nothing is written when ``contact`` equals the empty dict. Otherwise
    the record is a copy of ``childBase`` overlaid with ``contact`` and is
    upserted keyed on representative ETL id, country and type.
    """
    if contact != {}:
        contacts = sl.get_table(engine, "contact")
        record = childBase.copy()
        record.update(contact)
        sl.upsert(engine, contacts, record,
                  ["representative_etl_id", "country", "type"])
コード例 #5
0
ファイル: unreginterest.py プロジェクト: stef/lobbyfacts
def load_rep(line, engine, unregtag):
    """Upsert one unregistered representative from a CSV row and tag it.

    ``line`` is indexed positionally: 0 = name, 1 = identification code
    (a truncated SHA-512 of the name is used when empty), 2 = web site,
    3 = street, 4 = "<prefix>zip town" string. The row is upserted into
    ``representative`` keyed on ``etl_id``; if it can be read back, a link
    to ``unregtag`` is upserted into ``tags``.
    """
    name = line[0].strip()
    code = line[1] or hashlib.sha512(name).hexdigest()[:16]
    rep = {
        'original_name': name,
        'name': name,
        'identification_code': code,
        'etl_id': "%s//ALL" % code,
        'web_site_url': line[2] or '',
    }

    if line[3].strip():
        rep['contact_street'] = line[3]

    if line[4].strip():
        parts = line[4].split()
        # The first character of the zip token selects the country.
        country = {'B': 'Belgium', 'F': 'France'}.get(parts[0][0])
        if country is None:
            print(' '.join(('bad zipcode country code', line[4])))
        else:
            rep['contact_country'] = country
        # The first two characters of the zip token are dropped regardless
        # of whether the country prefix was recognised.
        rep['contact_post_code'] = parts[0][2:]
        rep['contact_town'] = ' '.join(parts[1:])

    rep['network_extracted'] = False
    sl.upsert(engine, sl.get_table(engine, 'representative'), rep, ['etl_id'])

    # Read the row back (matching on every field just written) to get its id.
    inserted = sl.find_one(engine, sl.get_table(engine, 'representative'), **rep)
    if inserted:
        link = {'representative_id': inserted['id'], 'tag_id': unregtag['id']}
        sl.upsert(engine, sl.get_table(engine, 'tags'), link,
                  ['representative_id', 'tag_id'])
コード例 #6
0
ファイル: meetings.py プロジェクト: tttp/lobbyfacts
def extract(engine):
    """Scrape meetings for every ``(title, url)`` in ``uuids`` and upsert them.

    Each scraped meeting is upserted into the ``meeting`` table keyed on
    ``meetid`` and ``identification_code``. Progress is logged after every
    100th meeting.
    """
    meetings_table = sl.get_table(engine, 'meeting')

    # Lazily flatten all scraped meetings so a single running count can be
    # kept across URLs.
    scraped = (meeting
               for title, url in uuids
               for meeting in scrape(url, title))
    for count, meeting in enumerate(scraped, 1):
        sl.upsert(engine, meetings_table, meeting,
                  ['meetid', 'identification_code'])
        if count % 100 == 0:
            log.info("Extracted: %s...", count)
コード例 #7
0
ファイル: reginterests.py プロジェクト: tttp/lobbyfacts
def load_person(person, role, childBase, engine):
    """Store a person attached to a representative.

    Starts from a copy of ``childBase``, overlays the ``person`` fields,
    sets ``role`` and composes ``name`` as "title first_name last_name"
    (empty string for any missing part). The record is upserted into the
    ``person`` table keyed on representative ETL id, role and name.
    """
    table = sl.get_table(engine, 'person')
    title = person['title'] or ''
    first_name = person['first_name'] or ''
    last_name = person['last_name'] or ''

    record = childBase.copy()
    record.update(person)
    record['role'] = role
    record['name'] = ' '.join([title, first_name, last_name])

    unique_keys = ['representative_etl_id', 'role', 'name']
    sl.upsert(engine, table, record, unique_keys)
コード例 #8
0
ファイル: dedup.py プロジェクト: erikwesselius/lobbyfacts
def dedup_fields(engine, field):
    """Mark representatives that share the same value in ``field``.

    For each representative, all rows with the same ``field`` value are
    fetched. When more than one exists, every one of them gets a
    ``name_suffix`` of the form "(Duplicate N)" (N counted from 1), written
    with an upsert keyed on ``identification_code``.
    """
    table = sl.get_table(engine, 'representative')
    for rep in sl.all(engine, table):
        matches = list(sl.find(engine, table, **{field: rep[field]}))
        if len(matches) <= 1:
            continue
        log.info("Duplicates for: %s", rep['name'])
        for position, duplicate in enumerate(matches, 1):
            suffix = "(Duplicate %s)" % position
            patch = {'name_suffix': suffix,
                     'identification_code': duplicate['identification_code']}
            sl.upsert(engine, table, patch, ['identification_code'])
コード例 #9
0
ファイル: unreginterest.py プロジェクト: stef/lobbyfacts
def extract_data(engine):
    """Load the bundled CSV of unregistered companies.

    Ensures the ``situation:unregistered`` tag exists, reads it back, and
    then passes every CSV row to ``load_rep`` together with that tag.
    Progress is logged on every 100th row index, including index 0.
    """
    log.info("Extracting unregistered interests data...")
    taglabel = 'situation:unregistered'
    sl.upsert(engine, sl.get_table(engine, 'tag'), {'tag': taglabel}, ['tag'])
    # Re-read the tag so the stored row (with its id) is handed to load_rep.
    unregtag = sl.find_one(engine, sl.get_table(engine, 'tag'), tag=taglabel)

    with app.open_resource('resources/unregistered-companies.csv') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        for index, row in enumerate(rows):
            load_rep(row, engine, unregtag)
            if not index % 100:
                log.info("Extracted: %s...", index)
コード例 #10
0
ファイル: meetings.py プロジェクト: stef/lobbyfacts
def extract(engine):
    """Scrape all meetings returned by ``get_urls`` and upsert them.

    Before scraping, an ``sl.update`` with empty criteria sets ``status`` to
    ``'inactive'`` on the ``meeting`` and ``meeting_participants`` tables;
    a ``sqlalchemy.exc.CompileError`` from that step is ignored
    (presumably the column or table does not exist yet - TODO confirm).
    Scraped meetings are upserted keyed on ``meetid`` and
    ``identification_code``, with progress logged every 100 meetings.
    """
    meetings_table = sl.get_table(engine, 'meeting')
    try:
        # A failure on the first table also skips the second one.
        for table_name in ('meeting', 'meeting_participants'):
            sl.update(engine, table_name, {}, {'status': 'inactive'}, ensure=False)
    except sqlalchemy.exc.CompileError:
        pass

    scraped = (meeting
               for url, org, title in get_urls()
               for meeting in scrape(url, title, org))
    for count, meeting in enumerate(scraped, 1):
        sl.upsert(engine, meetings_table, meeting,
                  ['meetid', 'identification_code'])
        if count % 100 == 0:
            log.info("Extracted: %s...", count)
コード例 #11
0
def fetch_taskruns(engine):
    """Collect crowd-sourced matches from pyBossa and store agreed names.

    All task runs of the app returned by ``setup()`` are grouped by the
    ``etl_id`` in their info. Within each group the matches are bucketed
    case-insensitively (after stripping whitespace); a bucket is stored in
    ``network_entity`` only if it holds at least ``QUORUM`` entries. The
    stored name is the first spelling seen, stripped.
    """
    log.info("Fetching responses from pyBossa...")
    net = sl.get_table(engine, 'network_entity')
    app = setup()
    results = defaultdict(list)
    for taskrun in _iterate(pbclient.find_taskruns, app_id=app.id):
        # A task run without a 'matches' entry yields None, which
        # list.extend() would reject with a TypeError.
        matches = taskrun.info.get('matches') or []
        results[taskrun.info.get('etl_id')].extend(matches)
    for etl_id, matches in results.items():
        uniques = defaultdict(list)
        for m in matches:
            uniques[m.strip().lower()].append(m)
        for vs in uniques.values():
            if len(vs) < QUORUM:
                continue
            sl.upsert(engine, net, {'etl_id': etl_id, 'name': vs[0].strip()},
                      ['etl_id', 'name'])
コード例 #12
0
def save(person, engine):
    """Attach an accredited person to its organisation and upsert it.

    Representatives whose ``identification_code`` equals the person's
    ``org_identification_code`` are looked up; the one with the greatest
    ``last_update_date`` supplies the ``representative_etl_id``. The person
    gets role ``'accredited'`` and a stripped "title first last" name, and
    is upserted keyed on representative ETL id, role and name. If no
    representative matches, a warning is logged and nothing is written.
    """
    people = sl.get_table(engine, 'person')
    representatives = sl.get_table(engine, 'representative')
    candidates = list(sl.find(engine, representatives,
                              identification_code=person['org_identification_code']))
    if not len(candidates):
        log.warn("Cannot associate with a registered interest: %r", person)
        return

    newest = max(candidates, key=lambda rep: rep['last_update_date'])
    person['representative_etl_id'] = newest['etl_id']
    person['role'] = 'accredited'
    title = person['title'] or ''
    first_name = person['first_name'] or ''
    last_name = person['last_name'] or ''
    name = '%s %s %s' % (title, first_name, last_name)
    person['name'] = name.strip()
    # The debug line shows the name before stripping.
    log.debug("Accreditation: %s", name)
    sl.upsert(engine, people, person,
              ['representative_etl_id', 'role', 'name'])
コード例 #13
0
ファイル: tag.py プロジェクト: stef/lobbyfacts
def load_tag(rec, engine):
    """Ensure the tags of ``rec`` exist and link them to its representative.

    Every entry of ``rec['tags']`` is upserted into ``tag`` and read back.
    The representative is looked up by ``identification_code`` when
    ``rec['id']`` is set, otherwise by ``original_name``. If none is found
    a message goes to stderr and no links are written; otherwise one
    ``tags`` row per tag is upserted.
    """
    stored_tags = []
    for label in rec['tags']:
        sl.upsert(engine, sl.get_table(engine, 'tag'), {'tag': label}, ['tag'])
        stored_tags.append(
            sl.find_one(engine, sl.get_table(engine, 'tag'), tag=label))

    if rec['id']:
        criteria = {'identification_code': rec['id']}
    else:
        criteria = {'original_name': rec['name']}
    rep = sl.find_one(engine, sl.get_table(engine, 'representative'), **criteria)
    if not rep:
        print >>sys.stderr, "couldn't find", rec['id'] or rec['name'].encode('utf8')
        return

    for stored in stored_tags:
        link = {'representative_id': rep['id'], 'tag_id': stored['id']}
        sl.upsert(engine, sl.get_table(engine, 'tags'), link,
                  ['representative_id', 'tag_id'])
コード例 #14
0
ファイル: dedup.py プロジェクト: stef/lobbyfacts
def dedup_fields(engine, field):
    """Mark later representatives that repeat an earlier ``field`` value.

    Rows with an empty or whitespace-only ``field`` are ignored. For the
    first row seen with a given value, all rows sharing that value are
    fetched; if there is more than one, each of the *other* rows gets a
    ``name_suffix`` of "(Duplicate N)", where N is its 1-based position in
    the fetched list. The upsert is keyed on ``identification_code``.
    Progress is printed every 100 rows.
    """
    table = sl.get_table(engine, 'representative')
    seen = set()
    for n, rep in enumerate(sl.all(engine, table)):
        if n % 100 == 0:
            print('%s done' % n)
        value = rep[field]
        if not value or not value.strip() or value in seen:
            continue
        # add(), not update(): update() would insert the individual
        # characters of the string, so whole values would never be
        # recognised as seen.
        seen.add(value)
        others = list(sl.find(engine, table, **{field: value}))
        if len(others) > 1:
            log.info("Duplicates for: %s", rep['name'])
            for i, other in enumerate(others):
                if other == rep:
                    # Leave the first occurrence unmarked.
                    continue
                text = "(Duplicate %s)" % (i + 1)
                sl.upsert(engine, table,
                          {'name_suffix': text,
                           'identification_code': other['identification_code']},
                          ['identification_code'])
コード例 #15
0
ファイル: reginterests.py プロジェクト: stef/lobbyfacts
def load_rep(rep, engine):
    """Store a registered representative together with its child records.

    Child data (contacts, persons, action fields, interests, member
    countries, organisations, financial data) is popped off ``rep`` and
    written to its own table, each record carrying the representative's ETL
    id, update date and an ``active`` status. What remains of ``rep`` is
    upserted into ``representative`` keyed on ``etl_id``. A representative
    without ``original_name`` is logged as an error and not stored.
    """
    # (A commented-out variant built the id from the last update date
    # instead of the fixed "ALL" suffix.)
    etl_id = "%s//ALL" % rep["identification_code"]
    rep["etl_id"] = etl_id
    childBase = {
        "representative_etl_id": etl_id,
        "representative_update_date": rep["last_update_date"],
        "status": "active",
    }
    if not rep["original_name"]:
        log.error("Unnamed representative: %r", rep)
        return

    for contact_key in ("head_contact", "be_contact"):
        load_contact(rep.pop(contact_key, {}), childBase, engine)

    for person_key, role in (("legal_person", "legal"), ("head_person", "head")):
        load_person(rep.pop(person_key), role, childBase, engine)

    # Plain value lists: (key on rep, target table, column holding the value).
    value_lists = (
        ("action_fields", "action_field", "action_field"),
        ("interests", "interest", "interest"),
        ("country_of_members", "country_of_member", "country"),
    )
    for source_key, table_name, column in value_lists:
        for value in rep.pop(source_key):
            rec = childBase.copy()
            rec[column] = value
            sl.upsert(engine, sl.get_table(engine, table_name), rec,
                      ["representative_etl_id", column])

    for organisation in rep.pop("organisations"):
        rec = childBase.copy()
        rec.update(organisation)
        rec["name"] = organisation["name"].strip()
        sl.upsert(engine, sl.get_table(engine, "organisation"), rec,
                  ["representative_etl_id", "name"])

    load_finances(rep.pop("fd"), childBase, engine)

    rep["name"] = rep["original_name"].strip()
    rep["network_extracted"] = False
    sl.upsert(engine, sl.get_table(engine, "representative"), rep, ["etl_id"])
コード例 #16
0
ファイル: reginterests.py プロジェクト: tttp/lobbyfacts
def load_rep(rep, engine):
    """Store a registered representative and the records hanging off it.

    Persons, action fields, interests, member countries, organisations and
    financial data are popped from ``rep`` and written to their own tables,
    each tagged with the representative's ETL id, update date and status
    ``active``. The remaining ``rep`` dict is upserted into
    ``representative`` keyed on ``etl_id``. Representatives without an
    ``original_name`` are logged as errors and skipped.
    """
    # (A commented-out variant built the id from the last update date
    # instead of the fixed "ALL" suffix.)
    etl_id = '%s//ALL' % rep['identification_code']
    rep['etl_id'] = etl_id
    childBase = {'representative_etl_id': etl_id,
                 'representative_update_date': rep['last_update_date'],
                 'status': 'active'}
    if not rep['original_name']:
        log.error("Unnamed representative: %r", rep)
        return

    def store_value(table_name, column, value):
        # One child row holding a single value in ``column``.
        rec = childBase.copy()
        rec[column] = value
        sl.upsert(engine, sl.get_table(engine, table_name), rec,
                  ['representative_etl_id', column])

    load_person(rep.pop('legal_person'), 'legal', childBase, engine)
    load_person(rep.pop('head_person'), 'head', childBase, engine)

    for action_field in rep.pop('action_fields'):
        store_value('action_field', 'action_field', action_field)

    for interest in rep.pop('interests'):
        store_value('interest', 'interest', interest)

    for country in rep.pop('country_of_members'):
        store_value('country_of_member', 'country', country)

    for organisation in rep.pop('organisations'):
        rec = childBase.copy()
        rec.update(organisation)
        rec['name'] = organisation['name'].strip()
        sl.upsert(engine, sl.get_table(engine, 'organisation'), rec,
                  ['representative_etl_id', 'name'])

    load_finances(rep.pop('fd'), childBase, engine)

    rep['name'] = rep['original_name'].strip()
    rep['network_extracted'] = False
    sl.upsert(engine, sl.get_table(engine, 'representative'), rep, ['etl_id'])
コード例 #17
0
ファイル: reginterests.py プロジェクト: stef/lobbyfacts
def load_finances(financialData, childBase, engine):
    """Store a representative's financial data and its sub-records.

    Does nothing when ``financialData`` equals the empty dict. Otherwise an
    ETL id is built from the ISO-formatted start and end dates; custom
    sources (tagged ``other`` or ``public``) and turnover breakdown entries
    are popped off and upserted into their own tables, and the remaining
    dict is upserted into ``financial_data``. All records are updated with
    ``childBase``.
    """
    if financialData == {}:
        return

    start = financialData["start_date"].isoformat()
    end = financialData["end_date"].isoformat()
    etlId = "%s//%s" % (start, end)

    # Both lists are removed up front, before anything is written.
    other_sources = financialData.pop("other_customized")
    public_sources = financialData.pop("public_customized")
    tagged_sources = [(source, "other") for source in other_sources]
    tagged_sources += [(source, "public") for source in public_sources]

    for source, kind in tagged_sources:
        source["type"] = kind
        source["financial_data_etl_id"] = etlId
        source.update(childBase)
        sl.upsert(engine, sl.get_table(engine, "financial_data_custom_source"), source,
                  ["representative_etl_id", "financial_data_etl_id", "type", "name"])

    for entry in financialData.pop("turnover_breakdown"):
        entry["financial_data_etl_id"] = etlId
        entry["name"] = entry["name"].strip()
        entry.update(childBase)
        sl.upsert(engine, sl.get_table(engine, "financial_data_turnover"), entry,
                  ["representative_etl_id", "financial_data_etl_id", "name"])

    financialData["etl_id"] = etlId
    financialData.update(childBase)
    sl.upsert(engine, sl.get_table(engine, "financial_data"), financialData,
              ["representative_etl_id", "etl_id"])
コード例 #18
0
def load_finances(financialData, childBase, engine):
    """Write financial data plus its custom sources and turnover entries.

    The ETL id is "<start>//<end>" using the ISO format of ``start_date``
    and ``end_date``. ``other_customized`` and ``public_customized``
    entries are stored as custom sources of type ``other`` / ``public``,
    ``turnover_breakdown`` entries as turnover rows (names stripped), and
    the rest of ``financialData`` as the ``financial_data`` row. Every
    record is updated with ``childBase`` before being upserted.
    """
    period = (financialData['start_date'].isoformat(),
              financialData['end_date'].isoformat())
    etlId = '%s//%s' % period

    # Remove both source lists before writing anything.
    other_sources = financialData.pop("other_customized")
    public_sources = financialData.pop("public_customized")
    labelled = []
    labelled.extend((source, 'other') for source in other_sources)
    labelled.extend((source, 'public') for source in public_sources)

    source_keys = ['representative_etl_id', 'financial_data_etl_id',
                   'type', 'name']
    for source, label in labelled:
        source['type'] = label
        source['financial_data_etl_id'] = etlId
        source.update(childBase)
        sl.upsert(engine,
                  sl.get_table(engine, 'financial_data_custom_source'),
                  source, source_keys)

    turnover_keys = ['representative_etl_id', 'financial_data_etl_id', 'name']
    for item in financialData.pop("turnover_breakdown"):
        item['financial_data_etl_id'] = etlId
        item['name'] = item['name'].strip()
        item.update(childBase)
        sl.upsert(engine, sl.get_table(engine, 'financial_data_turnover'),
                  item, turnover_keys)

    financialData['etl_id'] = etlId
    financialData.update(childBase)
    sl.upsert(engine, sl.get_table(engine, 'financial_data'),
              financialData, ['representative_etl_id', 'etl_id'])
コード例 #19
0
ファイル: names.py プロジェクト: erikwesselius/lobbyfacts
def map_names(map_func, engine, table_name, source_column='name',
        out_column='canonical_name'):
    """Normalise the distinct values of one column through ``map_func``.

    For each previously unseen value of ``source_column`` in the table,
    ``map_func(value, row)`` is called. Its result is written to
    ``out_column`` with ``etl_clean`` set to True. When it returns None or
    raises ValueError, ``out_column`` is set to None and ``etl_clean`` to
    False. The upsert is keyed on ``source_column``.
    """
    table = sl.get_table(engine, table_name)
    seen_values = set()
    log.info("Normalising names on '%s', column '%s'...", table_name,
             source_column)
    for row in sl.find(engine, table):
        value = row.get(source_column)
        if value in seen_values:
            continue
        seen_values.add(value)
        d = {source_column: value, 'etl_clean': True,
             out_column: None}
        try:
            out = map_func(value, row)
        except ValueError:
            # The mapper rejected the value; flag the row as not clean.
            d['etl_clean'] = False
        else:
            if out is None:
                d['etl_clean'] = False
            else:
                d[out_column] = out
        sl.upsert(engine, table, d, [source_column])
コード例 #20
0
ファイル: regexpert.py プロジェクト: erikwesselius/lobbyfacts
 def _s(data):
     """Store one expert-group member with its policy areas and countries.

     Relies on ``engine`` and ``etlId`` from the enclosing scope (not
     visible in this fragment). ``data`` is modified in place: the status,
     policy-area and country keys are removed and ``expertgroup_etl_id`` is
     added before the member itself is upserted.
     """
     data.pop('subgroup_status', None)

     for area in data.pop('policy_area', []):
         area_table = sl.get_table(engine, 'expertgroup_member_policy_area')
         area_row = {'expertgroup_etl_id': etlId, 'member': data['name'],
                     'policy_area': area, 'subgroup': data['subgroup']}
         sl.upsert(engine, area_table, area_row,
                   ['expertgroup_etl_id', 'policy_area', 'member', 'subgroup'])

     # Both spellings of the key are removed; the plural one only serves as
     # the fallback when the singular one is absent.
     fallback = data.pop('countries/areas_represented', [])
     countries = data.pop('countries/area_represented', fallback)
     for country in countries:
         country_table = sl.get_table(engine, 'expertgroup_member_country')
         country_row = {'expertgroup_etl_id': etlId, 'member': data['name'],
                        'country': country, 'subgroup': data['subgroup']}
         sl.upsert(engine, country_table, country_row,
                   ['expertgroup_etl_id', 'country', 'member', 'subgroup'])

     data['expertgroup_etl_id'] = etlId
     member_table = sl.get_table(engine, 'expertgroup_member')
     sl.upsert(engine, member_table, data,
               ['expertgroup_etl_id', 'name', 'subgroup'])
コード例 #21
0
ファイル: regexpert.py プロジェクト: erikwesselius/lobbyfacts
def save(engine, group):
    """Store an expert group with its list attributes, members and subgroups.

    List-valued attributes (policy areas, tasks, compositions, associated
    and lead directorates) are popped from ``group`` and upserted into
    their own tables. Members and subgroup members are handed to
    ``save_member``, and subgroups are upserted keyed on ETL id and name.
    ``additional_info`` and ``link_to_website`` are discarded, and what
    remains of ``group`` is upserted into ``expertgroup`` keyed on
    ``etl_id``.
    """
    # (A commented-out variant used group['last_updated'] instead of the
    # fixed "ALL" suffix.)
    etlId = "%s//ALL" % group['identifier']

    # (key on group, target table, value column, extra fixed columns)
    list_attributes = (
        ('policy_area', 'expertgroup_policy_area', 'policy_area', {}),
        ('task', 'expertgroup_task', 'task', {}),
        ('composition', 'expertgroup_composition', 'composition', {}),
        ('associated_dg', 'expertgroup_directorate', 'directorate', {}),
        ('lead_dg', 'expertgroup_directorate', 'directorate', {'lead': True}),
    )
    for source_key, table_name, column, extra in list_attributes:
        for value in group.pop(source_key, []):
            record = {'expertgroup_etl_id': etlId, column: value}
            record.update(extra)
            sl.upsert(engine, sl.get_table(engine, table_name), record,
                      ['expertgroup_etl_id', column])

    for member in group.pop('members'):
        save_member(engine, etlId, member)

    for subgroup in group.pop('subgroups'):
        subgroup['expertgroup_etl_id'] = etlId
        for member in subgroup.pop('members'):
            member['subgroup'] = subgroup['name']
            save_member(engine, etlId, member)
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_subgroup'),
                  subgroup, ['expertgroup_etl_id', 'name'])

    # Dropped from the stored record; 'additional_info' must be present.
    group.pop('additional_info')
    group.pop('link_to_website', '')

    group['etl_id'] = etlId
    sl.upsert(engine, sl.get_table(engine, 'expertgroup'),
              group, ['etl_id'])
コード例 #22
0
def code_categories(engine):
    """Assign ``main_category_id`` for each distinct main category.

    Each distinct ``main_category`` of the ``representative`` table is
    looked up in ``CATEGORIES`` and the id is written back with an upsert
    keyed on ``main_category``. Rows whose category is empty or NULL are
    skipped, since there is nothing to look up for them.

    A non-empty category missing from ``CATEGORIES`` still propagates the
    lookup error.
    """
    table = sl.get_table(engine, 'representative')
    for cat in sl.distinct(engine, table, 'main_category'):
        if not cat['main_category']:
            # Representatives without a category would otherwise abort the
            # whole run on the CATEGORIES lookup.
            continue
        cat['main_category_id'] = CATEGORIES[cat['main_category']]
        sl.upsert(engine, table, cat, ['main_category'])
コード例 #23
0
ファイル: categories.py プロジェクト: stef/lobbyfacts
def code_subcategories(engine):
    """Fill in ``sub_category_id`` for each distinct sub category.

    Every distinct, non-empty ``sub_category`` of the ``representative``
    table is looked up in ``SUBCATEGORIES`` with ``.get`` (so an unknown
    name yields None) and written back with an upsert keyed on
    ``sub_category``.
    """
    representatives = sl.get_table(engine, 'representative')
    for row in sl.distinct(engine, representatives, 'sub_category'):
        name = row['sub_category']
        if name:
            row['sub_category_id'] = SUBCATEGORIES.get(name)
            sl.upsert(engine, representatives, row, ['sub_category'])