def code_categories(engine):
    """Assign numeric main-category ids to representatives.

    Distinct main_category values are translated through the `newcats`
    alias map, looked up in CATEGORIES, and written back per category.
    """
    table = sl.get_table(engine, 'representative')
    for row in sl.distinct(engine, table, 'main_category'):
        category = row['main_category']
        if not category:
            continue  # NULL/empty categories cannot be coded
        # Translate legacy labels via newcats before the id lookup.
        canonical = newcats.get(category, category)
        row['main_category_id'] = CATEGORIES[canonical]
        sl.upsert(engine, table, row, ['main_category'])
def transform(engine):
    """Geo-code representative contact addresses.

    For every representative without a stored longitude, query the
    geocoder at URL with the contact address fields and persist the
    returned display name, longitude and latitude.
    """
    log.info("Geo-coding representatives...")
    table = sl.get_table(engine, 'representative')
    for row in sl.all(engine, table):
        out = {'id': row['id']}
        if row.get('contact_lon'):
            continue  # already geo-coded
        query = {'format': 'json',
                 'limit': 1,
                 'city': row.get('contact_town'),
                 'street': row.get('contact_street'),
                 'country': row.get('contact_country'),
                 'postalcode': row.get('contact_post_code')}
        response = requests.get(URL, params=query)
        try:
            # Renamed from `json`, which shadowed the stdlib module name.
            data = response.json()
        except ValueError:
            # Invalid/empty JSON body from the geocoder: skip this row.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            continue
        if data and len(data):
            geo = data[0]
            log.info("%s @ %s", row.get('name'), geo.get('display_name'))
            out['contact_geoname'] = geo.get('display_name')
            out['contact_lon'] = geo.get('lon')
            out['contact_lat'] = geo.get('lat')
            sl.upsert(engine, table, out, ['id'])
def load_person(person, role, childBase, engine):
    """Upsert a person record attached to a representative."""
    table = sl.get_table(engine, "person")
    record = childBase.copy()
    record.update(person)
    record["role"] = role
    # Build the display name from title + first + last, tolerating NULLs.
    name_parts = (person["title"] or "",
                  person["first_name"] or "",
                  person["last_name"] or "")
    record["name"] = " ".join(name_parts)
    sl.upsert(engine, table, record,
              ["representative_etl_id", "role", "name"])
def load_contact(contact, childBase, engine):
    """Upsert a contact record for a representative.

    An empty or missing contact is a no-op. The original compared with
    `== {}`, which let a None contact through and crashed on .update();
    truthiness covers both cases.
    """
    if not contact:
        return
    table = sl.get_table(engine, "contact")
    contact_ = childBase.copy()
    contact_.update(contact)
    sl.upsert(engine, table, contact_,
              ["representative_etl_id", "country", "type"])
def load_rep(line, engine, unregtag): rep={} rep['original_name'] = line[0].strip() rep['name'] = line[0].strip() rep['identification_code'] = line[1] or hashlib.sha512(line[0].strip()).hexdigest()[:16] rep['etl_id'] = "%s//ALL" % rep['identification_code'] rep['web_site_url'] = line[2] or '' if line[3].strip(): rep['contact_street'] = line[3] if line[4].strip(): tmp=line[4].split() if tmp[0][0] == 'B': rep['contact_country'] = 'Belgium' elif tmp[0][0] == 'F': rep['contact_country'] = 'France' else: print 'bad zipcode country code', line[4] rep['contact_post_code'] = tmp[0][2:] rep['contact_town'] = ' '.join(tmp[1:]) rep['network_extracted'] = False sl.upsert(engine, sl.get_table(engine, 'representative'), rep, ['etl_id']) inserted=sl.find_one(engine,sl.get_table(engine, 'representative'),**rep) if inserted: sl.upsert(engine, sl.get_table(engine, 'tags'), {'representative_id': inserted['id'], 'tag_id': unregtag['id']}, ['representative_id', 'tag_id'])
def extract(engine):
    """Scrape meetings for every (title, url) pair in `uuids` and upsert
    them, logging progress every 100 records."""
    table = sl.get_table(engine, 'meeting')
    count = 0
    for title, url in uuids:
        for meeting in scrape(url, title):
            sl.upsert(engine, table, meeting,
                      ['meetid', 'identification_code'])
            count += 1
            if not count % 100:
                log.info("Extracted: %s...", count)
def load_person(person, role, childBase, engine):
    """Store a person row linked to its representative."""
    merged = dict(childBase)
    merged.update(person)
    merged['role'] = role
    # Title/first/last may each be NULL; substitute empty strings.
    merged['name'] = ' '.join([person['title'] or '',
                               person['first_name'] or '',
                               person['last_name'] or ''])
    sl.upsert(engine, sl.get_table(engine, 'person'), merged,
              ['representative_etl_id', 'role', 'name'])
def dedup_fields(engine, field):
    """Mark representatives sharing the same ``field`` value as duplicates.

    Rows other than the first one encountered receive a "(Duplicate N)"
    name suffix. Fixes vs. the original: empty/NULL values are skipped
    (instead of querying on them), the kept row itself is no longer
    suffixed, and each value group is processed only once — consistent
    with the improved variant of this function elsewhere in the file.
    """
    table = sl.get_table(engine, 'representative')
    seen = set()
    for rep in sl.all(engine, table):
        value = rep[field]
        if not value or not value.strip() or value in seen:
            continue
        seen.add(value)
        others = list(sl.find(engine, table, **{field: value}))
        if len(others) > 1:
            log.info("Duplicates for: %s", rep['name'])
            for i, other in enumerate(others):
                if other == rep:
                    continue  # keep the current row unsuffixed
                text = "(Duplicate %s)" % (i + 1)
                sl.upsert(engine, table,
                          {'name_suffix': text,
                           'identification_code': other['identification_code']},
                          ['identification_code'])
def extract_data(engine):
    """Read the unregistered-companies CSV and load every row, tagging
    each as situation:unregistered."""
    log.info("Extracting unregistered interests data...")
    label = 'situation:unregistered'
    tag_table = sl.get_table(engine, 'tag')
    # Ensure the tag exists, then fetch its row (for the generated id).
    sl.upsert(engine, tag_table, {'tag': label}, ['tag'])
    unregtag = sl.find_one(engine, tag_table, tag=label)
    with app.open_resource('resources/unregistered-companies.csv') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for i, row in enumerate(reader):
            load_rep(row, engine, unregtag)
            if i % 100 == 0:
                log.info("Extracted: %s...", i)
def extract(engine):
    """Scrape meetings from all known URLs, first flagging existing rows
    as inactive so stale records can be told apart afterwards."""
    table = sl.get_table(engine, 'meeting')
    try:
        # Best effort: on a fresh database the tables/columns may not
        # exist yet, which surfaces as a CompileError we can ignore.
        sl.update(engine, 'meeting', {}, {'status': 'inactive'},
                  ensure=False)
        sl.update(engine, 'meeting_participants', {}, {'status': 'inactive'},
                  ensure=False)
    except sqlalchemy.exc.CompileError:
        pass
    count = 0
    for url, org, title in get_urls():
        for meeting in scrape(url, title, org):
            sl.upsert(engine, table, meeting,
                      ['meetid', 'identification_code'])
            count += 1
            if not count % 100:
                log.info("Extracted: %s...", count)
def fetch_taskruns(engine):
    """Aggregate pyBossa task runs per etl_id and store every answer
    that reaches QUORUM matching votes."""
    log.info("Fetching responses from pyBossa...")
    net = sl.get_table(engine, 'network_entity')
    pb_app = setup()
    by_etl_id = defaultdict(list)
    for taskrun in _iterate(pbclient.find_taskruns, app_id=pb_app.id):
        by_etl_id[taskrun.info.get('etl_id')].extend(
            taskrun.info.get('matches'))
    for etl_id, matches in by_etl_id.items():
        # Group case/whitespace variants of the same answer together.
        variants = defaultdict(list)
        for match in matches:
            variants[match.strip().lower()].append(match)
        for forms in variants.values():
            if len(forms) < QUORUM:
                continue
            sl.upsert(engine, net,
                      {'etl_id': etl_id, 'name': forms[0].strip()},
                      ['etl_id', 'name'])
def save(person, engine):
    """Attach an accredited person to the registered representative
    matching their organisation code, preferring the newest registration."""
    table = sl.get_table(engine, 'person')
    rep_table = sl.get_table(engine, 'representative')
    orgs = list(sl.find(engine, rep_table,
                        identification_code=person['org_identification_code']))
    if not orgs:
        log.warn("Cannot associate with a registered interest: %r", person)
        return
    # Several registrations may share the code; take the latest update.
    org = max(orgs, key=lambda o: o['last_update_date'])
    person['representative_etl_id'] = org['etl_id']
    person['role'] = 'accredited'
    full_name = '%s %s %s' % (person['title'] or '',
                              person['first_name'] or '',
                              person['last_name'] or '')
    person['name'] = full_name.strip()
    log.debug("Accreditation: %s", full_name)
    sl.upsert(engine, table, person,
              ['representative_etl_id', 'role', 'name'])
def load_tag(rec, engine):
    """Attach each tag in rec['tags'] to the matching representative."""
    tag_table = sl.get_table(engine, 'tag')
    resolved = []
    for tag in rec['tags']:
        # Ensure the tag row exists, then re-read it to get its id.
        sl.upsert(engine, tag_table, {'tag': tag}, ['tag'])
        resolved.append(sl.find_one(engine, tag_table, tag=tag))
    rep_table = sl.get_table(engine, 'representative')
    # Prefer lookup by identification code; fall back to original name.
    if rec['id']:
        rep = sl.find_one(engine, rep_table, identification_code=rec['id'])
    else:
        rep = sl.find_one(engine, rep_table, original_name=rec['name'])
    if not rep:
        print >>sys.stderr, "couldn't find", rec['id'] or rec['name'].encode('utf8')
        return
    tags_table = sl.get_table(engine, 'tags')
    for tag in resolved:
        sl.upsert(engine, tags_table,
                  {'representative_id': rep['id'], 'tag_id': tag['id']},
                  ['representative_id', 'tag_id'])
def dedup_fields(engine, field): table = sl.get_table(engine, 'representative') seen=set([]) for n, rep in enumerate(sl.all(engine, table)): if n % 100 == 0: print n, 'done' if not rep[field] or not rep[field].strip() or rep[field] in seen: continue seen.update(rep[field]) others = list(sl.find(engine, table, **{field: rep[field]})) if len(others) > 1: log.info("Duplicates for: %s", rep['name']) for i, re in enumerate(others): if re == rep: continue text = "(Duplicate %s)" % (i+1) sl.upsert(engine, table, {'name_suffix': text, 'identification_code': re['identification_code']}, ['identification_code'])
def load_rep(rep, engine):
    """Upsert one parsed register entry plus all of its child records
    (contacts, persons, action fields, interests, member countries,
    organisations, financial data).

    Child tables are keyed on representative_etl_id, so the same entry
    is updated in place on re-runs.
    """
    # etlId = rep['etlId'] = "%s//%s" % (rep['identificationCode'],
    #    rep['lastUpdateDate'].isoformat())
    etlId = rep["etl_id"] = "%s//ALL" % rep["identification_code"]
    # Columns stamped onto every child row to tie it to this entry.
    childBase = {
        "representative_etl_id": etlId,
        "representative_update_date": rep["last_update_date"],
        "status": "active",
    }
    if not rep["original_name"]:
        # Records without a name cannot be displayed/deduped; drop them.
        log.error("Unnamed representative: %r", rep)
        return
    load_contact(rep.pop("head_contact", {}), childBase, engine)
    load_contact(rep.pop("be_contact", {}), childBase, engine)
    load_person(rep.pop("legal_person"), "legal", childBase, engine)
    load_person(rep.pop("head_person"), "head", childBase, engine)
    # One row per action field the representative declared.
    for actionField in rep.pop("action_fields"):
        rec = childBase.copy()
        rec["action_field"] = actionField
        sl.upsert(engine, sl.get_table(engine, "action_field"), rec,
                  ["representative_etl_id", "action_field"])
    for interest in rep.pop("interests"):
        rec = childBase.copy()
        rec["interest"] = interest
        sl.upsert(engine, sl.get_table(engine, "interest"), rec,
                  ["representative_etl_id", "interest"])
    for countryOfMember in rep.pop("country_of_members"):
        rec = childBase.copy()
        rec["country"] = countryOfMember
        sl.upsert(engine, sl.get_table(engine, "country_of_member"), rec,
                  ["representative_etl_id", "country"])
    for organisation in rep.pop("organisations"):
        rec = childBase.copy()
        rec.update(organisation)
        rec["name"] = organisation["name"].strip()
        sl.upsert(engine, sl.get_table(engine, "organisation"), rec,
                  ["representative_etl_id", "name"])
    load_finances(rep.pop("fd"), childBase, engine)
    # After popping all child structures, what remains in `rep` are the
    # representative's own scalar columns.
    rep["name"] = rep["original_name"].strip()
    rep["network_extracted"] = False
    sl.upsert(engine, sl.get_table(engine, "representative"), rep,
              ["etl_id"])
def load_rep(rep, engine):
    """Upsert one parsed register entry plus its child records (persons,
    action fields, interests, member countries, organisations, financial
    data).

    NOTE(review): unlike the sibling variant of this loader, this one
    does not pop/load head_contact or be_contact — confirm whether that
    is intentional for this scraper version.
    """
    #etlId = rep['etlId'] = "%s//%s" % (rep['identificationCode'],
    #    rep['lastUpdateDate'].isoformat())
    etlId = rep['etl_id'] = "%s//ALL" % rep['identification_code']
    # Columns stamped onto every child row to tie it to this entry.
    childBase = {'representative_etl_id': etlId,
                 'representative_update_date': rep['last_update_date'],
                 'status': 'active'}
    if not rep['original_name']:
        # Records without a name cannot be displayed/deduped; drop them.
        log.error("Unnamed representative: %r", rep)
        return
    load_person(rep.pop('legal_person'), 'legal', childBase, engine)
    load_person(rep.pop('head_person'), 'head', childBase, engine)
    for actionField in rep.pop('action_fields'):
        rec = childBase.copy()
        rec['action_field'] = actionField
        sl.upsert(engine, sl.get_table(engine, 'action_field'), rec,
                  ['representative_etl_id', 'action_field'])
    for interest in rep.pop('interests'):
        rec = childBase.copy()
        rec['interest'] = interest
        sl.upsert(engine, sl.get_table(engine, 'interest'), rec,
                  ['representative_etl_id', 'interest'])
    for countryOfMember in rep.pop('country_of_members'):
        rec = childBase.copy()
        rec['country'] = countryOfMember
        sl.upsert(engine, sl.get_table(engine, 'country_of_member'), rec,
                  ['representative_etl_id', 'country'])
    for organisation in rep.pop('organisations'):
        rec = childBase.copy()
        rec.update(organisation)
        rec['name'] = organisation['name'].strip()
        sl.upsert(engine, sl.get_table(engine, 'organisation'), rec,
                  ['representative_etl_id', 'name'])
    load_finances(rep.pop('fd'), childBase, engine)
    # After popping all child structures, what remains in `rep` are the
    # representative's own scalar columns.
    rep['name'] = rep['original_name'].strip()
    rep['network_extracted'] = False
    sl.upsert(engine, sl.get_table(engine, 'representative'), rep,
              ['etl_id'])
def load_finances(financialData, childBase, engine):
    """Upsert a representative's financial disclosure together with its
    customized funding sources and turnover breakdown.

    An empty or missing disclosure is a no-op. The guard now uses
    truthiness instead of `== {}`, so a None disclosure is also handled.
    """
    if not financialData:
        return
    # Child rows are keyed per reporting period.
    etlId = "%s//%s" % (financialData["start_date"].isoformat(),
                        financialData["end_date"].isoformat())
    financial_sources = (
        [(s, "other") for s in financialData.pop("other_customized")] +
        [(s, "public") for s in financialData.pop("public_customized")])
    for financial_source, type_ in financial_sources:
        financial_source["type"] = type_
        financial_source["financial_data_etl_id"] = etlId
        financial_source.update(childBase)
        sl.upsert(
            engine,
            sl.get_table(engine, "financial_data_custom_source"),
            financial_source,
            ["representative_etl_id", "financial_data_etl_id", "type", "name"],
        )
    for turnover in financialData.pop("turnover_breakdown"):
        turnover["financial_data_etl_id"] = etlId
        turnover["name"] = turnover["name"].strip()
        turnover.update(childBase)
        sl.upsert(
            engine,
            sl.get_table(engine, "financial_data_turnover"),
            turnover,
            ["representative_etl_id", "financial_data_etl_id", "name"],
        )
    # Remaining keys form the main financial_data row itself.
    financialData["etl_id"] = etlId
    financialData.update(childBase)
    sl.upsert(engine, sl.get_table(engine, "financial_data"), financialData,
              ["representative_etl_id", "etl_id"])
def load_finances(financialData, childBase, engine):
    """Upsert a representative's financial disclosure together with its
    customized funding sources and turnover breakdown.

    BUG FIX: this variant lacked the empty-disclosure guard its sibling
    implementation has, so an empty fd dict raised KeyError on
    'start_date'. An empty/None disclosure is now a no-op.
    """
    if not financialData:
        return
    # Child rows are keyed per reporting period.
    etlId = '%s//%s' % (financialData['start_date'].isoformat(),
                        financialData['end_date'].isoformat())
    financial_sources = \
        [(s, 'other') for s in financialData.pop("other_customized")] + \
        [(s, 'public') for s in financialData.pop("public_customized")]
    for financial_source, type_ in financial_sources:
        financial_source['type'] = type_
        financial_source['financial_data_etl_id'] = etlId
        financial_source.update(childBase)
        sl.upsert(engine, sl.get_table(engine, 'financial_data_custom_source'),
                  financial_source,
                  ['representative_etl_id', 'financial_data_etl_id',
                   'type', 'name'])
    for turnover in financialData.pop("turnover_breakdown"):
        turnover['financial_data_etl_id'] = etlId
        turnover['name'] = turnover['name'].strip()
        turnover.update(childBase)
        sl.upsert(engine, sl.get_table(engine, 'financial_data_turnover'),
                  turnover,
                  ['representative_etl_id', 'financial_data_etl_id', 'name'])
    # Remaining keys form the main financial_data row itself.
    financialData['etl_id'] = etlId
    financialData.update(childBase)
    sl.upsert(engine, sl.get_table(engine, 'financial_data'), financialData,
              ['representative_etl_id', 'etl_id'])
def map_names(map_func, engine, table_name, source_column='name',
              out_column='canonical_name'):
    """Apply ``map_func`` to each distinct ``source_column`` value in
    ``table_name`` and store the result in ``out_column``.

    Rows whose value maps to None or raises ValueError are flagged
    etl_clean=False; successful mappings are stored with etl_clean=True.
    """
    table = sl.get_table(engine, table_name)
    seen_values = set()
    log.info("Normalising names on '%s', column '%s'...",
             table_name, source_column)
    for row in sl.find(engine, table):
        value = row.get(source_column)
        if value in seen_values:
            continue  # each distinct value is mapped only once
        seen_values.add(value)
        d = {source_column: value, 'etl_clean': True, out_column: None}
        try:
            out = map_func(value, row)
            if out is None:
                d['etl_clean'] = False
            else:
                d[out_column] = out
        except ValueError:
            # Dropped the unused `, ve` binding; a mapping failure just
            # flags the row as unclean.
            d['etl_clean'] = False
        sl.upsert(engine, table, d, [source_column])
def _s(data):
    # Persist one (sub)group member record; closure over `etlId` and
    # `engine` from the enclosing scope.
    if 'subgroup_status' in data:
        del data['subgroup_status']
    # One row per policy area the member is associated with.
    for policy_area in data.pop('policy_area', []):
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_member_policy_area'),
                  {'expertgroup_etl_id': etlId, 'member': data['name'],
                   'policy_area': policy_area, 'subgroup': data['subgroup']},
                  ['expertgroup_etl_id', 'policy_area', 'member', 'subgroup'])
    # NOTE(review): the default argument is evaluated eagerly, so BOTH key
    # spellings are always popped from `data`; if the two keys ever coexist,
    # the 'countries/areas_represented' value is silently discarded —
    # confirm the scraper never emits both.
    for country in data.pop('countries/area_represented',
                            data.pop('countries/areas_represented', [])):
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_member_country'),
                  {'expertgroup_etl_id': etlId, 'member': data['name'],
                   'country': country, 'subgroup': data['subgroup']},
                  ['expertgroup_etl_id', 'country', 'member', 'subgroup'])
    # Remaining keys form the member row itself.
    data['expertgroup_etl_id'] = etlId
    sl.upsert(engine, sl.get_table(engine, 'expertgroup_member'), data,
              ['expertgroup_etl_id', 'name', 'subgroup'])
def save(engine, group):
    """Upsert one expert group and all of its list-valued attributes
    (policy areas, tasks, composition, directorates, members, subgroups)
    into their respective child tables, keyed on the group's etl_id.
    """
    #etlId = "%s//%s" % (group['identifier'], group['last_updated'])
    etlId = "%s//ALL" % group['identifier']
    for policy_area in group.pop('policy_area', []):
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_policy_area'),
                  {'expertgroup_etl_id': etlId, 'policy_area': policy_area},
                  ['expertgroup_etl_id', 'policy_area'])
    for task in group.pop('task', []):
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_task'),
                  {'expertgroup_etl_id': etlId, 'task': task},
                  ['expertgroup_etl_id', 'task'])
    for composition in group.pop('composition', []):
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_composition'),
                  {'expertgroup_etl_id': etlId, 'composition': composition},
                  ['expertgroup_etl_id', 'composition'])
    # Associated DGs first, then lead DGs: both share the same upsert key,
    # so a DG that is both ends up with lead=True. Keep this order.
    for associated_dg in group.pop('associated_dg', []):
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_directorate'),
                  {'expertgroup_etl_id': etlId, 'directorate': associated_dg},
                  ['expertgroup_etl_id', 'directorate'])
    for lead_dg in group.pop('lead_dg', []):
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_directorate'),
                  {'expertgroup_etl_id': etlId, 'directorate': lead_dg,
                   'lead': True},
                  ['expertgroup_etl_id', 'directorate'])
    for member in group.pop('members'):
        save_member(engine, etlId, member)
    for subgroup in group.pop('subgroups'):
        subgroup['expertgroup_etl_id'] = etlId
        # Subgroup members are stored alongside top-level members, tagged
        # with the subgroup name.
        for member in subgroup.pop('members'):
            member['subgroup'] = subgroup['name']
            save_member(engine, etlId, member)
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_subgroup'),
                  subgroup,
                  ['expertgroup_etl_id', 'name'])
    # additional_info and link_to_website are deliberately discarded.
    void = group.pop('additional_info')
    group['etl_id'] = etlId
    group.pop('link_to_website', '')
    sl.upsert(engine, sl.get_table(engine, 'expertgroup'), group, ['etl_id'])
def code_categories(engine):
    """Assign CATEGORIES ids to each distinct main_category value.

    BUG FIX: empty/NULL categories are now skipped — the sibling category
    coders guard the same way — instead of raising KeyError on the
    CATEGORIES lookup.
    """
    table = sl.get_table(engine, 'representative')
    for cat in sl.distinct(engine, table, 'main_category'):
        if not cat['main_category']:
            continue
        cat['main_category_id'] = CATEGORIES[cat['main_category']]
        sl.upsert(engine, table, cat, ['main_category'])
def code_subcategories(engine):
    """Resolve sub_category labels to numeric ids on representatives."""
    table = sl.get_table(engine, 'representative')
    for row in sl.distinct(engine, table, 'sub_category'):
        label = row['sub_category']
        if not label:
            continue  # NULL/empty sub-categories cannot be coded
        # .get() tolerates labels absent from the SUBCATEGORIES map.
        row['sub_category_id'] = SUBCATEGORIES.get(label)
        sl.upsert(engine, table, row, ['sub_category'])