def load_finances(financialData, childBase, engine):
    """Write one financial-data record and its child rows to the ETL tables.

    The ``other_customized``, ``public_customized`` and
    ``turnover_breakdown`` lists are popped off ``financialData`` and
    upserted into ``financial_data_custom_source`` and
    ``financial_data_turnover``; what is left of ``financialData`` is then
    upserted into ``financial_data``.  Every row is extended with the
    fields of ``childBase`` before it is written.

    ``financialData`` and the dicts inside its lists are modified in place.
    An empty ``financialData`` is skipped: there are no dates to build the
    record id from, and indexing ``start_date`` would raise ``KeyError``.
    """
    if not financialData:
        return

    # The reporting period "<start>//<end>" is used as the record id.
    etlId = '%s//%s' % (financialData['start_date'].isoformat(),
                        financialData['end_date'].isoformat())

    # Both lists are popped before anything is written, so a missing key
    # fails before any row has been stored.
    financial_sources = \
        [(s, 'other') for s in financialData.pop("other_customized")] + \
        [(s, 'public') for s in financialData.pop("public_customized")]
    for financial_source, type_ in financial_sources:
        financial_source['type'] = type_
        financial_source['financial_data_etl_id'] = etlId
        financial_source.update(childBase)
        sl.upsert(engine,
                  sl.get_table(engine, 'financial_data_custom_source'),
                  financial_source,
                  ['representative_etl_id', 'financial_data_etl_id',
                   'type', 'name'])

    for turnover in financialData.pop("turnover_breakdown"):
        turnover['financial_data_etl_id'] = etlId
        turnover['name'] = turnover['name'].strip()
        turnover.update(childBase)
        sl.upsert(engine,
                  sl.get_table(engine, 'financial_data_turnover'),
                  turnover,
                  ['representative_etl_id', 'financial_data_etl_id', 'name'])

    financialData['etl_id'] = etlId
    financialData.update(childBase)
    sl.upsert(engine, sl.get_table(engine, 'financial_data'),
              financialData, ['representative_etl_id', 'etl_id'])
def load_rep(line, engine, unregtag):
    """Upsert one unregistered company (a CSV row) and link it to a tag.

    Columns read from ``line``: 0 name, 1 identification code, 2 web site
    URL, 3 street, 4 a combined "zip town" string.  When column 1 is empty
    the identification code is the first 16 hex digits of the SHA-512 of
    the stripped name.  After the upsert the row is read back and, if
    found, linked to ``unregtag`` in the ``tags`` table.
    """
    company_name = line[0].strip()
    code = line[1] or hashlib.sha512(company_name).hexdigest()[:16]
    rep = {
        'original_name': company_name,
        'name': company_name,
        'identification_code': code,
        'etl_id': "%s//ALL" % code,
        'web_site_url': line[2] or '',
    }

    if line[3].strip():
        rep['contact_street'] = line[3]

    if line[4].strip():
        tokens = line[4].split()
        zip_token = tokens[0]
        # The first character of the zip token selects the country; the
        # post code itself starts at index 2.
        country = {'B': 'Belgium', 'F': 'France'}.get(zip_token[0])
        if country is None:
            print('%s %s' % ('bad zipcode country code', line[4]))
        else:
            rep['contact_country'] = country
        rep['contact_post_code'] = zip_token[2:]
        rep['contact_town'] = ' '.join(tokens[1:])

    rep['network_extracted'] = False
    sl.upsert(engine, sl.get_table(engine, 'representative'), rep,
              ['etl_id'])

    # Read the row back (matching on every field) to learn its id.
    stored = sl.find_one(engine, sl.get_table(engine, 'representative'),
                         **rep)
    if not stored:
        return
    link = {'representative_id': stored['id'], 'tag_id': unregtag['id']}
    sl.upsert(engine, sl.get_table(engine, 'tags'), link,
              ['representative_id', 'tag_id'])
def load_finances(financialData, childBase, engine):
    """Store a financial-data record plus its custom sources and turnovers.

    Does nothing for an empty dict.  Otherwise the three child lists are
    popped off ``financialData`` and upserted into their own tables, then
    the remaining dict is upserted into ``financial_data``.  All rows get
    the ``childBase`` fields merged in; the input dicts are mutated.
    """
    if financialData == {}:
        return

    period_start = financialData["start_date"].isoformat()
    period_end = financialData["end_date"].isoformat()
    etlId = "%s//%s" % (period_start, period_end)

    # Pop both source lists up front, tagging each entry with its kind.
    tagged_sources = []
    for kind, list_key in (("other", "other_customized"),
                           ("public", "public_customized")):
        tagged_sources.extend(
            (entry, kind) for entry in financialData.pop(list_key))

    source_keys = ["representative_etl_id", "financial_data_etl_id",
                   "type", "name"]
    for entry, kind in tagged_sources:
        entry["type"] = kind
        entry["financial_data_etl_id"] = etlId
        entry.update(childBase)
        sl.upsert(engine,
                  sl.get_table(engine, "financial_data_custom_source"),
                  entry, source_keys)

    turnover_keys = ["representative_etl_id", "financial_data_etl_id",
                     "name"]
    for item in financialData.pop("turnover_breakdown"):
        item["financial_data_etl_id"] = etlId
        item["name"] = item["name"].strip()
        item.update(childBase)
        sl.upsert(engine, sl.get_table(engine, "financial_data_turnover"),
                  item, turnover_keys)

    financialData["etl_id"] = etlId
    financialData.update(childBase)
    sl.upsert(engine, sl.get_table(engine, "financial_data"),
              financialData, ["representative_etl_id", "etl_id"])
def extract_data(engine):
    """Load every row of the bundled unregistered-companies CSV.

    Makes sure the ``situation:unregistered`` tag exists, reads it back to
    obtain the stored row, and passes it with each CSV row to ``load_rep``.
    Progress is logged every 100 rows.
    """
    log.info("Extracting unregistered interests data...")

    label = 'situation:unregistered'
    sl.upsert(engine, sl.get_table(engine, 'tag'), {'tag': label}, ['tag'])
    tag_row = sl.find_one(engine, sl.get_table(engine, 'tag'), tag=label)

    with app.open_resource('resources/unregistered-companies.csv') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        for count, row in enumerate(rows):
            load_rep(row, engine, tag_row)
            if not count % 100:
                log.info("Extracted: %s...", count)
def load(engine):
    """Run ``load_representative`` on every row of ``representative``.

    Every row is loaded; no filtering on ``etl_clean`` is applied here.
    """
    representatives = sl.all(engine, sl.get_table(engine, 'representative'))
    for index, rep in enumerate(representatives):
        log.info("Loading(%s): %s", index, rep.get('name'))
        load_representative(engine, rep)
def create_tasks(engine):
    """Create or refresh one pyBossa task per representative with networking text.

    The task presenter template is uploaded first.  Each representative
    whose ``networking`` text has at least 3 non-blank characters gets a
    signature (SHA-1 of identification code + networking text); a task with
    that signature is updated if one exists, otherwise a new one is created.
    """
    log.info("Updating tasks on pyBossa...")
    app = setup()
    with flask_app.open_resource('resources/pbnetworks_template.html') as f:
        app.info['task_presenter'] = f.read()
    pbclient.update_app(app)

    # Index the tasks already on the server by their stored signature.
    existing = {}
    for task in pbclient.get_tasks(app.id, limit=30000):
        existing[task.data.get('info').get('signature')] = task

    for rep in sl.all(engine, sl.get_table(engine, 'representative')):
        networking = rep.get('networking')
        if networking is None or len(networking.strip()) < 3:
            continue

        raw_signature = rep.get('identification_code') + networking
        signature = sha1(raw_signature.encode('ascii', 'ignore')).hexdigest()
        rep['signature'] = signature
        print([rep.get('name')])
        log.debug("Task: %s", rep['name'])

        # Dates are converted to ISO strings before being sent as task info.
        for date_key in ('last_update_date', 'registration_date'):
            rep[date_key] = rep[date_key].isoformat()

        if signature not in existing:
            pbclient.create_task(app.id, rep)
            continue
        task = existing.get(signature)
        task.data['info'] = rep
        pbclient.update_task(task)
def load_person(person, role, childBase, engine):
    """Upsert one person row for a representative.

    The row is ``childBase`` overlaid with ``person``, plus ``role`` and a
    ``name`` built by joining title, first name and last name with single
    spaces (missing parts become empty strings).
    """
    person_table = sl.get_table(engine, "person")

    record = dict(childBase)
    record.update(person)
    record["role"] = role

    name_parts = [person[part] or ""
                  for part in ("title", "first_name", "last_name")]
    record["name"] = " ".join(name_parts)

    sl.upsert(engine, person_table, record,
              ["representative_etl_id", "role", "name"])
def load_contact(contact, childBase, engine):
    """Upsert one contact row; an empty ``contact`` dict is ignored.

    The stored row is ``childBase`` overlaid with ``contact`` and is keyed
    on representative, country and type.
    """
    if contact == {}:
        return

    contact_table = sl.get_table(engine, "contact")
    record = dict(childBase)
    record.update(contact)
    sl.upsert(engine, contact_table, record,
              ["representative_etl_id", "country", "type"])
def code_categories(engine):
    """Fill ``main_category_id`` for every distinct main category.

    Category names are first translated through ``newcats`` (unmapped
    names are used as they are) and then looked up in ``CATEGORIES``.
    Empty categories are skipped.
    """
    table = sl.get_table(engine, 'representative')
    for cat in sl.distinct(engine, table, 'main_category'):
        stored_name = cat['main_category']
        if not stored_name:
            continue
        current_name = newcats.get(stored_name, stored_name)
        cat['main_category_id'] = CATEGORIES[current_name]
        sl.upsert(engine, table, cat, ['main_category'])
def transform(engine):
    """Geo-code every representative that has no longitude yet.

    For each such row the contact town, street, country and post code are
    sent to the service at ``URL``; the first hit's display name,
    longitude and latitude are written back to the row.  Rows for which
    the response is not valid JSON, or contains no hit, are left untouched.
    """
    log.info("Geo-coding representatives...")
    table = sl.get_table(engine, 'representative')
    for row in sl.all(engine, table):
        if row.get('contact_lon'):
            # Already geo-coded.
            continue

        query = {
            'format': 'json',
            'limit': 1,
            'city': row.get('contact_town'),
            'street': row.get('contact_street'),
            'country': row.get('contact_country'),
            'postalcode': row.get('contact_post_code')
        }
        response = requests.get(URL, params=query)
        try:
            results = response.json()
        except ValueError:
            # The body was not JSON.  Only decoding errors are skipped;
            # anything else (including KeyboardInterrupt) propagates.
            continue
        if not results:
            continue

        geo = results[0]
        log.info("%s @ %s", row.get('name'), geo.get('display_name'))
        out = {
            'id': row['id'],
            'contact_geoname': geo.get('display_name'),
            'contact_lon': geo.get('lon'),
            'contact_lat': geo.get('lat'),
        }
        sl.upsert(engine, table, out, ['id'])
def load(engine):
    """Load every representative row that is not flagged ``etl_clean = False``."""
    rep_table = sl.get_table(engine, 'representative')
    for rep in sl.all(engine, rep_table):
        log.info("Loading: %s", rep.get('name'))
        if rep['etl_clean'] is not False:
            load_representative(engine, rep)
        else:
            log.debug("Skipping!")
def _s(data):
    """Store one expert-group member and its policy areas and countries.

    Uses ``engine`` and ``etlId`` from the enclosing scope.  ``data`` is
    modified: ``subgroup_status``, ``policy_area`` and both country-list
    keys are removed, and ``expertgroup_etl_id`` is added before the member
    itself is upserted.
    """
    data.pop('subgroup_status', None)

    for area in data.pop('policy_area', []):
        area_row = {'expertgroup_etl_id': etlId,
                    'member': data['name'],
                    'policy_area': area,
                    'subgroup': data['subgroup']}
        sl.upsert(engine,
                  sl.get_table(engine, 'expertgroup_member_policy_area'),
                  area_row,
                  ['expertgroup_etl_id', 'policy_area', 'member', 'subgroup'])

    # Both spellings of the key are removed; the singular one wins when
    # both are present.
    plural_countries = data.pop('countries/areas_represented', [])
    countries = data.pop('countries/area_represented', plural_countries)
    for country in countries:
        country_row = {'expertgroup_etl_id': etlId,
                       'member': data['name'],
                       'country': country,
                       'subgroup': data['subgroup']}
        sl.upsert(engine,
                  sl.get_table(engine, 'expertgroup_member_country'),
                  country_row,
                  ['expertgroup_etl_id', 'country', 'member', 'subgroup'])

    data['expertgroup_etl_id'] = etlId
    sl.upsert(engine, sl.get_table(engine, 'expertgroup_member'), data,
              ['expertgroup_etl_id', 'name', 'subgroup'])
def save(person, engine):
    """Store an accredited person under the matching representative.

    The representative is looked up by ``person['org_identification_code']``;
    when several rows match, the one with the greatest ``last_update_date``
    is used.  ``person`` gets ``representative_etl_id``, ``role`` and a
    stripped ``name`` (title, first name, last name) and is upserted into
    the ``person`` table.  If no representative matches, a warning is
    logged and nothing is written.
    """
    table = sl.get_table(engine, 'person')
    orgs = list(sl.find(engine, sl.get_table(engine, 'representative'),
                identification_code=person['org_identification_code']))
    if not orgs:
        # Logger.warn is a deprecated alias; warning() is the documented name.
        log.warning("Cannot associate with a registered interest: %r", person)
        return

    org = max(orgs, key=lambda o: o['last_update_date'])
    person['representative_etl_id'] = org['etl_id']
    person['role'] = 'accredited'
    name = '%s %s %s' % (person['title'] or '',
                         person['first_name'] or '',
                         person['last_name'] or '')
    person['name'] = name.strip()
    log.debug("Accreditation: %s", name)
    sl.upsert(engine, table, person,
              ['representative_etl_id', 'role', 'name'])
def load_tag(rec, engine):
    """Attach the tags listed in ``rec['tags']`` to one representative.

    Each tag is upserted and read back first.  The representative is then
    found by ``rec['id']`` (identification code) or, when that is empty,
    by ``rec['name']`` (original name).  If none is found a message goes
    to stderr and no links are written.
    """
    tag_rows = []
    for label in rec['tags']:
        sl.upsert(engine, sl.get_table(engine, 'tag'), {'tag': label},
                  ['tag'])
        tag_rows.append(
            sl.find_one(engine, sl.get_table(engine, 'tag'), tag=label))

    if rec['id']:
        criteria = {'identification_code': rec['id']}
    else:
        criteria = {'original_name': rec['name']}
    rep = sl.find_one(engine, sl.get_table(engine, 'representative'),
                      **criteria)

    if not rep:
        missing = rec['id'] or rec['name'].encode('utf8')
        sys.stderr.write("%s %s\n" % ("couldn't find", missing))
        return

    for tag_row in tag_rows:
        link = {'representative_id': rep['id'], 'tag_id': tag_row['id']}
        sl.upsert(engine, sl.get_table(engine, 'tags'), link,
                  ['representative_id', 'tag_id'])
def extract(engine):
    """Scrape the meetings for every entry of ``uuids`` and upsert them.

    Progress is logged after every 100 stored meetings.
    """
    table = sl.get_table(engine, 'meeting')
    # Lazy, so scraping and storing stay interleaved page by page.
    meetings = (meeting
                for title, url in uuids
                for meeting in scrape(url, title))
    for count, meeting in enumerate(meetings, 1):
        sl.upsert(engine, table, meeting, ['meetid', 'identification_code'])
        if not count % 100:
            log.info("Extracted: %s...", count)
def load_rep(rep, engine):
    """Store one scraped representative and all of its child records.

    Contacts, persons, action fields, interests, member countries,
    organisations and financial data are popped off ``rep`` and written to
    their own tables, each row carrying the representative's etl id, its
    update date and status ``active``.  What remains of ``rep`` is then
    upserted into ``representative``.  A representative without an
    ``original_name`` is logged as an error and not stored.
    """
    # One id per identification code, independent of the update date.
    etlId = rep["etl_id"] = "%s//ALL" % rep["identification_code"]
    childBase = {
        "representative_etl_id": etlId,
        "representative_update_date": rep["last_update_date"],
        "status": "active",
    }
    if not rep["original_name"]:
        log.error("Unnamed representative: %r", rep)
        return

    for contact_key in ("head_contact", "be_contact"):
        load_contact(rep.pop(contact_key, {}), childBase, engine)
    for person_key, role in (("legal_person", "legal"),
                             ("head_person", "head")):
        load_person(rep.pop(person_key), role, childBase, engine)

    # (key in rep, target table, column holding the value)
    single_value_children = (
        ("action_fields", "action_field", "action_field"),
        ("interests", "interest", "interest"),
        ("country_of_members", "country_of_member", "country"),
    )
    for source_key, table_name, column in single_value_children:
        for value in rep.pop(source_key):
            row = childBase.copy()
            row[column] = value
            sl.upsert(engine, sl.get_table(engine, table_name), row,
                      ["representative_etl_id", column])

    for organisation in rep.pop("organisations"):
        row = childBase.copy()
        row.update(organisation)
        row["name"] = organisation["name"].strip()
        sl.upsert(engine, sl.get_table(engine, "organisation"), row,
                  ["representative_etl_id", "name"])

    load_finances(rep.pop("fd"), childBase, engine)

    rep["name"] = rep["original_name"].strip()
    rep["network_extracted"] = False
    sl.upsert(engine, sl.get_table(engine, "representative"), rep,
              ["etl_id"])
def load_rep(rep, engine):
    """Store one scraped representative and all of its child records.

    Persons, action fields, interests, member countries, organisations and
    financial data are popped off ``rep`` and written to their own tables,
    each row carrying the representative's etl id, its update date and
    status ``active``.  What remains of ``rep`` is then upserted into
    ``representative``.  A representative without an ``original_name`` is
    logged as an error and not stored.
    """
    # One id per identification code, independent of the update date.
    etlId = rep['etl_id'] = "%s//ALL" % rep['identification_code']
    childBase = {
        'representative_etl_id': etlId,
        'representative_update_date': rep['last_update_date'],
        'status': 'active',
    }
    if not rep['original_name']:
        log.error("Unnamed representative: %r", rep)
        return

    for person_key, role in (('legal_person', 'legal'),
                             ('head_person', 'head')):
        load_person(rep.pop(person_key), role, childBase, engine)

    # (key in rep, target table, column holding the value)
    single_value_children = (
        ('action_fields', 'action_field', 'action_field'),
        ('interests', 'interest', 'interest'),
        ('country_of_members', 'country_of_member', 'country'),
    )
    for source_key, table_name, column in single_value_children:
        for value in rep.pop(source_key):
            row = childBase.copy()
            row[column] = value
            sl.upsert(engine, sl.get_table(engine, table_name), row,
                      ['representative_etl_id', column])

    for organisation in rep.pop('organisations'):
        row = childBase.copy()
        row.update(organisation)
        row['name'] = organisation['name'].strip()
        sl.upsert(engine, sl.get_table(engine, 'organisation'), row,
                  ['representative_etl_id', 'name'])

    load_finances(rep.pop('fd'), childBase, engine)

    rep['name'] = rep['original_name'].strip()
    rep['network_extracted'] = False
    sl.upsert(engine, sl.get_table(engine, 'representative'), rep,
              ['etl_id'])
def load_person(person, role, childBase, engine):
    """Upsert a person attached to a representative under the given role.

    ``childBase`` supplies the shared fields, ``person`` overrides them,
    and ``name`` is "title first_name last_name" with empty strings
    standing in for missing parts.
    """
    table = sl.get_table(engine, 'person')

    row = childBase.copy()
    row.update(person)
    row['role'] = role

    title = person['title'] or ''
    first_name = person['first_name'] or ''
    last_name = person['last_name'] or ''
    row['name'] = ' '.join([title, first_name, last_name])

    sl.upsert(engine, table, row, ['representative_etl_id', 'role', 'name'])
def dedup_fields(engine, field):
    """Give representatives that share a value of ``field`` a name suffix.

    For every representative, all rows with the same ``field`` value are
    fetched; when there is more than one, each of them gets
    ``name_suffix`` set to "(Duplicate N)", N counting from 1 in the order
    returned.
    """
    table = sl.get_table(engine, 'representative')
    for rep in sl.all(engine, table):
        same_value = list(sl.find(engine, table, **{field: rep[field]}))
        if len(same_value) <= 1:
            continue
        log.info("Duplicates for: %s", rep['name'])
        for position, match in enumerate(same_value, 1):
            suffix_row = {
                'name_suffix': "(Duplicate %s)" % position,
                'identification_code': match['identification_code'],
            }
            sl.upsert(engine, table, suffix_row, ['identification_code'])
def extract(engine):
    """Mark stored meetings inactive, then scrape and upsert current ones.

    All rows of ``meeting`` and ``meeting_participants`` are first set to
    status ``inactive``; a ``CompileError`` from that step is ignored.
    Every meeting scraped from the URLs of ``get_urls()`` is then
    upserted, with progress logged after every 100 meetings.
    """
    table = sl.get_table(engine, 'meeting')

    try:
        for table_name in ('meeting', 'meeting_participants'):
            sl.update(engine, table_name, {}, {'status': 'inactive'},
                      ensure=False)
    except sqlalchemy.exc.CompileError:
        # NOTE(review): presumably raised when the status column does not
        # exist yet (ensure=False) -- confirm.
        pass

    # Lazy, so scraping and storing stay interleaved page by page.
    meetings = (meeting
                for url, org, title in get_urls()
                for meeting in scrape(url, title, org))
    for count, meeting in enumerate(meetings, 1):
        sl.upsert(engine, table, meeting, ['meetid', 'identification_code'])
        if not count % 100:
            log.info("Extracted: %s...", count)
def fetch_taskruns(engine):
    """Store the network names that enough pyBossa answers agree on.

    All task-run matches are grouped per ``etl_id``.  Within a group,
    answers are compared after stripping and lower-casing; a name is
    upserted into ``network_entity`` once at least ``QUORUM`` answers
    agree, using the first answer's stripped spelling.
    """
    log.info("Fetching responses from pyBossa...")
    net = sl.get_table(engine, 'network_entity')
    app = setup()

    answers_by_rep = defaultdict(list)
    for taskrun in _iterate(pbclient.find_taskruns, app_id=app.id):
        info = taskrun.info
        answers_by_rep[info.get('etl_id')].extend(info.get('matches'))

    for etl_id, answers in answers_by_rep.items():
        # Group the spellings of one name under its normalised form.
        variants_by_key = {}
        for answer in answers:
            variants_by_key.setdefault(answer.strip().lower(),
                                       []).append(answer)
        for variants in variants_by_key.values():
            if len(variants) < QUORUM:
                continue
            sl.upsert(engine, net,
                      {'etl_id': etl_id, 'name': variants[0].strip()},
                      ['etl_id', 'name'])
def dedup_fields(engine, field):
    """Give later representatives sharing a ``field`` value a name suffix.

    Each distinct, non-blank value of ``field`` is handled once, when the
    first representative carrying it is reached.  All rows with that value
    are fetched; every row other than that first representative gets
    ``name_suffix`` set to "(Duplicate N)", where N is its 1-based position
    in the fetched list.  A progress line is printed every 100 rows.
    """
    table = sl.get_table(engine, 'representative')
    seen = set()
    for n, rep in enumerate(sl.all(engine, table)):
        if n % 100 == 0:
            print('%s %s' % (n, 'done'))

        value = rep[field]
        if not value or not value.strip() or value in seen:
            continue
        # add(), not update(): update() would insert the individual
        # characters of the string, so whole values would never be found
        # in ``seen`` and each duplicate group would be processed again.
        seen.add(value)

        others = list(sl.find(engine, table, **{field: value}))
        if len(others) > 1:
            log.info("Duplicates for: %s", rep['name'])
            for i, other in enumerate(others):
                if other == rep:
                    # The first occurrence keeps its plain name.
                    continue
                text = "(Duplicate %s)" % (i + 1)
                sl.upsert(engine, table,
                          {'name_suffix': text,
                           'identification_code': other['identification_code']},
                          ['identification_code'])
def map_names(map_func, engine, table_name, source_column='name',
              out_column='canonical_name'):
    """Apply ``map_func`` to each distinct value of a column and store the result.

    ``map_func(value, row)`` is called once per distinct ``source_column``
    value.  Its result goes into ``out_column`` with ``etl_clean`` set to
    True; if it returns None or raises ``ValueError``, ``out_column`` is
    set to None and ``etl_clean`` to False.  Rows are updated by matching
    on ``source_column``.
    """
    table = sl.get_table(engine, table_name)
    seen_values = set()
    log.info("Normalising names on '%s', column '%s'...", table_name,
             source_column)
    for row in sl.find(engine, table):
        value = row.get(source_column)
        if value in seen_values:
            continue
        seen_values.add(value)

        try:
            mapped = map_func(value, row)
        except ValueError:
            # Treated the same as "no mapping found".
            mapped = None

        update = {source_column: value,
                  'etl_clean': mapped is not None,
                  out_column: mapped}
        sl.upsert(engine, table, update, [source_column])
def save(engine, group):
    """Store one expert group together with its child records.

    Policy areas, tasks, compositions, directorates, members and subgroups
    are popped off ``group`` and written to their own tables under the
    group's etl id ("<identifier>//ALL").  ``additional_info`` and
    ``link_to_website`` are dropped, and the rest of ``group`` is upserted
    into ``expertgroup``.
    """
    etlId = "%s//ALL" % group['identifier']

    # (key in group, target table, value column, extra fixed fields)
    list_children = (
        ('policy_area', 'expertgroup_policy_area', 'policy_area', {}),
        ('task', 'expertgroup_task', 'task', {}),
        ('composition', 'expertgroup_composition', 'composition', {}),
        ('associated_dg', 'expertgroup_directorate', 'directorate', {}),
        ('lead_dg', 'expertgroup_directorate', 'directorate',
         {'lead': True}),
    )
    for source_key, table_name, column, extra in list_children:
        for value in group.pop(source_key, []):
            row = {'expertgroup_etl_id': etlId, column: value}
            row.update(extra)
            sl.upsert(engine, sl.get_table(engine, table_name), row,
                      ['expertgroup_etl_id', column])

    for member in group.pop('members'):
        save_member(engine, etlId, member)

    for subgroup in group.pop('subgroups'):
        subgroup['expertgroup_etl_id'] = etlId
        for member in subgroup.pop('members'):
            member['subgroup'] = subgroup['name']
            save_member(engine, etlId, member)
        sl.upsert(engine, sl.get_table(engine, 'expertgroup_subgroup'),
                  subgroup, ['expertgroup_etl_id', 'name'])

    # Removed so it is not stored; the key is required to be present.
    group.pop('additional_info')
    group['etl_id'] = etlId
    group.pop('link_to_website', '')
    sl.upsert(engine, sl.get_table(engine, 'expertgroup'), group,
              ['etl_id'])
def load(engine):
    """Run ``load_meeting`` on every row of the ``meeting`` table."""
    meetings = sl.all(engine, sl.get_table(engine, 'meeting'))
    for index, meet in enumerate(meetings):
        log.info("Loading(%s): %s", index, meet.get('name'))
        load_meeting(engine, meet)
def load_representative(engine, rep):
    """Load one ETL representative row and its children into the ORM models.

    ``rep`` is the row from the ETL ``representative`` table; it is
    modified in place (numbers converted, entity/category/person objects
    attached) and then used to create or update the ``Representative``.
    Afterwards the related ETL rows are loaded in this order: contacts,
    accreditations, financial data (with custom income sources and
    turnovers), organisation memberships, country memberships, action
    fields, interests and tags.  The session is committed at the end, and
    also right after a new action field, interest or association is
    created.
    """
    entity = upsert_entity(rep.get('canonical_name'),
                           name=rep.get('original_name'),
                           suffix=rep.get('name_suffix'),
                           acronym=rep.get('acronym'))
    assert entity is not None, entity
    assert entity.id is not None, entity
    rep['entity'] = entity

    for key in ('members_25', 'members_50', 'members_75', 'members_100'):
        rep[key] = to_integer(rep[key])
    rep['members_fte'] = to_float(rep['members_fte'])
    rep['members'] = to_integer(rep['members'])
    rep['number_of_natural_persons'] = to_integer(
        rep['number_of_natural_persons'])

    if rep.get('main_category'):
        main_category = upsert_category(rep.get('main_category_id'),
                                        rep.get('main_category'))
        rep['main_category'] = main_category
        # Resolved only under a main category: the parent object is
        # needed, and referencing it without one would be a NameError.
        if rep.get('sub_category'):
            rep['sub_category'] = upsert_category(rep.get('sub_category_id'),
                                                  rep.get('sub_category'),
                                                  main_category)

    # Persons: head and legal go onto the representative itself,
    # accredited persons are linked once the representative exists.
    accreditations = []
    for person_data in sl.find(engine, sl.get_table(engine, 'person'),
                               representative_etl_id=rep['etl_id']):
        person = upsert_person(person_data)
        role = person_data.get('role')
        if role == 'head':
            rep['head'] = person
        if role == 'legal':
            rep['legal'] = person
        if role == 'accredited':
            accreditations.append((person, person_data))

    representative = Representative.by_identification_code(
        rep['identification_code'])
    if representative is None:
        representative = Representative.create(rep)
    else:
        representative.update(rep)

    for contact_data in sl.find(engine, sl.get_table(engine, 'contact'),
                                representative_etl_id=rep['etl_id'],
                                status='active'):
        # Rows with fewer than 7 non-empty columns are skipped.
        if len([x for x in contact_data.values() if x]) < 7:
            continue
        contact_ = {
            'town': contact_data['town'],
            'street': contact_data['street'],
            'post_code': contact_data['post_code'],
            'postbox': contact_data['postbox'],
            'lat': to_float(contact_data['lat']),
            'lon': to_float(contact_data['lon']),
            'phone': " ".join((contact_data.get('indic_phone') or '',
                               contact_data.get('phone') or '')).strip(),
            'country': Country.by_code(contact_data['country_code']),
        }
        if contact_data['type'] == 'head':
            if representative.head_office_id is None:
                contact = Contact.create(contact_)
                representative.head_office = contact
                representative.contact_country = contact.country
            else:
                representative.head_office.update(contact_)
        else:
            if representative.be_office_id is None:
                contact = Contact.create(contact_)
                representative.be_office = contact
            else:
                representative.be_office.update(contact_)

    for person, data_ in accreditations:
        data_['person'] = person
        data_['representative'] = representative
        accreditation = Accreditation.by_rp(person, representative)
        if accreditation is None:
            accreditation = Accreditation.create(data_)
        else:
            accreditation.update(data_)

    fd_integer_keys = (
        'turnover_min', 'turnover_max', 'turnover_absolute',
        'cost_min', 'cost_max', 'cost_absolute',
        'direct_rep_costs_min', 'direct_rep_costs_max',
        'total_budget',
        'public_financing_total', 'public_financing_infranational',
        'public_financing_national',
        'eur_sources_grants', 'eur_sources_procurement',
        'other_sources_donation',
        # Read from its own column; previously this was filled from
        # 'other_sources_donation' by mistake.
        'other_sources_contributions',
        'other_sources_total',
    )
    # Copied as they are; the assignment makes sure the key is present.
    fd_passthrough_keys = (
        'eur_sources_procurement_src', 'eur_sources_grants_src',
        'other_financial_information', 'new_organisation',
    )
    for fd in sl.find(engine, sl.get_table(engine, 'financial_data'),
                      representative_etl_id=rep['etl_id']):
        for key in fd_integer_keys:
            fd[key] = to_integer(fd.get(key))
        for key in fd_passthrough_keys:
            fd[key] = fd.get(key)
        fd['representative'] = representative

        financial_data = FinancialData.by_rsd(representative,
                                              fd.get('start_date'))
        if financial_data is None:
            financial_data = FinancialData.create(fd)
        else:
            financial_data.update(fd)

        for src_ in sl.find(engine,
                            sl.get_table(engine,
                                         'financial_data_custom_source'),
                            representative_etl_id=rep['etl_id'],
                            financial_data_etl_id=fd['etl_id']):
            src_['financial_data'] = financial_data
            src_['amount'] = to_integer(src_.get('amount'))
            src = CustomIncome.by_fdn(financial_data, src_['name'])
            if src is None:
                src = CustomIncome.create(src_)
            else:
                src.update(src_)

        for turnover_ in sl.find(engine,
                                 sl.get_table(engine,
                                              'financial_data_turnover'),
                                 representative_etl_id=rep['etl_id'],
                                 financial_data_etl_id=fd['etl_id']):
            turnover_['entity'] = upsert_entity(
                turnover_.get('canonical_name'), turnover_.get('name'))
            assert turnover_['entity'] is not None, turnover_['entity']
            turnover_['financial_data'] = financial_data
            turnover_['min'] = to_integer(turnover_.get('min'))
            turnover_['max'] = to_integer(turnover_.get('max'))
            turnover = FinancialTurnover.by_fde(financial_data,
                                                turnover_['entity'])
            if turnover is None:
                turnover = FinancialTurnover.create(turnover_)
            else:
                turnover.update(turnover_)

    for org in sl.find(engine, sl.get_table(engine, 'organisation'),
                       representative_etl_id=rep['etl_id']):
        org['number_of_members'] = to_integer(org['number_of_members'])
        organisation = upsert_organisation(org)
        omdata = {'representative': representative,
                  'status': org.get('status'),
                  'organisation': organisation}
        om = OrganisationMembership.by_rpo(representative, organisation)
        if om is None:
            om = OrganisationMembership.create(omdata)
        else:
            om.update(omdata)

    for country_ in sl.find(engine,
                            sl.get_table(engine, 'country_of_member'),
                            representative_etl_id=rep['etl_id']):
        if not country_.get('country_code'):
            continue
        cdata = {'representative': representative,
                 'status': country_.get('status'),
                 'country': Country.by_code(country_.get('country_code'))}
        cm = CountryMembership.by_rpc(representative, cdata.get('country'))
        if cm is None:
            cm = CountryMembership.create(cdata)
        else:
            cm.update(cdata)

    for action_ in sl.find(engine, sl.get_table(engine, 'action_field'),
                           representative_etl_id=rep['etl_id']):
        if not action_.get('action_field'):
            continue
        af = ActionField.by_action(action_.get('action_field'))
        if af is None:
            af = ActionField.create({'action': action_.get('action_field')})
            db.session.commit()
        adata = {'representative': representative,
                 'status': action_.get('status'),
                 'action': af}
        am = AssociatedAction.by_rpa(representative, af)
        if am is None:
            am = AssociatedAction.create(adata)
            db.session.commit()
        else:
            am.update(adata)

    for interest_ in sl.find(engine, sl.get_table(engine, 'interest'),
                             representative_etl_id=rep['etl_id']):
        if not interest_.get('interest'):
            continue
        interest = Interest.by_interest(interest_.get('interest'))
        if interest is None:
            interest = Interest.create(
                {'interest': interest_.get('interest')})
            db.session.commit()
        # The status comes from the interest row itself; it used to be
        # taken from the last row of the action-field loop above.
        adata = {'representative': representative,
                 'status': interest_.get('status'),
                 'interest': interest}
        ai = AssociatedInterest.by_rpi(representative, interest)
        if ai is None:
            ai = AssociatedInterest.create(adata)
            db.session.commit()
        else:
            ai.update(adata)

    for taglink in sl.find(engine, sl.get_table(engine, 'tags'),
                           representative_id=rep['id']):
        etltag = sl.find_one(engine, sl.get_table(engine, 'tag'),
                             id=taglink['tag_id'])
        tag = upsert_tag(etltag['tag'])
        if tag not in representative.tags:
            representative.tags.append(tag)

    db.session.commit()
def code_categories(engine):
    """Fill ``main_category_id`` for every distinct main category.

    The id is looked up in ``CATEGORIES`` by category name; a name missing
    from that mapping raises ``KeyError``.
    """
    table = sl.get_table(engine, 'representative')
    for cat in sl.distinct(engine, table, 'main_category'):
        category_name = cat['main_category']
        cat['main_category_id'] = CATEGORIES[category_name]
        sl.upsert(engine, table, cat, ['main_category'])
def remap_subcategories(engine):
    """Rename sub categories according to the ``newsubcats`` mapping.

    For every distinct ``sub_category`` that has a non-empty entry in
    ``newsubcats``, all representatives carrying the old name are updated
    to the new one.
    """
    table = sl.get_table(engine, 'representative')
    for cat in sl.distinct(engine, table, 'sub_category'):
        old_name = cat['sub_category']
        new_name = newsubcats.get(old_name)
        if not new_name:
            continue
        sl.update(engine, 'representative',
                  {'sub_category': old_name},
                  {'sub_category': new_name},
                  ensure=False)
def load_representative(engine, rep):
    """Load one ETL representative row and its children into the ORM models.

    ``rep`` is the row from the ETL ``representative`` table; it is
    modified in place (numbers converted, phone/fax joined with their
    prefixes, entity/country/category/person objects attached) and then
    used to create or update the ``Representative``.  Afterwards the
    related ETL rows are loaded: accreditations, financial data with
    turnovers, organisation memberships and country memberships.  Turnover,
    organisation and country rows flagged ``etl_clean = False`` are
    skipped.  The session is committed once at the end.
    """
    entity = upsert_entity(rep.get('canonical_name'),
                           name=rep.get('original_name'),
                           suffix=rep.get('name_suffix'),
                           acronym=rep.get('acronym'))
    assert entity is not None, entity
    assert entity.id is not None, entity
    rep['entity'] = entity

    for key in ('members', 'number_of_natural_persons',
                'number_of_organisations'):
        rep[key] = to_integer(rep[key])
    rep['contact_lat'] = to_float(rep['contact_lat'])
    rep['contact_lon'] = to_float(rep['contact_lon'])
    rep['contact_phone'] = " ".join((rep.get('contact_indic_phone') or '',
                                     rep.get('contact_phone') or '')).strip()
    rep['contact_fax'] = " ".join((rep.get('contact_indic_fax') or '',
                                   rep.get('contact_fax') or '')).strip()
    rep['contact_country'] = Country.by_code(rep['country_code'])

    main_category = upsert_category(rep.get('main_category_id'),
                                    rep.get('main_category'))
    rep['main_category'] = main_category
    rep['sub_category'] = upsert_category(rep.get('sub_category_id'),
                                          rep.get('sub_category'),
                                          main_category)

    # Persons: head and legal go onto the representative itself,
    # accredited persons are linked once the representative exists.
    accreditations = []
    for person_data in sl.find(engine, sl.get_table(engine, 'person'),
                               representative_etl_id=rep['etl_id']):
        person = upsert_person(person_data)
        role = person_data.get('role')
        if role == 'head':
            rep['head'] = person
        if role == 'legal':
            rep['legal'] = person
        if role == 'accredited':
            accreditations.append((person, person_data))

    representative = Representative.by_identification_code(
        rep['identification_code'])
    if representative is None:
        representative = Representative.create(rep)
    else:
        representative.update(rep)

    for person, data_ in accreditations:
        data_['person'] = person
        data_['representative'] = representative
        accreditation = Accreditation.by_rp(person, representative)
        if accreditation is None:
            accreditation = Accreditation.create(data_)
        else:
            accreditation.update(data_)

    fd_integer_keys = (
        'turnover_min', 'turnover_max', 'turnover_absolute',
        'cost_min', 'cost_max', 'cost_absolute',
        'direct_rep_costs_min', 'direct_rep_costs_max',
        'total_budget',
        'public_financing_total', 'public_financing_infranational',
        'public_financing_national',
        'eur_sources_grants', 'eur_sources_procurement',
        'other_sources_donation',
        # Read from its own column; previously this was filled from
        # 'other_sources_donation' by mistake.
        'other_sources_contributions',
        'other_sources_total',
    )
    for fd in sl.find(engine, sl.get_table(engine, 'financial_data'),
                      representative_etl_id=rep['etl_id']):
        for key in fd_integer_keys:
            fd[key] = to_integer(fd.get(key))
        fd['representative'] = representative

        financial_data = FinancialData.by_rsd(representative,
                                              fd.get('start_date'))
        if financial_data is None:
            financial_data = FinancialData.create(fd)
        else:
            financial_data.update(fd)

        for turnover_ in sl.find(engine,
                                 sl.get_table(engine,
                                              'financial_data_turnover'),
                                 representative_etl_id=rep['etl_id'],
                                 financial_data_etl_id=fd['etl_id']):
            if turnover_.get('etl_clean') is False:
                continue
            turnover_['entity'] = upsert_entity(
                turnover_.get('canonical_name'), turnover_.get('name'))
            assert turnover_['entity'] is not None, turnover_['entity']
            turnover_['financial_data'] = financial_data
            turnover_['min'] = to_integer(turnover_.get('min'))
            turnover_['max'] = to_integer(turnover_.get('max'))
            turnover = FinancialTurnover.by_fde(financial_data,
                                                turnover_['entity'])
            if turnover is None:
                turnover = FinancialTurnover.create(turnover_)
            else:
                turnover.update(turnover_)

    for org in sl.find(engine, sl.get_table(engine, 'organisation'),
                       representative_etl_id=rep['etl_id']):
        if org.get('etl_clean') is False:
            continue
        org['number_of_members'] = to_integer(org['number_of_members'])
        organisation = upsert_organisation(org)
        omdata = {'representative': representative,
                  'organisation': organisation}
        om = OrganisationMembership.by_rpo(representative, organisation)
        if om is None:
            om = OrganisationMembership.create(omdata)
        else:
            om.update(omdata)

    for country_ in sl.find(engine,
                            sl.get_table(engine, 'country_of_member'),
                            representative_etl_id=rep['etl_id']):
        if country_.get('etl_clean') is False:
            continue
        cdata = {'representative': representative,
                 'country': Country.by_code(country_.get('country_code'))}
        cm = CountryMembership.by_rpc(representative, cdata.get('country'))
        if cm is None:
            cm = CountryMembership.create(cdata)
        else:
            cm.update(cdata)

    db.session.commit()
def code_subcategories(engine):
    """Fill ``sub_category_id`` for every distinct, non-empty sub category.

    The id comes from ``SUBCATEGORIES``; a name missing from that mapping
    is stored with id None.
    """
    table = sl.get_table(engine, 'representative')
    for cat in sl.distinct(engine, table, 'sub_category'):
        subcategory_name = cat['sub_category']
        if subcategory_name:
            cat['sub_category_id'] = SUBCATEGORIES.get(subcategory_name)
            sl.upsert(engine, table, cat, ['sub_category'])