def create_tasks(engine):
    """Push one pyBossa task per representative that has networking text.

    The app returned by ``setup()`` first gets its task presenter replaced
    with the contents of the bundled ``pbnetworks_template.html`` resource.
    Then every row of the ``representative`` table whose ``networking``
    value is at least three characters long after stripping is sent to
    pyBossa: an existing task with the same signature is updated, otherwise
    a new task is created.

    The signature is the SHA-1 hex digest of ``identification_code``
    concatenated with ``networking`` (non-ASCII characters are dropped
    before hashing).
    """
    log.info("Updating tasks on pyBossa...")
    app = setup()
    template_path = 'resources/pbnetworks_template.html'
    with flask_app.open_resource(template_path) as template:
        app.info['task_presenter'] = template.read()
    pbclient.update_app(app)

    # Index the tasks already known to pyBossa by the signature kept in
    # their 'info' payload; a later task with the same signature wins.
    existing = {}
    for known in pbclient.get_tasks(app.id, limit=30000):
        existing[known.data.get('info').get('signature')] = known

    representatives = sl.get_table(engine, 'representative')
    for rep in sl.all(engine, representatives):
        networking = rep.get('networking')
        if networking is None or len(networking.strip()) < 3:
            # Nothing meaningful to annotate for this representative.
            continue

        raw_signature = rep.get('identification_code') + networking
        digest = sha1(raw_signature.encode('ascii', 'ignore')).hexdigest()
        rep['signature'] = digest

        print([rep.get('name')])
        log.debug("Task: %s", rep['name'])

        # Replace the date values with their ISO-8601 string form before
        # the row is handed to pbclient.
        for date_key in ('last_update_date', 'registration_date'):
            rep[date_key] = rep[date_key].isoformat()

        if digest not in existing:
            pbclient.create_task(app.id, rep)
            continue

        task = existing.get(digest)
        task.data['info'] = rep
        pbclient.update_task(task)
def load(engine):
    """Load every representative row that is not flagged as unclean.

    Each row of the ``representative`` table is logged by name; rows whose
    ``etl_clean`` value is exactly ``False`` are skipped, all others are
    passed to ``load_representative``.
    """
    representatives = sl.get_table(engine, 'representative')
    for rep in sl.all(engine, representatives):
        log.info("Loading: %s", rep.get('name'))
        # Identity check on purpose: only an explicit False skips the row,
        # other values (including None) are still loaded.
        if rep['etl_clean'] is not False:
            load_representative(engine, rep)
        else:
            log.debug("Skipping!")
def load(engine):
    """Load every row of the ``representative`` table.

    Each row is logged together with its zero-based position in the result
    set and then handed to ``load_representative``.
    """
    representatives = sl.get_table(engine, 'representative')
    rows = sl.all(engine, representatives)
    for position, rep in enumerate(rows):
        log.info("Loading(%s): %s", position, rep.get('name'))
        # NOTE: the row's 'etl_clean' flag is not consulted here, so every
        # row is loaded.
        load_representative(engine, rep)
def transform(engine):
    """Geo-code representatives that do not have a longitude yet.

    For every row of the ``representative`` table without a truthy
    ``contact_lon``, the contact address fields are sent as query
    parameters to the service at ``URL`` (presumably a Nominatim-style
    search endpoint -- confirm against the ``URL`` constant). The first
    match's ``display_name``, ``lon`` and ``lat`` are stored on the row as
    ``contact_geoname``, ``contact_lon`` and ``contact_lat``, keyed on
    ``id``.

    Rows are skipped, without a database write, when the response body is
    not valid JSON, is not a list, or is an empty list.
    """
    log.info("Geo-coding representatives...")
    table = sl.get_table(engine, 'representative')
    for row in sl.all(engine, table):
        if row.get('contact_lon'):
            # Already geo-coded on an earlier run.
            continue

        query = {
            'format': 'json',
            'limit': 1,
            'city': row.get('contact_town'),
            'street': row.get('contact_street'),
            'country': row.get('contact_country'),
            'postalcode': row.get('contact_post_code'),
        }
        response = requests.get(URL, params=query)

        try:
            results = response.json()
        except ValueError:
            # Only a JSON decoding failure is expected here; a bare
            # ``except`` would also hide KeyboardInterrupt and real bugs.
            log.warning("No JSON in geocoder response for: %s",
                        row.get('name'))
            continue

        # A JSON object (e.g. an error payload) cannot be indexed with [0],
        # and an empty list means the service found no match.
        if not isinstance(results, list) or not results:
            continue

        geo = results[0]
        log.info("%s @ %s", row.get('name'), geo.get('display_name'))
        out = {
            'id': row['id'],
            'contact_geoname': geo.get('display_name'),
            'contact_lon': geo.get('lon'),
            'contact_lat': geo.get('lat'),
        }
        sl.upsert(engine, table, out, ['id'])
def dedup_fields(engine, field):
    """Label representatives that share the same value in ``field``.

    For every distinct value of ``field`` in the ``representative`` table
    that is carried by more than one row, each of those rows gets a
    ``name_suffix`` of the form ``"(Duplicate N)"``, where N is the row's
    1-based position in the result of ``sl.find``. Rows are addressed by
    their ``identification_code``.

    :param engine: database engine handed through to the ``sl`` helpers.
    :param field: name of the column whose values are compared.
    """
    table = sl.get_table(engine, 'representative')
    # Values already handled. Every member of a duplicate group would
    # otherwise repeat the same lookup and rewrite the same suffixes.
    seen = set()
    for rep in sl.all(engine, table):
        value = rep[field]
        if value in seen:
            continue
        seen.add(value)

        others = list(sl.find(engine, table, **{field: value}))
        if len(others) <= 1:
            continue

        log.info("Duplicates for: %s", rep['name'])
        for position, other in enumerate(others, 1):
            sl.upsert(
                engine,
                table,
                {
                    'name_suffix': "(Duplicate %s)" % position,
                    'identification_code': other['identification_code'],
                },
                ['identification_code'],
            )
def dedup_fields(engine, field):
    """Label representatives that duplicate another row's ``field`` value.

    Rows of the ``representative`` table are scanned once. The first row
    seen with a given non-blank value is left alone; every other row that
    ``sl.find`` returns for the same value gets a ``name_suffix`` of the
    form ``"(Duplicate N)"``, where N is its 1-based position in the find
    result. Rows are addressed by their ``identification_code``. Empty or
    whitespace-only values are never treated as duplicates.

    A progress line is printed every 100 rows.

    :param engine: database engine handed through to the ``sl`` helpers.
    :param field: name of the (string-valued) column whose values are
        compared.
    """
    table = sl.get_table(engine, 'representative')
    seen = set()
    for n, rep in enumerate(sl.all(engine, table)):
        if n % 100 == 0:
            # Same output as the Python 2 statement ``print n, 'done'``,
            # but also valid on Python 3.
            print("%s done" % n)

        value = rep[field]
        if not value or not value.strip() or value in seen:
            continue
        # ``add`` stores the value itself. ``update`` would iterate the
        # string and store its individual characters, so the ``in seen``
        # test above would not recognise values that were already handled.
        seen.add(value)

        others = list(sl.find(engine, table, **{field: value}))
        if len(others) <= 1:
            continue

        log.info("Duplicates for: %s", rep['name'])
        for i, other in enumerate(others):
            if other == rep:
                # Keep the row currently being scanned unlabelled.
                continue
            text = "(Duplicate %s)" % (i + 1)
            sl.upsert(
                engine,
                table,
                {
                    'name_suffix': text,
                    'identification_code': other['identification_code'],
                },
                ['identification_code'],
            )
def load(engine):
    """Load every row of the ``meeting`` table.

    Each row is logged together with its zero-based position in the result
    set and then handed to ``load_meeting``.
    """
    meetings = sl.get_table(engine, 'meeting')
    rows = sl.all(engine, meetings)
    for position, meet in enumerate(rows):
        log.info("Loading(%s): %s", position, meet.get('name'))
        load_meeting(engine, meet)