def DeletePIIColumns(table_name, column_name, verbose=True):
    '''Deleting columns that contain personal identifiable information.

    SQLite has no DROP COLUMN, so the table is rebuilt without
    `column_name` via a backup table. Returns True on success,
    False on any failure.
    '''
    print('%s Deleting PII column `%s` on table `%s`.' % (
        item('prompt_bullet').decode('utf-8'), column_name, table_name))

    #
    # Fetch keys from column in database.
    #
    try:
        c = scraperwiki.sqlite.execute(
            'select * from {table_name} limit 1'.format(
                table_name=table_name))['keys']
    except Exception as e:
        print('%s Could not connect with database.' % item('prompt_error'))
        if verbose:
            print(e)
        return False

    #
    # Copy data into backup table without
    # PII column, then copy it back.
    # From: http://stackoverflow.com/questions/10660435/pythonic-way-to-create-a-long-multi-line-string
    #
    columns = ','.join([t for t in c if t != column_name])
    sql_statements = [
        'CREATE TABLE {table_name}_backup({columns});'.format(
            table_name=table_name, columns=columns),
        'INSERT INTO {table_name}_backup SELECT {columns} FROM {table_name};'
        .format(table_name=table_name, columns=columns),
        'DROP TABLE {table_name};'.format(table_name=table_name),
        'CREATE TABLE {table_name}({columns});'.format(
            table_name=table_name, columns=columns),
        'INSERT INTO {table_name} SELECT {columns} FROM {table_name}_backup;'
        .format(table_name=table_name, columns=columns),
        # Fix: dropped the extraneous `columns=` kwarg the original passed
        # to a format string with no {columns} placeholder.
        'DROP TABLE {table_name}_backup;'.format(table_name=table_name)
    ]
    try:
        # Fix: removed the dead `i` counter and commented-out progress print.
        for sql in sql_statements:
            scraperwiki.sqlite.execute(sql)
            scraperwiki.sqlite._State.new_transaction()  # commit each step

    except Exception as e:
        print('%s Could not delete column `%s` from table `%s`.' % (
            item('prompt_error'), column_name, table_name))
        if verbose:
            print(e)
        return False

    return True
def ConvertEpochDates(table_name, column_name, verbose=False):
    '''Convert timestamps from /Date(1420449209053)/ to 2015-01-05T09:13:29'''
    print('%s Cleaning epoch dates for table `%s` and field `%s`.' % (
        item('prompt_bullet').decode('utf-8'), table_name, column_name))

    #
    # Collect data from database.
    #
    sql = 'select * from %s' % table_name
    try:
        result_proxy = scraperwiki.sqlite.execute(sql)['data']
        data = [dict(row.items()) for row in result_proxy]
    except Exception as e:
        print('%s Could not collect data from database.' % item('prompt_error'))
        # Fix: the original returned before this verbose print,
        # leaving it as unreachable dead code.
        if verbose:
            print(e)
        return False

    #
    # Converting dates.
    #
    for record in data:
        d = record[column_name]
        # Skip NULL values; only convert populated cells.
        if d is not None:
            try:
                # Extract the millisecond epoch between the parentheses.
                epoch_ms = int(d[d.find("(") + 1:d.find(")")])
                dt = datetime.datetime.utcfromtimestamp(epoch_ms // 1000)
                iso_format = dt.isoformat()
                record[column_name] = iso_format
            except Exception as e:
                print('%s Regex did not quite work on table `%s`. Aborting.' % (
                    item('prompt_warn'), table_name))
                if verbose:
                    print(e)
                return False
            # Fix: report inside the not-None branch so `iso_format`
            # is always bound when printed.
            if verbose:
                print('%s Date: %s' % (item('prompt_bullet'), iso_format))

    #
    # Store data.
    #
    StoreRecords(data=data, table=table_name)
def ConvertEpochDates(table_name, column_name, verbose=False):
    '''Convert timestamps from /Date(1420449209053)/ to 2015-01-05T09:13:29'''
    print('%s Cleaning epoch dates for table `%s` and field `%s`.' % (
        item('prompt_bullet').decode('utf-8'), table_name, column_name))

    #
    # Collect data from database.
    #
    sql = 'select * from %s' % table_name
    try:
        result_proxy = scraperwiki.sqlite.execute(sql)['data']
        data = [dict(row.items()) for row in result_proxy]
    except Exception as e:
        print('%s Could not collect data from database.' % item('prompt_error'))
        # Fix: the original returned before this verbose print,
        # leaving it as unreachable dead code.
        if verbose:
            print(e)
        return False

    #
    # Converting dates.
    #
    for record in data:
        d = record[column_name]
        # Skip NULL values; only convert populated cells.
        if d is not None:
            try:
                # Extract the millisecond epoch between the parentheses.
                epoch_ms = int(d[d.find("(") + 1:d.find(")")])
                dt = datetime.datetime.utcfromtimestamp(epoch_ms // 1000)
                iso_format = dt.isoformat()
                record[column_name] = iso_format
            except Exception as e:
                print('%s Regex did not quite work on table `%s`. Aborting.' % (
                    item('prompt_warn'), table_name))
                if verbose:
                    print(e)
                return False
            # Fix: report inside the not-None branch so `iso_format`
            # is always bound when printed.
            if verbose:
                print('%s Date: %s' % (item('prompt_bullet'), iso_format))

    #
    # Store data.
    #
    StoreRecords(data=data, table=table_name)
def CollectPackageActivityData(limit=None, verbose=False):
    '''Collect package activity data from HDX.

    Fetches the recently-changed-packages activity stream and stores one
    flattened record per revision in the `package_activity_data` table.
    Returns False on HTTP failure.
    '''
    print('%s Querying HDX for package activity stream. Limit is %s.' % (
        item('prompt_bullet'), str(limit)))

    #
    # Building URL.
    #
    u = 'https://data.hdx.rwlabs.org/api/action/recently_changed_packages_activity_list?limit=' + str(limit)
    r = requests.get(u)

    #
    # Checking the status code.
    #
    if r.status_code != requests.codes.ok:
        print('%s HDX did not responde with a positive HTTP code.' % item('prompt_error'))
        # Fix: `verbose` was not a parameter in the original, so this
        # branch raised NameError instead of reporting the status code.
        if verbose:
            print(r.status_code)
        return False

    else:
        #
        # Iterating over results.
        #
        records = []
        results = r.json()['result']
        widgets = [item('prompt_bullet'), ' Collecting country data:',
                   pb.Percentage(), ' ', pb.Bar('-'), ' ', pb.ETA(), ' ']
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(results)).start()
        for i, result in enumerate(results, 1):
            package = result['data']['package']
            records.append({
                'user_id': result['user_id'],
                'timestamp': result['timestamp'],
                'revision_id': result['revision_id'],
                'dataset_id': package['name'],
                'owner_org': package['owner_org'],
                'activity_type': result['activity_type']
            })
            pbar.update(i)

        #
        # Store records in database.
        #
        pbar.finish()
        StoreRecords(data=records, table='package_activity_data')
def DeletePIIColumns(table_name, column_name, verbose=True):
    '''Deleting columns that contain personal identifiable information.

    SQLite has no DROP COLUMN, so the table is rebuilt without
    `column_name` via a backup table. Returns True on success,
    False on any failure.
    '''
    print('%s Deleting PII column `%s` on table `%s`.' % (
        item('prompt_bullet').decode('utf-8'), column_name, table_name))

    #
    # Fetch keys from column in database.
    #
    try:
        c = scraperwiki.sqlite.execute(
            'select * from {table_name} limit 1'.format(
                table_name=table_name))['keys']
    except Exception as e:
        print('%s Could not connect with database.' % item('prompt_error'))
        if verbose:
            print(e)
        return False

    #
    # Copy data into backup table without
    # PII column, then copy it back.
    # From: http://stackoverflow.com/questions/10660435/pythonic-way-to-create-a-long-multi-line-string
    #
    columns = ','.join([t for t in c if t != column_name])
    sql_statements = [
        'CREATE TABLE {table_name}_backup({columns});'.format(
            table_name=table_name, columns=columns),
        'INSERT INTO {table_name}_backup SELECT {columns} FROM {table_name};'
        .format(table_name=table_name, columns=columns),
        'DROP TABLE {table_name};'.format(table_name=table_name),
        'CREATE TABLE {table_name}({columns});'.format(
            table_name=table_name, columns=columns),
        'INSERT INTO {table_name} SELECT {columns} FROM {table_name}_backup;'
        .format(table_name=table_name, columns=columns),
        # Fix: dropped the extraneous `columns=` kwarg the original passed
        # to a format string with no {columns} placeholder.
        'DROP TABLE {table_name}_backup;'.format(table_name=table_name)
    ]
    try:
        # Fix: removed the dead `i` counter and commented-out progress print.
        for sql in sql_statements:
            scraperwiki.sqlite.execute(sql)
            scraperwiki.sqlite._State.new_transaction()  # commit each step

    except Exception as e:
        print('%s Could not delete column `%s` from table `%s`.' % (
            item('prompt_error'), column_name, table_name))
        if verbose:
            print(e)
        return False

    return True
def FetchSystemArguments():
    '''Fetching arguments from the command line interface.

    Returns a dict of configuration values built from sys.argv,
    or False when arguments are missing or empty.
    '''
    try:
        arguments = {
            'api_key': sys.argv[1],
            'json_path': sys.argv[2],
            'download_temp_path': sys.argv[3],
            'stag_url': 'https://test-data.hdx.rwlabs.org',
            'prod_url': 'https://data.hdx.rwlabs.org'
        }
    except IndexError:
        print('%s Not all arguments provided.' % item('prompt_error'))
        return False

    #
    # Checking that all arguments have been provided.
    # Fix: the original iterated the dict's keys (strings) and compared
    # them to None, so empty values were never detected, and the error
    # print called `.keys()` on a string.
    #
    for name, value in arguments.items():
        if value is None:
            print('Argument %s is empty. That argument is necessary.' % name)
            return False

    return arguments
def FetchSystemArguments():
    '''Fetching arguments from the command line interface.

    Returns a dict of configuration values built from sys.argv,
    or False when arguments are missing or empty.
    '''
    try:
        arguments = {
            'api_key': sys.argv[1],
            'json_path': sys.argv[2],
            'download_temp_path': sys.argv[3],
            'stag_url': 'https://test-data.hdx.rwlabs.org',
            'prod_url': 'https://data.hdx.rwlabs.org'
        }
    except IndexError:
        print('%s Not all arguments provided.' % item('prompt_error'))
        return False

    #
    # Checking that all arguments have been provided.
    # Fix: the original iterated the dict's keys (strings) and compared
    # them to None, so empty values were never detected, and the error
    # print called `.keys()` on a string.
    #
    for name, value in arguments.items():
        if value is None:
            print('Argument %s is empty. That argument is necessary.' % name)
            return False

    return arguments
def DefineSchema(file_name, verbose=False):
    '''Defining the schema to use. Does type-guessing.

    Reads the header of the CSV at `file_name` and returns a
    {'fields': [{'id': ..., 'type': 'text'}, ...]} dict, or False
    when the file cannot be read.
    '''
    #
    # Reading downloaded CSV file.
    #
    try:
        reader = csv.DictReader(open(file_name))
    except Exception as e:
        print('%s There was an error reading resource.' % item('prompt_error'))
        if verbose:
            print(e)
        # Fix: the original fell through after the error and crashed with
        # a NameError on `reader`; fail explicitly instead.
        return False

    #
    # Building the schema from the first row's keys.
    # Fix: `keys` defaults to empty so an empty CSV yields an
    # empty schema instead of a NameError.
    #
    keys = []
    for row in reader:
        keys = row.keys()
        break

    schema = {'fields': []}
    for key in keys:
        # DataStores need lower-case field names.
        schema['fields'].append({'id': key.lower(), 'type': 'text'})
    return schema
def CreateDatastore(ckan_url, api_key, json_path, resource_id, file_name, resource, verbose=False):
    '''Creating a CKAN DataStore.

    Deletes any existing DataStore for the resource, recreates it with the
    schema carried in `resource`, then upserts the CSV contents in chunks.
    Returns False if the old DataStore could not be removed.
    '''
    #
    # Configuring the remote CKAN instance.
    #
    ckan = ckanapi.RemoteCKAN(ckan_url, apikey=api_key)
    if DeleteDatastore(ckan_url=ckan_url, api_key=api_key,
                       ckan_resource_id=resource_id) is False:
        return False

    #
    # Creating a DataStore.
    #
    ckan.action.datastore_create(
        resource_id=resource_id,
        force=True,
        fields=resource['schema']['fields'],
        primary_key=resource['schema'].get('primary_key')
    )

    #
    # Reading CSV file and inserting data.
    #
    reader = csv.DictReader(open(file_name))
    rows = [row for row in reader]

    #
    # Hack for managing different encoding data.
    # Fix: `len(json_path) is 36` compared identity, not equality — it
    # only worked because CPython caches small ints. 36 is the length
    # of a bare resource id.
    #
    if len(json_path) == 36:
        rows_decoded = [
            {key.lower(): row[key].decode('utf-8') for key in row.keys()}
            for row in rows
        ]
    else:
        rows_decoded = [
            {key: row[key].decode('utf-8') for key in row.keys()}
            for row in rows
        ]

    #
    # Sending N records at a time.
    #
    offset = 0
    chunksize = 500  # N rows per POST request.
    while offset < len(rows_decoded):
        rowset = rows_decoded[offset:offset + chunksize]
        ckan.action.datastore_upsert(
            resource_id=resource_id,
            force=True,
            method='insert',
            records=rowset)
        offset += chunksize
        complete = str(float(offset) / len(rows_decoded) * 100)[:4] + '%'
        print('%s Update successful: %s completed' % (item('prompt_bullet'), complete))
def DownloadResourceFromHDX(ckan_url, file_name, resource_id, api_key, verbose=True):
    '''Downloading a resource from CKAN based on its id.

    Resources need to be downloaded in order to be correctly parsed by
    the CreateDatastore function. Returns False on failure.
    '''
    print("%s Downloading resource file from HDX." % item('prompt_bullet'))
    headers = {
        'Authorization': api_key,
        'user-agent': 'HDX-Script/v.0.1.0'
    }

    #
    # Querying the resource metadata for its download URL.
    # NOTE(review): basic-auth credentials are hard-coded here — consider
    # moving them to the secrets configuration.
    #
    url = ckan_url + '/api/action/resource_show?id=' + resource_id
    r = requests.get(url, headers=headers, auth=('dataproject', 'humdata'))
    doc = r.json()
    if doc['success'] is False:
        if verbose:
            print(json.dumps(doc))
        print('%s Failed to read resource.' % item('prompt_error'))
        return False

    else:
        resource_file_url = doc["result"]["url"]

        #
        # Downloading in 1 KiB blocks.
        #
        try:
            with open(file_name, 'wb') as handle:
                response = requests.get(resource_file_url, stream=True,
                                        headers=headers,
                                        auth=('dataproject', 'humdata'))
                if not response.ok:
                    print('%s Error: attempt to download resource failed.' % item('prompt_error'))
                    # Fix: the original's bare `return` made this failure
                    # indistinguishable from success; return False like
                    # every other error path.
                    return False
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
        except Exception as e:
            print('%s There was an error downlaoding the file.' % item('prompt_error'))
            if verbose:
                print(e)
            return False
def Main(verbose=True):
    '''Wrapper to run all the scheduled tasks.'''
    if verbose:
        print('%s Running scheduler.' % item('prompt_bullet'))

    # Poll the scheduler forever, checking for due jobs once per second.
    while True:
        schedule.run_pending()
        time.sleep(1)
def CreateRandomFileName(length, extension):
    '''Create a random, hash-based file name.

    Returns `length` hex characters of a SHA-1 digest of a random number,
    followed by `extension`; returns False on bad parameter types.
    '''
    #
    # Sanity check.
    # Fix: use isinstance() instead of exact type comparison.
    #
    if not isinstance(length, int):
        print('%s Provide an integer for the length parameter. %s provided.' % (item('prompt_error'), type(length)))
        return False

    if not isinstance(extension, str):
        print('%s Provide an string for the extension parameter. %s provided' % (item('prompt_error'), type(extension)))
        return False

    #
    # Creating an unique file name.
    # NOTE(review): random.random() is not cryptographically secure —
    # fine for temp file names, not for anything security-sensitive.
    #
    file_name = h.sha1(str(r.random())).hexdigest()[0:length] + extension
    return file_name
def CollectDatasetData(verbose=False):
    '''Collect data about all the datasets.

    NOTE(review): the original built the `package_list` URL and then
    immediately overwrote it with `group_list`; the dead assignment was
    removed — confirm `group_list` is really the intended endpoint, since
    the docstring talks about datasets.
    '''
    print('%s Querying HDX for country activity stream.' % item('prompt_bullet'))

    #
    # Building URL.
    #
    u = 'https://data.hdx.rwlabs.org/api/action/group_list'
    r = requests.get(u)

    #
    # Checking the status code.
    #
    if r.status_code != requests.codes.ok:
        print('%s HDX did not responde with a positive HTTP code.' % item('prompt_error'))
        # Fix: `verbose` was undefined in the original, raising NameError.
        if verbose:
            print(r.status_code)
        return False
def Main():
    '''Wrapper.'''
    resource_ids = [
        '68788137-84d6-4e9d-87f1-f23f71ec705f',
        '89a28fad-a862-4133-82cd-9ef5f1938f38',
        '891b778a-9657-4eda-91b9-de9b41240a90',
        'f28518b2-afed-47b2-a805-c36eb3b18dcf',
        'de3eb9fa-fea8-4dec-8119-1814552198b3',
        '61995548-93ab-4927-b760-faf5239d32a9',
        '735b3f3a-eef6-4eb4-8b43-9307bac3177c',
        '831ef44b-d7eb-4c03-b152-ce5dce174626',
        '860fd6e4-589b-49c0-b1cf-fb9c0528d391'
    ]
    total = len(resource_ids)

    # Build a DataStore for each known resource, reporting progress.
    for position, resource_id in enumerate(resource_ids, start=1):
        print('%s Creating HDX DataStore %s / %s' % (item('prompt_bullet'), str(position), str(total)))
        CreateDatastoresFromResourceID(resource_id)
        print('%s Successfully created datastore for resource id %s' % (item('prompt_success'), resource_id))
def LoadLocalJson(json_path, verbose=False):
    '''Loading resources from a local json file.'''
    try:
        with open(json_path) as json_file:
            resources = json.load(json_file)

        # Warn when the JSON holds no resources at all; the (possibly
        # empty) payload is still returned to the caller.
        if len(resources) < 1:
            print('%s Json looks odd! Please revise.' % item('prompt_error'))

        return resources

    except Exception as e:
        print('%s Could not load local JSON: `%s`' % (item('prompt_error'), json_path))
        if verbose:
            print(e)
        return False
def FetchData(endpoint):
    '''Fetch data from specific endpoint.

    Returns the decoded JSON payload, or None when the request
    does not return HTTP 200.
    '''
    u = endpoint["url"]
    r = requests.get(u)
    if r.status_code != 200:
        print("%s Query returned error code: %s" % (item('prompt_error'), r))
        return
    else:
        # Fix: the original bound the payload to the name `json`,
        # shadowing the json module.
        payload = r.json()
        return payload
def CreateTables():
    '''Creating the tables of the new database.

    Builds one CREATE TABLE IF NOT EXISTS statement per configured
    endpoint (every column typed TEXT) and executes them.
    Returns True on success, False on the first failure.
    '''
    endpoints = Config.LoadConfig()
    sql_statements = {}
    for endpoint in endpoints:
        table_name = endpoint["table_name"]
        statement = " TEXT, ".join(endpoint['table_schema'])
        statement = 'CREATE TABLE IF NOT EXISTS %s(%s TEXT)' % (table_name, statement)
        sql_statements[table_name] = statement

    for table in sql_statements:
        try:
            # Fix: dropped the unused `query =` binding.
            scraperwiki.sqlite.execute(sql_statements[table])
            print("%s table `%s` created." % (item('prompt_bullet'), str(table)))
        except Exception as e:
            print(e)
            return False

    print("%s Database created successfully." % item('prompt_success'))
    return True
def LoadConfig(j='prod.json', verbose=True):
    '''Load configuration parameters.'''
    # `dir` is expected to be a module-level path defined elsewhere
    # in this file; the config directory sits next to it.
    data_dir = os.path.join(os.path.split(dir)[0], 'config')
    try:
        config_path = os.path.join(data_dir, j)
        with open(config_path) as json_file:
            config = json.load(json_file)
    except Exception as e:
        print("%s Couldn't load configuration." % item('prompt_error'))
        if verbose:
            print(e)
        return False

    return config
def StoreRecords(data, table, verbose=False, db_lock_time=None):
    '''Store records in a ScraperWiki database.

    Replaces the whole contents of `table` with `data`. `db_lock_time`
    (seconds) adds a sleep before each write to let a locked database
    free up. Returns False for an unknown table name; otherwise None.
    '''
    schemas = Config.LoadConfig()
    table_names = []
    for schema in schemas:
        table_names.append(schema["table_name"])

    # Only tables declared in the configuration may be written to.
    if table not in table_names:
        print "%s select one of the following tables: %s." % (
            item('prompt_error'), ", ".join(table_names))
        return False

    try:
        tables = scraperwiki.sqlite.show_tables()
        if table in tables.keys():
            # Count the rows about to be replaced, for reporting.
            old_records = scraperwiki.sqlite.execute(
                "select count(*) from %s" % table)["data"][0][0]

            #
            # Waiting to unlock database.
            #
            if db_lock_time:
                print '%s Waiting for database to unlock (%s seconds).' % (
                    item('prompt_bullet'), db_lock_time)
                time.sleep(db_lock_time)

            # Full refresh: delete every existing row before re-saving.
            delete_statement = "DELETE FROM %s" % table
            scraperwiki.sqlite.execute(delete_statement)
            scraperwiki.sqlite._State.new_transaction()  # closing connection
            print "%s Deleting %s records from database table: %s" % (
                item('prompt_bullet').decode('utf-8'), old_records, table)

        #
        # Waiting to unlock database.
        #
        if db_lock_time:
            print '%s Waiting for database to unlock (%s seconds).' % (
                item('prompt_bullet'), db_lock_time)
            time.sleep(db_lock_time)

        # NOTE(review): `schema` here is the leftover loop variable from the
        # table_names loop above (the *last* configured schema), not the one
        # matching `table` — confirm scraperwiki.sqlite.save expects this.
        scraperwiki.sqlite.save(schema, data, table_name=table)
        print "%s Storing record %s in database." % (
            item('prompt_bullet').decode('utf-8'), len(data))
        # Before storing check that the record exists in database.
    except Exception as e:
        print "%s Failed to store record in database." % item('prompt_error')
        print e
def DeleteDatastore(ckan_url, api_key, ckan_resource_id, verbose=False):
    '''Delete a CKAN DataStore.

    Deletion failure (e.g. the DataStore does not exist yet) is reported
    as a warning rather than raised, so callers can proceed to create a
    fresh DataStore.
    '''
    #
    # Configuring the remote CKAN instance.
    #
    ckan = ckanapi.RemoteCKAN(ckan_url, apikey=api_key)
    try:
        ckan.action.datastore_delete(resource_id=ckan_resource_id, force=True)

    #
    # If DataStore doesn't exist
    # print warning, but let it pass.
    # Fix: removed the redundant trailing `pass` statement.
    #
    except Exception as e:
        if verbose:
            print(e)
        print('%s Old DataStore could not be deleted.' % item('prompt_warn'))
def StoreRecords(data, table, verbose=False, db_lock_time=None):
    '''Store records in a ScraperWiki database.

    Replaces the whole contents of `table` with `data`. `db_lock_time`
    (seconds) adds a sleep before each write to let a locked database
    free up. Returns False for an unknown table name; otherwise None.
    '''
    schemas = Config.LoadConfig()
    table_names = []
    for schema in schemas:
        table_names.append(schema["table_name"])

    # Only tables declared in the configuration may be written to.
    if table not in table_names:
        print "%s select one of the following tables: %s." % (
            item('prompt_error'), ", ".join(table_names))
        return False

    try:
        tables = scraperwiki.sqlite.show_tables()
        if table in tables.keys():
            # Count the rows about to be replaced, for reporting.
            old_records = scraperwiki.sqlite.execute(
                "select count(*) from %s" % table)["data"][0][0]

            #
            # Waiting to unlock database.
            #
            if db_lock_time:
                print '%s Waiting for database to unlock (%s seconds).' % (
                    item('prompt_bullet'), db_lock_time)
                time.sleep(db_lock_time)

            # Full refresh: delete every existing row before re-saving.
            delete_statement = "DELETE FROM %s" % table
            scraperwiki.sqlite.execute(delete_statement)
            scraperwiki.sqlite._State.new_transaction()  # closing connection
            if verbose:
                print "%s Deleting %s records from database table: %s" % (
                    item('prompt_bullet').decode('utf-8'), old_records, table)

        #
        # Waiting to unlock database.
        #
        if db_lock_time:
            print '%s Waiting for database to unlock (%s seconds).' % (
                item('prompt_bullet'), db_lock_time)
            time.sleep(db_lock_time)

        # NOTE(review): `schema` here is the leftover loop variable from the
        # table_names loop above (the *last* configured schema), not the one
        # matching `table` — confirm scraperwiki.sqlite.save expects this.
        scraperwiki.sqlite.save(schema, data, table_name=table)
        if verbose:
            print "%s Storing record %s in database." % (
                item('prompt_bullet').decode('utf-8'), len(data))
        # Before storing check that the record exists in database.
    except Exception as e:
        print "%s Failed to store record in database." % item('prompt_error')
        print e
def CollectCountryActivityData(verbose=False):
    '''Collecting country activity data from HDX.

    Lists all country groups, fetches each group's datasets, and stores
    one record per (country, dataset) pair in the
    `organization_activity_data` table. Returns False on HTTP failure.
    '''
    print('%s Querying HDX for country activity stream.' % item('prompt_bullet'))

    #
    # Building URL.
    #
    u = 'https://data.hdx.rwlabs.org/api/action/group_list'
    r = requests.get(u)

    #
    # Checking the status code.
    #
    if r.status_code != requests.codes.ok:
        print('%s HDX did not responde with a positive HTTP code.' % item('prompt_error'))
        # Fix: `verbose` was undefined in the original (NameError).
        if verbose:
            print(r.status_code)
        return False

    else:
        #
        # Iterating over each country.
        #
        country_activity = []
        countries = r.json()['result']
        widgets = [item('prompt_bullet'), ' Parsing country list:',
                   pb.Percentage(), ' ', pb.Bar('-'), ' ', pb.ETA(), ' ']
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(countries)).start()
        for i, country in enumerate(countries, 1):
            u = 'https://data.hdx.rwlabs.org/api/action/group_show?id=' + country
            r = requests.get(u)

            #
            # Checking the status code.
            #
            if r.status_code != requests.codes.ok:
                print('%s HDX did not responde with a positive HTTP code.' % item('prompt_error'))
                if verbose:
                    print(r.status_code)
                return False

            else:
                country_data = r.json()['result']

                #
                # One record for every dataset of the country.
                #
                for dataset in country_data['packages']:
                    country_activity.append({
                        'country_id': country_data['id'],
                        'country_name': country_data['display_name'],
                        'dataset_id': dataset['id'],
                        'dataset_owner_org': dataset['organization']['name'],
                        'dataset_date_created': dataset['metadata_created']
                    })

                #
                # Updating progess bar.
                #
                pbar.update(i)

        #
        # Store records in database.
        #
        pbar.finish()
        StoreRecords(data=country_activity, table='organization_activity_data', verbose=False)
def CreateDatastoresFromResourceID(resource_id, system_arguments=False, verbose=False):
    '''Create a datastore from a resource ID.

    Either collect system arguments if run from the command line or do
    type-guessing for the creation of schemas on the fly. Returns False
    when DataStore creation fails for a batch resource.
    '''
    #
    # Fetching arguments and configuring the script.
    #
    if system_arguments:
        p = FetchSystemArguments()
    else:
        #
        # Manual input of function parameters.
        #
        p = {
            'json_path': resource_id,
            'prod_url': 'https://data.hdx.rwlabs.org/',
            'download_temp_path': os.path.join(
                'tool', 'data',
                CreateRandomFileName(length=5, extension='.csv')),
            'api_key': LoadLocalJson(os.path.join(
                os.path.split(dir)[0], 'config', 'secrets.json'))['hdx_key']
        }

    api_key = p['api_key']
    ckan_url = p['prod_url']
    download_temp_path = p['download_temp_path']

    #
    # Loading resource information.
    # Fix: `len(...) is 36` compared identity, not equality; use `==`.
    #
    if len(p['json_path']) == 36:  # length of resource ID.
        resources = [{'resource_id': p['json_path']}]

        # Download file.
        DownloadResourceFromHDX(
            ckan_url=ckan_url,
            file_name=download_temp_path,
            resource_id=resources[0]['resource_id'],
            api_key=api_key
        )

        # Create schema.
        resources[0]['schema'] = DefineSchema(download_temp_path)
        resources[0]['indexes'] = []

        # Create datastore.
        CreateDatastore(
            ckan_url=ckan_url,
            api_key=api_key,
            json_path=p['json_path'],
            file_name=download_temp_path,
            resource_id=resource_id,
            resource=resources[0]
        )

        #
        # Delete temporary file.
        #
        print('%s Cleaning temp file.' % item('prompt_bullet'))
        os.remove(download_temp_path)

    else:
        resources = LoadLocalJson(p['json_path'])

        #
        # Iterating over each resource provided.
        #
        for r in resources:
            resource_id = r['resource_id']
            print('%s Creating DataStore for resource id: %s' % (item('prompt_bullet'), resource_id))
            try:
                DownloadResourceFromHDX(
                    ckan_url=ckan_url,
                    file_name=download_temp_path,
                    resource_id=resource_id,
                    api_key=api_key
                )
                CreateDatastore(
                    ckan_url=ckan_url,
                    api_key=api_key,
                    json_path=p['json_path'],
                    file_name=download_temp_path,
                    resource_id=resource_id,
                    resource=r
                )
            except Exception as e:
                print('%s DataStore creation failed.' % item('prompt_error'))
                # Fix: `verbose` was undefined in the original (NameError).
                if verbose:
                    print(e)
                return False