def DownloadResource(url, default_dir='data', verbose=True):
    '''Downloading a resource from HDX.'''
    #
    # Assemble file path.
    #
    file_name = os.path.basename(url)
    file_path = os.path.join(default_dir, file_name)
    if verbose:
        print '%s Downloading resource %s' % (item('prompt_bullet'), file_name)

    #
    # Make request and stream the file to disk.
    #
    try:
        r = requests.get(url)
        if r.status_code == 200:
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(1024):  # stream in 1 KB chunks
                    f.write(chunk)

            if verbose:
                print '%s File %s was downloaded successfully.' % (item('prompt_bullet'), file_name)

        return True

    except Exception as e:
        if verbose:
            print e
        print '%s File %s failed to download.' % (item('prompt_error'), file_name)
        return False
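#
# A minimal usage sketch. The URL is a hypothetical placeholder;
# the call assumes `os`, `requests` and the `item` prompt helper
# used above are in scope.
#
if __name__ == '__main__':
    DownloadResource('https://example.com/admin_units.zip', default_dir='data')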
def CollectAndStoreGaulData(csv_name, db_table='Gaul', verbose=True):
    '''Use a CSV file to store the WFP-modified GAUL on a local database.'''
    print '%s Storing GAUL database in DB (~5 mins).' % item('prompt_bullet')

    #
    # Data dir: the GAUL CSV lives under `config`
    # one level above this script.
    #
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.split(script_dir)[0]
    gaul_location = os.path.join(data_dir, 'config', csv_name)

    #
    # Storing GAUL on database. Rows are collected
    # first, then stored as a single batch.
    #
    try:
        with open(gaul_location) as csv_file:
            data = csv.DictReader(csv_file)
            records = []
            for row in data:
                records.append(row)

        StoreRecords(records, db_table, verbose=True)

    except Exception as e:
        print "%s Failed to store GAUL database in DB." % item('prompt_error')
        if verbose:
            print e
        return False
def StoreRecords(data, table, verbose=False):
    '''Store records in a ScraperWiki database.'''
    #
    # Available schemas: the field lists used as
    # unique keys for each supported table.
    #
    schemas = {
        'FCS': ["ADM0_ID", "ADM5_ID", "Methodology", "LivelihoodZoneName", "ADM4_ID",
                "FCS_borderline", "FCS_month", "IndicatorTypeID", "FCS_dataSource",
                "methodologyID", "FCS_year", "TargetGroup", "ADM3_ID", "ADM2_ID",
                "Lz_ID", "mr_id", "FCS_lowerThreshold", "FCS_id", "FCS_poor",
                "targetGroupID", "ADM1_ID", "FCS_upperThreshold", "FCS_acceptable",
                "FCS_mean"],
        'CSI': ["CSI_rMediumCoping", "IndicatorTypeID", "ADM0_ID", "CSI_csHighCoping",
                "ADM5_ID", "LivelihoodZoneName", "ADM4_ID", "CSI_rDataSource",
                "CSI_csLowCoping", "MethodologyCs", "csMethodologyID", "CSI_rHighCoping",
                "CSI_id", "CSI_rMediumHighThreshold", "CSI_csMean", "CSI_rLowCoping",
                "CSI_rLowMediumThreshold", "rMethodologyID", "CSI_rMonth",
                "csTargetGroupID", "CSI_rNoCoping", "TargetGroupCs", "ADM3_ID",
                "CSI_csDataSource", "ADM2_ID", "TargetGroupR", "CSI_csLowMediumThreshold",
                "Lz_ID", "MethodologyR", "CSI_csMediumCoping", "mr_id", "CSI_csNoCoping",
                "CSI_rYear", "fdc", "CSI_csMediumHighThreshold", "rTargetGroupID",
                "CSI_csYear", "CSI_rMean", "ADM1_ID", "CSI_csMonth"],
        'Income': ["IncomeSubCategoryID", "IncomeID", "Adm4_ID", "Adm0_ID", "IncomeYear",
                   "Adm3_ID", "IndicatorTypeID", "Adm2_ID", "IncomeCategoryID", "Adm5_ID",
                   "IncomeSubCategory", "IncomeCategory", "IncomeMonth", "mr_id",
                   "IncomeValue", "Adm1_ID"]
    }

    try:
        schema = schemas[table]

    except Exception as e:
        if verbose is True:
            print "%s Select one of the following tables: %s." % (item('prompt_error'), ", ".join(schemas.keys()))
            print e
        print '%s Could not find schema.' % item('prompt_error')
        return False

    try:
        for record in data:
            scraperwiki.sqlite.save(schema, record, table_name=table)

    except Exception as e:
        print "%s Failed to store record in database." % item('prompt_error')
        print e
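#
# Usage sketch (hedged): StoreRecords looks up the field list for the
# named table and passes it to scraperwiki.sqlite.save() as the
# unique-keys argument, and save() expects every unique-key column to
# be present in each row, so a stored record should carry the full
# field set for its schema. Illustrative only:
#
#   income_rows = [dict.fromkeys(schemas['Income'], '')]  # placeholder values
#   StoreRecords(income_rows, 'Income', verbose=True)
#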
def FetchData(url=Config.LoadConfig()['url']):
    '''Fetching data from the UNOSAT API.'''
    #
    # Loading main URL from the config
    # file and making request.
    #
    try:
        r = requests.get(url)

    except Exception as e:
        print '%s Could not connect to url: %s' % (item('prompt_error'), url)
        print e
        return False

    #
    # Checking the status code.
    #
    if r.status_code != requests.codes.ok:
        print '%s Request to UNOSAT servers failed to complete.' % item('prompt_error')
        return False

    else:
        return r.json()
def CreateTables(config_path=Config.CONFIG_PATH, verbose=True):
    '''Creating the tables of the new database.'''
    try:
        endpoints = Config.LoadConfig(config_path)

    except Exception as e:
        print '%s Could not load configuration file.' % item('prompt_error')
        if verbose:
            print e
        return False

    sql_statements = {}
    for endpoint in endpoints['endpoints']:
        table_name = endpoint['database']['name']
        statement = " TEXT, ".join(endpoint['database']['fields'])
        statement = 'CREATE TABLE IF NOT EXISTS %s(%s TEXT)' % (table_name, statement)
        sql_statements[table_name] = statement

    for table in sql_statements:
        try:
            scraperwiki.sqlite.execute(sql_statements[table])
            print "%s Table `%s` created." % (item('prompt_bullet'), str(table))

        except Exception as e:
            print e
            return False

    print "%s Database created successfully." % item('prompt_success')
    return True
def collect_previous_ga_data(verbose=False, test_data=False):
    '''Collecting historical Google Analytics data into the new database.'''
    counter = 0
    period_date = date.today()

    #
    # Google Analytics only has data available
    # from 2014-05-25, not earlier.
    #
    while period_date > date(2014, 5, 25):
        period_date = date.today() - timedelta(weeks=counter)
        counter += 1
        try:
            print "%s Collecting data for week %s of %s" % (I.item('prompt_bullet'), period_date.isocalendar()[1], period_date.isocalendar()[0])
            records = ga_collect.collect_ga_data(period_date)
            S.StoreRecords(data=records, table="funnel")
            if test_data is True and counter > 1:
                return records

        except Exception as e:
            if verbose:
                print e
            print "%s Google Analytics failed to run." % I.item('prompt_error')
            return False

    print "%s Google Analytics collection ran successfully." % I.item('prompt_success')
    return True
def CreateTables(config_path='dev.json', verbose=True):
    '''Creating the tables of the new database.'''
    #
    # Load configuration data.
    #
    try:
        config_data = Config.LoadConfig(config_path)['database']

    except Exception as e:
        if verbose:
            print '%s Could not load configuration file.' % item('prompt_error')
            print e
        return False

    #
    # Create SQL statements for every table.
    #
    sql_statements = {}
    for table in config_data:
        table_name = table['database']['table_name']
        statement = " TEXT, ".join(table['database']['fields'])
        statement = 'CREATE TABLE IF NOT EXISTS %s(%s TEXT)' % (table_name, statement)
        sql_statements[table_name] = statement

    for table in sql_statements:
        scraperwiki.sqlite.execute(sql_statements[table])
        print "%s Table `%s` created." % (item('prompt_bullet'), str(table))

    print "%s Database created successfully.\n" % item('prompt_success')
    return True
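#
# A minimal sketch of the nesting this CreateTables variant expects
# under the 'database' key of dev.json. Table and field names here are
# illustrative assumptions, not the real configuration:
#
EXAMPLE_DATABASE_CONFIG = {
    'database': [
        {
            'database': {
                'table_name': 'stations',
                'fields': ['station_id', 'name', 'latitude', 'longitude']
            }
        }
    ]
}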
def Main(verbose=False):
    '''Wrapper.'''
    #
    # List of indicators to download.
    #
    indicators = [642, 653, 654, 593, 587, 3, 190, 504, 495, 343, 322, 337, 545, 384, 664, 645, 541, 540, 684, 588]
    # indicators = [322]

    #
    # Error handler for the processing.
    #
    errors = []
    for indicator in indicators:
        data = BuildQueryString(indicator)
        print '%s Processing data for `%s`' % (item('prompt_bullet'), data['metadato']['NOM_DATO'].encode('utf-8'))

        try:
            table_name = 'sidih_' + str(indicator)
            StoreRecords(data=data['valores'], table=table_name, schema='sidih_schema')
            StoreRecords(data=data['valores'], table="sidih_all_data", schema='sidih_schema')

        except Exception as e:
            errors.append(indicator)
            print '%s Indicator %s failed to process.' % (item('prompt_bullet'), str(indicator))
            if verbose:
                print e

    #
    # Pretty printing summary.
    #
    n_success = len(indicators) - len(errors)
    print '%s Successfully collected %s indicators from SIDIH.' % (item('prompt_success'), str(n_success))
    if len(errors) > 0:
        print '%s %s indicators failed to collect: %s.' % (item('prompt_warn'), str(len(errors)), errors)

    return True
def QueryWFP(urls, db_table, endpoint, **kwargs):
    '''Query WFP's VAM API asynchronously.'''
    data_dir = kwargs['data_dir']
    verbose = kwargs.get('verbose')
    make_json = kwargs.get('make_json')
    make_csv = kwargs.get('make_csv')
    store_db = kwargs.get('store_db', True)

    #
    # Load endpoint information.
    #
    preferred_fields = endpoint['preferred_fields']
    url_list = list(urls)
    if verbose:
        for url in url_list:
            print '%s query: %s' % (item('prompt_bullet'), url)

    #
    # Defining the asynchronous requests. Plain `requests` has no
    # map(); this pattern comes from the grequests library, which
    # provides get() and map() with an exception_handler hook.
    #
    request_list = (grequests.get(url) for url in url_list)
    responses = grequests.map(request_list, exception_handler=handler)

    for index, r in enumerate(responses, 1):
        data = r.json() if r else []
        length = len(data)

        #
        # Check if there is data available and store output.
        #
        if length and verbose:
            print "%s Data found." % item('prompt_bullet')
        elif verbose:
            print '%s Data not found.' % item('prompt_warn')

        # Store JSON.
        if length and make_json:
            j_path = p.join(DATA_DIR, 'data', '%s_%s_data.json' % (db_table, index))
            with open(j_path, 'w') as outfile:
                json.dump(data, outfile)

        # Store CSV.
        if length and make_csv:
            c_path = p.join(DATA_DIR, 'data', '%s_%s_data.csv' % (db_table, index))
            f = csv.writer(open(c_path, "wb+"))
            f.writerow(data[0].keys())
            for row in data:
                f.writerow(flatten_row(row, preferred_fields).values())

        #
        # Storing results in DB.
        #
        if length and store_db:
            schema = endpoint['database']['fields']
            for row in data:
                flattened_row = flatten_row(row, preferred_fields)
                StoreRecords([flattened_row], schema, db_table)
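#
# Calling sketch for QueryWFP. The URLs and the endpoint definition are
# illustrative assumptions about the VAM API configuration, and the call
# assumes this module's flatten_row, StoreRecords and the grequests
# `handler` are in scope.
#
if __name__ == '__main__':
    example_endpoint = {
        'preferred_fields': ['ADM0_ID', 'FCS_mean'],
        'database': {'fields': ['ADM0_ID', 'FCS_mean']}
    }
    example_urls = [
        'http://example.org/vam/fcs?page=1',
        'http://example.org/vam/fcs?page=2'
    ]
    QueryWFP(example_urls, 'FCS', example_endpoint,
             data_dir='data', verbose=True,
             make_json=False, make_csv=False, store_db=True)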
def CalculateMetric(json, test_data=False):
    '''Process dataset list data and store output.'''
    print "%s Calculating private datasets." % I.item('prompt_bullet')

    records = [{
        'metricid': 'ckan-number-of-private-dataset',
        'period': str(time.strftime("%Y-%m-%d")),
        'period_start_date': str(time.strftime("%Y-%m-%d")),
        'period_end_date': str(time.strftime("%Y-%m-%d")),
        'period_type': 'd',
        'value': 0
    }]

    i = 0
    for dataset in json['result']:
        if dataset['private']:
            records[0]['value'] += 1
        i += 1
        progress = round((float(i) / len(json['result'])), 3) * 100
        print "%s Progress: %s%%" % (I.item('prompt_bullet'), progress)

    #
    # Create week-record.
    #
    current_day_date = datetime.strptime(time.strftime("%Y-%m-%d"), "%Y-%m-%d")
    current_week = time.strftime("%Y-W") + str(int(time.strftime('%U')) + 1)
    start = current_day_date - timedelta(days=current_day_date.weekday())
    end = start + timedelta(days=6)
    first_day_of_current_week = start.strftime('%Y-%m-%d')
    last_day_of_current_week = end.strftime('%Y-%m-%d')

    #
    # Compare as strings: current_day_date is a datetime
    # while the week boundaries are formatted strings.
    #
    current_day = current_day_date.strftime('%Y-%m-%d')

    #
    # Faking week data for test purposes.
    #
    if test_data is True:
        current_day = last_day_of_current_week

    if current_day == last_day_of_current_week:
        print "%s Generating week record." % I.item('prompt_bullet')
        record_week = {
            'metricid': 'ckan-number-of-private-dataset',
            'period': current_week,  # week starts at 01
            'period_start_date': first_day_of_current_week,
            'period_end_date': last_day_of_current_week,
            'period_type': 'w',
            'value': records[0]['value']
        }
        records.append(record_week)

    S.StoreRecords(data=records, table='funnel')

    if test_data is True:
        return records

    else:
        return True
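#
# Quick check of the week window above: weekday() is 0 for Monday, so
# subtracting it lands on the Monday of the current week and adding six
# days gives the Sunday. A standalone sketch with an example date:
#
if __name__ == '__main__':
    from datetime import datetime, timedelta
    day = datetime(2015, 7, 8)  # a Wednesday
    start = day - timedelta(days=day.weekday())
    end = start + timedelta(days=6)
    print start.strftime('%Y-%m-%d'), '->', end.strftime('%Y-%m-%d')  # 2015-07-06 -> 2015-07-12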
def ProcessHDXUserList(json, test_data=False):
    '''Process data and store output.'''
    if json["success"] is False:
        print "%s The resulting JSON is empty. Review your HDX query and try again." % I.item('prompt_error')

    #
    # Calculating the record.
    #
    if json["success"] is True:
        print "%s Processing results" % I.item('prompt_bullet')

        #
        # Create day-record.
        #
        records = [{
            'metricid': 'ckan-number-of-users',
            'period': str(time.strftime("%Y-%m-%d")),
            'period_start_date': str(time.strftime("%Y-%m-%d")),
            'period_end_date': str(time.strftime("%Y-%m-%d")),
            'period_type': 'd',
            'value': len(json["result"])
        }]

        #
        # Create week-record.
        #
        current_day_date = datetime.strptime(time.strftime("%Y-%m-%d"), "%Y-%m-%d")
        current_week = time.strftime("%Y-W") + str(int(time.strftime('%U')) + 1)
        start = current_day_date - timedelta(days=current_day_date.weekday())
        end = start + timedelta(days=6)
        first_day_of_current_week = start.strftime('%Y-%m-%d')
        last_day_of_current_week = end.strftime('%Y-%m-%d')

        #
        # Compare as strings: current_day_date is a datetime
        # while the week boundaries are formatted strings.
        #
        current_day = current_day_date.strftime('%Y-%m-%d')

        #
        # Faking week data for test purposes.
        #
        if test_data is True:
            current_day = last_day_of_current_week

        if current_day == last_day_of_current_week:
            print "%s Generating week record." % I.item('prompt_bullet')
            record_week = {
                'metricid': 'ckan-number-of-users',
                'period': current_week,  # week starts at 01
                'period_start_date': first_day_of_current_week,
                'period_end_date': last_day_of_current_week,
                'period_type': 'w',
                'value': len(json["result"])
            }
            records.append(record_week)

        #
        # Store in database.
        #
        S.StoreRecords(data=records, table='funnel')

        if test_data is True:
            return records

        else:
            return True
def Main(patch=True, write_json=False):
    '''Wrapper.'''
    try:
        d = DownloadAndProcessData()

        #
        # For testing purposes.
        #
        if write_json:
            import json
            with open(os.path.join('data', 'test.json'), 'w') as outfile:
                json.dump(d, outfile)

        StoreData(data=d, table_name='unprocessed_data')

        #
        # Patching original data.
        #
        if patch:
            try:
                #
                # Adding dates and country codes.
                #
                dates_data = Clean.CleanDates(data=d)
                country_data = Clean.IdentifyCountries(data=dates_data)
                file_type_data = Clean.IdentifyFileTypeAndFileName(data=country_data)

                #
                # Variable for export.
                #
                export_data = file_type_data

                #
                # Cleaning title and adding tags.
                #
                data_title = Clean.CleanTitle(data=export_data)

                #
                # Storing results.
                #
                StoreData(data=data_title, table_name='processed_data')
                print '%s Successfully patched %s records.' % (item('prompt_success'), len(export_data))

            except Exception as e:
                print '%s Failed to patch data.' % item('prompt_error')
                print e
                return False

        print '%s Successfully fetched %s records from the UNOSAT Flood Portal.\n' % (item('prompt_success'), len(d))

    except Exception as e:
        print e
        return False
def run_historical_calculations():
    '''Making the calculations.'''
    print "%s Making historical calculations." % I.item('prompt_bullet')
    try:
        calc.get_initial_setup_data()

    except Exception as e:
        print e
        return False

    print "%s Successfully performed historical calculations.\n" % I.item('prompt_success')
def CreateDbAndTable(config_file='dev.json', verbose=True):
    '''Creating tables in PostgreSQL database.'''
    #
    # Loading database information
    # from config file.
    #
    database = LoadConfig(config_file)['database']

    #
    # TODO: add environment variables
    # to these default values.
    #
    conn = psycopg2.connect(host=HOST_DATABASE, dbname='rolltime', user='******', password='******')
    cur = conn.cursor()

    #
    # Build each table.
    #
    for table in database:

        #
        # Construct SQL statement.
        #
        table_sql = ""
        for f in table['fields']:
            s = '%s %s, ' % (f['field_name'], f['type'])
            table_sql += s

        statement = 'CREATE TABLE IF NOT EXISTS %s(%sPRIMARY KEY (%s))' % (table['name'], table_sql, ", ".join(table['primary_key']))

        #
        # Make statements to the database.
        #
        try:
            cur.execute(statement)
            conn.commit()
            print "%s Table `%s` created." % (item('prompt_bullet'), str(table['name']))

        except Exception as e:
            print '%s Table `%s` could not be created.' % (item('prompt_error'), table['name'])
            if verbose:
                print e
            return False

    #
    # Close communication.
    #
    cur.close()
    conn.close()
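#
# Sketch of the 'database' entries CreateDbAndTable iterates over.
# Field names, types and the table name are illustrative assumptions
# about the config file, not the real schema:
#
EXAMPLE_POSTGRES_CONFIG = [
    {
        'name': 'station',
        'primary_key': ['station_id', 'collected_at'],
        'fields': [
            {'field_name': 'station_id', 'type': 'integer'},
            {'field_name': 'collected_at', 'type': 'timestamp'},
            {'field_name': 'bikes_available', 'type': 'integer'}
        ]
    }
]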
def CreateTables(config_path=Config.DEV_CONFIG_PATH, verbose=True):
    '''Creating the tables of the new database.'''
    #
    # Load configuration data.
    #
    try:
        config_data = Config.LoadConfig(config_path)['database']

    except Exception as e:
        if verbose:
            print '%s Could not load configuration file.' % item('prompt_error')
            print e
        return False

    #
    # Create SQL statements for every table.
    #
    sql_statements = {}
    for endpoint in config_data:
        table_name = endpoint['database']['table_name']
        statement = " TEXT, ".join(endpoint['database']['fields'])
        statement = 'CREATE TABLE IF NOT EXISTS %s(%s TEXT)' % (table_name, statement)
        sql_statements[table_name] = statement

    for table in sql_statements:
        scraperwiki.sqlite.execute(sql_statements[table])
        print "%s Table `%s` created." % (item('prompt_bullet'), str(table))

    #
    # I'm unable to test the following chunk.
    # As a result, it will remain commented below.
    #
    # for table in sql_statements:
    #     try:
    #         scraperwiki.sqlite.execute(sql_statements[table])
    #         print "%s Table `%s` created." % (item('prompt_bullet'), str(table))
    #
    #     except Exception as e:
    #         if verbose:
    #             print '%s Failed to create table %s.' % (item('prompt_error'), table_name)
    #             print e
    #         return False

    print "%s Database created successfully.\n" % item('prompt_success')
    return True
def FetchResourceInfo(package_id, preferred_format='ZIPPED SHAPEFILE', verbose=False, **kwargs):
    '''Query HDX for a list of datasets that belong to an organization.
    Only fetches resources that match a preferred file format.'''
    #
    # Fetch configuration.
    #
    if kwargs.get('config_file') is not None:
        config = LoadConfig(kwargs.get('config_file'))
    else:
        config = LoadConfig()  # default: dev.json

    header = {
        'X-CKAN-API-Key': config['hdx_key'],
        'content-type': 'application/json'
    }
    u = config['hdx_site'] + '/api/action/package_show?id=' + package_id

    try:
        #
        # If not production, we need to
        # add simple HTTP authorization.
        #
        if config['production']:
            r = requests.get(u, verify=True)  # SSL certificate verification stays on
        else:
            r = requests.get(u, auth=(config['auth'][0], config['auth'][1]), verify=True)

    except Exception as e:
        print '%s There was a connection error. Host %s is not known.' % (item('prompt_error'), u)
        return False

    if r.status_code != 200:
        print '%s HDX query returned an error: "%s"' % (item('prompt_error'), r.json()['error']['message'])
        return False

    else:
        #
        # Fetching URL information
        # from all organization packages.
        #
        data = r.json()
        package_array = []
        for resource in data['result']['resources']:
            if resource['format'] == preferred_format:
                d = {
                    'resource_id': resource['id'],
                    'dataset_id': package_id,
                    'resource_url': resource['url']
                }
                package_array.append(d)

        return package_array
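#
# Usage sketch chaining FetchResourceInfo with the downloader. The
# package id is a hypothetical placeholder, and the call assumes the
# DownloadResource helper defined elsewhere in this codebase is
# importable here.
#
if __name__ == '__main__':
    resources = FetchResourceInfo('example-flood-dataset')
    if resources is not False:
        for resource in resources:
            DownloadResource(resource['resource_url'])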
def GetHDXUserList():
    '''Querying the CKAN API with a specific parameter.'''
    #
    # Querying CKAN.
    # This takes a bit to complete ...
    #
    u = "https://data.hdx.rwlabs.org/api/action/user_list"
    try:
        print "%s Connecting to HDX" % I.item('prompt_bullet')
        j = r.get(u).json()  # `r` is the requests library under an alias
        return j

    except Exception as e:
        print "%s There was an error connecting to the CKAN API. Aborting." % I.item('prompt_error')
        return False
def GetDatasetList():
    '''Query CKAN for a list of datasets.'''
    #
    # Querying CKAN.
    #
    u = "https://data.hdx.rwlabs.org/api/action/current_package_list_with_resources?limit=2000"
    headers = {'Authorization': L.LoadConfig('dev')['hdx_key']}
    try:
        print "%s Fetching dataset list from HDX." % I.item('prompt_bullet')
        j = r.get(u, headers=headers).json()  # `r` is the requests library under an alias
        return j

    except Exception as e:
        print "%s There was an error connecting to the CKAN API. Aborting." % I.item('prompt_error')
        return False
def Main():
    '''Wrapper.'''
    tables = ['station']
    try:
        for table in tables:
            Db.CreateDbAndTable()

    except Exception as e:
        print '%s Database configuration failed.' % item('prompt_error')
        print e
        return False

    print '%s Database configured successfully.\n' % item('prompt_success')
def DownloadAndProcess(delete_files=True, verbose=False, **kwargs):
    '''Download and process the packages from UNOSAT.'''
    try:
        results = []
        a = AssemblePackageData(**kwargs)
        for package in a:

            #
            # Download.
            #
            DownloadResource(package['resource_url'])

            #
            # Analyze.
            #
            b = FetchZipInformation(package)
            results.append(b)

            #
            # Clean downloaded file.
            #
            if delete_files:
                f = os.path.basename(b['resource_url'])
                os.remove(os.path.join('data', f))

        return results

    except Exception as e:
        print '%s Failed to download and process files.' % item('prompt_error')
        print e
        return results
def StoreRecords(data, table, progress_bar=False, verbose=False):
    '''Store records in a PostgreSQL database.'''
    #
    # TODO: add environment variables
    # to these default values.
    #
    conn = psycopg2.connect(host=HOST_DATABASE, dbname='rolltime', user='******', password='******')
    cur = conn.cursor()
    try:
        for record in data:

            #
            # Check no NULL values are passed.
            #
            for key in record.keys():
                if record.get(key) is None:
                    record.pop(key)

            #
            # TODO: Check that the upsert statement
            # is supported by PostgreSQL 9.5
            #
            c = 'INSERT INTO {table} ({columns}) '.format(table=table, columns=",".join(record.keys()))
            # v = 'VALUES ({values}) ON CONFLICT UPDATE'.format(values="'" + "','".join(str(v) for v in record.values()) + "'")
            v = 'VALUES ({values})'.format(values="'" + "','".join(str(v) for v in record.values()) + "'")
            cur.execute(c + v)

        #
        # Commit all records.
        # And close cursor and connection.
        #
        conn.commit()
        cur.close()
        conn.close()

    except Exception as e:
        if e.pgcode == '23505':
            print '%s Record already exists. Skipping.' % item('prompt_warn')
            return

        else:
            if verbose:
                print "%s Failed to store record in database." % item('prompt_error')
                print 'PostgreSQL error code: %s' % e.pgcode
            return False
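#
# The INSERT above is built by string concatenation, which breaks on
# values containing quotes. psycopg2 supports parameter substitution,
# so a safer variant of the same statement is a sketch like this
# (helper name is illustrative, not part of the original module):
#
def _insert_record(cur, table, record):
    '''Insert one record using psycopg2 parameter substitution.'''
    columns = ",".join(record.keys())
    placeholders = ",".join(['%s'] * len(record))
    statement = 'INSERT INTO {table} ({columns}) VALUES ({placeholders})'.format(
        table=table, columns=columns, placeholders=placeholders)
    cur.execute(statement, record.values())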
def CreateDBTable(table_name='Gaul', verbose=True):
    '''Creating the GAUL db table.'''
    db_fields = [
        "ADM_ID", "CONTINENT", "REGION", "UN_CODE", "ISO3", "WFP_ISO3",
        "ADM0_NAME", "ADM0_CODE", "ADM1_NAME", "ADM1_CODE",
        "ADM2_NAME", "ADM2_CODE", "ADM3_NAME", "ADM3_CODE",
        "ADM4_NAME", "ADM4_CODE", "ADM5_NAME", "ADM5_CODE",
        "SALB0", "SALB1", "SALB2",
        "STR_YEAR0", "STR_YEAR1", "STR_YEAR2", "STR_YEAR3", "STR_YEAR4", "STR_YEAR5",
        "EXP_YEAR0", "EXP_YEAR1", "EXP_YEAR2", "EXP_YEAR3", "EXP_YEAR4", "EXP_YEAR5",
        "LAST_UPDAT"
    ]

    statement = " TEXT, ".join(db_fields)
    statement = 'CREATE TABLE IF NOT EXISTS %s(%s TEXT)' % (table_name, statement)
    try:
        scraperwiki.sqlite.execute(statement)
        scraperwiki.sqlite._State.new_transaction()
        print "%s Table `%s` created." % (item('prompt_bullet'), str(table_name))

    except Exception as e:
        print '%s Table `%s` could not be created.' % (item('prompt_error'), table_name)
        if verbose:
            print e
        return False
def Main(verbose=True):
    '''Wrapper to run all the scheduled tasks.'''
    if verbose:
        print '%s Running scheduler.' % item('prompt_bullet')

    while True:
        schedule.run_pending()
        time.sleep(1)
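#
# Main() only drains pending jobs; the jobs themselves have to be
# registered with the `schedule` library beforehand. A sketch, assuming
# a collection task like the hypothetical collect() below:
#
if __name__ == '__main__':
    def collect():
        print '%s Running scheduled collection.' % item('prompt_bullet')

    schedule.every().day.at('02:00').do(collect)
    Main()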
def Main():
    '''Wrapper.'''
    try:
        status = FetchLatestStationData()

    except Exception as e:
        print '%s Failed to fetch data from the CitiBike API.' % item('prompt_error')
        print e
        return False

    if status is not False:
        print '%s Collection worked successfully.' % item('prompt_success')
        return True

    else:
        print '%s Failed to fetch data from the CitiBike API.' % item('prompt_error')
        return False
def WriteCSV(json_path='output/analysis.json', verbose=True):
    '''Writes a CSV output based on a JSON input.'''
    #
    # Configuring path.
    #
    data_dir = os.path.split(json_path)[0]
    output_path = os.path.join(data_dir, 'analysis.csv')

    #
    # Read JSON file.
    #
    try:
        with open(json_path) as data_file:
            data = json.load(data_file)

    except Exception as e:
        print '%s Could not open JSON file.' % item('prompt_error')
        print e
        return False

    #
    # Writing file: header row first, then values.
    #
    try:
        with open(output_path, 'wb') as f:
            writer = csv.writer(f, delimiter=',', quotechar='"')
            i = 0
            for row in data:
                if i == 0:
                    writer.writerow([k for k in row.keys()])
                    writer.writerow([v for v in row.values()])
                    i += 1
                else:
                    writer.writerow([v for v in row.values()])
                    i += 1

            f.flush()

    except Exception as e:
        print '%s Could not write CSV file.' % item('prompt_error')
        print e
        return False
def FetchPackageList(organization_id, verbose=False, **kwargs):
    '''Query HDX for a list of datasets that belong to an organization.'''
    #
    # Fetch configuration.
    #
    if kwargs.get('config_file') is not None:
        config = LoadConfig(kwargs.get('config_file'))
    else:
        config = LoadConfig()  # default: dev.json

    header = {
        'X-CKAN-API-Key': config['hdx_key'],
        'content-type': 'application/json'
    }
    u = config['hdx_site'] + '/api/action/organization_show?id=' + organization_id

    try:
        #
        # If not production, we need to
        # add simple HTTP authorization.
        #
        if config['production']:
            r = requests.get(u, verify=True)  # SSL certificate verification stays on
        else:
            r = requests.get(u, auth=(config['auth'][0], config['auth'][1]), verify=True)

    except Exception as e:
        print '%s There was a connection error. Host %s is not known.' % (item('prompt_error'), u)
        return False

    if r.status_code != 200:
        print '%s HDX query returned an error: "%s"' % (item('prompt_error'), r.json()['error']['message'])
        return False

    else:
        #
        # Assembling a list of packages
        # and returning.
        #
        data = r.json()
        dataset_array = [name['name'] for name in data['result']['packages']]
        return dataset_array
def CollectDaily(verbose=True):
    '''Collecting daily data.'''
    try:
        json = GetHDXUserList()
        ProcessHDXUserList(json)
        print "%s Number of registered users fetched successfully." % I.item('prompt_success')
        return True

    except Exception as e:
        if verbose is True:
            print e
        print "%s Failed to fetch number of registered users." % I.item('prompt_error')
        return False
def CollectDaily(verbose=True):
    '''Collecting daily data.'''
    try:
        dataset_list = GetDatasetList()
        CalculateMetric(json=dataset_list)
        print "%s Number of private datasets fetched successfully." % I.item('prompt_success')
        return True

    except Exception as e:
        if verbose is True:
            print e
        print "%s Failed to fetch number of private datasets." % I.item('prompt_error')
        return False
def Main():
    '''Wrapper.'''
    #
    # Creating table and storing records.
    #
    CreateDBTable()
    if CollectAndStoreGaulData('modified_admin_units.csv') is not False:
        print '%s Stored GAUL database on DB successfully.' % item('prompt_success')
def StoreRecords(data, schema, table):
    '''Store records in a ScraperWiki database.'''
    try:
        for record in data:
            scraperwiki.sqlite.save(schema, record, table_name=table)

    except Exception as e:
        print "%s Failed to store record in database." % item('prompt_error')
        print e
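#
# Usage sketch: `schema` is passed to scraperwiki.sqlite.save() as its
# unique-keys argument, so records sharing those key values are
# upserted. Table and field names below are illustrative:
#
if __name__ == '__main__':
    StoreRecords([{'id': '1', 'value': 'example'}], schema=['id'], table='demo')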