# Note: each of these Lambda handlers lives in its own module; they assume
# the usual imports (os, time, ssl, csv, json, gzip, datetime, boto3,
# pandas as pd, io.BytesIO, zipfile.ZipFile, multiprocessing(.dummy).Pool,
# boto3.s3.transfer.TransferConfig, urllib.request.urlopen, and
# urllib.error.HTTPError / URLError) plus a shared md5_compare helper.


def data_to_s3(endpoint):
    # Raises an exception if there was a problem accessing the data;
    # otherwise downloads the file and uploads it to S3.
    source_dataset_url = 'https://data.nextstrain.org/ncov'

    try:
        response = urlopen(source_dataset_url + endpoint)
    except HTTPError as e:
        raise Exception('HTTPError: ', e.code, endpoint)
    except URLError as e:
        raise Exception('URLError: ', e.reason, endpoint)
    else:
        data_set_name = os.environ['DATASET_NAME']
        filename = data_set_name + endpoint
        new_s3_key = data_set_name + '/dataset/' + filename
        file_location = '/tmp/' + filename

        with open(file_location + '.gz', 'wb') as f:
            f.write(response.read())

        # decompress the gzipped payload and rewrite it as plain JSON
        with gzip.open(file_location + '.gz', 'rb') as g, \
                open(file_location, 'w', encoding='utf-8') as f:
            str_data = g.read().decode()
            dict_data = json.loads(str_data)
            f.write(json.dumps(dict_data))

        os.remove(file_location + '.gz')

        # variables/resources used to upload to s3
        s3_bucket = os.environ['ASSET_BUCKET']
        s3 = boto3.client('s3')

        has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
        if has_changes:
            s3.upload_file(file_location, s3_bucket, new_s3_key)
            print('Uploaded: ' + filename)

        # deletes to preserve the limited space in AWS Lambda
        os.remove(file_location)

        # dict used to add this asset to the dataset revision
        asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
        return {'has_changes': has_changes, 'asset_source': asset_source}
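# Every handler in this section defers its change detection to a shared
# md5_compare helper that is not shown here. Below is a minimal sketch of
# what it plausibly does, inferred from the comment in the FRED handler
# further down ("If the md5 hash of our new file does NOT match the s3
# etag, upload the new file"); the real implementation may differ, and the
# Google mobility variant at the end of this section calls a
# three-argument form instead.
import hashlib

from botocore.exceptions import ClientError


def md5_compare(s3, bucket, key, file_or_data):
    """Hypothetical sketch: return True when the local data differs
    from the object currently stored at s3://bucket/key."""
    try:
        etag = s3.head_object(Bucket=bucket, Key=key)['ETag'].strip('"')
    except ClientError:
        return True  # object does not exist yet, so it must be uploaded

    md5 = hashlib.md5()
    if hasattr(file_or_data, 'read'):  # file-like object, e.g. BytesIO
        for chunk in iter(lambda: file_or_data.read(8192), b''):
            md5.update(chunk)
        file_or_data.seek(0)
    else:  # path on disk
        with open(file_or_data, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                md5.update(chunk)

    # Only valid for single-part uploads; multipart ETags are not
    # whole-object MD5 digests.
    return md5.hexdigest() != etag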
def data_to_s3(endpoint):
    source_dataset_url = 'https://www.cryptodatadownload.com/cdd/Bitstamp_'

    # fall back to an unverified SSL context unless PYTHONHTTPSVERIFY
    # is explicitly set
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
            getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url + endpoint)
        except HTTPError as e:
            # attempt never reaches `retries` inside range(retries),
            # so the last attempt is `retries - 1`
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']
    filename = data_set_name + endpoint
    file_location = '/tmp/' + filename

    with open(file_location, 'wb') as f:
        f.write(response.read())

    s3_bucket = os.environ['S3_BUCKET']
    new_s3_key = data_set_name + '/dataset/' + filename
    s3 = boto3.client('s3')

    has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
    if has_changes:
        s3.upload_file(file_location, s3_bucket, new_s3_key)
        print('Uploaded: ' + filename)
    else:
        print('No changes in: ' + filename)

    asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
    return {'has_changes': has_changes, 'asset_source': asset_source}
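# The five-attempt retry loop above reappears nearly verbatim in most of
# the functions below, always with the same off-by-one (attempt never
# equals `retries` inside range(retries)) fixed here as `retries - 1`.
# A sketch of how it could be factored out; urlopen_with_retries is a
# hypothetical name, not a helper from the source repo.
def urlopen_with_retries(url, retries=5, backoff=0.2):
    """Retry urlopen with linear backoff, re-raising the last
    HTTP/URL error once attempts are exhausted."""
    for attempt in range(retries):
        try:
            return urlopen(url)
        except (HTTPError, URLError):
            if attempt == retries - 1:
                raise
            time.sleep(backoff * attempt)

# Each handler could then reduce its loop to a single call, e.g.
# response = urlopen_with_retries(source_dataset_url + endpoint).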
def source_dataset(s3_bucket, new_s3_key):
    # list of endpoints used to access the data included with the product
    endpoints = [
        'us.csv',
        'us-states.csv',
        'us-counties.csv',
        'live/us.csv',
        'live/us-states.csv',
        'live/us-counties.csv'
    ]

    # a thread pool downloads the endpoints in parallel so the Lambda
    # runs quicker
    with Pool(6) as p:
        p.map(data_to_s3, endpoints)

    # uploading to s3
    s3_uploads = []
    s3 = boto3.client('s3')

    for filename in os.listdir('/tmp'):
        file_location = '/tmp/' + filename
        has_changes = md5_compare(
            s3, s3_bucket, new_s3_key + filename, file_location)
        if has_changes:
            s3.upload_file(file_location, s3_bucket, new_s3_key + filename)
            print('Uploaded: ' + filename)
        else:
            print('No changes in: ' + filename)
        asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key + filename}
        s3_uploads.append({'has_changes': has_changes,
                           'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    asset_list = []
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    return asset_list
def data_to_s3(frmt):
    # Raises an exception if there was a problem accessing the data;
    # otherwise downloads the file and uploads it to S3.
    source_dataset_url = 'https://fred.stlouisfed.org/graph/fredgraph'
    url_end = '?id=TRUCKD11'

    try:
        response = urlopen(source_dataset_url + frmt + url_end)
    except HTTPError as e:
        raise Exception('HTTPError: ', e.code, frmt)
    except URLError as e:
        raise Exception('URLError: ', e.reason, frmt)
    else:
        data_set_name = os.environ['DATA_SET_NAME']
        filename = data_set_name + frmt
        file_location = '/tmp/' + filename

        with open(file_location, 'wb') as f:
            f.write(response.read())

        # variables/resources used to upload to s3
        s3_bucket = os.environ['S3_BUCKET']
        new_s3_key = data_set_name + '/dataset/' + filename
        s3 = boto3.client('s3')

        # If the md5 hash of our new file does NOT match the s3 etag,
        # upload the new file
        has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
        if has_changes:
            s3.upload_file(file_location, s3_bucket, new_s3_key)
            print('Uploaded: ' + filename)
        else:
            print('No update needed for ' + filename)

        # deletes to preserve the limited space in AWS Lambda
        os.remove(file_location)

        # dict used to add this asset to the dataset revision
        asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
        return {'has_changes': has_changes, 'asset_source': asset_source}
def data_to_s3(endpoint):
    source_dataset_url = 'https://fred.stlouisfed.org/graph/fredgraph'
    url_end = '?id=AWHAE'

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url + endpoint + url_end)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']
    filename = data_set_name + endpoint
    file_location = '/tmp/' + filename

    with open(file_location, 'wb') as f:
        f.write(response.read())

    s3_bucket = os.environ['S3_BUCKET']
    new_s3_key = data_set_name + '/dataset/' + filename
    s3 = boto3.client('s3')

    has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
    if has_changes:
        s3.upload_file(file_location, s3_bucket, new_s3_key)
        print('Uploaded: ' + filename)
    else:
        print('No changes in: ' + filename)

    asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
    return {'has_changes': has_changes, 'asset_source': asset_source}
def source_dataset(): source_dataset_url = "https://fred.stlouisfed.org/graph/fredgraph.csv?id=PERMIT" response = None retries = 5 for attempt in range(retries): try: response = urlopen(source_dataset_url) except HTTPError as e: if attempt == retries: raise Exception('HTTPError: ', e.code) time.sleep(0.2 * attempt) except URLError as e: if attempt == retries: raise Exception('URLError: ', e.reason) time.sleep(0.2 * attempt) else: break if response is None: raise Exception('There was an issue downloading the dataset') data_set_name = os.environ['DATA_SET_NAME'] data_dir = '/tmp' if not os.path.exists(data_dir): os.mkdir(data_dir) file_location = os.path.join(data_dir, data_set_name+'.csv') s3_bucket = os.environ['S3_BUCKET'] s3 = boto3.client('s3') s3_resource = boto3.resource('s3') config = TransferConfig(multipart_threshold=1024*25, max_concurrency=10, multipart_chunksize=1024*25, use_threads=True) s3_uploads = [] asset_list = [] obj_name = file_location.split('/', 3).pop().replace(' ', '_').lower() file_location = os.path.join(data_dir, obj_name) new_s3_key = data_set_name + '/dataset/' + obj_name filedata = response.read() has_changes = md5_compare(s3, s3_bucket, new_s3_key, BytesIO(filedata)) if has_changes: s3_resource.Object(s3_bucket, new_s3_key).put(Body=filedata) # sys.exit(0) print('Uploaded: ' + file_location) else: print('No changes in: ' + file_location) asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key} s3_uploads.append({'has_changes': has_changes, 'asset_source': asset_source}) count_updated_data = sum(upload['has_changes'] == True for upload in s3_uploads) if count_updated_data > 0: asset_list = list(map(lambda upload: upload['asset_source'], s3_uploads)) if len(asset_list) == 0: raise Exception('Something went wrong when uploading files to s3') # asset_list is returned to be used in lamdba_handler function # if it is empty, lambda_handler will not republish return asset_list
def source_dataset():
    source_dataset_url = 'https://projects.fivethirtyeight.com/data-webpage-data/datasets/congress-generic-ballot.zip'

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']
    zip_location = '/tmp/' + data_set_name + '.zip'

    # unzip the downloaded archive
    with open(zip_location, 'wb') as f:
        f.write(response.read())
    with ZipFile(zip_location, 'r') as z:
        z.extractall('/tmp')
    os.remove(zip_location)

    folder_dir = os.listdir('/tmp')[0]

    # variables/resources used to upload to s3
    s3_bucket = os.environ['S3_BUCKET']
    s3 = boto3.client('s3')
    s3_uploads = []

    for r, d, f in os.walk('/tmp/' + folder_dir):
        for filename in f:
            obj_name = os.path.join(r, filename).split(
                '/', 3).pop().replace(' ', '_').lower()
            file_location = os.path.join(r, filename)
            new_s3_key = data_set_name + '/dataset/' + obj_name
            has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')
        return asset_list
    else:
        return []
def source_dataset():
    bea_dataset_name = os.getenv('BEA_DATASET_NAME', 'regional')
    table_name = os.getenv('BEA_TABLE_NAME', 'CAINC4')
    source_dataset_url = 'https://apps.bea.gov/{}/zip/{}.zip'.format(
        bea_dataset_name, table_name)

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']
    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    zip_location = os.path.join(data_dir, data_set_name + '.zip')
    with open(zip_location, 'wb') as f:
        f.write(response.read())
    with ZipFile(zip_location, 'r') as z:
        z.extractall(data_dir)
    os.remove(zip_location)

    s3_bucket = os.environ['S3_BUCKET']
    s3 = boto3.client('s3')

    s3_uploads = []
    asset_list = []

    for r, d, f in os.walk(data_dir):
        for filename in f:
            obj_name = os.path.join(r, filename).split(
                '/', 3).pop().replace(' ', '_').lower()
            file_location = os.path.join(r, filename)
            new_s3_key = data_set_name + '/dataset/' + obj_name
            has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the lambda_handler function;
    # if it is empty, lambda_handler will not republish
    return asset_list
def source_dataset():
    # nytimes, owid and dataapi map source names to CSV locations
    # (defined elsewhere in this module)
    ny_us = pd.read_csv(nytimes["us"])
    ny_states = pd.read_csv(nytimes["states"])
    ny_counties = pd.read_csv(nytimes["counties"])
    owid_data = pd.read_csv(owid["data"])
    dataapi_us = pd.read_csv(dataapi["us-summary"])
    dataapi_states = pd.read_csv(dataapi["states"])

    print('ny_us')
    print(ny_us.columns)
    print('ny_states')
    print(ny_states.columns)
    print('ny_counties')
    print(ny_counties.columns)
    print('owid_data')
    print(owid_data.columns)
    print('dataapi_us')
    print(dataapi_us.columns)
    print('dataapi_states')
    print(dataapi_states.columns)

    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    county_codes = pd.read_csv('county_codes.csv')
    state_codes = pd.read_csv('state_codes.csv')
    country_codes = pd.read_csv('country_codes.csv')

    # state data file
    covid_us_states = dataapi_states.copy()
    covid_us_states = covid_us_states.rename(columns={
        "negative": "tests_negative",
        "positive": "tests_positive",
        "pending": "tests_pending",
        "totalTestResults": "tests",
        "hospitalizedCurrently": "patients_hosp",
        "inIcuCurrently": "patients_icu",
        "onVentilatorCurrently": "patients_vent"
    })
    covid_us_states = covid_us_states.set_index('state').join(
        state_codes[['state_name', 'post_code', 'state_fips',
                     'lat', 'long']].set_index('post_code'),
        how='left').reset_index()
    right_df = ny_states.rename(columns={'state': 'state_name'})
    covid_us_states = pd.merge(
        covid_us_states,
        right_df[['date', 'state_name', 'cases', 'deaths']],
        how='left', on=['date', 'state_name'])
    # backfill any expected columns missing from the sources
    for col in covid_us_states_columns:
        if col not in covid_us_states.columns:
            covid_us_states[col] = None
    covid_us_states = covid_us_states[covid_us_states_columns]
    covid_us_states.to_csv(
        os.path.join(data_dir, 'covid_19_us_states.csv'), index=False)

    # county data file
    covid_us_counties = ny_counties.copy()
    covid_us_counties = covid_us_counties.rename(columns={
        "county": "county_name",
        "state": "state_name",
        "fips": "county_fips"
    })
    covid_us_counties = covid_us_counties.set_index('state_name').join(
        state_codes[['state_name', 'state_fips']].set_index('state_name'),
        how='left').reset_index()
    covid_us_counties = covid_us_counties.set_index('county_name').join(
        county_codes[['county_name', 'lat', 'long']].set_index('county_name'),
        how='left').reset_index()
    covid_us_counties['area_name'] = None
    covid_us_counties_columns = [
        'state_fips', 'state_name', 'county_fips', 'county_name',
        'area_name', 'lat', 'long', 'date', 'cases', 'deaths']
    for col in covid_us_counties_columns:
        if col not in covid_us_counties.columns:
            covid_us_counties[col] = None
    covid_us_counties = covid_us_counties[covid_us_counties_columns]
    covid_us_counties.to_csv(
        os.path.join(data_dir, 'covid_19_us_counties.csv'), index=False)

    # global country data file
    covid_global_countries = owid_data.copy()
    covid_global_countries = covid_global_countries.rename(columns={
        "location": "country_name",
        "total_cases": "cases",
        "total_deaths": "deaths",
        "total_tests": "tests",
        "tests_units": "tests_units"
    })
    covid_global_countries = covid_global_countries.set_index(
        "country_name").join(
        country_codes.set_index("country_name"), how="left").reset_index()
    for col in covid_global_countries_columns:
        if col not in covid_global_countries.columns:
            covid_global_countries[col] = None
    covid_global_countries = covid_global_countries[
        covid_global_countries_columns]
    covid_global_countries.to_csv(
        os.path.join(data_dir, 'covid_19_global_countries.csv'), index=False)

    # global all-regions data file
    now = datetime.datetime.now()
    version_timestamp = now.strftime('%Y%m%d%H%M')
    covid_global = owid_data[[
        'continent', 'location', 'date', 'total_cases', 'total_deaths',
        'total_tests', 'tests_units', 'population']].copy()

    # counties and states parts
    counties = covid_us_counties.copy()
    states = covid_us_states.copy()
    counties['geographic_level'] = 'US County'
    states['geographic_level'] = 'US State'
    counties['country_name'] = 'United States'
    states['country_name'] = 'United States'
    states = states.merge(
        country_codes[['country_name', 'country_iso2', 'country_iso3']],
        how='left', on='country_name')
    counties = counties.merge(
        country_codes[['country_name', 'country_iso2', 'country_iso3']],
        how='left', on='country_name')

    # world part
    world = covid_global[covid_global['location'] == 'World'].copy()
    world['geographic_level'] = 'Global'
    world['country_iso3'] = 'OWID_WRL'
    world = world.rename(columns={
        "total_deaths": "deaths",
        "total_cases": "cases",
        "total_tests": "tests"
    })

    # international part
    international = covid_global[
        covid_global['location'] == 'International'].copy()
    international['geographic_level'] = 'Country'
    international['country_name'] = 'International'
    international = international.rename(columns={
        "total_deaths": "deaths",
        "total_cases": "cases",
        "total_tests": "tests"
    })

    # us part
    us = covid_global[covid_global['location'] == 'United States'].copy()
    us['geographic_level'] = 'Country'
    us['country_name'] = 'United States'
    us_cases = dataapi_us.copy()
    us_cases = us_cases.rename(columns={
        "negative": "tests_negative",
        "positive": "tests_positive",
        "totalTestResults": "tests",
        "hospitalizedCurrently": "patients_hosp",
        "inIcuCurrently": "patients_icu",
        "onVentilatorCurrently": "patients_vent"
    })
    right_df = ny_us.rename(columns={'state': 'state_name'})
    us_cases = pd.merge(us_cases, right_df[['date', 'cases']],
                        how='left', on=['date'])
    cols = ['date', 'tests', 'patients_icu', 'patients_hosp', 'cases',
            'tests_negative', 'patients_vent', 'tests_positive', 'recovered']
    for col in cols:
        if col not in us_cases.columns:
            us_cases[col] = None
    us_cases = us_cases[cols].copy()
    us = us.merge(us_cases, how='left', on='date')
    us = us[[
        'continent', 'date', 'cases', 'total_deaths', 'tests', 'tests_units',
        'population', 'geographic_level', 'country_name', 'patients_icu',
        'patients_hosp', 'patients_vent', 'tests_negative', 'tests_positive',
        'recovered'
    ]]
    us = us.rename(columns={
        "total_deaths": "deaths",
    })
    us = us.merge(
        country_codes[['country_name', 'country_iso2', 'country_iso3',
                       'lat', 'long']],
        how='left', on='country_name')

    # countries part
    countries = covid_global[
        (covid_global['location'] != 'International') &
        (covid_global['location'] != 'World') &
        (covid_global['location'] != 'United States')].copy()
    countries = countries.rename(columns={'location': 'country_name'})
    countries['geographic_level'] = 'Country'
    countries = countries.rename(columns={
        "total_deaths": "deaths",
        "total_cases": "cases",
        "total_tests": "tests"
    })
    countries = countries.merge(
        country_codes[['country_name', 'country_iso2', 'country_iso3',
                       'lat', 'long']],
        how='left', on='country_name')

    # add missing columns in place; column order is normalized after the
    # concat below (reassigning df inside the loop would not update the list)
    data_parts = [us, states, counties, countries, international, world]
    for df in data_parts:
        for col in covid_global_columns:
            if col not in df.columns:
                df[col] = None

    # concatenate all together
    merged = pd.concat(data_parts, join='outer', ignore_index=True)
    merged['version_timestamp'] = version_timestamp
    merged = merged[covid_global_columns]
    merged.to_csv(os.path.join(data_dir, 'covid_19_global.csv'), index=False)

    # upload to s3
    data_set_name = os.environ['DATA_SET_NAME']
    s3_bucket = os.environ['S3_BUCKET']
    s3 = boto3.client('s3')
    s3_uploads = []
    asset_list = []

    for r, d, f in os.walk(data_dir):
        for filename in f:
            obj_name = os.path.join(r, filename).split(
                '/', 3).pop().replace(' ', '_').lower()
            file_location = os.path.join(r, filename)
            new_s3_key = data_set_name + '/dataset/' + obj_name
            has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the lambda_handler function;
    # if it is empty, lambda_handler will not republish
    return asset_list
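# The nytimes, owid and dataapi dicts consumed at the top of this function
# are defined elsewhere in its module. A plausible shape for them, using
# the public URLs those projects published; treat the exact keys and URLs
# as assumptions (the COVID Tracking Project API stopped updating in
# March 2021):
nytimes = {
    'us': 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv',
    'states': 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv',
    'counties': 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv',
}
owid = {
    'data': 'https://covid.ourworldindata.org/data/owid-covid-data.csv',
}
dataapi = {  # The COVID Tracking Project
    'us-summary': 'https://api.covidtracking.com/v1/us/daily.csv',
    'states': 'https://api.covidtracking.com/v1/states/daily.csv',
}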
def source_dataset():
    dataset_name = os.getenv('DATASET_NAME')
    asset_bucket = os.getenv('ASSET_BUCKET')
    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    file_location_csv = os.path.join(data_dir, dataset_name + '.csv')
    file_location_json = os.path.join(data_dir, dataset_name + '.json')

    urls = [
        'https://www.nytimes.com/interactive/2020/us/states-reopen-map-coronavirus.html',
        'https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/state/detail/SCPRC-EST2019-18+POP-RES.csv'
    ]
    with Pool(2) as p:
        data_source = p.map(download_data, urls)

    html = data_source[0].replace('\n', '').replace('\t', '')
    parser = MyHTMLParser()
    parser.feed(html)

    # download and format population data
    population_csv = data_source[1].replace(
        'Puerto Rico Commonwealth', 'Puerto Rico').splitlines()
    population_data = {}
    for state in population_csv[2:]:
        row = state.split(',')
        population_data[row[4]] = row[5]

    # creating the fieldnames variable to set the order of the data
    fieldnames = ['state_abbreviation', 'state', 'businesses', 'masks',
                  'community', 'status_details', 'external_link']

    # adding category variations to fieldnames
    for category in parser.categories:
        fieldnames.append(category)
    fieldnames.append('population')
    # added 5/10/2021 as the source data has changed
    fieldnames.append('reopening')

    # creating the csv file
    with open(file_location_csv, 'w', encoding='utf-8') as c:
        writer = csv.DictWriter(c, fieldnames=fieldnames)
        writer.writeheader()
        for row in parser.full_data:
            row['population'] = int(population_data[row['state']])
            writer.writerow(row)

    # creating the json file
    with open(file_location_json, 'w', encoding='utf-8') as j, \
            open(file_location_csv, 'r') as c:
        reader = csv.DictReader(c)
        j.write('[')
        j.write(',\n'.join(json.dumps(row).replace('""', 'null')
                           for row in reader))
        j.write(']')

    # uploading to s3
    s3_uploads = []
    s3 = boto3.client('s3')

    for filename in os.listdir('/tmp/'):
        if filename.startswith(dataset_name):
            file_location = '/tmp/' + filename
            obj_name = file_location.split(
                '/', 3).pop().replace(' ', '_').lower()
            new_s3_key = dataset_name + '/dataset/' + obj_name
            has_changes = md5_compare(
                s3, asset_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, asset_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': asset_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    asset_list = []
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    return asset_list
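# download_data, mapped over urls above, is another helper not shown here.
# Given how its results are used (string .replace() and .splitlines()), it
# must fetch a URL and return decoded text. A minimal sketch under that
# assumption:
from urllib.request import urlopen


def download_data(url):
    """Hypothetical sketch: fetch a URL and return its body as text."""
    with urlopen(url) as response:
        return response.read().decode('utf-8')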
def source_dataset(source_data_url, s3_bucket, dataset_name):
    """Download the source data from URL and put it in S3"""
    s3 = boto3.client('s3')

    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_data_url)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    zip_location = os.path.join(data_dir, dataset_name + '.zip')
    with open(zip_location, 'wb') as f:
        f.write(response.read())
    with ZipFile(zip_location, 'r') as z:
        z.extractall(data_dir)
    os.remove(zip_location)

    s3_uploads = []
    asset_list = []

    for r, d, f in os.walk(data_dir):
        for filename in f:
            obj_name = os.path.join(r, filename).split(
                '/', 3).pop().replace(' ', '_').lower()
            file_location = os.path.join(r, filename)
            new_s3_key = os.path.join(dataset_name, 'dataset', obj_name)
            has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the create_dataset_revision
    # function; if it is empty, lambda_handler will not republish
    return asset_list
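# Several functions in this section note that their asset_list return value
# drives a lambda_handler that republishes the AWS Data Exchange product.
# That handler is not shown; a rough sketch of the shape those comments
# imply, with the environment variables and dataset wiring treated as
# assumptions. A production handler would also poll the import job and
# finalize/publish the new revision.
import os

import boto3


def lambda_handler(event, context):
    """Hypothetical sketch: import changed assets into a new
    Data Exchange revision, or skip republishing entirely."""
    asset_list = source_dataset(
        os.environ['SOURCE_DATA_URL'],  # assumed env vars
        os.environ['S3_BUCKET'],
        os.environ['DATA_SET_NAME'])
    if not asset_list:
        print('No changes detected; skipping republish')
        return

    dataexchange = boto3.client('dataexchange')
    data_set_id = os.environ['DATA_SET_ID']
    revision = dataexchange.create_revision(DataSetId=data_set_id)
    job = dataexchange.create_job(
        Type='IMPORT_ASSETS_FROM_S3',
        Details={'ImportAssetsFromS3': {
            'DataSetId': data_set_id,
            'RevisionId': revision['Id'],
            # asset_list entries are {'Bucket': ..., 'Key': ...}
            'AssetSources': asset_list,
        }})
    dataexchange.start_job(JobId=job['Id'])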
def source_dataset():
    page_url = 'https://www.kff.org/report-section/state-covid-19-data-and-policy-actions-policy-actions/'
    # XPath of the download button to click for each table on the page
    button_xpath_map = {
        'covid-19-state-vaccine-priority-populations':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[1]/div/div/div/div[1]/div/button[2]',
        'covid-19-state-populations-eligiblity-and-residency-requirements':
            '//*[@id="report-wrapper"]/div/div[2]/div[1]/div/div/div[2]/div/div/div/div/div[1]/div/button[2]',
        'covid-19-state-social-distancing-actions':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[5]/div/div/div/div/div/div/div[1]/div/button[2]',
        'covid-19-state-health-policy-actions':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[7]/div[3]/div/div/div/div/div/div/div[1]/div/button[2]',
        'covid-19-state-actions-on-telehealth':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[7]/div[5]/div[2]/div/div/div[1]/div/div/div/div[1]/div/button[2]',
        'covid-19-state-health-care-provider-capacity':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[7]/div[5]/div[4]/div[2]/div/div/div/div/div/div/div/div/div/div/div/div/div/div[2]/div[1]/div[2]/div[2]/div[1]/div/div/div/div/div/div/div/div[1]/div/button[2]',
    }

    data_dir = '/tmp/downloads'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    s3_bucket = os.environ['S3_BUCKET']
    data_set_name = os.environ['DATA_SET_NAME']

    driver = WebDriver()
    page_source = driver.get_pagesource(page_url, button_xpath_map)

    # log what the browser session downloaded
    for root, dirs, files in os.walk(data_dir):
        print(root)
        print(files)
        print(dirs)
        print('--')

    s3_uploads = []
    asset_list = []
    s3 = boto3.client('s3')

    for root, dirs, files in os.walk(data_dir):
        for f in files:
            print(f)
            new_s3_key = data_set_name + '/dataset/' + f
            file_location = os.path.join(root, f)
            with open(file_location, 'rb') as reader:
                has_changes = md5_compare(s3, s3_bucket, new_s3_key, reader)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + f)
            else:
                print('No changes in: ' + f)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the lambda_handler function;
    # if it is empty, lambda_handler will not republish
    return asset_list
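# WebDriver here is a project-specific wrapper, not Selenium's class. All
# that can be inferred from the call site is that it loads the page, clicks
# each button XPath (presumably "download CSV" buttons that drop files into
# /tmp/downloads), and returns the page source. A hedged Selenium sketch of
# that contract; the headless flags, download directory, and fixed waits
# are assumptions:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


class WebDriver:
    """Hypothetical wrapper matching the get_pagesource() call above."""

    def __init__(self, download_dir='/tmp/downloads'):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_experimental_option(
            'prefs', {'download.default_directory': download_dir})
        self.driver = webdriver.Chrome(options=options)

    def get_pagesource(self, page_url, button_xpath_map):
        self.driver.get(page_url)
        for name, xpath in button_xpath_map.items():
            self.driver.find_element(By.XPATH, xpath).click()
            time.sleep(2)  # crude wait for each download to start
        return self.driver.page_source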
def source_dataset(new_filename, s3_bucket, new_s3_key):
    source_url = 'https://www.google.com/covid19/mobility/'

    # Raises an exception if there was a problem accessing the data;
    # otherwise downloads the file and uploads it to S3.
    try:
        source_response = urlopen(source_url)
    except HTTPError as e:
        raise Exception('HTTPError: ', e.code, new_filename)
    except URLError as e:
        raise Exception('URLError: ', e.reason, new_filename)
    else:
        # scrape the landing page for the CSV download link
        html = source_response.read().decode()
        parser = MyHTMLParser()
        parser.feed(html)

        try:
            data_response = urlopen(parser.data)
        except HTTPError as e:
            raise Exception('HTTPError: ', e.code, new_filename)
        except URLError as e:
            raise Exception('URLError: ', e.reason, new_filename)
        else:
            data = data_response.read()
            s3_uploads = []
            s3 = boto3.resource('s3')
            # this variant's md5_compare takes (bucket, key, data) rather
            # than an s3 client as its first argument
            has_changes = md5_compare(
                s3_bucket, new_s3_key + new_filename, BytesIO(data))
            if has_changes:
                s3.Object(s3_bucket, new_s3_key + new_filename).put(Body=data)
                print('Uploaded: ' + new_filename)
            else:
                print('No changes in: ' + new_filename)

            asset_source = {
                'Bucket': s3_bucket,
                'Key': new_s3_key + new_filename
            }
            s3_uploads.append({
                'has_changes': has_changes,
                'asset_source': asset_source
            })

            count_updated_data = sum(
                upload['has_changes'] for upload in s3_uploads)
            asset_list = []
            if count_updated_data > 0:
                asset_list = list(
                    map(lambda upload: upload['asset_source'], s3_uploads))
                if len(asset_list) == 0:
                    raise Exception(
                        'Something went wrong when uploading files to s3')

            return asset_list
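# MyHTMLParser is the last unseen helper (the reopen-map function earlier
# uses a richer variant with .categories and .full_data). This Google
# mobility handler only needs it to pull the report's CSV link out of the
# landing page into parser.data. A minimal html.parser sketch under that
# assumption; the .csv href pattern is a guess:
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Hypothetical sketch: capture the first anchor href that points
    at a CSV download."""

    def __init__(self):
        super().__init__()
        self.data = None

    def handle_starttag(self, tag, attrs):
        if tag == 'a' and self.data is None:
            href = dict(attrs).get('href', '')
            if href.endswith('.csv'):
                self.data = href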