# Note: each of these Lambda handlers lives in its own module; they assume
# the usual imports (os, time, ssl, csv, json, gzip, datetime, boto3,
# pandas as pd, io.BytesIO, zipfile.ZipFile, multiprocessing(.dummy).Pool,
# boto3.s3.transfer.TransferConfig, urllib.request.urlopen, and
# urllib.error.HTTPError / URLError) plus a shared md5_compare helper.


def data_to_s3(endpoint):
    # Raises an exception if there was a problem accessing the data;
    # otherwise downloads the file and uploads it to S3.
    source_dataset_url = 'https://data.nextstrain.org/ncov'

    try:
        response = urlopen(source_dataset_url + endpoint)
    except HTTPError as e:
        raise Exception('HTTPError: ', e.code, endpoint)
    except URLError as e:
        raise Exception('URLError: ', e.reason, endpoint)
    else:
        data_set_name = os.environ['DATASET_NAME']
        filename = data_set_name + endpoint
        new_s3_key = data_set_name + '/dataset/' + filename
        file_location = '/tmp/' + filename

        with open(file_location + '.gz', 'wb') as f:
            f.write(response.read())

        # decompress the gzipped payload and rewrite it as plain JSON
        with gzip.open(file_location + '.gz', 'rb') as g, \
                open(file_location, 'w', encoding='utf-8') as f:
            str_data = g.read().decode()
            dict_data = json.loads(str_data)
            f.write(json.dumps(dict_data))

        os.remove(file_location + '.gz')

        # variables/resources used to upload to s3
        s3_bucket = os.environ['ASSET_BUCKET']
        s3 = boto3.client('s3')

        has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
        if has_changes:
            s3.upload_file(file_location, s3_bucket, new_s3_key)
            print('Uploaded: ' + filename)

        # deletes to preserve the limited space in AWS Lambda
        os.remove(file_location)

        # dict used to add this asset to the dataset revision
        asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
        return {'has_changes': has_changes, 'asset_source': asset_source}
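# Every handler in this section defers its change detection to a shared
# md5_compare helper that is not shown here. Below is a minimal sketch of
# what it plausibly does, inferred from the comment in the FRED handler
# further down ("If the md5 hash of our new file does NOT match the s3
# etag, upload the new file"); the real implementation may differ, and the
# Google mobility variant at the end of this section calls a
# three-argument form instead.
import hashlib

from botocore.exceptions import ClientError


def md5_compare(s3, bucket, key, file_or_data):
    """Hypothetical sketch: return True when the local data differs
    from the object currently stored at s3://bucket/key."""
    try:
        etag = s3.head_object(Bucket=bucket, Key=key)['ETag'].strip('"')
    except ClientError:
        return True  # object does not exist yet, so it must be uploaded

    md5 = hashlib.md5()
    if hasattr(file_or_data, 'read'):  # file-like object, e.g. BytesIO
        for chunk in iter(lambda: file_or_data.read(8192), b''):
            md5.update(chunk)
        file_or_data.seek(0)
    else:  # path on disk
        with open(file_or_data, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                md5.update(chunk)

    # Only valid for single-part uploads; multipart ETags are not
    # whole-object MD5 digests.
    return md5.hexdigest() != etag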
def data_to_s3(endpoint):
    source_dataset_url = 'https://www.cryptodatadownload.com/cdd/Bitstamp_'

    # fall back to an unverified SSL context unless PYTHONHTTPSVERIFY
    # is explicitly set
    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
            getattr(ssl, '_create_unverified_context', None)):
        ssl._create_default_https_context = ssl._create_unverified_context

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url + endpoint)
        except HTTPError as e:
            # attempt never reaches `retries` inside range(retries),
            # so the last attempt is `retries - 1`
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']
    filename = data_set_name + endpoint
    file_location = '/tmp/' + filename

    with open(file_location, 'wb') as f:
        f.write(response.read())

    s3_bucket = os.environ['S3_BUCKET']
    new_s3_key = data_set_name + '/dataset/' + filename
    s3 = boto3.client('s3')

    has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
    if has_changes:
        s3.upload_file(file_location, s3_bucket, new_s3_key)
        print('Uploaded: ' + filename)
    else:
        print('No changes in: ' + filename)

    asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
    return {'has_changes': has_changes, 'asset_source': asset_source}
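# The five-attempt retry loop above reappears nearly verbatim in most of
# the functions below, always with the same off-by-one (attempt never
# equals `retries` inside range(retries)) fixed here as `retries - 1`.
# A sketch of how it could be factored out; urlopen_with_retries is a
# hypothetical name, not a helper from the source repo.
def urlopen_with_retries(url, retries=5, backoff=0.2):
    """Retry urlopen with linear backoff, re-raising the last
    HTTP/URL error once attempts are exhausted."""
    for attempt in range(retries):
        try:
            return urlopen(url)
        except (HTTPError, URLError):
            if attempt == retries - 1:
                raise
            time.sleep(backoff * attempt)

# Each handler could then reduce its loop to a single call, e.g.
# response = urlopen_with_retries(source_dataset_url + endpoint).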
def source_dataset(s3_bucket, new_s3_key):
    # list of endpoints used to access the data included with the product
    endpoints = [
        'us.csv',
        'us-states.csv',
        'us-counties.csv',
        'live/us.csv',
        'live/us-states.csv',
        'live/us-counties.csv'
    ]

    # a thread pool downloads the endpoints in parallel so the Lambda
    # runs quicker
    with Pool(6) as p:
        p.map(data_to_s3, endpoints)

    # uploading to s3
    s3_uploads = []
    s3 = boto3.client('s3')

    for filename in os.listdir('/tmp'):
        file_location = '/tmp/' + filename
        has_changes = md5_compare(
            s3, s3_bucket, new_s3_key + filename, file_location)
        if has_changes:
            s3.upload_file(file_location, s3_bucket, new_s3_key + filename)
            print('Uploaded: ' + filename)
        else:
            print('No changes in: ' + filename)
        asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key + filename}
        s3_uploads.append({'has_changes': has_changes,
                           'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    asset_list = []
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    return asset_list
def data_to_s3(frmt):
    # Raises an exception if there was a problem accessing the data;
    # otherwise downloads the file and uploads it to S3.
    source_dataset_url = 'https://fred.stlouisfed.org/graph/fredgraph'
    url_end = '?id=TRUCKD11'

    try:
        response = urlopen(source_dataset_url + frmt + url_end)
    except HTTPError as e:
        raise Exception('HTTPError: ', e.code, frmt)
    except URLError as e:
        raise Exception('URLError: ', e.reason, frmt)
    else:
        data_set_name = os.environ['DATA_SET_NAME']
        filename = data_set_name + frmt
        file_location = '/tmp/' + filename

        with open(file_location, 'wb') as f:
            f.write(response.read())

        # variables/resources used to upload to s3
        s3_bucket = os.environ['S3_BUCKET']
        new_s3_key = data_set_name + '/dataset/' + filename
        s3 = boto3.client('s3')

        # If the md5 hash of our new file does NOT match the s3 etag,
        # upload the new file
        has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
        if has_changes:
            s3.upload_file(file_location, s3_bucket, new_s3_key)
            print('Uploaded: ' + filename)
        else:
            print('No update needed for ' + filename)

        # deletes to preserve the limited space in AWS Lambda
        os.remove(file_location)

        # dict used to add this asset to the dataset revision
        asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
        return {'has_changes': has_changes, 'asset_source': asset_source}
def data_to_s3(endpoint):
    source_dataset_url = 'https://fred.stlouisfed.org/graph/fredgraph'
    url_end = '?id=AWHAE'

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url + endpoint + url_end)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']
    filename = data_set_name + endpoint
    file_location = '/tmp/' + filename

    with open(file_location, 'wb') as f:
        f.write(response.read())

    s3_bucket = os.environ['S3_BUCKET']
    new_s3_key = data_set_name + '/dataset/' + filename
    s3 = boto3.client('s3')

    has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
    if has_changes:
        s3.upload_file(file_location, s3_bucket, new_s3_key)
        print('Uploaded: ' + filename)
    else:
        print('No changes in: ' + filename)

    asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
    return {'has_changes': has_changes, 'asset_source': asset_source}
def source_dataset(): source_dataset_url = "https://fred.stlouisfed.org/graph/fredgraph.csv?id=PERMIT" response = None retries = 5 for attempt in range(retries): try: response = urlopen(source_dataset_url) except HTTPError as e: if attempt == retries: raise Exception('HTTPError: ', e.code) time.sleep(0.2 * attempt) except URLError as e: if attempt == retries: raise Exception('URLError: ', e.reason) time.sleep(0.2 * attempt) else: break if response is None: raise Exception('There was an issue downloading the dataset') data_set_name = os.environ['DATA_SET_NAME'] data_dir = '/tmp' if not os.path.exists(data_dir): os.mkdir(data_dir) file_location = os.path.join(data_dir, data_set_name+'.csv') s3_bucket = os.environ['S3_BUCKET'] s3 = boto3.client('s3') s3_resource = boto3.resource('s3') config = TransferConfig(multipart_threshold=1024*25, max_concurrency=10, multipart_chunksize=1024*25, use_threads=True) s3_uploads = [] asset_list = [] obj_name = file_location.split('/', 3).pop().replace(' ', '_').lower() file_location = os.path.join(data_dir, obj_name) new_s3_key = data_set_name + '/dataset/' + obj_name filedata = response.read() has_changes = md5_compare(s3, s3_bucket, new_s3_key, BytesIO(filedata)) if has_changes: s3_resource.Object(s3_bucket, new_s3_key).put(Body=filedata) # sys.exit(0) print('Uploaded: ' + file_location) else: print('No changes in: ' + file_location) asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key} s3_uploads.append({'has_changes': has_changes, 'asset_source': asset_source}) count_updated_data = sum(upload['has_changes'] == True for upload in s3_uploads) if count_updated_data > 0: asset_list = list(map(lambda upload: upload['asset_source'], s3_uploads)) if len(asset_list) == 0: raise Exception('Something went wrong when uploading files to s3') # asset_list is returned to be used in lamdba_handler function # if it is empty, lambda_handler will not republish return asset_list
def source_dataset():
    source_dataset_url = 'https://projects.fivethirtyeight.com/data-webpage-data/datasets/congress-generic-ballot.zip'

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']
    zip_location = '/tmp/' + data_set_name + '.zip'

    # unzip the downloaded archive
    with open(zip_location, 'wb') as f:
        f.write(response.read())
    with ZipFile(zip_location, 'r') as z:
        z.extractall('/tmp')
    os.remove(zip_location)

    folder_dir = os.listdir('/tmp')[0]

    # variables/resources used to upload to s3
    s3_bucket = os.environ['S3_BUCKET']
    s3 = boto3.client('s3')
    s3_uploads = []

    for r, d, f in os.walk('/tmp/' + folder_dir):
        for filename in f:
            obj_name = os.path.join(r, filename).split(
                '/', 3).pop().replace(' ', '_').lower()
            file_location = os.path.join(r, filename)
            new_s3_key = data_set_name + '/dataset/' + obj_name
            has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')
        return asset_list
    else:
        return []
def source_dataset():
    bea_dataset_name = os.getenv('BEA_DATASET_NAME', 'regional')
    table_name = os.getenv('BEA_TABLE_NAME', 'CAINC4')
    source_dataset_url = 'https://apps.bea.gov/{}/zip/{}.zip'.format(
        bea_dataset_name, table_name)

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_dataset_url)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    data_set_name = os.environ['DATA_SET_NAME']
    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    zip_location = os.path.join(data_dir, data_set_name + '.zip')
    with open(zip_location, 'wb') as f:
        f.write(response.read())
    with ZipFile(zip_location, 'r') as z:
        z.extractall(data_dir)
    os.remove(zip_location)

    s3_bucket = os.environ['S3_BUCKET']
    s3 = boto3.client('s3')

    s3_uploads = []
    asset_list = []

    for r, d, f in os.walk(data_dir):
        for filename in f:
            obj_name = os.path.join(r, filename).split(
                '/', 3).pop().replace(' ', '_').lower()
            file_location = os.path.join(r, filename)
            new_s3_key = data_set_name + '/dataset/' + obj_name
            has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the lambda_handler function;
    # if it is empty, lambda_handler will not republish
    return asset_list
def source_dataset():
    # nytimes, owid and dataapi map source names to CSV locations
    # (defined elsewhere in this module)
    ny_us = pd.read_csv(nytimes["us"])
    ny_states = pd.read_csv(nytimes["states"])
    ny_counties = pd.read_csv(nytimes["counties"])
    owid_data = pd.read_csv(owid["data"])
    dataapi_us = pd.read_csv(dataapi["us-summary"])
    dataapi_states = pd.read_csv(dataapi["states"])

    print('ny_us')
    print(ny_us.columns)
    print('ny_states')
    print(ny_states.columns)
    print('ny_counties')
    print(ny_counties.columns)
    print('owid_data')
    print(owid_data.columns)
    print('dataapi_us')
    print(dataapi_us.columns)
    print('dataapi_states')
    print(dataapi_states.columns)

    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    county_codes = pd.read_csv('county_codes.csv')
    state_codes = pd.read_csv('state_codes.csv')
    country_codes = pd.read_csv('country_codes.csv')

    # state data file
    covid_us_states = dataapi_states.copy()
    covid_us_states = covid_us_states.rename(columns={
        "negative": "tests_negative",
        "positive": "tests_positive",
        "pending": "tests_pending",
        "totalTestResults": "tests",
        "hospitalizedCurrently": "patients_hosp",
        "inIcuCurrently": "patients_icu",
        "onVentilatorCurrently": "patients_vent"
    })
    covid_us_states = covid_us_states.set_index('state').join(
        state_codes[['state_name', 'post_code', 'state_fips',
                     'lat', 'long']].set_index('post_code'),
        how='left').reset_index()
    right_df = ny_states.rename(columns={'state': 'state_name'})
    covid_us_states = pd.merge(
        covid_us_states,
        right_df[['date', 'state_name', 'cases', 'deaths']],
        how='left', on=['date', 'state_name'])
    # backfill any expected columns missing from the sources
    for col in covid_us_states_columns:
        if col not in covid_us_states.columns:
            covid_us_states[col] = None
    covid_us_states = covid_us_states[covid_us_states_columns]
    covid_us_states.to_csv(
        os.path.join(data_dir, 'covid_19_us_states.csv'), index=False)

    # county data file
    covid_us_counties = ny_counties.copy()
    covid_us_counties = covid_us_counties.rename(columns={
        "county": "county_name",
        "state": "state_name",
        "fips": "county_fips"
    })
    covid_us_counties = covid_us_counties.set_index('state_name').join(
        state_codes[['state_name', 'state_fips']].set_index('state_name'),
        how='left').reset_index()
    covid_us_counties = covid_us_counties.set_index('county_name').join(
        county_codes[['county_name', 'lat', 'long']].set_index('county_name'),
        how='left').reset_index()
    covid_us_counties['area_name'] = None
    covid_us_counties_columns = [
        'state_fips', 'state_name', 'county_fips', 'county_name',
        'area_name', 'lat', 'long', 'date', 'cases', 'deaths']
    for col in covid_us_counties_columns:
        if col not in covid_us_counties.columns:
            covid_us_counties[col] = None
    covid_us_counties = covid_us_counties[covid_us_counties_columns]
    covid_us_counties.to_csv(
        os.path.join(data_dir, 'covid_19_us_counties.csv'), index=False)

    # global country data file
    covid_global_countries = owid_data.copy()
    covid_global_countries = covid_global_countries.rename(columns={
        "location": "country_name",
        "total_cases": "cases",
        "total_deaths": "deaths",
        "total_tests": "tests",
        "tests_units": "tests_units"
    })
    covid_global_countries = covid_global_countries.set_index(
        "country_name").join(
        country_codes.set_index("country_name"), how="left").reset_index()
    for col in covid_global_countries_columns:
        if col not in covid_global_countries.columns:
            covid_global_countries[col] = None
    covid_global_countries = covid_global_countries[
        covid_global_countries_columns]
    covid_global_countries.to_csv(
        os.path.join(data_dir, 'covid_19_global_countries.csv'), index=False)

    # global all-regions data file
    now = datetime.datetime.now()
    version_timestamp = now.strftime('%Y%m%d%H%M')
    covid_global = owid_data[[
        'continent', 'location', 'date', 'total_cases', 'total_deaths',
        'total_tests', 'tests_units', 'population']].copy()

    # counties and states parts
    counties = covid_us_counties.copy()
    states = covid_us_states.copy()
    counties['geographic_level'] = 'US County'
    states['geographic_level'] = 'US State'
    counties['country_name'] = 'United States'
    states['country_name'] = 'United States'
    states = states.merge(
        country_codes[['country_name', 'country_iso2', 'country_iso3']],
        how='left', on='country_name')
    counties = counties.merge(
        country_codes[['country_name', 'country_iso2', 'country_iso3']],
        how='left', on='country_name')

    # world part
    world = covid_global[covid_global['location'] == 'World'].copy()
    world['geographic_level'] = 'Global'
    world['country_iso3'] = 'OWID_WRL'
    world = world.rename(columns={
        "total_deaths": "deaths",
        "total_cases": "cases",
        "total_tests": "tests"
    })

    # international part
    international = covid_global[
        covid_global['location'] == 'International'].copy()
    international['geographic_level'] = 'Country'
    international['country_name'] = 'International'
    international = international.rename(columns={
        "total_deaths": "deaths",
        "total_cases": "cases",
        "total_tests": "tests"
    })

    # us part
    us = covid_global[covid_global['location'] == 'United States'].copy()
    us['geographic_level'] = 'Country'
    us['country_name'] = 'United States'
    us_cases = dataapi_us.copy()
    us_cases = us_cases.rename(columns={
        "negative": "tests_negative",
        "positive": "tests_positive",
        "totalTestResults": "tests",
        "hospitalizedCurrently": "patients_hosp",
        "inIcuCurrently": "patients_icu",
        "onVentilatorCurrently": "patients_vent"
    })
    right_df = ny_us.rename(columns={'state': 'state_name'})
    us_cases = pd.merge(us_cases, right_df[['date', 'cases']],
                        how='left', on=['date'])
    cols = ['date', 'tests', 'patients_icu', 'patients_hosp', 'cases',
            'tests_negative', 'patients_vent', 'tests_positive', 'recovered']
    for col in cols:
        if col not in us_cases.columns:
            us_cases[col] = None
    us_cases = us_cases[cols].copy()
    us = us.merge(us_cases, how='left', on='date')
    us = us[[
        'continent', 'date', 'cases', 'total_deaths', 'tests', 'tests_units',
        'population', 'geographic_level', 'country_name', 'patients_icu',
        'patients_hosp', 'patients_vent', 'tests_negative', 'tests_positive',
        'recovered'
    ]]
    us = us.rename(columns={
        "total_deaths": "deaths",
    })
    us = us.merge(
        country_codes[['country_name', 'country_iso2', 'country_iso3',
                       'lat', 'long']],
        how='left', on='country_name')

    # countries part
    countries = covid_global[
        (covid_global['location'] != 'International') &
        (covid_global['location'] != 'World') &
        (covid_global['location'] != 'United States')].copy()
    countries = countries.rename(columns={'location': 'country_name'})
    countries['geographic_level'] = 'Country'
    countries = countries.rename(columns={
        "total_deaths": "deaths",
        "total_cases": "cases",
        "total_tests": "tests"
    })
    countries = countries.merge(
        country_codes[['country_name', 'country_iso2', 'country_iso3',
                       'lat', 'long']],
        how='left', on='country_name')

    # add missing columns in place; column order is normalized after the
    # concat below (reassigning df inside the loop would not update the list)
    data_parts = [us, states, counties, countries, international, world]
    for df in data_parts:
        for col in covid_global_columns:
            if col not in df.columns:
                df[col] = None

    # concatenate all together
    merged = pd.concat(data_parts, join='outer', ignore_index=True)
    merged['version_timestamp'] = version_timestamp
    merged = merged[covid_global_columns]
    merged.to_csv(os.path.join(data_dir, 'covid_19_global.csv'), index=False)

    # upload to s3
    data_set_name = os.environ['DATA_SET_NAME']
    s3_bucket = os.environ['S3_BUCKET']
    s3 = boto3.client('s3')
    s3_uploads = []
    asset_list = []

    for r, d, f in os.walk(data_dir):
        for filename in f:
            obj_name = os.path.join(r, filename).split(
                '/', 3).pop().replace(' ', '_').lower()
            file_location = os.path.join(r, filename)
            new_s3_key = data_set_name + '/dataset/' + obj_name
            has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the lambda_handler function;
    # if it is empty, lambda_handler will not republish
    return asset_list
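# The nytimes, owid and dataapi dicts consumed at the top of this function
# are defined elsewhere in its module. A plausible shape for them, using
# the public URLs those projects published; treat the exact keys and URLs
# as assumptions (the COVID Tracking Project API stopped updating in
# March 2021):
nytimes = {
    'us': 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv',
    'states': 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv',
    'counties': 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv',
}
owid = {
    'data': 'https://covid.ourworldindata.org/data/owid-covid-data.csv',
}
dataapi = {  # The COVID Tracking Project
    'us-summary': 'https://api.covidtracking.com/v1/us/daily.csv',
    'states': 'https://api.covidtracking.com/v1/states/daily.csv',
}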
def source_dataset():
    dataset_name = os.getenv('DATASET_NAME')
    asset_bucket = os.getenv('ASSET_BUCKET')
    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    file_location_csv = os.path.join(data_dir, dataset_name + '.csv')
    file_location_json = os.path.join(data_dir, dataset_name + '.json')

    urls = [
        'https://www.nytimes.com/interactive/2020/us/states-reopen-map-coronavirus.html',
        'https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/state/detail/SCPRC-EST2019-18+POP-RES.csv'
    ]
    with Pool(2) as p:
        data_source = p.map(download_data, urls)

    html = data_source[0].replace('\n', '').replace('\t', '')
    parser = MyHTMLParser()
    parser.feed(html)

    # download and format population data
    population_csv = data_source[1].replace(
        'Puerto Rico Commonwealth', 'Puerto Rico').splitlines()
    population_data = {}
    for state in population_csv[2:]:
        row = state.split(',')
        population_data[row[4]] = row[5]

    # creating the fieldnames variable to set the order of the data
    fieldnames = ['state_abbreviation', 'state', 'businesses', 'masks',
                  'community', 'status_details', 'external_link']

    # adding category variations to fieldnames
    for category in parser.categories:
        fieldnames.append(category)
    fieldnames.append('population')
    # added 5/10/2021 as the source data has changed
    fieldnames.append('reopening')

    # creating the csv file
    with open(file_location_csv, 'w', encoding='utf-8') as c:
        writer = csv.DictWriter(c, fieldnames=fieldnames)
        writer.writeheader()
        for row in parser.full_data:
            row['population'] = int(population_data[row['state']])
            writer.writerow(row)

    # creating the json file
    with open(file_location_json, 'w', encoding='utf-8') as j, \
            open(file_location_csv, 'r') as c:
        reader = csv.DictReader(c)
        j.write('[')
        j.write(',\n'.join(json.dumps(row).replace('""', 'null')
                           for row in reader))
        j.write(']')

    # uploading to s3
    s3_uploads = []
    s3 = boto3.client('s3')

    for filename in os.listdir('/tmp/'):
        if filename.startswith(dataset_name):
            file_location = '/tmp/' + filename
            obj_name = file_location.split(
                '/', 3).pop().replace(' ', '_').lower()
            new_s3_key = dataset_name + '/dataset/' + obj_name
            has_changes = md5_compare(
                s3, asset_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, asset_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': asset_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    asset_list = []
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    return asset_list
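# download_data, mapped over urls above, is another helper not shown here.
# Given how its results are used (string .replace() and .splitlines()), it
# must fetch a URL and return decoded text. A minimal sketch under that
# assumption:
from urllib.request import urlopen


def download_data(url):
    """Hypothetical sketch: fetch a URL and return its body as text."""
    with urlopen(url) as response:
        return response.read().decode('utf-8')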
def source_dataset(source_data_url, s3_bucket, dataset_name):
    """Download the source data from URL and put it in S3"""
    s3 = boto3.client('s3')

    data_dir = '/tmp'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    response = None
    retries = 5

    for attempt in range(retries):
        try:
            response = urlopen(source_data_url)
        except HTTPError as e:
            if attempt == retries - 1:
                raise Exception('HTTPError: ', e.code)
            time.sleep(0.2 * attempt)
        except URLError as e:
            if attempt == retries - 1:
                raise Exception('URLError: ', e.reason)
            time.sleep(0.2 * attempt)
        else:
            break

    if response is None:
        raise Exception('There was an issue downloading the dataset')

    zip_location = os.path.join(data_dir, dataset_name + '.zip')
    with open(zip_location, 'wb') as f:
        f.write(response.read())
    with ZipFile(zip_location, 'r') as z:
        z.extractall(data_dir)
    os.remove(zip_location)

    s3_uploads = []
    asset_list = []

    for r, d, f in os.walk(data_dir):
        for filename in f:
            obj_name = os.path.join(r, filename).split(
                '/', 3).pop().replace(' ', '_').lower()
            file_location = os.path.join(r, filename)
            new_s3_key = os.path.join(dataset_name, 'dataset', obj_name)
            has_changes = md5_compare(s3, s3_bucket, new_s3_key, file_location)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + filename)
            else:
                print('No changes in: ' + filename)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the create_dataset_revision
    # function; if it is empty, lambda_handler will not republish
    return asset_list
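# Several functions in this section note that their asset_list return value
# drives a lambda_handler that republishes the AWS Data Exchange product.
# That handler is not shown; a rough sketch of the shape those comments
# imply, with the environment variables and dataset wiring treated as
# assumptions. A production handler would also poll the import job and
# finalize/publish the new revision.
import os

import boto3


def lambda_handler(event, context):
    """Hypothetical sketch: import changed assets into a new
    Data Exchange revision, or skip republishing entirely."""
    asset_list = source_dataset(
        os.environ['SOURCE_DATA_URL'],  # assumed env vars
        os.environ['S3_BUCKET'],
        os.environ['DATA_SET_NAME'])
    if not asset_list:
        print('No changes detected; skipping republish')
        return

    dataexchange = boto3.client('dataexchange')
    data_set_id = os.environ['DATA_SET_ID']
    revision = dataexchange.create_revision(DataSetId=data_set_id)
    job = dataexchange.create_job(
        Type='IMPORT_ASSETS_FROM_S3',
        Details={'ImportAssetsFromS3': {
            'DataSetId': data_set_id,
            'RevisionId': revision['Id'],
            # asset_list entries are {'Bucket': ..., 'Key': ...}
            'AssetSources': asset_list,
        }})
    dataexchange.start_job(JobId=job['Id'])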
def source_dataset():
    page_url = 'https://www.kff.org/report-section/state-covid-19-data-and-policy-actions-policy-actions/'
    # XPath of the download button to click for each table on the page
    button_xpath_map = {
        'covid-19-state-vaccine-priority-populations':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[1]/div/div/div/div[1]/div/button[2]',
        'covid-19-state-populations-eligiblity-and-residency-requirements':
            '//*[@id="report-wrapper"]/div/div[2]/div[1]/div/div/div[2]/div/div/div/div/div[1]/div/button[2]',
        'covid-19-state-social-distancing-actions':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[5]/div/div/div/div/div/div/div[1]/div/button[2]',
        'covid-19-state-health-policy-actions':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[7]/div[3]/div/div/div/div/div/div/div[1]/div/button[2]',
        'covid-19-state-actions-on-telehealth':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[7]/div[5]/div[2]/div/div/div[1]/div/div/div/div[1]/div/button[2]',
        'covid-19-state-health-care-provider-capacity':
            '//*[@id="report-wrapper"]/div/div[2]/div[2]/div[7]/div[5]/div[4]/div[2]/div/div/div/div/div/div/div/div/div/div/div/div/div/div[2]/div[1]/div[2]/div[2]/div[1]/div/div/div/div/div/div/div/div[1]/div/button[2]',
    }

    data_dir = '/tmp/downloads'
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    s3_bucket = os.environ['S3_BUCKET']
    data_set_name = os.environ['DATA_SET_NAME']

    driver = WebDriver()
    page_source = driver.get_pagesource(page_url, button_xpath_map)

    # log what the browser session downloaded
    for root, dirs, files in os.walk(data_dir):
        print(root)
        print(files)
        print(dirs)
        print('--')

    s3_uploads = []
    asset_list = []
    s3 = boto3.client('s3')

    for root, dirs, files in os.walk(data_dir):
        for f in files:
            print(f)
            new_s3_key = data_set_name + '/dataset/' + f
            file_location = os.path.join(root, f)
            with open(file_location, 'rb') as reader:
                has_changes = md5_compare(s3, s3_bucket, new_s3_key, reader)
            if has_changes:
                s3.upload_file(file_location, s3_bucket, new_s3_key)
                print('Uploaded: ' + f)
            else:
                print('No changes in: ' + f)
            asset_source = {'Bucket': s3_bucket, 'Key': new_s3_key}
            s3_uploads.append({'has_changes': has_changes,
                               'asset_source': asset_source})

    count_updated_data = sum(
        upload['has_changes'] for upload in s3_uploads)
    if count_updated_data > 0:
        asset_list = list(
            map(lambda upload: upload['asset_source'], s3_uploads))
        if len(asset_list) == 0:
            raise Exception('Something went wrong when uploading files to s3')

    # asset_list is returned to be used in the lambda_handler function;
    # if it is empty, lambda_handler will not republish
    return asset_list
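# WebDriver here is a project-specific wrapper, not Selenium's class. All
# that can be inferred from the call site is that it loads the page, clicks
# each button XPath (presumably "download CSV" buttons that drop files into
# /tmp/downloads), and returns the page source. A hedged Selenium sketch of
# that contract; the headless flags, download directory, and fixed waits
# are assumptions:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


class WebDriver:
    """Hypothetical wrapper matching the get_pagesource() call above."""

    def __init__(self, download_dir='/tmp/downloads'):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_experimental_option(
            'prefs', {'download.default_directory': download_dir})
        self.driver = webdriver.Chrome(options=options)

    def get_pagesource(self, page_url, button_xpath_map):
        self.driver.get(page_url)
        for name, xpath in button_xpath_map.items():
            self.driver.find_element(By.XPATH, xpath).click()
            time.sleep(2)  # crude wait for each download to start
        return self.driver.page_source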
def source_dataset(new_filename, s3_bucket, new_s3_key):
    source_url = 'https://www.google.com/covid19/mobility/'

    # Raises an exception if there was a problem accessing the data;
    # otherwise downloads the file and uploads it to S3.
    try:
        source_response = urlopen(source_url)
    except HTTPError as e:
        raise Exception('HTTPError: ', e.code, new_filename)
    except URLError as e:
        raise Exception('URLError: ', e.reason, new_filename)
    else:
        # scrape the landing page for the CSV download link
        html = source_response.read().decode()
        parser = MyHTMLParser()
        parser.feed(html)

        try:
            data_response = urlopen(parser.data)
        except HTTPError as e:
            raise Exception('HTTPError: ', e.code, new_filename)
        except URLError as e:
            raise Exception('URLError: ', e.reason, new_filename)
        else:
            data = data_response.read()
            s3_uploads = []
            s3 = boto3.resource('s3')
            # this variant's md5_compare takes (bucket, key, data) rather
            # than an s3 client as its first argument
            has_changes = md5_compare(
                s3_bucket, new_s3_key + new_filename, BytesIO(data))
            if has_changes:
                s3.Object(s3_bucket, new_s3_key + new_filename).put(Body=data)
                print('Uploaded: ' + new_filename)
            else:
                print('No changes in: ' + new_filename)

            asset_source = {
                'Bucket': s3_bucket,
                'Key': new_s3_key + new_filename
            }
            s3_uploads.append({
                'has_changes': has_changes,
                'asset_source': asset_source
            })

            count_updated_data = sum(
                upload['has_changes'] for upload in s3_uploads)
            asset_list = []
            if count_updated_data > 0:
                asset_list = list(
                    map(lambda upload: upload['asset_source'], s3_uploads))
                if len(asset_list) == 0:
                    raise Exception(
                        'Something went wrong when uploading files to s3')

            return asset_list
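# MyHTMLParser is the last unseen helper (the reopen-map function earlier
# uses a richer variant with .categories and .full_data). This Google
# mobility handler only needs it to pull the report's CSV link out of the
# landing page into parser.data. A minimal html.parser sketch under that
# assumption; the .csv href pattern is a guess:
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Hypothetical sketch: capture the first anchor href that points
    at a CSV download."""

    def __init__(self):
        super().__init__()
        self.data = None

    def handle_starttag(self, tag, attrs):
        if tag == 'a' and self.data is None:
            href = dict(attrs).get('href', '')
            if href.endswith('.csv'):
                self.data = href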