def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Iterates backwards one week at a time, dedupes and formats new records,
             and adds them to the Carto table
    Output: Number of new rows added
    """
    new_data = []
    new_ids = []
    startTime = datetime.datetime.today()
    # Iterate backwards 1 week at a time
    while startTime > MAX_AGE:
        endTime = startTime
        startTime = startTime - datetime.timedelta(days=7)
        query = SOURCE_URL.format(startTime=startTime, endTime=endTime,
                                  minSig=SIGNIFICANT_THRESHOLD)
        logging.info('Fetching data between {} and {}'.format(startTime, endTime))
        res = requests.get(query)
        if not res.ok:
            logging.error(res.text)
        data = res.json()
        new_data = []
        for feature in data['features']:
            coords = feature['geometry']['coordinates']
            lat = coords[1]
            lon = coords[0]
            depth = coords[2]
            props = feature['properties']
            # source timestamps are in milliseconds since the epoch
            dt = datetime.datetime.utcfromtimestamp(props['time'] / 1000).strftime(DATETIME_FORMAT)
            _uid = genUID(lat, lon, depth, dt)
            if _uid not in existing_ids and _uid not in new_ids:
                new_ids.append(_uid)
                row = []
                for field in CARTO_SCHEMA:
                    if field == UID_FIELD:
                        row.append(_uid)
                    elif field == 'the_geom':
                        geom = {'type': 'Point', 'coordinates': [lon, lat]}
                        row.append(geom)
                    elif field == 'depth_in_km':
                        row.append(depth)
                    elif field == 'datetime':
                        row.append(dt)
                    else:
                        row.append(props[field])
                new_data.append(row)
        num_new = len(new_data)
        if num_new:
            logging.info('Adding {} new records'.format(num_new))
            cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                     CARTO_SCHEMA.values(), new_data)
        elif not PROCESS_HISTORY:
            # Break if no results for a week, otherwise keep going
            break
    return len(new_ids)
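
# The genUID helper used above is not defined in this snippet. A minimal sketch of
# what it could look like, assuming the UID is simply the position, depth, and time
# fields joined into one string (the exact scheme is an assumption, not the source's):
def genUID(lat, lon, depth, dt):
    '''Build a unique ID for a record from its coordinates, depth, and datetime string.'''
    return '{}_{}_{}_{}'.format(lat, lon, depth, dt)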
def processData(SOURCE_URL, filename, existing_ids):
    """
    Inputs: FTP SOURCE_URL and filename where data is stored,
            existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds it to the Carto table
    Output: Number of new rows added
    """
    num_new = 0
    res_rows = tryRetrieveData(SOURCE_URL, filename, TIMEOUT, ENCODING)
    new_data = {}
    for row in res_rows:
        # skip header lines, which start with "HDR"
        if not row.startswith("HDR"):
            row = row.split()
            if len(row) == len(CARTO_SCHEMA):
                logging.debug("Processing row: {}".format(row))
                date = decimalToDatetime(float(row[DATETIME_INDEX]))
                row[DATETIME_INDEX] = date
                new_data = insertIfNew(date, row, existing_ids, new_data)
            else:
                logging.debug("Skipping row: {}".format(row))
    if len(new_data):
        num_new += len(new_data)
        new_data = list(new_data.values())
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data)
    return num_new
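
# Neither decimalToDatetime nor insertIfNew is defined in this snippet. Hedged
# sketches of both, assuming dates are encoded as decimal years (e.g. 2016.5) and
# that new_data is a dict keyed by UID; the real implementations may differ.
import datetime

def decimalToDatetime(dec, date_pattern="%Y-%m-%d %H:%M:%S"):
    '''Convert a decimal year (e.g. 2016.5) to a formatted datetime string.'''
    year = int(dec)
    start = datetime.datetime(year, 1, 1)
    seconds_in_year = (datetime.datetime(year + 1, 1, 1) - start).total_seconds()
    dt = start + datetime.timedelta(seconds=seconds_in_year * (dec - year))
    return dt.strftime(date_pattern)

def insertIfNew(uid, values, existing_ids, new_data):
    '''Add a row to the new_data dict only if its UID is not already known.'''
    if uid not in existing_ids and uid not in new_data:
        new_data[uid] = values
    return new_data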
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves data from SOURCE_URL, dedupes and formats it, and adds it
             to the Carto table
    Output: Number of new rows added
    """
    new_rows = []
    num_new = 0
    # Use .splitlines():
    # https://stackoverflow.com/questions/21351882/reading-data-from-a-csv-file-online-in-python-3
    csv_stream = urllib.request.urlopen(SOURCE_URL)
    csv_reader = csv.reader(csv_stream.read().decode(ENCODING).splitlines())
    # See comment under John Machin's answer:
    # https://stackoverflow.com/questions/3428532/how-to-import-a-csv-file-using-python-with-headers-intact-where-first-column-is
    headers = next(csv_reader, None)
    for _row in csv_reader:
        if len(headers) == len(_row):
            row = structure_row(headers, _row)
            if row['id'] not in existing_ids:
                new_row = []
                for field in CARTO_SCHEMA:
                    if field == 'uid':
                        new_row.append(row['id'])
                    elif field == 'the_geom':
                        # Check whether a valid lat/lon was provided; float() fails if either is ''
                        try:
                            lon = float(row['lon'])
                            lat = float(row['lat'])
                            geometry = {
                                'type': 'Point',
                                'coordinates': [lon, lat]
                            }
                            new_row.append(geometry)
                        except (ValueError, TypeError):
                            logging.error('No lat/lon available for this data point - skipping!')
                            new_row.append(None)
                    else:
                        # cartosql cannot handle '' for numeric fields, so insert None instead
                        val = row[field]
                        if val:
                            new_row.append(val)
                        else:
                            new_row.append(None)
                new_rows.append(new_row)
    if len(new_rows):
        num_new = len(new_rows)
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_rows)
    return num_new
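
# structure_row is not defined in this snippet. A minimal sketch, assuming it
# simply zips the header names onto the row values to give dict-style access:
def structure_row(headers, values):
    '''Pair each header name with the corresponding value in the row.'''
    return dict(zip(headers, values))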
def processData():
    '''
    Function to download data and upload it to Carto.
    First tries to fetch the most recent file (starting with yesterday's) three
    times, then steps back one day per attempt until a file is found or
    MAX_TRIES attempts have been made.
    '''
    date = datetime.date.today() - datetime.timedelta(days=1)
    success = False
    tries = 0
    while tries < MAX_TRIES and not success:
        logging.info("Fetching data for {}".format(str(date)))
        f = getFilename(date)
        url = SOURCE_URL.format(date=date.strftime('%Y%m%d'))
        try:
            urllib.request.urlretrieve(url, f)
        except Exception as inst:
            logging.info("Error fetching data for {}".format(str(date)))
            # after three failed attempts, start stepping back one day at a time
            if tries >= 2:
                date = date - datetime.timedelta(days=1)
            tries = tries + 1
            if tries == MAX_TRIES:
                logging.error(
                    "Error fetching data for {}, and max tries reached. See source for last data update."
                    .format(str(datetime.date.today())))
            success = False
        else:
            df = pd.read_csv(f, header=0, usecols=[
                'Lat_DNB', 'Lon_DNB', 'Date_Mscan', 'Date_LTZ',
                'QF_Detect', 'EEZ', 'Land_Mask'
            ])
            # drop rows flagged with the 999999 quality-control value
            df = df.drop(df[df.QF_Detect == 999999].index)
            df['the_geom'] = df.apply(
                lambda row: getGeom(row['Lon_DNB'], row['Lat_DNB']), axis=1)
            df = df[[
                'the_geom', 'QF_Detect', 'Date_Mscan', 'Date_LTZ',
                'Land_Mask', 'Lon_DNB', 'Lat_DNB', 'EEZ'
            ]]
            if not cartosql.tableExists(CARTO_TABLE):
                logging.info('Table {} does not exist'.format(CARTO_TABLE))
                cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)
            else:
                # clear out the old rows and recreate the table before inserting
                cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL')
                cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)
            rows = df.values.tolist()
            logging.info('Success!')
            #logging.info('The following includes the first ten rows added to Carto:')
            #logging.info(rows[:10])
            if len(rows):
                cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                         CARTO_SCHEMA.values(), rows)
            tries = tries + 1
            success = True
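
# getGeom and getFilename are not defined in this snippet. Hedged sketches,
# assuming a GeoJSON point for Carto's the_geom column and a simple local
# filename scheme; DATA_DIR is a hypothetical placeholder, not the source's name.
import os

def getGeom(lon, lat):
    '''Build a GeoJSON point geometry from a longitude and latitude.'''
    return {'type': 'Point', 'coordinates': [float(lon), float(lat)]}

def getFilename(date):
    '''Build a local filename for the downloaded CSV for a given date (scheme is an assumption).'''
    return os.path.join(DATA_DIR, 'data_{}.csv'.format(date.strftime('%Y%m%d')))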
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves data from SOURCE_URL, dedupes and formats it, and adds it
             to the Carto table
    Output: Number of new rows added
    """
    new_rows = []
    res = requests.get(SOURCE_URL)
    csv_reader = csv.reader(res.iter_lines(decode_unicode=True))
    headers = next(csv_reader, None)
    idx = {k: v for v, k in enumerate(headers)}
    for row in csv_reader:
        # stop at the first empty row
        if not len(row):
            break
        else:
            if row[idx['id']] not in existing_ids:
                new_row = []
                for field in CARTO_SCHEMA:
                    if field == 'uid':
                        new_row.append(row[idx['id']])
                    elif field == 'the_geom':
                        # Check whether a valid lat/lon was provided before converting to float
                        lon = row[idx['lon']]
                        lat = row[idx['lat']]
                        if lat and lon:
                            geometry = {
                                'type': 'Point',
                                'coordinates': [float(lon), float(lat)]
                            }
                            new_row.append(geometry)
                        else:
                            logging.debug('No lat/lon available for this data point - skipping!')
                            new_row.append(None)
                    else:
                        # cartosql cannot handle '' for numeric fields, so insert None instead
                        val = row[idx[field]] if row[idx[field]] != '' else None
                        new_row.append(val)
                new_rows.append(new_row)
    num_new = len(new_rows)
    if num_new:
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_rows)
    return num_new
def processData(SOURCE_URL, filename, existing_ids):
    """
    Inputs: FTP SOURCE_URL and filename where data is stored,
            existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds it to the Carto table
    Output: Number of new rows added
    """
    num_new = 0
    ### Specific to each page/chunk in data processing
    res_rows = tryRetrieveData(SOURCE_URL, filename, TIMEOUT, ENCODING)
    new_data = {}
    for row in res_rows:
        ###
        ## CHANGE TO REFLECT CRITERIA FOR KEEPING ROWS FROM THIS DATA SOURCE
        ###
        if not row.startswith("HDR"):
            row = row.split()
            ###
            ## CHANGE TO REFLECT CRITERIA FOR KEEPING ROWS FROM THIS DATA SOURCE
            ###
            if len(row) == len(CARTO_SCHEMA):
                logging.debug("Processing row: {}".format(row))
                # Pull data available in each line
                VALUE_INDEX = 3
                value = row[VALUE_INDEX]
                # Pull times associated with those data
                dttm_elems = {"year_ix": 0, "month_ix": 1, "day_ix": 2}
                date = datetime(year=int(row[0]), month=int(row[1]),
                                day=int(row[2])).strftime("%Y-%m-%d")
                UID = genUID('value_type', date)
                values = [UID, date, value, "value_type"]
                new_data = insertIfNew(UID, values, existing_ids, new_data)
            else:
                logging.debug("Skipping row: {}".format(row))
    if len(new_data):
        num_new += len(new_data)
        new_data = list(new_data.values())
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data)
    ### End page/chunk processing
    return num_new
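
# In this template, genUID takes a value type and a date rather than coordinates.
# A minimal sketch, assuming the UID is just the two parts joined (an assumption):
def genUID(value_type, date):
    '''Build a unique ID for a record from its value type and date.'''
    return '{}_{}'.format(value_type, date)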
def processData(url, filename, existing_ids):
    '''
    Fetch, process and upload new data
    INPUT   url: url where you can find the download link for the source data (string)
            filename: filename for source data (string)
            existing_ids: list of date IDs that we already have in our Carto table (list of strings)
    RETURN  num_new: number of rows of new data sent to Carto table (integer)
    '''
    num_new = 0
    # get the data from source as a list of strings, with each string holding one line from the source data file
    res_rows = tryRetrieveData(url, filename)
    # create an empty dictionary to store new data (data that's not already in our Carto table)
    new_data = {}
    # go through each line of content retrieved from source
    for row in res_rows:
        # get dates by processing lines that come after the header (header lines start with "HDR")
        if not row.startswith("HDR"):
            # split line by space to get dates
            row = row.split()
            # if the length of the contents in the row matches the length of CARTO_SCHEMA
            if len(row) == len(CARTO_SCHEMA):
                logging.debug("Processing row: {}".format(row))
                # get date by accessing the third element in the list of row
                date = decimalToDatetime(row[2])
                # replace decimal date with datetime in data row
                row[2] = date
                # For each new date, check whether it is already in our table.
                # If not, add it to the queue for processing
                new_data = insertIfNew(date, row, existing_ids, new_data)
            else:
                logging.debug("Skipping row: {}".format(row))
    # if we have found new dates to process
    if len(new_data):
        num_new += len(new_data)
        # create a list of new data
        new_data = list(new_data.values())
        # insert new data into the carto table
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(),
                                 new_data, user=CARTO_USER, key=CARTO_KEY)
    return num_new
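
# tryRetrieveData, used by several of these scripts, is not defined in this
# snippet. A hedged sketch of an FTP/HTTP fetch that returns the file as a list
# of decoded lines; the retry and error-handling details are assumptions.
import logging
import urllib.request

def tryRetrieveData(url, filename, timeout=300, encoding='utf-8'):
    '''Fetch url/filename and return its contents as a list of text lines, or [] on failure.'''
    try:
        with urllib.request.urlopen(url + filename, timeout=timeout) as res:
            return res.read().decode(encoding).splitlines()
    except Exception as e:
        logging.error('Could not retrieve {}: {}'.format(filename, e))
        return []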
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds it to the Carto table
    Output: Number of new rows added
    """
    new_data = []
    new_ids = []
    today = datetime.today()
    if PROCESS_HISTORY:
        # walk forward from MAX_AGE to today in month-long windows
        startTime = MAX_AGE
        endTime = startTime + timedelta(days=31)
        while startTime < today:
            logging.info('Fetching data between {} and {}'.format(startTime, endTime))
            new_data, new_ids = appendTimeFrame(existing_ids, startTime, endTime,
                                                new_data, new_ids)
            startTime = endTime
            endTime = startTime + timedelta(days=31)
    else:
        # Use defaults of endpoint
        startTime = ''
        endTime = ''
        logging.info('Fetching data for last 30 days')
        new_data, new_ids = appendTimeFrame(existing_ids, startTime, endTime,
                                            new_data, new_ids)
    num_new = len(new_ids)
    if num_new:
        logging.info('Adding {} new records'.format(num_new))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data)
    return num_new
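
# appendTimeFrame is not defined in this snippet. A hedged sketch of its contract
# only: fetch one time window from the source, keep rows whose UIDs are unseen,
# and return the accumulated lists. The query parameters, the genUID signature,
# and the field access are all assumptions, not the source's implementation.
def appendTimeFrame(existing_ids, startTime, endTime, new_data, new_ids):
    '''Fetch records between startTime and endTime and append the ones not yet seen.'''
    res = requests.get(SOURCE_URL.format(startTime=startTime, endTime=endTime))
    for feature in res.json()['features']:
        _uid = genUID(feature)  # hypothetical UID helper
        if _uid not in existing_ids and _uid not in new_ids:
            new_ids.append(_uid)
            new_data.append([feature.get(field) for field in CARTO_SCHEMA])
    return new_data, new_ids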
# add the WB Indicator Code column name and type for this value to the Carto Schema
CARTO_SCHEMA.update({'indicator_code' + str(i + 1): 'text'})
# add the RW country name and country code columns to the table
CARTO_SCHEMA.update({"rw_country_name": 'text'})
CARTO_SCHEMA.update({"rw_country_code": 'text'})

cartosql.deleteRows(table_name, 'cartodb_id IS NOT NULL', user=CARTO_USER, key=CARTO_KEY)

# Insert new observations
if len(all_world_bank_data):
    cartosql.blockInsertRows(table_name, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(),
                             all_world_bank_data.values.tolist(),
                             user=CARTO_USER, key=CARTO_KEY)
    logging.info('Success! New rows have been added to Carto.')
else:
    logging.info('No rows to add to Carto.')

'''
Upload original data and processed data to Amazon S3 storage
'''
logging.info('Uploading original data to S3.')
# Copy the raw data into a zipped file to upload to S3
raw_data_dir = os.path.join(data_dir, dataset_name + '.zip')
with ZipFile(raw_data_dir, 'w') as zipped:
    for raw_data_file in raw_data_files:
        zipped.write(raw_data_file, os.path.basename(raw_data_file))
# UPLOAD
# specify column names and types
CARTO_SCHEMA = {
    'iso3': 'text',
    'country': 'text',
    'year': 'numeric',
    'vulnerability': 'numeric',
    'readiness': 'numeric',
    'gain': 'numeric'
}
# check if table exists
if cartosql.tableExists(CARTO_TABLE, user=os.getenv('CARTO_WRI_RW_USER'),
                        key=os.getenv('CARTO_WRI_RW_KEY')):
    print('This table already exists. Please change the name and try again.')
else:
    # create table with appropriate columns
    cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA, user=os.getenv('CARTO_WRI_RW_USER'),
                         key=os.getenv('CARTO_WRI_RW_KEY'))
    # send processed data to table
    cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(),
                             final_df.values.tolist(),
                             user=os.getenv('CARTO_WRI_RW_USER'),
                             key=os.getenv('CARTO_WRI_RW_KEY'))
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds it to the Carto table
    Output: Number of new rows added
    """
    num_new = 0
    year = datetime.today().year
    logging.info("Fetching data for {}".format(year))
    headers, rows = fetchAndFormatData(year)
    logging.info("Num rows: {}".format(len(rows)))
    year_history = 10
    count = 0
    while count < year_history:
        year -= 1
        logging.info("Fetching data for {}".format(year))
        try:
            more_headers, more_rows = fetchAndFormatData(year)
            # Check that headers for historical data match the newest data
            logging.info('More headers: {}'.format(more_headers))
            assert headers == more_headers
            rows.extend(more_rows)
            logging.info('Fetched additional data for year {}'.format(year))
        except Exception:
            logging.warning("Couldn't fetch data for year {}".format(year))
        logging.info("Num rows: {}".format(len(rows)))
        count += 1
    new_rows = []
    for _row in rows:
        row = structure_row(headers, _row)
        if str(row['Web ID']) not in existing_ids:
            uid = row['Web ID']
            logging.debug('Row: {}'.format(row))
            lat, lon = [float(loc.strip()) for loc in row['Location Coordinates'].split(',')]
            geometry = {
                'type': 'Point',
                'coordinates': [lon, lat]
            }
            new_row = []
            for field in CARTO_SCHEMA:
                if field == UID_FIELD:
                    new_row.append(uid)
                elif field == 'the_geom':
                    new_row.append(geometry)
                else:
                    new_row.append(row[field.replace('_', ' ')])
            new_row = clean_row(new_row)
            new_rows.append(new_row)
    if len(new_rows):
        num_new = len(new_rows)
        logging.debug("15 rows from the middle of new_rows: {}".format(new_rows[1000:1015]))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_rows)
    return num_new
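
# clean_row is not defined in this snippet. A minimal sketch, assuming it maps
# empty strings to None so cartosql's numeric columns don't choke (an assumption
# consistent with the other scripts here):
def clean_row(row):
    '''Replace empty-string values with None so they insert cleanly into Carto.'''
    return [None if val == '' else val for val in row]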
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds it to the Carto table
    Output: Number of new rows added
    """
    new_data = []
    new_ids = []
    res = req.get(SOURCE_URL)
    xml = lxml.etree.fromstring(res.content)
    json = xml2json.data(xml)
    items = json['channel']['item']
    for item in items:
        # the title holds the volcano name with the country in parentheses
        title = item['title'].split(')')[0].split('(')
        place_info = [place.strip() for place in title]
        volcano_name = place_info[0]
        country_name = place_info[1]
        coords = item['{http://www.georss.org/georss}point'].split(' ')
        dt = parser.parse(item['pubDate'], fuzzy=True).strftime(DATETIME_FORMAT)
        # convert the georss point strings to numbers for the GeoJSON geometry
        lat = float(coords[0])
        lon = float(coords[1])
        geom = {'type': 'Point', 'coordinates': [lon, lat]}
        info = item['description'].split('Source:')
        if len(info) < 2:
            info = item['description'].split('Sources:')
        description_text = [text.replace('<p>', '').replace('</p>', '') for text in info]
        description = description_text[0]
        sources = description_text[1]
        _uid = genUID(lat, lon, dt)
        if _uid not in existing_ids + new_ids:
            new_ids.append(_uid)
            row = []
            for field in CARTO_SCHEMA:
                if field == 'uid':
                    row.append(_uid)
                elif field == 'the_geom':
                    row.append(geom)
                elif field == 'pubdate':
                    row.append(dt)
                elif field == 'description':
                    row.append(description)
                elif field == 'sources':
                    row.append(sources)
                elif field == 'volcano_name':
                    row.append(volcano_name)
                elif field == 'country_name':
                    row.append(country_name)
            new_data.append(row)
    num_new = len(new_ids)
    if num_new:
        logging.info('Adding {} new records'.format(num_new))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data)
    return num_new
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds it to the Carto table
    Output: Number of new rows added
    """
    new_rows = []
    res = requests.get(SOURCE_URL)
    csv_reader = csv.reader(res.iter_lines(decode_unicode=True))
    headers = next(csv_reader, None)
    idx = {k: v for v, k in enumerate(headers)}
    for row in csv_reader:
        # skip empty rows
        if not len(row):
            continue
        else:
            # This data set has some entries with breaks in the last column, which the csv_reader interprets
            # as an individual row. See if the new id can be converted to an integer. If it can, it is probably
            # a new row.
            try:
                int(row[idx['id']])
                id = row[idx['id']]
                if id not in existing_ids:
                    logging.info('new row for {}'.format(id))
                    new_row = []
                    for field in CARTO_SCHEMA:
                        if field == 'uid':
                            new_row.append(row[idx['id']])
                        elif field == 'the_geom':
                            # Check whether a valid lat/lon was provided before building the geometry
                            lon = row[idx['lon']]
                            lat = row[idx['lat']]
                            if lat and lon:
                                geometry = {
                                    'type': 'Point',
                                    'coordinates': [float(lon), float(lat)]
                                }
                                new_row.append(geometry)
                            else:
                                logging.debug('No lat/lon available for this data point - skipping!')
                                new_row.append(None)
                        else:
                            # cartosql cannot handle '' for numeric fields, so insert None instead
                            try:
                                val = row[idx[field]] if row[idx[field]] != '' else None
                                new_row.append(val)
                            except IndexError:
                                pass
                    new_rows.append(new_row)
            # If we can't convert to an integer, the last row probably got cut off.
            except ValueError:
                # Using the id from the last entry: if this id was already in the Carto table, we will skip it
                if id in existing_ids:
                    pass
                # If it is a new id, we need to go fix that row.
                else:
                    # If the row is only one item, append the rest of the information to the last description.
                    if len(row) == 1:
                        new_rows[-1][-1] = new_rows[-1][-1] + ' ' + row[0].replace('\t', '')
                    # If several things are in the row, the break was probably mid-row.
                    elif len(row) > 1 and len(row) < 17:
                        # finish the last description
                        new_rows[-1][-1] = new_rows[-1][-1] + ' ' + row[0].replace('\t', '')
                        # append the other items to the row
                        new_row = new_rows[-1]
                        offset_factor = len(new_rows[-1]) - 1
                        for field in CARTO_SCHEMA:
                            if field == 'uid' or field == 'the_geom':
                                continue
                            try:
                                loc = idx[field] - offset_factor
                                if loc > 0:
                                    val = row[loc] if row[loc] != '' else None
                                    new_row.append(val)
                            except IndexError:
                                pass
                        # write the repaired row back in place
                        new_rows[-1] = new_row
    num_new = len(new_rows)
    if num_new:
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_rows)
    return num_new
def processData(existing_ids):
    '''
    Fetch, process, upload, and clean new data
    INPUT   existing_ids: list of WDPA IDs that we already have in our Carto table (list of strings)
    RETURN  num_new: number of rows of data sent to Carto table (integer)
    '''
    # turn list of existing ids from strings into integers
    existing_ids_int = [int(i) for i in existing_ids]
    # fetch list of WDPA IDs (list of all IDs and list of new ones) so that we can pull info from the API about each area
    new_ids, all_ids = fetch_ids(existing_ids_int)
    # if we have designated that we want to replace all the ids, then the list of IDs we will query (id_list) will
    # include all the IDs available; otherwise, we will just pull the new IDs
    if REPLACE_ALL == True:
        id_list = all_ids
    else:
        id_list = new_ids
    # create empty list to store IDs for rows we want to send to Carto so that we can delete any current entries before
    # sending new data
    send_list = []
    # create empty list to store data we will be sending to Carto table
    new_data = []
    # count of rows sent to Carto so far
    num_new = 0
    # go through and fetch information for each of the ids
    for id in id_list:
        # set try number to 0 for this area's ID because this will be our first try fetching the data
        try_num = 0
        # generate the url to pull data for this area from the WDPA API
        # WDPA API Reference document: https://api.protectedplanet.net/documentation#get-v3protectedareas
        url = "https://api.protectedplanet.net/v3/protected_areas/{}?token={}".format(id, os.getenv('WDPA_key'))
        # try up to 3 times to fetch the data for this area from the source
        while try_num < 3:
            try:
                r = requests.get(url)
                break
            except requests.exceptions.RequestException:
                # if the API call fails, wait 60 seconds before the next attempt to fetch the data
                time.sleep(60)
                try_num += 1
        else:
            # after 3 failures to fetch data for this ID, log that the data could not be fetched
            logging.info(f'Could not fetch {id}')
        # process the retrieved data
        try:
            # pull data from request response json
            data = r.json()['protected_area']
            # create an empty list to store the processed data for this row that we will send to Carto
            row = []
            # go through each column in the Carto table
            for key in CARTO_SCHEMA.keys():
                # find the location in the json where you can find this column's data
                location = JSON_LOC[key]
                # make a copy of the data that we can modify
                key_data = copy.copy(data)
                # if we are fetching data for the country_name column and there is more than one country,
                # we will need to process this entry
                if key == 'country_name' and len(key_data['countries']) > 1:
                    # get the list of countries
                    countries = key_data["countries"]
                    # make a list of the country names
                    c_list = []
                    for country in countries:
                        c_list.append(country["name"])
                    # turn this list into a single string with the country names listed, separated by a semicolon
                    key_data = '; '.join(c_list)
                # we will also need to process the iso3 data if there is more than one country
                elif key == 'iso3' and len(key_data['countries']) > 1:
                    # get the list of countries
                    countries = key_data["countries"]
                    # make a list of the country iso3 values
                    c_list = []
                    for country in countries:
                        c_list.append(country["iso_3"])
                    # turn this list into a single string with the country iso3s listed, separated by a semicolon
                    key_data = '; '.join(c_list)
                # for any other column, no special processing is required at this point, just pull out the data from
                # the correct location in the json
                else:
                    # go through each nested name
                    for sub in location:
                        # try to pull out the data from that name
                        try:
                            key_data = key_data[sub]
                            # if the data is a string, remove any leading or trailing whitespace
                            if type(key_data) == str:
                                key_data = key_data.rstrip()
                        # if we aren't able to find the data for this column, set the data as a None value and move
                        # on to the next column
                        except (TypeError, IndexError):
                            key_data = None
                            break
                # if we were able to successfully find the value for the column, do any additional required processing
                if key_data:
                    # pull the year from the data from the 'legal status updated at' field
                    if key == 'status_yr':
                        key_data = int(key_data[-4:])
                    # turn the wdpa_id into an integer
                    if key == 'wdpa_id':
                        # pull it from the API entry, if possible
                        if key_data:
                            key_data = int(key_data)
                        # otherwise just use the id from the list of ids we are going through (some entries are missing
                        # this field on the API)
                        else:
                            key_data = int(id)
                        # add this ID to the list of IDs we are sending new data for
                        send_list.append(key_data)
                    # turn these columns into float data
                    if key == 'no_tk_area' or key == 'rep_area' or key == 'rep_m_area':
                        key_data = float(key_data)
                    # turn the legal_status_updated_at column into a datetime
                    if key == 'legal_status_updated_at':
                        key_data = datetime.datetime.strptime(key_data, '%m/%d/%Y')
                # if no data was found for this column, make sure the entry is None
                else:
                    key_data = None
                # add this value to the row data
                row.append(key_data)
            # if this ID's row of data was processed, add it to the new data to be sent to Carto
            if len(row):
                new_data.append(row)
        # if we failed to process this data, log an error
        except Exception as e:
            logging.error('error pulling {}'.format(id))
        # send data: for every 1000 rows processed, push the batch to Carto
        if (id_list.index(id) % 1000) == 0 and id_list.index(id) > 1:
            logging.info('{} records processed.'.format(id_list.index(id)))
            if len(new_data):
                # delete the old entries in the Carto table for the IDs we have processed
                logging.info('Deleting old records in this batch')
                delete_carto_entries(send_list, 'wdpa_id')
                # push new data rows to Carto
                logging.info('Adding {} new records.'.format(len(new_data)))
                cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(),
                                         new_data, user=CARTO_USER, key=CARTO_KEY)
                num_new += len(new_data)
                # start with empty lists again to process the next batch of data
                new_data = []
                send_list = []
    # send any rows left over from the final, incomplete batch
    if len(new_data):
        logging.info('Deleting old records in this batch')
        delete_carto_entries(send_list, 'wdpa_id')
        logging.info('Adding {} new records.'.format(len(new_data)))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(),
                                 new_data, user=CARTO_USER, key=CARTO_KEY)
        num_new += len(new_data)
    # delete rows for areas that are no longer in the WDPA dataset
    logging.info('Deleting records that are no longer in the database.')
    # get a list of IDs that are in the Carto table but not in the most recent WDPA dataset
    deleted_ids = np.setdiff1d(existing_ids_int, id_list)
    # delete these rows from the Carto table
    delete_carto_entries(deleted_ids, 'wdpa_id')
    logging.info('{} ids deleted'.format(len(deleted_ids)))
    return num_new
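
# delete_carto_entries is not defined in this snippet. A hedged sketch using
# cartosql.deleteRows with an "id_field in (...)" filter, matching how deleteRows
# is called elsewhere in these scripts; the batching details are assumptions.
def delete_carto_entries(id_list, id_field):
    '''Delete rows from the Carto table whose id_field value is in id_list.'''
    if len(id_list):
        where = '{} in ({})'.format(id_field, ','.join(str(i) for i in id_list))
        cartosql.deleteRows(CARTO_TABLE, where, user=CARTO_USER, key=CARTO_KEY)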
def processNewData(url):
    '''
    Fetch, process and upload new data
    INPUT   url: url where you can find the download link for the source data (string)
    RETURN  num_new: number of rows of new data sent to Carto table (integer)
    '''
    # specify the starting page of source url we want to pull
    page = 1
    # generate the url and pull data for this page
    r = requests.get(url.format(page=page))
    # pull data from request response json
    raw_data = r.json()['data']
    # if data is available from source url
    if len(raw_data) > 0:
        # if the table exists, delete all the rows before the new upload
        if cartosql.tableExists(CARTO_TABLE, user=CARTO_USER, key=CARTO_KEY):
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL',
                                user=CARTO_USER, key=CARTO_KEY)
            logging.info('Updating {}'.format(CARTO_TABLE))
    else:
        # log an error that data is not available from source url
        logging.error("Source data missing. Table will not update.")
    # create an empty list to store new data
    new_data = []
    # while data is available from source url
    while len(raw_data) > 0:
        logging.info('Processing page {}'.format(page))
        # read in source data as a pandas dataframe
        df = pd.DataFrame(raw_data)
        # go through each row in the dataframe
        for row_num in range(df.shape[0]):
            # get the row of data
            row = df.iloc[row_num]
            # create an empty list to store data from this row
            new_row = []
            # go through each column in the Carto table
            for field in CARTO_SCHEMA:
                # if we are fetching data for unique id column
                if field == 'uid':
                    # add the unique id to the list of data from this row
                    new_row.append(row[UID_FIELD])
                # for any other column, check if there are values available from the source for this row
                else:
                    # if data is available from source for this field, use it; else use None
                    val = row[field] if row[field] != '' else None
                    # add this value to the list of data from this row
                    new_row.append(val)
            # add the list of values from this row to the list of new data
            new_data.append(new_row)
        # go to the next page and check for data
        page += 1
        # generate the url and pull data for this page
        r = requests.get(url.format(page=page))
        # pull data from request response json
        raw_data = r.json()['data']
    # find the length (number of rows) of new_data
    num_new = len(new_data)
    # if we have found new rows to process
    if num_new:
        # insert new data into the carto table
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(),
                                 new_data, user=CARTO_USER, key=CARTO_KEY)
    return num_new
def processNewData(existing_ids, existing_files):
    file_list = list_available_files(SOURCE_URL)
    file_ids = [get_file_id(file) for file in file_list]
    file_base = 'https://dataverse.harvard.edu/api/access/datafile/'
    new_file_urls = []
    new_ids = []
    for file_id in file_ids:
        if file_id not in existing_files:
            new_file_urls.append(file_base + file_id)
            new_ids.append(file_id)
    logging.info('Number of new files: {}'.format(len(new_ids)))
    total_new = 0
    for file_num in range(len(new_file_urls)):
        if new_ids[file_num] in BAD_FILES:
            continue
        file_url = new_file_urls[file_num]
        logging.info('Processing file {}'.format(new_ids[file_num]))
        new_rows = []
        res = urlopen(file_url)
        zipfile = ZipFile(BytesIO(res.read()))
        df = pd.read_csv(zipfile.open(zipfile.namelist()[0]), sep='\t')
        df['File ID'] = new_ids[file_num]
        for row_num in range(df.shape[0]):
            row = df.iloc[row_num]
            if not len(row):
                break
            elif pd.isna(row['Longitude']) or pd.isna(row['Latitude']):
                continue
            elif row['Event ID'] not in existing_ids:
                new_row = []
                for field in CARTO_SCHEMA:
                    if field == 'uid':
                        new_row.append(str(row['Event ID']))
                    elif field == 'Event_ID':
                        new_row.append(str(row['Event ID']))
                    elif field == 'the_geom':
                        # lat/lon were already checked for NaN above, so build the point geometry
                        lon = float(row['Longitude'])
                        lat = float(row['Latitude'])
                        geometry = {
                            'type': 'Point',
                            'coordinates': [lon, lat]
                        }
                        new_row.append(geometry)
                    else:
                        # cartosql cannot handle '' or NaN for numeric fields, so insert None instead
                        val = row[field.replace('_', ' ')]
                        if val == '' or (type(val) == float and np.isnan(val)):
                            val = None
                        new_row.append(val)
                new_rows.append(new_row)
        num_new = len(new_rows)
        if num_new:
            cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(),
                                     new_rows, user=os.getenv('CARTO_USER'),
                                     key=os.getenv('CARTO_KEY'))
            total_new += num_new
    return total_new
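
# list_available_files and get_file_id are not defined in this snippet. Hedged
# sketches, assuming SOURCE_URL returns a Dataverse dataset listing whose file
# entries carry a numeric dataFile id; the exact JSON layout is an assumption.
def list_available_files(url):
    '''Return the list of file entries in the dataset listing at url.'''
    return requests.get(url).json()['data']['latestVersion']['files']

def get_file_id(file):
    '''Pull the file id out of one file entry, as a string.'''
    return str(file['dataFile']['id'])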
def processInteractions():
    r = cartosql.get("SELECT * FROM {} WHERE current='True'".format(CARTO_TABLE),
                     user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
    interaction_data = r.json()['rows']
    try_num = 0
    # if we didn't get data back, wait a few minutes and try again
    while not len(interaction_data):
        logging.info('Sleeping and trying again.')
        try_num += 1
        time.sleep(300)
        r = cartosql.get("SELECT * FROM {} WHERE current='True'".format(CARTO_TABLE),
                         user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
        interaction_data = r.json()['rows']
        if try_num > 5:
            logging.error('Problem fetching data to generate interactions')
            exit()
    countries_with_interaction = []
    for interaction in interaction_data:
        ctry = interaction['country_iso3']
        if ctry not in countries_with_interaction:
            countries_with_interaction.append(ctry)
    if cartosql.tableExists(CARTO_TABLE_INTERACTION, user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY')):
        cartosql.deleteRows(CARTO_TABLE_INTERACTION, 'cartodb_id IS NOT NULL',
                            user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
    # run to create new table
    # existing_interaction_ids = checkCreateTable(CARTO_TABLE_INTERACTION, CARTO_SCHEMA_INTERACTION, UID_FIELD)
    new_interactions = []
    for ctry in countries_with_interaction:
        r = cartosql.get("SELECT * FROM {} WHERE current='True' AND country_iso3='{}'".format(CARTO_TABLE, ctry),
                         user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
        ctry_interaction_data = r.json()['rows']
        event_num = 1
        interaction_str = ''
        for interaction in ctry_interaction_data:
            event = interaction['event_name'].split(": ", 1)
            # use the text after the first ": " if present, otherwise the whole event name
            event_text = event[0] if len(event) == 1 else event[1]
            if event_num == 1:
                interaction_str = '{} ({})'.format(event_text, interaction['url'])
            else:
                interaction_str = interaction_str + '; ' + '{} ({})'.format(event_text, interaction['url'])
            event_num += 1
        # uid = gen_interaction_uid(ctry)
        if ctry_interaction_data:
            row = []
            for key in CARTO_SCHEMA_INTERACTION.keys():
                try:
                    if key == 'the_geom':
                        lon = ctry_interaction_data[0]['lon']
                        lat = ctry_interaction_data[0]['lat']
                        item = {'type': 'Point', 'coordinates': [lon, lat]}
                    elif key == 'interaction':
                        item = interaction_str
                    else:
                        item = ctry_interaction_data[0][key]
                except KeyError:
                    item = None
                row.append(item)
            new_interactions.append(row)
    logging.info('Adding {} new interactions'.format(len(new_interactions)))
    cartosql.blockInsertRows(CARTO_TABLE_INTERACTION, CARTO_SCHEMA_INTERACTION.keys(),
                             CARTO_SCHEMA_INTERACTION.values(), new_interactions,
                             user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
def processData(existing_ids):
    new_data = []
    new_ids = []
    r = requests.get(SOURCE_URL)
    data_bytes = r.content
    decoded = data_bytes.decode('utf8')
    json_data = json.loads(decoded)
    data_dict = json_data['data']
    for entry in data_dict:
        event_id = entry['id']
        # collect the ids and names of all event types for this entry
        ids = []
        names = []
        for t in entry['fields']['type']:
            ids.append(t['id'])
            names.append(t['name'])
        ids = ', '.join(map(str, ids))
        names = ', '.join(map(str, names))
        for country in entry['fields']['country']:
            country_id = country['id']
            uid = gen_uid(event_id, country_id)
            if uid not in existing_ids + new_ids:
                new_ids.append(uid)
                row = []
                for key in CARTO_SCHEMA.keys():
                    try:
                        if key == 'the_geom':
                            lon = country['location']['lon']
                            lat = country['location']['lat']
                            item = {'type': 'Point', 'coordinates': [lon, lat]}
                        elif key == 'uid':
                            item = uid
                        elif key == 'event_id':
                            item = int(event_id)
                        elif key == 'event_name':
                            item = entry['fields']['name']
                        elif key == 'description':
                            item = entry['fields']['description']
                        elif key == 'status':
                            item = entry['fields']['status']
                        elif key == 'date':
                            item = datetime.datetime.strptime(
                                entry['fields']['date']['created'], DATETIME_FORMAT)
                        elif key == 'glide':
                            item = entry['fields']['glide']
                        elif key == 'related_glide':
                            item = ', '.join(map(str, entry['fields']['related_glide']))
                        elif key == 'featured':
                            item = str(entry['fields']['featured'])
                        elif key == 'primary_country':
                            item = entry['fields']['primary_country']['iso3']
                        elif key == 'country_name':
                            item = country['name']
                        elif key == 'country_shortname':
                            item = country['shortname']
                        elif key == 'country_iso3':
                            item = country['iso3']
                        elif key == 'current':
                            item = str(entry['fields']['current'])
                        elif key == 'event_type_ids':
                            item = ids
                        elif key == 'event_types':
                            item = names
                        elif key == 'url':
                            item = entry['fields']['url']
                        elif key == 'lon':
                            item = country['location']['lon']
                        elif key == 'lat':
                            item = country['location']['lat']
                    except KeyError:
                        item = None
                    row.append(item)
                new_data.append(row)
    num_new = len(new_ids)
    if num_new:
        logging.info('Adding {} new records'.format(num_new))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data)
    return num_new
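
# gen_uid is not defined in this snippet. A minimal sketch, assuming the UID is
# just the event and country ids joined (an assumption):
def gen_uid(event_id, country_id):
    '''Build a unique ID for an event/country pair.'''
    return '{}_{}'.format(event_id, country_id)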