Ejemplo n.º 1
0
def processData(existing_ids):
    """
    Fetch significant-earthquake data one week at a time, dedupe against
    existing_ids, and insert new rows into the Carto table.
    Inputs: existing_ids - uids already present in the Carto table
    Output: number of new uids added
    """
    new_data = []
    new_ids = []

    startTime = datetime.datetime.today()

    # Iterate backwards 1-week at a time
    while startTime > MAX_AGE:
        endTime = startTime
        startTime = startTime - datetime.timedelta(days=7)
        query = SOURCE_URL.format(startTime=startTime,
                                  endTime=endTime,
                                  minSig=SIGNIFICANT_THRESHOLD)

        logging.info('Fetching data between {} and {}'.format(
            startTime, endTime))
        res = requests.get(query)
        if not res.ok:
            logging.error(res.text)
            # FIX: a failed request previously fell through to res.json(),
            # which raises on non-JSON error bodies; skip to the next week
            continue
        data = res.json()
        new_data = []

        for feature in data['features']:
            coords = feature['geometry']['coordinates']
            lat = coords[1]
            lon = coords[0]
            depth = coords[2]

            props = feature['properties']
            # source timestamps are epoch milliseconds
            dt = datetime.datetime.utcfromtimestamp(
                props['time'] / 1000).strftime(DATETIME_FORMAT)

            _uid = genUID(lat, lon, depth, dt)
            if _uid not in existing_ids and _uid not in new_ids:
                new_ids.append(_uid)
                row = []
                for field in CARTO_SCHEMA:
                    if field == UID_FIELD:
                        row.append(_uid)
                    elif field == 'the_geom':
                        # GeoJSON point uses [lon, lat] ordering
                        geom = {'type': 'Point', 'coordinates': [lon, lat]}
                        row.append(geom)
                    elif field == 'depth_in_km':
                        row.append(depth)
                    elif field == 'datetime':
                        row.append(dt)
                    else:
                        row.append(props[field])
                new_data.append(row)

        num_new = len(new_data)
        if num_new:
            logging.info('Adding {} new records'.format(num_new))
            cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                     CARTO_SCHEMA.values(), new_data)
        elif not PROCESS_HISTORY:
            # Break if no results for a week otherwise keep going
            break

    return (len(new_ids))
Ejemplo n.º 2
0
def processData(SOURCE_URL, filename, existing_ids):
    """
    Inputs: FTP SOURCE_URL and filename where data is stored, existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds to Carto table
    Output: Number of new rows added
    """
    res_rows = tryRetrieveData(SOURCE_URL, filename, TIMEOUT, ENCODING)
    pending = {}
    for line in res_rows:
        # header lines start with "HDR"; only data lines are parsed
        if line.startswith("HDR"):
            continue
        fields = line.split()
        if len(fields) != len(CARTO_SCHEMA):
            logging.debug("Skipping row: {}".format(fields))
            continue
        logging.debug("Processing row: {}".format(fields))
        # convert the decimal-formatted date into a datetime in place
        parsed_date = decimalToDatetime(float(fields[DATETIME_INDEX]))
        fields[DATETIME_INDEX] = parsed_date
        # queue the row only if this date is not already in the table
        pending = insertIfNew(parsed_date, fields, existing_ids, pending)

    num_new = len(pending)
    if num_new:
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(),
                                 list(pending.values()))

    return (num_new)
Ejemplo n.º 3
0
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds to Carto table
    Output: Number of new rows added
    """
    new_rows = []
    num_new = 0

    # Use .splitlines():
    # https://stackoverflow.com/questions/21351882/reading-data-from-a-csv-file-online-in-python-3
    csv_stream = urllib.request.urlopen(SOURCE_URL)
    csv_reader = csv.reader(csv_stream.read().decode(ENCODING).splitlines())
    # See comment under John Machin's answer:
    # https://stackoverflow.com/questions/3428532/how-to-import-a-csv-file-using-python-with-headers-intact-where-first-column-is
    headers = next(csv_reader, None)

    for _row in csv_reader:
        # only rows with the full set of columns can be structured
        if len(headers) == len(_row):
            row = structure_row(headers, _row)
            if row['id'] not in existing_ids:

                new_row = []
                for field in CARTO_SCHEMA:
                    if field == 'uid':
                        new_row.append(row['id'])
                    elif field == 'the_geom':
                        # Check for whether valid lat lon provided, will fail if either are ''
                        try:
                            lon = float(row['lon'])
                            lat = float(row['lat'])
                            geometry = {
                                'type': 'Point',
                                'coordinates': [lon, lat]
                            }
                            new_row.append(geometry)
                        # FIX: narrowed from a bare except; only the expected
                        # failures of a missing/blank coordinate are handled
                        except (KeyError, TypeError, ValueError):
                            logging.error(
                                'No lat long available for this data point - skipping!'
                            )
                            new_row.append(None)
                    else:
                        # To fix trouble w/ cartosql not being able to handle '':
                        val = row[field]
                        if val:
                            new_row.append(val)
                        else:
                            new_row.append(None)

                new_rows.append(new_row)

    if len(new_rows):
        num_new = len(new_rows)
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_rows)

    return num_new
Ejemplo n.º 4
0
def processData():
    '''
    Function to download data and upload it to Carto
    Will first try to get the data for today three times
    Then decrease a day up until 8 tries until it finds one
    '''
    date = datetime.date.today() - datetime.timedelta(days=1)
    success = False
    tries = 0
    while tries < MAX_TRIES and success == False:
        logging.info("Fetching data for {}".format(str(date)))
        f = getFilename(date)
        url = SOURCE_URL.format(date=date.strftime('%Y%m%d'))
        try:
            urllib.request.urlretrieve(url, f)

        except Exception as inst:
            logging.info("Error fetching data for {}".format(str(date)))
            # after three failed tries for one date, start walking back a day
            if tries >= 2:
                date = date - datetime.timedelta(days=1)
            tries = tries + 1
            if tries == MAX_TRIES:
                logging.error(
                    "Error fetching data for {}, and max tries reached. See source for last data update."
                    .format(str(datetime.date.today())))
            success = False
        else:
            # keep only the columns that are uploaded to Carto
            df = pd.read_csv(f,
                             header=0,
                             usecols=[
                                 'Lat_DNB', 'Lon_DNB', 'Date_Mscan',
                                 'Date_LTZ', 'QF_Detect', 'EEZ', 'Land_Mask'
                             ])
            # drop fill-value detections
            df = df.drop(df[df.QF_Detect == 999999].index)
            df['the_geom'] = df.apply(
                lambda row: getGeom(row['Lon_DNB'], row['Lat_DNB']), axis=1)

            df = df[[
                'the_geom', 'QF_Detect', 'Date_Mscan', 'Date_LTZ', 'Land_Mask',
                'Lon_DNB', 'Lat_DNB', 'EEZ'
            ]]
            # FIX: previously rows were only inserted when the table already
            # existed (the create branch uploaded nothing, and the existing-
            # table branch re-created the table right after clearing it)
            if not cartosql.tableExists(CARTO_TABLE):
                logging.info('Table {} does not exist'.format(CARTO_TABLE))
                cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)
            else:
                # clear out the previous upload before inserting fresh data
                cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL')

            rows = df.values.tolist()
            logging.info('Success!')
            if len(rows):
                cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                         CARTO_SCHEMA.values(), rows)
            success = True
Ejemplo n.º 5
0
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds to Carto table
    Output: Number of new rows added
    """
    new_rows = []

    res = requests.get(SOURCE_URL)
    csv_reader = csv.reader(res.iter_lines(decode_unicode=True))
    headers = next(csv_reader, None)
    # map column name -> column index
    idx = {k: v for v, k in enumerate(headers)}

    for row in csv_reader:
        # stop at the first empty row
        if not len(row):
            break
        else:
            if row[idx['id']] not in existing_ids:
                new_row = []
                for field in CARTO_SCHEMA:
                    if field == 'uid':
                        new_row.append(row[idx['id']])
                    elif field == 'the_geom':
                        # FIX: float() was called before checking for '',
                        # so a blank coordinate raised ValueError, and a
                        # legitimate 0.0 coordinate was treated as missing;
                        # validate the raw strings instead
                        lon = row[idx['lon']]
                        lat = row[idx['lat']]
                        if lat != '' and lon != '':
                            geometry = {
                                'type': 'Point',
                                'coordinates': [float(lon), float(lat)]
                            }
                            new_row.append(geometry)
                        else:
                            logging.debug(
                                'No lat long available for this data point - skipping!'
                            )
                            new_row.append(None)
                    else:
                        # To fix trouble w/ cartosql not being able to handle '' for numeric:
                        val = row[
                            idx[field]] if row[idx[field]] != '' else None
                        new_row.append(val)

                new_rows.append(new_row)

    num_new = len(new_rows)
    if num_new:
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_rows)

    return num_new
Ejemplo n.º 6
0
def processData(SOURCE_URL, filename, existing_ids):
    """
    Inputs: FTP SOURCE_URL and filename where data is stored, existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds to Carto table
    Output: Number of new rows added
    """
    num_new = 0

    # column layout of each data line: year, month, day, value
    # (FIX: hoisted loop-invariant index out of the loop and removed the
    # unused 'dttm_elems' local)
    VALUE_INDEX = 3

    ### Specific to each page/chunk in data processing

    res_rows = tryRetrieveData(SOURCE_URL, filename, TIMEOUT, ENCODING)
    new_data = {}
    for row in res_rows:
        # header lines start with "HDR"; only data lines are parsed
        if not (row.startswith("HDR")):
            row = row.split()
            if len(row) == len(CARTO_SCHEMA):
                logging.debug("Processing row: {}".format(row))
                # Pull data available in each line
                value = row[VALUE_INDEX]

                # first three fields are the observation date
                date = datetime(year=int(row[0]),
                                month=int(row[1]),
                                day=int(row[2])).strftime("%Y-%m-%d")

                UID = genUID('value_type', date)
                values = [UID, date, value, "value_type"]

                new_data = insertIfNew(UID, values, existing_ids, new_data)
            else:
                logging.debug("Skipping row: {}".format(row))

    if len(new_data):
        num_new += len(new_data)
        new_data = list(new_data.values())
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data)

    ### End page/chunk processing

    return (num_new)
Ejemplo n.º 7
0
def processData(url, filename, existing_ids):
    '''
    Fetch, process and upload new data
    INPUT   url: url where you can find the download link for the source data (string)
            filename: filename for source data (string)
            existing_ids: list of date IDs that we already have in our Carto table (list of strings)
    RETURN  num_new: number of rows of new data sent to Carto table (integer)
    '''
    # fetch the source file as a list of text lines
    res_rows = tryRetrieveData(url, filename)
    # queue of rows not yet in the Carto table, keyed by date
    queue = {}
    for line in res_rows:
        # header lines start with "HDR"; only data lines are parsed
        if line.startswith("HDR"):
            continue
        # split on whitespace to get the individual fields
        parts = line.split()
        # only rows matching the Carto schema width are usable
        if len(parts) != len(CARTO_SCHEMA):
            logging.debug("Skipping row: {}".format(parts))
            continue
        logging.debug("Processing row: {}".format(parts))
        # third field holds a decimal date; convert it in place
        parsed_date = decimalToDatetime(parts[2])
        parts[2] = parsed_date
        # queue the row only if its date is not already in the table
        queue = insertIfNew(parsed_date, parts, existing_ids, queue)

    num_new = len(queue)
    if num_new:
        # push the queued rows to the Carto table
        cartosql.blockInsertRows(CARTO_TABLE,
                                 CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(),
                                 list(queue.values()),
                                 user=CARTO_USER,
                                 key=CARTO_KEY)

    return (num_new)
Ejemplo n.º 8
0
def processData(existing_ids):
    """
    Inputs: FTP SOURCE_URL and filename where data is stored, existing_ids not to duplicate
    Actions: Retrieves data, dedupes and formats it, and adds to Carto table
    Output: Number of new rows added
    """
    collected_rows = []
    collected_ids = []

    now = datetime.today()

    if PROCESS_HISTORY:
        # walk forward from MAX_AGE to today in 31-day windows
        window_start = MAX_AGE
        while window_start < now:
            window_end = window_start + timedelta(days=31)
            logging.info('Fetching data between {} and {}'.format(
                window_start, window_end))
            collected_rows, collected_ids = appendTimeFrame(
                existing_ids, window_start, window_end, collected_rows,
                collected_ids)
            window_start = window_end
    else:
        # empty bounds use the endpoint's defaults (last 30 days)
        logging.info('Fetching data for last 30 days')
        collected_rows, collected_ids = appendTimeFrame(
            existing_ids, '', '', collected_rows, collected_ids)

    num_new = len(collected_ids)
    if num_new:
        logging.info('Adding {} new records'.format(num_new))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), collected_rows)

    return (num_new)
Ejemplo n.º 9
0
            # add the WB Indicator Code column name and type for this value to the Carto Schema
            CARTO_SCHEMA.update({'indicator_code' + str(i + 1): 'text'})
        # add the RW country name and country code columns to the table
        CARTO_SCHEMA.update({"rw_country_name": 'text'})
        CARTO_SCHEMA.update({"rw_country_code": 'text'})

        cartosql.deleteRows(table_name,
                            'cartodb_id IS NOT NULL',
                            user=CARTO_USER,
                            key=CARTO_KEY)

        # Insert new observations
        if len(all_world_bank_data):
            cartosql.blockInsertRows(table_name,
                                     CARTO_SCHEMA.keys(),
                                     CARTO_SCHEMA.values(),
                                     all_world_bank_data.values.tolist(),
                                     user=CARTO_USER,
                                     key=CARTO_KEY)
            logging.info('Success! New rows have been added to Carto.')
        else:
            logging.info('No rows to add to Carto.')
    '''
    Upload original data and processed data to Amazon S3 storage
    '''
    logging.info('Uploading original data to S3.')
    # Copy the raw data into a zipped file to upload to S3
    raw_data_dir = os.path.join(data_dir, dataset_name + '.zip')
    with ZipFile(raw_data_dir, 'w') as zip:
        for raw_data_file in raw_data_files:
            zip.write(raw_data_file, os.path.basename(raw_data_file))
Ejemplo n.º 10
0
# UPLOAD
# specify column names and types
CARTO_SCHEMA = {
    'iso3': 'text',
    'country': 'text',
    'year': 'numeric',
    'vulnerability': 'numeric',
    'readiness': 'numeric',
    'gain': 'numeric'
}

# Carto credentials come from the environment
carto_user = os.getenv('CARTO_WRI_RW_USER')
carto_key = os.getenv('CARTO_WRI_RW_KEY')

# only create and populate the table if it does not exist yet
if cartosql.tableExists(CARTO_TABLE, user=carto_user, key=carto_key):
    print('This table already exists. Please change the name and try again.')
else:
    # create table with appropriate columns
    cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA,
                         user=carto_user, key=carto_key)
    # send processed data to table
    cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                             CARTO_SCHEMA.values(), final_df.values.tolist(),
                             user=carto_user, key=carto_key)
Ejemplo n.º 11
0
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves current plus 10 years of historical data, dedupes and
             formats it, and adds to Carto table
    Output: Number of new rows added
    """
    num_new = 0

    year = datetime.today().year
    logging.info("Fetching data for {}".format(year))
    headers, rows = fetchAndFormatData(year)
    logging.info("Num rows: {}".format(len(rows)))

    # how many years of history to backfill
    year_history = 10

    count = 0
    while count < year_history:
        year -= 1
        logging.info("Fetching data for {}".format(year))
        try:
            more_headers, more_rows = fetchAndFormatData(year)
            # Check that headers for historical data match the newest data
            logging.info('More headers: {}'.format(more_headers))
            # FIX: was an assert, which is stripped under python -O;
            # raise explicitly so the mismatch is always caught below
            if headers != more_headers:
                raise ValueError(
                    'headers for year {} do not match'.format(year))
            rows.extend(more_rows)
            logging.info('Fetched additional data for year {}'.format(year))
        # FIX: narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit
        except Exception:
            logging.warning('Couldn\'t fetch data for year {}'.format(year))
        logging.info("Num rows: {}".format(len(rows)))
        count += 1

    new_rows = []
    for _row in rows:
        row = structure_row(headers, _row)
        if str(row['Web ID']) not in existing_ids:
            uid = row['Web ID']
            logging.debug('Row: {}'.format(row))
            # coordinates arrive as a "lat, lon" string
            lat, lon = [float(loc.strip()) for loc in row['Location Coordinates'].split(',')]

            # GeoJSON point uses [lon, lat] ordering
            geometry = {
                 'type':'Point',
                 'coordinates':[lon, lat]
                 }

            new_row = []
            for field in CARTO_SCHEMA:
                if field == UID_FIELD:
                    new_row.append(uid)
                elif field == 'the_geom':
                    new_row.append(geometry)
                else:
                    # Carto columns use underscores where headers have spaces
                    new_row.append(row[field.replace('_', ' ')])

            new_row = clean_row(new_row)
            new_rows.append(new_row)

    if len(new_rows):
        num_new = len(new_rows)
        logging.debug("15 rows from middle of new_rows: {}".format(new_rows[1000:1015]))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(), new_rows)

    return(num_new)
Ejemplo n.º 12
0
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves volcano-report RSS data, dedupes and formats it, and
             adds to Carto table
    Output: Number of new rows added
    """
    new_data = []
    new_ids = []
    # FIX: the membership test was 'existing_ids + new_ids', which rebuilt a
    # list on every iteration (accidental O(n^2)); a set gives O(1) lookups
    seen_ids = set(existing_ids)

    res = req.get(SOURCE_URL)
    xml = lxml.etree.fromstring(res.content)
    json = xml2json.data(xml)
    items = json['channel']['item']

    for item in items:
        # titles look like "Volcano Name (Country)"
        title = item['title'].split(')')[0].split('(')
        place_info = [place.strip() for place in title]
        volcano_name = place_info[0]
        country_name = place_info[1]

        # georss point is "lat lon" separated by a space
        coords = item['{http://www.georss.org/georss}point'].split(' ')
        dt = parser.parse(item['pubDate'],
                          fuzzy=True).strftime(DATETIME_FORMAT)

        lat = coords[0]
        lon = coords[1]
        geom = {'type': 'Point', 'coordinates': [lon, lat]}

        # description holds the report text followed by its source(s)
        info = item['description'].split('Source:')
        if len(info) < 2:
            info = item['description'].split('Sources:')

        description_text = [
            text.replace('<p>', '').replace('</p>', '') for text in info
        ]
        description = description_text[0]
        sources = description_text[1]

        _uid = genUID(lat, lon, dt)
        if _uid not in seen_ids:
            seen_ids.add(_uid)
            new_ids.append(_uid)
            row = []
            for field in CARTO_SCHEMA:
                if field == 'uid':
                    row.append(_uid)
                elif field == 'the_geom':
                    row.append(geom)
                elif field == 'pubdate':
                    row.append(dt)
                elif field == 'description':
                    row.append(description)
                elif field == 'sources':
                    row.append(sources)
                elif field == 'volcano_name':
                    row.append(volcano_name)
                elif field == 'country_name':
                    row.append(country_name)

            new_data.append(row)

    num_new = len(new_ids)
    if num_new:
        logging.info('Adding {} new records'.format(num_new))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data)

    return (num_new)
Ejemplo n.º 13
0
def processData(existing_ids):
    """
    Inputs: existing_ids not to duplicate
    Actions: Retrieves CSV data, repairs rows broken across lines, dedupes and
             formats them, and adds to Carto table
    Output: Number of new rows added
    """
    new_rows = []

    res = requests.get(SOURCE_URL)
    csv_reader = csv.reader(res.iter_lines(decode_unicode=True))
    headers = next(csv_reader, None)
    # map column name -> column index
    idx = {k: v for v, k in enumerate(headers)}

    for row in csv_reader:
        #skip empty rows
        if not len(row):
            continue
        else:
            # This data set has some entries with breaks in the last column, which the csv_reader interprets
            # as an individual row. See if new id can be converted to an integer. If it can, it is probably a
            # new row.
            try:
                int(row[idx['id']])
                id = row[idx['id']]
                if id not in existing_ids:
                    logging.info('new row for {}'.format(id))
                    new_row = []
                    for field in CARTO_SCHEMA:
                        if field == 'uid':
                            new_row.append(row[idx['id']])
                        elif field == 'the_geom':
                            # Check for whether valid lat lon provided, will fail if either are ''
                            lon = row[idx['lon']]
                            lat = row[idx['lat']]
                            if lat and lon:
                                geometry = {
                                    'type': 'Point',
                                    'coordinates': [float(lon),
                                                    float(lat)]
                                }
                                new_row.append(geometry)
                            else:
                                logging.debug(
                                    'No lat long available for this data point - skipping!'
                                )
                                new_row.append(None)
                        else:
                            # To fix trouble w/ cartosql not being able to handle '' for numeric:
                            try:
                                val = row[idx[field]] if row[
                                    idx[field]] != '' else None
                                new_row.append(val)
                            except IndexError:
                                pass
                    new_rows.append(new_row)
            #If we can't convert to an integer, the last row probably got cut off.
            except ValueError:
                # NOTE(review): 'id' here carries over from the previous loop
                # iteration; if the very first data row is malformed this
                # raises NameError — confirm the source always starts clean
                #  Using the id from the last entry, if this id was already in the Carto table, we will skip it
                if id in existing_ids:
                    pass
                # If it is a new id, we need to go fix that row.
                else:
                    # If the row is only one item, append the rest of the information to the last description.
                    if len(row) == 1:
                        new_rows[-1][
                            -1] = new_rows[-1][-1] + ' ' + row[0].replace(
                                '\t', '')
                    # If several things are in the row, the break was probably mid-row.
                    elif len(row) > 1 and len(row) < 17:
                        # finish the last desciption
                        new_rows[-1][
                            -1] = new_rows[-1][-1] + ' ' + row[0].replace(
                                '\t', '')
                        # append other items to row; new_row aliases
                        # new_rows[-1], so appending updates the stored row
                        # in place
                        new_row = new_rows[-1]
                        offset_factor = len(new_rows[-1]) - 1
                        for field in CARTO_SCHEMA:
                            if field == 'uid' or field == 'the_geom':
                                continue
                            try:
                                loc = idx[field] - offset_factor
                                if loc > 0:
                                    val = row[loc] if row[loc] != '' else None
                                    new_row.append(val)
                            except IndexError:
                                pass
                        # FIX: removed the no-op 'new_rows[-1] == new_row'
                        # comparison (a typo for assignment); the alias above
                        # makes an assignment unnecessary anyway

    num_new = len(new_rows)
    if num_new:
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_rows)

    return num_new
Ejemplo n.º 14
0
def processData(existing_ids):
    '''
    Fetch, process, upload, and clean new data
    INPUT   existing_ids: list of WDPA IDs that we already have in our Carto table  (list of strings)
    RETURN  num_new: number of rows of data sent to Carto table (integer)
    '''
    # FIX: initialize num_new so the final return cannot hit an unbound local
    # when fewer than 1000 rows are processed
    num_new = 0
    # turn list of existing ids from strings into integers
    existing_ids_int = [int(i) for i in existing_ids]
    # fetch list of WDPA IDs (list of all IDs and list of new ones) so that we can pull info from the API about each area
    new_ids, all_ids = fetch_ids(existing_ids_int)
    # if we have designated that we want to replace all the ids, then the list of IDs we will query (id_list) will
    # include all the IDs available; otherwise, we will just pull the new IDs
    if REPLACE_ALL==True:
        id_list = all_ids
    else:
        id_list = new_ids
    # create empty list to store IDs for rows we want to send to Carto so that we can delete any current entries before
    # sending new data
    send_list=[]
    # create empty lists to store data we will be sending to Carto table
    new_data = []
    # go through and fetch information for each of the ids
    for id in id_list:
        # generate the url to pull data for this area from the WDPA API
        # WDPA API Reference document: https://api.protectedplanet.net/documentation#get-v3protectedareas
        url = "https://api.protectedplanet.net/v3/protected_areas/{}?token={}".format(id, os.getenv('WDPA_key'))
        # FIX: the original 'if try_num < 3' made only a single attempt (it
        # was not a loop) and could leave 'r' unbound; retry up to 3 times,
        # waiting 60 seconds between failed attempts
        r = None
        for try_num in range(3):
            try:
                r = requests.get(url)
                break
            except Exception:
                # if the API call fails, wait 60 seconds before the next attempt
                time.sleep(60)
        if r is None:
            # after 3 failures to fetch data for this ID, log that the data could not be fetched
            logging.info(f'Could not fetch {id}')
            continue

        # process the retrieved data
        try:
            # pull data from request response json
            data = r.json()['protected_area']
            # create an empty list to store the processed data for this row that we will send to Carto
            row = []
            # go through each column in the Carto table
            for key in CARTO_SCHEMA.keys():
                # find the location in the json where you can find this column's data
                location = JSON_LOC[key]
                # make a copy of the data that we can modify
                key_data = copy.copy(data)
                # if we are fetching data for the country_name column and there is more than one country,
                # we will need to process this entry
                if key == 'country_name' and len(key_data['countries']) > 1:
                    # get the list of countries
                    countries = key_data["countries"]
                    # make a list of the country names
                    c_list=[]
                    for country in countries:
                        c_list.append(country["name"])
                    # turn this list into a single string with the countries names listed, separated by a semicolon
                    key_data = '; '.join(c_list)
                # we will also need to process the iso3 data if there is more than one country
                elif key == 'iso3' and len(key_data['countries']) > 1:
                    # get the list of countries
                    countries= key_data["countries"]
                    # make a list of the country iso3 values
                    c_list=[]
                    for country in countries:
                        c_list.append(country["iso_3"])
                    # turn this list into a single string with the countries iso3s listed, separated by a semicolon
                    key_data = '; '.join(c_list)
                # for any other column, no special processing is required at this point, just pull out the data from
                # the correct location in the json
                else:
                    # go through each nested name
                    for sub in location:
                        # try to pull out the data from that name
                        try:
                            key_data = key_data[sub]
                            # if the data is a string, remove and leading or tailing whitespace
                            if type(key_data)==str:
                                key_data = key_data.rstrip()
                        # if we aren't able to find the data for this column, set the data as a None value and move
                        # on to the next column
                        except (TypeError, IndexError):
                            key_data=None
                            break
                # if we were able to successfully find the value for the column, do any additional required processing
                if key_data:
                    # pull the year from the data from the 'legal status updated at' field
                    if key == 'status_yr':
                        key_data=int(key_data[-4:])
                    # turn the wdpa_id into an integer
                    if key == 'wdpa_id':
                        # pull it from the API entry, if possible
                        if key_data:
                            key_data = int(key_data)
                        # otherwise just use the id from the list of ids we are going through (some entries are missing
                        # this field on the API)
                        else:
                            key_data=int(id)
                        # add this ID to the list of IDs we are sending new data for
                        send_list.append(key_data)
                    # turn these columns into float data
                    if key == 'no_tk_area' or key == 'rep_area' or key == 'rep_m_area':
                        key_data=float(key_data)
                    # turn the legal_status_updated_at column into a datetime
                    if key == 'legal_status_updated_at':
                        key_data=datetime.datetime.strptime(key_data, '%m/%d/%Y')
                # if no data was found for this column, make sure the entry is None
                else:
                    key_data=None
                # add this value to the row data
                row.append(key_data)
            # if this ID's row of data was processed, add it to the new data to be sent to Carto
            if len(row):
                new_data.append(row)
        # if we failed to process this data, log an error
        except Exception as e:
            logging.error('error pulling {}'.format(id))

        # send data
        # for every 1000 rows processed, send the data to Carto
        if (id_list.index(id) % 1000)==0 and id_list.index(id)>1:
            logging.info('{} records processed.'.format(id_list.index(id)))
            num_new = len(new_data)
            if num_new:
                # delete the old entries in the Carto table for the IDs we have processed
                logging.info('Deleting old records in this batch')
                delete_carto_entries(send_list, 'wdpa_id')

                # push new data rows to Carto
                logging.info('Adding {} new records.'.format(num_new))
                cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(), new_data, user=CARTO_USER, key=CARTO_KEY)

                # start with empty lists again to process the next batch of data
                new_data = []
                send_list = []

    # delete rows for areas that are no longer in the WDPA dataset
    logging.info('Deleting records that are no longer in the database.')
    # get a list of IDs that are in the Carto table but not in the most recent WDPA dataset
    deleted_ids = np.setdiff1d(existing_ids_int, id_list)
    # delete these rows from the Carto table
    delete_carto_entries(deleted_ids, 'wdpa_id')
    logging.info('{} ids deleted'.format(len(deleted_ids)))
    return(num_new)
Ejemplo n.º 15
0
def processNewData(url):
    '''
    Fetch, process and upload new data
    INPUT   url: url where you can find the download link for the source data (string)
    RETURN  num_new: number of rows of new data sent to Carto table (integer)
    '''
    # begin with the first page of the paginated source API
    page = 1
    response = requests.get(url.format(page=page))
    raw_data = response.json()['data']
    if len(raw_data) > 0:
        # source returned data, so wipe the existing table before re-filling it
        if cartosql.tableExists(CARTO_TABLE, user=CARTO_USER, key=CARTO_KEY):
            # remove every row; all rows have a cartodb_id, so this matches everything
            cartosql.deleteRows(CARTO_TABLE,
                                'cartodb_id IS NOT NULL',
                                user=CARTO_USER,
                                key=CARTO_KEY)
        logging.info('Updating {}'.format(CARTO_TABLE))
    else:
        # nothing came back from the source; the table is left untouched
        logging.error("Source data missing. Table will not update.")

    # accumulate processed rows across all pages here
    new_data = []
    while len(raw_data) > 0:
        logging.info('Processing page {}'.format(page))
        # load this page of records into a dataframe for column access
        df = pd.DataFrame(raw_data)
        for _, record in df.iterrows():
            # build one output row ordered by the Carto schema:
            # the 'uid' column comes from UID_FIELD; every other column is
            # taken from the source record, with '' mapped to None
            new_row = []
            for field in CARTO_SCHEMA:
                if field == 'uid':
                    new_row.append(record[UID_FIELD])
                else:
                    new_row.append(record[field] if record[field] != '' else None)
            new_data.append(new_row)
        # advance to the next page and fetch it; an empty page ends the loop
        page += 1
        response = requests.get(url.format(page=page))
        raw_data = response.json()['data']

    # count of rows gathered across every page
    num_new = len(new_data)
    if num_new:
        # push all accumulated rows to Carto in one block insert
        cartosql.blockInsertRows(CARTO_TABLE,
                                 CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(),
                                 new_data,
                                 user=CARTO_USER,
                                 key=CARTO_KEY)

    return num_new
Ejemplo n.º 16
0
def processNewData(existing_ids, existing_files):
    '''
    Fetch source files not yet processed, extract their rows, and upload any
    new records to the Carto table.
    INPUT   existing_ids: unique event IDs already present in the Carto table
            existing_files: file IDs that have already been processed
    RETURN  total_new: number of rows of new data sent to the Carto table (integer)
    '''
    # list every file currently offered by the source and derive its ID
    file_list = list_available_files(SOURCE_URL)
    file_ids = [get_file_id(file) for file in file_list]
    file_base = 'https://dataverse.harvard.edu/api/access/datafile/'
    # keep only files we have not processed before
    new_file_urls = []
    new_ids = []
    for file_id in file_ids:
        if file_id not in existing_files:
            new_file_urls.append(file_base + file_id)
            new_ids.append(file_id)
    logging.info('Number of new files: {}'.format(len(new_ids)))
    # set gives O(1) membership tests inside the per-row loop below
    existing_id_set = set(existing_ids)
    total_new = 0
    for file_id, file_url in zip(new_ids, new_file_urls):
        # skip files known to be malformed
        if file_id in BAD_FILES:
            continue
        logging.info('Processing file {}'.format(file_id))
        new_rows = []
        # each file is a zip containing a single tab-separated table
        res = urlopen(file_url)
        zipfile = ZipFile(BytesIO(res.read()))
        df = pd.read_csv(zipfile.open(zipfile.namelist()[0]), sep='\t')
        # tag every row with the file it came from
        df['File ID'] = file_id

        for row_num in range(df.shape[0]):
            row = df.iloc[row_num]
            if not len(row):
                break
            # rows without coordinates cannot be mapped; skip them
            elif pd.isna(row['Longitude']) or pd.isna(row['Latitude']):
                continue
            elif row['Event ID'] not in existing_id_set:
                new_row = []
                for field in CARTO_SCHEMA:
                    # both the table's uid and its Event_ID column hold the
                    # source's event identifier (duplicate branch removed)
                    if field == 'uid' or field == 'Event_ID':
                        new_row.append(str(row['Event ID']))
                    elif field == 'the_geom':
                        # Check for whether valid lat lon provided, will fail if either are ''
                        lon = float(row['Longitude'])
                        lat = float(row['Latitude'])
                        geometry = {
                            'type': 'Point',
                            'coordinates': [lon, lat]
                        }
                        new_row.append(geometry)
                    else:
                        # To fix trouble w/ cartosql not being able to handle '' for numeric:
                        # Carto column names use '_' where source headers use ' '
                        val = row[field.replace('_', ' ')]
                        if val == '' or (type(val) == float
                                         and np.isnan(val)):
                            val = None
                        new_row.append(val)
                new_rows.append(new_row)
        num_new = len(new_rows)

        if num_new:
            # push this file's new rows to Carto in one block insert
            cartosql.blockInsertRows(CARTO_TABLE,
                                     CARTO_SCHEMA.keys(),
                                     CARTO_SCHEMA.values(),
                                     new_rows,
                                     user=os.getenv('CARTO_USER'),
                                     key=os.getenv('CARTO_KEY'))
            total_new += num_new

    return total_new
Ejemplo n.º 17
0
def processInteractions():
    '''
    Build one summary "interaction" row per country that currently has an
    event, and replace the contents of the interaction table with them.
    RETURN  None
    '''
    # SQL used both for the initial fetch and for retries
    current_query = "SELECT * FROM {} WHERE current='True'".format(CARTO_TABLE)
    r = cartosql.get(
        current_query,
        user=os.getenv('CARTO_USER'),
        key=os.getenv('CARTO_KEY'))
    interaction_data = r.json()['rows']
    try_num = 0
    # if we didn't get data back, wait a few minutes and try again
    while not len(interaction_data):
        logging.info('Sleeping and trying again.')
        try_num += 1
        time.sleep(300)
        # BUG FIX: re-issue the query — re-reading the same response object
        # could never return different data, so the retries were useless
        r = cartosql.get(
            current_query,
            user=os.getenv('CARTO_USER'),
            key=os.getenv('CARTO_KEY'))
        interaction_data = r.json()['rows']
        if try_num > 5:
            logging.error('Problem fetching data to generate interactions')
            exit()

    # collect the distinct countries that have at least one current event
    countries_with_interaction = []
    for interaction in interaction_data:
        ctry = interaction['country_iso3']
        if ctry not in countries_with_interaction:
            countries_with_interaction.append(ctry)
    # clear out the interaction table before rebuilding it
    if cartosql.tableExists(CARTO_TABLE_INTERACTION,
                            user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY')):
        cartosql.deleteRows(CARTO_TABLE_INTERACTION,
                            'cartodb_id IS NOT NULL',
                            user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY'))
    new_interactions = []
    for ctry in countries_with_interaction:
        r = cartosql.get(
            "SELECT * FROM {} WHERE current='True' AND country_iso3='{}'".
            format(CARTO_TABLE, ctry),
            user=os.getenv('CARTO_USER'),
            key=os.getenv('CARTO_KEY'))
        ctry_interaction_data = r.json()['rows']
        # join every event for this country into one '; '-separated string of
        # "<event name> (<url>)" entries; event names of the form
        # "Country: Event" are trimmed to just the event part
        event_num = 1
        for interaction in ctry_interaction_data:
            event = interaction['event_name'].split(": ", 1)
            if event_num == 1:
                if len(event) == 1:
                    interaction_str = '{} ({})'.format(event[0],
                                                       interaction['url'])
                else:
                    interaction_str = '{} ({})'.format(event[1],
                                                       interaction['url'])
            else:
                if len(event) == 1:
                    interaction_str = interaction_str + '; ' + '{} ({})'.format(
                        event[0], interaction['url'])
                else:
                    interaction_str = interaction_str + '; ' + '{} ({})'.format(
                        event[1], interaction['url'])
            event_num += 1
        if ctry_interaction_data:
            # build the country's row: geometry and metadata come from the
            # country's first event; missing keys become None
            row = []
            for key in CARTO_SCHEMA_INTERACTION.keys():
                try:
                    if key == 'the_geom':
                        lon = ctry_interaction_data[0]['lon']
                        lat = ctry_interaction_data[0]['lat']
                        item = {'type': 'Point', 'coordinates': [lon, lat]}
                    elif key == 'interaction':
                        item = interaction_str
                    else:
                        item = ctry_interaction_data[0][key]
                except KeyError:
                    item = None
                row.append(item)
            new_interactions.append(row)
    logging.info('Adding {} new interactions'.format(len(new_interactions)))
    cartosql.blockInsertRows(CARTO_TABLE_INTERACTION,
                             CARTO_SCHEMA_INTERACTION.keys(),
                             CARTO_SCHEMA_INTERACTION.values(),
                             new_interactions,
                             user=os.getenv('CARTO_USER'),
                             key=os.getenv('CARTO_KEY'))
Ejemplo n.º 18
0
def processData(existing_ids):
    '''
    Fetch the source JSON, build one row per (event, country) pair that is
    not already in the Carto table, and insert the new rows.
    INPUT   existing_ids: unique IDs already present in the Carto table (list of strings)
    RETURN  num_new: number of new rows sent to the Carto table (integer)
    '''
    new_data = []
    new_ids = []
    # set of every known ID; O(1) duplicate checks instead of rebuilding
    # existing_ids + new_ids (a fresh list) on every iteration
    seen_ids = set(existing_ids)

    r = requests.get(SOURCE_URL)
    # decode explicitly as UTF-8 before parsing the JSON payload
    data_bytes = r.content
    decoded = data_bytes.decode('utf8')
    json_data = json.loads(decoded)
    data_dict = json_data['data']

    for entry in data_dict:
        event_id = entry['id']
        # flatten the event's type IDs/names into comma-separated strings
        ids = []
        names = []
        for t in entry['fields']['type']:
            ids.append(t['id'])
            names.append(t['name'])
        ids = ', '.join(map(str, ids))
        names = ', '.join(map(str, names))
        # one row is produced per country affected by the event
        for country in entry['fields']['country']:
            country_id = country['id']
            uid = gen_uid(event_id, country_id)
            if uid not in seen_ids:
                seen_ids.add(uid)
                new_ids.append(uid)
                row = []
                for key in CARTO_SCHEMA.keys():
                    try:
                        if key == 'the_geom':
                            lon = country['location']['lon']
                            lat = country['location']['lat']
                            item = {'type': 'Point', 'coordinates': [lon, lat]}
                        elif key == 'uid':
                            item = uid
                        elif key == 'event_id':
                            item = int(event_id)
                        elif key == 'event_name':
                            item = entry['fields']['name']
                        elif key == 'description':
                            item = entry['fields']['description']
                        elif key == 'status':
                            item = entry['fields']['status']
                        elif key == 'date':
                            item = datetime.datetime.strptime(
                                entry['fields']['date']['created'],
                                DATETIME_FORMAT)
                        elif key == 'glide':
                            item = entry['fields']['glide']
                        elif key == 'related_glide':
                            item = ', '.join(
                                map(str, entry['fields']['related_glide']))
                        elif key == 'featured':
                            item = str(entry['fields']['featured'])
                        elif key == 'primary_country':
                            item = entry['fields']['primary_country']['iso3']
                        elif key == 'country_name':
                            item = country['name']
                        elif key == 'country_shortname':
                            item = country['shortname']
                        elif key == 'country_iso3':
                            item = country['iso3']
                        elif key == 'current':
                            item = str(entry['fields']['current'])
                        elif key == 'event_type_ids':
                            item = ids
                        elif key == 'event_types':
                            item = names
                        elif key == 'url':
                            item = entry['fields']['url']
                        elif key == 'lon':
                            item = country['location']['lon']
                        elif key == 'lat':
                            item = country['location']['lat']
                        else:
                            # BUG FIX: an unmatched schema key previously left
                            # 'item' holding the previous column's value
                            item = None
                    except KeyError:
                        # source record lacks this field; store NULL
                        item = None
                    row.append(item)
                new_data.append(row)

    num_new = len(new_ids)
    if num_new:
        logging.info('Adding {} new records'.format(num_new))
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data)
    return (num_new)