Ejemplo n.º 1
0
def main():
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_TABLE):
            cartosql.dropTable(CARTO_TABLE)

    ### 1. Check if table exists, if not, create it
    checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)

    ### 2. Retrieve existing data
    r = cartosql.getFields(UID_FIELD, CARTO_TABLE,
                           order='{} desc'.format(TIME_FIELD), f='csv')
    existing_ids = r.text.split('\r\n')[1:-1]
    num_existing = len(existing_ids)

    ### 3. Fetch data from FTP, dedupe, process
    num_new = processData(existing_ids)

    ### 4. Delete data to get back to MAX_ROWS
    num_dropped = deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD, MAX_AGE)

    ### 5. Notify results
    total = num_existing + num_new - num_dropped

    # Get most recent update date
    most_recent_date = get_most_recent_date(CARTO_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)
    
    logging.info('Existing rows: {},  New rows: {}, Max: {}'.format(total, num_new, MAX_ROWS))
    logging.info("SUCCESS")
Ejemplo n.º 2
0
def checkCreateTable(table, schema, id_field, time_field=''):
    '''
    Create the table if it does not exist, and pull list of IDs already in the table if it does
    INPUT   table: Carto table to check or create (string)
            schema: dictionary of column names and types, used if we are creating the table for the first time (dictionary)
            id_field: name of column that we want to use as a unique ID for this table; this will be used to compare the
                    source data to the our table each time we run the script so that we only have to pull data we
                    haven't previously uploaded (string)
            time_field:  optional, name of column that will store datetime information (string)
    RETURN  list of existing IDs in the table, pulled from the id_field column (list of strings)
    '''
    # check it the table already exists in Carto
    if cartosql.tableExists(table, user=CARTO_USER, key=CARTO_KEY):
        # if the table does exist, get a list of all the values in the id_field column
        logging.info('Fetching existing IDs')
        r = cartosql.getFields(id_field, table, f='csv', post=True, user=CARTO_USER, key=CARTO_KEY)
        # turn the response into a list of strings, removing the first and last entries (header and an empty space at end)
        return r.text.split('\r\n')[1:-1]
    else:
        # if the table does not exist, create it with columns based on the schema input
        logging.info('Table {} does not exist, creating'.format(table))
        cartosql.createTable(table, schema, user=CARTO_USER, key=CARTO_KEY)
        # if a unique ID field is specified, set it as a unique index in the Carto table; when you upload data, Carto
        # will ensure no two rows have the same entry in this column and return an error if you try to upload a row with
        # a duplicate unique ID
        if id_field:
            cartosql.createIndex(table, id_field, unique=True, user=CARTO_USER, key=CARTO_KEY)
        # if a time_field is specified, set it as an index in the Carto table; this is not a unique index
        if time_field:
            cartosql.createIndex(table, time_field, user=CARTO_USER, key=CARTO_KEY)
        # return an empty list because there are no IDs in the new table yet
        return []
Ejemplo n.º 3
0
def deleteExcessRows(table, max_rows, time_field):
    ''' 
    Delete rows to bring count down to max_rows
    INPUT   table: name of table in Carto from which we will delete excess rows (string)
            max_rows: maximum rows that can be stored in the Carto table (integer)
            time_field: column that stores datetime information (string) 
    RETURN  num_dropped: number of rows that have been dropped from the table (integer)
    '''
    # initialize number of rows that will be dropped as 0
    num_dropped = 0
    # get cartodb_ids from carto table sorted by date (new->old)
    r = cartosql.getFields('cartodb_id',
                           table,
                           order='{} desc'.format(time_field),
                           f='csv',
                           user=CARTO_USER,
                           key=CARTO_KEY)
    # turn response into a list of strings of the ids
    ids = r.text.split('\r\n')[1:-1]

    # if number of rows is greater than max_rows, delete excess rows
    if len(ids) > max_rows:
        r = cartosql.deleteRowsByIDs(table, ids[max_rows:], CARTO_USER,
                                     CARTO_KEY)
        # get the number of rows that have been dropped from the table
        num_dropped += r.json()['total_rows']
    if num_dropped:
        logging.info('Dropped {} old rows from {}'.format(num_dropped, table))

    return (num_dropped)
Ejemplo n.º 4
0
def main():
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

    ### 1. Authenticate to Carto
    CARTO_USER = os.environ.get('CARTO_USER')
    CARTO_KEY = os.environ.get('CARTO_KEY')


    cc = cartoframes.CartoContext(base_url='https://{}.carto.com/'.format(CARTO_USER),
                                  api_key=CARTO_KEY)
    #check size of old table
    r = cartosql.getFields('datetime', CARTO_TABLE, f='csv')
    existing_ids = r.text.split('\r\n')[1:-1]
    num_existing = len(existing_ids)

    ### 2. Fetch data from FTP, dedupe, process
    df = processData()

    num_rows = df.shape[0]
    cc.write(df, CARTO_TABLE, overwrite=True, privacy='public')

    # Get most recent update date
    if num_rows > num_existing:
        most_recent_date = datetime.datetime.utcnow()
        lastUpdateDate(DATASET_ID, most_recent_date)

    ### 3. Notify results
    logging.info('Existing rows: {}'.format(num_rows))
    logging.info("SUCCESS")
Ejemplo n.º 5
0
def checkCreateTable(table, schema, id_field, time_field):
    '''Get existing ids or create table'''
    if cartosql.tableExists(table,
                            user=os.getenv('CARTO_USER'),
                            key=os.getenv('CARTO_KEY')):
        logging.info('Fetching existing IDs')
        r = cartosql.getFields(id_field,
                               table,
                               f='csv',
                               user=os.getenv('CARTO_USER'),
                               key=os.getenv('CARTO_KEY'))
        return r.text.split('\r\n')[1:-1]
    else:
        logging.info('Table {} does not exist, creating'.format(table))
        cartosql.createTable(table,
                             schema,
                             user=os.getenv('CARTO_USER'),
                             key=os.getenv('CARTO_KEY'))
        cartosql.createIndex(table,
                             id_field,
                             unique=True,
                             user=os.getenv('CARTO_USER'),
                             key=os.getenv('CARTO_KEY'))
        cartosql.createIndex(table,
                             time_field,
                             user=os.getenv('CARTO_USER'),
                             key=os.getenv('CARTO_KEY'))
    return []
Ejemplo n.º 6
0
def deleteExcessRows(table, max_rows, time_field, max_age=''):
    '''Delete excess rows by age or count'''
    num_dropped = 0
    if isinstance(max_age, datetime.datetime):
        max_age = max_age.isoformat()

    # 1. delete by age
    if max_age:
        r = cartosql.deleteRows(table, "{} < '{}'".format(time_field, max_age))
        num_dropped = r.json()['total_rows']

    # 2. get sorted ids (old->new)
    r = cartosql.getFields('cartodb_id',
                           table,
                           order='{}'.format(time_field),
                           f='csv')
    ids = r.text.split('\r\n')[1:-1]

    # 3. delete excess
    if len(ids) > max_rows:
        r = cartosql.deleteRowsByIDs(table, ids[:-max_rows])
        num_dropped += r.json()['total_rows']
    if num_dropped:
        logging.info('Dropped {} old rows from {}'.format(num_dropped, table))
    return num_dropped
Ejemplo n.º 7
0
def get_most_recent_date(table):
    #pubdate is the date the report was published
    r = cartosql.getFields('pubdate', table, f='csv', post=True)
    dates = r.text.split('\r\n')[1:-1]
    dates.sort()
    most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S')
    return most_recent_date
Ejemplo n.º 8
0
def get_most_recent_date(table):
    r = cartosql.getFields(TIME_FIELD, table, f='csv', post=True)
    dates = r.text.split('\r\n')[1:-1]
    dates.sort()
    most_recent_date = datetime.datetime.strptime(dates[-1],
                                                  '%Y-%m-%d %H:%M:%S')
    return most_recent_date
Ejemplo n.º 9
0
def main():
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    if CLEAR_TABLE_FIRST:
        cartosql.dropTable(CARTO_TABLE)

    ### 1. Check if table exists, if not, create it
    checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)

    ### 2. Delete old rows
    num_expired = cleanOldRows(CARTO_TABLE, TIME_FIELD, MAX_AGE)

    ### 3. Retrieve existing data
    r = cartosql.getFields(UID_FIELD,
                           CARTO_TABLE,
                           order='{} desc'.format(TIME_FIELD),
                           f='csv')
    existing_ids = r.text.split('\r\n')[1:-1]
    num_existing = len(existing_ids)

    logging.debug("First 10 IDs already in table: {}".format(
        existing_ids[:10]))

    ### 4. Fetch data from FTP, dedupe, process
    filename = fetchDataFileName(SOURCE_URL)
    num_new = processData(SOURCE_URL, filename, existing_ids)

    ### 5. Delete data to get back to MAX_ROWS
    num_deleted = deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD)

    ### 6. Notify results
    logging.info(
        'Expired rows: {}, Previous rows: {},  New rows: {}, Dropped rows: {}, Max: {}'
        .format(num_expired, num_existing, num_new, num_deleted, MAX_ROWS))
    logging.info("SUCCESS")
Ejemplo n.º 10
0
def getIds(table, id_field):
    '''get ids from table'''
    r = cartosql.getFields(id_field,
                           table,
                           f='csv',
                           user=os.getenv('CARTO_USER'),
                           key=os.getenv('CARTO_KEY'))
    return np.unique(r.text.split('\r\n')[1:-1])
Ejemplo n.º 11
0
def get_most_recent_date(table):
    #get only check times for current state (CS) because dates associated with projections are
    #in the future and don't make sense to list as our most recent update date
    r = cartosql.getFields(TIME_FIELD, table, where="ifc_type LIKE 'CS'", f='csv', post=True)
    dates = r.text.split('\r\n')[1:-1]
    dates.sort()
    most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S')
    return most_recent_date
Ejemplo n.º 12
0
def get_most_recent_date(param):
    r = cartosql.getFields(TIME_FIELD, CARTO_TABLES[param], f='csv', post=True)
    dates = r.text.split('\r\n')[1:-1]
    dates.sort()
    most_recent_date = datetime.datetime.strptime(dates[-1],
                                                  '%Y-%m-%d %H:%M:%S')
    now = datetime.datetime.utcnow()
    if most_recent_date > now:
        most_recent_date = now
    return most_recent_date
Ejemplo n.º 13
0
def get_most_recent_date(table):
    r = cartosql.getFields('ended',
                           table,
                           f='csv',
                           post=True,
                           user=os.getenv('CARTO_USER'),
                           key=os.getenv('CARTO_KEY'))
    dates = r.text.split('\r\n')[1:-1]
    dates.sort()
    most_recent_date = datetime.datetime.strptime(dates[-1],
                                                  '%Y-%m-%d %H:%M:%S')
    return most_recent_date
Ejemplo n.º 14
0
def checkCreateTable(table, schema, id_field, time_field=''):
    '''Get existing ids or create table'''
    if cartosql.tableExists(table):
        logging.info('Fetching existing IDs')
        r = cartosql.getFields(id_field, table, f='csv', post=True)
        return r.text.split('\r\n')[1:-1]
    else:
        logging.info('Table {} does not exist, creating'.format(table))
        cartosql.createTable(table, schema)
        cartosql.createIndex(table, id_field, unique=True)
        if time_field:
            cartosql.createIndex(table, time_field)
    return []
Ejemplo n.º 15
0
def get_most_recent_date(table):
    '''
    Find the most recent date of data in the specified Carto table
    INPUT   table: name of table in Carto we want to find the most recent date for (string)
    RETURN  most_recent_date: most recent date of data in the Carto table, found in the TIME_FIELD column of the table (datetime object)
    '''
    # get dates in TIME_FIELD column
    r = cartosql.getFields(TIME_FIELD, table, f='csv', post=True, user=CARTO_USER, key=CARTO_KEY)
    # turn the response into a list of dates
    dates = r.text.split('\r\n')[1:-1]
    # sort the dates from oldest to newest
    dates.sort()
    # turn the last (newest) date into a datetime object
    most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S')
    return most_recent_date
Ejemplo n.º 16
0
def deleteExcessRows(table, max_rows, time_field):
    '''Delete rows to bring count down to max_rows'''
    num_dropped=0
    # 1. get sorted ids (old->new)
    r = cartosql.getFields('cartodb_id', table, order='{} desc'.format(time_field),
                           f='csv')
    ids = r.text.split('\r\n')[1:-1]

    # 2. delete excess
    if len(ids) > max_rows:
        r = cartosql.deleteRowsByIDs(table, ids[max_rows:])
        num_dropped += r.json()['total_rows']
    if num_dropped:
        logging.info('Dropped {} old rows from {}'.format(num_dropped, table))

    return(num_dropped)
Ejemplo n.º 17
0
def processArgs(args):
    opts = {}
    if args['-u']:
        opts['user'] = args['-u']
    if args['-k']:
        opts['key'] = args['-k']
    f = args['-f']
    if f:
        opts['f'] = f
    if args['--help'] or not cartosql.init():
        return __doc__
    if args['<sql>']:
        if args['post']:
            r = cartosql.post(args['<sql>'], **opts)
            return returnFormat(r, f)
        elif args['get']:
            r = cartosql.get(args['<sql>'], **opts)
            return returnFormat(r, f)
    elif args['select']:
        if args['<fields>'] and args['<table>']:
            if args['-w']:
                opts['where'] = args['-w']
            if args['-o']:
                opts['order'] = args['-o']
            if args['-l']:
                opts['limit'] = args['-l']
            r = cartosql.getFields(args['<fields>'], args['<table>'], **opts)
            return returnFormat(r, f)
    elif args['ls']:
        r = cartosql.getTables(**opts)
        if f is None or f == 'csv':
            return prettyJson(r)
        return returnFormat(r, f)
    elif args['exists'] and args['<table>']:
        r = cartosql.tableExists(args['<table>'], **opts)
        return r
    elif args['drop'] and args['<table>']:
        confirm = args['--confirm']
        if not confirm:
            confirm = input('Drop table {}? (y/N)'.format(args['<table>'])) == 'y'
        if confirm:
            r = cartosql.dropTable(args['<table>'], **opts)
            return returnFormat(r, f)
        else:
            print('Pass option --confirm to drop table')
    return __doc__
Ejemplo n.º 18
0
def deleteExcessRows(table, max_rows, age_field):
    '''Delete excess rows by age or count'''
    num_dropped = 0

    # get sorted ids (old->new)
    r = cartosql.getFields('cartodb_id',
                           table,
                           order='{}'.format(age_field.lower()),
                           f='csv')
    ids = r.text.split('\r\n')[1:-1]

    #  delete excess
    if len(ids) > max_rows:
        r = cartosql.deleteRowsByIDs(table, ids[:-max_rows])
        num_dropped += r.json()['total_rows']
    if num_dropped:
        logging.info('Dropped {} old rows from {}'.format(num_dropped, table))
    return num_dropped
Ejemplo n.º 19
0
def processArgs(args):
    opts = {}
    if args['-u']:
        opts['user'] = args['-u']
    if args['-k']:
        opts['key'] = args['-k']
    f = args['-f']
    if f:
        opts['f'] = f
    if args['--help']:
        return __doc__
    elif args['post'] and args['<sql>']:
        r = cartosql.post(args['<sql>'], **opts)
        return returnFormat(r, f)
    elif args['get']:
        if args['<sql>']:
            r = cartosql.get(args['<sql>'], **opts)
            return returnFormat(r, f)
        if args['<fields>'] and args['<table>']:
            if args['-w']:
                opts['where'] = args['-w']
            if args['-o']:
                opts['order'] = args['-o']
            r = cartosql.getFields(args['<fields>'], args['<table>'], **opts)
            return returnFormat(r, f)
    elif args['ls']:
        r = cartosql.getTables(**opts)
        if f is None or f == 'csv':
            return prettyJson(r)
        return returnFormat(r, f)
    elif args['exists'] and args['<table>']:
        r = cartosql.tableExists(args['<table>'], **opts)
        return returnFormat(r, f)
    elif args['drop'] and args['<table>']:
        r = cartosql.dropTable(args['<table>'], **opts)
        return returnFormat(r, f)
    return __doc__
Ejemplo n.º 20
0
def getFieldAsList(field, table, **args):
    ''''''
    return csql.getFields(field, table, f='csv', **args).text.splitlines()[1:]
Ejemplo n.º 21
0
def getIds(table, id_field):
    '''get ids from table'''
    r = cartosql.getFields(id_field, table, f='csv')
    return r.text.split('\r\n')[1:-1]
Ejemplo n.º 22
0
def getFieldAsList(field, table, **args):
    r = cartosql.getFields(field, table, f='csv', **args)
    return r.text.splitlines()[1:]
Ejemplo n.º 23
0
def get_most_recent_date(table):
    r = cartosql.getFields(AGE_FIELD, table, f='csv', post=True)
    dates = r.text.split('\r\n')[1:-1]
    dates.sort()
    most_recent_date = datetime.datetime.strptime(dates[-1], DATETIME_FORMAT)
    return most_recent_date
Ejemplo n.º 24
0
def getFieldAsList(table, field, orderBy=''):
    assert isinstance(field, str), 'Field must be a single string'
    r = cartosql.getFields(field, table, order='{}'.format(orderBy),
                           f='csv')
    return(r.text.split('\r\n')[1:-1])
Ejemplo n.º 25
0
def processInteractions(markets_updated):
    num_new_interactions = 0
    #new_rows = []
    if PROCESS_HISTORY_INTERACTIONS == True:
        # get all markets
        logging.info('Processing interactions for all ALPS data')
        markets_to_process = getIds(CARTO_MARKET_TABLE, 'uid')

    else:
        logging.info('Getting IDs of interactions that should be updated')
        r = cartosql.getFields(
            ['region_id', 'market_id', 'market_name'],
            CARTO_INTERACTION_TABLE,
            where="{} < current_date - interval '{}' month".format(
                INTERACTION_TIME_FIELD, LOOKBACK),
            f='csv',
            user=os.getenv('CARTO_USER'),
            key=os.getenv('CARTO_KEY'))
        old_ids = [market.split(',') for market in r.text.split('\r\n')[1:-1]]
        old_market_uids = [
            genMarketUID(old_id[0], old_id[1], old_id[2]) for old_id in old_ids
        ]

        logging.info(
            'Processing interactions for new ALPS data and re-processing interactions that are out of date'
        )
        markets_to_process = np.unique(markets_updated + old_market_uids)
    #go through each market that was updated and create the correct rows for them
    num_markets = len(markets_to_process)
    market_num = 1
    logging.info('{} markets to update interactions for'.format(
        len(markets_to_process)))
    for m_uid in markets_to_process:
        logging.info('processing {} out of {} markets'.format(
            market_num, len(markets_to_process)))
        new_rows = []
        for food_category, sql_query in CATEGORIES.items():
            try_num = 1
            while try_num <= 3:
                try:
                    #logging.info('Processing interaction for {} at uid {}, try number {} (market {} of {})'.format(food_category, m_uid, try_num, market_num, num_markets))
                    # get information about market
                    r = cartosql.get("SELECT * FROM {} WHERE uid='{}'".format(
                        CARTO_MARKET_TABLE, m_uid),
                                     user=os.getenv('CARTO_USER'),
                                     key=os.getenv('CARTO_KEY'))
                    if r.json()['total_rows'] == 0:
                        #logging.info('No rows for interaction')
                        alps_entries = []
                        break
                    market_entry = r.json()['rows'][0]

                    # get information about food prices at market
                    # SQL gets most recent entry for each commodity at each market that is NOT a forecast
                    request = "SELECT DISTINCT ON (mktid, cmname) * FROM {table} WHERE mktid={market_id} AND mktname='{market_name}' AND adm1id={region_id} AND category LIKE '{cat_name}' AND date > current_date - interval '{x}' month AND forecast = 'False' ORDER  BY mktid, cmname, date desc".format(
                        table=CARTO_ALPS_TABLE,
                        market_id=market_entry['market_id'],
                        market_name=market_entry['market_name'].replace(
                            "'", "''"),
                        region_id=market_entry['region_id'],
                        cat_name=sql_query,
                        x=LOOKBACK)
                    r = cartosql.get(request,
                                     user=os.getenv('CARTO_USER'),
                                     key=os.getenv('CARTO_KEY'))
                    alps_entries = r.json()['rows']
                    break
                except:
                    try_num += 1
            uid = genInteractionUID(market_entry['region_id'],
                                    market_entry['market_id'],
                                    market_entry['market_name'], food_category)
            if alps_entries:
                commodity_num = 1
                for entry in alps_entries:
                    if commodity_num == 1:
                        interaction_string = INTERACTION_STRING_FORMAT.format(
                            num=commodity_num,
                            commodity=entry['cmname'],
                            alps=entry['alps'].lower(),
                            date=entry['date'][:10])
                    else:
                        interaction_string = interaction_string + '; ' + INTERACTION_STRING_FORMAT.format(
                            num=commodity_num,
                            commodity=entry['cmname'],
                            alps=entry['alps'].lower(),
                            date=entry['date'][:10])
                    commodity_num += 1
                # create new Carto row
                row = []
                for field in CARTO_INTERACTION_SCHEMA.keys():
                    if field == 'uid':
                        row.append(uid)
                    elif field == 'market_id':
                        row.append(int(market_entry['market_id']))
                    elif field == 'the_geom':
                        shapely_point = wkb.loads(market_entry['the_geom'],
                                                  hex=True)
                        json_point = json.loads(
                            json.dumps(
                                shapely.geometry.mapping(shapely_point)))
                        row.append(json_point)
                    elif field == 'region_name':
                        row.append(market_entry['region_name'])
                    elif field == 'region_id':
                        row.append(market_entry['region_id'])
                    elif field == 'market_name':
                        row.append(market_entry['market_name'])
                    elif field == 'market_interaction':
                        if len(alps_entries) == 0:
                            row.append(None)
                        else:
                            row.append(interaction_string)
                    elif field == 'category':
                        row.append(food_category)
                    elif field == 'highest_pewi':
                        if len(alps_entries) == 0:
                            row.append(None)
                        else:
                            highest_pewi = max(
                                [entry['pewi'] for entry in alps_entries])
                            row.append(highest_pewi)
                    elif field == 'highest_alps':
                        if len(alps_entries) == 0:
                            row.append(None)
                        else:
                            highest_alps_category = assignALPS(highest_pewi)
                            row.append(highest_alps_category)
                    elif field == INTERACTION_TIME_FIELD:
                        if len(alps_entries) == 0:
                            row.append(None)
                        else:
                            row.append(
                                min(entry['date'] for entry in alps_entries))
                new_rows.append(row)
                num_new_interactions += 1
            #delete old entries for the markets that were updated
            #logging.info('Deleting old interactions from Carto')
            try:
                cartosql.deleteRows(CARTO_INTERACTION_TABLE,
                                    "{} = '{}'".format(UID_FIELD, uid),
                                    user=os.getenv('CARTO_USER'),
                                    key=os.getenv('CARTO_KEY'))
            except:
                pass
        #cartosql.deleteRowsByIDs(CARTO_INTERACTION_TABLE, markets_to_process, id_field='market_id')
        #send new rows for these markets
        #logging.info('Sending new interactions to Carto')
        if new_rows:
            cartosql.insertRows(CARTO_INTERACTION_TABLE,
                                CARTO_INTERACTION_SCHEMA.keys(),
                                CARTO_INTERACTION_SCHEMA.values(),
                                new_rows,
                                blocksize=500,
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))
        market_num += 1
    return num_new_interactions