def main(): logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL) if CLEAR_TABLE_FIRST: if cartosql.tableExists(CARTO_TABLE): cartosql.dropTable(CARTO_TABLE) ### 1. Check if table exists, if not, create it checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD) ### 2. Retrieve existing data r = cartosql.getFields(UID_FIELD, CARTO_TABLE, order='{} desc'.format(TIME_FIELD), f='csv') existing_ids = r.text.split('\r\n')[1:-1] num_existing = len(existing_ids) ### 3. Fetch data from FTP, dedupe, process num_new = processData(existing_ids) ### 4. Delete data to get back to MAX_ROWS num_dropped = deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD, MAX_AGE) ### 5. Notify results total = num_existing + num_new - num_dropped # Get most recent update date most_recent_date = get_most_recent_date(CARTO_TABLE) lastUpdateDate(DATASET_ID, most_recent_date) logging.info('Existing rows: {}, New rows: {}, Max: {}'.format(total, num_new, MAX_ROWS)) logging.info("SUCCESS")
def checkCreateTable(table, schema, id_field, time_field=''): ''' Create the table if it does not exist, and pull list of IDs already in the table if it does INPUT table: Carto table to check or create (string) schema: dictionary of column names and types, used if we are creating the table for the first time (dictionary) id_field: name of column that we want to use as a unique ID for this table; this will be used to compare the source data to the our table each time we run the script so that we only have to pull data we haven't previously uploaded (string) time_field: optional, name of column that will store datetime information (string) RETURN list of existing IDs in the table, pulled from the id_field column (list of strings) ''' # check it the table already exists in Carto if cartosql.tableExists(table, user=CARTO_USER, key=CARTO_KEY): # if the table does exist, get a list of all the values in the id_field column logging.info('Fetching existing IDs') r = cartosql.getFields(id_field, table, f='csv', post=True, user=CARTO_USER, key=CARTO_KEY) # turn the response into a list of strings, removing the first and last entries (header and an empty space at end) return r.text.split('\r\n')[1:-1] else: # if the table does not exist, create it with columns based on the schema input logging.info('Table {} does not exist, creating'.format(table)) cartosql.createTable(table, schema, user=CARTO_USER, key=CARTO_KEY) # if a unique ID field is specified, set it as a unique index in the Carto table; when you upload data, Carto # will ensure no two rows have the same entry in this column and return an error if you try to upload a row with # a duplicate unique ID if id_field: cartosql.createIndex(table, id_field, unique=True, user=CARTO_USER, key=CARTO_KEY) # if a time_field is specified, set it as an index in the Carto table; this is not a unique index if time_field: cartosql.createIndex(table, time_field, user=CARTO_USER, key=CARTO_KEY) # return an empty list because there are no IDs in the new table yet return []
def deleteExcessRows(table, max_rows, time_field): ''' Delete rows to bring count down to max_rows INPUT table: name of table in Carto from which we will delete excess rows (string) max_rows: maximum rows that can be stored in the Carto table (integer) time_field: column that stores datetime information (string) RETURN num_dropped: number of rows that have been dropped from the table (integer) ''' # initialize number of rows that will be dropped as 0 num_dropped = 0 # get cartodb_ids from carto table sorted by date (new->old) r = cartosql.getFields('cartodb_id', table, order='{} desc'.format(time_field), f='csv', user=CARTO_USER, key=CARTO_KEY) # turn response into a list of strings of the ids ids = r.text.split('\r\n')[1:-1] # if number of rows is greater than max_rows, delete excess rows if len(ids) > max_rows: r = cartosql.deleteRowsByIDs(table, ids[max_rows:], CARTO_USER, CARTO_KEY) # get the number of rows that have been dropped from the table num_dropped += r.json()['total_rows'] if num_dropped: logging.info('Dropped {} old rows from {}'.format(num_dropped, table)) return (num_dropped)
def main(): logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL) ### 1. Authenticate to Carto CARTO_USER = os.environ.get('CARTO_USER') CARTO_KEY = os.environ.get('CARTO_KEY') cc = cartoframes.CartoContext(base_url='https://{}.carto.com/'.format(CARTO_USER), api_key=CARTO_KEY) #check size of old table r = cartosql.getFields('datetime', CARTO_TABLE, f='csv') existing_ids = r.text.split('\r\n')[1:-1] num_existing = len(existing_ids) ### 2. Fetch data from FTP, dedupe, process df = processData() num_rows = df.shape[0] cc.write(df, CARTO_TABLE, overwrite=True, privacy='public') # Get most recent update date if num_rows > num_existing: most_recent_date = datetime.datetime.utcnow() lastUpdateDate(DATASET_ID, most_recent_date) ### 3. Notify results logging.info('Existing rows: {}'.format(num_rows)) logging.info("SUCCESS")
def checkCreateTable(table, schema, id_field, time_field): '''Get existing ids or create table''' if cartosql.tableExists(table, user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')): logging.info('Fetching existing IDs') r = cartosql.getFields(id_field, table, f='csv', user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) return r.text.split('\r\n')[1:-1] else: logging.info('Table {} does not exist, creating'.format(table)) cartosql.createTable(table, schema, user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) cartosql.createIndex(table, id_field, unique=True, user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) cartosql.createIndex(table, time_field, user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) return []
def deleteExcessRows(table, max_rows, time_field, max_age=''): '''Delete excess rows by age or count''' num_dropped = 0 if isinstance(max_age, datetime.datetime): max_age = max_age.isoformat() # 1. delete by age if max_age: r = cartosql.deleteRows(table, "{} < '{}'".format(time_field, max_age)) num_dropped = r.json()['total_rows'] # 2. get sorted ids (old->new) r = cartosql.getFields('cartodb_id', table, order='{}'.format(time_field), f='csv') ids = r.text.split('\r\n')[1:-1] # 3. delete excess if len(ids) > max_rows: r = cartosql.deleteRowsByIDs(table, ids[:-max_rows]) num_dropped += r.json()['total_rows'] if num_dropped: logging.info('Dropped {} old rows from {}'.format(num_dropped, table)) return num_dropped
def get_most_recent_date(table): #pubdate is the date the report was published r = cartosql.getFields('pubdate', table, f='csv', post=True) dates = r.text.split('\r\n')[1:-1] dates.sort() most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S') return most_recent_date
def get_most_recent_date(table): r = cartosql.getFields(TIME_FIELD, table, f='csv', post=True) dates = r.text.split('\r\n')[1:-1] dates.sort() most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S') return most_recent_date
def main(): logging.basicConfig(stream=sys.stderr, level=logging.INFO) if CLEAR_TABLE_FIRST: cartosql.dropTable(CARTO_TABLE) ### 1. Check if table exists, if not, create it checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD) ### 2. Delete old rows num_expired = cleanOldRows(CARTO_TABLE, TIME_FIELD, MAX_AGE) ### 3. Retrieve existing data r = cartosql.getFields(UID_FIELD, CARTO_TABLE, order='{} desc'.format(TIME_FIELD), f='csv') existing_ids = r.text.split('\r\n')[1:-1] num_existing = len(existing_ids) logging.debug("First 10 IDs already in table: {}".format( existing_ids[:10])) ### 4. Fetch data from FTP, dedupe, process filename = fetchDataFileName(SOURCE_URL) num_new = processData(SOURCE_URL, filename, existing_ids) ### 5. Delete data to get back to MAX_ROWS num_deleted = deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD) ### 6. Notify results logging.info( 'Expired rows: {}, Previous rows: {}, New rows: {}, Dropped rows: {}, Max: {}' .format(num_expired, num_existing, num_new, num_deleted, MAX_ROWS)) logging.info("SUCCESS")
def getIds(table, id_field): '''get ids from table''' r = cartosql.getFields(id_field, table, f='csv', user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) return np.unique(r.text.split('\r\n')[1:-1])
def get_most_recent_date(table): #get only check times for current state (CS) because dates associated with projections are #in the future and don't make sense to list as our most recent update date r = cartosql.getFields(TIME_FIELD, table, where="ifc_type LIKE 'CS'", f='csv', post=True) dates = r.text.split('\r\n')[1:-1] dates.sort() most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S') return most_recent_date
def get_most_recent_date(param): r = cartosql.getFields(TIME_FIELD, CARTO_TABLES[param], f='csv', post=True) dates = r.text.split('\r\n')[1:-1] dates.sort() most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S') now = datetime.datetime.utcnow() if most_recent_date > now: most_recent_date = now return most_recent_date
def get_most_recent_date(table): r = cartosql.getFields('ended', table, f='csv', post=True, user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) dates = r.text.split('\r\n')[1:-1] dates.sort() most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S') return most_recent_date
def checkCreateTable(table, schema, id_field, time_field=''): '''Get existing ids or create table''' if cartosql.tableExists(table): logging.info('Fetching existing IDs') r = cartosql.getFields(id_field, table, f='csv', post=True) return r.text.split('\r\n')[1:-1] else: logging.info('Table {} does not exist, creating'.format(table)) cartosql.createTable(table, schema) cartosql.createIndex(table, id_field, unique=True) if time_field: cartosql.createIndex(table, time_field) return []
def get_most_recent_date(table): ''' Find the most recent date of data in the specified Carto table INPUT table: name of table in Carto we want to find the most recent date for (string) RETURN most_recent_date: most recent date of data in the Carto table, found in the TIME_FIELD column of the table (datetime object) ''' # get dates in TIME_FIELD column r = cartosql.getFields(TIME_FIELD, table, f='csv', post=True, user=CARTO_USER, key=CARTO_KEY) # turn the response into a list of dates dates = r.text.split('\r\n')[1:-1] # sort the dates from oldest to newest dates.sort() # turn the last (newest) date into a datetime object most_recent_date = datetime.datetime.strptime(dates[-1], '%Y-%m-%d %H:%M:%S') return most_recent_date
def deleteExcessRows(table, max_rows, time_field): '''Delete rows to bring count down to max_rows''' num_dropped=0 # 1. get sorted ids (old->new) r = cartosql.getFields('cartodb_id', table, order='{} desc'.format(time_field), f='csv') ids = r.text.split('\r\n')[1:-1] # 2. delete excess if len(ids) > max_rows: r = cartosql.deleteRowsByIDs(table, ids[max_rows:]) num_dropped += r.json()['total_rows'] if num_dropped: logging.info('Dropped {} old rows from {}'.format(num_dropped, table)) return(num_dropped)
def processArgs(args): opts = {} if args['-u']: opts['user'] = args['-u'] if args['-k']: opts['key'] = args['-k'] f = args['-f'] if f: opts['f'] = f if args['--help'] or not cartosql.init(): return __doc__ if args['<sql>']: if args['post']: r = cartosql.post(args['<sql>'], **opts) return returnFormat(r, f) elif args['get']: r = cartosql.get(args['<sql>'], **opts) return returnFormat(r, f) elif args['select']: if args['<fields>'] and args['<table>']: if args['-w']: opts['where'] = args['-w'] if args['-o']: opts['order'] = args['-o'] if args['-l']: opts['limit'] = args['-l'] r = cartosql.getFields(args['<fields>'], args['<table>'], **opts) return returnFormat(r, f) elif args['ls']: r = cartosql.getTables(**opts) if f is None or f == 'csv': return prettyJson(r) return returnFormat(r, f) elif args['exists'] and args['<table>']: r = cartosql.tableExists(args['<table>'], **opts) return r elif args['drop'] and args['<table>']: confirm = args['--confirm'] if not confirm: confirm = input('Drop table {}? (y/N)'.format(args['<table>'])) == 'y' if confirm: r = cartosql.dropTable(args['<table>'], **opts) return returnFormat(r, f) else: print('Pass option --confirm to drop table') return __doc__
def deleteExcessRows(table, max_rows, age_field): '''Delete excess rows by age or count''' num_dropped = 0 # get sorted ids (old->new) r = cartosql.getFields('cartodb_id', table, order='{}'.format(age_field.lower()), f='csv') ids = r.text.split('\r\n')[1:-1] # delete excess if len(ids) > max_rows: r = cartosql.deleteRowsByIDs(table, ids[:-max_rows]) num_dropped += r.json()['total_rows'] if num_dropped: logging.info('Dropped {} old rows from {}'.format(num_dropped, table)) return num_dropped
def processArgs(args): opts = {} if args['-u']: opts['user'] = args['-u'] if args['-k']: opts['key'] = args['-k'] f = args['-f'] if f: opts['f'] = f if args['--help']: return __doc__ elif args['post'] and args['<sql>']: r = cartosql.post(args['<sql>'], **opts) return returnFormat(r, f) elif args['get']: if args['<sql>']: r = cartosql.get(args['<sql>'], **opts) return returnFormat(r, f) if args['<fields>'] and args['<table>']: if args['-w']: opts['where'] = args['-w'] if args['-o']: opts['order'] = args['-o'] r = cartosql.getFields(args['<fields>'], args['<table>'], **opts) return returnFormat(r, f) elif args['ls']: r = cartosql.getTables(**opts) if f is None or f == 'csv': return prettyJson(r) return returnFormat(r, f) elif args['exists'] and args['<table>']: r = cartosql.tableExists(args['<table>'], **opts) return returnFormat(r, f) elif args['drop'] and args['<table>']: r = cartosql.dropTable(args['<table>'], **opts) return returnFormat(r, f) return __doc__
def getFieldAsList(field, table, **args): '''''' return csql.getFields(field, table, f='csv', **args).text.splitlines()[1:]
def getIds(table, id_field): '''get ids from table''' r = cartosql.getFields(id_field, table, f='csv') return r.text.split('\r\n')[1:-1]
def getFieldAsList(field, table, **args): r = cartosql.getFields(field, table, f='csv', **args) return r.text.splitlines()[1:]
def get_most_recent_date(table): r = cartosql.getFields(AGE_FIELD, table, f='csv', post=True) dates = r.text.split('\r\n')[1:-1] dates.sort() most_recent_date = datetime.datetime.strptime(dates[-1], DATETIME_FORMAT) return most_recent_date
def getFieldAsList(table, field, orderBy=''): assert isinstance(field, str), 'Field must be a single string' r = cartosql.getFields(field, table, order='{}'.format(orderBy), f='csv') return(r.text.split('\r\n')[1:-1])
def processInteractions(markets_updated): num_new_interactions = 0 #new_rows = [] if PROCESS_HISTORY_INTERACTIONS == True: # get all markets logging.info('Processing interactions for all ALPS data') markets_to_process = getIds(CARTO_MARKET_TABLE, 'uid') else: logging.info('Getting IDs of interactions that should be updated') r = cartosql.getFields( ['region_id', 'market_id', 'market_name'], CARTO_INTERACTION_TABLE, where="{} < current_date - interval '{}' month".format( INTERACTION_TIME_FIELD, LOOKBACK), f='csv', user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) old_ids = [market.split(',') for market in r.text.split('\r\n')[1:-1]] old_market_uids = [ genMarketUID(old_id[0], old_id[1], old_id[2]) for old_id in old_ids ] logging.info( 'Processing interactions for new ALPS data and re-processing interactions that are out of date' ) markets_to_process = np.unique(markets_updated + old_market_uids) #go through each market that was updated and create the correct rows for them num_markets = len(markets_to_process) market_num = 1 logging.info('{} markets to update interactions for'.format( len(markets_to_process))) for m_uid in markets_to_process: logging.info('processing {} out of {} markets'.format( market_num, len(markets_to_process))) new_rows = [] for food_category, sql_query in CATEGORIES.items(): try_num = 1 while try_num <= 3: try: #logging.info('Processing interaction for {} at uid {}, try number {} (market {} of {})'.format(food_category, m_uid, try_num, market_num, num_markets)) # get information about market r = cartosql.get("SELECT * FROM {} WHERE uid='{}'".format( CARTO_MARKET_TABLE, m_uid), user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) if r.json()['total_rows'] == 0: #logging.info('No rows for interaction') alps_entries = [] break market_entry = r.json()['rows'][0] # get information about food prices at market # SQL gets most recent entry for each commodity at each market that is NOT a forecast request = "SELECT DISTINCT ON (mktid, cmname) * FROM {table} WHERE mktid={market_id} AND mktname='{market_name}' AND adm1id={region_id} AND category LIKE '{cat_name}' AND date > current_date - interval '{x}' month AND forecast = 'False' ORDER BY mktid, cmname, date desc".format( table=CARTO_ALPS_TABLE, market_id=market_entry['market_id'], market_name=market_entry['market_name'].replace( "'", "''"), region_id=market_entry['region_id'], cat_name=sql_query, x=LOOKBACK) r = cartosql.get(request, user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) alps_entries = r.json()['rows'] break except: try_num += 1 uid = genInteractionUID(market_entry['region_id'], market_entry['market_id'], market_entry['market_name'], food_category) if alps_entries: commodity_num = 1 for entry in alps_entries: if commodity_num == 1: interaction_string = INTERACTION_STRING_FORMAT.format( num=commodity_num, commodity=entry['cmname'], alps=entry['alps'].lower(), date=entry['date'][:10]) else: interaction_string = interaction_string + '; ' + INTERACTION_STRING_FORMAT.format( num=commodity_num, commodity=entry['cmname'], alps=entry['alps'].lower(), date=entry['date'][:10]) commodity_num += 1 # create new Carto row row = [] for field in CARTO_INTERACTION_SCHEMA.keys(): if field == 'uid': row.append(uid) elif field == 'market_id': row.append(int(market_entry['market_id'])) elif field == 'the_geom': shapely_point = wkb.loads(market_entry['the_geom'], hex=True) json_point = json.loads( json.dumps( shapely.geometry.mapping(shapely_point))) row.append(json_point) elif field == 'region_name': row.append(market_entry['region_name']) elif field == 'region_id': row.append(market_entry['region_id']) elif field == 'market_name': row.append(market_entry['market_name']) elif field == 'market_interaction': if len(alps_entries) == 0: row.append(None) else: row.append(interaction_string) elif field == 'category': row.append(food_category) elif field == 'highest_pewi': if len(alps_entries) == 0: row.append(None) else: highest_pewi = max( [entry['pewi'] for entry in alps_entries]) row.append(highest_pewi) elif field == 'highest_alps': if len(alps_entries) == 0: row.append(None) else: highest_alps_category = assignALPS(highest_pewi) row.append(highest_alps_category) elif field == INTERACTION_TIME_FIELD: if len(alps_entries) == 0: row.append(None) else: row.append( min(entry['date'] for entry in alps_entries)) new_rows.append(row) num_new_interactions += 1 #delete old entries for the markets that were updated #logging.info('Deleting old interactions from Carto') try: cartosql.deleteRows(CARTO_INTERACTION_TABLE, "{} = '{}'".format(UID_FIELD, uid), user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) except: pass #cartosql.deleteRowsByIDs(CARTO_INTERACTION_TABLE, markets_to_process, id_field='market_id') #send new rows for these markets #logging.info('Sending new interactions to Carto') if new_rows: cartosql.insertRows(CARTO_INTERACTION_TABLE, CARTO_INTERACTION_SCHEMA.keys(), CARTO_INTERACTION_SCHEMA.values(), new_rows, blocksize=500, user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')) market_num += 1 return num_new_interactions