def main():
    '''Sync the Carto table with the source: create/clear it as needed, upload new rows, trim old ones.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    # optionally start from scratch by dropping the existing table
    if CLEAR_TABLE_FIRST and cartosql.tableExists(CARTO_TABLE):
        cartosql.dropTable(CARTO_TABLE)

    # 1. Check if table exists and create table
    if cartosql.tableExists(CARTO_TABLE):
        logging.info('Fetching existing ids')
        existing_ids = getIds(CARTO_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_TABLE))
        createTableWithIndex(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
        existing_ids = []

    # 2. Iteratively fetch, parse and post new data
    new_ids = processNewData(existing_ids)
    new_count = len(new_ids)
    existing_count = new_count + len(existing_ids)
    logging.info('Total rows: {}, New: {}, Max: {}'.format(
        existing_count, new_count, MAXROWS))

    # 3. Remove old observations
    deleteExcessRows(CARTO_TABLE, MAXROWS, TIME_FIELD)  # MAXAGE)

    logging.info('SUCCESS')
def main():
    '''Refresh the Carto table: optionally clear it, upload new rows, and prune old ones.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    # wipe the table up front when a full rebuild was requested
    if CLEAR_TABLE_FIRST and cartosql.tableExists(CARTO_TABLE):
        logging.info("Clearing table")
        cartosql.dropTable(CARTO_TABLE)

    # 1. Check if table exists and create table
    if cartosql.tableExists(CARTO_TABLE):
        existing_ids = getFieldAsList(CARTO_TABLE, UID_FIELD)
    else:
        createTableWithIndices(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
        existing_ids = []

    # 2. Iteratively fetch, parse and post new data
    num_new = processNewData(existing_ids)
    existing_count = num_new + len(existing_ids)
    logging.info('Total rows: {}, New: {}, Max: {}'.format(
        existing_count, num_new, MAXROWS))

    # 3. Remove old observations
    deleteExcessRows(CARTO_TABLE, MAXROWS, TIME_FIELD, MAXAGE)

    logging.info('SUCCESS')
def main():
    '''Update the Carto table with new data, trim excess rows, and record the update date on RW.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')
    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_TABLE):
            # delete the rows rather than dropping the table, so the table keeps existing
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
    # 1. Check if table exists and create table
    existing_ids = []
    if cartosql.tableExists(CARTO_TABLE):
        logging.info('Fetching existing ids')
        existing_ids = getIds(CARTO_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_TABLE))
        createTableWithIndex(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
    # 2. Iteratively fetch, parse and post new data
    new_count = processNewData(existing_ids)
    # FIX: include the newly uploaded rows in the reported total; previously
    # total_count = len(existing_ids) so the 'Total rows' log line omitted the
    # new rows (every sibling script sums new + existing).
    total_count = new_count + len(existing_ids)
    logging.info('Total rows: {}, New: {}, Max: {}'.format(
        total_count, new_count, MAXROWS))
    # 3. Remove old observations
    deleteExcessRows(CARTO_TABLE, MAXROWS, TIME_FIELD)
    # Get most recent update date
    if new_count > 0:
        # new rows were added, so stamp the dataset as updated right now
        most_recent_date = datetime.datetime.utcnow()
        lastUpdateDate(DATASET_ID, most_recent_date)
    logging.info('SUCCESS')
def main():
    '''Refresh the Carto table and push the newest observation date to Resource Watch.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    # clear out all rows first when requested (the table itself is kept)
    if CLEAR_TABLE_FIRST and cartosql.tableExists(CARTO_TABLE):
        logging.info("Clearing table")
        cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL',
                            user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))

    # 1. Check if table exists and create table
    if cartosql.tableExists(CARTO_TABLE):
        existing_ids = getFieldAsList(CARTO_TABLE, UID_FIELD)
    else:
        createTableWithIndices(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
        existing_ids = []

    # 2. Iteratively fetch, parse and post new data
    num_new = processNewData(existing_ids)
    existing_count = num_new + len(existing_ids)
    logging.info('Total rows: {}, New: {}, Max: {}'.format(existing_count, num_new, MAXROWS))

    # 3. Remove old observations
    deleteExcessRows(CARTO_TABLE, MAXROWS, TIME_FIELD, MAXAGE)

    # Get most recent update date and report it to Resource Watch
    most_recent_date = get_most_recent_date(CARTO_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)

    logging.info('SUCCESS')
def main():
    '''Maintain the paired Explore and Planet Pulse Carto tables: create, append, and trim.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    # drop both tables up front when a rebuild was requested
    if CLEAR_TABLE_FIRST:
        logging.info('Clearing explore table')
        cartosql.dropTable(CARTO_TABLE_EXPLORE)
        cartosql.dropTable(CARTO_TABLE_PLANETPULSE)

    # 1. Check if table exists and create table
    ## EXPLORE TABLE
    if cartosql.tableExists(CARTO_TABLE_EXPLORE):
        logging.info('Fetching existing explore ids')
        existing_explore_ids = getIds(CARTO_TABLE_EXPLORE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_TABLE_EXPLORE))
        createTableWithIndex(CARTO_TABLE_EXPLORE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
        existing_explore_ids = []

    ## PLANET PULSE TABLE
    if cartosql.tableExists(CARTO_TABLE_PLANETPULSE):
        logging.info('Fetching existing planet pulse ids')
        existing_pp_ids = getIds(CARTO_TABLE_PLANETPULSE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_TABLE_PLANETPULSE))
        createTableWithIndex(CARTO_TABLE_PLANETPULSE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
        existing_pp_ids = []

    # 2. Iteratively fetch, parse and post new data to both tables
    num_new_explore, num_new_pp = processNewData(existing_explore_ids, existing_pp_ids)

    explore_total = num_new_explore + len(existing_explore_ids)
    pp_total = num_new_pp + len(existing_pp_ids)
    logging.info('Total rows in Explore table: {}, New: {}, Max: {}'.format(
        explore_total, num_new_explore, MAXROWS_EXPLORE))
    logging.info('Total rows in Planet Pulse table: {}, New: {}, Max: {}'.format(
        pp_total, num_new_pp, MAXROWS_PLANETPULSE))

    # 3. Remove old observations from each table
    deleteExcessRows(CARTO_TABLE_EXPLORE, MAXROWS_EXPLORE, TIME_FIELD, MAXAGE_EXPLORE)
    deleteExcessRows(CARTO_TABLE_PLANETPULSE, MAXROWS_PLANETPULSE, TIME_FIELD, MAXAGE_PLANETPULSE)

    logging.info('SUCCESS')
def main():
    '''Update the market and ALPS Carto tables, trim the ALPS table, and report the latest date.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')
    if CLEAR_TABLE_FIRST:
        if cartosql.tableExists(CARTO_MARKET_TABLE):
            cartosql.dropTable(CARTO_MARKET_TABLE)
        if cartosql.tableExists(CARTO_ALPS_TABLE):
            cartosql.dropTable(CARTO_ALPS_TABLE)
    # 1. Check if table exists and create table
    existing_markets = []
    if cartosql.tableExists(CARTO_MARKET_TABLE):
        logging.info('Fetching existing ids')
        existing_markets = getIds(CARTO_MARKET_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_MARKET_TABLE))
        createTableWithIndex(CARTO_MARKET_TABLE, CARTO_MARKET_SCHEMA, UID_FIELD)
    existing_alps = []
    if cartosql.tableExists(CARTO_ALPS_TABLE):
        logging.info('Fetching existing ids')
        existing_alps = getIds(CARTO_ALPS_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_ALPS_TABLE))
        createTableWithIndex(CARTO_ALPS_TABLE, CARTO_ALPS_SCHEMA, UID_FIELD, TIME_FIELD)
    # 2. Iteratively fetch, parse and post new data
    num_new_markets, num_new_alps = processNewData(existing_markets, existing_alps)
    # FIX: use dedicated count variables instead of rebinding the id lists
    # (existing_markets / existing_alps) to integers — the original shadowed
    # the lists with their own totals, which was confusing and error-prone.
    market_count = num_new_markets + len(existing_markets)
    logging.info('Total market rows: {}, New: {}, Max: {}'.format(
        market_count, num_new_markets, MAXROWS))
    alps_count = num_new_alps + len(existing_alps)
    logging.info('Total alps rows: {}, New: {}, Max: {}'.format(
        alps_count, num_new_alps, MAXROWS))
    # 3. Remove old observations (only the time-indexed ALPS table is trimmed)
    deleteExcessRows(CARTO_ALPS_TABLE, MAXROWS, TIME_FIELD)  # MAXAGE)
    # Get most recent update date
    most_recent_date = get_most_recent_date(CARTO_ALPS_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)
    logging.info('SUCCESS')
def checkCreateTable(table, schema, id_field='', time_field=''):
    '''
    Ensure a Carto table exists, creating it (plus indices) when it is missing.

    INPUT   table: Carto table to check or create (string)
            schema: dictionary of column names and types, used if we are
                creating the table for the first time (dictionary)
            id_field: optional, column to enforce as a unique index so Carto
                rejects duplicate uploads (string)
            time_field: optional, datetime column to index (string)
    '''
    if cartosql.tableExists(table, user=CARTO_USER, key=CARTO_KEY):
        # nothing to do — the table is already present in the Carto account
        print('Carto table already exists.')
        return
    # table is missing: build it from the provided schema
    print('Table {} does not exist, creating'.format(table))
    cartosql.createTable(table, schema, user=CARTO_USER, key=CARTO_KEY)
    # a unique index on id_field lets Carto reject duplicate rows on upload
    if id_field:
        cartosql.createIndex(table, id_field, unique=True, user=CARTO_USER, key=CARTO_KEY)
    # a plain (non-unique) index on the time column speeds up date queries
    if time_field:
        cartosql.createIndex(table, time_field, user=CARTO_USER, key=CARTO_KEY)
def main():
    '''Sync the Carto table with the FTP source and update the RW last-update date.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

    # optionally rebuild from scratch
    if CLEAR_TABLE_FIRST and cartosql.tableExists(CARTO_TABLE):
        cartosql.dropTable(CARTO_TABLE)

    ### 1. Check if table exists, if not, create it
    checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)

    ### 2. Retrieve existing data, newest first
    r = cartosql.getFields(UID_FIELD, CARTO_TABLE, order='{} desc'.format(TIME_FIELD), f='csv')
    # drop the CSV header and the trailing empty entry
    existing_ids = r.text.split('\r\n')[1:-1]
    num_existing = len(existing_ids)

    ### 3. Fetch data from FTP, dedupe, process
    num_new = processData(existing_ids)

    ### 4. Delete data to get back to MAX_ROWS
    num_dropped = deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD, MAX_AGE)

    ### 5. Notify results
    total = num_existing + num_new - num_dropped

    # Get most recent update date
    most_recent_date = get_most_recent_date(CARTO_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)

    logging.info('Existing rows: {}, New rows: {}, Max: {}'.format(total, num_new, MAX_ROWS))
    logging.info("SUCCESS")
def checkCreateTable(table, schema, id_field, time_field):
    '''Return the ids already in the table, or create the table (with indices) and return [].'''
    user = os.getenv('CARTO_USER')
    key = os.getenv('CARTO_KEY')
    if cartosql.tableExists(table, user=user, key=key):
        logging.info('Fetching existing IDs')
        r = cartosql.getFields(id_field, table, f='csv', user=user, key=key)
        # strip the CSV header row and the trailing empty entry
        return r.text.split('\r\n')[1:-1]
    logging.info('Table {} does not exist, creating'.format(table))
    cartosql.createTable(table, schema, user=user, key=key)
    # unique index so duplicate uploads are rejected by Carto
    cartosql.createIndex(table, id_field, unique=True, user=user, key=key)
    cartosql.createIndex(table, time_field, user=user, key=key)
    return []
def cleanOldRows(table, time_field, max_age, date_format='%Y-%m-%d %H:%M:%S'):
    '''
    Delete rows that are older than a certain threshold
    INPUT   table: name of table in Carto from which we will delete the old data (string)
            time_field: column that stores datetime information (string)
            max_age: oldest date that can be stored in the Carto table (datetime object)
            date_format: format of dates in Carto table (string)
    RETURN  num_expired: number of rows that have been dropped from the table (integer)
    '''
    # initialize number of rows that will be dropped as 0
    num_expired = 0
    if not cartosql.tableExists(table, CARTO_USER, CARTO_KEY):
        # report an error if the table doesn't exist
        logging.error("{} table does not exist yet".format(table))
        return num_expired
    if isinstance(max_age, datetime.datetime):
        # convert datetime object to string formatted according to date_format
        max_age = max_age.strftime(date_format)
    elif isinstance(max_age, str):
        logging.error('Max age must be expressed as a datetime.datetime object')
        # FIX: previously execution fell through after logging this error and
        # ran the DELETE with the unvalidated string anyway; bail out instead
        # of deleting rows based on invalid input.
        return num_expired
    # delete rows from table which are older than the max_age
    r = cartosql.deleteRows(table, "{} < '{}'".format(time_field, max_age),
                            CARTO_USER, CARTO_KEY)
    # get the number of rows that were dropped from the table
    num_expired = r.json()['total_rows']
    return num_expired
def main():
    '''Fetch new data into the Carto table, trim it, and update the RW date when rows were added.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

    if CLEAR_TABLE_FIRST and cartosql.tableExists(CARTO_TABLE):
        cartosql.dropTable(CARTO_TABLE)

    ### 1. Check if table exists and create table
    checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
    existing_ids = getFieldAsList(CARTO_TABLE, UID_FIELD)
    num_existing = len(existing_ids)

    ### 2. Fetch data from FTP, dedupe, process
    num_new = processData(existing_ids)
    num_total = num_existing + num_new

    ### 3. Notify results
    logging.info('Total rows: {}, New rows: {}, Max: {}'.format(num_total, num_new, MAX_ROWS))
    deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD)

    # if new rows were added to table, make today the most recent update date
    if num_new > 0:
        lastUpdateDate(DATASET_ID, datetime.datetime.utcnow())

    logging.info("SUCCESS")
def main():
    '''Entry point: optionally clear the table, upload new data, sync RW, and clean up.'''
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    logging.info('STARTING')

    # clear the table before starting, if specified
    if CLEAR_TABLE_FIRST:
        logging.info('Clearing Table')
        if cartosql.tableExists(CARTO_TABLE, user=CARTO_USER, key=CARTO_KEY):
            # delete rows only; dropping the whole table would make the dataset
            # visualization on Resource Watch disappear until the table is
            # opened again in Carto
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL',
                                user=CARTO_USER, key=CARTO_KEY)

    # Check if table exists, create it if it does not
    logging.info('Checking if table exists and getting existing IDs.')
    existing_ids = checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD)

    # Fetch, process, and upload the new data
    logging.info('Fetching new data')
    num_new = processData(existing_ids)
    logging.info('Previous rows: {}, New rows: {}'.format(len(existing_ids), num_new))

    # Update Resource Watch
    updateResourceWatch(num_new)

    # Delete local files in Docker container
    delete_local()

    logging.info('SUCCESS')
def checkCreateTable(table, schema, id_field, time_field=''):
    '''
    Create the table if it does not exist, and pull list of IDs already in the table if it does
    INPUT   table: Carto table to check or create (string)
            schema: dictionary of column names and types, used if we are creating the
                table for the first time (dictionary)
            id_field: name of column that we want to use as a unique ID for this table;
                used to compare the source data to our table so we only pull data we
                haven't previously uploaded (string)
            time_field: optional, name of column that will store datetime information (string)
    RETURN  list of existing IDs in the table, pulled from the id_field column (list of strings)
    '''
    if cartosql.tableExists(table, user=CARTO_USER, key=CARTO_KEY):
        # table is present: pull every value currently stored in the id column
        logging.info('Fetching existing IDs')
        r = cartosql.getFields(id_field, table, f='csv', post=True, user=CARTO_USER, key=CARTO_KEY)
        # slice off the CSV header and the empty string after the final newline
        return r.text.split('\r\n')[1:-1]
    # table is absent: build it from the schema, then index it
    logging.info('Table {} does not exist, creating'.format(table))
    cartosql.createTable(table, schema, user=CARTO_USER, key=CARTO_KEY)
    # a unique index on the id column lets Carto reject duplicate rows on upload
    if id_field:
        cartosql.createIndex(table, id_field, unique=True, user=CARTO_USER, key=CARTO_KEY)
    # a non-unique index on the time column, when one was given
    if time_field:
        cartosql.createIndex(table, time_field, user=CARTO_USER, key=CARTO_KEY)
    # a brand-new table has no IDs yet
    return []
def main():
    '''Refresh the Carto table, prune by row count and age, and push the latest date to RW.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    if CLEAR_TABLE_FIRST:
        logging.info('Clearing table')
        cartosql.dropTable(CARTO_TABLE)

    # 1. Check if table exists and create table
    if cartosql.tableExists(CARTO_TABLE):
        logging.info('Fetching existing ids')
        existing_ids = getIds(CARTO_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_TABLE))
        createTableWithIndex(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
        existing_ids = []

    # 2. Iteratively fetch, parse and post new data
    num_new = processNewData(existing_ids)
    existing_count = num_new + len(existing_ids)
    logging.info('Total rows: {}, New: {}, Max: {}'.format(
        existing_count, num_new, MAXROWS))

    # 3. Remove old observations: cap the row count and drop anything older
    # than Jan 1 of the MAXAGE year
    deleteExcessRows(CARTO_TABLE, MAXROWS, TIME_FIELD,
                     datetime.datetime(year=MAXAGE, month=1, day=1))

    # Get most recent update date
    most_recent_date = get_most_recent_date(CARTO_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)

    logging.info('SUCCESS')
def processData():
    '''
    Function to download data and upload it to Carto
    Will first try to get the data for today three times
    Then decrease a day up until 8 tries until it finds one
    '''
    # start with yesterday's file (today's may not be published yet)
    date = datetime.date.today() - datetime.timedelta(days=1)
    success = False
    tries = 0
    while tries < MAX_TRIES and success == False:
        logging.info("Fetching data for {}".format(str(date)))
        f = getFilename(date)
        url = SOURCE_URL.format(date=date.strftime('%Y%m%d'))
        try:
            urllib.request.urlretrieve(url, f)
        except Exception as inst:
            logging.info("Error fetching data for {}".format(str(date)))
            # after the first couple of failed tries, start walking back a day at a time
            if tries >= 2:
                date = date - datetime.timedelta(days=1)
            tries = tries + 1
            if tries == MAX_TRIES:
                logging.error(
                    "Error fetching data for {}, and max tries reached. See source for last data update."
                    .format(str(datetime.date.today())))
                success = False
        else:
            # download succeeded: load only the columns we need
            df = pd.read_csv(f, header=0, usecols=[
                'Lat_DNB', 'Lon_DNB', 'Date_Mscan', 'Date_LTZ', 'QF_Detect',
                'EEZ', 'Land_Mask'
            ])
            # drop rows carrying the 999999 sentinel in the detection-quality flag
            df = df.drop(df[df.QF_Detect == 999999].index)
            # build a point geometry from the lon/lat columns
            df['the_geom'] = df.apply(
                lambda row: getGeom(row['Lon_DNB'], row['Lat_DNB']), axis=1)
            # reorder columns to match the Carto schema
            df = df[[
                'the_geom', 'QF_Detect', 'Date_Mscan', 'Date_LTZ', 'Land_Mask',
                'Lon_DNB', 'Lat_DNB', 'EEZ'
            ]]
            if not cartosql.tableExists(CARTO_TABLE):
                logging.info('Table {} does not exist'.format(CARTO_TABLE))
                cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)
            else:
                # NOTE(review): createTable is invoked again on an existing table
                # right after its rows are deleted — presumably harmless/no-op;
                # confirm against the cartosql library's behavior
                cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL')
                cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)
            rows = df.values.tolist()
            logging.info('Success!')
            #logging.info('The following includes the first ten rows added to Carto:')
            #logging.info(rows[:10])
            if len(rows):
                cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                         CARTO_SCHEMA.values(), rows)
            tries = tries + 1
            success = True
def checkCreateTable(table, schema, id_field, time_field):
    '''Return the existing ids for a table, creating the table (with indices) when absent.'''
    if not cartosql.tableExists(table):
        logging.info('Table {} does not exist, creating'.format(table))
        cartosql.createTable(table, schema)
        # unique id index + time index for efficient dedupe and pruning
        cartosql.createIndex(table, id_field, unique=True)
        cartosql.createIndex(table, time_field)
        return []
    logging.info('Fetching existing IDs')
    return getFieldAsList(id_field, table)
def checkCreateTable(table, schema, id_field, time_field):
    '''
    Create the table (with a unique index on id_field and, when distinct, an
    index on time_field) if it does not already exist.

    NOTE(review): despite the original docstring claiming a list of existing
    ids is returned, this variant returns None in every case and never fetches
    ids — callers must not rely on its return value.
    '''
    if cartosql.tableExists(table):
        logging.info('Table {} already exists'.format(table))
    else:
        logging.info('Creating Table {}'.format(table))
        cartosql.createTable(table, schema)
        cartosql.createIndex(table, id_field, unique=True)
        # skip the second index when the id and time columns are the same
        if id_field != time_field:
            cartosql.createIndex(table, time_field)
def checkCreateTable(table, schema, id_field, time_field=''):
    '''Fetch the ids already present in the table, creating the table first if needed.'''
    if not cartosql.tableExists(table):
        logging.info('Table {} does not exist, creating'.format(table))
        cartosql.createTable(table, schema)
        # unique id index for dedupe; optional time index for pruning queries
        cartosql.createIndex(table, id_field, unique=True)
        if time_field:
            cartosql.createIndex(table, time_field)
        return []
    logging.info('Fetching existing IDs')
    resp = cartosql.getFields(id_field, table, f='csv', post=True)
    # drop the CSV header row and the trailing empty entry
    return resp.text.split('\r\n')[1:-1]
def processArgs(args):
    '''
    Dispatch a parsed docopt argument dict to the matching cartosql call and
    return the formatted response (or the module docstring on --help / when no
    command matched).
    '''
    # optional connection/format overrides passed straight through to cartosql
    opts = {}
    if args['-u']:
        opts['user'] = args['-u']
    if args['-k']:
        opts['key'] = args['-k']
    f = args['-f']
    if f:
        opts['f'] = f
    # bail out with usage text on --help or when credentials fail to initialize
    if args['--help'] or not cartosql.init():
        return __doc__
    if args['<sql>']:
        # raw SQL commands: post (write) or get (read)
        if args['post']:
            r = cartosql.post(args['<sql>'], **opts)
            return returnFormat(r, f)
        elif args['get']:
            r = cartosql.get(args['<sql>'], **opts)
            return returnFormat(r, f)
    elif args['select']:
        if args['<fields>'] and args['<table>']:
            # optional WHERE / ORDER BY / LIMIT clauses
            if args['-w']:
                opts['where'] = args['-w']
            if args['-o']:
                opts['order'] = args['-o']
            if args['-l']:
                opts['limit'] = args['-l']
            r = cartosql.getFields(args['<fields>'], args['<table>'], **opts)
            return returnFormat(r, f)
    elif args['ls']:
        r = cartosql.getTables(**opts)
        # table listings default to pretty JSON unless another format was requested
        if f is None or f == 'csv':
            return prettyJson(r)
        return returnFormat(r, f)
    elif args['exists'] and args['<table>']:
        r = cartosql.tableExists(args['<table>'], **opts)
        return r
    elif args['drop'] and args['<table>']:
        # destructive: require --confirm or an interactive 'y' answer
        confirm = args['--confirm']
        if not confirm:
            confirm = input('Drop table {}? (y/N)'.format(args['<table>'])) == 'y'
        if confirm:
            r = cartosql.dropTable(args['<table>'], **opts)
            return returnFormat(r, f)
        else:
            print('Pass option --confirm to drop table')
    return __doc__
def processData():
    '''
    Download the feeds data and upload it to Carto.
    Will retry the download up to MAX_TRIES times, then quit.
    '''
    success = False
    tries = 0
    df = None
    # retry the feed download until it succeeds or we run out of tries
    while tries < MAX_TRIES and success == False:
        logging.info('Try running feeds, try number = {}'.format(tries))
        try:
            df = feeds()
            success = True
        except Exception as inst:
            logging.info(inst)
            logging.info("Error fetching data trying again")
            tries = tries + 1
            if tries == MAX_TRIES:
                logging.error(
                    "Error fetching data, and max tries reached. See source for last data update."
                )
                success = False
    if success == True:
        # prepare the destination table: create it when missing, otherwise empty it
        if not cartosql.tableExists(CARTO_TABLE,
                                    user=os.getenv('CARTO_USER'),
                                    key=os.getenv('CARTO_KEY')):
            logging.info('Table {} does not exist'.format(CARTO_TABLE))
            cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA)
        else:
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL',
                                user=os.getenv('CARTO_USER'),
                                key=os.getenv('CARTO_KEY'))
        # FIX: the Carto upload below was duplicated verbatim in both branches
        # above; it is hoisted here so the write logic exists in exactly one place
        # Send dataframe to Carto
        logging.info('Writing to Carto')
        cc = cartoframes.CartoContext(
            base_url="https://{user}.carto.com/".format(user=CARTO_USER),
            api_key=CARTO_KEY)
        cc.write(df, CARTO_TABLE, overwrite=True, privacy='public')
def cleanOldRows(table, time_field, max_age, date_format='%Y-%m-%d %H:%M:%S'):
    '''
    Delete excess rows by age
    max_age should be a datetime.datetime object
    Return number of dropped rows
    '''
    num_expired = 0
    if cartosql.tableExists(table):
        if isinstance(max_age, datetime.datetime):
            # normalize the cutoff to the table's date format
            max_age = max_age.strftime(date_format)
        elif isinstance(max_age, str):
            logging.error('Max age must be expressed as a datetime.datetime object')
            # FIX: previously execution continued after this error and deleted
            # rows using the rejected value; abort instead of deleting on
            # invalid input.
            return num_expired
        r = cartosql.deleteRows(table, "{} < '{}'".format(time_field, max_age))
        num_expired = r.json()['total_rows']
    else:
        logging.error("{} table does not exist yet".format(table))
    return num_expired
def main():
    '''Refresh the Carto table from the source file, prune old/excess rows, and update RW.'''
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    logging.info('STARTING')

    # clear the table before starting, if specified
    if CLEAR_TABLE_FIRST:
        logging.info("clearing table")
        if cartosql.tableExists(CARTO_TABLE, user=CARTO_USER, key=CARTO_KEY):
            # keep the table itself: dropping it would hide the dataset
            # visualization on Resource Watch until the table is reopened in Carto
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL',
                                user=CARTO_USER, key=CARTO_KEY)

    # Check if table exists, create it if it does not
    logging.info('Checking if table exists and getting existing IDs.')
    existing_ids = checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)

    # Delete rows that are older than a certain threshold
    num_expired = cleanOldRows(CARTO_TABLE, TIME_FIELD, MAX_AGE)

    # Get the filename from source url for which we want to download data
    filename = fetchDataFileName(SOURCE_URL)

    # Fetch, process, and upload new data
    logging.info('Fetching new data')
    num_new = processData(SOURCE_URL, filename, existing_ids)
    logging.info('Previous rows: {}, New rows: {}'.format(len(existing_ids), num_new))

    # Delete data to get back to MAX_ROWS
    num_deleted = deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD)

    # Update Resource Watch
    updateResourceWatch(num_new)

    logging.info("SUCCESS")
def main():
    '''Pull new FTP data into the Carto table, report counts, and trim to MAX_ROWS.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

    if CLEAR_TABLE_FIRST and cartosql.tableExists(CARTO_TABLE):
        cartosql.dropTable(CARTO_TABLE)

    ### 1. Check if table exists and create table
    checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD, TIME_FIELD)
    existing_ids = getFieldAsList(CARTO_TABLE, UID_FIELD)

    ### 2. Fetch data from FTP, dedupe, process
    num_new = processData(existing_ids)
    num_total = len(existing_ids) + num_new

    ### 3. Notify results
    logging.info('Total rows: {}, New rows: {}, Max: {}'.format(
        num_total, num_new, MAX_ROWS))
    deleteExcessRows(CARTO_TABLE, MAX_ROWS, TIME_FIELD)

    logging.info("SUCCESS")
def processArgs(args):
    '''Route parsed CLI arguments to the matching cartosql operation and format the result.'''
    # collect pass-through connection/format options
    opts = {}
    if args['-u']:
        opts['user'] = args['-u']
    if args['-k']:
        opts['key'] = args['-k']
    f = args['-f']
    if f:
        opts['f'] = f

    if args['--help']:
        return __doc__

    if args['post'] and args['<sql>']:
        return returnFormat(cartosql.post(args['<sql>'], **opts), f)

    if args['get']:
        if args['<sql>']:
            return returnFormat(cartosql.get(args['<sql>'], **opts), f)
        if args['<fields>'] and args['<table>']:
            # optional WHERE / ORDER BY clauses for field selection
            if args['-w']:
                opts['where'] = args['-w']
            if args['-o']:
                opts['order'] = args['-o']
            return returnFormat(cartosql.getFields(args['<fields>'], args['<table>'], **opts), f)
    elif args['ls']:
        r = cartosql.getTables(**opts)
        # table listings default to pretty-printed JSON
        if f is None or f == 'csv':
            return prettyJson(r)
        return returnFormat(r, f)
    elif args['exists'] and args['<table>']:
        return returnFormat(cartosql.tableExists(args['<table>'], **opts), f)
    elif args['drop'] and args['<table>']:
        return returnFormat(cartosql.dropTable(args['<table>'], **opts), f)

    # nothing matched: show usage
    return __doc__
def main():
    '''Refresh the Carto table and its interactions, trim excess rows, and update the RW date.'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    if CLEAR_TABLE_FIRST and cartosql.tableExists(CARTO_TABLE):
        # empty the table rather than dropping it
        cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL',
                            user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))

    ### 1. Check if table exists, if not, create it
    logging.info('Checking if table exists and getting existing IDs.')
    existing_ids = checkCreateTable(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD)
    num_existing = len(existing_ids)

    ### 2. Fetch data from FTP, dedupe, process
    logging.info('Fetching new data')
    num_new = processData(existing_ids)
    logging.info('Processing interactions')
    processInteractions()

    ### 3. Delete data to get back to MAX_ROWS
    logging.info('Deleting excess rows')
    num_dropped = deleteExcessRows(CARTO_TABLE, MAX_ROWS, AGE_FIELD)

    ### 4. Notify results
    total = num_existing + num_new - num_dropped

    # If updates, change update date on RW
    if num_new > 0:
        lastUpdateDate(DATASET_ID, datetime.datetime.utcnow())

    logging.info('Existing rows: {}, New rows: {}, Max: {}'.format(
        total, num_new, MAX_ROWS))
    logging.info("SUCCESS")
def main():
    '''Ingest new data into EE and delete old data'''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    if CLEAR_TABLE_FIRST:
        cartosql.dropTable(CARTO_TABLE)

    # 1. Check collection, create table if necessary
    if cartosql.tableExists(CARTO_TABLE):
        logging.info('Fetching existing ids')
        existing_ids = getIds(CARTO_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(CARTO_TABLE))
        createTableWithIndex(CARTO_TABLE, CARTO_SCHEMA, UID_FIELD)
        existing_ids = []

    # 2. Ingest new data
    num_new_vectors = processNewVectorData(existing_ids)

    # 3. Report results (no row cap is enforced for this table)
    existing_count = num_new_vectors + len(existing_ids)
    logging.info('Total rows: {}, New: {}, Max: {}'.format(
        existing_count, num_new_vectors, 'none'))
def processInteractions():
    '''
    Rebuild the interactions table from the rows marked current='True' in the
    main Carto table: one row per country, with all of that country's events
    concatenated into a single interaction string.
    '''
    # pull all current rows from the main table
    r = cartosql.get(
        "SELECT * FROM {} WHERE current='True'".format(CARTO_TABLE),
        user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
    interaction_data = r.json()['rows']
    try_num = 0
    # if we didn't get data back, wait a few minutes and try again
    while not len(interaction_data):
        logging.info('Sleeping and trying again.')
        try_num += 1
        time.sleep(300)
        # FIX: re-issue the query on each retry; previously the same stale
        # response object was re-parsed, so the loop could never observe
        # new data no matter how long it waited
        r = cartosql.get(
            "SELECT * FROM {} WHERE current='True'".format(CARTO_TABLE),
            user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
        interaction_data = r.json()['rows']
        if try_num > 5:
            logging.error('Problem fetching data to generate interactions')
            exit()
    # collect the distinct countries that have current interactions
    countries_with_interaction = []
    for interaction in interaction_data:
        ctry = interaction['country_iso3']
        if ctry not in countries_with_interaction:
            countries_with_interaction.append(ctry)
    # clear out the old interaction rows (table itself is kept)
    if cartosql.tableExists(CARTO_TABLE_INTERACTION,
                            user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY')):
        cartosql.deleteRows(CARTO_TABLE_INTERACTION, 'cartodb_id IS NOT NULL',
                            user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
    #run to create new table
    #existing_interaction_ids = checkCreateTable(CARTO_TABLE_INTERACTION, CARTO_SCHEMA_INTERACTION, UID_FIELD)
    new_interactions = []
    for ctry in countries_with_interaction:
        # fetch only this country's current rows
        r = cartosql.get(
            "SELECT * FROM {} WHERE current='True' AND country_iso3='{}'".format(CARTO_TABLE, ctry),
            user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
        ctry_interaction_data = r.json()['rows']
        event_num = 1
        # build a single '; '-separated "<event> (<url>)" string for the country;
        # event names like "prefix: name" contribute only the part after the colon
        for interaction in ctry_interaction_data:
            event = interaction['event_name'].split(": ", 1)
            if event_num == 1:
                if len(event) == 1:
                    interaction_str = '{} ({})'.format(event[0], interaction['url'])
                else:
                    interaction_str = '{} ({})'.format(event[1], interaction['url'])
            else:
                if len(event) == 1:
                    interaction_str = interaction_str + '; ' + '{} ({})'.format(
                        event[0], interaction['url'])
                else:
                    interaction_str = interaction_str + '; ' + '{} ({})'.format(
                        event[1], interaction['url'])
            event_num += 1
        #uid = gen_interaction_uid(ctry)
        if ctry_interaction_data:
            # assemble the Carto row: geometry and metadata come from the
            # country's first row, plus the concatenated interaction string
            row = []
            for key in CARTO_SCHEMA_INTERACTION.keys():
                try:
                    if key == 'the_geom':
                        lon = ctry_interaction_data[0]['lon']
                        lat = ctry_interaction_data[0]['lat']
                        item = {'type': 'Point', 'coordinates': [lon, lat]}
                    elif key == 'interaction':
                        item = interaction_str
                    else:
                        item = ctry_interaction_data[0][key]
                except KeyError:
                    # missing source field: store NULL
                    item = None
                row.append(item)
            new_interactions.append(row)
    logging.info('Adding {} new interactions'.format(len(new_interactions)))
    cartosql.blockInsertRows(CARTO_TABLE_INTERACTION,
                             CARTO_SCHEMA_INTERACTION.keys(),
                             CARTO_SCHEMA_INTERACTION.values(),
                             new_interactions,
                             user=os.getenv('CARTO_USER'), key=os.getenv('CARTO_KEY'))
def processNewData(url):
    '''
    Fetch, process and upload new data
    INPUT   url: url where you can find the download link for the source data (string)
    RETURN  num_new: number of rows of new data sent to Carto table (integer)
    '''
    def _fetch_page(page_num):
        # one GET per page; the source nests its records under 'data'
        return requests.get(url.format(page=page_num)).json()['data']

    page = 1
    raw_data = _fetch_page(page)

    if raw_data:
        # the source responded with data: wipe the existing table so it
        # can be fully replaced with the fresh pull
        if cartosql.tableExists(CARTO_TABLE, user=CARTO_USER, key=CARTO_KEY):
            cartosql.deleteRows(CARTO_TABLE, 'cartodb_id IS NOT NULL',
                                user=CARTO_USER, key=CARTO_KEY)
            logging.info('Updating {}'.format(CARTO_TABLE))
    else:
        # nothing came back from the source; leave the table untouched
        logging.error("Source data missing. Table will not update.")

    new_data = []
    # keep paging until the source returns an empty 'data' list
    while raw_data:
        logging.info('Processing page {}'.format(page))
        df = pd.DataFrame(raw_data)
        for _, record in df.iterrows():
            # one value per Carto column: the uid column comes from
            # UID_FIELD; every other field maps empty strings to None
            new_row = [record[UID_FIELD] if field == 'uid'
                       else (record[field] if record[field] != '' else None)
                       for field in CARTO_SCHEMA]
            new_data.append(new_row)
        page += 1
        raw_data = _fetch_page(page)

    num_new = len(new_data)
    if num_new:
        # push all accumulated rows to Carto in one batch
        cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(),
                                 CARTO_SCHEMA.values(), new_data,
                                 user=CARTO_USER, key=CARTO_KEY)
    return num_new
final_df = final_df.where((pd.notnull(final_df)), None) # UPLOAD # specify column names and types CARTO_SCHEMA = { 'iso3': 'text', 'country': 'text', 'year': 'numeric', 'vulnerability': 'numeric', 'readiness': 'numeric', 'gain': 'numeric' } # check if table exists if cartosql.tableExists(CARTO_TABLE, user=os.getenv('CARTO_WRI_RW_USER'), key=os.getenv('CARTO_WRI_RW_KEY')): print('This table already exists. Please change the name and try again.') else: # create table with appropriate columns cartosql.createTable(CARTO_TABLE, CARTO_SCHEMA, user=os.getenv('CARTO_WRI_RW_USER'), key=os.getenv('CARTO_WRI_RW_KEY')) # send processed data to table cartosql.blockInsertRows(CARTO_TABLE, CARTO_SCHEMA.keys(), CARTO_SCHEMA.values(), final_df.values.tolist(), user=os.getenv('CARTO_WRI_RW_USER'), key=os.getenv('CARTO_WRI_RW_KEY'))
def main():
    '''
    Entry point: optionally clear the three Carto tables, ensure each
    exists (creating it with indices when missing), fetch/parse/upload new
    market, ALPS and interaction data, trim excess ALPS rows, and record
    the dataset's most recent update date.
    Credentials are read from the CARTO_USER / CARTO_KEY env variables.
    '''
    logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)
    logging.info('STARTING')

    # hoist the repeated os.getenv lookups used by every cartosql call
    carto_user = os.getenv('CARTO_USER')
    carto_key = os.getenv('CARTO_KEY')

    if CLEAR_TABLE_FIRST:
        # FIX: pass credentials to tableExists — every other cartosql call
        # in this function supplies them explicitly; relying on library
        # defaults here was inconsistent and could hit the wrong account.
        # Rows are deleted (not the table dropped) so schema/indices survive.
        for table in (CARTO_MARKET_TABLE, CARTO_ALPS_TABLE,
                      CARTO_INTERACTION_TABLE):
            if cartosql.tableExists(table, user=carto_user, key=carto_key):
                cartosql.deleteRows(table, 'cartodb_id IS NOT NULL',
                                    user=carto_user, key=carto_key)

    # 1. Check if table exists and create table
    existing_markets = []
    if cartosql.tableExists(CARTO_MARKET_TABLE,
                            user=carto_user, key=carto_key):
        logging.info('Fetching existing market ids')
        existing_markets = getIds(CARTO_MARKET_TABLE, UID_FIELD)
    else:
        logging.info(
            'Table {} does not exist, creating'.format(CARTO_MARKET_TABLE))
        # markets are keyed by uid only (no time column)
        createTableWithIndex(CARTO_MARKET_TABLE, CARTO_MARKET_SCHEMA,
                             UID_FIELD)

    existing_alps = []
    if cartosql.tableExists(CARTO_ALPS_TABLE,
                            user=carto_user, key=carto_key):
        logging.info('Fetching existing ALPS ids')
        existing_alps = getIds(CARTO_ALPS_TABLE, UID_FIELD)
    else:
        logging.info(
            'Table {} does not exist, creating'.format(CARTO_ALPS_TABLE))
        createTableWithIndex(CARTO_ALPS_TABLE, CARTO_ALPS_SCHEMA,
                             UID_FIELD, TIME_FIELD)

    existing_interactions = []
    if cartosql.tableExists(CARTO_INTERACTION_TABLE,
                            user=carto_user, key=carto_key):
        logging.info('Fetching existing interaction ids')
        existing_interactions = getIds(CARTO_INTERACTION_TABLE, UID_FIELD)
    else:
        logging.info('Table {} does not exist, creating'.format(
            CARTO_INTERACTION_TABLE))
        createTableWithIndex(CARTO_INTERACTION_TABLE,
                             CARTO_INTERACTION_SCHEMA,
                             UID_FIELD, INTERACTION_TIME_FIELD)

    # 2. Iteratively fetch, parse and post new data
    num_new_markets, num_new_alps, markets_updated = processNewData(
        existing_markets, existing_alps)

    # Update Interaction table
    num_new_interactions = processInteractions(markets_updated)

    # Report new data count
    num_existing_markets = num_new_markets + len(existing_markets)
    logging.info('Total market rows: {}, New: {}, Max: {}'.format(
        num_existing_markets, num_new_markets, MAXROWS))
    num_existing_alps = num_new_alps + len(existing_alps)
    logging.info('Total alps rows: {}, New: {}, Max: {}'.format(
        num_existing_alps, num_new_alps, MAXROWS))
    num_existing_interactions = num_new_interactions + len(
        existing_interactions)
    logging.info('Total interaction rows: {}, New: {}, Max: {}'.format(
        num_existing_interactions, num_new_interactions, MAXROWS))

    # 3. Remove old observations — only the time-series ALPS table is capped
    deleteExcessRows(CARTO_ALPS_TABLE, MAXROWS, TIME_FIELD)

    # Get most recent update date and record it for the dataset
    most_recent_date = get_most_recent_date(CARTO_ALPS_TABLE)
    lastUpdateDate(DATASET_ID, most_recent_date)

    logging.info('SUCCESS')