def main(self):
    """ Run the load for the configured table. """
    try:
        self.logger.info("START Create: %s;", self.tmp_file)
        if self.config.get('permutations'):
            self._write_permutations()
        else:
            self._write_options()
        self.logger.info("FINISH Create: %s", self.tmp_file)
        self._load_options()
        LoadState(
            self.oxdb.connection,
            variable_name=LOAD_STATE_TEMPLATE % self.table_name.upper()
        ).upsert(commit=True)
    except Exception as error:
        self.logger.error("Options Loader failed for %s. Error %s",
                          self.table_name.upper(), error)
        self.oxdb.rollback()
        raise Exception("%s options loader failed. Error %s"
                        % (self.table_name.upper(), error))
    finally:
        try:
            self.oxdb.close()
        except Exception:
            pass
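# A minimal sketch of the LoadState pattern used throughout these jobs.
# Assumptions: the import paths and the 'MY_TABLE_OPTIONS' variable name
# below are illustrative only; the source shows just
# LoadState(connection, variable_name=...).upsert(commit=True).
from ox_dw_db import OXDB  # assumed import path
from ox_dw_load_state import LoadState  # assumed import path

with OXDB('SNOWFLAKE') as oxdb:
    # Stamp the variable (no explicit value) and commit in a single call.
    LoadState(oxdb.connection,
              variable_name='MY_TABLE_OPTIONS').upsert(commit=True)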
def update_load_state(self):
    """ Update load_state and commit. """
    LoadState(self.oxdb.connection,
              CONFIG['LOAD_STATE_VAR']).upsert(commit=True)
    LOGGER.info("Updated and committed load_state variable for %s",
                CONFIG['LOAD_STATE_VAR'])
def update_load_state(self, dataset):
    """ Update load state variable(s) if needed. """
    # If this is not a republish, update the load_state.
    if not self.is_republish(dataset):
        variable_value = readable_interval_datetime(
            dataset.meta['readableInterval'])
        self.job.logger.info("DATASET %s(%s) is loaded",
                             variable_value, dataset.serial)
        self.job.load_state.upsert(variable_value)
        # Also update any configured AUXILIARY_LOAD_STATE_VARS.
        for variable_name in self.job.config.get(
                'AUXILIARY_LOAD_STATE_VARS', []):
            LoadState(self.job.dbh, variable_name).upsert(variable_value)
    else:
        # Republish: increment the grid_fact_reload_version load state
        # variable instead.
        LoadState(self.job.dbh, GRID_FACT_RELOAD_VERSION).increment_by_seq(
            SEQ_GRID_FACT_RELOAD_VERSION)
def __init__(self, name, dbh, logger, options=None):
    self.name = name
    self.dbh = dbh
    self.logger = logger
    self.config = get_conf(self.name)
    # NOTE: feed_name is not set here; it is expected to be provided
    # elsewhere on the class (e.g., a property derived from config).
    self.feed = Feeds(ODFI_CONF)[self.feed_name]
    self.load_state = LoadState(
        self.dbh, variable_name=self.config['LOAD_STATE_VAR'])
    self.options = options
    self._depends_on = None
def load_tmp_file(self):
    """ Run the SQL statements to merge any new data into the DW. """
    with OXDB('SNOWFLAKE') as oxdb:
        for stmt in STATEMENTS:
            stmt = stmt.format(self.temp_file)
            self.logger.info("Executing: %s;", stmt)
            self.logger.info("Results: %s;", oxdb.execute(stmt))
        self.logger.info("Setting load state '%s'", LOAD_STATE_NAME)
        LoadState(oxdb.connection,
                  variable_name=LOAD_STATE_NAME).upsert(commit=True)
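# An illustrative shape for STATEMENTS above. Assumptions: the real
# statements live elsewhere; only the positional "{0}" placeholder for the
# temp file path is implied by stmt.format(self.temp_file). Table, stage,
# and column names here are made up.
STATEMENTS = (
    "PUT 'file://{0}' @~/options_stage OVERWRITE = TRUE",
    "COPY INTO options_tmp FROM @~/options_stage"
    " FILE_FORMAT = (TYPE = CSV)",
    "MERGE INTO options t USING options_tmp s ON t.id = s.id"
    " WHEN MATCHED THEN UPDATE SET t.value = s.value"
    " WHEN NOT MATCHED THEN INSERT (id, value) VALUES (s.id, s.value)",
)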
def __call__(self, max_datasets=None, since_serial=None):
    """ Iterate through the datasets and process them.

    :param max_datasets: Optional. ONLY used for TESTING.
    :param since_serial: Optional. ONLY used for TESTING.

    NOTE: The optional parameters above should only be used for testing,
    as they can break the upload logic and lead to data issues downstream.
    """
    if since_serial is None:
        since_serial = self.get_since_serial()
    self.job.logger.debug("SINCE_SERIAL: %s;", since_serial)
    last_dataset = self.job.get_current_dataset()
    loaded_datasets = self.get_serials_since(since_serial)
    self.job.logger.debug("LOADED_DATASETS: %s;", loaded_datasets)
    for dataset in self.job.feed.get_datasets_since_serial(
            since_serial, max_datasets=max_datasets,
            sort_key=UPLOAD_SORT_KEY):
        if dataset.serial in loaded_datasets:
            self.job.logger.info(
                "Dataset serial %s is already loaded, skipping.",
                dataset.serial)
            continue
        if self.is_correct_interval(last_dataset, dataset):
            last_dataset = dataset
            start_time = datetime.utcnow()
            if self.upload(dataset):
                self.record_state(dataset, start_time)
                self.queue_rev_adjustment(dataset.readable_interval,
                                          self.is_republish(dataset))
                self.queue_rollup(dataset.readable_interval)
                continue
            else:
                # Upload did not occur.
                return False
        else:
            # GAP detected: stop the loop.
            return self.has_data
    if self.has_data:
        LoadState(self.job.dbh,
                  self.load_state_serial_name).upsert(last_dataset.serial,
                                                      commit=True)
    return self.has_data
def _update_load_state(db, rollup_config, time_rollup, last_hour):
    # Update load_state only if last_hour is <= the source data's last hour,
    # or if we don't have the source data's last hour.
    if 'source_load_state_variable_name' in time_rollup and \
            'load_state_variable_name' in time_rollup:
        src_load_state_val = [
            row for row in db.cursor().execute(
                rollup_config['QUERY_LOAD_STATE'],
                tuple(time_rollup['source_load_state_variable_name']))
        ]
        if src_load_state_val:
            LoadState(
                db,
                time_rollup['load_state_variable_name']).upsert(last_hour)
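# Illustrative argument shapes for _update_load_state. Assumptions: the SQL,
# the paramstyle ("?"), and the variable names are made up; only the dict
# keys match what the code above actually reads.
EXAMPLE_ROLLUP_CONFIG = {
    'QUERY_LOAD_STATE':
        "SELECT variable_value FROM load_state WHERE variable_name = ?",
}
EXAMPLE_TIME_ROLLUP = {
    # Bound as the query's parameter tuple.
    'source_load_state_variable_name': ('HOURLY_FACT_LOADED',),
    # Upserted with last_hour when the source variable exists.
    'load_state_variable_name': 'DAILY_ROLLUP_LOADED',
}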
def bootstrap(options, dbh):
    """ Initialize the load_state for a given job_name/feed_name. This must
    be done before the job can run.

    :param options: NamedTuple including options defined for this action
        in .actions.py
    """
    job = Job(options.job_name, dbh, get_logger(NAME, debug=options.debug))
    proceed = 'n'
    if job.load_state.variable_value:
        while True:
            proceed = str(
                input("%s already exists with value %s. "
                      "Okay to Update to new values?(Y/n) "
                      % (job.load_state.variable_name,
                         job.load_state.variable_value))).lower() or 'y'
            if proceed in ('y', 'n'):
                if proceed == 'n':
                    sys.exit(0)
                break
    now = datetime.utcnow()
    downloader = Downloader(job)
    uploader = Uploader(job)
    # Create the stage and status tables if they do not exist.
    downloader.create_stage()
    downloader.create_status_table()
    uploader.create_status_table()
    # Add the new data to the DB.
    try:
        min_dataset = job.feed.get_min_dataset(options.readable_interval_str)
        job.load_state.upsert(
            readable_interval_datetime(options.readable_interval_str))
        downloader.add_status(min_dataset, now)
        LoadState(job.dbh,
                  uploader.load_state_serial_name).upsert(min_dataset.serial)
        action = 'Updated' if proceed == 'y' else 'Added'
        sys.stderr.write("%s load_state:%s;%s\n"
                         % (action, job.load_state.variable_name,
                            job.load_state.variable_value))
    except NoDataSetException as error:
        job.logger.error(error)
        print('ERROR', error)
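# A hypothetical invocation of bootstrap. Assumptions: the Options fields
# are inferred from the attribute access in the function body, and the
# sample values are made up.
from collections import namedtuple

Options = namedtuple('Options',
                     ['job_name', 'debug', 'readable_interval_str'])
# bootstrap(Options('my_feed_job', False, '2024-01-01_00'), dbh)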
def sync_table(self):
    """ Perform the configured SQL for the specific table(s). """
    for sync_mode in ['DESTALE', 'DELETE', 'INSERT']:
        statements = self.config.get(sync_mode)
        if not statements:
            # Nothing configured for this mode; skip it rather than
            # attempting to format None below.
            continue
        if not isinstance(statements, (list, tuple)):
            statements = [statements]
        for statement in statements:
            statement %= {
                'DEST_SCHEMA': DB_NAMESPACE,
                'SRC_TABLE': self.table_name_raw
            }
            self.logger.debug(statement)
            self.dest_db.execute(statement)
    LoadState(self.dest_db.connection,
              variable_name='%s_updated' % self.table_name).upsert(
                  commit=True)
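# An illustrative sync config for sync_table. Assumptions: the SQL and table
# names are made up; the %(DEST_SCHEMA)s / %(SRC_TABLE)s keys are the ones
# the method substitutes, and each mode may be a single statement or a list.
EXAMPLE_SYNC_CONFIG = {
    'DESTALE': "UPDATE %(DEST_SCHEMA)s.dim_table SET stale = 1",
    'DELETE': "DELETE FROM %(DEST_SCHEMA)s.dim_table WHERE stale = 1",
    'INSERT': [
        "INSERT INTO %(DEST_SCHEMA)s.dim_table"
        " SELECT * FROM %(SRC_TABLE)s",
    ],
}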
def setUp(self):
    self.load_state = LoadState(OXDB(DB_CONNECTION_NAME).connection,
                                variable_name=VARIABLE_NAME)
def update_load_state(self, var_name):
    """ Set var_name's datetime value from the revenue job timestamp. """
    LoadState(self.job.dbh,
              variable_name=var_name).update_variable_datetime(
                  variable_value=self.revenue_job["jq_utc_timestamp"])
def setUp(self):
    self.load_state = LoadState(sqlite3.connect(DB_FILE),
                                variable_name=VARIABLE_NAME)
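# A companion test sketch for the sqlite fixture above. Assumptions:
# upsert() accepts an explicit value and refreshes .variable_value, as the
# bootstrap snippet suggests by printing variable_value after an upsert.
def test_upsert_sets_value(self):
    self.load_state.upsert('2024-01-01 00:00:00', commit=True)
    self.assertEqual('2024-01-01 00:00:00', self.load_state.variable_value)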
def run(config):
    """ Run the configured SQL statements in order.

    Required in the config:
        STATEMENTS: Ordered list of SQL statements to run.

    Optional:
        DW_NAME: Data warehouse name. Required either in the config file or
            via the sql_runner argument -d (--dw_name).
        APP_NAME: Used for the logger. Defaults to LOAD_STATE_VAR or
            sql_runner.
        LOAD_STATE_VAR: If present, update this load state variable.

    This fails at the first statement that fails and does not continue.
    Be sure to use local temporary or temporary tables, as there is no
    clean-up.
    """
    job_name = config.get('APP_NAME', APP_NAME)
    logger = get_etl_logger(job_name)
    try:
        with pid.PidFile(pidname="%s.LOCK" % job_name, piddir=LOCK_ROOT,
                         enforce_dotpid_postfix=False) as p_lock:
            logger.info("-------------------------------")
            logger.info("Running %s application with process id: %d",
                        job_name, p_lock.pid)
            logger.info("Starting %s for load_state_variable %s",
                        job_name, config.get('LOAD_STATE_VAR'))
            if sys.stdout.isatty():
                sys.stderr.write("Logging all output to %s\n"
                                 % logger.handlers[0].baseFilename)
            logger.info("Connecting to %s", config.get('DW_NAME'))
            with OXDB(config.get('DW_NAME')) as oxdb:
                size = len(config.get('STATEMENTS'))
                # Resolve dynamic variables: a value that is itself a SELECT
                # is executed and replaced by its single-row result.
                for key, val in config.get('VARIABLES', {}).items():
                    if str(val).lower().startswith('select '):
                        val %= config.get('VARIABLES')
                        config['VARIABLES'][key], = \
                            oxdb.get_executed_cursor(val).fetchone()
                for index, statement in enumerate(config.get('STATEMENTS'),
                                                  start=1):
                    statement %= config.get('VARIABLES', {})
                    logger.info("STATEMENT(%s/%s) %s;",
                                index, size, statement)
                    # get_executed_cursor runs the statement; non-SELECT
                    # statements need no further handling here.
                    cursor = oxdb.get_executed_cursor(statement)
                    if str(statement).lower().startswith('select '):
                        # Write SELECT results as CSV to stdout.
                        writer = csv.writer(
                            sys.stdout,
                            delimiter=config.get('FIELD_SEP',
                                                 DEFAULT_FIELD_SEP))
                        if config.get('HEADERS', False):
                            writer.writerow(col[0]
                                            for col in cursor.description)
                        for row in cursor:
                            writer.writerow(row)
                if config.get('LOAD_STATE_VAR') is not None:
                    logger.info("SETTING %s in load_state.",
                                config.get('LOAD_STATE_VAR'))
                    LoadState(
                        oxdb.connection,
                        variable_name=config.get('LOAD_STATE_VAR')).upsert()
                logger.info("Completed %s for load_state_variable %s",
                            job_name, config.get('LOAD_STATE_VAR'))
    except (pid.PidFileAlreadyRunningError, pid.PidFileAlreadyLockedError):
        logger.warning("Unable to get lock for %s application. Exiting...",
                       job_name)
    except Exception as err:
        logger.error("Application %s FAILED. ERROR %s", job_name, err)
        raise Exception("Application %s FAILED. ERROR %s" % (job_name, err))
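# An illustrative config for run(). Assumptions: names and SQL are made up;
# the keys mirror the docstring plus the VARIABLES hook used by the code.
EXAMPLE_CONFIG = {
    'APP_NAME': 'nightly_rollup',
    'DW_NAME': 'SNOWFLAKE',
    'LOAD_STATE_VAR': 'NIGHTLY_ROLLUP_DONE',
    'VARIABLES': {
        # SELECT values are resolved to their single-row result before use.
        'max_hour': "SELECT MAX(hour) FROM fact_table",
    },
    'STATEMENTS': [
        "DELETE FROM rollup_hourly WHERE hour = '%(max_hour)s'",
        "INSERT INTO rollup_hourly"
        " SELECT hour, SUM(clicks) FROM fact_table"
        " WHERE hour = '%(max_hour)s' GROUP BY hour",
    ],
}
# run(EXAMPLE_CONFIG)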