# timestamp_dim_loader: populates the hourly timestamp dimension for one day.
# Depends on yaml plus the project's EtlLogger, JobLock, and OnlineDB helpers;
# a module-level `config` dict (loaded from this job's YAML file) is assumed.
class timestamp_dim_loader:
    def __init__(self):
        # Reading configuration file (YAML file)
        self.logger = EtlLogger.get_logger(self.__class__.__name__)  # use class name as the log name
        self.lock = JobLock(self.__class__.__name__)                 # use class name as the lock name
        self.env = yaml.load(open(config['ENV']))
        self.db = OnlineDB(self.env['DSN'], logger=self.logger)
        set_schema_sql = self.env['SET_SCHEMA_SQL']
        self.db.executeSQL(set_schema_sql)
        self.config = config

    def loadTimestamp(self, date_sid):
        # Insert hourly timestamp rows for date_sid only if none exist yet
        row_sql = config['CHECK_COUNT'] % (date_sid)
        existing_rows = self.db.retrieveSQL(row_sql)
        if not existing_rows:
            date_part = date_sid[0:4] + "-" + date_sid[4:6] + "-" + date_sid[6:8]
            self.logger.info("Starting to load hourly timestamp")
            sql = config['INSERT_NEW_RECORD'] % (date_sid, date_part, date_part)
            self.db.executeSQL(sql, False)
            self.logger.info("inserted records for %s" % (date_part))

    def updateLoadState(self):
        # update load_state and commit
        self.db.updateLoadStateWithCurrentTime(self.config['LOAD_STATE_VAR'])
        self.db.commit()
        self.logger.info("Updated load_state variable %s and committed txn" % (self.config['LOAD_STATE_VAR']))
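# Usage sketch (illustrative, not the project's actual entry point): a minimal driver
# for timestamp_dim_loader, assuming the module-level `config` dict has already been
# loaded from this job's YAML file and that date_sid strings start with YYYYMMDD, as
# the slicing in loadTimestamp implies.
if __name__ == '__main__':
    loader = timestamp_dim_loader()
    loader.loadTimestamp('20240101')  # inserts hourly rows for 2024-01-01 if absent
    loader.updateLoadState()          # record success in load_state and commit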
# schema_mismatch: compares catalog metadata between two Vertica clusters.
# Depends on yaml, subprocess, and the project's OnlineDB helper.
class schema_mismatch:
    def __init__(self, yaml_file):
        # Bootstrap environment
        self.config = yaml.load(open(yaml_file))
        self.env = yaml.load(open(self.config['ENV']))
        self.db = OnlineDB(self.env['DSN'])
        set_schema_sql = self.env['SET_SCHEMA_SQL']
        self.db.executeSQL(set_schema_sql, True)

    def find_mismatch(self):
        # Record tables and columns that differ between the two catalogs
        self.db.executeSQL(self.config['INSERT_TABLE_MISMATCH'], True)
        self.db.executeSQL(self.config['INSERT_COLUMN_MISMATCH'], True)

    def transfer_schema_data(self):
        # Export v_catalog.columns from the second host into DW.schema.temp_columns
        # via vsql's CONNECT TO VERTICA / EXPORT TO VERTICA
        first_host = self.config['FIRST_HOST']
        second_host = self.config['SECOND_HOST']
        self.db.executeSQL(self.config['TRUNCATE_TEMP_COLUMNS'], True)
        self.db.executeSQL(self.config['TRUNCATE_TABLE_MISMATCH'], True)
        self.db.executeSQL(self.config['TRUNCATE_COLUMN_MISMATCH'], True)
        query = ("time /opt/vertica/bin/vsql -h " + second_host
                 + " -U " + self.env['VERTICA_USER']
                 + " -w " + self.env['VERTICA_PASSWORD'] + " -c ")
        query += (" \"CONNECT TO VERTICA DW USER " + self.env['VERTICA_USER']
                  + " PASSWORD '" + self.env['VERTICA_PASSWORD'] + "' ON '" + first_host + "',"
                  + self.env['TCP_PORT'] + ";")
        query += "EXPORT TO VERTICA DW.schema.temp_columns AS select * from DW.v_catalog.columns;\""
        # NOTE: this echoes the full vsql command, including the password, to stdout
        print query
        subprocess.call(query, shell=True)
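# Usage sketch (illustrative): run a schema comparison between the two Vertica hosts
# named in the job's YAML config. The filename below is a placeholder; the call order
# (import the remote catalog first, then record mismatches) is an assumption based on
# which tables each method reads and writes.
if __name__ == '__main__':
    checker = schema_mismatch('schema_mismatch.yaml')
    checker.transfer_schema_data()  # pull v_catalog.columns from SECOND_HOST into temp_columns
    checker.find_mismatch()         # populate the table/column mismatch tables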
# ox_openrtb_sum_hourly: loads one hour of OpenRTB feed data into the hourly summary
# fact, backfilling carrier_dim as needed, then triggers rollups. Depends on yaml,
# OnlineDB, the vertica_*_sql_detailed DDL/load helpers, queue_hour/run_rollups from
# the rollup module, and `from datetime import datetime`.
class ox_openrtb_sum_hourly:
    def __init__(self, yaml_file, intrvl, feed_hour, data_files, schema_file):
        # Bootstrap environment
        self.config = yaml.load(open(yaml_file))
        self.env = yaml.load(open(self.config['ENV']))
        self.rollup_config = yaml.load(open(self.config['ROLLUP_CONFIG']))
        self.db = OnlineDB(self.env['DSN'])
        self.intrvl = intrvl
        self.feed_hour = feed_hour
        self.data_files = data_files
        self.schema_file = schema_file
        set_schema_sql = self.env['SET_SCHEMA_SQL']
        self.db.executeSQL(set_schema_sql, True)

    def load_temporary_table(self):
        tmp_table_name = self.config['FEED_TMP_TABLE_NAME']
        drop_stmt = "DROP TABLE IF EXISTS %s" % (tmp_table_name)
        print "Removing temporary table %s" % (tmp_table_name)
        self.db.executeSQL(drop_stmt, True)

        feed_columns = self.config['FEED_COLUMNS']
        orderby = self.config['FEED_TMP_TABLE_ORDER_BY_CLAUSE']
        segmentation = self.config['FEED_TMP_TABLE_SEGMENTATION_CLAUSE']
        create_stmt = vertica_create_sql_detailed(self.schema_file, tmp_table_name,
                                                  feed_columns, temp=True, pretty=False,
                                                  orderby=orderby, segmentation=segmentation)
        print "Creating temporary table %s using column mapping: %s" % (tmp_table_name, feed_columns)
        self.db.executeSQL(create_stmt, commit=True)

        load_stmt = vertica_multifile_load_sql_detailed(self.data_files, self.schema_file,
                                                        tmp_table_name, feed_columns,
                                                        local=True, pretty=True)
        print "Loading table from data file %s" % (self.data_files)
        result_count = self.db.executeSQL(load_stmt, commit=True)

        # Sanity check: the data should be for the hour ODFI claims it's for
        wrong_hour_result = self.db.retrieveSQLArgs(self.config['FEED_TMP_CHECK_HOUR'], self.feed_hour)
        if wrong_hour_result:
            wrong_hour_count = wrong_hour_result[0][0]
            if wrong_hour_count > 0:
                raise Exception("ERROR: File(s) %s contain %d rows not for the current hour (%s)"
                                % (self.data_files, wrong_hour_count, self.feed_hour))
        return result_count

    def max_element(self, itemList):
        # Return the most frequent code name; each element of itemList is a
        # single-row query result shaped like [(code_name,)]
        counter = {}
        maxItemCount = 0
        mostPopularItem = itemList[0][0][0]
        for item in itemList:
            name = item[0][0]
            counter[name] = counter.get(name, 0) + 1
            if counter[name] > maxItemCount:
                maxItemCount = counter[name]
                mostPopularItem = name
        return mostPopularItem

    def update_carrier_data(self):
        # Insert carrier_dim rows for carrier codes seen in the feed but missing from
        # the dimension; a comma-separated code list resolves to the most common name
        # among its member codes, falling back to "Unknown - Unknown"
        code = self.db.retrieveSQLArgs(self.config['GET_MISSING_CARRIER_CODES'], ())
        code_part = [x[0].encode('UTF-8') for x in code if x[0] is not None]
        print code_part
        for codes in code_part:
            if codes.find(',') > 0:
                single_codes = codes.split(',')
                final_code = {}
                for single_code in single_codes:
                    print "Single code from collection : %s" % (single_code)
                    try:
                        code_name = self.db.retrieveSQL(self.config['GET_SINGLE_CARRIER'] + "'" + single_code + "'")
                    except Exception:
                        print 'Problem retrieving carrier by single code: %s' % (single_code)
                        code_name = None
                    print '%s (Single Code) -> %s (Code Name)' % (single_code, code_name)
                    if code_name:
                        final_code[single_code] = code_name
                final_code_name = "Unknown - Unknown"
                if final_code:
                    final_code_name = self.max_element(final_code.values())
                print "Missing code_name inserting into carrier_dim", final_code_name
                now_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                self.db.executeSQLArgs(self.config['INSERT_CARRIER_DIM'],
                                       (final_code_name, codes, now_str, now_str))
            else:
                single_code = codes
                print "Single code : %s" % (single_code)
                try:
                    code_name = self.db.retrieveSQL(self.config['GET_SINGLE_CARRIER'] + "'" + single_code + "'")
                except Exception:
                    print 'Problem retrieving carrier by single code: %s' % (single_code)
                    code_name = None
                print '%s (Single Code) -> %s (Code Name)' % (single_code, code_name)
                now_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                if not code_name or not code_name[0][0]:
                    self.db.executeSQLArgs(self.config['INSERT_CARRIER_DIM'],
                                           ("Unknown - Unknown", codes, now_str, now_str))
                else:
                    self.db.executeSQLArgs(self.config['INSERT_CARRIER_DIM'],
                                           (code_name[0][0], codes, now_str, now_str))

    def remove_existing_data(self):
        print "DELETE_EXISTING_HOUR: Removing data for hour %s " % (self.feed_hour)
        self.db.executeSQLArgs(self.config['DELETE_EXISTING_HOUR'], (self.feed_hour, self.feed_hour))

    def insert_merged_data(self):
        row_count = self.db.executeSQL(self.config['INSERT_DATA'], False)
        print "INSERT_DATA: Inserted %d rows for hour %s" % (row_count, self.feed_hour)

    def updateLoadState(self):
        # update load_state and commit
        self.db.updateLoadStateWithFeedIntrvl(self.config['LOAD_STATE_VAR'], self.intrvl)
        self.db.commit()
        print "Updated and Committed load_state variable %s for %s" % (self.config['LOAD_STATE_VAR'], self.intrvl)

    def queueDailyRollupJobsForRepublishing(self):
        queue_hour(self.db, self.rollup_config, 'ox_openrtb_sum_hourly', 'day', self.feed_hour)

    def run_rollups(self):
        # Delegates to the module-level run_rollups helper (the method name shadows
        # it only in the class namespace, not at call time)
        run_rollups(self.db, self.rollup_config, 'ox_openrtb_sum_hourly', self.feed_hour)
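# Usage sketch (illustrative): the call order one hourly load appears to expect. All
# argument values are placeholders supplied by the job wrapper in practice.
if __name__ == '__main__':
    job = ox_openrtb_sum_hourly('ox_openrtb_sum_hourly.yaml', intrvl='2024010105',
                                feed_hour='2024-01-01 05:00:00',
                                data_files=['part-0000.gz'],
                                schema_file='feed_schema.xml')
    job.load_temporary_table()               # stage the feed files and verify the hour
    job.update_carrier_data()                # backfill carrier_dim for unseen codes
    job.remove_existing_data()               # make the load idempotent for this hour
    job.insert_merged_data()
    job.run_rollups()
    job.queueDailyRollupJobsForRepublishing()
    job.updateLoadState()                    # commit the load_state only after success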
# sqoop_table: drives the Vertica-to-Hadoop backup flow around a sqoop job, tracking
# progress per partition in the sqoop_etl_status table. Depends on yaml, OnlineDB,
# and `import datetime` (this class uses datetime.datetime.now()).
class sqoop_table:
    def __init__(self, table_name, config, logger):
        self.logger = logger  # use class name as the log name
        # Reading configuration file (YAML file)
        self.config = config
        self.env = yaml.load(open(config['ENV']))
        self.db = OnlineDB(self.env['DSN'], self.logger)
        set_schema_sql = self.env['SET_SCHEMA_SQL']
        self.db.executeSQL(set_schema_sql)

    def drop_table(self, table):
        sql = "drop table if exists " + table
        self.logger.info(sql)
        self.db.executeSQL(sql)

    def check_table(self, table):
        sql = "select table_name from v_catalog.tables where table_name = ?"
        result = self.db.retrieveSQLArgs(sql, (table,))
        if not result:
            return False
        return result[0][0] == table

    def get_primary_column(self, table_name):
        # Select the month/day/primary id which is not backed up yet
        sql = ("select primary_partition_column from sqoop_etl_status where is_complete='f' "
               "and table_name='" + table_name + "' order by primary_partition_value LIMIT 1")
        primary_id = self.db.retrieveSQL(sql)
        if primary_id and primary_id[0][0]:
            print "SELECT: %s value selected for backup is %s" % (primary_id, primary_id[0][0])
        else:
            print "No new data available for copying"
        return primary_id

    def get_primary_value(self, table_name):
        sql = ("select primary_partition_value from sqoop_etl_status where is_complete='f' "
               "and table_name='" + table_name + "' order by primary_partition_value LIMIT 1")
        primary_value = self.db.retrieveSQL(sql)
        return primary_value

    def get_row_count(self, table_name, primary_id, primary_value):
        sql = "select count(*) from " + table_name + " where " + primary_id + " = " + primary_value
        row_count = self.db.retrieveSQL(sql)
        return row_count

    def update_backup_status(self, table_name, primary_id, primary_value):
        # Mark the given partition of the table as backed up
        if primary_id:
            sql = ("update sqoop_etl_status set is_complete='t', end_date = '"
                   + str(datetime.datetime.now())
                   + "' where primary_partition_column=? and primary_partition_value=? and table_name=?")
            self.db.executeSQLArgs(sql, (primary_id, primary_value, table_name))
            self.db.commit()
            self.logger.info("updated sqoop_etl status for table_name=%s", table_name)
            print "COMPLETED: Backed up the data for the %s = %s" % (primary_id, primary_value)

    def deleteCurrentSecondaryId(self, secTable, secID):
        # The sqoop job for this secondary id is complete, so delete it
        sql = "delete from " + secTable + " where secondary_id = ?"
        self.db.executeSQLArgs(sql, (secID,))
        self.db.commit()
        print "DELETE: Completed backup for the secondary id %s" % (secID)

    def create_temp_table(self, tempTable, table_name):
        # Create the temporary table with an additional rowId column, partitioned on rowId
        createSQL = ("create table IF NOT EXISTS " + self.config['DW_DB'] + "." + tempTable
                     + " like " + self.config['DW_DB'] + "." + table_name)
        self.db.executeSQL(createSQL)
        alterSQL = "alter table " + self.config['DW_DB'] + "." + tempTable + " add column rowId int not null"
        self.db.executeSQL(alterSQL)
        partitionSQL = "ALTER TABLE " + self.config['DW_DB'] + "." + tempTable + " PARTITION BY rowId REORGANIZE"
        self.db.executeSQL(partitionSQL)
        self.logger.info("CREATE TEMP TABLE: %s", self.config['DW_DB'] + "." + tempTable)
        print "CREATE: Created temporary table with row id..."

    def truncate_table(self, table_name):
        # Clean up the temp table for the next iteration
        sql = "truncate table " + self.config['DW_DB'] + "." + table_name
        self.logger.info("TRUNCATE TABLE: %s", table_name)
        self.db.executeSQL(sql)
        print "TRUNCATE: Truncated temp table..."

    def populate_temp_table(self, tempTable, sequence, numMappers, table_name, primary_id, primary_value):
        # Copy one partition (or the whole table) into the temp table, assigning
        # rowId = sequence.nextval % numMappers so sqoop mappers split evenly
        columns = self.get_temp_table_column(table_name)
        if primary_id:
            sql = ("insert into " + self.config['DW_DB'] + "." + tempTable + "(rowId, " + columns + ") "
                   + "select " + sequence + ".nextval % ? , " + columns
                   + " from " + self.config['DW_DB'] + "." + table_name
                   + " where " + primary_id + " = ?")
            num_rows = self.db.executeSQLArgs(sql, (numMappers, primary_value))
        else:
            sql = ("insert into " + self.config['DW_DB'] + "." + tempTable + "(rowId, " + columns + ") "
                   + "select " + sequence + ".nextval % ? , " + columns
                   + " from " + self.config['DW_DB'] + "." + table_name)
            num_rows = self.db.executeSQLArgs(sql, (numMappers,))
        print sql
        self.db.commit()
        self.logger.info("RUNNING: %s", sql)

    def backup_status(self, table_name):
        # Returns "true"/"false" strings (consumed by the calling shell driver)
        sql = ("select 1 from sqoop_etl_status where is_complete='f' and table_name='"
               + table_name + "' order by primary_partition_value LIMIT 1")
        value = self.db.retrieveSQL(sql)
        if not value:
            return "false"
        return "true"

    def get_table_column(self, table_name):
        # Get the column names of the given table
        sql = ("select column_name from v_catalog.columns where table_name = ? "
               "and table_schema='" + self.config['DW_DB'] + "'")
        cols = self.db.retrieveSQLArgs(sql, (table_name,))
        columns = ','.join(str(v[0]) for v in cols)
        print columns
        return columns

    def get_table_column_with_type(self, table_name):
        # Get "name type" pairs for the given table's columns
        sql = ("select column_name || ' ' || data_type from v_catalog.columns "
               "where table_name = ? and table_schema='" + self.config['DW_DB'] + "'")
        cols = self.db.retrieveSQLArgs(sql, (table_name,))
        columns = ','.join(str(v[0]) for v in cols)
        print columns
        return columns

    def generate_status_table(self, table_name):
        # Seed sqoop_etl_status with one row per distinct partition value, skipping
        # the newest SKIP_LATEST_MONTHS partitions; tables without a PRIMARY_ID get
        # a single NULL-partition row
        sql = "select export_objects('','" + self.config['DW_DB'] + "." + table_name + "')"
        table_statement = self.db.retrieveSQL(sql)
        table_statement = table_statement[0][0].split(";")[0]  # NOTE: fetched but unused below
        primary_id = self.config['PRIMARY_ID']
        skip_last_months = self.config['SKIP_LATEST_MONTHS']
        sql = "insert into sqoop_etl_status "
        if primary_id == "":
            primary_id = "NULL"
            sql += (" select '" + table_name + "','" + self.config['HIVE_DB'] + "',CAST('"
                    + str(datetime.datetime.now()) + "' AS timestamp),null," + primary_id + ","
                    + primary_id + ",'f'," + self.config['NUM_MAPPERS']
                    + " WHERE not exists(select 1 from sqoop_etl_status where table_name='" + table_name + "')")
        else:
            sql += (" select '" + table_name + "','" + self.config['HIVE_DB'] + "','"
                    + str(datetime.datetime.now()) + "',null,'" + primary_id + "'," + primary_id
                    + ",'f'," + self.config['NUM_MAPPERS']
                    + " from (select " + primary_id + ", row_number() over(order by " + primary_id
                    + " desc) rn from (select distinct " + primary_id + " from "
                    + self.config['DW_DB'] + "." + table_name + " ) C group by 1) A where rn>"
                    + skip_last_months
                    + " and not exists(select 1 from sqoop_etl_status where primary_partition_value=" + primary_id + ");")
        self.logger.info("RUNNING: %s", sql)
        self.db.executeSQL(sql)
        self.db.commit()

    def get_temp_table_column(self, table_name):
        # Get the column names of the given table (used when populating the temp table)
        sql = ("select column_name from v_catalog.columns where table_name = ? "
               "and table_schema='" + self.config['DW_DB'] + "'")
        cols = self.db.retrieveSQLArgs(sql, (table_name,))
        columns = ','.join(str(v[0]) for v in cols)
        return columns

    def create_external_table(self, table_name):
        # Build CREATE EXTERNAL TABLE ... AS COPY FROM '<hdfs>/<table>/*' ON ANY NODE ORC
        table_statement = self.get_table_column_with_type(table_name) + " )"
        table_statement += " as copy from '" + self.env['HDFS_STORAGE_HOST']
        table_statement += table_name + "/*' ON ANY NODE ORC"
        table_statement = ("CREATE EXTERNAL TABLE " + self.config['DW_DB'] + "." + table_name
                           + "_ext ( " + table_statement)
        self.db.executeSQL(table_statement)
        self.logger.info("RUNNING: %s", table_statement)
        print "CREATE: Created external table %s_ext..." % (table_name)

    def get_secondary_value(self, sec_table_name):
        sql = "select secondary_id from " + sec_table_name + " limit 1"
        secondary_value = self.db.retrieveSQL(sql)
        if secondary_value:
            print "SELECT: The secondary id that will be backed up next is %s" % (secondary_value[0][0])
        else:
            print "SELECT: Completed all secondary ids"
        return secondary_value
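# Usage sketch (illustrative): one backup iteration for a single table. `config` and
# `logger` are assumed to come from the surrounding driver; 'impressions_fact' and
# 'row_seq' (a Vertica sequence) are hypothetical names, and the actual sqoop export
# would run between the populate and update steps.
if __name__ == '__main__':
    st = sqoop_table('impressions_fact', config, logger)
    st.generate_status_table('impressions_fact')
    if st.backup_status('impressions_fact') == 'true':
        primary_id = st.get_primary_column('impressions_fact')[0][0]
        primary_value = str(st.get_primary_value('impressions_fact')[0][0])
        st.create_temp_table('impressions_fact_tmp', 'impressions_fact')
        st.populate_temp_table('impressions_fact_tmp', 'row_seq',
                               config['NUM_MAPPERS'], 'impressions_fact',
                               primary_id, primary_value)
        # ... run the sqoop job over impressions_fact_tmp here ...
        st.update_backup_status('impressions_fact', primary_id, primary_value)
        st.truncate_table('impressions_fact_tmp')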