def insert_hive(self):
    """Insert buffered logs to Hive."""
    global DEBUG
    log_txt = '\n'.join(self.buffer_logs)
    _log_txt = self._escape(log_txt.strip())
    _now = datetime.datetime.now()
    _time = _now.strftime("%Y-%m-%d %H:%M:%S.%f")
    format_data = {
        'failed_table_name': self.source_table.strip(),
        'log': _log_txt,
        'log_time': _time,
        'status': self.qa_status_log
    }
    with open(LOG_FILE, 'wb') as file_h:
        file_h.write(log_txt)
    # File-based insert operation
    self.pyhdfs.insert_update('', self.prepares_data(format_data),
                              is_append=True)
    # We need to perform INVALIDATE METADATA to reflect Hive
    # file-based operations in Impala.
    ImpalaConnect.invalidate_metadata(self.host_name,
                                      'ibis.qa_export_results')
    msg = '\n' + '#' * 100
    msg += ("\nFor QA results: Run query in Impala: "
            "select status, * from ibis.qa_export_results where "
            "log_time='{0}'")
    msg += '\n' + '#' * 100
    msg = msg.format(_time)
    print msg
    logging.info('Inserted logs to Hive')
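# A minimal sketch of what an INVALIDATE METADATA helper like
# ImpalaConnect.invalidate_metadata typically does, assuming an impyla-style
# connection on the default Impala port; the real implementation may differ
# (connection pooling, Kerberos auth, retries).
from impala.dbapi import connect


def invalidate_metadata_sketch(host, table_name):
    """Ask Impala to re-read Hive metadata for a table written outside Impala."""
    conn = connect(host=host, port=21050)  # 21050: default impalad HS2 port
    try:
        cursor = conn.cursor()
        cursor.execute('INVALIDATE METADATA {0}'.format(table_name))
    finally:
        conn.close()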
def get_record(self):
    """Get it-table record."""
    query = ("SELECT * FROM {0} WHERE "
             "source_table_name='{1}' AND "
             "source_database_name='{2}' AND "
             "db_env='{3}';")
    query = query.format(self.it_table, self.table_name,
                         self.database, self.db_env)
    row = None
    try:
        row = ImpalaConnect.run_query(self.it_table_host, self.it_table,
                                      query, op='select')
    except socket.error as err:
        print "Socket error({0}): {1}".format(err.errno, err.strerror)
        print 'Trying again...'
        row = ImpalaConnect.run_query(self.it_table_host, self.it_table,
                                      query, op='select')
    if not row:
        err_msg = "Failed to fetch results for query: {0}"
        err_msg = err_msg.format(query)
        raise ValueError(err_msg)
    return row[0]
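# A minimal sketch of a bounded-retry variant of the single retry above.
# run_query_with_retry is a hypothetical helper, not part of this module;
# it assumes the same ImpalaConnect.run_query signature used here.
import socket
import time


def run_query_with_retry(host, table, query, attempts=3, wait_secs=5):
    """Run an Impala query, retrying a few times on transient socket errors."""
    for attempt in range(1, attempts + 1):
        try:
            return ImpalaConnect.run_query(host, table, query, op='select')
        except socket.error as err:
            print "Socket error({0}): {1}".format(err.errno, err.strerror)
            if attempt == attempts:
                raise
            print 'Retrying in {0} seconds...'.format(wait_secs)
            time.sleep(wait_secs)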
def update_checks_balances(self, workflow, start_time=None):
    """Given a workflow app name, query the Oozie API for the workflow job
    and update the checks_balances table with its records.
    """
    if workflow:
        for record in self.get_records(workflow):
            # Update records in the checks_balances table.
            # Parquet size depends on the live location size in HDFS.
            # Records are only replaced in live if the ingest succeeded;
            # otherwise the last ingest records are kept.
            self.update_old_cb_table(record.domain, record.table_name,
                                     record)
        # We need to perform INVALIDATE METADATA to reflect Hive
        # file-based operations in Impala.
        ImpalaConnect.invalidate_metadata(self.host,
                                          'ibis.checks_balances_export')
def main():
    global logger
    args = {
        'source_database_name': sys.argv[1],
        'source_table_name': sys.argv[2],
        'database': sys.argv[3],
        'target_table': sys.argv[4],
        'jars': sys.argv[5],
        'jdbc_url': sys.argv[6],
        'connection_factories': sys.argv[7],
        'user_name': sys.argv[8],
        'jceks': sys.argv[9],
        'password_alias': sys.argv[10],
        'host_name': os.environ['IMPALA_HOST'],
        'target_schema': sys.argv[12],
        'oozie_url': sys.argv[13],
        'workflow_name': sys.argv[14],
        'qa_exp_results_dir': sys.argv[16]
    }
    DEBUG = sys.argv[15] == 'true'
    cb_mgr = ChecksBalancesExportManager(args['host_name'], args['oozie_url'],
                                         args['qa_exp_results_dir'])
    actions = cb_mgr.check_if_workflow_actions(args['workflow_name'])
    # Initialize so new_action is defined even if no export action matches
    new_action = None
    for action in actions:
        if sys.argv[2] + '_export' in action.get_name():
            new_action = action
    source_table = args['source_database_name'] + '.' + \
        args['source_table_name']
    logger = LogHandler(args['host_name'], source_table,
                        args['qa_exp_results_dir'])
    try:
        target_table = args['database'] + '.' + args['target_table']
        val_obj = TableValidation(source_table, target_table, args)
        bool_status = val_obj.start_data_sampling(new_action)
        logger.insert_hive()
        exit_code = 0 if bool_status else 1
    except Exception:
        exit_code = 1
        logger.error(traceback.format_exc())
        logger.insert_hive()
    ImpalaConnect.close_conn()
    sys.exit(exit_code)
def get_source_row_count(self):
    """Fetch the row count of the source table.

    Queries the Hive database for the source table's row count and
    returns it as an int.
    """
    count_query = "SELECT COUNT(*) FROM {0};".format(self.table)
    output = ImpalaConnect.run_query(self.host_name, self.table,
                                     count_query)
    row_count = output[0][0]
    return int(row_count)
def get_schema(self):
    """Get it-table schema."""
    query = "describe {0};".format(self.it_table)
    row = ImpalaConnect.run_query(self.it_table_host, self.it_table,
                                  query, op='select')
    if not row:
        err_msg = "Failed to fetch results for query: {0}"
        err_msg = err_msg.format(query)
        raise ValueError(err_msg)
    return row
def main():
    global logger
    args = {
        'source_database_name': sys.argv[1],
        'source_table_name': sys.argv[2],
        'database': sys.argv[3],
        'target_table': sys.argv[4],
        'jars': sys.argv[5],
        'jdbc_url': sys.argv[6],
        'connection_factories': sys.argv[7],
        'user_name': sys.argv[8],
        'jceks': sys.argv[9],
        'password_alias': sys.argv[10],
        'host_name': os.environ['IMPALA_HOST'],
        'target_schema': sys.argv[11],
        'qa_exp_results_dir': sys.argv[13]
    }
    DEBUG = sys.argv[12] == 'true'
    source_table = args['source_database_name'] + '.' + \
        args['source_table_name']
    logger = LogHandler(source_table, args['host_name'],
                        args['qa_exp_results_dir'])
    try:
        target_table = args['database'] + '.' + args['target_table']
        val_obj = TableValidation(source_table, target_table, args)
        bool_status_list = val_obj.start_data_sampling()
        logger.insert_hive()
        exit_code = 0 if bool_status_list[0] else 1
        if 'db2' in args['jdbc_url']:
            # DB2 exports require all three sampling checks to pass
            exit_code = 0 if (bool_status_list[0] and bool_status_list[1] and
                              bool_status_list[2]) else 1
    except Exception:
        exit_code = 1
        logger.error(traceback.format_exc())
        logger.insert_hive()
    ImpalaConnect.close_conn()
    sys.exit(exit_code)
def get_ddl(self):
    """Query the Hive database for column name, data type, and column count.

    Returns a list of ddl objects holding column name and data type.
    """
    ddl_query = self.build_ddl_query()
    output = ImpalaConnect.run_query(self.host_name, self.table, ddl_query)
    ddl_list = self.set_column_props(output)
    # Sort the list of ddl objects, as source and target ddl output
    # are not ordered:
    # ddl_list.sort(key=lambda ddl: ddl.column_name)
    return ddl_list
            job_numbers[action.get_name()] = action.get_external_id()
        sub_actions = ""
        if normal_or_sub:
            sub_workflow_to_update = job_numbers[max(job_numbers.keys())]
            id_externals = self.ooz.get_job(sub_workflow_to_update)
            workflow = id_externals[1]
            sub_actions = id_externals[1].get_actions()
        else:
            workflow = workflow_job
        return workflow

    def check_if_workflow_actions(self, app_name):
        """Fetch the actions of the workflow job for the given app name."""
        workflow_job = self.get_workflow_job(app_name)
        actions = workflow_job.get_actions()
        print "actions", actions
        return actions


if __name__ == "__main__":
    # Application name
    app_name = sys.argv[1]
    HOST = os.environ['IMPALA_HOST']
    OOZIE_URL = sys.argv[2]
    CHK_BAL_EXP_DIR = sys.argv[3]
    export_cb_mgr = ChecksBalancesExportManager(
        HOST, OOZIE_URL, CHK_BAL_EXP_DIR)
    app_name = export_cb_mgr.check_if_workflow(app_name)
    export_cb_mgr.update_checks_balances(app_name)
    ImpalaConnect.close_conn()
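# Example invocation (hypothetical host, URL, and paths; the script name is
# assumed), with IMPALA_HOST exported in the environment:
#
#   export IMPALA_HOST=impala-gateway.example.com
#   python checks_balances_export.py <workflow_app_name> \
#       http://oozie-host:11000/oozie /user/dev/checks_balances_export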
def gen_view_hql(target_db, domain=None):
    """Creates view hql."""
    if domain is not None:
        team_view_name = '{team_nm}.{team_db}_{team_table}'
        team_view_name = team_view_name.format(
            team_nm=domain, team_db=self.clean_database,
            team_table=self.clean_table_name)
    view_full_name = '{view_name}.{database}_{table_name}'
    view_full_name = view_full_name.format(
        view_name=target_db, database=self.clean_database,
        table_name=self.clean_table_name)
    src_view = '{src_vw_name}.{src_db}_{src_tbl_name}'
    src_view = src_view.format(src_vw_name=self.domain,
                               src_db=self.clean_database,
                               src_tbl_name=self.clean_table_name)
    phi_domain = target_db.split('_non_phi')[0]
    only_domain = self.domain_list
    only_domain = only_domain.split(",")
    only_domain = set(only_domain).intersection([phi_domain])
    only_domain = list(only_domain)

    if rpc_method == 'incremental':
        add_partition_hql = ''
    elif rpc_method == 'full_load':
        add_partition_hql = (
            "ALTER TABLE `{view_full_name}` ADD PARTITION "
            "(incr_ingest_timestamp='full_{ingest_date}');\n\n")
        add_partition_hql = add_partition_hql.format(
            view_full_name=view_full_name,
            ingest_date=self.partition_name)

    if self.ibis_env == 'PERF' and len(only_domain) < 1:
        try:
            try:
                ImpalaConnect.run_query(
                    self.host_name, view_full_name,
                    'USE {db}'.format(db=view_full_name.split(".")[0]),
                    'create')
                is_table_available = ImpalaConnect.run_query(
                    self.host_name, view_full_name,
                    "SHOW TABLES LIKE '{table_name}'".format(
                        table_name=view_full_name.split(".")[1]))
                if is_table_available is None or \
                        is_table_available == '':
                    raise ValueError("Table not found")
            except Exception:
                raise ValueError("Database/Table not found")

            max_trgt_ingest = ('select max(ingest_timestamp) from '
                               '{view_full_name}')
            max_trgt_ingest = max_trgt_ingest.format(
                view_full_name=view_full_name)
            max_trgt_ingest = ImpalaConnect.run_query(
                self.host_name, view_full_name, max_trgt_ingest)
            team_run_freq = ("select * from {freq_ingest} where"
                             " view_name='{view_nm_splt}' and "
                             "full_table_name='{table_name}'")
            team_run_freq = team_run_freq.format(
                view_nm_splt=target_db, table_name=src_view,
                freq_ingest=self.freq_ingest)
            team_run_freq = ImpalaConnect.run_query(
                self.host_name, self.freq_ingest, team_run_freq)

            # Map the configured refresh frequency to a number of days
            frequency = team_run_freq[0][0]
            if frequency == 'none':
                team_freq = 0
            elif frequency == 'daily':
                team_freq = 1
            elif frequency == 'weekly':
                team_freq = 7
            elif frequency == 'biweekly':
                team_freq = 14
            elif frequency == 'fortnightly':
                team_freq = 15
            elif frequency == 'monthly':
                team_freq = 30
            elif frequency == 'quarterly':
                team_freq = 90
            elif frequency == 'yearly':
                team_freq = 364

            max_ingest = datetime.datetime.strptime(
                max_trgt_ingest[0][0], '%Y-%m-%d %H:%M:%S')
            curr_date_str = datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")
            curr_date = datetime.datetime.strptime(
                curr_date_str, '%Y-%m-%d %H:%M:%S')
            last_ingest_day = (curr_date - max_ingest).days

            if team_freq != 0 and last_ingest_day >= team_freq and \
                    team_run_freq[0][1].lower() == 'yes':
                if domain:
                    select_query = "select * from {team_view_name}".format(
                        team_view_name=team_view_name)
                    views_hql = (
                        'DROP VIEW IF EXISTS {view_full_name};\n'
                        'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                        'CREATE DATABASE IF NOT EXISTS {team_name};\n'
                        'DROP VIEW IF EXISTS {team_view_name};\n'
                        'DROP TABLE IF EXISTS {team_view_name};\n'
                        'CREATE VIEW {team_view_name} AS '
                        'SELECT * FROM {src_view}; \n'
                        'CREATE TABLE IF NOT EXISTS {view_full_name} '
                        'like {src_view};\n\n')
                    views_hql = views_hql.format(
                        view_full_name=view_full_name,
                        view_name=target_db,
                        src_view=src_view,
                        team_view_name=team_view_name,
                        team_name=domain)
                else:
                    select_query = "select * from {src_view}".format(
                        src_view=src_view)
                    views_hql = (
                        'DROP VIEW IF EXISTS {view_full_name};\n'
                        'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                        'CREATE TABLE IF NOT EXISTS {view_full_name} '
                        'like {src_view};\n\n'
                        'INSERT OVERWRITE TABLE {view_full_name} '
                        'PARTITION(incr_ingest_timestamp) '
                        'select * from {src_view} ;\n\n')
                    views_hql = views_hql.format(
                        view_full_name=view_full_name,
                        view_name=target_db,
                        src_view=src_view)

                if rpc_method == 'incremental':
                    insert_statement = (
                        "INSERT OVERWRITE TABLE {view_full_name} "
                        "PARTITION(incr_ingest_timestamp) "
                        "{select_query} where "
                        "ingest_timestamp > '{maxtime}';\n\n")
                    insert_statement = insert_statement.format(
                        view_full_name=view_full_name,
                        maxtime=max_trgt_ingest[0][0],
                        select_query=select_query)
                else:
                    insert_statement = (
                        'INSERT OVERWRITE TABLE {view_full_name} '
                        'PARTITION(incr_ingest_timestamp) '
                        '{select_query};\n\n')
                    insert_statement = insert_statement.format(
                        view_full_name=view_full_name,
                        select_query=select_query)

                drop_hql = (
                    'DROP TABLE IF EXISTS {view_full_name};\n\n')
                views_hql = drop_hql.format(
                    view_full_name=view_full_name) + views_hql
                views_hql += insert_statement
                views_hql += "msck repair table {0};\n\n".format(
                    view_full_name)
            else:
                print 'No views hql created'
                views_hql = ''
        except ValueError as ex:
            print str(ex)
            if domain:
                views_hql = (
                    'DROP VIEW IF EXISTS {view_full_name};\n'
                    'DROP TABLE IF EXISTS {view_full_name};\n\n'
                    'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                    'CREATE DATABASE IF NOT EXISTS {team_name};\n'
                    'DROP VIEW IF EXISTS {team_view_name};\n'
                    'DROP TABLE IF EXISTS {team_view_name};\n'
                    'CREATE VIEW {team_view_name} AS '
                    'SELECT * FROM {src_view}; \n'
                    'CREATE TABLE {view_full_name} '
                    'like {src_view};\n\n'
                    'INSERT OVERWRITE TABLE {view_full_name} '
                    'PARTITION(incr_ingest_timestamp) '
                    'select * from {team_view_name} ;\n\n')
                views_hql = views_hql.format(
                    view_full_name=view_full_name,
                    view_name=target_db,
                    src_view=src_view,
                    team_view_name=team_view_name,
                    team_name=domain)
            else:
                views_hql = (
                    'DROP VIEW IF EXISTS {view_full_name};\n'
                    'DROP TABLE IF EXISTS {view_full_name};\n\n'
                    'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                    'CREATE TABLE {view_full_name} '
                    'like {src_view};\n\n'
                    'INSERT OVERWRITE TABLE {view_full_name} '
                    'PARTITION(incr_ingest_timestamp) '
                    'select * from {src_view} ;\n\n')
                views_hql = views_hql.format(
                    view_full_name=view_full_name,
                    view_name=target_db,
                    src_view=src_view)
            views_hql += "msck repair table `{0}`;\n\n".format(
                view_full_name)
    elif self.ibis_env == 'PERF' and len(only_domain) > 0:
        views_hql = ('DROP VIEW IF EXISTS {view_full_name};\n'
                     'DROP TABLE IF EXISTS {view_full_name};\n'
                     'CREATE VIEW {view_full_name} AS '
                     'SELECT * FROM {src_view}; \n')
        views_hql = views_hql.format(view_full_name=view_full_name,
                                     src_view=src_view)
    else:
        views_hql = (
            'DROP VIEW IF EXISTS `{view_full_name}`;\n'
            'DROP TABLE IF EXISTS `{view_full_name}`;\n\n'
            'CREATE DATABASE IF NOT EXISTS `{view_name}`;\n'
            'CREATE EXTERNAL TABLE `{view_full_name}` ('
            '{create_column_hql},\n `ingest_timestamp` string)\n'
            'partitioned by (incr_ingest_timestamp string)\n'
            "stored as parquet location 'hdfs://"
            "/user/data/{target_dir}/live/';\n\n")
        views_hql = views_hql.format(view_full_name=view_full_name,
                                     view_name=target_db,
                                     create_column_hql=create_hql,
                                     target_dir=self.target_dir)
        views_hql += add_partition_hql
        views_hql += "msck repair table `{0}`;\n\n".format(
            view_full_name)
    return views_hql
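# A minimal sketch of an alternative to the if/elif frequency mapping above,
# assuming the same labels stored in the freq_ingest table. Unknown labels
# fall back to 0 (treated like 'none') instead of leaving team_freq unset.
FREQUENCY_DAYS = {
    'none': 0,
    'daily': 1,
    'weekly': 7,
    'biweekly': 14,
    'fortnightly': 15,
    'monthly': 30,
    'quarterly': 90,
    'yearly': 364,
}

team_freq = FREQUENCY_DAYS.get(frequency, 0)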