def get_record(self):
    """Fetch this table's row from the it-table.

    Selects the row of ``self.it_table`` whose ``source_table_name``,
    ``source_database_name`` and ``db_env`` equal ``self.table_name``,
    ``self.database`` and ``self.db_env``. If the first attempt fails with
    a socket error, the error is printed and the query is run once more.

    Returns the first row of the query result.
    Raises ValueError when the query result is empty/falsy.
    """
    template = ("SELECT * FROM {0} WHERE "
                "source_table_name='{1}' AND "
                "source_database_name='{2}' AND "
                "db_env='{3}';")
    sql = template.format(self.it_table, self.table_name,
                          self.database, self.db_env)

    def fetch():
        # Single place that issues the select, shared by first try and retry.
        return ImpalaConnect.run_query(self.it_table_host, self.it_table,
                                       sql, op='select')

    try:
        rows = fetch()
    except socket.error as err:
        # One retry only; a second socket error propagates to the caller.
        print("Socket error({0}): {1}".format(err.errno, err.strerror))
        print('Trying again...')
        rows = fetch()

    if rows:
        return rows[0]
    raise ValueError("Failed to fetch results for query: {0}".format(sql))
def get_source_row_count(self):
    """Return the row count of ``self.table`` as an int.

    Runs ``SELECT COUNT(*)`` against ``self.table`` through
    ``ImpalaConnect.run_query`` on ``self.host_name`` and converts the first
    column of the first result row to ``int``.
    """
    sql = "SELECT COUNT(*) FROM {0};".format(self.table)
    result = ImpalaConnect.run_query(self.host_name, self.table, sql)
    first_row = result[0]
    return int(first_row[0])
def get_schema(self):
    """Describe the it-table.

    Runs ``describe <self.it_table>;`` on ``self.it_table_host`` and returns
    the query result unchanged.

    Raises ValueError when the query result is empty/falsy.
    """
    sql = "describe {0};".format(self.it_table)
    rows = ImpalaConnect.run_query(self.it_table_host, self.it_table,
                                   sql, op='select')
    if rows:
        return rows
    raise ValueError("Failed to fetch results for query: {0}".format(sql))
def get_ddl(self):
    """Query the table's DDL and convert it to column properties.

    Builds the query with ``self.build_ddl_query()``, runs it against
    ``self.table`` on ``self.host_name`` and returns whatever
    ``self.set_column_props`` produces from the raw query output.
    """
    ddl_sql = self.build_ddl_query()
    raw_output = ImpalaConnect.run_query(self.host_name, self.table, ddl_sql)
    # NOTE: source and target ddl output are not ordered; sorting the result
    # by column_name was once done here but is currently disabled.
    return self.set_column_props(raw_output)
def gen_view_hql(target_db, domain=None):
    """Build the HQL script that (re)creates ``target_db``'s view or table.

    ``self``, ``rpc_method`` and ``create_hql`` are not parameters; they are
    read from the surrounding scope.

    The target object is ``<target_db>.<clean_database>_<clean_table_name>``
    and the source is ``<self.domain>.<clean_database>_<clean_table_name>``.
    Three cases are handled:

    * ``self.ibis_env == 'PERF'`` and the part of ``target_db`` before
      ``'_non_phi'`` is NOT listed in the comma separated
      ``self.domain_list``: the target database/table is probed through
      Impala. If it exists, its newest ``ingest_timestamp`` and the row from
      ``self.freq_ingest`` decide whether a refresh script is produced; if
      not, an empty string is returned. Any ValueError raised while probing
      (including "Database/Table not found" and an unparsable timestamp)
      falls back to a script that drops and recreates the table.
    * ``self.ibis_env == 'PERF'`` and the domain IS listed: a plain view
      over the source is created.
    * any other environment: an external parquet table is created from
      ``create_hql``, followed by the optional ADD PARTITION statement and
      ``msck repair``.

    Args:
        target_db: name of the database that receives the view/table.
        domain: optional team database; when truthy, a team view
            ``<domain>.<clean_database>_<clean_table_name>`` is created too.

    Returns:
        The HQL script as a string; ``''`` when no refresh is due.

    Raises:
        LookupError: the frequency read from ``self.freq_ingest`` is not one
            of the supported names.
    """
    # Days between refreshes for each supported frequency name.
    freq_days = {
        'none': 0,
        'daily': 1,
        'weekly': 7,
        'biweekly': 14,
        'fortnightly': 15,
        'monthly': 30,
        'quarterly': 90,
        'yearly': 364,
    }

    if domain is not None:
        team_view_name = '{team_nm}.{team_db}_{team_table}'.format(
            team_nm=domain, team_db=self.clean_database,
            team_table=self.clean_table_name)
    view_full_name = '{view_name}.{database}_{table_name}'.format(
        view_name=target_db, database=self.clean_database,
        table_name=self.clean_table_name)
    src_view = '{src_vw_name}.{src_db}_{src_tbl_name}'.format(
        src_vw_name=self.domain, src_db=self.clean_database,
        src_tbl_name=self.clean_table_name)

    phi_domain = target_db.split('_non_phi')[0]
    domain_is_listed = phi_domain in self.domain_list.split(",")

    # Default to no ADD PARTITION statement so an rpc_method other than
    # 'incremental' / 'full_load' cannot leave this name unbound.
    add_partition_hql = ''
    if rpc_method == 'full_load':
        add_partition_hql = (
            "ALTER TABLE `{view_full_name}` ADD PARTITION "
            "(incr_ingest_timestamp='full_{ingest_date}');\n\n")
        add_partition_hql = add_partition_hql.format(
            view_full_name=view_full_name,
            ingest_date=self.partition_name)

    if self.ibis_env == 'PERF' and not domain_is_listed:
        try:
            try:
                ImpalaConnect.run_query(
                    self.host_name, view_full_name,
                    'USE {db}'.format(db=view_full_name.split(".")[0]),
                    'create')
                is_table_available = ImpalaConnect.run_query(
                    self.host_name, view_full_name,
                    "SHOW TABLES LIKE '{table_name}'".format(
                        table_name=view_full_name.split(".")[1]))
                # Equality, not identity: `is ''` only works by accident of
                # string interning.
                if is_table_available is None or is_table_available == '':
                    raise ValueError("Table not found")
            except Exception:
                # Any probing failure is reported uniformly; the outer
                # handler turns it into the drop-and-recreate script.
                raise ValueError("Database/Table not found")

            max_ingest_query = ('select max(ingest_timestamp) from '
                                '{view_full_name}')
            max_ingest_query = max_ingest_query.format(
                view_full_name=view_full_name)
            max_trgt_ingest = ImpalaConnect.run_query(
                self.host_name, view_full_name, max_ingest_query)

            freq_query = ("select * from {freq_ingest} where"
                          " view_name='{view_nm_splt}' and "
                          "full_table_name='{table_name}'")
            freq_query = freq_query.format(
                view_nm_splt=target_db, table_name=src_view,
                freq_ingest=self.freq_ingest)
            team_run_freq = ImpalaConnect.run_query(
                self.host_name, self.freq_ingest, freq_query)
            frequency = team_run_freq[0][0]

            # A ValueError from an unparsable timestamp is handled below.
            max_ingest = datetime.datetime.strptime(
                max_trgt_ingest[0][0], '%Y-%m-%d %H:%M:%S')
            # Whole seconds only, matching the precision of max_ingest.
            curr_date = datetime.datetime.now().replace(microsecond=0)
            last_ingest_day = (curr_date - max_ingest).days

            if frequency not in freq_days:
                # Deliberately not a ValueError: an unsupported frequency
                # must not trigger the drop-and-recreate fallback.
                raise LookupError(
                    "Unsupported ingest frequency: {0!r}".format(frequency))
            team_freq = freq_days[frequency]

            if team_freq != 0 and last_ingest_day >= team_freq and \
                    team_run_freq[0][1].lower() == 'yes':
                if domain:
                    select_query = "select * from {team_view_name}".format(
                        team_view_name=team_view_name)
                    views_hql = (
                        'DROP VIEW IF EXISTS {view_full_name};\n'
                        'CREATE DATABASE IF NOT EXISTS {view_name};'
                        '\n'
                        'CREATE DATABASE IF NOT EXISTS {team_name};'
                        '\n'
                        'DROP VIEW IF EXISTS {team_view_name};\n'
                        'DROP TABLE IF EXISTS {team_view_name};\n'
                        'CREATE VIEW {team_view_name} AS '
                        'SELECT * FROM {src_view}; \n'
                        'CREATE TABLE IF NOT EXISTS {view_full_name} '
                        'like {src_view};\n\n')
                    views_hql = views_hql.format(
                        view_full_name=view_full_name,
                        view_name=target_db,
                        src_view=src_view,
                        team_view_name=team_view_name,
                        team_name=domain)
                else:
                    select_query = "select * from {src_view}".format(
                        src_view=src_view)
                    # NOTE(review): this script already contains an INSERT
                    # OVERWRITE and another one is appended below - confirm
                    # the double insert is intended.
                    views_hql = (
                        'DROP VIEW IF EXISTS {view_full_name};\n'
                        'CREATE DATABASE IF NOT EXISTS {view_name};'
                        '\n'
                        'CREATE TABLE IF NOT EXISTS {view_full_name} '
                        'like {src_view};\n\n'
                        'INSERT OVERWRITE TABLE {view_full_name} '
                        'PARTITION(incr_ingest_timestamp) '
                        'select * from {src_view} ;\n\n')
                    views_hql = views_hql.format(
                        view_full_name=view_full_name,
                        view_name=target_db,
                        src_view=src_view)

                if rpc_method == 'incremental':
                    # Only rows newer than what the target already holds.
                    insert_statement = (
                        "INSERT OVERWRITE TABLE {view_full_name} "
                        "PARTITION(incr_ingest_timestamp) "
                        "{select_query} where "
                        "ingest_timestamp > '{maxtime}';\n\n")
                    insert_statement = insert_statement.format(
                        view_full_name=view_full_name,
                        maxtime=max_trgt_ingest[0][0],
                        select_query=select_query)
                else:
                    insert_statement = (
                        'INSERT OVERWRITE TABLE {view_full_name} '
                        'PARTITION(incr_ingest_timestamp) '
                        '{select_query};\n\n')
                    insert_statement = insert_statement.format(
                        view_full_name=view_full_name,
                        select_query=select_query)

                drop_hql = (
                    'DROP TABLE IF EXISTS {view_full_name};\n\n')
                views_hql = drop_hql.format(
                    view_full_name=view_full_name) + views_hql
                views_hql += insert_statement
                views_hql += "msck repair table {0};\n\n".format(
                    view_full_name)
            else:
                print('No views hql created')
                views_hql = ''
        except ValueError as ex:
            # Target missing or unusable: rebuild it from the source.
            print(str(ex))
            if domain:
                views_hql = (
                    'DROP VIEW IF EXISTS {view_full_name};\n'
                    'DROP TABLE IF EXISTS {view_full_name};\n\n'
                    'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                    'CREATE DATABASE IF NOT EXISTS {team_name};\n'
                    'DROP VIEW IF EXISTS {team_view_name};\n'
                    'DROP TABLE IF EXISTS {team_view_name};\n'
                    'CREATE VIEW {team_view_name} AS '
                    'SELECT * FROM {src_view}; \n'
                    'CREATE TABLE {view_full_name} '
                    'like {src_view};\n\n'
                    'INSERT OVERWRITE TABLE {view_full_name} '
                    'PARTITION(incr_ingest_timestamp) '
                    'select * from {team_view_name} ;\n\n')
                views_hql = views_hql.format(
                    view_full_name=view_full_name,
                    view_name=target_db,
                    src_view=src_view,
                    team_view_name=team_view_name,
                    team_name=domain)
            else:
                views_hql = (
                    'DROP VIEW IF EXISTS {view_full_name};\n'
                    'DROP TABLE IF EXISTS {view_full_name};\n\n'
                    'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                    'CREATE TABLE {view_full_name} '
                    'like {src_view};\n\n'
                    'INSERT OVERWRITE TABLE {view_full_name} '
                    'PARTITION(incr_ingest_timestamp) '
                    'select * from {src_view} ;\n\n')
                views_hql = views_hql.format(
                    view_full_name=view_full_name,
                    view_name=target_db,
                    src_view=src_view)
            views_hql += "msck repair table `{0}`;\n\n".format(
                view_full_name)
    elif self.ibis_env == 'PERF' and domain_is_listed:
        views_hql = ('DROP VIEW IF EXISTS {view_full_name};\n'
                     'DROP TABLE IF EXISTS {view_full_name};\n'
                     'CREATE VIEW {view_full_name} AS '
                     'SELECT * FROM {src_view}; \n')
        views_hql = views_hql.format(view_full_name=view_full_name,
                                     src_view=src_view)
    else:
        views_hql = (
            'DROP VIEW IF EXISTS `{view_full_name}`;\n'
            'DROP TABLE IF EXISTS `{view_full_name}`;\n\n'
            'CREATE DATABASE IF NOT EXISTS `{view_name}`;\n'
            'CREATE EXTERNAL TABLE `{view_full_name}` ('
            '{create_column_hql},\n `ingest_timestamp` string)\n'
            'partitioned by (incr_ingest_timestamp string)\n'
            "stored as parquet location 'hdfs://"
            "/user/data/{target_dir}/live/';\n\n")
        views_hql = views_hql.format(view_full_name=view_full_name,
                                     view_name=target_db,
                                     create_column_hql=create_hql,
                                     target_dir=self.target_dir)
        views_hql += add_partition_hql
        views_hql += "msck repair table `{0}`;\n\n".format(
            view_full_name)
    return views_hql