Exemple #1
0
    def get_record(self):
        """Get it-table record"""
        query = ("SELECT * FROM {0} WHERE "
                 "source_table_name='{1}' AND "
                 "source_database_name='{2}' AND "
                 "db_env='{3}';")
        query = query.format(self.it_table, self.table_name, self.database,
                             self.db_env)
        row = None
        try:
            row = ImpalaConnect.run_query(self.it_table_host,
                                          self.it_table,
                                          query,
                                          op='select')
        except socket.error as err:
            print "Socket error({0}): {1}".format(err.errno, err.strerror)
            print 'Trying again...'
            row = ImpalaConnect.run_query(self.it_table_host,
                                          self.it_table,
                                          query,
                                          op='select')

        if not row:
            err_msg = "Failed to fetch results for query: {0}"
            err_msg = err_msg.format(query)
            raise ValueError(err_msg)

        return row[0]
Exemple #2
0
 def get_source_row_count(self):
     """Fetch row count of source table.
     Method to query the hive db and find the row count for each table
     in the source list.
     Returns a dictionary with key as table name and value as the row count
     """
     row_count = -1
     count_query = "SELECT COUNT(*) FROM {0};".format(self.table)
     output = ImpalaConnect.run_query(self.host_name, self.table,
                                      count_query)
     row_count = output[0][0]
     return int(row_count)
Exemple #3
0
 def get_schema(self):
     """Get it-table schema"""
     query = "describe {0};".format(self.it_table)
     row = ImpalaConnect.run_query(self.it_table_host,
                                   self.it_table,
                                   query,
                                   op='select')
     if not row:
         err_msg = "Failed to fetch results for query: {0}"
         err_msg = err_msg.format(query)
         raise ValueError(err_msg)
     return row
Exemple #4
0
 def get_ddl(self):
     """Query hive database for column name, data type, and column count.
     Returns a dictionary of key as table name and values as column
     name and datatype
     """
     ddl_list = []
     ddl_query = self.build_ddl_query()
     output = ImpalaConnect.run_query(self.host_name, self.table, ddl_query)
     # print 'output', output
     ddl_list = self.set_column_props(output)
     # sort the list of ddl objects as source and target ddl output
     # are not ordered
     # ddl_list.sort(key=lambda ddl: ddl.column_name)
     return ddl_list
Exemple #5
0
        def gen_view_hql(target_db, domain=None):
            """creates view hql"""
            if domain is not None:
                team_view_name = '{team_nm}.{team_db}_{team_table}'
                team_view_name = team_view_name.format(
                    team_nm=domain,
                    team_db=self.clean_database,
                    team_table=self.clean_table_name)

            view_full_name = '{view_name}.{database}_{table_name}'
            view_full_name = view_full_name.format(
                view_name=target_db,
                database=self.clean_database,
                table_name=self.clean_table_name)
            src_view = '{src_vw_name}.{src_db}_{src_tbl_name}'
            src_view = src_view.format(src_vw_name=self.domain,
                                       src_db=self.clean_database,
                                       src_tbl_name=self.clean_table_name)

            phi_domain = target_db.split('_non_phi')[0]
            only_domain = self.domain_list
            only_domain = only_domain.split(",")
            only_domain = set(only_domain).intersection([phi_domain])
            only_domain = list(only_domain)

            if rpc_method == 'incremental':
                add_partition_hql = ''
            elif rpc_method == 'full_load':
                add_partition_hql = (
                    "ALTER TABLE `{view_full_name}` ADD PARTITION "
                    "(incr_ingest_timestamp='full_{ingest_date}');\n\n")
                add_partition_hql = add_partition_hql.format(
                    view_full_name=view_full_name,
                    ingest_date=self.partition_name)

            if self.ibis_env == 'PERF' and len(only_domain) < 1:
                try:
                    try:
                        ImpalaConnect.run_query(
                            self.host_name, view_full_name,
                            'USE {db}'.format(db=view_full_name.split(".")[0]),
                            'create')
                        is_table_available = ImpalaConnect.run_query(
                            self.host_name, view_full_name,
                            "SHOW TABLES LIKE '{table_name}'".format(
                                table_name=view_full_name.split(".")[1]))

                        if is_table_available is None or \
                                is_table_available is '':
                            raise ValueError("Table not found")
                    except:
                        raise ValueError("Database/Table not found")

                    max_trgt_ingest = ('select max(ingest_timestamp) from '
                                       '{view_full_name}')
                    max_trgt_ingest = max_trgt_ingest.format(
                        view_full_name=view_full_name)
                    max_trgt_ingest = ImpalaConnect.run_query(
                        self.host_name, view_full_name, max_trgt_ingest)

                    team_run_freq = ("select * from {freq_ingest} where"
                                     " view_name='{view_nm_splt}' and "
                                     "full_table_name='{table_name}'")
                    team_run_freq = team_run_freq.format(
                        view_nm_splt=target_db,
                        table_name=src_view,
                        freq_ingest=self.freq_ingest)
                    team_run_freq = ImpalaConnect.run_query(
                        self.host_name, self.freq_ingest, team_run_freq)
                    frequency = team_run_freq[0][0]

                    if frequency == 'none':
                        team_freq = 0
                    elif frequency == 'daily':
                        team_freq = 1
                    elif frequency == 'weekly':
                        team_freq = 7
                    elif frequency == 'biweekly':
                        team_freq = 14
                    elif frequency == 'fortnightly':
                        team_freq = 15
                    elif frequency == 'monthly':
                        team_freq = 30
                    elif frequency == 'quarterly':
                        team_freq = 90
                    elif frequency == 'yearly':
                        team_freq = 364

                    max_ingest = datetime.datetime.strptime(
                        max_trgt_ingest[0][0], '%Y-%m-%d %H:%M:%S')
                    curr_date_str = datetime.datetime.now().strftime(
                        "%Y-%m-%d %H:%M:%S")
                    curr_date = datetime.datetime.strptime(
                        curr_date_str, '%Y-%m-%d %H:%M:%S')
                    last_ingest_day = (curr_date - max_ingest).days
                    if team_freq != 0 and last_ingest_day >= team_freq and \
                            team_run_freq[0][1].lower() == 'yes':
                        if domain:
                            select_query = "select * from {team_view_name}".\
                                format(team_view_name=team_view_name)
                            views_hql = (
                                'DROP VIEW IF EXISTS {view_full_name};\n'
                                'CREATE DATABASE IF NOT EXISTS {view_name};'
                                '\n'
                                'CREATE DATABASE IF NOT EXISTS {team_name};'
                                '\n'
                                'DROP VIEW IF EXISTS {team_view_name};\n'
                                'DROP TABLE IF EXISTS {team_view_name};\n'
                                'CREATE VIEW  {team_view_name} AS '
                                'SELECT * FROM {src_view}; \n'
                                'CREATE TABLE IF NOT EXISTS {view_full_name} '
                                'like {src_view};\n\n')
                            views_hql = views_hql.format(
                                view_full_name=view_full_name,
                                view_name=target_db,
                                src_view=src_view,
                                team_view_name=team_view_name,
                                team_name=domain)

                        else:
                            select_query = "select * from {src_view}".format(
                                src_view=src_view)
                            views_hql = (
                                'DROP VIEW IF EXISTS {view_full_name};\n'
                                'CREATE DATABASE IF NOT EXISTS {view_name};'
                                '\n'
                                'CREATE  TABLE IF NOT EXISTS {view_full_name} '
                                'like {src_view};\n\n'
                                'INSERT OVERWRITE TABLE {view_full_name} '
                                'PARTITION(incr_ingest_timestamp) '
                                'select * from {src_view} ;\n\n')

                            views_hql = views_hql.format(
                                view_full_name=view_full_name,
                                view_name=target_db,
                                src_view=src_view)

                        if rpc_method == 'incremental':
                            insert_statement = (
                                "INSERT OVERWRITE TABLE {view_full_name} "
                                "PARTITION(incr_ingest_timestamp) "
                                "{select_query} where "
                                "ingest_timestamp > '{maxtime}';\n\n")
                            insert_statement = insert_statement.format(
                                view_full_name=view_full_name,
                                maxtime=max_trgt_ingest[0][0],
                                select_query=select_query)
                        else:
                            insert_statement = (
                                'INSERT OVERWRITE TABLE {view_full_name} '
                                'PARTITION(incr_ingest_timestamp) '
                                '{select_query};\n\n')
                            insert_statement = insert_statement.format(
                                view_full_name=view_full_name,
                                select_query=select_query)

                            drop_hql = (
                                'DROP TABLE IF EXISTS {view_full_name};\n\n')

                            views_hql = drop_hql.format(
                                view_full_name=view_full_name) + views_hql
                        views_hql += insert_statement

                        views_hql += "msck repair table {0};\n\n".format(
                            view_full_name)

                    else:
                        print 'No views hql created'
                        views_hql = ''

                except ValueError as ex:
                    print str(ex)
                    if domain:
                        views_hql = (
                            'DROP VIEW IF EXISTS {view_full_name};\n'
                            'DROP TABLE IF EXISTS {view_full_name};\n\n'
                            'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                            'CREATE DATABASE IF NOT EXISTS {team_name};\n'
                            'DROP VIEW IF EXISTS {team_view_name};\n'
                            'DROP TABLE IF EXISTS {team_view_name};\n'
                            'CREATE VIEW  {team_view_name} AS '
                            'SELECT * FROM {src_view}; \n'
                            'CREATE  TABLE {view_full_name} '
                            'like {src_view};\n\n'
                            'INSERT OVERWRITE TABLE {view_full_name} '
                            'PARTITION(incr_ingest_timestamp) '
                            'select * from {team_view_name} ;\n\n')
                        views_hql = views_hql.format(
                            view_full_name=view_full_name,
                            view_name=target_db,
                            src_view=src_view,
                            team_view_name=team_view_name,
                            team_name=domain)
                    else:
                        views_hql = (
                            'DROP VIEW IF EXISTS {view_full_name};\n'
                            'DROP TABLE IF EXISTS {view_full_name};\n\n'
                            'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                            'CREATE  TABLE {view_full_name} '
                            'like {src_view};\n\n'
                            'INSERT OVERWRITE TABLE {view_full_name} '
                            'PARTITION(incr_ingest_timestamp) '
                            'select * from {src_view} ;\n\n')

                        views_hql = views_hql.format(
                            view_full_name=view_full_name,
                            view_name=target_db,
                            src_view=src_view)
                    views_hql += "msck repair table `{0}`;\n\n".format(
                        view_full_name)

            elif self.ibis_env == 'PERF' and len(only_domain) > 0:
                views_hql = ('DROP VIEW IF EXISTS {view_full_name};\n'
                             'DROP TABLE IF EXISTS {view_full_name};\n'
                             'CREATE VIEW {view_full_name} AS '
                             'SELECT * FROM {src_view}; \n')
                views_hql = views_hql.format(view_full_name=view_full_name,
                                             src_view=src_view)
            else:

                views_hql = (
                    'DROP VIEW IF EXISTS `{view_full_name}`;\n'
                    'DROP TABLE IF EXISTS `{view_full_name}`;\n\n'
                    'CREATE DATABASE IF NOT EXISTS `{view_name}`;\n'
                    'CREATE EXTERNAL TABLE `{view_full_name}` ('
                    '{create_column_hql},\n `ingest_timestamp` string)\n'
                    'partitioned by (incr_ingest_timestamp string)\n'
                    "stored as parquet location 'hdfs://"
                    "/user/data/{target_dir}/live/';\n\n")

                views_hql = views_hql.format(view_full_name=view_full_name,
                                             view_name=target_db,
                                             create_column_hql=create_hql,
                                             target_dir=self.target_dir)

                views_hql += add_partition_hql
                views_hql += "msck repair table `{0}`;\n\n".format(
                    view_full_name)
            return views_hql