Example #1
    def requires(self):
        yield (
            ExternalHiveTask(table='student_courseenrollment',
                             database=hive_database_name()),
            ExternalHiveTask(table='auth_user', database=hive_database_name()),
            ExternalHiveTask(table='last_country_of_user',
                             database=hive_database_name()),
        )
    def query(self):
        query = """
        USE {database_name};
        INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec})
        SELECT
            act.course_id as course_id,
            CONCAT(cal.iso_week_start, ' 00:00:00') as interval_start,
            CONCAT(cal.iso_week_end, ' 00:00:00') as interval_end,
            act.category as label,
            COUNT(DISTINCT username) as count
        FROM user_activity act
        JOIN calendar cal
            ON act.`date` = cal.`date` AND act.dt >= "{interval_start}" AND act.dt < "{interval_end}"
        WHERE
            "{interval_start}" <= cal.`date` AND cal.`date` < "{interval_end}"
        GROUP BY
            act.course_id,
            cal.iso_week_start,
            cal.iso_week_end,
            act.category;
        """.format(
            database_name=hive_database_name(),
            table=self.hive_table_task.table,
            partition=self.partition,
            interval_start=self.interval.date_a.isoformat(),
            interval_end=self.interval.date_b.isoformat(),
        )

        return query
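For context, here is a minimal sketch (not the pipeline's actual classes) of how the {partition.query_spec} placeholder above is resolved: str.format follows attribute access on the object passed in, so any object exposing a query_spec attribute works. FakePartition, weekly_activity, and the date value below are hypothetical.

import textwrap
from collections import namedtuple

# Hypothetical stand-in for the task's partition object; only the
# query_spec attribute is assumed here.
FakePartition = namedtuple('FakePartition', ['query_spec'])

partition = FakePartition(query_spec="dt = '2016-01-01'")
query = textwrap.dedent("""
    INSERT OVERWRITE TABLE weekly_activity PARTITION ({partition.query_spec})
    SELECT ...;
""").strip().format(partition=partition)

print(query)
# INSERT OVERWRITE TABLE weekly_activity PARTITION (dt = '2016-01-01')
# SELECT ...;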
    def query(self):

        query_format = textwrap.dedent("""
            USE {database_name};
            DROP TABLE IF EXISTS {table_name};
            CREATE EXTERNAL TABLE {table_name} (
                `date` STRING,
                course_id STRING,
                country_code STRING,
                count INT,
                cumulative_count INT
            )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
            LOCATION '{location}';

            INSERT OVERWRITE TABLE {table_name}
            SELECT
                sce.dt,
                sce.course_id,
                uc.country_code,
                sum(if(sce.is_active, 1, 0)),
                count(sce.user_id)
            FROM student_courseenrollment sce
            LEFT OUTER JOIN last_country_of_user_id uc on sce.user_id = uc.user_id
            GROUP BY sce.dt, sce.course_id, uc.country_code;
        """)

        query = query_format.format(
            database_name=hive_database_name(),
            location=self.table_location,
            table_name=self.table,
        )
        log.debug('Executing hive query: %s', query)
        return query
    def query(self):

        query_format = textwrap.dedent("""
            USE {database_name};
            DROP TABLE IF EXISTS {table_name};
            CREATE EXTERNAL TABLE {table_name} (
                date STRING,
                course_id STRING,
                country_code STRING,
                count INT
            )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
            LOCATION '{location}';

            INSERT OVERWRITE TABLE course_enrollment_location_current
            SELECT
                sce.dt,
                sce.course_id,
                uc.country_code,
                count(sce.user_id)
            FROM student_courseenrollment sce
            LEFT OUTER JOIN auth_user au on sce.user_id = au.id
            LEFT OUTER JOIN last_country_of_user uc on au.username = uc.username
            WHERE sce.is_active > 0
            GROUP BY sce.dt, sce.course_id, uc.country_code;
        """)

        query = query_format.format(
            database_name=hive_database_name(),
            location=self.output().path,
            table_name='course_enrollment_location_current',
        )

        log.debug('Executing hive query: %s', query)
        return query
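A side note on the templates in these snippets: textwrap.dedent strips only the whitespace common to every non-blank line, which is why the SQL inside the triple-quoted strings is kept at one uniform indentation. A minimal standalone illustration, with made-up database and table names:

import textwrap

raw = """
    USE my_database;
    SELECT count(*) FROM some_table;
"""
# dedent() strips only the indentation common to every non-blank line,
# leaving the SQL flush-left for Hive.
print(textwrap.dedent(raw))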
    def query(self):
        query = """
        USE {database_name};
        INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec})
        SELECT
            act.course_id as course_id,
            CONCAT(cal.iso_week_start, ' 00:00:00') as interval_start,
            CONCAT(cal.iso_week_end, ' 00:00:00') as interval_end,
            act.category as label,
            COUNT(DISTINCT user_id) as count
        FROM user_activity_by_user act
        JOIN calendar cal
            ON act.`date` = cal.`date` AND act.dt >= "{interval_start}" AND act.dt < "{interval_end}"
        WHERE
            "{interval_start}" <= cal.`date` AND cal.`date` < "{interval_end}"
        GROUP BY
            act.course_id,
            cal.iso_week_start,
            cal.iso_week_end,
            act.category;
        """.format(
            database_name=hive_database_name(),
            table=self.hive_table_task.table,
            partition=self.partition,
            interval_start=self.interval.date_a.isoformat(),
            interval_end=self.interval.date_b.isoformat(),
        )

        return query
Example #7
    def query(self):

        query_format = textwrap.dedent("""
            USE {database_name};
            DROP TABLE IF EXISTS {table_name};
            CREATE EXTERNAL TABLE {table_name} (
                date STRING,
                course_id STRING,
                country_code STRING,
                count INT
            )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
            LOCATION '{location}';

            INSERT OVERWRITE TABLE course_enrollment_location_current
            SELECT
                sce.dt,
                sce.course_id,
                uc.country_code,
                count(sce.user_id)
            FROM student_courseenrollment sce
            LEFT OUTER JOIN auth_user au on sce.user_id = au.id
            LEFT OUTER JOIN last_country_of_user uc on au.username = uc.username
            WHERE sce.is_active > 0
            GROUP BY sce.dt, sce.course_id, uc.country_code;
        """)

        query = query_format.format(
            database_name=hive_database_name(),
            location=self.output().path,
            table_name='course_enrollment_location_current',
        )

        log.debug('Executing hive query: %s', query)
        return query
Example #8
    def query(self):  # pragma: no cover
        full_insert_query = """
            USE {database_name};
            INSERT INTO TABLE {table}
            PARTITION ({partition.query_spec})
            {insert_query};
        """.format(
            database_name=hive_database_name(),
            table=self.partition_task.hive_table_task.table,
            partition=self.partition,
            insert_query=self.insert_query.strip(),  # pylint: disable=no-member
        )
        return textwrap.dedent(full_insert_query)
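The same pattern as above, sketched standalone with hypothetical values (database, table, partition spec, and insert_query are all made up); it also shows why the inner SQL indentation matters, since dedent() runs only after format():

import textwrap

# Standalone sketch of the pattern above; the real database, table, and
# insert_query come from the task's configuration.
template = """
            USE {database_name};
            INSERT INTO TABLE {table}
            PARTITION ({partition_spec})
            {insert_query};
        """

rendered = template.format(
    database_name='analytics',
    table='course_activity',
    partition_spec="dt = '2017-02-01'",
    insert_query='SELECT course_id, count(1) FROM events GROUP BY course_id',
)

# dedent() runs after format(); with a single-line insert_query the template's
# common indentation survives and is stripped cleanly here.
print(textwrap.dedent(rendered))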
Example #10
    def query(self):
        query = """
            USE {database_name};
            INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec}) {if_not_exists}
            SELECT
                pr.course_id,
                pr.answer_id,
                pr.problem_id,
                pr.problem,
                pr.username,
                pr.question,
                pr.score,
                pr.max_score,
                pr.correct,
                pr.answer,
                pr.total_attempts,
                pr.first_attempt_date,
                pr.last_attempt_date,
                CONCAT(COALESCE(cb.course_path, '{deleted_blocks_path}'), '{path_delimiter}', pr.problem) as location,
                COALESCE(cb.sort_idx, -1) as sort_idx
            FROM {problem_response_table} pr
            LEFT OUTER JOIN {course_blocks_table} cb
                ON (cb.block_id=pr.problem_id and cb.{course_blocks_partition})
            WHERE pr.{problem_response_partition}
            ORDER BY pr.course_id, sort_idx, pr.first_attempt_date
        """.format(
            database_name=hive_database_name(),
            table=self.hive_table_task.table,
            partition=self.partition,
            path_delimiter=self.path_delimiter,
            deleted_blocks_path=self.deleted_blocks_path,
            if_not_exists='' if self.overwrite else 'IF NOT EXISTS',
            problem_response_table=self.problem_response_partition.hive_table_task.table,
            problem_response_partition="{}='{}'".format(
                self.problem_response_partition.hive_table_task.partition_by,
                self.problem_response_partition.partition_value),
            course_blocks_table=self.course_blocks_partition.hive_table_task.table,
            course_blocks_partition="{}='{}'".format(
                self.course_blocks_partition.hive_table_task.partition_by,
                self.course_blocks_partition.partition_value),
        )

        query = textwrap.dedent(query)
        log.debug('query: %s', query)
        return query
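The problem_response_partition and course_blocks_partition placeholders expand to plain equality predicates on each table's partition column; a minimal sketch with hypothetical partition metadata:

# Hypothetical partition metadata; the real values come from the upstream
# partition tasks referenced above.
partition_by = 'dt'
partition_value = '2017-02-01'

predicate = "{}='{}'".format(partition_by, partition_value)
print(predicate)  # dt='2017-02-01'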
Example #11
    def query(self):
        # TODO: Figure out how to clean up old data. This just cleans
        # out old metastore info, and doesn't actually remove the table
        # data.

        # Ensure there is exactly one available partition in the
        # table. Don't keep historical partitions since we don't want
        # to commit to taking snapshots at any regular interval. They
        # will happen when/if they need to happen.  Table snapshots
        # should *not* be used for analyzing trends, instead we should
        # rely on events or database tables that keep historical
        # information.
        query_format = textwrap.dedent("""
            USE {database_name};
            DROP TABLE IF EXISTS `{table_name}`;
            CREATE EXTERNAL TABLE `{table_name}` (
                {col_spec}
            )
            PARTITIONED BY (dt STRING)
            {table_format}
            LOCATION '{location}';
            ALTER TABLE `{table_name}` ADD PARTITION (dt = '{partition_date}');
        """)

        query = query_format.format(
            database_name=hive_database_name(),
            table_name=self.table_name,
            col_spec=','.join([
                '`{}` {}'.format(name, col_type)
                for name, col_type in self.columns
            ]),
            location=self.table_location,
            table_format=self.table_format,
            partition_date=self.partition_date,
        )

        log.debug('Executing hive query: %s', query)

        # Mark the output as having been removed, even though
        # that doesn't technically happen until the query has been
        # executed (and in particular that the 'DROP TABLE' is executed).
        log.info("Marking existing output as having been removed for task %s",
                 str(self))
        self.attempted_removal = True

        return query
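The col_spec placeholder is assembled from the task's (name, type) column pairs; a standalone sketch with a made-up schema:

# Made-up schema; the real column pairs come from self.columns.
columns = [('course_id', 'STRING'), ('user_id', 'INT'), ('dt', 'STRING')]

col_spec = ','.join('`{}` {}'.format(name, col_type) for name, col_type in columns)
print(col_spec)
# `course_id` STRING,`user_id` INT,`dt` STRING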
    def query(self):
        # TODO: Figure out how to clean up old data. This just cleans
        # out old metastore info, and doesn't actually remove the table
        # data.

        # Ensure there is exactly one available partition in the
        # table. Don't keep historical partitions since we don't want
        # to commit to taking snapshots at any regular interval. They
        # will happen when/if they need to happen.  Table snapshots
        # should *not* be used for analyzing trends, instead we should
        # rely on events or database tables that keep historical
        # information.
        query_format = textwrap.dedent(
            """
            USE {database_name};
            DROP TABLE IF EXISTS {table_name};
            CREATE EXTERNAL TABLE {table_name} (
                {col_spec}
            )
            PARTITIONED BY (dt STRING)
            {table_format}
            LOCATION '{location}';
            ALTER TABLE {table_name} ADD PARTITION (dt = '{partition_date}');
        """
        )

        query = query_format.format(
            database_name=hive_database_name(),
            table_name=self.table_name,
            col_spec=",".join([" ".join(c) for c in self.columns]),
            location=self.table_location,
            table_format=self.table_format,
            partition_date=self.partition_date,
        )

        log.debug("Executing hive query: %s", query)

        # Mark the output as having been removed, even though
        # that doesn't technically happen until the query has been
        # executed (and in particular that the 'DROP TABLE' is executed).
        log.info("Marking existing output as having been removed for task %s", str(self))
        self.attempted_removal = True

        return query
    def query(self):
        query = """
            USE {database_name};
            INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec}) {if_not_exists}
            SELECT
                pr.course_id,
                pr.answer_id,
                pr.problem_id,
                pr.problem,
                pr.username,
                pr.question,
                pr.score,
                pr.max_score,
                pr.correct,
                pr.answer,
                pr.total_attempts,
                pr.first_attempt_date,
                pr.last_attempt_date,
                CONCAT(COALESCE(cb.course_path, '{deleted_blocks_path}'), '{path_delimiter}', pr.problem) as location,
                COALESCE(cb.sort_idx, -1) as sort_idx
            FROM {problem_response_table} pr
            LEFT OUTER JOIN {course_blocks_table} cb
                ON (cb.block_id=pr.problem_id and cb.{course_blocks_partition})
            WHERE pr.{problem_response_partition}
            ORDER BY pr.course_id, sort_idx, pr.first_attempt_date
        """.format(
            database_name=hive_database_name(),
            table=self.hive_table_task.table,
            partition=self.partition,
            path_delimiter=self.path_delimiter,
            deleted_blocks_path=self.deleted_blocks_path,
            if_not_exists='' if self.overwrite else 'IF NOT EXISTS',
            problem_response_table=self.problem_response_partition.hive_table_task.table,
            problem_response_partition="{}='{}'".format(self.problem_response_partition.hive_table_task.partition_by,
                                                        self.problem_response_partition.partition_value),
            course_blocks_table=self.course_blocks_partition.hive_table_task.table,
            course_blocks_partition="{}='{}'".format(self.course_blocks_partition.hive_table_task.partition_by,
                                                     self.course_blocks_partition.partition_value),
        )

        query = textwrap.dedent(query)
        log.debug('query: %s', query)
        return query
Example #14
    def query(self):
        query = """
        USE {database_name};
        INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec})
        SELECT
            au.id,
            ua.course_id,
            ua.`date`,
            ua.category,
            ua.count
        FROM auth_user au
        JOIN user_activity ua
            ON au.username = ua.username;
        """.format(
            database_name=hive_database_name(),
            table=self.hive_table_task.table,
            partition=self.partition,
        )

        return query
    def query(self):
        query = """
        USE {database_name};
        DROP TABLE IF EXISTS {table_name};
        CREATE EXTERNAL TABLE {table_name} (
            program_id STRING,
            program_type STRING,
            program_title STRING,
            catalog_course STRING,
            catalog_course_title STRING,
            course_id STRING,
            org_id STRING,
            partner_short_code STRING,
            program_slot_number INT
        )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        LOCATION '{location}';

        INSERT OVERWRITE TABLE {table_name}
        SELECT
            p.program_id,
            p.program_type,
            p.program_title,
            p.catalog_course,
            p.catalog_course_title,
            p.course_id,
            p.org_id,
            p.partner_short_code,
            o.program_slot_number
        FROM program_course p
        LEFT JOIN program_course_order o ON p.program_id = o.program_id AND p.catalog_course = o.catalog_course;
        """.format(
            database_name=hive_database_name(),
            location=self.table_location,
            table_name=self.table,
        )
        log.debug('Executing hive query: %s', query)
        return query
    def requires(self):
        yield (
            ExternalHiveTask(table='student_courseenrollment', database=hive_database_name()),
            ExternalHiveTask(table='auth_user', database=hive_database_name()),
            ExternalHiveTask(table='last_country_of_user', database=hive_database_name()),
        )
    def output(self):
        return HivePartitionTarget(
            self.table_name, self.partition, database=hive_database_name(), fail_missing_table=False
        )
    def output(self):
        return HivePartitionTarget(self.table_name,
                                   self.partition,
                                   database=hive_database_name(),
                                   fail_missing_table=False)