Exemple #1
0
 def table(self):
     """Render the detail page for a single metastore table.

     The table name is read from the ``table`` request argument and looked
     up through a ``HiveMetastoreHook`` bound to ``METASTORE_CONN_ID``.
     """
     requested_name = request.args.get("table")
     metastore = HiveMetastoreHook(METASTORE_CONN_ID)
     template_context = dict(
         table=metastore.get_table(requested_name),
         table_name=requested_name,
         # ``datetime`` and ``int`` are handed to the template as well.
         datetime=datetime,
         int=int,
     )
     return self.render("metastore_browser/table.html", **template_context)
Exemple #2
0
def max_partition(
        table, schema="default", field=None, filter=None,
        metastore_conn_id='metastore_default'):
    '''
    Returns the max partition of a Hive table, as reported by
    ``HiveMetastoreHook.max_partition``.

    :param table: The hive table you are interested in. Dot notation as in
        "my_database.my_table" is supported; when a dot is present the
        ``schema`` argument is ignored in favour of the prefix
    :type table: string
    :param schema: The hive schema the table lives in
    :type schema: string
    :param field: the field to get the max value from, forwarded to the
        hook (which infers it when there is a single partition field)
    :param filter: filter on a subset of partition as in
        `sub_part='specific_value'`
    :type filter: string
    :param metastore_conn_id: The metastore connection to use. If your
        default is set you don't need to use this parameter.
    :type metastore_conn_id: string

    >>> max_partition('airflow.static_babynames_partitioned')
    '2015-01-01'
    '''
    from airflow.hooks import HiveMetastoreHook

    schema_name, table_name = schema, table
    if '.' in table_name:
        # "db.tbl" overrides whatever schema was passed in.
        schema_name, table_name = table_name.split('.')

    lookup = dict(
        schema=schema_name, table_name=table_name,
        field=field, filter=filter)
    hook = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    return hook.max_partition(**lookup)
Exemple #3
0
def max_partition(table,
                  schema="default",
                  field=None,
                  filter=None,
                  metastore_conn_id='metastore_default'):
    '''
    Looks up the max partition of a Hive table through the metastore hook.

    :param table: The hive table you are interested in, supports the dot
        notation as in "my_database.my_table"; if a dot is found,
        the schema param is disregarded
    :type table: string
    :param schema: The hive schema the table lives in
    :type schema: string
    :param field: the field to get the max value from; passed through to
        the hook, which infers it when there is only one partition field
    :param filter: filter on a subset of partition as in
        `sub_part='specific_value'`
    :type filter: string
    :param metastore_conn_id: The metastore connection you are interested
        in. If your default is set you don't need to use this parameter.
    :type metastore_conn_id: string

    >>> max_partition('airflow.static_babynames_partitioned')
    '2015-01-01'
    '''
    from airflow.hooks import HiveMetastoreHook

    if '.' in table:
        qualified = table.split('.')
        schema, table = qualified
    metastore = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    newest = metastore.max_partition(
        schema=schema,
        table_name=table,
        field=field,
        filter=filter,
    )
    return newest
Exemple #4
0
    def execute(self, context):
        '''
        Stages the result of ``self.sql`` in a Hive table and loads it
        into Druid.

        The query is materialised through the Hive CLI as an uncompressed,
        tab-delimited text table named ``druid.<task_instance_key_str>``.
        Column names and the storage location are then read from the
        metastore and handed to ``DruidHook.load_from_hdfs``.

        :param context: task context; only
            ``context['task_instance_key_str']`` is read here
        '''
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str']
        # Drop surrounding whitespace and semicolons; the template below
        # supplies its own terminator.
        sql = self.sql.strip().strip(';')
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE AS
        {sql};
        """.format(hive_table=hive_table, sql=sql)
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)
        # A single metastore lookup serves both the column list and the
        # storage location.
        storage = m.get_table(hive_table).sd

        columns = [col.name for col in storage.cols]

        hdfs_uri = storage.location
        pos = hdfs_uri.find('/user')
        # str.find returns -1 on a miss; slicing from -1 would keep only
        # the last character, so fall back to the unmodified location.
        static_path = hdfs_uri[pos:] if pos >= 0 else hdfs_uri

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        druid.load_from_hdfs(datasource=self.druid_datasource,
                             intervals=self.intervals,
                             static_path=static_path,
                             ts_dim=self.ts_dim,
                             columns=columns,
                             metric_spec=self.metric_spec)
        logging.info("Load seems to have succeeded!")
Exemple #5
0
    def execute(self, context):
        '''
        Loads the Hive table ``druid.<task_instance_key_str>`` into Druid.

        The CREATE TABLE AS statement for ``self.sql`` is built, but its
        execution is disabled in this version (see the review note below).
        Column names and the storage location are read from the metastore
        and handed to ``DruidHook.load_from_hdfs``.

        :param context: task context; only
            ``context['task_instance_key_str']`` is read here
        '''
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str']
        sql = self.sql.strip().strip(';')
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE AS
        {sql};
        """.format(hive_table=hive_table, sql=sql)
        # NOTE(review): the statement above is never run, so the metastore
        # lookup below depends on the table already existing. Confirm
        # whether the call was disabled on purpose before re-enabling it.
        # hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)
        # Fetch the table once and reuse it for columns and location.
        storage = m.get_table(hive_table).sd

        columns = [col.name for col in storage.cols]

        hdfs_uri = storage.location
        pos = hdfs_uri.find('/user')
        # A miss yields -1, which would slice off everything but the last
        # character; keep the whole location in that case.
        static_path = hdfs_uri[pos:] if pos >= 0 else hdfs_uri

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path, ts_dim=self.ts_dim,
            columns=columns, metric_spec=self.metric_spec)
        logging.info("Load seems to have succeeded!")
Exemple #6
0
 def table(self):
     """Show the metastore browser page for one table.

     Reads the table name from the ``table`` request argument, fetches the
     table through ``HiveMetastoreHook`` and renders
     ``metastore_browser/table.html`` with it.
     """
     name = request.args.get("table")
     hook = HiveMetastoreHook(METASTORE_CONN_ID)
     table_obj = hook.get_table(name)
     return self.render(
         "metastore_browser/table.html",
         table=table_obj, table_name=name, datetime=datetime, int=int)
    def execute(self, context):
        '''
        Stages the result of ``self.sql`` in a temporary Hive table, loads
        it into Druid and drops the temporary table afterwards.

        The query is materialised through the Hive CLI as an uncompressed,
        tab-delimited text table named ``druid.<task_instance_key_str>``
        whose NULL representation is the empty string. Column names and
        the storage location are read from the metastore and handed to
        ``DruidHook.load_from_hdfs``.

        :param context: task context; only
            ``context['task_instance_key_str']`` is read here
        '''
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str']
        sql = self.sql.strip().strip(';')
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = '')
        AS
        {sql}
        """.format(hive_table=hive_table, sql=sql)
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)
        # One metastore lookup serves both the column list and location.
        storage = m.get_table(hive_table).sd

        columns = [col.name for col in storage.cols]

        hdfs_uri = storage.location
        pos = hdfs_uri.find('/user')
        # str.find returns -1 on a miss; slicing from -1 would keep only
        # the last character, so fall back to the unmodified location.
        static_path = hdfs_uri[pos:] if pos >= 0 else hdfs_uri

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        logging.info("HDFS path: " + static_path)

        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path, ts_dim=self.ts_dim,
            columns=columns, metric_spec=self.metric_spec,
            hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
        logging.info("Load seems to have succeeded!")

        logging.info(
            "Cleaning up by dropping the temp "
            "Hive table {}".format(hive_table))
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        # Actually execute the cleanup statement; building it alone would
        # leave the temporary table behind.
        hive.run_cli(hql)
Exemple #8
0
    def execute(self, context):
        '''
        Runs ``self.sql`` into a temporary Hive table, ingests that table
        into Druid and then drops it.

        The staging table is named ``druid.<task_instance_key_str>`` and is
        written through the Hive CLI as uncompressed, tab-delimited text
        with the empty string as NULL representation. Its column names and
        storage location come from the metastore and are passed to
        ``DruidHook.load_from_hdfs``.

        :param context: task context; only
            ``context['task_instance_key_str']`` is read here
        '''
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str']
        sql = self.sql.strip().strip(';')
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = '')
        AS
        {sql}
        """.format(hive_table=hive_table, sql=sql)
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)
        # Fetch the table once and reuse it for columns and location.
        storage = m.get_table(hive_table).sd

        columns = [col.name for col in storage.cols]

        hdfs_uri = storage.location
        pos = hdfs_uri.find('/user')
        # A miss yields -1, which would slice off everything but the last
        # character; keep the whole location in that case.
        static_path = hdfs_uri[pos:] if pos >= 0 else hdfs_uri

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        logging.info("HDFS path: " + static_path)

        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path,
            ts_dim=self.ts_dim,
            columns=columns,
            metric_spec=self.metric_spec,
            hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
        logging.info("Load seems to have succeeded!")

        logging.info("Cleaning up by dropping the temp "
                     "Hive table {}".format(hive_table))
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        # Run the cleanup statement; without this call the temporary
        # table announced above is never dropped.
        hive.run_cli(hql)
Exemple #9
0
def closest_ds_partition(
        table, ds, before=True, schema="default",
        metastore_conn_id='metastore_default'):
    '''
    Finds the partition date of a Hive table that is closest to ``ds``.

    The first value of every partition returned by the metastore is
    considered. If ``ds`` itself is among them it is returned unchanged;
    otherwise the values are parsed as ``%Y-%m-%d`` dates and the choice is
    delegated to ``_closest_date``.

    :param table: A hive table name; "schema.table" dot notation overrides
        the ``schema`` argument
    :type table: str
    :param ds: A datestamp ``%Y-%m-%d`` e.g. ``yyyy-mm-dd``
    :type ds: str
    :param before: closest before (True), after (False) or either side of ds
    :type before: bool or None
    :param schema: The hive schema the table lives in
    :type schema: str
    :param metastore_conn_id: The metastore connection to use
    :type metastore_conn_id: str
    :returns: The closest date, or None when the table has no partitions
    :rtype: str or None

    >>> tbl = 'airflow.static_babynames_partitioned'
    >>> closest_ds_partition(tbl, '2015-01-02')
    '2015-01-01'
    '''
    from airflow.hooks import HiveMetastoreHook

    schema_name, table_name = schema, table
    if '.' in table_name:
        schema_name, table_name = table_name.split('.')

    hook = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    partitions = hook.get_partitions(
        schema=schema_name, table_name=table_name)
    if not partitions:
        return None

    # Only the first value of each partition spec is looked at.
    first_values = [list(spec.values())[0] for spec in partitions]
    if ds in first_values:
        return ds

    stamp_format = '%Y-%m-%d'
    candidates = []
    for value in first_values:
        candidates.append(datetime.datetime.strptime(value, stamp_format))
    target_dt = datetime.datetime.strptime(ds, stamp_format)
    nearest = _closest_date(target_dt, candidates, before_target=before)
    return nearest.isoformat()
Exemple #10
0
def closest_ds_partition(
        table, ds, before=True, schema="default",
        metastore_conn_id='metastore_default'):
    '''
    This function finds the partition date of a table closest to the
    target date. An optional parameter can be given to get the closest
    before or after.

    :param table: A hive table name; "schema.table" dot notation overrides
        the ``schema`` argument
    :type table: str
    :param ds: A datestamp ``%Y-%m-%d`` i.e. ``yyyy-mm-dd``
    :type ds: str
    :param before: closest before (True), after (False) or either side of ds
    :type before: bool or None
    :param schema: The hive schema the table lives in
    :type schema: str
    :param metastore_conn_id: The metastore connection to use
    :type metastore_conn_id: str
    :returns: The closest date, or None when the table has no partitions
    :rtype: str or None

    >>> tbl = 'airflow.static_babynames_partitioned'
    >>> closest_ds_partition(tbl, '2015-01-02')
    '2015-01-01'
    '''
    from airflow.hooks import HiveMetastoreHook
    if '.' in table:
        schema, table = table.split('.')
    hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    partitions = hh.get_partitions(schema=schema, table_name=table)
    if not partitions:
        return None
    # dict.values() is a non-subscriptable view on Python 3, so materialise
    # it before taking the first value (works on Python 2 as well).
    part_vals = [list(p.values())[0] for p in partitions]
    if ds in part_vals:
        return ds
    else:
        parts = [datetime.datetime.strptime(pv, '%Y-%m-%d')
                 for pv in part_vals]
        target_dt = datetime.datetime.strptime(ds, '%Y-%m-%d')
        closest_ds = _closest_date(target_dt, parts, before_target=before)
        return closest_ds.isoformat()
    def execute(self, context=None):
        '''
        Computes statistics for one partition of ``self.table`` with Presto
        and stores them, one row per (column, metric), in the MySQL
        ``hive_stats`` table.

        Expressions come from ``self.assignment_func`` when it is set and
        returns something other than None, otherwise from
        ``self.get_default_exprs``; ``self.extra_exprs`` is merged on top.
        Rows already stored for the same table, partition and ``dttm`` are
        deleted before the new ones are inserted.

        :param context: not used by this method
        :raises Exception: when the Presto query returns no row
        '''
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        hive_table = metastore.get_table(table_name=self.table)
        column_types = {col.name: col.type for col in hive_table.sd.cols}

        # Keys are (column, metric) pairs, values are SQL expressions.
        exprs = {('', 'count'): 'COUNT(*)'}
        for col, col_type in column_types.items():
            col_exprs = None
            if self.assignment_func:
                col_exprs = self.assignment_func(col, col_type)
            if col_exprs is None:
                col_exprs = self.get_default_exprs(col, col_type)
            exprs.update(col_exprs)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)

        select_items = []
        for key, expr in exprs.items():
            select_items.append(expr + " AS " + key[0] + '__' + key[1])
        exprs_str = ",\n        ".join(select_items)

        conditions = []
        for part_key, part_value in self.partition.items():
            conditions.append("{0} = '{1}'".format(part_key, part_value))
        where_clause = " AND\n        ".join(conditions)

        stats_sql = """
        SELECT
            {exprs_str}
        FROM {self.table}
        WHERE
            {where_clause};
        """.format(exprs_str=exprs_str, self=self, where_clause=where_clause)

        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info('Executing SQL check: ' + stats_sql)
        row = presto.get_first(hql=stats_sql)
        logging.info("Record: " + str(row))
        if not row:
            raise Exception("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        logging.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        row_filter = dict(self=self, part_json=part_json)
        exists_sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}'
        LIMIT 1;
        """.format(**row_filter)
        if mysql.get_records(exists_sql):
            delete_sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{self.table}' AND
                partition_repr='{part_json}' AND
                dttm='{self.dttm}';
            """.format(**row_filter)
            mysql.run(delete_sql)

        logging.info("Pivoting and loading cells into the Airflow db")
        # Each expression key is paired with the result cell at the same
        # position.
        run_prefix = (self.ds, self.dttm, self.table, part_json)
        rows = []
        for key, value in zip(exprs, row):
            rows.append(run_prefix + (key[0], key[1], value))
        target_fields = [
            'ds', 'dttm', 'table_name', 'partition_repr',
            'col', 'metric', 'value',
        ]
        mysql.insert_rows(
            table='hive_stats', rows=rows, target_fields=target_fields)
Exemple #12
0
 def db(self):
     """Render the page listing the tables of one metastore database.

     The database name comes from the ``db`` request argument; its tables
     are fetched through ``HiveMetastoreHook`` and ordered by
     ``tableName``.
     """
     db_name = request.args.get("db")
     metastore = HiveMetastoreHook(METASTORE_CONN_ID)
     tables = list(metastore.get_tables(db=db_name))
     tables.sort(key=lambda tbl: tbl.tableName)
     return self.render("metastore_browser/db.html", tables=tables, db=db_name)
    def execute(self, context=None):
        '''
        Gathers statistics for one partition of ``self.table`` through
        Presto and writes them to the MySQL ``hive_stats`` table, one row
        per (column, metric) pair.

        Per-column expressions are taken from ``self.assignment_func`` when
        it is set and does not return None, and from
        ``self.get_default_exprs`` otherwise; ``self.extra_exprs`` is
        applied last. Existing rows for the same table, partition and
        ``dttm`` are removed before inserting.

        :param context: not used by this method
        :raises AirflowException: when the Presto query returns no row
        '''
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table_meta = metastore.get_table(table_name=self.table)
        types_by_column = {col.name: col.type for col in table_meta.sd.cols}

        # Maps (column, metric) to the SQL expression computing it.
        exprs = {
            ('', 'count'): 'COUNT(*)',
        }
        for col, col_type in types_by_column.items():
            assigned = (
                self.assignment_func(col, col_type)
                if self.assignment_func else None)
            if assigned is None:
                assigned = self.get_default_exprs(col, col_type)
            exprs.update(assigned)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)

        aliased = []
        for name_parts, expression in exprs.items():
            aliased.append(
                expression + " AS " + name_parts[0] + '__' + name_parts[1])
        exprs_str = ",\n        ".join(aliased)

        where_clause = " AND\n        ".join(
            "{0} = '{1}'".format(k, v) for k, v in self.partition.items())

        check_sql = """
        SELECT
            {exprs_str}
        FROM {self.table}
        WHERE
            {where_clause};
        """.format(self=self, exprs_str=exprs_str, where_clause=where_clause)

        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info('Executing SQL check: ' + check_sql)
        row = presto.get_first(hql=check_sql)
        logging.info("Record: " + str(row))
        if not row:
            raise AirflowException("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        logging.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        probe_sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}'
        LIMIT 1;
        """.format(self=self, part_json=part_json)
        previous_rows = mysql.get_records(probe_sql)
        if previous_rows:
            purge_sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{self.table}' AND
                partition_repr='{part_json}' AND
                dttm='{self.dttm}';
            """.format(self=self, part_json=part_json)
            mysql.run(purge_sql)

        logging.info("Pivoting and loading cells into the Airflow db")
        # Pair every expression key with the result cell at the same
        # position and emit one output row per pair.
        rows = []
        for name_parts, cell in zip(exprs, row):
            rows.append(
                (self.ds, self.dttm, self.table, part_json) +
                (name_parts[0], name_parts[1], cell))
        mysql.insert_rows(
            table='hive_stats',
            rows=rows,
            target_fields=[
                'ds', 'dttm', 'table_name', 'partition_repr',
                'col', 'metric', 'value',
            ],
        )
Exemple #14
0
 def db(self):
     """List the tables of a metastore database, ordered by ``tableName``.

     The database name is taken from the ``db`` request argument and the
     result is rendered with ``metastore_browser/db.html``.
     """
     def by_table_name(entry):
         return entry.tableName

     selected_db = request.args.get("db")
     hook = HiveMetastoreHook(METASTORE_CONN_ID)
     ordered_tables = sorted(hook.get_tables(db=selected_db), key=by_table_name)
     return self.render(
         "metastore_browser/db.html",
         tables=ordered_tables,
         db=selected_db,
     )