def setUp(self):
        self._upload_dataframe()
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        self.dag = DAG('test_dag_id', default_args=args)
        self.database = 'airflow'
        self.table = 'hive_server_hook'

        self.hql = """
        CREATE DATABASE IF NOT EXISTS {{ params.database }};
        USE {{ params.database }};
        DROP TABLE IF EXISTS {{ params.table }};
        CREATE TABLE IF NOT EXISTS {{ params.table }} (
            a int,
            b int)
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ',';
        LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
        OVERWRITE INTO TABLE {{ params.table }};
        """
        self.columns = [f'{self.table}.a', f'{self.table}.b']

        with mock.patch(
            'airflow.providers.apache.hive.hooks.hive.HiveMetastoreHook.get_metastore_client'
        ) as get_metastore_mock:
            get_metastore_mock.return_value = mock.MagicMock()

            self.hook = HiveMetastoreHook()
    def poke(self, context: Dict[str, Any]) -> bool:
        if '.' in self.table:
            self.schema, self.table = self.table.split('.')
        self.log.info('Poking for table %s.%s, partition %s', self.schema, self.table, self.partition)
        # Create the metastore hook lazily and cache it on the instance so the
        # reference below is always defined, even on repeated pokes.
        if not hasattr(self, 'hook'):
            self.hook = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        return self.hook.check_for_partition(self.schema, self.table, self.partition)
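For reference, a minimal sketch of the hook call this poke delegates to, using the database, table
and partition key that appear in the neighbouring setUp examples (connection id and values are illustrative):

from airflow.providers.apache.hive.hooks.hive import HiveMetastoreHook

hook = HiveMetastoreHook(metastore_conn_id='metastore_default')
# check_for_partition takes a partition filter expression such as "ds='2015-01-01'"
hook.check_for_partition('airflow', 'static_babynames_partitioned', "ds='2015-01-01'")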
    def setUp(self):
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        self.dag = DAG('test_dag_id', default_args=args)
        self.next_day = (DEFAULT_DATE + timedelta(days=1)).isoformat()[:10]
        self.database = 'airflow'
        self.partition_by = 'ds'
        self.table = 'static_babynames_partitioned'
        self.hql = """
                CREATE DATABASE IF NOT EXISTS {{ params.database }};
                USE {{ params.database }};
                DROP TABLE IF EXISTS {{ params.table }};
                CREATE TABLE IF NOT EXISTS {{ params.table }} (
                    state string,
                    year string,
                    name string,
                    gender string,
                    num int)
                PARTITIONED BY ({{ params.partition_by }} string);
                ALTER TABLE {{ params.table }}
                ADD PARTITION({{ params.partition_by }}='{{ ds }}');
                """
        self.hook = HiveMetastoreHook()
        op = operators.hive_operator.HiveOperator(
            task_id='HiveHook_' + str(random.randint(1, 10000)),
            params={
                'database': self.database,
                'table': self.table,
                'partition_by': self.partition_by
            },
            hive_cli_conn_id='hive_cli_default',
            hql=self.hql,
            dag=self.dag)
        op.run(start_date=DEFAULT_DATE,
               end_date=DEFAULT_DATE,
               ignore_ti_state=True)
Example #4
    def setUp(self):
        self._upload_dataframe()
        args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
        self.dag = DAG('test_dag_id', default_args=args)
        self.database = 'airflow'
        self.table = 'hive_server_hook'
        self.hql = """
        CREATE DATABASE IF NOT EXISTS {{ params.database }};
        USE {{ params.database }};
        DROP TABLE IF EXISTS {{ params.table }};
        CREATE TABLE IF NOT EXISTS {{ params.table }} (
            a int,
            b int)
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ',';
        LOAD DATA LOCAL INPATH '{{ params.csv_path }}'
        OVERWRITE INTO TABLE {{ params.table }};
        """
        self.columns = ['{}.a'.format(self.table), '{}.b'.format(self.table)]
        self.hook = HiveMetastoreHook()
        op = HiveOperator(task_id='HiveHook_' + str(random.randint(1, 10000)),
                          params={
                              'database': self.database,
                              'table': self.table,
                              'csv_path': self.local_path
                          },
                          hive_cli_conn_id='hive_cli_default',
                          hql=self.hql,
                          dag=self.dag)
        op.run(start_date=DEFAULT_DATE,
               end_date=DEFAULT_DATE,
               ignore_ti_state=True)
Example #5
def max_partition(
        table, schema="default", field=None, filter_map=None,
        metastore_conn_id='metastore_default'):
    """
    Gets the max partition for a table.

    :param schema: The hive schema the table lives in
    :type schema: str
    :param table: The hive table you are interested in; supports dot
        notation as in ``my_database.my_table``. If a dot is found,
        the schema param is disregarded
    :type table: str
    :param metastore_conn_id: The hive connection you are interested in.
        If your default is set you don't need to use this parameter.
    :type metastore_conn_id: str
    :param filter_map: partition_key:partition_value map used for partition filtering,
                       e.g. {'key1': 'value1', 'key2': 'value2'}.
                       Only partitions matching all partition_key:partition_value
                       pairs will be considered as candidates of max partition.
    :type filter_map: dict
    :param field: the field to get the max value from. If there's only
        one partition field, this will be inferred
    :type field: str

    >>> max_partition('airflow.static_babynames_partitioned')
    '2015-01-01'
    """
    from airflow.providers.apache.hive.hooks.hive import HiveMetastoreHook
    if '.' in table:
        schema, table = table.split('.')
    hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    return hh.max_partition(
        schema=schema, table_name=table, field=field, filter_map=filter_map)
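A hedged sketch of the filter_map parameter, calling the function as defined above and assuming a
hypothetical table that is partitioned by both ds and state:

# Only partitions whose 'state' value is 'TX' are considered; the largest 'ds' among them is returned.
latest_tx_ds = max_partition(
    'my_db.events',             # hypothetical table; dot notation overrides the schema argument
    field='ds',
    filter_map={'state': 'TX'},
)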
Example #6
    def test_get_conn(self):
        with mock.patch('airflow.providers.apache.hive.hooks.hive.HiveMetastoreHook._find_valid_server'
                        ) as find_valid_server:
            find_valid_server.return_value = mock.MagicMock(return_value={})
            metastore_hook = HiveMetastoreHook()

        self.assertIsInstance(metastore_hook.get_conn(), HMSClient)
Example #7
    def db(self):
        """Show tables in database"""
        db = request.args.get("db")
        metastore = HiveMetastoreHook(METASTORE_CONN_ID)
        tables = sorted(metastore.get_tables(db=db), key=lambda x: x.tableName)
        return self.render_template("metastore_browser/db.html",
                                    tables=tables,
                                    db=db)
Example #8
    def table(self):
        """Create table view"""
        table_name = request.args.get("table")
        metastore = HiveMetastoreHook(METASTORE_CONN_ID)
        table = metastore.get_table(table_name)
        return self.render_template("metastore_browser/table.html",
                                    table=table,
                                    table_name=table_name,
                                    datetime=datetime,
                                    int=int)
Example #9
    def poke_partition(self, partition):
        if not self.hook:
            from airflow.providers.apache.hive.hooks.hive import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)

        schema, table, partition = self.parse_partition_name(partition)

        self.log.info('Poking for %s.%s/%s', schema, table, partition)
        return self.hook.check_for_named_partition(schema, table, partition)
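For context, named partitions use key=value notation (without quoting), unlike the filter expression
accepted by check_for_partition; a minimal sketch of the underlying hook call with illustrative values:

from airflow.providers.apache.hive.hooks.hive import HiveMetastoreHook

hook = HiveMetastoreHook(metastore_conn_id='metastore_default')
hook.check_for_named_partition('airflow', 'static_babynames_partitioned', 'ds=2015-01-01')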
    def poke(self, context):
        if '.' in self.table:
            self.schema, self.table = self.table.split('.')
        self.log.info('Poking for table %s.%s, partition %s', self.schema,
                      self.table, self.partition)
        # Create the metastore hook lazily and cache it on the instance so the
        # reference below is always defined, even on repeated pokes.
        if not hasattr(self, 'hook'):
            from airflow.providers.apache.hive.hooks.hive import HiveMetastoreHook
            self.hook = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        return self.hook.check_for_partition(self.schema, self.table,
                                             self.partition)
    def setUp(self):
        self.next_day = (DEFAULT_DATE + datetime.timedelta(days=1)).isoformat()[:10]
        self.database = 'airflow'
        self.partition_by = 'ds'
        self.table = 'static_babynames_partitioned'
        with mock.patch(
            'airflow.providers.apache.hive.hooks.hive.HiveMetastoreHook.get_metastore_client'
        ) as get_metastore_mock:
            get_metastore_mock.return_value = mock.MagicMock()

            self.hook = HiveMetastoreHook()
Example #12
    def execute(self, context: Dict[str, Any]) -> None:
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str'].replace(
            '.', '_')
        sql = self.sql.strip().strip(';')
        tblproperties = ''.join([
            ", '{}' = '{}'".format(k, v)
            for k, v in self.hive_tblproperties.items()
        ])
        hql = f"""\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        meta_hook = HiveMetastoreHook(self.metastore_conn_id)

        # Get the Hive table and extract the columns
        table = meta_hook.get_table(hive_table)
        columns = [col.name for col in table.sd.cols]

        # Get the path on hdfs
        static_path = meta_hook.get_table(hive_table).sd.location

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

        try:
            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s",
                          static_path)

            druid.submit_indexing_job(index_spec)

            self.log.info("Load seems to have succeeded!")
        finally:
            self.log.info("Cleaning up by dropping the temp Hive table %s",
                          hive_table)
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
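As a worked example of how the TBLPROPERTIES clause above is assembled, a small standalone sketch
with an illustrative property (only the string formatting is shown, not the operator itself):

hive_tblproperties = {'external.table.purge': 'true'}  # hypothetical property
tblproperties = ''.join(", '{}' = '{}'".format(k, v) for k, v in hive_tblproperties.items())
print(tblproperties)
# -> , 'external.table.purge' = 'true'
# yielding: TBLPROPERTIES ('serialization.null.format' = '', 'external.table.purge' = 'true')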
Example #13
def closest_ds_partition(table,
                         ds,
                         before=True,
                         schema="default",
                         metastore_conn_id='metastore_default'):
    """
    Find the partition date of a Hive table that is closest to the target datestamp ``ds``.
    An optional parameter controls whether the closest date before, after, or on either side is returned.

    :param table: A hive table name
    :type table: str
    :param ds: A datestamp in the ``%Y-%m-%d`` format, e.g. ``2015-01-01``
    :type ds: str
    :param before: closest before (True), after (False) or either side of ds
    :type before: bool or None
    :param schema: table schema
    :type schema: str
    :param metastore_conn_id: which metastore connection to use
    :type metastore_conn_id: str
    :returns: The closest date
    :rtype: str or None

    >>> tbl = 'airflow.static_babynames_partitioned'
    >>> closest_ds_partition(tbl, '2015-01-02')
    '2015-01-01'
    """
    from airflow.providers.apache.hive.hooks.hive import HiveMetastoreHook

    if '.' in table:
        schema, table = table.split('.')
    hive_hook = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    partitions = hive_hook.get_partitions(schema=schema, table_name=table)
    if not partitions:
        return None
    part_vals = [list(p.values())[0] for p in partitions]
    if ds in part_vals:
        return ds
    else:
        parts = [
            datetime.datetime.strptime(pv, '%Y-%m-%d') for pv in part_vals
        ]
        target_dt = datetime.datetime.strptime(ds, '%Y-%m-%d')
        closest_ds = _closest_date(target_dt, parts, before_target=before)
        return closest_ds.isoformat()
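A short sketch of the before parameter, reusing the table from the doctest above (the comments
paraphrase the docstring; actual results depend on which partitions exist):

tbl = 'airflow.static_babynames_partitioned'
closest_ds_partition(tbl, '2015-01-02')                 # default before=True: closest partition before ds
closest_ds_partition(tbl, '2015-01-02', before=False)   # closest partition after ds
closest_ds_partition(tbl, '2015-01-02', before=None)    # closest partition on either side of ds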
Example #14
    def execute(self, context: Optional[Dict[str, Any]] = None) -> None:
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        exprs: Any = {('', 'count'): 'COUNT(*)'}
        for col, col_type in list(field_types.items()):
            if self.assignment_func:
                assign_exprs = self.assignment_func(col, col_type)
                if assign_exprs is None:
                    assign_exprs = self.get_default_exprs(col, col_type)
            else:
                assign_exprs = self.get_default_exprs(col, col_type)
            exprs.update(assign_exprs)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join([v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()])

        where_clause_ = ["{} = '{}'".format(k, v) for k, v in self.partition.items()]
        where_clause = " AND\n        ".join(where_clause_)
        sql = "SELECT {exprs_str} FROM {table} WHERE {where_clause};".format(
            exprs_str=exprs_str, table=self.table, where_clause=where_clause
        )

        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        self.log.info('Executing SQL check: %s', sql)
        row = presto.get_first(hql=sql)
        self.log.info("Record: %s", row)
        if not row:
            raise AirflowException("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        self.log.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{table}' AND
            partition_repr='{part_json}' AND
            dttm='{dttm}'
        LIMIT 1;
        """.format(
            table=self.table, part_json=part_json, dttm=self.dttm
        )
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{table}' AND
                partition_repr='{part_json}' AND
                dttm='{dttm}';
            """.format(
                table=self.table, part_json=part_json, dttm=self.dttm
            )
            mysql.run(sql)

        self.log.info("Pivoting and loading cells into the Airflow db")
        rows = [
            (self.ds, self.dttm, self.table, part_json) + (r[0][0], r[0][1], r[1]) for r in zip(exprs, row)
        ]
        mysql.insert_rows(
            table='hive_stats',
            rows=rows,
            target_fields=[
                'ds',
                'dttm',
                'table_name',
                'partition_repr',
                'col',
                'metric',
                'value',
            ],
        )
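The exprs mapping above keys each metric by a (column, metric) tuple and renders it as
"<expression> AS <column>__<metric>"; a hedged sketch of a custom assignment_func that follows that
contract (the metric names and Presto expressions are illustrative), to be passed to the operator
via its assignment_func argument:

def sample_assignment_func(col, col_type):
    # Returning None makes the operator fall back to get_default_exprs for this column.
    if col_type != 'string':
        return None
    # Keys are (column, metric) tuples, values are Presto expressions; e.g. the second entry
    # renders as "APPROX_DISTINCT(name) AS name__approx_distinct" for a column named "name".
    return {
        (col, 'len'): 'SUM(CAST(LENGTH({}) AS BIGINT))'.format(col),
        (col, 'approx_distinct'): 'APPROX_DISTINCT({})'.format(col),
    }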
    def tearDown(self):
        hook = HiveMetastoreHook()
        with hook.get_conn() as metastore:
            metastore.drop_table(self.database, self.table, deleteData=True)