def execute(self, context): hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id) logging.info("Extracting data from Hive") hive_table = "druid." + context["task_instance_key_str"].replace(".", "_") sql = self.sql.strip().strip(";") hql = """\ set mapred.output.compress=false; set hive.exec.compress.output=false; DROP TABLE IF EXISTS {hive_table}; CREATE TABLE {hive_table} ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE TBLPROPERTIES ('serialization.null.format' = '') AS {sql} """.format( **locals() ) logging.info("Running command:\n {}".format(hql)) hive.run_cli(hql) m = HiveMetastoreHook(self.metastore_conn_id) t = m.get_table(hive_table) columns = [col.name for col in t.sd.cols] hdfs_uri = m.get_table(hive_table).sd.location pos = hdfs_uri.find("/user") static_path = hdfs_uri[pos:] schema, table = hive_table.split(".") druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id) logging.info("Inserting rows into Druid") logging.info("HDFS path: " + static_path) try: druid.load_from_hdfs( datasource=self.druid_datasource, intervals=self.intervals, static_path=static_path, ts_dim=self.ts_dim, columns=columns, num_shards=self.num_shards, target_partition_size=self.target_partition_size, query_granularity=self.query_granularity, segment_granularity=self.segment_granularity, metric_spec=self.metric_spec, hadoop_dependency_coordinates=self.hadoop_dependency_coordinates, ) logging.info("Load seems to have succeeded!") finally: logging.info("Cleaning up by dropping the temp " "Hive table {}".format(hive_table)) hql = "DROP TABLE IF EXISTS {}".format(hive_table) hive.run_cli(hql)
def test_get_max_partition_from_valid_part_specs_and_invalid_filter_map(self): with self.assertRaises(AirflowException): HiveMetastoreHook._get_max_partition_from_part_specs( [{'key1': 'value1', 'key2': 'value2'}, {'key1': 'value3', 'key2': 'value4'}], 'key1', {'key3': 'value5'})
def max_partition( table, schema="default", field=None, filter=None, metastore_conn_id='metastore_default'): ''' Gets the max partition for a table. :param schema: The hive schema the table lives in :type schema: string :param table: The hive table you are interested in, supports the dot notation as in "my_database.my_table", if a dot is found, the schema param is disregarded :type table: string :param hive_conn_id: The hive connection you are interested in. If your default is set you don't need to use this parameter. :type hive_conn_id: string :param filter: filter on a subset of partition as in `sub_part='specific_value'` :type filter: string :param field: the field to get the max value from. If there's only one partition field, this will be inferred >>> max_partition('airflow.static_babynames_partitioned') '2015-01-01' ''' from airflow.hooks.hive_hooks import HiveMetastoreHook if '.' in table: schema, table = table.split('.') hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id) return hh.max_partition( schema=schema, table_name=table, field=field, filter=filter)
def test_get_max_partition_from_valid_part_specs_and_none_partition_key(self): with self.assertRaises(AirflowException): HiveMetastoreHook._get_max_partition_from_part_specs( [{'key1': 'value1', 'key2': 'value2'}, {'key1': 'value3', 'key2': 'value4'}], None, self.VALID_FILTER_MAP)
def table(self): table_name = request.args.get("table") m = HiveMetastoreHook(METASTORE_CONN_ID) table = m.get_table(table_name) return self.render( "metastore_browser/table.html", table=table, table_name=table_name, datetime=datetime, int=int)
def max_partition( table, schema="default", field=None, filter_map=None, metastore_conn_id='metastore_default'): """ Gets the max partition for a table. :param schema: The hive schema the table lives in :type schema: str :param table: The hive table you are interested in, supports the dot notation as in "my_database.my_table", if a dot is found, the schema param is disregarded :type table: str :param metastore_conn_id: The hive connection you are interested in. If your default is set you don't need to use this parameter. :type metastore_conn_id: str :param filter_map: partition_key:partition_value map used for partition filtering, e.g. {'key1': 'value1', 'key2': 'value2'}. Only partitions matching all partition_key:partition_value pairs will be considered as candidates of max partition. :type filter_map: map :param field: the field to get the max value from. If there's only one partition field, this will be inferred :type field: str >>> max_partition('airflow.static_babynames_partitioned') '2015-01-01' """ from airflow.hooks.hive_hooks import HiveMetastoreHook if '.' in table: schema, table = table.split('.') hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id) return hh.max_partition( schema=schema, table_name=table, field=field, filter_map=filter_map)
def execute(self, context): hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id) self.log.info("Extracting data from Hive") hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_') sql = self.sql.strip().strip(';') tblproperties = ''.join([", '{}' = '{}'" .format(k, v) for k, v in self.hive_tblproperties.items()]) hql = """\ SET mapred.output.compress=false; SET hive.exec.compress.output=false; DROP TABLE IF EXISTS {hive_table}; CREATE TABLE {hive_table} ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE TBLPROPERTIES ('serialization.null.format' = ''{tblproperties}) AS {sql} """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql) self.log.info("Running command:\n %s", hql) hive.run_cli(hql) m = HiveMetastoreHook(self.metastore_conn_id) # Get the Hive table and extract the columns t = m.get_table(hive_table) columns = [col.name for col in t.sd.cols] # Get the path on hdfs static_path = m.get_table(hive_table).sd.location schema, table = hive_table.split('.') druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id) try: index_spec = self.construct_ingest_query( static_path=static_path, columns=columns, ) self.log.info("Inserting rows into Druid, hdfs path: %s", static_path) druid.submit_indexing_job(index_spec) self.log.info("Load seems to have succeeded!") finally: self.log.info( "Cleaning up by dropping the temp Hive table %s", hive_table ) hql = "DROP TABLE IF EXISTS {}".format(hive_table) hive.run_cli(hql)
def test_get_max_partition_from_mal_valid_part_names(self): max_partition = \ HiveMetastoreHook._get_max_partition_from_part_names(['some_key=value1', 'some_key=value2', 'some_key=value3'], 'some_key') self.assertEqual(max_partition, 'value3')
def test_get_max_partition_from_valid_part_specs(self): max_partition = \ HiveMetastoreHook._get_max_partition_from_part_specs( [{'key1': 'value1', 'key2': 'value2'}, {'key1': 'value3', 'key2': 'value4'}], 'key1', self.VALID_FILTER_MAP) self.assertEqual(max_partition, b'value1')
def test_get_max_partition_from_valid_part_specs_and_none_filter_map(self): max_partition = \ HiveMetastoreHook._get_max_partition_from_part_specs( [{'key1': 'value1', 'key2': 'value2'}, {'key1': 'value3', 'key2': 'value4'}], 'key1', None) # No partition will be filtered out. self.assertEqual(max_partition, b'value3')
def poke_partition(self, partition): if not self.hook: from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) schema, table, partition = self.parse_partition_name(partition) self.log.info('Poking for %s.%s/%s', schema, table, partition) return self.hook.check_for_named_partition( schema, table, partition)
def poke(self, context): if '.' in self.table: self.schema, self.table = self.table.split('.') self.log.info( 'Poking for table {self.schema}.{self.table}, ' 'partition {self.partition}'.format(**locals())) if not hasattr(self, 'hook'): from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) return self.hook.check_for_partition( self.schema, self.table, self.partition)
def poke(self, context): if '.' in self.table: self.schema, self.table = self.table.split('.') self.log.info( 'Poking for table %s.%s, partition %s', self.schema, self.table, self.partition ) if not hasattr(self, 'hook'): from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) return self.hook.check_for_partition( self.schema, self.table, self.partition)
class HivePartitionSensor(BaseSensorOperator): """ Waits for a partition to show up in Hive. Note: Because ``partition`` supports general logical operators, it can be inefficient. Consider using NamedHivePartitionSensor instead if you don't need the full flexibility of HivePartitionSensor. :param table: The name of the table to wait for, supports the dot notation (my_database.my_table) :type table: string :param partition: The partition clause to wait for. This is passed as is to the metastore Thrift client ``get_partitions_by_filter`` method, and apparently supports SQL like notation as in ``ds='2015-01-01' AND type='value'`` and comparison operators as in ``"ds>=2015-01-01"`` :type partition: string :param metastore_conn_id: reference to the metastore thrift service connection id :type metastore_conn_id: str """ template_fields = ('schema', 'table', 'partition',) ui_color = '#C5CAE9' @apply_defaults def __init__( self, table, partition="ds='{{ ds }}'", metastore_conn_id='metastore_default', schema='default', poke_interval=60*3, *args, **kwargs): super(HivePartitionSensor, self).__init__( poke_interval=poke_interval, *args, **kwargs) if not partition: partition = "ds='{{ ds }}'" self.metastore_conn_id = metastore_conn_id self.table = table self.partition = partition self.schema = schema def poke(self, context): if '.' in self.table: self.schema, self.table = self.table.split('.') self.log.info( 'Poking for table {self.schema}.{self.table}, ' 'partition {self.partition}'.format(**locals())) if not hasattr(self, 'hook'): from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) return self.hook.check_for_partition( self.schema, self.table, self.partition)
def poke_partition(self, partition): if not self.hook: from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) schema, table, partition = self.parse_partition_name(partition) self.log.info( 'Poking for {schema}.{table}/{partition}'.format(**locals()) ) return self.hook.check_for_named_partition( schema, table, partition)
def closest_ds_partition( table, ds, before=True, schema="default", metastore_conn_id='metastore_default'): """ This function finds the date in a list closest to the target date. An optional parameter can be given to get the closest before or after. :param table: A hive table name :type table: str :param ds: A datestamp ``%Y-%m-%d`` e.g. ``yyyy-mm-dd`` :type ds: list[datetime.date] :param before: closest before (True), after (False) or either side of ds :type before: bool or None :returns: The closest date :rtype: str or None >>> tbl = 'airflow.static_babynames_partitioned' >>> closest_ds_partition(tbl, '2015-01-02') '2015-01-01' """ from airflow.hooks.hive_hooks import HiveMetastoreHook if '.' in table: schema, table = table.split('.') hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id) partitions = hh.get_partitions(schema=schema, table_name=table) if not partitions: return None part_vals = [list(p.values())[0] for p in partitions] if ds in part_vals: return ds else: parts = [datetime.datetime.strptime(pv, '%Y-%m-%d') for pv in part_vals] target_dt = datetime.datetime.strptime(ds, '%Y-%m-%d') closest_ds = _closest_date(target_dt, parts, before_target=before) return closest_ds.isoformat()
def poke(self, context): if not hasattr(self, 'hook'): from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) def poke_partition(partition): schema, table, partition = self.parse_partition_name(partition) self.log.info( 'Poking for {schema}.{table}/{partition}'.format(**locals()) ) return self.hook.check_for_named_partition( schema, table, partition) while self.next_poke_idx < len(self.partition_names): if poke_partition(self.partition_names[self.next_poke_idx]): self.next_poke_idx += 1 else: return False return True
def setUp(self): configuration.load_test_config() args = {'owner': 'airflow', 'start_date': DEFAULT_DATE} self.dag = DAG('test_dag_id', default_args=args) self.next_day = (DEFAULT_DATE + datetime.timedelta(days=1)).isoformat()[:10] self.database = 'airflow' self.partition_by = 'ds' self.table = 'static_babynames_partitioned' self.hql = """ CREATE DATABASE IF NOT EXISTS {{ params.database }}; USE {{ params.database }}; DROP TABLE IF EXISTS {{ params.table }}; CREATE TABLE IF NOT EXISTS {{ params.table }} ( state string, year string, name string, gender string, num int) PARTITIONED BY ({{ params.partition_by }} string); ALTER TABLE {{ params.table }} ADD PARTITION({{ params.partition_by }}='{{ ds }}'); """ self.hook = HiveMetastoreHook() t = operators.hive_operator.HiveOperator( task_id='HiveHook_' + str(random.randint(1, 10000)), params={ 'database': self.database, 'table': self.table, 'partition_by': self.partition_by }, hive_cli_conn_id='beeline_default', hql=self.hql, dag=self.dag) t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def tearDown(self): hook = HiveMetastoreHook() with hook.get_conn() as metastore: metastore.drop_table(self.database, self.table, deleteData=True)
class NamedHivePartitionSensor(BaseSensorOperator): """ Waits for a set of partitions to show up in Hive. :param partition_names: List of fully qualified names of the partitions to wait for. A fully qualified name is of the form ``schema.table/pk1=pv1/pk2=pv2``, for example, default.users/ds=2016-01-01. This is passed as is to the metastore Thrift client ``get_partitions_by_name`` method. Note that you cannot use logical or comparison operators as in HivePartitionSensor. :type partition_names: list[str] :param metastore_conn_id: reference to the metastore thrift service connection id :type metastore_conn_id: str """ template_fields = ('partition_names',) ui_color = '#8d99ae' @apply_defaults def __init__(self, partition_names, metastore_conn_id='metastore_default', poke_interval=60 * 3, hook=None, *args, **kwargs): super().__init__( poke_interval=poke_interval, *args, **kwargs) if isinstance(partition_names, basestring): raise TypeError('partition_names must be an array of strings') self.metastore_conn_id = metastore_conn_id self.partition_names = partition_names self.hook = hook if self.hook and metastore_conn_id != 'metastore_default': self.log.warning( 'A hook was passed but a non defaul metastore_conn_id=%s was used', metastore_conn_id ) @staticmethod def parse_partition_name(partition): first_split = partition.split('.', 1) if len(first_split) == 1: schema = 'default' table_partition = max(first_split) # poor man first else: schema, table_partition = first_split second_split = table_partition.split('/', 1) if len(second_split) == 1: raise ValueError('Could not parse ' + partition + 'into table, partition') else: table, partition = second_split return schema, table, partition def poke_partition(self, partition): if not self.hook: from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) schema, table, partition = self.parse_partition_name(partition) self.log.info('Poking for %s.%s/%s', schema, table, partition) return self.hook.check_for_named_partition( schema, table, partition) def poke(self, context): self.partition_names = [ partition_name for partition_name in self.partition_names if not self.poke_partition(partition_name) ] return not self.partition_names
def test_get_max_partition_from_empty_part_names(self): max_partition = \ HiveMetastoreHook._get_max_partition_from_part_names([], 'some_key') self.assertIsNone(max_partition)
def test_get_max_partition_from_mal_formatted_part_names(self): with self.assertRaises(AirflowException): HiveMetastoreHook._get_max_partition_from_part_names( ['bad_partition_name'], 'some_key')
def execute(self, context=None): metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id) table = metastore.get_table(table_name=self.table) field_types = {col.name: col.type for col in table.sd.cols} exprs = { ('', 'count'): 'COUNT(*)' } for col, col_type in list(field_types.items()): d = {} if self.assignment_func: d = self.assignment_func(col, col_type) if d is None: d = self.get_default_exprs(col, col_type) else: d = self.get_default_exprs(col, col_type) exprs.update(d) exprs.update(self.extra_exprs) exprs = OrderedDict(exprs) exprs_str = ",\n ".join([ v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()]) where_clause = ["{} = '{}'".format(k, v) for k, v in self.partition.items()] where_clause = " AND\n ".join(where_clause) sql = "SELECT {exprs_str} FROM {table} WHERE {where_clause};".format( exprs_str=exprs_str, table=self.table, where_clause=where_clause) presto = PrestoHook(presto_conn_id=self.presto_conn_id) self.log.info('Executing SQL check: %s', sql) row = presto.get_first(hql=sql) self.log.info("Record: %s", row) if not row: raise AirflowException("The query returned None") part_json = json.dumps(self.partition, sort_keys=True) self.log.info("Deleting rows from previous runs if they exist") mysql = MySqlHook(self.mysql_conn_id) sql = """ SELECT 1 FROM hive_stats WHERE table_name='{table}' AND partition_repr='{part_json}' AND dttm='{dttm}' LIMIT 1; """.format(table=self.table, part_json=part_json, dttm=self.dttm) if mysql.get_records(sql): sql = """ DELETE FROM hive_stats WHERE table_name='{table}' AND partition_repr='{part_json}' AND dttm='{dttm}'; """.format(table=self.table, part_json=part_json, dttm=self.dttm) mysql.run(sql) self.log.info("Pivoting and loading cells into the Airflow db") rows = [(self.ds, self.dttm, self.table, part_json) + (r[0][0], r[0][1], r[1]) for r in zip(exprs, row)] mysql.insert_rows( table='hive_stats', rows=rows, target_fields=[ 'ds', 'dttm', 'table_name', 'partition_repr', 'col', 'metric', 'value', ] )
class NamedHivePartitionSensor(BaseSensorOperator): """ Waits for a set of partitions to show up in Hive. :param partition_names: List of fully qualified names of the partitions to wait for. A fully qualified name is of the form ``schema.table/pk1=pv1/pk2=pv2``, for example, default.users/ds=2016-01-01. This is passed as is to the metastore Thrift client ``get_partitions_by_name`` method. Note that you cannot use logical or comparison operators as in HivePartitionSensor. :type partition_names: list of strings :param metastore_conn_id: reference to the metastore thrift service connection id :type metastore_conn_id: str """ template_fields = ('partition_names', ) ui_color = '#8d99ae' @apply_defaults def __init__( self, partition_names, metastore_conn_id='metastore_default', poke_interval=60 * 3, *args, **kwargs): super(NamedHivePartitionSensor, self).__init__( poke_interval=poke_interval, *args, **kwargs) if isinstance(partition_names, basestring): raise TypeError('partition_names must be an array of strings') self.metastore_conn_id = metastore_conn_id self.partition_names = partition_names self.next_poke_idx = 0 @classmethod def parse_partition_name(self, partition): try: schema, table_partition = partition.split('.', 1) table, partition = table_partition.split('/', 1) return schema, table, partition except ValueError as e: raise ValueError('Could not parse ' + partition) def poke(self, context): if not hasattr(self, 'hook'): from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) def poke_partition(partition): schema, table, partition = self.parse_partition_name(partition) self.log.info( 'Poking for {schema}.{table}/{partition}'.format(**locals()) ) return self.hook.check_for_named_partition( schema, table, partition) while self.next_poke_idx < len(self.partition_names): if poke_partition(self.partition_names[self.next_poke_idx]): self.next_poke_idx += 1 else: return False return True
class NamedHivePartitionSensor(BaseSensorOperator): """ Waits for a set of partitions to show up in Hive. :param partition_names: List of fully qualified names of the partitions to wait for. A fully qualified name is of the form ``schema.table/pk1=pv1/pk2=pv2``, for example, default.users/ds=2016-01-01. This is passed as is to the metastore Thrift client ``get_partitions_by_name`` method. Note that you cannot use logical or comparison operators as in HivePartitionSensor. :type partition_names: list[str] :param metastore_conn_id: reference to the metastore thrift service connection id :type metastore_conn_id: str """ template_fields = ('partition_names',) ui_color = '#8d99ae' @apply_defaults def __init__(self, partition_names, metastore_conn_id='metastore_default', poke_interval=60 * 3, hook=None, *args, **kwargs): super().__init__( poke_interval=poke_interval, *args, **kwargs) if isinstance(partition_names, str): raise TypeError('partition_names must be an array of strings') self.metastore_conn_id = metastore_conn_id self.partition_names = partition_names self.hook = hook if self.hook and metastore_conn_id != 'metastore_default': self.log.warning( 'A hook was passed but a non defaul metastore_conn_id=%s was used', metastore_conn_id ) @staticmethod def parse_partition_name(partition): first_split = partition.split('.', 1) if len(first_split) == 1: schema = 'default' table_partition = max(first_split) # poor man first else: schema, table_partition = first_split second_split = table_partition.split('/', 1) if len(second_split) == 1: raise ValueError('Could not parse ' + partition + 'into table, partition') else: table, partition = second_split return schema, table, partition def poke_partition(self, partition): if not self.hook: from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) schema, table, partition = self.parse_partition_name(partition) self.log.info('Poking for %s.%s/%s', schema, table, partition) return self.hook.check_for_named_partition( schema, table, partition) def poke(self, context): self.partition_names = [ partition_name for partition_name in self.partition_names if not self.poke_partition(partition_name) ] return not self.partition_names
def execute(self, context=None): metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id) table = metastore.get_table(table_name=self.table) field_types = {col.name: col.type for col in table.sd.cols} exprs = {('', 'count'): 'COUNT(*)'} for col, col_type in list(field_types.items()): d = {} if self.assignment_func: d = self.assignment_func(col, col_type) if d is None: d = self.get_default_exprs(col, col_type) else: d = self.get_default_exprs(col, col_type) exprs.update(d) exprs.update(self.extra_exprs) exprs = OrderedDict(exprs) exprs_str = ",\n ".join( [v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()]) where_clause = [ "{} = '{}'".format(k, v) for k, v in self.partition.items() ] where_clause = " AND\n ".join(where_clause) sql = "SELECT {exprs_str} FROM {table} WHERE {where_clause};".format( exprs_str=exprs_str, table=self.table, where_clause=where_clause) presto = PrestoHook(presto_conn_id=self.presto_conn_id) self.log.info('Executing SQL check: %s', sql) row = presto.get_first(hql=sql) self.log.info("Record: %s", row) if not row: raise AirflowException("The query returned None") part_json = json.dumps(self.partition, sort_keys=True) self.log.info("Deleting rows from previous runs if they exist") mysql = MySqlHook(self.mysql_conn_id) sql = """ SELECT 1 FROM hive_stats WHERE table_name='{table}' AND partition_repr='{part_json}' AND dttm='{dttm}' LIMIT 1; """.format(table=self.table, part_json=part_json, dttm=self.dttm) if mysql.get_records(sql): sql = """ DELETE FROM hive_stats WHERE table_name='{table}' AND partition_repr='{part_json}' AND dttm='{dttm}'; """.format(table=self.table, part_json=part_json, dttm=self.dttm) mysql.run(sql) self.log.info("Pivoting and loading cells into the Airflow db") rows = [(self.ds, self.dttm, self.table, part_json) + (r[0][0], r[0][1], r[1]) for r in zip(exprs, row)] mysql.insert_rows(table='hive_stats', rows=rows, target_fields=[ 'ds', 'dttm', 'table_name', 'partition_repr', 'col', 'metric', 'value', ])
def db(self): db = request.args.get("db") m = HiveMetastoreHook(METASTORE_CONN_ID) tables = sorted(m.get_tables(db=db), key=lambda x: x.tableName) return self.render( "metastore_browser/db.html", tables=tables, db=db)
def test_get_max_partition_from_empty_part_specs(self): max_partition = \ HiveMetastoreHook._get_max_partition_from_part_specs([], 'key1', self.VALID_FILTER_MAP) self.assertIsNone(max_partition)
def test_get_metastore_databases(self): if six.PY2: from airflow.hooks.hive_hooks import HiveMetastoreHook hook = HiveMetastoreHook() hook.get_databases()
class NamedHivePartitionSensor(BaseSensorOperator): """ Waits for a set of partitions to show up in Hive. :param partition_names: List of fully qualified names of the partitions to wait for. A fully qualified name is of the form ``schema.table/pk1=pv1/pk2=pv2``, for example, default.users/ds=2016-01-01. This is passed as is to the metastore Thrift client ``get_partitions_by_name`` method. Note that you cannot use logical or comparison operators as in HivePartitionSensor. :type partition_names: list of strings :param metastore_conn_id: reference to the metastore thrift service connection id :type metastore_conn_id: str """ template_fields = ('partition_names', ) ui_color = '#8d99ae' @apply_defaults def __init__(self, partition_names, metastore_conn_id='metastore_default', poke_interval=60 * 3, *args, **kwargs): super(NamedHivePartitionSensor, self).__init__(poke_interval=poke_interval, *args, **kwargs) if isinstance(partition_names, basestring): raise TypeError('partition_names must be an array of strings') self.metastore_conn_id = metastore_conn_id self.partition_names = partition_names self.next_poke_idx = 0 @classmethod def parse_partition_name(self, partition): try: schema, table_partition = partition.split('.', 1) table, partition = table_partition.split('/', 1) return schema, table, partition except ValueError as e: raise ValueError('Could not parse ' + partition) def poke(self, context): if not hasattr(self, 'hook'): from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( metastore_conn_id=self.metastore_conn_id) def poke_partition(partition): schema, table, partition = self.parse_partition_name(partition) self.log.info( 'Poking for {schema}.{table}/{partition}'.format(**locals())) return self.hook.check_for_named_partition(schema, table, partition) while self.next_poke_idx < len(self.partition_names): if poke_partition(self.partition_names[self.next_poke_idx]): self.next_poke_idx += 1 else: return False return True
def db(self): db = request.args.get("db") m = HiveMetastoreHook(METASTORE_CONN_ID) tables = sorted(m.get_tables(db=db), key=lambda x: x.tableName) return self.render("metastore_browser/db.html", tables=tables, db=db)
def test_check_hms_clients_load_balance(self, mock_client, mock_get_conn, mock_shuffle): HiveMetastoreHook()._find_valid_server() mock_shuffle.assert_called_once_with(mock_get_conn.return_value)