def setUp(self):
    """Prepare a HiveUtils instance whose table metadata is fixture data.

    ``tables`` is overwritten directly, so the tests below read from a
    known state: one hourly-partitioned table and one daily-partitioned
    table.
    """
    hourly_table = {
        'partitions': [
            'year=2013/month=10/day=01/hour=01',
            'year=2013/month=10/day=01/hour=02',
        ],
        'interval': 'hourly',
        'depth': 4,
        'location': '/path/to/data/table1/hourly',
    }
    daily_table = {
        'partitions': ['year=2013/month=10/day=01'],
        'interval': 'daily',
        'depth': 3,
        'location': '/path/to/data/table2/daily',
    }
    self.hive = HiveUtils()
    self.hive.tables = {'table1': hourly_table, 'table2': daily_table}
def setUp(self):
    """Seed ``self.hive.tables`` with two fake table entries for the tests."""
    fixtures = {}
    fixtures['table1'] = {
        'partitions': [
            'year=2013/month=10/day=01/hour=01',
            'year=2013/month=10/day=01/hour=02',
        ],
        'interval': 'hourly',
        'depth': 4,
        'location': '/path/to/data/table1/hourly',
    }
    fixtures['table2'] = {
        'partitions': ['year=2013/month=10/day=01'],
        'interval': 'daily',
        'depth': 3,
        'location': '/path/to/data/table2/daily',
    }
    self.hive = HiveUtils()
    self.hive.tables = fixtures
class TestHiveUtil(TestCase):
    """Unit tests for the HiveUtils partition/DDL string-building helpers.

    ``self.hive.tables`` is replaced with fixture data in ``setUp``, so
    these tests exercise the string-formatting logic against known
    metadata rather than a live Hive instance.
    """

    def setUp(self):
        self.hive = HiveUtils()
        # Two known fixtures: an hourly-partitioned table and a
        # daily-partitioned table.
        self.hive.tables = {
            'table1': {
                'partitions': [
                    'year=2013/month=10/day=01/hour=01',
                    'year=2013/month=10/day=01/hour=02',
                ],
                'interval': 'hourly',
                'depth': 4,
                'location': '/path/to/data/table1/hourly',
            },
            'table2': {
                'partitions': ['year=2013/month=10/day=01'],
                'interval': 'daily',
                'depth': 3,
                'location': '/path/to/data/table2/daily',
            },
        }

    def test_table_exists(self):
        self.assertTrue(self.hive.table_exists('table1'))
        self.assertFalse(self.hive.table_exists('nonya'))

    def test_partition_ddl(self):
        """partition_ddl should emit a PARTITION ... LOCATION clause."""
        d = datetime(2013, 10, 15, 10)
        for table in self.hive.tables:  # iterate keys directly (no .keys())
            interval = self.hive.tables[table]['interval']
            expect = 'PARTITION (%s) LOCATION \'%s/%s\'' % (
                d.strftime(
                    interval_hierarchies[interval]['hive_partition_ddl_format']),
                self.hive.tables[table]['location'],
                d.strftime(interval_hierarchies[interval]['directory_format']))
            self.assertEqual(self.hive.partition_ddl(table, d), expect)

    def test_add_partitions_ddl(self):
        """add_partitions_ddl should join one ALTER TABLE per datetime."""
        table = 'table1'
        interval = 'hourly'
        # (removed unused local ``ddl_format`` -- it was computed but
        # never read.)
        ds = [datetime(2013, 10, 15, 10), datetime(2013, 10, 15, 11)]
        expect_ddls = [
            'PARTITION (%s) LOCATION \'%s/%s\'' % (
                d.strftime(
                    interval_hierarchies[interval]['hive_partition_ddl_format']),
                self.hive.tables[table]['location'],
                d.strftime(interval_hierarchies[interval]['directory_format']))
            for d in ds
        ]
        expect = '\n'.join(
            'ALTER TABLE %s ADD %s;' % (table, partition_ddl)
            for partition_ddl in expect_ddls)
        self.assertEqual(self.hive.add_partitions_ddl(table, ds), expect)

    def test_create_partitions(self):
        # NOTE(review): this test builds ``expect`` but never invokes the
        # method under test nor asserts anything, so it always passes
        # vacuously.  TODO: call the create-partitions API (name not
        # visible from here) and compare its result against ``expect``.
        table = 'table1'
        ddl_format = interval_hierarchies['hourly']['hive_partition_ddl_format']
        ds = [datetime(2013, 10, 15, 10), datetime(2013, 10, 15, 11)]
        expect = "ALTER TABLE table1 ADD\nPARTITION (%s)\nPARTITION (%s);" % (
            ds[0].strftime(ddl_format), ds[1].strftime(ddl_format))

    def test_partition_interval(self):
        for t in self.hive.tables:
            self.assertEqual(self.hive.partition_interval(t),
                             self.hive.tables[t]['interval'])

    def test_table_location(self):
        # Renamed from ``table_location``: without the ``test_`` prefix
        # unittest discovery never ran this test.
        self.assertEqual(self.hive.table_location('table1'),
                         self.hive.tables['table1']['location'])

    def test_partition_location(self):
        """partition_location should be table location + directory path."""
        interval = 'hourly'
        d = datetime(2013, 10, 15, 10)
        expect = self.hive.table_location('table1') + '/' + d.strftime(
            interval_hierarchies[interval]['directory_format'])
        self.assertEqual(self.hive.partition_location('table1', d), expect)
# Configure logging verbosity from the ``verbose`` flag.
log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(level=log_level,
                    format='%(asctime)s %(levelname)-6s %(message)s',
                    datefmt='%Y-%m-%dT%H:%M:%S')

# Resolve the table list: an explicit comma-separated string wins,
# otherwise discover topics from the Kafka brokers named in the Camus
# properties file.
if tables:
    tables = tables.split(',')
else:
    properties = load_properties('/u/apps/camus/shared/camus.properties')
    tables = fetch_kafka_trekkie_topics(properties['kafka.brokers'])

hive = HiveUtils(database, hive_options)
for hdfs_location in tables:
    # Hive table names cannot contain '.' or '-'; the raw name is kept
    # as the HDFS location.
    table_name = hdfs_location.replace('.', '_').replace('-', '_')
    if not hive.table_exists(table_name):
        if dry_run:
            logging.info(str(hive_trekkie_create_table_stmt(table_name,
                                                            hdfs_location)))
        else:
            hive.table_create(table_name, hdfs_location)
    if dry_run:
        logging.info(str(hive.get_missing_partitions_ddl(table_name)))
    else:
        hive.add_missing_partitions(table_name)
class TestHiveUtil(TestCase):
    """Tests for HiveUtils DDL/partition helpers, run against fixture
    metadata installed by ``setUp`` (no live Hive connection is
    exercised by the assertions below).
    """

    def setUp(self):
        self.hive = HiveUtils()
        # Fixture metadata: one hourly and one daily table.
        self.hive.tables = {
            'table1': {
                'partitions': [
                    'year=2013/month=10/day=01/hour=01',
                    'year=2013/month=10/day=01/hour=02',
                ],
                'interval': 'hourly',
                'depth': 4,
                'location': '/path/to/data/table1/hourly',
            },
            'table2': {
                'partitions': ['year=2013/month=10/day=01'],
                'interval': 'daily',
                'depth': 3,
                'location': '/path/to/data/table2/daily',
            },
        }

    def test_table_exists(self):
        self.assertTrue(self.hive.table_exists('table1'))
        self.assertFalse(self.hive.table_exists('nonya'))

    def test_partition_ddl(self):
        """Each table's partition_ddl should match the expected clause."""
        d = datetime(2013, 10, 15, 10)
        for table in self.hive.tables:  # idiom: iterate dict keys directly
            interval = self.hive.tables[table]['interval']
            expect = 'PARTITION (%s) LOCATION \'%s/%s\'' % (
                d.strftime(
                    interval_hierarchies[interval]['hive_partition_ddl_format']),
                self.hive.tables[table]['location'],
                d.strftime(interval_hierarchies[interval]['directory_format']))
            self.assertEqual(self.hive.partition_ddl(table, d), expect)

    def test_add_partitions_ddl(self):
        """add_partitions_ddl should emit one ALTER TABLE per datetime."""
        table = 'table1'
        interval = 'hourly'
        # (removed unused local ``ddl_format``.)
        ds = [datetime(2013, 10, 15, 10), datetime(2013, 10, 15, 11)]
        expect_ddls = [
            'PARTITION (%s) LOCATION \'%s/%s\'' % (
                d.strftime(
                    interval_hierarchies[interval]['hive_partition_ddl_format']),
                self.hive.tables[table]['location'],
                d.strftime(interval_hierarchies[interval]['directory_format']))
            for d in ds
        ]
        expect = '\n'.join(
            'ALTER TABLE %s ADD %s;' % (table, partition_ddl)
            for partition_ddl in expect_ddls)
        self.assertEqual(self.hive.add_partitions_ddl(table, ds), expect)

    def test_create_partitions(self):
        # NOTE(review): ``expect`` is built but nothing is called or
        # asserted -- this test always passes vacuously.  TODO: invoke
        # the create-partitions API (its name is not visible in this
        # file) and assert against ``expect``.
        table = 'table1'
        ddl_format = interval_hierarchies['hourly']['hive_partition_ddl_format']
        ds = [datetime(2013, 10, 15, 10), datetime(2013, 10, 15, 11)]
        expect = "ALTER TABLE table1 ADD\nPARTITION (%s)\nPARTITION (%s);" % (
            ds[0].strftime(ddl_format), ds[1].strftime(ddl_format))

    def test_partition_interval(self):
        for t in self.hive.tables:
            self.assertEqual(self.hive.partition_interval(t),
                             self.hive.tables[t]['interval'])

    def test_table_location(self):
        # Renamed from ``table_location``: unittest only discovers
        # methods starting with ``test``, so this assertion never ran.
        self.assertEqual(self.hive.table_location('table1'),
                         self.hive.tables['table1']['location'])

    def test_partition_location(self):
        interval = 'hourly'
        d = datetime(2013, 10, 15, 10)
        expect = self.hive.table_location('table1') + '/' + d.strftime(
            interval_hierarchies[interval]['directory_format'])
        self.assertEqual(self.hive.partition_location('table1', d), expect)