Beispiel #1
0
 def setUp(self):
     """Create a HiveUtils instance and overwrite its ``tables`` attribute.

     Two fixture tables are installed: ``table1`` (hourly, depth 4, two
     partitions) and ``table2`` (daily, depth 3, one partition).
     """
     hourly_table = dict(
         partitions=[
             'year=2013/month=10/day=01/hour=01',
             'year=2013/month=10/day=01/hour=02',
         ],
         interval='hourly',
         depth=4,
         location='/path/to/data/table1/hourly',
     )
     daily_table = dict(
         partitions=['year=2013/month=10/day=01'],
         interval='daily',
         depth=3,
         location='/path/to/data/table2/daily',
     )

     self.hive = HiveUtils()
     self.hive.tables = {'table1': hourly_table, 'table2': daily_table}
Beispiel #2
0
 def setUp(self):
     """Create a HiveUtils instance and replace its ``tables`` with fixtures.

     ``table1`` is an hourly table (depth 4) with two partitions;
     ``table2`` is a daily table (depth 3) with one partition.
     """
     fixture_tables = {}
     fixture_tables['table1'] = {
         'partitions': [
             'year=2013/month=10/day=01/hour=01',
             'year=2013/month=10/day=01/hour=02',
         ],
         'interval': 'hourly',
         'depth': 4,
         'location': '/path/to/data/table1/hourly',
     }
     fixture_tables['table2'] = {
         'partitions': ['year=2013/month=10/day=01'],
         'interval': 'daily',
         'depth': 3,
         'location': '/path/to/data/table2/daily',
     }

     self.hive = HiveUtils()
     self.hive.tables = fixture_tables
Beispiel #3
0
class TestHiveUtil(TestCase):
    """Tests for HiveUtils, run against table metadata injected in setUp."""

    def setUp(self):
        """Create a HiveUtils instance and overwrite its ``tables`` attribute."""
        self.hive = HiveUtils()
        self.hive.tables = {
            'table1': {
                'partitions': [
                    'year=2013/month=10/day=01/hour=01',
                    'year=2013/month=10/day=01/hour=02',
                ],
                'interval': 'hourly',
                'depth': 4,
                'location': '/path/to/data/table1/hourly',
            },
            'table2': {
                'partitions': ['year=2013/month=10/day=01'],
                'interval': 'daily',
                'depth': 3,
                'location': '/path/to/data/table2/daily',
            },
        }

    def _expected_partition_ddl(self, table, interval, d):
        """Return the expected ``PARTITION (...) LOCATION '...'`` clause.

        The clause is built from the fixture location of ``table`` and the
        ``interval_hierarchies`` formats for ``interval``, applied to the
        datetime ``d``.
        """
        formats = interval_hierarchies[interval]
        return 'PARTITION (%s) LOCATION \'%s/%s\'' % (
            d.strftime(formats['hive_partition_ddl_format']),
            self.hive.tables[table]['location'],
            d.strftime(formats['directory_format']),
        )

    def test_table_exists(self):
        """table_exists is true for a fixture table and false otherwise."""
        self.assertTrue(self.hive.table_exists('table1'))
        self.assertFalse(self.hive.table_exists('nonya'))

    def test_partition_ddl(self):
        """partition_ddl matches the expected clause for every fixture table."""
        d = datetime(2013, 10, 15, 10)
        for table in self.hive.tables:
            interval = self.hive.tables[table]['interval']
            expect = self._expected_partition_ddl(table, interval, d)
            self.assertEqual(self.hive.partition_ddl(table, d), expect)

    def test_add_partitions_ddl(self):
        """add_partitions_ddl emits one ALTER TABLE statement per datetime."""
        table = 'table1'
        interval = 'hourly'
        ds = [datetime(2013, 10, 15, 10), datetime(2013, 10, 15, 11)]

        expect = '\n'.join([
            'ALTER TABLE %s ADD %s;' % (
                table, self._expected_partition_ddl(table, interval, d))
            for d in ds
        ])

        add_statement = self.hive.add_partitions_ddl(table, ds)
        self.assertEqual(add_statement, expect)

    def test_create_partitions(self):
        """Build the expected multi-partition statement (no assertion yet)."""
        # NOTE(review): this test computes ``expect`` but never compares it
        # with anything, so it always passes. The HiveUtils call it was meant
        # to exercise is not visible here -- add the missing assertion.
        table = 'table1'
        ddl_format = interval_hierarchies['hourly'][
            'hive_partition_ddl_format']
        ds = [datetime(2013, 10, 15, 10), datetime(2013, 10, 15, 11)]
        expect = "ALTER TABLE table1 ADD\nPARTITION (%s)\nPARTITION (%s);" % (
            ds[0].strftime(ddl_format), ds[1].strftime(ddl_format))

    def test_partition_interval(self):
        """partition_interval returns each fixture table's 'interval'."""
        for t in self.hive.tables:
            expect = self.hive.tables[t]['interval']
            self.assertEqual(self.hive.partition_interval(t), expect)

    def test_table_location(self):
        """table_location returns the fixture table's 'location'."""
        # Renamed from ``table_location``: without the ``test`` prefix the
        # unittest loader never collected this method, so it never ran.
        self.assertEqual(self.hive.table_location('table1'),
                         self.hive.tables['table1']['location'])

    def test_partition_location(self):
        """partition_location is the table location plus the dated directory."""
        interval = 'hourly'
        d = datetime(2013, 10, 15, 10)
        expect = self.hive.table_location('table1') + '/' + d.strftime(
            interval_hierarchies[interval]['directory_format'])
        self.assertEqual(self.hive.partition_location('table1', d), expect)
    log_level = logging.INFO
    if verbose:
        log_level = logging.DEBUG

    logging.basicConfig(level=log_level,
                        format='%(asctime)s %(levelname)-6s %(message)s',
                        datefmt='%Y-%m-%dT%H:%M:%S')

    if tables:
        tables = tables.split(',')
    else:
        properties = load_properties('/u/apps/camus/shared/camus.properties')
        tables = fetch_kafka_trekkie_topics(properties['kafka.brokers'])

    hive = HiveUtils(database, hive_options)
    for table in tables:
        hdfs_location = table
        table = table.replace('.', '_').replace('-', '_')
        if not hive.table_exists(table):
            if dry_run:
                logging.info(str(hive_trekkie_create_table_stmt(table, hdfs_location)))
            else:
                hive.table_create(table, hdfs_location)

        if dry_run:
            logging.info(str(hive.get_missing_partitions_ddl(table)))
        else:
            hive.add_missing_partitions(table)

Beispiel #5
0
class TestHiveUtil(TestCase):
    """Tests for HiveUtils, run against table metadata injected in setUp."""

    def setUp(self):
        """Create a HiveUtils instance and overwrite its ``tables`` attribute."""
        self.hive = HiveUtils()
        self.hive.tables = {
            'table1': {
                'partitions': ['year=2013/month=10/day=01/hour=01', 'year=2013/month=10/day=01/hour=02'],
                'interval':   'hourly',
                'depth':      4,
                'location':   '/path/to/data/table1/hourly',
            },
            'table2': {
                'partitions': ['year=2013/month=10/day=01'],
                'interval':   'daily',
                'depth':      3,
                'location':   '/path/to/data/table2/daily',
            },
        }

    def _expected_partition_ddl(self, table, interval, d):
        """Return the expected ``PARTITION (...) LOCATION '...'`` clause.

        Uses the fixture location of ``table`` and the ``interval_hierarchies``
        formats for ``interval``, applied to the datetime ``d``.
        """
        formats = interval_hierarchies[interval]
        return 'PARTITION (%s) LOCATION \'%s/%s\'' % (
            d.strftime(formats['hive_partition_ddl_format']),
            self.hive.tables[table]['location'],
            d.strftime(formats['directory_format'])
        )

    def test_table_exists(self):
        """table_exists is true for a fixture table and false otherwise."""
        self.assertTrue(self.hive.table_exists('table1'))
        self.assertFalse(self.hive.table_exists('nonya'))

    def test_partition_ddl(self):
        """partition_ddl matches the expected clause for every fixture table."""
        d = datetime(2013, 10, 15, 10)
        for table in self.hive.tables:
            interval = self.hive.tables[table]['interval']
            expect = self._expected_partition_ddl(table, interval, d)
            ddl = self.hive.partition_ddl(table, d)
            self.assertEqual(ddl, expect)

    def test_add_partitions_ddl(self):
        """add_partitions_ddl emits one ALTER TABLE statement per datetime."""
        table = 'table1'
        interval = 'hourly'
        ds = [datetime(2013, 10, 15, 10), datetime(2013, 10, 15, 11)]

        expect_ddls = [self._expected_partition_ddl(table, interval, d) for d in ds]
        expect = '\n'.join(['ALTER TABLE %s ADD %s;' % (table, partition_ddl) for partition_ddl in expect_ddls])

        add_statement = self.hive.add_partitions_ddl(table, ds)
        self.assertEqual(add_statement, expect)

    def test_create_partitions(self):
        """Build the expected multi-partition statement (no assertion yet)."""
        # NOTE(review): ``expect`` is computed but never compared with
        # anything, so this test always passes. The HiveUtils call it was
        # meant to exercise is not visible here -- add the missing assertion.
        table = 'table1'
        ddl_format = interval_hierarchies['hourly']['hive_partition_ddl_format']
        ds = [datetime(2013, 10, 15, 10), datetime(2013, 10, 15, 11)]
        expect = "ALTER TABLE table1 ADD\nPARTITION (%s)\nPARTITION (%s);" % (ds[0].strftime(ddl_format), ds[1].strftime(ddl_format))

    def test_partition_interval(self):
        """partition_interval returns each fixture table's 'interval'."""
        for t in self.hive.tables:
            expect = self.hive.tables[t]['interval']
            self.assertEqual(self.hive.partition_interval(t), expect)

    def test_table_location(self):
        """table_location returns the fixture table's 'location'."""
        # Renamed from ``table_location``: without the ``test`` prefix the
        # unittest loader never collected this method, so it never ran.
        self.assertEqual(self.hive.table_location('table1'), self.hive.tables['table1']['location'])

    def test_partition_location(self):
        """partition_location is the table location plus the dated directory."""
        interval = 'hourly'
        d = datetime(2013, 10, 15, 10)
        expect = self.hive.table_location('table1') + '/' + d.strftime(interval_hierarchies[interval]['directory_format'])
        self.assertEqual(self.hive.partition_location('table1', d), expect)