def tearDown(self):
    # For "household_power_consumption" and then "small", drop three tables:
    # the default one, the type="work" one and the stage="valid" one.
    drop_targets = (
        ({}, {}),
        ({}, {"type": "work"}),
        ({"stage": "valid"}, {}),
    )
    for file_key in ("household_power_consumption", "small"):
        metadata = reg.get_metadata(file_key)
        for db_options, table_options in drop_targets:
            qualified_name = (reg.db_name(metadata, **db_options) + '.' +
                              reg.db_table(metadata, **table_options))
            self.t_hive._query("drop table if exists " + qualified_name)
def test_register_valid(self):
    # The value returned by register_valid() must pass validate_uuid().
    file_name = 'sample.txt'
    metadata = registry.get_metadata(file_name)
    registered = registry.register_valid(
        metadata, uuid4(), file_name, 123, 'validation query')
    self.assertTrue(registry.validate_uuid(registered))
def test_sandbox_nodelim(self):
    # Creating the "no_delim" table from a ';'-separated header is expected
    # to describe as a single string column named column1.
    metadata = reg.get_metadata("no_delim")
    test_header = (
        'Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;'
        'Sub_metering_1;Sub_metering_2;Sub_metering_3'
    )
    self.t_hive.create_hive_table(metadata, reset=True, header=test_header)
    qualified_name = reg.db_name(metadata) + '.' + reg.db_table(metadata)
    described = self.t_hive._query('describe ' + qualified_name)
    self.assertEqual(described, [('column1', 'string', '')])
def test_raw_hive_work(self):
    # The type="work" table for "small" is expected to describe as three
    # string columns, each with the comment 'from deserializer'.
    metadata = reg.get_metadata("small")
    self.t_hive.create_hive_table(metadata, reset=True, type="work")
    qualified_name = (reg.db_name(metadata) + '.' +
                      reg.db_table(metadata, type="work"))
    described = self.t_hive._query('describe ' + qualified_name)
    column_names = ('date_field', 'time_field', 'globalactivepower')
    expected = [(column, 'string', 'from deserializer')
                for column in column_names]
    self.assertEqual(described, expected)
def test_RegisteredFile(self):
    # get_metadata('sampleFile.db') is expected to return the "sample"
    # registration with an empty field list.
    # The metadata is fetched once so the dump printed below is exactly the
    # value that gets asserted.
    metadata = registry.get_metadata('sampleFile.db')
    # Call form of print: valid on both Python 2 and Python 3.
    print(json.dumps(metadata, indent=4))
    self.assertEqual(
        metadata,
        {
            "fields": [],
            "file": {
                "dataPartition": "none",
                "technical": {
                    "tableName": "sample"
                },
                "subjectArea": "test",
                "deleted": "true",
                "key": "sample",
                "guid": "7e1a614c-9570-42a6-9bc7-315f2b6218be"
            }
        }
    )
def test_z_copy_compare_append(self):
    # z in the name so this test runs last in Pycharm
    # Runs copy_and_compare() for "append_power_consumption", then logs the
    # raw work and raw paths of the gzipped sample file. No assertions are
    # made; the test passes as long as nothing raises.
    file_key = "append_power_consumption"
    metadata = reg.get_metadata(file_key)
    self.t_hive.copy_and_compare(metadata, uuid4(), file_key)
    archive_suffix = "/household_power_consumption_50.txt.gz"
    raw_work = reg.file_path(metadata, stage="raw", type='work') + archive_suffix
    raw_reg = reg.file_path(metadata, stage="raw") + archive_suffix
    for label, path in (("raw_work : ", raw_work), ("raw regular : ", raw_reg)):
        logging.info(label + path)
def test_z_copy_compare_full(self):
    # z in the name so this test runs last in Pycharm
    # Logs the valid path and the raw path of the gzipped sample file, then
    # runs copy_and_compare() for "household_power_consumption". No
    # assertions are made; the test passes as long as nothing raises.
    file_key = "household_power_consumption"
    metadata = reg.get_metadata(file_key)
    raw_reg = (reg.file_path(metadata, stage="raw") +
               "/household_power_consumption_50.txt.gz")
    valid = reg.file_path(metadata, stage="valid")
    for label, path in (("valid work : ", valid), ("raw regular : ", raw_reg)):
        logging.info(label + path)
    self.t_hive.copy_and_compare(metadata, uuid4(), file_key)
def test_valid_hive(self):
    # The stage="valid" table for "small" is expected to describe as four
    # columns followed by a partition-information section that lists
    # time_field and date_field.
    metadata = reg.get_metadata("small")
    self.t_hive.create_hive_table(metadata, reset=True, stage="valid")
    qualified_name = (reg.db_name(metadata, stage="valid") + '.' +
                      reg.db_table(metadata))
    described = self.t_hive._query('describe ' + qualified_name)
    blank_row = ('', None, None)
    expected = [
        ('globalactivepower', 'decimal(8,3)', ''),
        ('instance_guid', 'string', ''),
        ('time_field', 'string', ''),
        ('date_field', 'string', ''),
        blank_row,
        ('# Partition Information', None, None),
        ('# col_name ', 'data_type ', 'comment '),
        blank_row,
        ('time_field', 'string', ''),
        ('date_field', 'string', ''),
    ]
    self.assertEqual(described, expected)
def test_sandbox_full_MD(self):
    # The "fullsand" table is expected to describe as nine string columns,
    # each with the comment 'from deserializer'.
    metadata = reg.get_metadata("fullsand")
    self.t_hive.create_hive_table(metadata, reset=True)
    qualified_name = reg.db_name(metadata) + '.' + reg.db_table(metadata)
    described = self.t_hive._query('describe ' + qualified_name)
    column_names = (
        'date_field',
        'time_field',
        'globalactivepower',
        'globalreactivepower',
        'voltage',
        'globalintensity',
        'submetering1',
        'submetering2',
        'submetering3',
    )
    expected = [(column, 'string', 'from deserializer')
                for column in column_names]
    self.assertEqual(described, expected)
def test_valid_hive_work(self):
    # The type="work", stage="valid" table for "small" is expected to
    # describe as three data columns plus instance_guid.
    metadata = reg.get_metadata("small")
    self.t_hive.create_hive_table(metadata, reset=True, type="work", stage="valid")
    work_table = (reg.db_name(metadata, type="work", stage="valid") + '.' +
                  reg.db_table(metadata, type="work", stage="valid"))
    described = self.t_hive._query('describe ' + work_table)
    expected = [
        ("date_field", "string", ""),
        ("time_field", "string", ""),
        ("globalactivepower", "decimal(8,3)", ""),
        ("instance_guid", "string", ""),
    ]
    self.assertEqual(described, expected)
    # NOTE(review): this drops the default db/table, not the valid work table
    # created above, and is skipped when the assertion fails -- confirm intent.
    default_table = reg.db_name(metadata) + '.' + reg.db_table(metadata)
    self.t_hive._query("drop table if exists " + default_table)
def test_sandbox_header_delim(self):
    # Creating the "sandbox" table from a ';'-separated header is expected to
    # describe as nine string columns, each commented 'from deserializer'.
    metadata = reg.get_metadata("sandbox")
    test_header = (
        'Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;'
        'Sub_metering_1;Sub_metering_2;Sub_metering_3'
    )
    self.t_hive.create_hive_table(metadata, reset=True, header=test_header)
    qualified_name = reg.db_name(metadata) + '.' + reg.db_table(metadata)
    described = self.t_hive._query('describe ' + qualified_name)
    column_names = (
        'date_field',
        'time_field',
        'globalactivepower',
        'globalreactivepower',
        'voltage',
        'globalintensity',
        'submetering1',
        'submetering2',
        'submetering3',
    )
    expected = [(column, 'string', 'from deserializer')
                for column in column_names]
    self.assertEqual(described, expected)
def test_db_name_refined(self):
    # db_name() with stage='refined' is expected to return 'dev_none_test'.
    metadata = registry.get_metadata('sample.txt')
    actual = registry.db_name(metadata, stage='refined')
    self.assertEqual(actual, 'dev_none_test', "Incorrect db returned")
def test_file_path_stage(self):
    # file_path() with stage='valid' is expected to sit under
    # registry.data_root at /none/test/valid/sample.
    metadata = registry.get_metadata('sample')
    actual = registry.file_path(metadata, stage='valid')
    expected = registry.data_root + '/none/test/valid/sample'
    self.assertEqual(actual, expected, "incorrect file path")
def test_file_path_work(self):
    # file_path() with type='work' is expected to sit under
    # registry.data_root at /none/test/raw_work/sample. The path is looked up
    # once for the log line and once more for the assertion.
    logged_path = registry.file_path(registry.get_metadata('sample'), type='work')
    logger.info(logged_path)
    actual = registry.file_path(registry.get_metadata('sample'), type='work')
    expected = registry.data_root + '/none/test/raw_work/sample'
    self.assertEqual(actual, expected, "incorrect file path")
def test_file_path_work_sandbox(self):
    # file_path() with type='work' for the 'sandbox' metadata is compared
    # against a fixed absolute path. The path is looked up once for the log
    # line and once more for the assertion.
    # NOTE(review): the expected value is hard-coded rather than derived from
    # registry.data_root -- confirm it matches the configured data root.
    logged_path = registry.file_path(registry.get_metadata('sandbox'), type='work')
    logger.info(logged_path)
    actual = registry.file_path(registry.get_metadata('sandbox'), type='work')
    expected = '/user/cloudera/data/sandbox/bria644/household_electric_power_consumption_work'
    self.assertEqual(actual, expected)
def test_db_name_test(self):
    # db_name() with env='test' is expected to return 'test_none_test_raw'.
    metadata = registry.get_metadata('sample.txt')
    actual = registry.db_name(metadata, env='test')
    self.assertEqual(actual, 'test_none_test_raw')
def test_db_name(self):
    # db_name() with no options is expected to return 'dev_none_test_raw'.
    metadata = registry.get_metadata('sample')
    actual = registry.db_name(metadata)
    self.assertEqual(actual, 'dev_none_test_raw')
def test_key_matched(self):
    # key_matched() on the 'sample' metadata is expected to return 'sample'.
    # The failure message used to read "Didn't get UUID", which does not
    # describe this check; it now names what is actually being compared.
    metadata = registry.get_metadata('sample')
    self.assertEqual(registry.key_matched(metadata), 'sample',
                     "Incorrect matched key")
def test_db_name_valid(self):
    # db_name() with stage='valid' is expected to return 'dev_none_test'.
    metadata = registry.get_metadata('sampleFile.txt')
    actual = registry.db_name(metadata, stage='valid')
    self.assertEqual(actual, 'dev_none_test')
def test_file_path_sandbox(self):
    # Expected layout: service_account/data/sandbox/uid/table_name
    metadata = registry.get_metadata('sandbox')
    actual = registry.file_path(metadata)
    expected = (registry.data_root +
                '/sandbox/bria644/household_electric_power_consumption')
    self.assertEqual(actual, expected)
def test_register_raw(self):
    # The value returned by register_raw() must pass validate_uuid().
    file_name = 'household_power_consumption_50.txt.gz'
    metadata = registry.get_metadata(file_name)
    registered = registry.register_raw(metadata, file_name, 'raw', 123)
    self.assertTrue(registry.validate_uuid(registered), "Didn't get UUID")
def test_NotRegisteredFile(self):
    # get_metadata('badfile.db') is expected to return an empty skeleton:
    # no fields and an empty file entry.
    actual = registry.get_metadata('badfile.db')
    empty_metadata = {u'fields': [], u'file': {}}
    self.assertEqual(actual, empty_metadata)
def test_db_table(self):
    # db_table() with no options is expected to return 'sample'.
    metadata = registry.get_metadata('sample')
    actual = registry.db_table(metadata)
    self.assertEqual(actual, 'sample', "wrong db table")
def test_db_table_work(self):
    # db_table() with type='work' is expected to return 'sample_work'. The
    # name is looked up once for the log line and once more for the assertion.
    logged_name = registry.db_table(registry.get_metadata('sample'), type='work')
    logger.info(logged_name)
    actual = registry.db_table(registry.get_metadata('sample'), type='work')
    self.assertEqual(actual, 'sample_work', "Bad Work Table")
def test_register_invalid_multireason(self):
    # The value returned by register_invalid(), when called with two
    # reason entries, must pass validate_uuid().
    file_name = 'sample.txt'
    metadata = registry.get_metadata(file_name)
    guid = uuid4()
    reasons = {"datatypeMismatch": 3, "rowCountMismatch": 12}
    registered = registry.register_invalid(
        metadata, guid, file_name, reasons, 123, 'validation_query')
    self.assertTrue(registry.validate_uuid(registered))
def test_file_path(self):
    # file_path() with no options is expected to sit under
    # registry.data_root at /none/test/raw/sample.
    metadata = registry.get_metadata('sample')
    actual = registry.file_path(metadata)
    expected = registry.data_root + '/none/test/raw/sample'
    self.assertEqual(actual, expected)
def setUp(self):
    # Build a fresh Hive helper and (re)create two tables for
    # "household_power_consumption": first type="work", then stage="valid".
    self.t_hive = validator.Hive()
    metadata = reg.get_metadata("household_power_consumption")
    for table_options in ({"type": "work"}, {"stage": "valid"}):
        self.t_hive.create_hive_table(metadata, reset=True, **table_options)
def test_template_guid(self):
    # template_guid() on the 'sample' metadata is expected to return the
    # GUID below.
    metadata = registry.get_metadata('sample')
    actual = registry.template_guid(metadata)
    expected = '7e1a614c-9570-42a6-9bc7-315f2b6218be'
    self.assertEqual(actual, expected, "Didn't get UUID")
def test_delta(self):
    # Creates the stage="valid", type='work' table for
    # "householdElectricPowerConsumption" and then runs delta() on it.
    # No assertions are made; the test passes as long as nothing raises.
    metadata = reg.get_metadata("householdElectricPowerConsumption")
    table_options = {"stage": "valid", "type": 'work'}
    self.t_hive.create_hive_table(metadata, **table_options)
    self.t_hive.delta(metadata)
def test_db_name_work(self):
    # db_name() with type='work' is expected to return 'dev_none_test_raw'.
    metadata = registry.get_metadata('sample.txt')
    actual = registry.db_name(metadata, type='work')
    self.assertEqual(actual, 'dev_none_test_raw')