def test_dataset_schema(self):
    """Defaulting/validating a dataset must change its dict representation.

    The data format is created with ``num_header_rows`` left as ``None``.
    The dataset is serialized once before and once after being passed
    through ``default_and_validate`` with ``dataset_schema()``, and the two
    serialized forms are expected to differ.
    """
    schema_columns = [
        Column('c1', DataType.VARCHAR, 50),
        Column('c2', DataType.BIGINT),
    ]
    # Third positional argument is num_header_rows; deliberately unset here.
    parquet_format = DataFormat(FileFormat.PARQUET, RowFormat.NONE, None)
    dataset = Dataset(data=DatasetData(
        'test-dataset',
        'test_dataset_table',
        's3://bucket/prefix',
        parquet_format,
        schema_columns,
    ))

    before = dataset.to_dict()
    validated = default_and_validate(dataset, dataset_schema())
    after = validated.to_dict()

    # num_header_rows should have been defaulted to 0, making these unequal
    self.assertNotEqual(before, after)
def test_hive_table_definition_step(self):
    """Compare the HQL generated by ``hive_copy_to_table`` with a stored fixture.

    Steps:
      1. Build a JSON/TEXTFILE source dataset definition.
      2. Copy the ``copy_to_table.hql`` template into a scratch directory
         under ``/tmp/dart-emr-test/hive/``.
      3. Derive a target dataset (RCFILE) and a stage dataset (all columns
         typed as STRING) from the source definition.
      4. Run ``hive_copy_to_table`` and assert that the file it produced
         equals ``copy_to_table_owen_eu.hql`` located next to this test.
    """
    # Each entry is (name, data_type, path) with an optional 4th element
    # holding the date_pattern for timestamp columns.
    column_specs = [
        ('host', 'STRING', 'metadata.host'),
        ('pageName', 'STRING', 'owen.context.pageName'),
        ('viewInstanceUuid', 'STRING', 'owen.context.viewInstanceUuid'),
        ('previousPageName', 'STRING', 'owen.context.previousPageName'),
        ('previousViewInstanceUuid', 'STRING', 'owen.context.previousViewInstanceUuid'),
        ('session', 'STRING', 'owen.context.session'),
        ('pageType', 'STRING', 'owen.context.pageType'),
        ('propertyName', 'STRING', 'owen.context.propertyName'),
        ('enviroment', 'STRING', 'owen.context.environment'),
        ('appForegroundFlag', 'BOOLEAN', 'owen.context.appForegroundFlag'),
        ('bluetoothEnabledFlag', 'BOOLEAN', 'owen.context.bluetoothEnabledFlag'),
        ('favoriteFlag', 'BOOLEAN', 'owen.context.favoriteFlag'),
        ('locationEnabledFlag', 'BOOLEAN', 'owen.context.locationEnabledFlag'),
        ('loggedInFlag', 'BOOLEAN', 'owen.context.loggedInFlag'),
        ('notificationEnabledFlag', 'BOOLEAN', 'owen.context.notificationEnabledFlag'),
        ('personalizationFlag', 'BOOLEAN', 'owen.context.personalizationFlag'),
        ('advertiserUuid', 'STRING', 'owen.context.advertiserUuid'),
        ('udid', 'STRING', 'owen.context.udid'),
        ('userQualifier', 'STRING', 'owen.context.userQualifier'),
        ('userId', 'STRING', 'owen.context.custom.legacy.userId'),
        ('userUuid', 'STRING', 'owen.context.userUuid'),
        ('macAddress', 'STRING', 'owen.context.macAddress'),
        ('ipAddress', 'STRING', 'owen.context.ipAddress'),
        ('osVersion', 'STRING', 'owen.context.osVersion'),
        ('osFamily', 'STRING', 'owen.context.osFamily'),
        ('osName', 'STRING', 'owen.context.osName'),
        ('browserFamily', 'STRING', 'owen.context.browserFamily'),
        ('deviceCategory', 'STRING', 'owen.context.deviceCategory'),
        ('deviceMake', 'STRING', 'owen.context.mobileDeviceMake'),
        ('deviceModel', 'STRING', 'owen.context.mobileDeviceModel'),
        ('connectionType', 'STRING', 'owen.context.connectionType'),
        ('userAgent', 'STRING', 'owen.context.userAgent'),
        ('geofenceId', 'STRING', 'owen.context.custom.legacy.geofenceId'),
        ('eventTimestamp', 'TIMESTAMP', 'owen.event.eventTimestamp', "yyyy-MM-dd'T'HH:mm:ssZ"),
        ('eventInstanceUuid', 'STRING', 'owen.event.eventInstanceUuid'),
        ('eventPlatformVersion', 'STRING', 'owen.event.eventPlatformVersion'),
        ('eventVersion', 'STRING', 'owen.event.eventVersion'),
        ('eventCategory', 'STRING', 'owen.event.eventCategory'),
        ('eventName', 'STRING', 'owen.event.eventName'),
        ('eventAction', 'STRING', 'owen.event.eventAction'),
        ('eventPlatform', 'STRING', 'owen.event.eventPlatform'),
        ('testUnixTimestampSecondsPattern', 'TIMESTAMP',
         'some.fake.path.testUnixTimestampSecondsPattern', 'UNIX_TIMESTAMP_SECONDS'),
        ('testUnixTimestampMillisPattern', 'TIMESTAMP',
         'some.fake.path.testUnixTimestampMillisPattern', 'UNIX_TIMESTAMP_MILLIS'),
    ]

    source_format = DataFormat(
        file_format=FileFormat.TEXTFILE,
        row_format=RowFormat.JSON,
    )

    source_columns = []
    for spec in column_specs:
        col_name, col_type, col_path = spec[:3]
        if len(spec) > 3:
            # Only pass date_pattern when the spec supplies one, so the
            # remaining columns keep Column's own default for it.
            source_columns.append(Column(col_name, col_type, path=col_path, date_pattern=spec[3]))
        else:
            source_columns.append(Column(col_name, col_type, path=col_path))

    partition_columns = [Column(part_name, 'STRING') for part_name in ('year', 'week', 'day')]

    source_dataset = Dataset(data=DatasetData(
        name='owen_eu_v01',
        table_name='owen_eu',
        location='s3://s3-rpt-uss-dat-warehouse/prd/inbound/overlord/eu-all-events',
        data_format=source_format,
        columns=source_columns,
        compression='GZIP',
        partitions=partition_columns,
    ))

    # Stage the HQL template where hive_copy_to_table is pointed at below.
    call('mkdir -p /tmp/dart-emr-test/hive/')
    this_path = os.path.dirname(os.path.abspath(__file__))
    template_source = this_path + '/../../../engine/emr/steps/hive/copy_to_table.hql'
    template_destination = '/tmp/dart-emr-test/hive/copy_to_table.hql'
    shutil.copyfile(template_source, template_destination)

    action_id = 'actionid123'

    # Target dataset: a round-tripped copy of the source with an RCFILE format.
    target_dataset = Dataset.from_dict(source_dataset.to_dict())
    # NOTE(review): this value is set on a data_format object that is replaced
    # on the very next statement - confirm whether it is still needed.
    target_dataset.data.data_format.num_header_rows = 0
    target_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE)

    # Stage dataset: another round-tripped copy with every column typed STRING.
    stage_dataset = Dataset.from_dict(source_dataset.to_dict())
    assert isinstance(stage_dataset, Dataset)
    for stage_column in stage_dataset.data.columns:
        stage_column.data_type = DataType.STRING

    hive_copy_to_table(
        stage_dataset, 'owen_eu_stage',
        target_dataset, 'owen_eu',
        's3://test', '/tmp/dart-emr-test/',
        action_id, None, 1, 1,
    )

    expected_path = os.path.join(this_path, 'copy_to_table_owen_eu.hql')
    with open(expected_path) as expected_file:
        expected_contents = expected_file.read()

    actual_path = '/tmp/dart-emr-test/hive/copy_to_table_owen_eu.hql'
    with open(actual_path) as actual_file:
        actual_contents = actual_file.read()

    self.assertEqual(expected_contents, actual_contents)