def test_dataset_schema_invalid(self):
    with self.assertRaises(DartValidationException) as context:
        columns = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
        df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        location = None
        ds = Dataset(data=DatasetData(
            name='test-dataset',
            table_name='test_dataset_table',
            load_type=LoadType.INSERT,
            location=location,
            data_format=df,
            columns=columns,
            tags=[],
        ))
        # should fail because location is required
        default_and_validate(ds, dataset_schema())

    self.assertTrue(isinstance(context.exception, DartValidationException))
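# For contrast, a minimal sketch of the passing case, assuming the same schema helpers as
# above: the only difference is that location is populated, so default_and_validate should
# return without raising. The location value here is hypothetical.
def test_dataset_schema_valid_sketch(self):
    columns = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
    df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
    ds = Dataset(data=DatasetData(
        name='test-dataset',
        table_name='test_dataset_table',
        load_type=LoadType.INSERT,
        location='s3://example-bucket/test-dataset',  # hypothetical location
        data_format=df,
        columns=columns,
        tags=[],
    ))
    # no DartValidationException expected
    default_and_validate(ds, dataset_schema())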
def setUp(self):
    dart = Dart(host='localhost', port=5000)
    """ :type dart: dart.client.python.dart_client.Dart """
    self.dart = dart

    cs = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
    df = DataFormat(FileFormat.TEXTFILE, RowFormat.DELIMITED)
    dataset_data = DatasetData('test-dataset', 'test_dataset_table',
                               's3://' + os.environ['DART_TEST_BUCKET'] + '/impala', df, cs)
    self.dataset = self.dart.save_dataset(Dataset(data=dataset_data))

    start = 's3://' + os.environ['DART_TEST_BUCKET'] + '/impala/impala'
    end = 's3://' + os.environ['DART_TEST_BUCKET'] + '/impala/install'
    regex = '.*\\.rpm'
    ds = Subscription(data=SubscriptionData('test-subscription', self.dataset.id, start, end, regex))
    self.subscription = self.dart.save_subscription(ds)

    dst_args = {'action_sleep_time_in_seconds': 0}
    dst = Datastore(data=DatastoreData('test-datastore', 'no_op_engine', args=dst_args, state=DatastoreState.TEMPLATE))
    self.datastore = self.dart.save_datastore(dst)

    wf = Workflow(data=WorkflowData('test-workflow', self.datastore.id, state=WorkflowState.ACTIVE))
    self.workflow = self.dart.save_workflow(wf, self.datastore.id)

    a_args = {'subscription_id': self.subscription.id}
    a0 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name,
                                state=ActionState.TEMPLATE))
    a1 = Action(data=ActionData(NoOpActionTypes.consume_subscription.name, NoOpActionTypes.consume_subscription.name,
                                a_args, state=ActionState.TEMPLATE))
    self.action0, self.action1 = self.dart.save_actions([a0, a1], workflow_id=self.workflow.id)
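# A cleanup sketch for the entities created above, assuming the client exposes delete_*
# counterparts to the save_* calls used in setUp. The method names below are assumptions
# and may differ in the actual dart client API.
def tearDown(self):
    for a in [self.action0, self.action1]:
        self.dart.delete_action(a.id)                     # assumed API
    self.dart.delete_workflow(self.workflow.id)           # assumed API
    self.dart.delete_datastore(self.datastore.id)         # assumed API
    self.dart.delete_subscription(self.subscription.id)   # assumed API
    self.dart.delete_dataset(self.dataset.id)             # assumed API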
from dart.model.dataset import LoadType

if __name__ == '__main__':
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    dataset = dart.save_dataset(
        Dataset(data=(DatasetData(
            name='beacon_native_app_parsed_v01',
            table_name='beacon_native_app',
            location='s3://example-bucket/nb.retailmenot.com/parsed_logs',
            load_type=LoadType.INSERT,
            data_format=DataFormat(
                FileFormat.TEXTFILE,
                RowFormat.DELIMITED,
                delimited_by='\t',
                quoted_by='"',
                escaped_by='\\',
                null_string='NULL',
                num_header_rows=1,
            ),
            compression=Compression.NONE,
            partitions=[
                Column('year', DataType.STRING),
                Column('week', DataType.STRING),
            ],
            columns=[
                Column('logFileId', DataType.BIGINT),
                Column('lineNumber', DataType.INT),
                Column('created', DataType.TIMESTAMP, date_pattern="yyyy-MM-dd HH:mm:ss"),
                Column('remoteip', DataType.STRING),
from dart.model.action import ActionData
from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, FileFormat, RowFormat, DataType, Compression
from dart.model.datastore import Datastore, DatastoreData, DatastoreState

if __name__ == '__main__':
    dart = Dart('localhost', 5000)
    # dart = Dart()
    assert isinstance(dart, Dart)

    dataset = dart.save_dataset(Dataset(data=DatasetData(
        name='weblogs_v01',
        table_name='weblogs',
        location='s3://example-bucket/weblogs/www.retailmenot.com/ec2/',
        data_format=DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.REGEX,
            regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
            regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s",
        ),
        columns=[
            Column('ip', DataType.STRING),
            Column('user', DataType.STRING),
            Column('requestDate', DataType.TIMESTAMP, date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', DataType.STRING),
            Column('urlPath', DataType.STRING),
            Column('queryString', DataType.STRING),
            Column('httpVersion', DataType.STRING),
            Column('statusCode', DataType.STRING),
            Column('bytesSent', DataType.INT),
            Column('referrer', DataType.STRING),
            Column('userAgent', DataType.STRING),
            Column('responseTime', DataType.BIGINT),
from dart.model.trigger import TriggerData
from dart.model.workflow import Workflow, WorkflowState
from dart.model.workflow import WorkflowData

if __name__ == '__main__':
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    dataset = dart.save_dataset(
        Dataset(data=(DatasetData(
            name='owen_eu_DW-3213_v3',
            table_name='owen_eu',
            location='s3://example-bucket/prd/inbound/overlord/eu-all-events',
            load_type=LoadType.MERGE,
            data_format=DataFormat(
                file_format=FileFormat.TEXTFILE,
                row_format=RowFormat.JSON,
            ),
            compression=Compression.GZIP,
            partitions=[
                Column('year', DataType.STRING),
                Column('month', DataType.STRING),
                Column('day', DataType.STRING),
            ],
            columns=[
                Column('host', DataType.STRING, path='metadata.host'),
                Column('referer', DataType.STRING, path='metadata.referer'),
                Column('userAgent', DataType.STRING, path='owen.context.userAgent'),
                Column('ipAddress', DataType.STRING,
from dart.model.event import EventData
from dart.model.trigger import Trigger, TriggerData
from dart.model.workflow import Workflow, WorkflowState
from dart.model.workflow import WorkflowData

if __name__ == '__main__':
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    dataset = dart.save_dataset(Dataset(data=(DatasetData(
        name='beacon_native_app_v02',
        table_name='beacon_native_app',
        location='s3://example-bucket/prd/beacon/native_app/v2/parquet/snappy',
        hive_compatible_partition_folders=True,
        load_type=LoadType.INSERT,
        data_format=DataFormat('parquet'),
        columns=[
            Column('logFileId', DataType.BIGINT),
            Column('lineNumber', DataType.INT),
            Column('created', DataType.BIGINT),
            Column('remoteip', DataType.STRING),
            Column('useragent', DataType.STRING),
            Column('eventType', DataType.STRING),
            Column('appVersion', DataType.STRING),
            Column('advertiserID', DataType.STRING),
            Column('couponsOnPage', DataType.INT),
            Column('coupons', DataType.STRING),
            Column('channel', DataType.STRING),
            Column('geoCouponCount', DataType.STRING),
            Column('geofence', DataType.STRING),
            Column('geofenceTimeSpent', DataType.STRING),
def prepare_load_dataset_steps(dry_run, args_by_name, datastore, dataset, action_id, s3_path_and_file_size_gen,
                               target_is_dynamodb=False):
    """ :type dataset: dart.model.dataset.Dataset """

    def add_to(step_partials, step_num, func, *args):
        # add all params except the last one, which is the total steps (known at the end)
        step_partials.append(functools.partial(func, *(list(args) + [step_num])))
        return step_num + 1

    def stage_table_not_needed(ds, file_format, row_format, compression, delimited_by, quoted_by, escaped_by,
                               null_string):
        """ :type ds: dart.model.dataset.Dataset """
        return file_format == ds.data.data_format.file_format\
            and row_format == ds.data.data_format.row_format\
            and compression == ds.data.compression\
            and delimited_by == ds.data.data_format.delimited_by\
            and quoted_by == ds.data.data_format.quoted_by\
            and escaped_by == ds.data.data_format.escaped_by\
            and null_string == ds.data.data_format.null_string

    # some steps require producing a dataset specific file based on a template, so we will copy all to a tempdir
    tempdir = tempfile.mkdtemp()
    try:
        local_step_path, s3_step_path, s3_temp_path = prepare_step_paths(datastore, tempdir)

        target_table_name = args_by_name.get('target_table_name') or dataset.data.table_name
        target_file_format = args_by_name.get('target_file_format') or dataset.data.data_format.file_format
        target_row_format = args_by_name.get('target_row_format') or dataset.data.data_format.row_format
        target_compression = args_by_name.get('target_compression') or dataset.data.compression
        target_delimited_by = args_by_name.get('target_delimited_by') or dataset.data.data_format.delimited_by
        target_quoted_by = args_by_name.get('target_quoted_by') or dataset.data.data_format.quoted_by
        target_escaped_by = args_by_name.get('target_escaped_by') or dataset.data.data_format.escaped_by
        target_null_string = args_by_name.get('target_null_string') or dataset.data.data_format.null_string

        stage_table_name = target_table_name + '_stage_for_action_' + action_id
        staging_not_needed = stage_table_not_needed(dataset, target_file_format, target_row_format,
                                                    target_compression, target_delimited_by, target_quoted_by,
                                                    target_escaped_by, target_null_string)
        first_table_name = target_table_name if staging_not_needed and not target_is_dynamodb else stage_table_name
        drop_table_names = []
        step_funcs = []
        i = 1

        # ------------------------------------------------------------------------------------------------------------
        # all code paths below require copying the data to HDFS, and lowercasing the table is required because of hive
        # ------------------------------------------------------------------------------------------------------------
        i = add_to(step_funcs, i, s3distcp_files_step, s3_path_and_file_size_gen, first_table_name.lower(), dataset,
                   s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # not all folder structures on s3 are hive compatible... if not, rename directories after copying
        # ------------------------------------------------------------------------------------------------------------
        if dataset.data.partitions and not dataset.data.hive_compatible_partition_folders:
            i = add_to(step_funcs, i, python_fix_partition_folder_names, first_table_name.lower(),
                       dataset.data.partitions, s3_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # special case to share functionality with the dynamodb_engine
        # ------------------------------------------------------------------------------------------------------------
        if target_is_dynamodb:
            dyn_dataset = Dataset.from_dict(dataset.to_dict())
            assert isinstance(dyn_dataset, Dataset)
            dyn_dataset.data.data_format = DataFormat('DYNAMODB_TABLE', RowFormat.NONE, 0)
            dyn_dataset.data.compression = Compression.NONE
            dyn_dataset.data.columns = [Column(c.name, dynamodb_column_type(c)) for c in dataset.data.columns]
            set_hive_vars = 'SET dynamodb.retry.duration = 0;\nSET dynamodb.throughput.write.percent = %s;'
            set_hive_vars = set_hive_vars % args_by_name['write_capacity_utilization_percent']

            i = add_to(step_funcs, i, hive_table_definition_step, stage_table_name, dataset, s3_step_path,
                       local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, dyn_dataset, s3_step_path,
                       local_step_path, action_id, True)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, stage_table_name, s3_step_path, action_id)
            i = add_to(step_funcs, i, hive_copy_to_table, dataset, stage_table_name, dyn_dataset, target_table_name,
                       s3_step_path, local_step_path, action_id, set_hive_vars)

        # ------------------------------------------------------------------------------------------------------------
        # if no stage tables are needed, much complexity can be skipped
        # ------------------------------------------------------------------------------------------------------------
        elif staging_not_needed:
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, dataset, s3_step_path,
                       local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, target_table_name, s3_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # one or more staging tables are needed
        # ------------------------------------------------------------------------------------------------------------
        else:
            stage_dataset = dataset
            target_dataset = Dataset.from_dict(dataset.to_dict())
            target_dataset.data.data_format = DataFormat(target_file_format, target_row_format, 0, target_delimited_by,
                                                         target_quoted_by, target_escaped_by, target_null_string)
            target_dataset.data.compression = target_compression
            drop_table_names.append(stage_table_name)

            # --------------------------------------------------------------------------------------------------------
            # define string types for JSON/REGEX based datasets (safe), and we will cast appropriately during insert
            # --------------------------------------------------------------------------------------------------------
            if stage_dataset.data.data_format.row_format in [RowFormat.JSON, RowFormat.REGEX]:
                # make a copy since we are modifying the columns
                stage_dataset = Dataset.from_dict(dataset.to_dict())
                assert isinstance(stage_dataset, Dataset)
                for c in stage_dataset.data.columns:
                    c.data_type = DataType.STRING

            i = add_to(step_funcs, i, hive_table_definition_step, stage_table_name, stage_dataset, s3_step_path,
                       local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_table_definition_step, target_table_name, target_dataset, s3_step_path,
                       local_step_path, action_id, False)
            i = add_to(step_funcs, i, hive_msck_repair_table_step, stage_table_name, s3_step_path, action_id)

            # --------------------------------------------------------------------------------------------------------
            # hive has issues creating parquet files
            # --------------------------------------------------------------------------------------------------------
            if target_file_format != FileFormat.PARQUET:
                i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, target_dataset,
                           target_table_name, s3_step_path, local_step_path, action_id, None)

            # --------------------------------------------------------------------------------------------------------
            # impala is better for creating parquet files
            # --------------------------------------------------------------------------------------------------------
            else:
                # ----------------------------------------------------------------------------------------------------
                # no additional staging tables needed if the source dataset file format is RCFILE (impala friendly)
                # ----------------------------------------------------------------------------------------------------
                if dataset.data.data_format.file_format == FileFormat.RCFILE:
                    i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, target_dataset,
                               target_table_name, s3_step_path, local_step_path, action_id, None)

                # ----------------------------------------------------------------------------------------------------
                # impala cannot read all hive formats, so we will introduce another staging table
                # ----------------------------------------------------------------------------------------------------
                else:
                    rc_table_name = target_table_name + '_rcfile_stage_for_action_' + action_id
                    rc_dataset = Dataset.from_dict(target_dataset.to_dict())
                    rc_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE, 0)
                    rc_dataset.data.compression = Compression.NONE
                    drop_table_names.append(rc_table_name)

                    i = add_to(step_funcs, i, hive_table_definition_step, rc_table_name, rc_dataset, s3_step_path,
                               local_step_path, action_id, False)
                    i = add_to(step_funcs, i, hive_copy_to_table, stage_dataset, stage_table_name, rc_dataset,
                               rc_table_name, s3_step_path, local_step_path, action_id, None)
                    i = add_to(step_funcs, i, impala_copy_to_table, rc_dataset, rc_table_name, target_dataset,
                               target_table_name, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # at this point, the load should be considered complete even if something goes wrong in the steps below,
        # so we will indicate this in the step wrapper
        # ------------------------------------------------------------------------------------------------------------
        considered_successful_at_this_index = i

        # ------------------------------------------------------------------------------------------------------------
        # drop any staging tables created
        # ------------------------------------------------------------------------------------------------------------
        if drop_table_names:
            script = '\n'.join(['DROP TABLE %s;' % name for name in drop_table_names])
            i = add_to(step_funcs, i, hive_run_script_contents_step, script, s3_step_path, local_step_path, action_id)

        # ------------------------------------------------------------------------------------------------------------
        # inform impala about changes
        # ------------------------------------------------------------------------------------------------------------
        if not target_is_dynamodb:
            i = add_to(step_funcs, i, impala_invalidate_metadata_step, s3_step_path, action_id)

        total_steps = i - 1
        steps = []
        for index, f in enumerate(step_funcs, 1):
            step_wrapper = f(total_steps)
            assert isinstance(step_wrapper, StepWrapper)
            if index >= considered_successful_at_this_index:
                step_wrapper.action_considered_successful = True
            steps.append(step_wrapper)

        if not dry_run:
            s3_copy_recursive(local_step_path, s3_step_path)

        return steps

    finally:
        shutil.rmtree(tempdir)
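# Illustration only (not part of the engine code): a minimal, self-contained sketch of the
# two-pass pattern used above -- each step is captured as a functools.partial holding every
# argument except total_steps, and total_steps is supplied once the full step list is known.
# The step names below are made up for the demo.
import functools


def _deferred_total_steps_demo():
    def make_step(name, step_num, total_steps):
        return '%s (step %s of %s)' % (name, step_num, total_steps)

    def add_to(step_partials, step_num, func, *args):
        # bind everything except total_steps, which is appended when the partial is called
        step_partials.append(functools.partial(func, *(list(args) + [step_num])))
        return step_num + 1

    step_funcs = []
    i = 1
    i = add_to(step_funcs, i, make_step, 'copy to hdfs')
    i = add_to(step_funcs, i, make_step, 'define tables')
    i = add_to(step_funcs, i, make_step, 'load target')

    total_steps = i - 1
    return [f(total_steps) for f in step_funcs]
    # -> ['copy to hdfs (step 1 of 3)', 'define tables (step 2 of 3)', 'load target (step 3 of 3)']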
def setUp(self):
    dart = Dart(host='localhost', port=5000)
    """ :type dart: dart.client.python.dart_client.Dart """
    self.dart = dart

    cs = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
    df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
    dataset_data = DatasetData('test-dataset0', 'test_dataset_table0', 's3://test/dataset/0/%s' + random_id(), df, cs)
    self.dataset0 = self.dart.save_dataset(Dataset(data=dataset_data))

    cs = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
    df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
    dataset1_location = 's3://test/dataset/1/%s' + random_id()
    dataset_data = DatasetData('test-dataset1', 'test_dataset_table1', dataset1_location, df, cs)
    self.dataset1 = self.dart.save_dataset(Dataset(data=dataset_data))

    cs = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
    df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
    dataset_data = DatasetData('test-dataset2-no-show', 'test_dataset_table2', 's3://test/dataset/2/%s' + random_id(),
                               df, cs)
    self.dataset2 = self.dart.save_dataset(Dataset(data=dataset_data))

    s = Subscription(data=SubscriptionData('test-subscription0', self.dataset0.id))
    self.subscription0 = self.dart.save_subscription(s)
    s = Subscription(data=SubscriptionData('test-subscription2-no-show', self.dataset2.id))
    self.subscription2 = self.dart.save_subscription(s)

    dst_args = {'action_sleep_time_in_seconds': 0}
    dst = Datastore(data=DatastoreData('test-datastore0', 'no_op_engine', args=dst_args, state=DatastoreState.TEMPLATE))
    self.datastore0 = self.dart.save_datastore(dst)
    dst = Datastore(data=DatastoreData('test-datastore1', 'no_op_engine', args=dst_args, state=DatastoreState.TEMPLATE))
    self.datastore1 = self.dart.save_datastore(dst)
    dst = Datastore(data=DatastoreData('test-datastore2-no-show', 'no_op_engine', args=dst_args, state=DatastoreState.ACTIVE))
    self.datastore2 = self.dart.save_datastore(dst)

    wf0 = Workflow(data=WorkflowData('test-workflow0', self.datastore0.id, state=WorkflowState.ACTIVE))
    self.workflow0 = self.dart.save_workflow(wf0, self.datastore0.id)
    wf1 = Workflow(data=WorkflowData('test-workflow1', self.datastore1.id, state=WorkflowState.ACTIVE))
    self.workflow1 = self.dart.save_workflow(wf1, self.datastore1.id)
    wf2 = Workflow(data=WorkflowData('test-workflow2-no-show', self.datastore2.id, state=WorkflowState.ACTIVE))
    self.workflow2 = self.dart.save_workflow(wf2, self.datastore2.id)

    a_args = {'source_hdfs_path': 'hdfs:///user/hive/warehouse/test', 'destination_s3_path': dataset1_location}
    a00 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name,
                                 state=ActionState.TEMPLATE))
    a01 = Action(data=ActionData(NoOpActionTypes.consume_subscription.name, NoOpActionTypes.consume_subscription.name,
                                 {'subscription_id': self.subscription0.id}, state=ActionState.TEMPLATE))
    a02 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name,
                                 state=ActionState.TEMPLATE))
    a03 = Action(data=ActionData(NoOpActionTypes.copy_hdfs_to_s3_action.name, NoOpActionTypes.copy_hdfs_to_s3_action.name,
                                 a_args, state=ActionState.TEMPLATE))
    a04 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name,
                                 state=ActionState.TEMPLATE))
    self.action00, self.action01, self.action02, self.action03, self.action04 = \
        self.dart.save_actions([a00, a01, a02, a03, a04], workflow_id=self.workflow0.id)

    a10 = Action(data=ActionData(NoOpActionTypes.load_dataset.name, NoOpActionTypes.load_dataset.name,
                                 {'dataset_id': self.dataset1.id}, state=ActionState.TEMPLATE))
    self.action10 = self.dart.save_actions([a10], workflow_id=self.workflow1.id)

    a20 = Action(data=ActionData(NoOpActionTypes.action_that_succeeds.name, NoOpActionTypes.action_that_succeeds.name,
                                 state=ActionState.HAS_NEVER_RUN))
    a21 = Action(data=ActionData(NoOpActionTypes.load_dataset.name, NoOpActionTypes.load_dataset.name,
                                 {'dataset_id': self.dataset2.id}, state=ActionState.TEMPLATE))
    self.action20 = self.dart.save_actions([a20], datastore_id=self.datastore2.id)
    self.action21 = self.dart.save_actions([a21], workflow_id=self.workflow2.id)

    self.event1 = self.dart.save_event(Event(data=EventData('test-event1', state=EventState.ACTIVE)))
    self.event2 = self.dart.save_event(Event(data=EventData('test-event2-no-show', state=EventState.ACTIVE)))

    tr_args = {'event_id': self.event1.id}
    tr = Trigger(data=TriggerData('test-event-trigger1', 'event', [self.workflow1.id], tr_args, TriggerState.ACTIVE))
    self.event_trigger1 = self.dart.save_trigger(tr)

    tr_args = {'event_id': self.event2.id}
    tr = Trigger(data=TriggerData('test-event-trigger2-no-show', 'event', [self.workflow2.id], tr_args, TriggerState.ACTIVE))
    self.event_trigger2 = self.dart.save_trigger(tr)

    st_args = {'fire_after': 'ALL', 'completed_trigger_ids': [self.event_trigger1.id]}
    st = Trigger(data=TriggerData('test-super-trigger1', 'super', None, st_args, TriggerState.ACTIVE))
    self.super_trigger1 = self.dart.save_trigger(st)

    st_args = {'fire_after': 'ANY', 'completed_trigger_ids': [self.super_trigger1.id]}
    st = Trigger(data=TriggerData('test-super-trigger2', 'super', [self.workflow1.id], st_args, TriggerState.ACTIVE))
    self.super_trigger2 = self.dart.save_trigger(st)
def test_hive_table_definition_step(self):
    ds = Dataset(data=DatasetData(
        name='weblogs_v01',
        table_name='weblogs',
        location='s3://wsm-log-servers/weblogs/www.retailmenot.com/ec2/',
        data_format=DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.REGEX,
            regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
            regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s",
        ),
        columns=[
            Column('ip', 'STRING'),
            Column('user', 'STRING'),
            Column('requestDate', 'TIMESTAMP', date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', 'STRING'),
            Column('urlPath', 'STRING'),
            Column('queryString', 'STRING'),
            Column('httpVersion', 'STRING'),
            Column('statusCode', 'STRING'),
            Column('bytesSent', 'INT'),
            Column('referrer', 'STRING'),
            Column('userAgent', 'STRING'),
            Column('responseTime', 'BIGINT'),
            Column('hostname', 'STRING'),
            Column('userFingerprint', 'STRING'),
            Column('userId', 'STRING'),
            Column('sessionId', 'STRING'),
            Column('requestId', 'STRING'),
            Column('visitorId', 'STRING'),
            Column('vegSlice', 'STRING'),
            Column('fruitSlice', 'STRING'),
            Column('cacheHitMiss', 'STRING'),
        ],
        compression='GZIP',
        partitions=[
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ],
    ))

    call('mkdir -p /tmp/dart-emr-test/hive/')
    this_path = os.path.dirname(os.path.abspath(__file__))
    shutil.copyfile(this_path + '/../../../engine/emr/steps/hive/copy_to_table.hql',
                    '/tmp/dart-emr-test/hive/copy_to_table.hql')

    hive_copy_to_table(ds, 'weblogs_stage', ds, 'weblogs', 's3://test', '/tmp/dart-emr-test/', 'actionid123', None, 1, 1)

    with open(os.path.join(this_path, 'copy_to_table_weblogs.hql')) as f:
        expected_contents = f.read()
    with open('/tmp/dart-emr-test/hive/copy_to_table_weblogs.hql') as f:
        actual_contents = f.read()

    self.assertEqual(expected_contents, actual_contents)
def test_hive_table_definition_step(self):
    ds = Dataset(data=DatasetData(
        name='owen_eu_v01',
        table_name='owen_eu',
        location='s3://s3-rpt-uss-dat-warehouse/prd/inbound/overlord/eu-all-events',
        data_format=DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.JSON,
        ),
        columns=[
            Column('host', 'STRING', path='metadata.host'),
            Column('pageName', 'STRING', path='owen.context.pageName'),
            Column('viewInstanceUuid', 'STRING', path='owen.context.viewInstanceUuid'),
            Column('previousPageName', 'STRING', path='owen.context.previousPageName'),
            Column('previousViewInstanceUuid', 'STRING', path='owen.context.previousViewInstanceUuid'),
            Column('session', 'STRING', path='owen.context.session'),
            Column('pageType', 'STRING', path='owen.context.pageType'),
            Column('propertyName', 'STRING', path='owen.context.propertyName'),
            Column('enviroment', 'STRING', path='owen.context.environment'),
            Column('appForegroundFlag', 'BOOLEAN', path='owen.context.appForegroundFlag'),
            Column('bluetoothEnabledFlag', 'BOOLEAN', path='owen.context.bluetoothEnabledFlag'),
            Column('favoriteFlag', 'BOOLEAN', path='owen.context.favoriteFlag'),
            Column('locationEnabledFlag', 'BOOLEAN', path='owen.context.locationEnabledFlag'),
            Column('loggedInFlag', 'BOOLEAN', path='owen.context.loggedInFlag'),
            Column('notificationEnabledFlag', 'BOOLEAN', path='owen.context.notificationEnabledFlag'),
            Column('personalizationFlag', 'BOOLEAN', path='owen.context.personalizationFlag'),
            Column('advertiserUuid', 'STRING', path='owen.context.advertiserUuid'),
            Column('udid', 'STRING', path='owen.context.udid'),
            Column('userQualifier', 'STRING', path='owen.context.userQualifier'),
            Column('userId', 'STRING', path='owen.context.custom.legacy.userId'),
            Column('userUuid', 'STRING', path='owen.context.userUuid'),
            Column('macAddress', 'STRING', path='owen.context.macAddress'),
            Column('ipAddress', 'STRING', path='owen.context.ipAddress'),
            Column('osVersion', 'STRING', path='owen.context.osVersion'),
            Column('osFamily', 'STRING', path='owen.context.osFamily'),
            Column('osName', 'STRING', path='owen.context.osName'),
            Column('browserFamily', 'STRING', path='owen.context.browserFamily'),
            Column('deviceCategory', 'STRING', path='owen.context.deviceCategory'),
            Column('deviceMake', 'STRING', path='owen.context.mobileDeviceMake'),
            Column('deviceModel', 'STRING', path='owen.context.mobileDeviceModel'),
            Column('connectionType', 'STRING', path='owen.context.connectionType'),
            Column('userAgent', 'STRING', path='owen.context.userAgent'),
            Column('geofenceId', 'STRING', path='owen.context.custom.legacy.geofenceId'),
            Column('eventTimestamp', 'TIMESTAMP', path='owen.event.eventTimestamp', date_pattern="yyyy-MM-dd'T'HH:mm:ssZ"),
            Column('eventInstanceUuid', 'STRING', path='owen.event.eventInstanceUuid'),
            Column('eventPlatformVersion', 'STRING', path='owen.event.eventPlatformVersion'),
            Column('eventVersion', 'STRING', path='owen.event.eventVersion'),
            Column('eventCategory', 'STRING', path='owen.event.eventCategory'),
            Column('eventName', 'STRING', path='owen.event.eventName'),
            Column('eventAction', 'STRING', path='owen.event.eventAction'),
            Column('eventPlatform', 'STRING', path='owen.event.eventPlatform'),
            Column('testUnixTimestampSecondsPattern', 'TIMESTAMP', path='some.fake.path.testUnixTimestampSecondsPattern', date_pattern='UNIX_TIMESTAMP_SECONDS'),
            Column('testUnixTimestampMillisPattern', 'TIMESTAMP', path='some.fake.path.testUnixTimestampMillisPattern', date_pattern='UNIX_TIMESTAMP_MILLIS'),
        ],
        compression='GZIP',
        partitions=[
            Column('year', 'STRING'),
            Column('week', 'STRING'),
            Column('day', 'STRING'),
        ],
    ))

    call('mkdir -p /tmp/dart-emr-test/hive/')
    this_path = os.path.dirname(os.path.abspath(__file__))
    shutil.copyfile(this_path + '/../../../engine/emr/steps/hive/copy_to_table.hql',
                    '/tmp/dart-emr-test/hive/copy_to_table.hql')

    action_id = 'actionid123'
    target_dataset = Dataset.from_dict(ds.to_dict())
    target_dataset.data.data_format.num_header_rows = 0
    target_dataset.data.data_format = DataFormat(FileFormat.RCFILE, RowFormat.NONE)
    stage_dataset = Dataset.from_dict(ds.to_dict())
    assert isinstance(stage_dataset, Dataset)
    for c in stage_dataset.data.columns:
        c.data_type = DataType.STRING

    hive_copy_to_table(stage_dataset, 'owen_eu_stage', target_dataset, 'owen_eu', 's3://test', '/tmp/dart-emr-test/',
                       action_id, None, 1, 1)

    with open(os.path.join(this_path, 'copy_to_table_owen_eu.hql')) as f:
        expected_contents = f.read()
    with open('/tmp/dart-emr-test/hive/copy_to_table_owen_eu.hql') as f:
        actual_contents = f.read()

    self.assertEqual(expected_contents, actual_contents)