def __init__(self, ec2_keyname, instance_profile, service_role, subnet_id, region, core_node_limit,
             impala_docker_repo_base_url, impala_version, cluster_tags, dart_host, dart_port,
             dart_api_version=1):
    """
    Register the EMR action handlers, store the cluster settings and build
    the dart client.

    A RegionInfo is only created when ``region`` is truthy; the connection
    attribute ``_conn`` starts out as None.
    """
    super(EmrEngine, self).__init__()

    # action type name -> handler function
    self._action_handlers = {
        EmrActionTypes.start_datastore.name: start_datastore,
        EmrActionTypes.terminate_datastore.name: terminate_datastore,
        EmrActionTypes.load_dataset.name: load_dataset,
        EmrActionTypes.consume_subscription.name: consume_subscription,
        EmrActionTypes.run_hive_script_action.name: run_hive_script,
        EmrActionTypes.run_impala_script_action.name: run_impala_script,
        EmrActionTypes.run_pyspark_script_action.name: run_pyspark_script,
        EmrActionTypes.copy_hdfs_to_s3_action.name: copy_hdfs_to_s3,
    }

    # no region name means no region descriptor
    if region:
        endpoint = 'elasticmapreduce.%s.amazonaws.com' % region
        self._region = RegionInfo(self, region, endpoint)
    else:
        self._region = None
    self._conn = None

    # cluster settings, kept verbatim from the constructor arguments
    self.ec2_keyname = ec2_keyname
    self.core_node_limit = core_node_limit
    self.instance_profile = instance_profile
    self.service_role = service_role
    self.subnet_id = subnet_id
    self.cluster_tags = cluster_tags
    self.impala_docker_repo_base_url = impala_docker_repo_base_url
    self.impala_version = impala_version

    self.dart = Dart(dart_host, dart_port, dart_api_version)
class S3Engine(object):
    """Engine that checks an action out of dart and dispatches it to an S3 handler."""

    def __init__(self, region, dart_host, dart_port, dart_api_version):
        """Store the region, build the dart client and register the action handlers."""
        self.region = region
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # action type name -> handler function
        self._action_handlers = {
            S3ActionTypes.copy.name: copy,
            S3ActionTypes.data_check.name: data_check,
        }

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler and check the result back in.

        Exceptions raised while dispatching or handling are caught and
        reported through the check-in (state FAILURE plus message and
        traceback) instead of being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info("*** S3Engine.run_action: %s", action_type_name)
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except Exception as e:
            state = ActionResultState.FAILURE
            # %-formatting accepts non-string messages (plain concatenation raised
            # TypeError here and lost the report); the getattr fallback covers
            # exceptions that have no `message` attribute
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
def __init__(self, region, dart_host, dart_port, dart_api_version):
    """Remember the region, build the dart client and register the S3 action handlers."""
    self.region = region
    self.dart = Dart(dart_host, dart_port, dart_api_version)

    # action type name -> handler function
    handlers = {}
    handlers[S3ActionTypes.copy.name] = copy
    handlers[S3ActionTypes.data_check.name] = data_check
    self._action_handlers = handlers
class ElasticsearchEngine(object):
    """Engine that checks an action out of dart and dispatches it to an Elasticsearch handler."""

    def __init__(self, kms_key_arn, secrets_s3_path, dart_host, dart_port, dart_api_version=1):
        """Build the dart client, register the action handlers and create the Secrets helper."""
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # action type name -> handler function
        self._action_handlers = {
            ElasticsearchActionTypes.data_check.name: data_check,
            ElasticsearchActionTypes.create_index.name: create_index,
            ElasticsearchActionTypes.create_template.name: create_template,
            ElasticsearchActionTypes.create_mapping.name: create_mapping,
            ElasticsearchActionTypes.delete_index.name: delete_index,
            ElasticsearchActionTypes.delete_template.name: delete_template,
            ElasticsearchActionTypes.force_merge_index.name: force_merge_index,
        }
        self.secrets = Secrets(kms_key_arn, secrets_s3_path)

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler and check the result back in.

        Exceptions raised while dispatching or handling are caught and
        reported through the check-in (state FAILURE plus message and
        traceback) instead of being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info('**** ElasticsearchEngine.run_action: %s', action_type_name)
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except Exception as e:
            state = ActionResultState.FAILURE
            # %-formatting accepts non-string messages (plain concatenation raised
            # TypeError here and lost the report); the getattr fallback covers
            # exceptions that have no `message` attribute
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
class NoOpEngine(object):
    """Engine that sleeps and then reports success or failure based on the action type."""

    def __init__(self, region, dart_host='localhost', dart_port=5000, dart_api_version=1):
        """Store the region and build the dart client."""
        self.region = region
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, sleep for the datastore's 'action_sleep_time_in_seconds'
        and check the result back in.

        The ``action_that_fails`` action type is reported as FAILURE; the
        ``consume_subscription`` action type fetches and counts the
        subscription elements. Exceptions are caught and reported through
        the check-in instead of being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        error_message = None
        try:
            sleep_seconds = datastore.data.args['action_sleep_time_in_seconds']
            # logging args are passed lazily, matching the other engines
            _logger.info('sleeping for %s seconds...', sleep_seconds)
            time.sleep(sleep_seconds)

            if action.data.action_type_name == NoOpActionTypes.action_that_fails.name:
                state = ActionResultState.FAILURE
                error_message = '%s failed as expected' % NoOpActionTypes.action_that_fails.name

            if action.data.action_type_name == NoOpActionTypes.consume_subscription.name:
                subscription_elements = self.dart.get_subscription_elements(action.id)
                # list() materializes the elements so they can be counted
                _logger.info('consuming subscription, size = %s', len(list(subscription_elements)))
        except Exception as e:
            state = ActionResultState.FAILURE
            # %-formatting accepts non-string messages (plain concatenation raised
            # TypeError here and lost the report); the getattr fallback covers
            # exceptions that have no `message` attribute
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
class S3Engine(ActionRunner):
    """Engine that checks an action out of dart and dispatches it to an S3 handler."""

    def __init__(self, region, dart_host, dart_port, dart_api_version):
        """Store the region, build the dart client and register the action handlers."""
        super(S3Engine, self).__init__()
        self.region = region
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # action type name -> handler function
        self._action_handlers = {
            S3ActionTypes.copy.name: copy,
            S3ActionTypes.data_check.name: data_check,
        }

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler, check the result back in and publish
        the outcome via ``publish_sns_message``.

        Exceptions raised while dispatching or handling are caught and
        reported (state FAILURE plus message and traceback) instead of
        being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info("*** S3Engine.run_action: %s", action_type_name)
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except Exception as e:
            state = ActionResultState.FAILURE
            error_message = '{m}\r\r\r{t}'.format(
                # fall back to the exception itself when it has no `message` attribute
                m=str(getattr(e, 'message', e)),
                t=traceback.format_exc(),
            )
        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
            self.publish_sns_message(action, error_message, state)
def __init__(self, emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region,
             emr_core_node_limit, emr_impala_docker_repo_base_url, emr_impala_version,
             emr_cluster_tags, emr_cluster_availability_zone, dart_host, dart_port,
             dart_api_version=1, emr_release_label='emr-4.2.0', emr_instance_type='m3.2xlarge'):
    """
    Register the DynamoDB action handlers and create the helper objects.

    The ``emr_*`` cluster arguments are forwarded positionally to an
    embedded EmrEngine; the release label and instance type are kept on
    this object.
    """
    self.emr_release_label = emr_release_label
    self.emr_instance_type = emr_instance_type

    # action type name -> handler function
    self._action_handlers = {
        DynamoDBActionTypes.create_table.name: create_table,
        DynamoDBActionTypes.delete_table.name: delete_table,
        DynamoDBActionTypes.load_dataset.name: load_dataset,
    }

    # positional order must match the EmrEngine constructor
    emr_engine_args = (
        emr_ec2_keyname,
        emr_instance_profile,
        emr_service_role,
        emr_region,
        emr_core_node_limit,
        emr_impala_docker_repo_base_url,
        emr_impala_version,
        emr_cluster_tags,
        emr_cluster_availability_zone,
        dart_host,
        dart_port,
        dart_api_version,
    )
    self.emr_engine = EmrEngine(*emr_engine_args)
    self.dart = Dart(dart_host, dart_port, dart_api_version)
def __init__(self, region, dart_host='localhost', dart_port=5000, dart_api_version=1):
    """Keep the region and create the dart client for the given host, port and API version."""
    self.region = region
    client = Dart(dart_host, dart_port, dart_api_version)
    self.dart = client
class S3Engine(object):
    """Engine that checks an action out of dart and runs the matching S3 handler."""

    def __init__(self, dart_host, dart_port, dart_api_version):
        """Build the dart client and register the action handlers."""
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # action type name -> handler function
        self._action_handlers = {
            S3ActionTypes.copy.name: s3_copy
        }

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, call its handler with the action's args as keyword
        arguments and check the result back in.

        Exceptions raised while dispatching or handling are caught and
        reported through the check-in (state FAILURE plus message and
        traceback) instead of being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info("*** S3Engine.run_action: %s", action_type_name)
            # The "unsupported" text must not be stored in `error_message` up front,
            # otherwise a successful run is checked in with that bogus message.
            # Explicit raise rather than `assert`, which is stripped under `python -O`.
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(**action.data.args)
        except Exception as e:
            state = ActionResultState.FAILURE
            # %-formatting accepts non-string messages (plain concatenation raised
            # TypeError here and lost the report); the getattr fallback covers
            # exceptions that have no `message` attribute
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
def __init__(self, kms_key_arn, secrets_s3_path, vpc_subnet, security_group_ids, region,
             availability_zones, publicly_accessible, cluster_tags, dart_host, dart_port,
             dart_api_version=1):
    """
    Build the dart client, register the Redshift action handlers, store
    the cluster settings and create the Secrets helper.
    """
    self.dart = Dart(dart_host, dart_port, dart_api_version)

    # (action type, handler) pairs, turned into a name -> handler mapping below
    handler_pairs = [
        (RedshiftActionTypes.start_datastore, start_datastore),
        (RedshiftActionTypes.stop_datastore, stop_datastore),
        (RedshiftActionTypes.execute_sql, execute_sql),
        (RedshiftActionTypes.load_dataset, load_dataset),
        (RedshiftActionTypes.consume_subscription, consume_subscription),
        (RedshiftActionTypes.copy_to_s3, copy_to_s3),
        (RedshiftActionTypes.create_snapshot, create_snapshot),
        (RedshiftActionTypes.data_check, data_check),
        (RedshiftActionTypes.cluster_maintenance, cluster_maintenance),
    ]
    self._action_handlers = dict(
        (action_type.name, handler) for action_type, handler in handler_pairs
    )

    # cluster settings, kept verbatim from the constructor arguments
    self.vpc_subnet = vpc_subnet
    self.availability_zones = availability_zones
    self.publicly_accessible = publicly_accessible
    self.security_group_ids = security_group_ids
    self.cluster_tags = cluster_tags
    self.region = region

    self.secrets = Secrets(kms_key_arn, secrets_s3_path)
def setUp(self):
    """Save a template no-op datastore and an active workflow on it via the local dart client."""
    self.dart = Dart(host='localhost', port=5000)

    datastore_data = DatastoreData(
        'test-datastore',
        'no_op_engine',
        args={'action_sleep_time_in_seconds': 0},
        state=DatastoreState.TEMPLATE,
    )
    self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

    workflow_data = WorkflowData('test-workflow', self.datastore.id, state=WorkflowState.ACTIVE)
    self.workflow = self.dart.save_workflow(Workflow(data=workflow_data), self.datastore.id)
class TestWorkflowCrud(unittest.TestCase):
    """Create / read / update / delete round trip for workflows through the dart client."""

    def setUp(self):
        """Save an active no-op datastore for the workflow to live on."""
        self.dart = Dart(host='localhost', port=5000)
        datastore_data = DatastoreData(
            'test-datastore',
            'no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.ACTIVE,
        )
        self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

    def tearDown(self):
        """Delete the datastore created in setUp."""
        self.dart.delete_datastore(self.datastore.id)

    def test_crud(self):
        # create
        new_wf = Workflow(data=WorkflowData('test-workflow', self.datastore.id, engine_name='no_op_engine'))
        created = self.dart.save_workflow(new_wf, self.datastore.id)
        self.assertEqual(created.data.to_dict(), new_wf.data.to_dict())

        # read
        fetched = self.dart.get_workflow(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # update
        fetched.data.concurrency = 2
        fetched.data.state = WorkflowState.ACTIVE
        updated = self.dart.save_workflow(fetched)
        # not all properties can be modified
        self.assertEqual(updated.data.concurrency, 1)
        self.assertEqual(updated.data.state, WorkflowState.ACTIVE)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # delete: a later lookup has to answer 404
        self.dart.delete_workflow(fetched.id)
        try:
            self.dart.get_workflow(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('workflow should have been missing after delete!')
class EmrEngine(object):
    """Engine that checks an action out of dart and dispatches it to an EMR handler."""

    def __init__(self, ec2_keyname, instance_profile, service_role, region, core_node_limit,
                 impala_docker_repo_base_url, impala_version, cluster_tags, cluster_availability_zone,
                 dart_host, dart_port, dart_api_version=1):
        """
        Register the EMR action handlers, store the cluster settings and
        build the dart client.

        A RegionInfo is only created when ``region`` is truthy; the EMR
        connection is created lazily by the ``conn`` property.
        """
        # action type name -> handler function
        self._action_handlers = {
            EmrActionTypes.start_datastore.name: start_datastore,
            EmrActionTypes.terminate_datastore.name: terminate_datastore,
            EmrActionTypes.load_dataset.name: load_dataset,
            EmrActionTypes.consume_subscription.name: consume_subscription,
            EmrActionTypes.run_hive_script_action.name: run_hive_script,
            EmrActionTypes.run_impala_script_action.name: run_impala_script,
            EmrActionTypes.run_pyspark_script_action.name: run_pyspark_script,
            EmrActionTypes.copy_hdfs_to_s3_action.name: copy_hdfs_to_s3,
        }
        self._region = RegionInfo(self, region, 'elasticmapreduce.%s.amazonaws.com' % region) if region else None
        self._conn = None
        self.ec2_keyname = ec2_keyname
        self.core_node_limit = core_node_limit
        self.instance_profile = instance_profile
        self.service_role = service_role
        self.cluster_tags = cluster_tags
        self.cluster_availability_zone = cluster_availability_zone
        self.impala_docker_repo_base_url = impala_docker_repo_base_url
        self.impala_version = impala_version
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    @property
    def conn(self):
        """Return the EMR connection, creating and caching it on first access."""
        if self._conn:
            return self._conn
        self._conn = EmrConnection(region=self._region)
        return self._conn

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler and check the result back in.

        Exceptions raised while dispatching or handling are caught and
        reported through the check-in (state FAILURE plus message and
        traceback). An ActionFailedButConsumeSuccessfulException is also
        reported as FAILURE, but with the consume-subscription state set
        to SUCCESS.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        consume_subscription_state = None
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except ActionFailedButConsumeSuccessfulException as e:
            state = ActionResultState.FAILURE
            consume_subscription_state = ConsumeSubscriptionResultState.SUCCESS
            # %-formatting accepts non-string messages; the getattr fallback covers
            # exceptions that have no `message` attribute
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        except Exception as e:
            state = ActionResultState.FAILURE
            # see above: plain concatenation raised TypeError for non-string messages
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        finally:
            self.dart.engine_action_checkin(
                action.id, ActionResult(state, error_message, consume_subscription_state))
def setUp(self):
    """Save an active no-op datastore through the local dart client."""
    self.dart = Dart(host='localhost', port=5000)
    datastore_data = DatastoreData(
        name='test-datastore',
        engine_name='no_op_engine',
        args={'action_sleep_time_in_seconds': 0},
        state=DatastoreState.ACTIVE,
    )
    self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))
def __init__(self, region, dart_host='localhost', dart_port=5000, dart_api_version=1):
    """Initialize the base class, keep the region and create the dart client."""
    super(NoOpEngine, self).__init__()
    self.region = region
    client = Dart(dart_host, dart_port, dart_api_version)
    self.dart = client
class EmrEngine(object):
    """Engine that checks an action out of dart and dispatches it to an EMR handler."""

    def __init__(self, ec2_keyname, instance_profile, service_role, subnet_id, region, core_node_limit,
                 impala_docker_repo_base_url, impala_version, cluster_tags, dart_host, dart_port,
                 dart_api_version=1):
        """
        Register the EMR action handlers, store the cluster settings and
        build the dart client.

        A RegionInfo is only created when ``region`` is truthy; the EMR
        connection is created lazily by the ``conn`` property.
        """
        # action type name -> handler function
        self._action_handlers = {
            EmrActionTypes.start_datastore.name: start_datastore,
            EmrActionTypes.terminate_datastore.name: terminate_datastore,
            EmrActionTypes.load_dataset.name: load_dataset,
            EmrActionTypes.consume_subscription.name: consume_subscription,
            EmrActionTypes.run_hive_script_action.name: run_hive_script,
            EmrActionTypes.run_impala_script_action.name: run_impala_script,
            EmrActionTypes.run_pyspark_script_action.name: run_pyspark_script,
            EmrActionTypes.copy_hdfs_to_s3_action.name: copy_hdfs_to_s3,
        }
        self._region = RegionInfo(self, region, 'elasticmapreduce.%s.amazonaws.com' % region) if region else None
        self._conn = None
        self.ec2_keyname = ec2_keyname
        self.core_node_limit = core_node_limit
        self.instance_profile = instance_profile
        self.service_role = service_role
        self.subnet_id = subnet_id
        self.cluster_tags = cluster_tags
        self.impala_docker_repo_base_url = impala_docker_repo_base_url
        self.impala_version = impala_version
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    @property
    def conn(self):
        """Return the EMR connection, creating and caching it on first access."""
        if self._conn:
            return self._conn
        self._conn = EmrConnection(region=self._region)
        return self._conn

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler and check the result back in.

        Exceptions raised while dispatching or handling are caught and
        reported through the check-in (state FAILURE plus message and
        traceback). An ActionFailedButConsumeSuccessfulException is also
        reported as FAILURE, but with the consume-subscription state set
        to SUCCESS.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        consume_subscription_state = None
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except ActionFailedButConsumeSuccessfulException as e:
            state = ActionResultState.FAILURE
            consume_subscription_state = ConsumeSubscriptionResultState.SUCCESS
            # %-formatting accepts non-string messages; the getattr fallback covers
            # exceptions that have no `message` attribute
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        except Exception as e:
            state = ActionResultState.FAILURE
            # see above: plain concatenation raised TypeError for non-string messages
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        finally:
            self.dart.engine_action_checkin(
                action.id, ActionResult(state, error_message, consume_subscription_state))
class DynamoDBEngine(object):
    """Engine that checks an action out of dart and dispatches it to a DynamoDB handler."""

    def __init__(self, emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region,
                 emr_core_node_limit, emr_impala_docker_repo_base_url, emr_impala_version,
                 emr_cluster_tags, emr_cluster_availability_zone, dart_host, dart_port,
                 dart_api_version=1, emr_release_label='emr-4.2.0', emr_instance_type='m3.2xlarge'):
        """
        Register the DynamoDB action handlers, create an embedded EmrEngine
        from the ``emr_*`` arguments and build the dart client.
        """
        self.emr_release_label = emr_release_label
        self.emr_instance_type = emr_instance_type
        # action type name -> handler function
        self._action_handlers = {
            DynamoDBActionTypes.create_table.name: create_table,
            DynamoDBActionTypes.delete_table.name: delete_table,
            DynamoDBActionTypes.load_dataset.name: load_dataset,
        }
        self.emr_engine = EmrEngine(
            emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region, emr_core_node_limit,
            emr_impala_docker_repo_base_url, emr_impala_version, emr_cluster_tags,
            emr_cluster_availability_zone, dart_host, dart_port, dart_api_version)
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler and check the result back in.

        Exceptions raised while dispatching or handling are caught and
        reported through the check-in (state FAILURE plus message and
        traceback) instead of being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        # never reassigned in this method; passed through to the check-in as None
        consume_subscription_state = None
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except Exception as e:
            state = ActionResultState.FAILURE
            # %-formatting accepts non-string messages (plain concatenation raised
            # TypeError here and lost the report); the getattr fallback covers
            # exceptions that have no `message` attribute
            error_message = '%s\n\n\n%s' % (getattr(e, 'message', e), traceback.format_exc())
        finally:
            self.dart.engine_action_checkin(
                action.id, ActionResult(state, error_message, consume_subscription_state))
def add_s3_engine(config):
    """
    Save the 's3_engine' engine definition through the dart client.

    The client is built from the engine's own options in ``config``. If an
    engine named 's3_engine' is already listed, its id is passed along with
    the saved definition. An ECS task definition is only built when
    config['dart']['use_local_engines'] is falsy.
    """
    engine_config = config['engines']['s3_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving s3 engine')

    # the loop does not stop early, so the last matching engine wins
    engine_id = None
    for engine in dart.get_engines():
        if engine.data.name == 's3_engine':
            engine_id = engine.id

    if config['dart']['use_local_engines']:
        ecs_task_definition = None
    else:
        ecs_task_definition = {
            'family': 'dart-%s-s3_engine' % config['dart']['env_name'],
            'containerDefinitions': [
                {
                    'name': 'dart-s3_engine',
                    'cpu': 64,
                    'memory': 256,
                    'image': engine_config['docker_image'],
                    'logConfiguration': {'logDriver': 'syslog'},
                    'environment': [
                        {'name': 'DART_ROLE', 'value': 'worker:engine_s3'},
                        {'name': 'DART_CONFIG', 'value': engine_config['config']},
                        {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']}
                    ],
                    'mountPoints': [
                        {
                            'containerPath': '/mnt/ecs_agent_data',
                            'sourceVolume': 'ecs-agent-data',
                            'readOnly': True
                        }
                    ],
                }
            ],
            'volumes': [
                {
                    'host': {'sourcePath': '/var/lib/ecs/data'},
                    'name': 'ecs-agent-data'
                }
            ],
        }

    engine_data = EngineData(
        name='s3_engine',
        description='For S3 operations',
        options_json_schema={},
        supported_action_types=[
            S3ActionTypes.copy,
            S3ActionTypes.data_check,
        ],
        ecs_task_definition=ecs_task_definition
    )
    saved_engine = dart.save_engine(Engine(id=engine_id, data=engine_data))
    _logger.info('Saved s3_engine: %s' % saved_engine.id)
def add_s3_engine(config):
    """
    Save the 's3_engine' engine definition through the dart client.

    The client is built from the engine's own options in ``config``. If an
    engine named 's3_engine' is already listed, its id is passed along with
    the saved definition. An ECS task definition is only built when
    config['dart']['use_local_engines'] is falsy.
    """
    engine_config = config['engines']['s3_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving s3 engine')

    # the loop does not stop early, so the last matching engine wins
    engine_id = None
    for engine in dart.get_engines():
        if engine.data.name == 's3_engine':
            engine_id = engine.id

    if config['dart']['use_local_engines']:
        ecs_task_definition = None
    else:
        ecs_task_definition = {
            'family': 'dart-%s-s3_engine' % config['dart']['env_name'],
            'containerDefinitions': [
                {
                    'name': 'dart-s3_engine',
                    'cpu': 64,
                    'memory': 256,
                    'image': engine_config['docker_image'],
                    'logConfiguration': {'logDriver': 'syslog'},
                    'environment': [
                        {'name': 'DART_ROLE', 'value': 'worker:engine_s3'},
                        {'name': 'DART_CONFIG', 'value': engine_config['config']},
                        {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']}
                    ],
                    'mountPoints': [
                        {
                            'containerPath': '/mnt/ecs_agent_data',
                            'sourceVolume': 'ecs-agent-data',
                            'readOnly': True
                        }
                    ],
                }
            ],
            'volumes': [
                {
                    'host': {'sourcePath': '/var/lib/ecs/data'},
                    'name': 'ecs-agent-data'
                }
            ],
        }

    engine_data = EngineData(
        name='s3_engine',
        description='For S3 FileCopy',
        options_json_schema={},
        supported_action_types=[
            S3ActionTypes.copy
        ],
        ecs_task_definition=ecs_task_definition
    )
    saved_engine = dart.save_engine(engine=Engine(id=engine_id, data=engine_data))
    _logger.info('Saved s3_engine: %s' % saved_engine.id)
class RedshiftEngine(ActionRunner):
    """Engine that checks an action out of dart and dispatches it to a Redshift handler."""

    def __init__(self, kms_key_arn, secrets_s3_path, vpc_subnet, security_group_ids, region,
                 availability_zones, publicly_accessible, cluster_tags, dart_host, dart_port,
                 dart_api_version=1):
        """
        Build the dart client, register the Redshift action handlers, store
        the cluster settings and create the Secrets helper.
        """
        super(RedshiftEngine, self).__init__()
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # action type name -> handler function
        self._action_handlers = {
            RedshiftActionTypes.start_datastore.name: start_datastore,
            RedshiftActionTypes.stop_datastore.name: stop_datastore,
            RedshiftActionTypes.execute_sql.name: execute_sql,
            RedshiftActionTypes.load_dataset.name: load_dataset,
            RedshiftActionTypes.consume_subscription.name: consume_subscription,
            RedshiftActionTypes.copy_to_s3.name: copy_to_s3,
            RedshiftActionTypes.create_snapshot.name: create_snapshot,
            RedshiftActionTypes.data_check.name: data_check,
            RedshiftActionTypes.cluster_maintenance.name: cluster_maintenance,
        }
        self.vpc_subnet = vpc_subnet
        self.availability_zones = availability_zones
        self.publicly_accessible = publicly_accessible
        self.security_group_ids = security_group_ids
        self.cluster_tags = cluster_tags
        self.region = region
        self.secrets = Secrets(kms_key_arn, secrets_s3_path)

    def random_availability_zone(self):
        """Return one entry of ``availability_zones`` picked at a random index."""
        return self.availability_zones[random.randint(0, len(self.availability_zones) - 1)]

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler, check the result back in and publish
        the outcome via ``publish_sns_message``.

        Exceptions raised while dispatching or handling are caught and
        reported (state FAILURE plus message and traceback) instead of
        being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info("**** RedshiftEngine.run_action: %s", action_type_name)
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except Exception as e:
            state = ActionResultState.FAILURE
            error_message = '{m}\r\r\r{t}'.format(
                # fall back to the exception itself when it has no `message` attribute
                m=str(getattr(e, 'message', e)),
                t=traceback.format_exc(),
            )
        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
            self.publish_sns_message(action, error_message, state)
def setUp(self):
    """Save a template no-op datastore and a workflow on it, and raise the diff size limit."""
    self.dart = Dart(host='localhost', port=5000)

    datastore_data = DatastoreData(
        name='test-datastore',
        engine_name='no_op_engine',
        args={'action_sleep_time_in_seconds': 0},
        state=DatastoreState.TEMPLATE,
    )
    self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

    workflow_data = WorkflowData(name='test-workflow', datastore_id=self.datastore.id)
    self.workflow = self.dart.save_workflow(
        workflow=Workflow(data=workflow_data),
        datastore_id=self.datastore.id,
    )

    self.maxDiff = 99999
class ElasticsearchEngine(ActionRunner):
    """Engine that checks an action out of dart and dispatches it to an Elasticsearch handler."""

    def __init__(self, kms_key_arn, secrets_s3_path, dart_host, dart_port, dart_api_version=1, **kwargs):
        """
        Build the dart client, register the action handlers and create the
        Secrets helper. Extra keyword arguments are accepted and ignored.
        """
        super(ElasticsearchEngine, self).__init__()
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # action type name -> handler function
        self._action_handlers = {
            ElasticsearchActionTypes.data_check.name: data_check,
            ElasticsearchActionTypes.create_index.name: create_index,
            ElasticsearchActionTypes.create_template.name: create_template,
            ElasticsearchActionTypes.create_mapping.name: create_mapping,
            ElasticsearchActionTypes.delete_index.name: delete_index,
            ElasticsearchActionTypes.delete_template.name: delete_template,
            ElasticsearchActionTypes.force_merge_index.name: force_merge_index,
        }
        self.secrets = Secrets(kms_key_arn, secrets_s3_path)

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler, check the result back in and publish
        the outcome via ``publish_sns_message``.

        Exceptions raised while dispatching or handling are caught and
        reported (state FAILURE plus message and traceback) instead of
        being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info('**** ElasticsearchEngine.run_action: %s', action_type_name)
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except Exception as e:
            state = ActionResultState.FAILURE
            error_message = '{m}\r\r\r{t}'.format(
                # fall back to the exception itself when it has no `message` attribute
                m=str(getattr(e, 'message', e)),
                t=traceback.format_exc(),
            )
        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
            self.publish_sns_message(action, error_message, state)
def __init__(self, ec2_keyname, instance_profile, service_role, region, core_node_limit,
             impala_docker_repo_base_url, impala_version, cluster_tags, cluster_availability_zone,
             dart_host, dart_port, dart_api_version=1):
    """
    Register the EMR action handlers, store the cluster settings and build
    the dart client.

    A RegionInfo is only created when ``region`` is truthy; the connection
    attribute ``_conn`` starts out as None.
    """
    # action type name -> handler function
    self._action_handlers = {
        EmrActionTypes.start_datastore.name: start_datastore,
        EmrActionTypes.terminate_datastore.name: terminate_datastore,
        EmrActionTypes.load_dataset.name: load_dataset,
        EmrActionTypes.consume_subscription.name: consume_subscription,
        EmrActionTypes.run_hive_script_action.name: run_hive_script,
        EmrActionTypes.run_impala_script_action.name: run_impala_script,
        EmrActionTypes.run_pyspark_script_action.name: run_pyspark_script,
        EmrActionTypes.copy_hdfs_to_s3_action.name: copy_hdfs_to_s3,
    }

    # no region name means no region descriptor
    if region:
        endpoint = 'elasticmapreduce.%s.amazonaws.com' % region
        self._region = RegionInfo(self, region, endpoint)
    else:
        self._region = None
    self._conn = None

    # cluster settings, kept verbatim from the constructor arguments
    self.ec2_keyname = ec2_keyname
    self.core_node_limit = core_node_limit
    self.instance_profile = instance_profile
    self.service_role = service_role
    self.cluster_tags = cluster_tags
    self.cluster_availability_zone = cluster_availability_zone
    self.impala_docker_repo_base_url = impala_docker_repo_base_url
    self.impala_version = impala_version

    self.dart = Dart(dart_host, dart_port, dart_api_version)
class DynamoDBEngine(ActionRunner):
    """Engine that checks an action out of dart and dispatches it to a DynamoDB handler."""

    def __init__(self, emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region,
                 emr_core_node_limit, emr_impala_docker_repo_base_url, emr_impala_version,
                 emr_cluster_tags, emr_cluster_availability_zone, dart_host, dart_port,
                 dart_api_version=1, emr_release_label='emr-4.2.0', emr_instance_type='m3.2xlarge'):
        """
        Register the DynamoDB action handlers, create an embedded EmrEngine
        from the ``emr_*`` arguments and build the dart client.
        """
        super(DynamoDBEngine, self).__init__()
        self.emr_release_label = emr_release_label
        self.emr_instance_type = emr_instance_type
        # action type name -> handler function
        self._action_handlers = {
            DynamoDBActionTypes.create_table.name: create_table,
            DynamoDBActionTypes.delete_table.name: delete_table,
            DynamoDBActionTypes.load_dataset.name: load_dataset,
        }
        self.emr_engine = EmrEngine(
            emr_ec2_keyname,
            emr_instance_profile,
            emr_service_role,
            emr_region,
            emr_core_node_limit,
            emr_impala_docker_repo_base_url,
            emr_impala_version,
            emr_cluster_tags,
            emr_cluster_availability_zone,
            dart_host,
            dart_port,
            dart_api_version
        )
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, run its handler, check the result back in and publish
        the outcome via ``publish_sns_message``.

        Exceptions raised while dispatching or handling are caught and
        reported (state FAILURE plus message and traceback) instead of
        being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        # never reassigned in this method; passed through to the check-in as None
        consume_subscription_state = None
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            # explicit raise rather than `assert`, which is stripped under `python -O`
            if action_type_name not in self._action_handlers:
                raise AssertionError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)
        except Exception as e:
            state = ActionResultState.FAILURE
            error_message = '{m}\r\r\r{t}'.format(
                # fall back to the exception itself when it has no `message` attribute
                m=str(getattr(e, 'message', e)),
                t=traceback.format_exc(),
            )
        finally:
            self.dart.engine_action_checkin(
                action.id, ActionResult(state, error_message, consume_subscription_state))
            self.publish_sns_message(action, error_message, state)
class TestDatasetCrud(unittest.TestCase):
    """Create / read / update / delete round trip for datasets through the dart client."""

    def setUp(self):
        """Create the dart client pointed at localhost:5000."""
        self.dart = Dart(host='localhost', port=5000)

    def test_crud(self):
        # create
        columns = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
        data_format = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        dataset_data = DatasetData(
            name=NoOpActionTypes.action_that_succeeds.name,
            table_name=NoOpActionTypes.action_that_succeeds.name,
            load_type=LoadType.INSERT,
            location='s3://bucket/prefix',
            data_format=data_format,
            columns=columns,
            tags=['foo'],
        )
        new_dataset = Dataset(data=dataset_data)
        new_dataset.data.user_id = '*****@*****.**'
        created = self.dart.save_dataset(new_dataset)
        self.assertEqual(created.data.to_dict(), new_dataset.data.to_dict())

        # read
        fetched = self.dart.get_dataset(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # update
        fetched.data.compression = Compression.GZIP
        updated = self.dart.save_dataset(fetched)
        self.assertEqual(updated.data.compression, Compression.GZIP)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # delete: a later lookup has to answer 404
        self.dart.delete_dataset(fetched.id)
        try:
            self.dart.get_dataset(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('dataset should have been missing after delete!')
def __init__(self, kms_key_arn, secrets_s3_path, dart_host, dart_port, dart_api_version=1):
    """Build the dart client, register the Elasticsearch action handlers and create the Secrets helper."""
    self.dart = Dart(dart_host, dart_port, dart_api_version)

    # (action type, handler) pairs, turned into a name -> handler mapping below
    handler_pairs = [
        (ElasticsearchActionTypes.data_check, data_check),
        (ElasticsearchActionTypes.create_index, create_index),
        (ElasticsearchActionTypes.create_template, create_template),
        (ElasticsearchActionTypes.create_mapping, create_mapping),
        (ElasticsearchActionTypes.delete_index, delete_index),
        (ElasticsearchActionTypes.delete_template, delete_template),
        (ElasticsearchActionTypes.force_merge_index, force_merge_index),
    ]
    self._action_handlers = dict(
        (action_type.name, handler) for action_type, handler in handler_pairs
    )

    self.secrets = Secrets(kms_key_arn, secrets_s3_path)
class NoOpEngine(ActionRunner):
    """Engine that sleeps and then reports success or failure based on the action type."""

    def __init__(self, region, dart_host='localhost', dart_port=5000, dart_api_version=1):
        """Initialize the base class, store the region and build the dart client."""
        super(NoOpEngine, self).__init__()
        self.region = region
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    def run(self):
        """
        Check out the action identified by the DART_ACTION_ID environment
        variable, sleep for the datastore's 'action_sleep_time_in_seconds',
        check the result back in and publish the outcome via
        ``publish_sns_message``.

        The ``action_that_fails`` action type is reported as FAILURE; the
        ``consume_subscription`` action type fetches and counts the
        subscription elements. Exceptions are caught and reported instead
        of being propagated.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore
        state = ActionResultState.SUCCESS
        error_message = None
        try:
            sleep_seconds = datastore.data.args['action_sleep_time_in_seconds']
            # logging args are passed lazily, matching the other engines
            _logger.info('sleeping for %s seconds...', sleep_seconds)
            time.sleep(sleep_seconds)

            if action.data.action_type_name == NoOpActionTypes.action_that_fails.name:
                state = ActionResultState.FAILURE
                error_message = '%s failed as expected' % NoOpActionTypes.action_that_fails.name

            if action.data.action_type_name == NoOpActionTypes.consume_subscription.name:
                subscription_elements = self.dart.get_subscription_elements(action.id)
                # list() materializes the elements so they can be counted
                _logger.info('consuming subscription, size = %s', len(list(subscription_elements)))
        except Exception as e:
            state = ActionResultState.FAILURE
            error_message = '{m}\r\r\r{t}'.format(
                # fall back to the exception itself when it has no `message` attribute
                m=str(getattr(e, 'message', e)),
                t=traceback.format_exc(),
            )
        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
            self.publish_sns_message(action, error_message, state)
def setUp(self):
    """Save two datastore/workflow pairs with two actions each, plus a completion trigger and a super trigger."""
    dart = Dart(host='localhost', port=5000)
    """ :type dart: dart.client.python.dart_client.Dart """
    self.dart = dart

    sleep_args = {'action_sleep_time_in_seconds': 0}

    def save_template_datastore(name):
        data = DatastoreData(name, 'no_op_engine', args=sleep_args, state=DatastoreState.TEMPLATE)
        return self.dart.save_datastore(Datastore(data=data))

    def save_active_workflow(name, datastore):
        data = WorkflowData(name, datastore.id, state=WorkflowState.ACTIVE)
        return self.dart.save_workflow(Workflow(data=data), datastore.id)

    def succeeding_template_action():
        type_name = NoOpActionTypes.action_that_succeeds.name
        return Action(data=ActionData(type_name, type_name, state=ActionState.TEMPLATE))

    self.datastore0 = save_template_datastore('test-datastore0')
    self.datastore1 = save_template_datastore('test-datastore1')

    self.workflow0 = save_active_workflow('test-workflow0', self.datastore0)
    self.workflow1 = save_active_workflow('test-workflow1', self.datastore1)

    self.action00, self.action01 = self.dart.save_actions(
        [succeeding_template_action(), succeeding_template_action()],
        workflow_id=self.workflow0.id)
    self.action10, self.action11 = self.dart.save_actions(
        [succeeding_template_action(), succeeding_template_action()],
        workflow_id=self.workflow1.id)

    # trigger keyed on completion of workflow0
    completion_args = {'completed_workflow_id': self.workflow0.id}
    completion = Trigger(data=TriggerData(
        'test-trigger', 'workflow_completion', None, completion_args, TriggerState.ACTIVE))
    self.trigger = self.dart.save_trigger(completion)

    # super trigger over the trigger above, targeting workflow1
    super_args = {
        'fire_after': 'ALL',
        'completed_trigger_ids': [self.trigger.id],
    }
    super_trigger = Trigger(data=TriggerData(
        'test-super-trigger', 'super', [self.workflow1.id], super_args, TriggerState.ACTIVE))
    self.super_trigger = self.dart.save_trigger(super_trigger)
def setUp(self):
    """Connect to dart on localhost:5000 and save one ACTIVE no_op_engine datastore."""
    dart = Dart(host='localhost', port=5000)
    """ :type dart: dart.client.python.dart_client.Dart """
    self.dart = dart

    datastore_data = DatastoreData(
        'test-datastore',
        'no_op_engine',
        args={'action_sleep_time_in_seconds': 0},
        state=DatastoreState.ACTIVE,
    )
    self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))
class TestWorkflowCrud(unittest.TestCase):
    """Create, read, update and delete a workflow attached to a freshly saved datastore."""

    def setUp(self):
        self.dart = Dart(host='localhost', port=5000)
        datastore_data = DatastoreData(
            name='test-datastore',
            engine_name='no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.ACTIVE,
        )
        self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

    def tearDown(self):
        self.dart.delete_datastore(self.datastore.id)

    def test_crud(self):
        """Walk one workflow through save, get, modify and delete, then expect a 404 on re-read."""
        local_workflow = Workflow(data=WorkflowData(
            name='test-workflow',
            datastore_id=self.datastore.id,
            engine_name='no_op_engine',
        ))

        # create
        created = self.dart.save_workflow(local_workflow, self.datastore.id)
        self.assertEqual(created.data.to_dict(), local_workflow.data.to_dict())

        # read
        fetched = self.dart.get_workflow(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # update
        fetched.data.concurrency = 2
        fetched.data.state = WorkflowState.ACTIVE
        updated = self.dart.save_workflow(fetched)
        self.assertEqual(updated.data.concurrency, 2)
        self.assertEqual(updated.data.state, WorkflowState.ACTIVE)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # delete
        self.dart.delete_workflow(fetched.id)
        try:
            self.dart.get_workflow(fetched.id)
        except DartRequestException as error:
            self.assertEqual(error.response.status_code, 404)
        else:
            self.fail('workflow should have been missing after delete!')
def setUp(self):
    """Save a dataset, a subscription on it, a datastore, a workflow and two template actions.

    Reads the bucket name from the DART_TEST_BUCKET environment variable.
    """
    dart = Dart(host='localhost', port=5000)
    """ :type dart: dart.client.python.dart_client.Dart """
    self.dart = dart

    impala_root = 's3://' + os.environ['DART_TEST_BUCKET'] + '/impala'

    dataset_data = DatasetData(
        name='test-dataset',
        table_name='test_dataset_table',
        load_type=LoadType.INSERT,
        location=impala_root,
        data_format=DataFormat(FileFormat.TEXTFILE, RowFormat.DELIMITED),
        columns=[
            Column('c1', DataType.VARCHAR, 50),
            Column('c2', DataType.BIGINT),
        ],
        tags=[])
    self.dataset = self.dart.save_dataset(Dataset(data=dataset_data))

    # subscription bounded by two prefixes under the dataset location, filtered by regex
    start = impala_root + '/impala'
    end = impala_root + '/install'
    regex = '.*\\.rpm'
    subscription_data = SubscriptionData('test-subscription', self.dataset.id, start, end, regex)
    self.subscription = self.dart.save_subscription(Subscription(data=subscription_data))

    datastore_data = DatastoreData(
        'test-datastore', 'no_op_engine',
        args={'action_sleep_time_in_seconds': 0},
        state=DatastoreState.TEMPLATE)
    self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

    workflow_data = WorkflowData('test-workflow', self.datastore.id, state=WorkflowState.ACTIVE)
    self.workflow = self.dart.save_workflow(Workflow(data=workflow_data), self.datastore.id)

    succeed_name = NoOpActionTypes.action_that_succeeds.name
    consume_name = NoOpActionTypes.consume_subscription.name
    template_actions = [
        Action(data=ActionData(succeed_name, succeed_name, state=ActionState.TEMPLATE)),
        Action(data=ActionData(consume_name, consume_name,
                               {'subscription_id': self.subscription.id},
                               state=ActionState.TEMPLATE)),
    ]
    self.action0, self.action1 = self.dart.save_actions(template_actions, workflow_id=self.workflow.id)
def __init__(self, emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region, emr_core_node_limit,
             emr_impala_docker_repo_base_url, emr_impala_version, emr_cluster_tags,
             emr_cluster_availability_zone, dart_host, dart_port, dart_api_version=1,
             emr_release_label='emr-4.2.0', emr_instance_type='m3.2xlarge'):
    """Store the EMR defaults, register the action handlers and build the wrapped EmrEngine and Dart client.

    All ``emr_*`` arguments other than ``emr_release_label`` and ``emr_instance_type`` are
    forwarded to ``EmrEngine``; the ``dart_*`` arguments go to both ``EmrEngine`` and ``Dart``.
    """
    self.emr_release_label = emr_release_label
    self.emr_instance_type = emr_instance_type

    supported = (
        (DynamoDBActionTypes.create_table, create_table),
        (DynamoDBActionTypes.delete_table, delete_table),
        (DynamoDBActionTypes.load_dataset, load_dataset),
    )
    self._action_handlers = {action_type.name: handler for action_type, handler in supported}

    # NOTE(review): these are passed positionally -- confirm the order matches the
    # EmrEngine.__init__ signature actually in use.
    emr_engine_args = (
        emr_ec2_keyname,
        emr_instance_profile,
        emr_service_role,
        emr_region,
        emr_core_node_limit,
        emr_impala_docker_repo_base_url,
        emr_impala_version,
        emr_cluster_tags,
        emr_cluster_availability_zone,
        dart_host,
        dart_port,
        dart_api_version,
    )
    self.emr_engine = EmrEngine(*emr_engine_args)

    self.dart = Dart(dart_host, dart_port, dart_api_version)
def setUp(self):
    """Save an ACTIVE datastore and workflow holding a single template action that fails."""
    dart = Dart(host='localhost', port=5000)
    """ :type dart: dart.client.python.dart_client.Dart """
    self.dart = dart

    datastore_data = DatastoreData(
        'test-datastore', 'no_op_engine',
        args={'action_sleep_time_in_seconds': 0},
        state=DatastoreState.ACTIVE)
    self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

    workflow_data = WorkflowData('test-workflow', self.datastore.id, state=WorkflowState.ACTIVE)
    self.workflow = self.dart.save_workflow(Workflow(data=workflow_data), self.datastore.id)

    fail_name = NoOpActionTypes.action_that_fails.name
    failing_action = Action(data=ActionData(fail_name, fail_name, state=ActionState.TEMPLATE))
    self.dart.save_actions([failing_action], workflow_id=self.workflow.id)
def __init__(self, kms_key_arn, secrets_s3_path, vpc_subnet, security_group_ids, region, availability_zones,
             publicly_accessible, cluster_tags, dart_host, dart_port, dart_api_version=1):
    """Build the Dart client, register the action handlers and keep the cluster settings.

    ``kms_key_arn`` and ``secrets_s3_path`` are handed to ``Secrets``; the ``dart_*`` arguments
    are handed to ``Dart``; every other argument is stored on the instance under its own name.
    """
    self.dart = Dart(dart_host, dart_port, dart_api_version)

    supported = (
        (RedshiftActionTypes.start_datastore, start_datastore),
        (RedshiftActionTypes.stop_datastore, stop_datastore),
        (RedshiftActionTypes.execute_sql, execute_sql),
        (RedshiftActionTypes.load_dataset, load_dataset),
        (RedshiftActionTypes.consume_subscription, consume_subscription),
        (RedshiftActionTypes.copy_to_s3, copy_to_s3),
        (RedshiftActionTypes.create_snapshot, create_snapshot),
        (RedshiftActionTypes.data_check, data_check),
    )
    self._action_handlers = {action_type.name: handler for action_type, handler in supported}

    # plain settings, stored as given
    self.vpc_subnet = vpc_subnet
    self.availability_zones = availability_zones
    self.publicly_accessible = publicly_accessible
    self.security_group_ids = security_group_ids
    self.cluster_tags = cluster_tags
    self.region = region

    self.secrets = Secrets(kms_key_arn, secrets_s3_path)
class TestDatastoreCrud(unittest.TestCase):
    """Create, read, update and delete a datastore through a Dart client on localhost:5000."""

    def setUp(self):
        self.dart = Dart(host='localhost', port=5000)

    def test_crud(self):
        """Walk one datastore through save, get, modify and delete, then expect a 404 on re-read."""
        local_datastore = Datastore(data=DatastoreData(
            name='test-datastore',
            engine_name='no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            tags=['foo'],
        ))

        created = self.dart.save_datastore(local_datastore)

        # copy fields that are populated at creation time
        for field in ('s3_artifacts_path', 's3_logs_path', 'user_id'):
            setattr(local_datastore.data, field, getattr(created.data, field))
        self.assertEqual(created.data.to_dict(), local_datastore.data.to_dict())

        fetched = self.dart.get_datastore(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        fetched.data.engine_name = 'not_existing_engine'
        fetched.data.state = DatastoreState.ACTIVE
        updated = self.dart.save_datastore(fetched)

        # not all properties can be modified
        self.assertEqual(updated.data.engine_name, 'no_op_engine')
        self.assertEqual(updated.data.state, DatastoreState.ACTIVE)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        self.dart.delete_datastore(fetched.id)
        try:
            self.dart.get_datastore(fetched.id)
        except DartRequestException as error:
            self.assertEqual(error.response.status_code, 404)
        else:
            self.fail('datastore should have been missing after delete!')
class TestDatasetCrud(unittest.TestCase):
    """Dataset CRUD round trip against a Dart client on localhost:5000."""

    def setUp(self):
        self.dart = Dart(host='localhost', port=5000)

    def _new_dataset(self):
        """Build the unsaved dataset used by the test (two columns, parquet, one tag)."""
        type_name = NoOpActionTypes.action_that_succeeds.name
        column_defs = [
            Column('c1', DataType.VARCHAR, 50),
            Column('c2', DataType.BIGINT),
        ]
        parquet_format = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        dataset = Dataset(data=DatasetData(
            name=type_name,
            table_name=type_name,
            load_type=LoadType.INSERT,
            location='s3://bucket/prefix',
            data_format=parquet_format,
            columns=column_defs,
            tags=['foo']))
        dataset.data.user_id = '*****@*****.**'
        return dataset

    def test_crud(self):
        """Save, re-read, modify and delete the dataset; a read after delete must return 404."""
        unsaved = self._new_dataset()

        saved = self.dart.save_dataset(unsaved)
        self.assertEqual(saved.data.to_dict(), unsaved.data.to_dict())

        reloaded = self.dart.get_dataset(saved.id)
        self.assertEqual(saved.to_dict(), reloaded.to_dict())

        reloaded.data.compression = Compression.GZIP
        resaved = self.dart.save_dataset(reloaded)
        self.assertEqual(resaved.data.compression, Compression.GZIP)
        self.assertNotEqual(saved.to_dict(), resaved.to_dict())

        self.dart.delete_dataset(reloaded.id)
        try:
            self.dart.get_dataset(reloaded.id)
        except DartRequestException as error:
            self.assertEqual(error.response.status_code, 404)
        else:
            self.fail('dataset should have been missing after delete!')
class TestDatastoreCrud(unittest.TestCase):
    """Datastore CRUD round trip against a Dart client on localhost:5000."""

    def setUp(self):
        self.dart = Dart(host='localhost', port=5000)

    def test_crud(self):
        """Save, re-read, modify and delete a datastore; a read after delete must return 404."""
        datastore_data = DatastoreData(
            name='test-datastore',
            engine_name='no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            tags=['foo'])
        unsaved = Datastore(data=datastore_data)

        saved = self.dart.save_datastore(unsaved)

        # copy fields that are populated at creation time
        unsaved.data.s3_artifacts_path = saved.data.s3_artifacts_path
        unsaved.data.s3_logs_path = saved.data.s3_logs_path
        unsaved.data.user_id = saved.data.user_id
        self.assertEqual(saved.data.to_dict(), unsaved.data.to_dict())

        reloaded = self.dart.get_datastore(saved.id)
        self.assertEqual(saved.to_dict(), reloaded.to_dict())

        reloaded.data.engine_name = 'not_existing_engine'
        reloaded.data.state = DatastoreState.ACTIVE
        resaved = self.dart.save_datastore(reloaded)

        # not all properties can be modified
        self.assertEqual(resaved.data.engine_name, 'no_op_engine')
        self.assertEqual(resaved.data.state, DatastoreState.ACTIVE)
        self.assertNotEqual(saved.to_dict(), resaved.to_dict())

        self.dart.delete_datastore(reloaded.id)
        try:
            self.dart.get_datastore(reloaded.id)
        except DartRequestException as error:
            self.assertEqual(error.response.status_code, 404)
        else:
            self.fail('datastore should have been missing after delete!')
def add_elasticsearch_engine(config):
    """Save (create or update) the ``elasticsearch_engine`` definition in dart.

    Reads ``config['engines']['elasticsearch_engine']`` for the Dart connection options,
    docker image and engine config, and ``config['dart']`` for ``use_local_engines`` and
    ``env_name``. If an engine named ``elasticsearch_engine`` is already returned by
    ``dart.get_engines()``, its id is reused when saving.

    :param config: nested dict holding the ``engines`` and ``dart`` sections described above
    """
    engine_config = config['engines']['elasticsearch_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving elasticsearch_engine')

    # reuse the id of an already-saved engine with this name, if there is one
    engine_id = None
    for e in dart.get_engines():
        if e.data.name == 'elasticsearch_engine':
            engine_id = e.id

    # no ECS task definition is sent when engines run locally
    ecs_task_definition = None if config['dart']['use_local_engines'] else {
        'family': 'dart-%s-elasticsearch_engine' % config['dart']['env_name'],
        'containerDefinitions': [
            {
                'name': 'dart-elasticsearch_engine',
                'cpu': 64,
                'memory': 256,
                'image': engine_config['docker_image'],
                'logConfiguration': {'logDriver': 'syslog'},
                'environment': [
                    {'name': 'DART_ROLE', 'value': 'worker:engine_elasticsearch'},
                    {'name': 'DART_CONFIG', 'value': engine_config['config']},
                    {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']}
                ],
                'mountPoints': [
                    {
                        'containerPath': '/mnt/ecs_agent_data',
                        'sourceVolume': 'ecs-agent-data',
                        'readOnly': True
                    }
                ],
            }
        ],
        'volumes': [
            {
                'host': {'sourcePath': '/var/lib/ecs/data'},
                'name': 'ecs-agent-data'
            }
        ],
    }

    e1 = dart.save_engine(Engine(id=engine_id, data=EngineData(
        name='elasticsearch_engine',
        description='For Elasticsearch clusters',
        options_json_schema={
            'type': 'object',
            'properties': {
                'access_key_id': {
                    'type': 'string',
                    'default': '',
                    'minLength': 0,
                    'maxLength': 20,
                    'description': 'the access_key_id for accessing this elasticsearch cluster. ' +
                                   'Leave blank to use Dart\'s instance profile credentials'
                },
                'secret_access_key': {
                    'type': 'string',
                    'default': '',
                    'minLength': 0,
                    'maxLength': 40,
                    'x-dart-secret': True,
                    'description': 'the secret_access_key for accessing this elasticsearch cluster. ' +
                                   'Leave blank to use Dart\'s instance profile credentials'
                },
                'endpoint': {
                    'type': 'string',
                    'minLength': 1,
                    'maxLength': 256,
                    # raw string: the value is unchanged, but the regex escapes are no longer
                    # (invalid) string escape sequences
                    'pattern': r'^[a-zA-Z0-9]+[a-zA-Z0-9\-\.]*\.es\.amazonaws\.com$',
                    'description': 'The AWS Elasticsearch domain endpoint that you use to submit index and search requests.'
                },
            },
            'additionalProperties': False,
            'required': ['endpoint']
        },
        supported_action_types=[
            ElasticsearchActionTypes.data_check,
            ElasticsearchActionTypes.create_index,
            ElasticsearchActionTypes.create_mapping,
            ElasticsearchActionTypes.create_template,
            ElasticsearchActionTypes.delete_index,
            ElasticsearchActionTypes.delete_template,
            ElasticsearchActionTypes.force_merge_index,
        ],
        ecs_task_definition=ecs_task_definition
    )))

    _logger.info('saved elasticsearch_engine: %s' % e1.id)
from dart.client.python.dart_client import Dart
from dart.model.action import Action
from dart.model.action import ActionData
from dart.model.dataset import FileFormat

# Example script: saves one load_dataset action on an existing datastore and prints its id.
if __name__ == '__main__':
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    # save_actions returns one saved action per submitted action; only one is submitted here
    action = dart.save_actions([
        Action(data=ActionData('load_dataset', 'load_dataset', args={
            'dataset_id': 'NVVLBI7CWB',
            's3_path_start_prefix_inclusive': 's3://example-bucket/weblogs/www.retailmenot.com/ec2/2014/52',
            's3_path_end_prefix_exclusive': 's3://example-bucket/weblogs/www.retailmenot.com/ec2/2015/00',
            's3_path_regex_filter': 's3://example-bucket/weblogs/www.retailmenot.com/ec2/2014/../www\\.retailmenot\\.com.*',
            'target_file_format': FileFormat.PARQUET,
        })),
    ], datastore_id='IOMUQ5L8AX')[0]

    # single parenthesised argument: prints the same text on Python 2 and Python 3
    print('created action: %s' % action.id)
def add_emr_engine_sub_graphs(config):
    """Save the sub-graph definitions that belong to the already-registered ``emr_engine``.

    Reads the Dart connection options from ``config['engines']['emr_engine']['options']``,
    looks up the id of the engine named ``emr_engine`` and saves each sub-graph definition
    against that id.

    :param config: nested dict holding the ``engines`` section described above
    :raises RuntimeError: if no engine named ``emr_engine`` is returned by dart
    """
    engine_config = config['engines']['emr_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving emr_engine sub_graphs')

    engine_id = None
    for engine in dart.get_engines():
        if engine.data.name == 'emr_engine':
            engine_id = engine.id
    if not engine_id:
        # a bare `raise` here had no active exception to re-raise and produced an unrelated error
        raise RuntimeError('emr_engine was not found in dart; save the engine before its sub_graphs')

    subgraph_definitions = [
        SubGraphDefinition(data=SubGraphDefinitionData(
            name='consume_subscription_workflow',
            description='Add to a datastore to create entities for loading a dataset on an ongoing basis',
            engine_name='emr_engine',
            related_type=EntityType.datastore,
            related_is_a=Relationship.PARENT,
            workflows=[
                Workflow(id=Ref.workflow(1), data=WorkflowData(
                    name='emr-workflow-consume_subscription',
                    datastore_id=Ref.parent(),
                    engine_name='emr_engine',
                )),
            ],
            subscriptions=[
                Subscription(id=Ref.subscription(1), data=SubscriptionData(
                    name='emr-subscription',
                    dataset_id=''
                )),
            ],
            triggers=[
                Trigger(id=Ref.trigger(1), data=TriggerData(
                    name='emr-trigger-subscription-1G-batch',
                    trigger_type_name=subscription_batch_trigger.name,
                    workflow_ids=[Ref.workflow(1)],
                    args={
                        'subscription_id': Ref.subscription(1),
                        'unconsumed_data_size_in_bytes': 1000*1000*1000
                    }
                )),
            ],
            actions=[
                Action(id=Ref.action(1), data=ActionData(
                    name='emr-action-consume_subscription',
                    action_type_name=EmrActionTypes.consume_subscription.name,
                    engine_name='emr_engine',
                    workflow_id=Ref.workflow(1),
                    state=ActionState.TEMPLATE,
                    args={'subscription_id': Ref.subscription(1)}
                )),
            ]
        ))
    ]

    for definition in subgraph_definitions:
        s = dart.save_subgraph_definition(definition, engine_id)
        _logger.info('created subgraph_definition: %s' % s.id)
class TestActionCrud(unittest.TestCase):
    """Create, read and delete actions saved against a datastore and against a workflow."""

    def setUp(self):
        self.dart = Dart(host='localhost', port=5000)
        args = {'action_sleep_time_in_seconds': 0}
        dst = Datastore(data=DatastoreData(name='test-datastore', engine_name='no_op_engine', args=args,
                                           state=DatastoreState.TEMPLATE))
        self.datastore = self.dart.save_datastore(dst)
        wf = Workflow(data=WorkflowData(name='test-workflow', datastore_id=self.datastore.id))
        self.workflow = self.dart.save_workflow(workflow=wf, datastore_id=self.datastore.id)
        # show full dict diffs when an assertEqual on to_dict() output fails
        self.maxDiff = 99999

    def tearDown(self):
        self.dart.delete_datastore(self.datastore.id)
        self.dart.delete_workflow(self.workflow.id)

    def _assert_action_missing(self, action_id):
        """Fail unless fetching ``action_id`` raises a DartRequestException with a 404 status."""
        try:
            self.dart.get_action(action_id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('action should have been missing after delete!')

    def test_crud_datastore(self):
        """Save two actions on the datastore, read them back, delete them, expect 404 for both."""
        action0 = Action(data=ActionData(name=NoOpActionTypes.action_that_succeeds.name,
                                         action_type_name=NoOpActionTypes.action_that_succeeds.name,
                                         engine_name='no_op_engine'))
        action1 = Action(data=ActionData(name=NoOpActionTypes.action_that_succeeds.name,
                                         action_type_name=NoOpActionTypes.action_that_succeeds.name,
                                         engine_name='no_op_engine'))
        posted_actions = self.dart.save_actions(actions=[action0, action1], datastore_id=self.datastore.id)

        # copy fields that are populated at creation time
        action0.data.datastore_id = posted_actions[0].data.datastore_id
        action1.data.datastore_id = posted_actions[1].data.datastore_id
        action0.data.args = {}
        action1.data.args = {}
        action0.data.order_idx = posted_actions[0].data.order_idx
        action1.data.order_idx = posted_actions[1].data.order_idx
        action0.data.user_id = posted_actions[0].data.user_id
        action1.data.user_id = posted_actions[1].data.user_id
        self.assertEqual(posted_actions[0].data.to_dict(), action0.data.to_dict())
        self.assertEqual(posted_actions[1].data.to_dict(), action1.data.to_dict())

        # When retrieving an action, its queue time and state
        # differs from the action default values created by action0 and action1
        a0 = self.dart.get_action(posted_actions[0].id)
        a1 = self.dart.get_action(posted_actions[1].id)
        action0.data.state = a0.data.state
        action1.data.state = a1.data.state
        action0.data.queued_time = a0.data.queued_time
        action1.data.queued_time = a1.data.queued_time
        self.assertEqual(a0.data.to_dict(), action0.data.to_dict())
        self.assertEqual(a1.data.to_dict(), action1.data.to_dict())

        self.dart.delete_action(a0.id)
        self.dart.delete_action(a1.id)

        # both actions must be gone; each one is checked explicitly
        self._assert_action_missing(a0.id)
        self._assert_action_missing(a1.id)

    def test_crud_workflow(self):
        """Save two template actions on the workflow, read them back, delete them, expect 404 for both."""
        action0 = Action(data=ActionData(name=NoOpActionTypes.action_that_succeeds.name,
                                         action_type_name=NoOpActionTypes.action_that_succeeds.name,
                                         state=ActionState.TEMPLATE,
                                         engine_name='no_op_engine'))
        action1 = Action(data=ActionData(name=NoOpActionTypes.action_that_succeeds.name,
                                         action_type_name=NoOpActionTypes.action_that_succeeds.name,
                                         state=ActionState.TEMPLATE,
                                         engine_name='no_op_engine'))
        posted_actions = self.dart.save_actions([action0, action1], workflow_id=self.workflow.id)

        # copy fields that are populated at creation time
        action0.data.workflow_id = posted_actions[0].data.workflow_id
        action1.data.workflow_id = posted_actions[1].data.workflow_id
        action0.data.order_idx = posted_actions[0].data.order_idx
        action1.data.order_idx = posted_actions[1].data.order_idx
        action0.data.args = {}
        action1.data.args = {}
        action0.data.user_id = posted_actions[0].data.user_id
        action1.data.user_id = posted_actions[1].data.user_id
        self.assertEqual(posted_actions[0].data.to_dict(), action0.data.to_dict())
        self.assertEqual(posted_actions[1].data.to_dict(), action1.data.to_dict())

        a0 = self.dart.get_action(posted_actions[0].id)
        a1 = self.dart.get_action(posted_actions[1].id)
        self.assertEqual(a0.data.to_dict(), action0.data.to_dict())
        self.assertEqual(a1.data.to_dict(), action1.data.to_dict())

        self.dart.delete_action(a0.id)
        self.dart.delete_action(a1.id)

        # both actions must be gone; each one is checked explicitly
        self._assert_action_missing(a0.id)
        self._assert_action_missing(a1.id)
def __init__(self, dart_host, dart_port, dart_api_version):
    """Build the Dart client and register the single supported action (S3 copy)."""
    self.dart = Dart(dart_host, dart_port, dart_api_version)

    handlers = {}
    handlers[S3ActionTypes.copy.name] = s3_copy
    self._action_handlers = handlers
def add_no_op_engine_sub_graphs(config):
    """Save the "workflow chaining demo" sub-graph definition for the registered ``no_op_engine``.

    Reads the Dart connection options from ``config['engines']['no_op_engine']['options']``,
    looks up the id of the engine named ``no_op_engine`` and saves each sub-graph definition
    against that id. The demo defines two workflows, seven template actions, and a
    workflow-completion trigger that targets workflow 2 with workflow 1 as its
    ``completed_workflow_id``.

    :param config: nested dict holding the ``engines`` section described above
    :raises RuntimeError: if no engine named ``no_op_engine`` is returned by dart
    """
    engine_config = config['engines']['no_op_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving no_op_engine sub_graphs')

    engine_id = None
    for engine in dart.get_engines():
        if engine.data.name == 'no_op_engine':
            engine_id = engine.id
    if not engine_id:
        # a bare `raise` here had no active exception to re-raise and produced an unrelated error
        raise RuntimeError('no_op_engine was not found in dart; save the engine before its sub_graphs')

    def chaining_workflow(ref_number, name):
        """Build one ACTIVE no_op_engine workflow attached to the parent datastore."""
        return Workflow(id=Ref.workflow(ref_number), data=WorkflowData(
            name=name,
            datastore_id=Ref.parent(),
            engine_name='no_op_engine',
            state=WorkflowState.ACTIVE,
        ))

    def template_action(ref_number, action_type, workflow_number, order_idx):
        """Build one TEMPLATE action named after its action type, placed in the given workflow."""
        return Action(id=Ref.action(ref_number), data=ActionData(
            name=action_type.name,
            action_type_name=action_type.name,
            engine_name='no_op_engine',
            workflow_id=Ref.workflow(workflow_number),
            order_idx=order_idx,
            state=ActionState.TEMPLATE,
        ))

    succeeds = NoOpActionTypes.action_that_succeeds
    fails = NoOpActionTypes.action_that_fails

    subgraph_definitions = [
        SubGraphDefinition(data=SubGraphDefinitionData(
            name='workflow chaining demo',
            description='demonstrate workflow chaining',
            engine_name='no_op_engine',
            related_type=EntityType.datastore,
            related_is_a=Relationship.PARENT,
            workflows=[
                chaining_workflow(1, 'no-op-workflow-chaining-wf1'),
                chaining_workflow(2, 'no-op-workflow-chaining-wf2'),
            ],
            actions=[
                # workflow 1: four succeeding actions
                template_action(1, succeeds, 1, 1),
                template_action(2, succeeds, 1, 2),
                template_action(3, succeeds, 1, 3),
                template_action(4, succeeds, 1, 4),
                # workflow 2: two succeeding actions followed by one that fails
                template_action(5, succeeds, 2, 1),
                template_action(6, succeeds, 2, 2),
                template_action(7, fails, 2, 3),
            ],
            triggers=[
                Trigger(id=Ref.trigger(1), data=TriggerData(
                    name='no-op-trigger-workflow-completion',
                    trigger_type_name=workflow_completion_trigger.name,
                    workflow_ids=[Ref.workflow(2)],
                    state=TriggerState.ACTIVE,
                    args={'completed_workflow_id': Ref.workflow(1)}
                )),
            ],
        ))
    ]

    for definition in subgraph_definitions:
        s = dart.save_subgraph_definition(definition, engine_id)
        _logger.info('created subgraph_definition: %s' % s.id)
from dart.client.python.dart_client import Dart from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, FileFormat, DataType, Compression, RowFormat, \ LoadType if __name__ == '__main__': dart = Dart('localhost', 5000) assert isinstance(dart, Dart) dataset = dart.save_dataset( Dataset(data=(DatasetData( name='beacon_native_app_parsed_v01', table_name='beacon_native_app', location='s3://example-bucket/nb.retailmenot.com/parsed_logs', load_type=LoadType.INSERT, data_format=DataFormat(FileFormat.TEXTFILE, RowFormat.DELIMITED, delimited_by='\t', quoted_by='"', escaped_by='\\', null_string='NULL', num_header_rows=1), compression=Compression.NONE, partitions=[ Column('year', DataType.STRING), Column('week', DataType.STRING), ], columns=[ Column('logFileId', DataType.BIGINT), Column('lineNumber', DataType.INT), Column('created', DataType.TIMESTAMP,
from dart.client.python.dart_client import Dart

# Example script: fetches one action by id, patches its order_idx and prints its id.
if __name__ == '__main__':
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    action = dart.get_action('8U7H6OLHC5')
    action = dart.patch_action(action, order_idx=5)

    # single parenthesised argument: prints the same text on Python 2 and Python 3
    print('patched action: %s' % action.id)
def add_emr_engine(config):
    """Save (create or update) the ``emr_engine`` definition in dart.

    Reads ``config['engines']['emr_engine']`` for the Dart connection options, docker image
    and engine config, and ``config['dart']`` for ``use_local_engines`` and ``env_name``.
    If an engine named ``emr_engine`` is already returned by ``dart.get_engines()``, its id
    is reused when saving.

    :param config: nested dict holding the ``engines`` and ``dart`` sections described above
    """
    engine_config = config['engines']['emr_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving emr_engine')

    # reuse the id of an already-saved engine with this name, if there is one
    engine_id = None
    for e in dart.get_engines():
        if e.data.name == 'emr_engine':
            engine_id = e.id

    # no ECS task definition is sent when engines run locally
    ecs_task_definition = None if config['dart']['use_local_engines'] else {
        'family': 'dart-%s-emr_engine' % config['dart']['env_name'],
        'containerDefinitions': [
            {
                'name': 'dart-emr_engine',
                'cpu': 64,
                'memory': 256,
                'image': engine_config['docker_image'],
                'logConfiguration': {'logDriver': 'syslog'},
                'environment': [
                    {'name': 'DART_ROLE', 'value': 'worker:engine_emr'},
                    {'name': 'DART_CONFIG', 'value': engine_config['config']},
                    {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']}
                ],
                'mountPoints': [
                    {
                        'containerPath': '/mnt/ecs_agent_data',
                        'sourceVolume': 'ecs-agent-data',
                        'readOnly': True
                    }
                ],
            }
        ],
        'volumes': [
            {
                'host': {'sourcePath': '/var/lib/ecs/data'},
                'name': 'ecs-agent-data'
            }
        ],
    }

    e1 = dart.save_engine(Engine(id=engine_id, data=EngineData(
        name='emr_engine',
        description='For EMR clusters that use Hive, Impala, Spark, etc.',
        options_json_schema={
            'type': 'object',
            'properties': {
                'release_label': {
                    'type': 'string',
                    # dots are escaped so they match a literal '.', not any character
                    'pattern': r'^emr-[0-9]\.[0-9]\.[0-9]+$',
                    'default': 'emr-4.2.0',
                    'description': 'desired EMR release label'
                },
                'instance_type': {
                    'readonly': True,
                    'type': ['string', 'null'],
                    'default': 'm3.2xlarge',
                    'description': 'The ec2 instance type of master/core nodes'
                },
                'instance_count': {
                    'type': ['integer', 'null'],
                    'default': None,
                    'minimum': 1,
                    'maximum': 50,
                    'description': 'The total number of nodes in this cluster (overrides data_to_freespace_ratio)'
                },
                'data_to_freespace_ratio': {
                    'type': ['number', 'null'],
                    'default': 0.5,
                    'minimum': 0.0,
                    'maximum': 1.0,
                    'description': 'Desired ratio of HDFS data/free-space'
                },
                'dry_run': {
                    'type': ['boolean', 'null'],
                    'default': False,
                    'description': 'write extra_data to actions, but do not actually run'
                },
                'ec2_keyname': {
                    'type': 'string',
                    'description': 'The name of the ec2_key_pair for the emr cluster. If this is not defined, the default key-pair from config is chosen.',
                    'default': None
                },
            },
            'additionalProperties': False,
            'required': ['release_label'],
        },
        supported_action_types=[
            EmrActionTypes.start_datastore,
            EmrActionTypes.terminate_datastore,
            EmrActionTypes.load_dataset,
            EmrActionTypes.consume_subscription,
            EmrActionTypes.run_hive_script_action,
            EmrActionTypes.run_impala_script_action,
            EmrActionTypes.run_pyspark_script_action,
            EmrActionTypes.copy_hdfs_to_s3_action
        ],
        ecs_task_definition=ecs_task_definition
    )))

    _logger.info('saved emr_engine: %s' % e1.id)
from dart.client.python.dart_client import Dart from dart.model.action import Action from dart.model.action import ActionData from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, FileFormat, RowFormat, DataType, Compression from dart.model.datastore import Datastore, DatastoreData, DatastoreState if __name__ == '__main__': dart = Dart('localhost', 5000) # dart = Dart() assert isinstance(dart, Dart) dataset = dart.save_dataset(Dataset(data=DatasetData( name='weblogs_v01', table_name='weblogs', location='s3://example-bucket/weblogs/www.retailmenot.com/ec2/', data_format=DataFormat( file_format=FileFormat.TEXTFILE, row_format=RowFormat.REGEX, regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z", regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s", ), columns=[ Column('ip', DataType.STRING), Column('user', DataType.STRING), Column('requestDate', DataType.TIMESTAMP, 
date_pattern='dd/MMM/yyyy:HH:mm:ss Z'), Column('httpMethod', DataType.STRING), Column('urlPath', DataType.STRING), Column('queryString', DataType.STRING), Column('httpVersion', DataType.STRING), Column('statusCode', DataType.STRING), Column('bytesSent', DataType.INT),
def add_no_op_engine(config):
    """Save (create or update) the ``no_op_engine`` definition in dart.

    Reads ``config['engines']['no_op_engine']`` for the Dart connection options, docker image
    and engine config, and ``config['dart']`` for ``use_local_engines`` and ``env_name``.
    If an engine named ``no_op_engine`` is already returned by ``dart.get_engines()``, its id
    is reused when saving.

    :param config: nested dict holding the ``engines`` and ``dart`` sections described above
    """
    engine_config = config['engines']['no_op_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving no_op_engine')

    # the last engine with a matching name (if any) supplies the id to save under
    engine_id = None
    for existing in dart.get_engines():
        if existing.data.name == 'no_op_engine':
            engine_id = existing.id

    if config['dart']['use_local_engines']:
        # no ECS task definition is sent when engines run locally
        ecs_task_definition = None
    else:
        family = 'dart-%s-no_op_engine' % config['dart']['env_name']
        container = {
            'name': 'dart-no_op_engine',
            'cpu': 64,
            'memory': 256,
            'image': engine_config['docker_image'],
            'logConfiguration': {'logDriver': 'syslog'},
            'environment': [
                {'name': 'DART_ROLE', 'value': 'worker:engine_no_op'},
                {'name': 'DART_CONFIG', 'value': engine_config['config']},
                {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']},
            ],
            'mountPoints': [
                {
                    'containerPath': '/mnt/ecs_agent_data',
                    'sourceVolume': 'ecs-agent-data',
                    'readOnly': True,
                },
            ],
        }
        agent_data_volume = {
            'host': {'sourcePath': '/var/lib/ecs/data'},
            'name': 'ecs-agent-data',
        }
        ecs_task_definition = {
            'family': family,
            'containerDefinitions': [container],
            'volumes': [agent_data_volume],
        }

    options_schema = {
        'type': 'object',
        'properties': {
            'action_sleep_time_in_seconds': {
                'type': 'integer',
                'minimum': 0,
                'default': 5,
                'description': 'The time to sleep for each action before completing'
            },
        },
        'additionalProperties': False,
        'required': [],
    }
    action_types = [
        NoOpActionTypes.action_that_succeeds,
        NoOpActionTypes.action_that_fails,
        NoOpActionTypes.copy_hdfs_to_s3_action,
        NoOpActionTypes.load_dataset,
        NoOpActionTypes.consume_subscription,
    ]
    engine_data = EngineData(
        name='no_op_engine',
        description='Helps engineering test dart',
        options_json_schema=options_schema,
        supported_action_types=action_types,
        ecs_task_definition=ecs_task_definition,
    )
    e1 = dart.save_engine(Engine(id=engine_id, data=engine_data))

    _logger.info('saved no_op_engine: %s' % e1.id)
from dart.client.python.dart_client import Dart
from dart.model.workflow import Workflow, WorkflowState


def _deactivate_workflow(client, workflow_id):
    """Fetch the workflow with ``workflow_id``, mark it INACTIVE and save it.

    :param client: Dart client used for both the fetch and the save
    :param workflow_id: id passed to ``client.get_workflow``
    """
    wf = client.get_workflow(workflow_id)
    assert isinstance(wf, Workflow)
    wf.data.state = WorkflowState.INACTIVE
    client.save_workflow(wf)


if __name__ == '__main__':
    # One-off script: talks to a dart instance on localhost:5000.
    client = Dart('localhost', 5000)
    assert isinstance(client, Dart)
    _deactivate_workflow(client, '456SGU4U6T')
from dart.client.python.dart_client import Dart from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, FileFormat, DataType, Compression, RowFormat, \ LoadType if __name__ == '__main__': dart = Dart('localhost', 5000) assert isinstance(dart, Dart) dataset = dart.save_dataset(Dataset(id='PDUZ8EDNOR', data=(DatasetData( name='beacon_native_app_parsed_gzipped_v03', table_name='beacon_native_app', location='s3://example-bucket/prd/beacon/native_app/v3/dwh-delimited/gzipped', load_type=LoadType.INSERT, distribution_keys=['created'], sort_keys=['created', 'eventtype'], hive_compatible_partition_folders=True, data_format=DataFormat( FileFormat.TEXTFILE, RowFormat.DELIMITED, delimited_by='\t', quoted_by='"', escaped_by='\\', null_string='NULL', ), compression=Compression.GZIP, partitions=[Column('createdpartition', DataType.STRING)], columns=[ Column('logfileid', DataType.INT), Column('linenumber', DataType.INT), Column('created', DataType.TIMESTAMP, date_pattern="yyyy-MM-dd HH:mm:ss"), Column('remoteip', DataType.VARCHAR, 500),
def setUp(self):
    """Build a Dart client for localhost:5000 and store it on ``self.dart``."""
    # NOTE(review): presumably a unittest fixture hook -- the enclosing
    # class is not visible from this snippet.
    client = Dart(host='localhost', port=5000)
    self.dart = client
from dart.client.python.dart_client import Dart from dart.model.action import Action, ActionState from dart.model.action import ActionData from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, DataType, Compression, LoadType from dart.model.datastore import Datastore, DatastoreData, DatastoreState from dart.model.event import Event, EventState from dart.model.event import EventData from dart.model.trigger import Trigger, TriggerData from dart.model.workflow import Workflow, WorkflowState from dart.model.workflow import WorkflowData if __name__ == '__main__': dart = Dart('localhost', 5000) assert isinstance(dart, Dart) dataset = dart.save_dataset(Dataset(data=(DatasetData( name='beacon_native_app_v02', table_name='beacon_native_app', location='s3://example-bucket/prd/beacon/native_app/v2/parquet/snappy', hive_compatible_partition_folders=True, load_type=LoadType.INSERT, data_format=DataFormat('parquet'), columns=[ Column('logFileId', DataType.BIGINT), Column('lineNumber', DataType.INT), Column('created', DataType.BIGINT), Column('remoteip', DataType.STRING), Column('useragent', DataType.STRING), Column('eventType', DataType.STRING), Column('appVersion', DataType.STRING), Column('advertiserID', DataType.STRING),