Ejemplo n.º 1
0
    def __init__(self, ec2_keyname, instance_profile, service_role, subnet_id, region, core_node_limit,
                 impala_docker_repo_base_url, impala_version, cluster_tags, dart_host, dart_port, dart_api_version=1):
        """Store the cluster settings, register the action handlers and create the Dart client.

        ``_region`` is only built when ``region`` is truthy; otherwise it is ``None``.
        ``_conn`` starts out as ``None``.
        """
        super(EmrEngine, self).__init__()

        # (action type, handler) pairs; the mapping below is keyed by each action type's ``name``
        handler_pairs = (
            (EmrActionTypes.start_datastore, start_datastore),
            (EmrActionTypes.terminate_datastore, terminate_datastore),
            (EmrActionTypes.load_dataset, load_dataset),
            (EmrActionTypes.consume_subscription, consume_subscription),
            (EmrActionTypes.run_hive_script_action, run_hive_script),
            (EmrActionTypes.run_impala_script_action, run_impala_script),
            (EmrActionTypes.run_pyspark_script_action, run_pyspark_script),
            (EmrActionTypes.copy_hdfs_to_s3_action, copy_hdfs_to_s3),
        )
        self._action_handlers = dict((action_type.name, handler) for action_type, handler in handler_pairs)

        if region:
            self._region = RegionInfo(self, region, 'elasticmapreduce.%s.amazonaws.com' % region)
        else:
            self._region = None
        self._conn = None

        # plain configuration values, kept as given
        self.ec2_keyname = ec2_keyname
        self.instance_profile = instance_profile
        self.service_role = service_role
        self.subnet_id = subnet_id
        self.core_node_limit = core_node_limit
        self.cluster_tags = cluster_tags
        self.impala_docker_repo_base_url = impala_docker_repo_base_url
        self.impala_version = impala_version

        self.dart = Dart(dart_host, dart_port, dart_api_version)
Ejemplo n.º 2
0
class S3Engine(object):
    """Engine that dispatches a checked-out Dart action to the matching S3 action handler."""

    def __init__(self, region, dart_host, dart_port, dart_api_version):
        """Store the region, create the Dart client and register the action handlers."""
        self.region = region
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # handlers are keyed by action type name and called as handler(engine, datastore, action)
        self._action_handlers = {
            S3ActionTypes.copy.name: copy,
            S3ActionTypes.data_check.name: data_check,
        }

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, run its handler and check the result in.

        Any exception raised while dispatching or handling the action is caught and reported
        as a FAILURE result whose message is the error text followed by the traceback.
        The check-in happens in ``finally`` so it is attempted on success and failure alike.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info("*** S3Engine.run_action: %s", action_type_name)
            # explicit check rather than `assert`, which is stripped when running with -O
            if action_type_name not in self._action_handlers:
                raise ValueError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except Exception as e:
            state = ActionResultState.FAILURE
            # `e.message` is deprecated (PEP 352), missing on Python 3 and not always a string;
            # fall back to the exception itself so building the message cannot fail here
            error_text = getattr(e, 'message', None) or e
            error_message = '%s\n\n\n%s' % (error_text, traceback.format_exc())

        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
Ejemplo n.º 3
0
 def __init__(self, region, dart_host, dart_port, dart_api_version):
     """Keep the region, create the Dart client and map action type names to their handlers."""
     self.region = region
     self.dart = Dart(dart_host, dart_port, dart_api_version)

     # (action type, handler) pairs; the mapping is keyed by each action type's ``name``
     handler_pairs = (
         (S3ActionTypes.copy, copy),
         (S3ActionTypes.data_check, data_check),
     )
     self._action_handlers = dict((action_type.name, handler) for action_type, handler in handler_pairs)
Ejemplo n.º 4
0
Archivo: es.py Proyecto: ophiradi/dart
class ElasticsearchEngine(object):
    """Engine that dispatches a checked-out Dart action to the matching Elasticsearch action handler."""

    def __init__(self, kms_key_arn, secrets_s3_path, dart_host, dart_port, dart_api_version=1):
        """Create the Dart client, register the action handlers and build the ``Secrets`` helper."""

        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # handlers are keyed by action type name and called as handler(engine, datastore, action)
        self._action_handlers = {
            ElasticsearchActionTypes.data_check.name: data_check,
            ElasticsearchActionTypes.create_index.name: create_index,
            ElasticsearchActionTypes.create_template.name: create_template,
            ElasticsearchActionTypes.create_mapping.name: create_mapping,
            ElasticsearchActionTypes.delete_index.name: delete_index,
            ElasticsearchActionTypes.delete_template.name: delete_template,
            ElasticsearchActionTypes.force_merge_index.name: force_merge_index,
        }
        self.secrets = Secrets(kms_key_arn, secrets_s3_path)

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, run its handler and check the result in.

        Any exception raised while dispatching or handling the action is caught and reported
        as a FAILURE result whose message is the error text followed by the traceback.
        The check-in happens in ``finally`` so it is attempted on success and failure alike.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info('**** ElasticsearchEngine.run_action: %s', action_type_name)
            # explicit check rather than `assert`, which is stripped when running with -O
            if action_type_name not in self._action_handlers:
                raise ValueError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except Exception as e:
            state = ActionResultState.FAILURE
            # `e.message` is deprecated (PEP 352), missing on Python 3 and not always a string;
            # fall back to the exception itself so building the message cannot fail here
            error_text = getattr(e, 'message', None) or e
            error_message = '%s\n\n\n%s' % (error_text, traceback.format_exc())

        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
Ejemplo n.º 5
0
class NoOpEngine(object):
    """Engine whose actions only sleep, optionally fail on purpose, or count subscription elements."""

    def __init__(self, region, dart_host='localhost', dart_port=5000, dart_api_version=1):
        """Store the region and create the Dart client."""
        self.region = region
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, simulate it and check the result in.

        The action sleeps for the datastore's ``action_sleep_time_in_seconds`` argument.
        The ``action_that_fails`` action type is then reported as a FAILURE, and the
        ``consume_subscription`` action type fetches and counts its subscription elements.
        Any exception is caught and reported as a FAILURE with the error text and traceback.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        error_message = None
        try:
            sleep_seconds = datastore.data.args['action_sleep_time_in_seconds']
            # pass values as logging arguments so formatting is deferred and cannot
            # misbehave on tuple-valued arguments
            _logger.info('sleeping for %s seconds...', sleep_seconds)
            time.sleep(sleep_seconds)

            action_type_name = action.data.action_type_name
            if action_type_name == NoOpActionTypes.action_that_fails.name:
                state = ActionResultState.FAILURE
                error_message = '%s failed as expected' % NoOpActionTypes.action_that_fails.name

            if action_type_name == NoOpActionTypes.consume_subscription.name:
                subscription_elements = self.dart.get_subscription_elements(action.id)
                # materialize the elements to count them
                _logger.info('consuming subscription, size = %s', len(list(subscription_elements)))

        except Exception as e:
            state = ActionResultState.FAILURE
            # `e.message` is deprecated (PEP 352), missing on Python 3 and not always a string;
            # fall back to the exception itself so building the message cannot fail here
            error_text = getattr(e, 'message', None) or e
            error_message = '%s\n\n\n%s' % (error_text, traceback.format_exc())

        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
Ejemplo n.º 6
0
class S3Engine(ActionRunner):
    """Engine that dispatches a checked-out Dart action to the matching S3 action handler."""

    def __init__(self, region, dart_host, dart_port, dart_api_version):
        """Store the region, create the Dart client and register the action handlers."""
        super(S3Engine, self).__init__()
        self.region = region
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # handlers are keyed by action type name and called as handler(engine, datastore, action)
        self._action_handlers = {
            S3ActionTypes.copy.name: copy,
            S3ActionTypes.data_check.name: data_check,
        }

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, run its handler and report the result.

        Any exception raised while dispatching or handling the action is caught and reported
        as a FAILURE result whose message is the error text followed by the traceback.
        In ``finally`` the result is checked in and then passed to ``publish_sns_message``
        (not defined in this class; presumably inherited from ``ActionRunner``).
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info("*** S3Engine.run_action: %s", action_type_name)
            # explicit check rather than `assert`, which is stripped when running with -O
            if action_type_name not in self._action_handlers:
                raise ValueError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except Exception as e:
            state = ActionResultState.FAILURE
            # `e.message` is deprecated (PEP 352) and missing on Python 3, and str() on it can
            # fail for non-ASCII unicode on Python 2; %-formatting the message (or the exception
            # itself as a fallback) keeps this handler from raising
            error_text = getattr(e, 'message', None) or e
            error_message = '%s\r\r\r%s' % (error_text, traceback.format_exc())

        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
            self.publish_sns_message(action, error_message, state)
Ejemplo n.º 7
0
    def __init__(self,
                 emr_ec2_keyname,
                 emr_instance_profile,
                 emr_service_role,
                 emr_region,
                 emr_core_node_limit,
                 emr_impala_docker_repo_base_url,
                 emr_impala_version,
                 emr_cluster_tags,
                 emr_cluster_availability_zone,
                 dart_host,
                 dart_port,
                 dart_api_version=1,
                 emr_release_label='emr-4.2.0',
                 emr_instance_type='m3.2xlarge'):
        """Store the EMR release/instance settings, register handlers and build the EMR engine and Dart client.

        The ``emr_*`` cluster arguments are forwarded positionally to ``EmrEngine``.
        """
        self.emr_release_label = emr_release_label
        self.emr_instance_type = emr_instance_type

        # (action type, handler) pairs; the mapping is keyed by each action type's ``name``
        handler_pairs = (
            (DynamoDBActionTypes.create_table, create_table),
            (DynamoDBActionTypes.delete_table, delete_table),
            (DynamoDBActionTypes.load_dataset, load_dataset),
        )
        self._action_handlers = dict((action_type.name, handler) for action_type, handler in handler_pairs)

        emr_engine_args = (
            emr_ec2_keyname,
            emr_instance_profile,
            emr_service_role,
            emr_region,
            emr_core_node_limit,
            emr_impala_docker_repo_base_url,
            emr_impala_version,
            emr_cluster_tags,
            emr_cluster_availability_zone,
            dart_host,
            dart_port,
            dart_api_version,
        )
        self.emr_engine = EmrEngine(*emr_engine_args)
        self.dart = Dart(dart_host, dart_port, dart_api_version)
Ejemplo n.º 8
0
 def __init__(self, region, dart_host='localhost', dart_port=5000, dart_api_version=1):
     """Keep the region and create the Dart client from the given host, port and API version."""
     self.region = region
     dart_client = Dart(dart_host, dart_port, dart_api_version)
     self.dart = dart_client
Ejemplo n.º 9
0
Archivo: s3.py Proyecto: karthich/dart
class S3Engine(object):
    """Engine that dispatches a checked-out Dart action to the matching S3 action handler."""

    def __init__(self, dart_host, dart_port, dart_api_version):
        """Create the Dart client and register the action handlers."""
        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # handlers are keyed by action type name and called with the action's args as keywords
        self._action_handlers = {
            S3ActionTypes.copy.name: s3_copy
        }

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, run its handler and check the result in.

        The handler is called with ``action.data.args`` expanded as keyword arguments.
        Any exception raised while dispatching or handling the action is caught and reported
        as a FAILURE result whose message is the error text followed by the traceback.
        A successful action is checked in with no error message.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action

        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info("*** S3Engine.run_action: %s", action_type_name)
            # Raise explicitly instead of pre-assigning `error_message` for an `assert`:
            # the old pre-assignment leaked 'unsupported action: ...' into SUCCESS results,
            # and `assert` is stripped when running with -O.
            if action_type_name not in self._action_handlers:
                raise ValueError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(**action.data.args)
        except Exception as e:
            state = ActionResultState.FAILURE
            # `e.message` is deprecated (PEP 352), missing on Python 3 and not always a string;
            # fall back to the exception itself so building the message cannot fail here
            error_text = getattr(e, 'message', None) or e
            error_message = '%s\n\n\n%s' % (error_text, traceback.format_exc())

        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
Ejemplo n.º 10
0
    def __init__(self,
                 kms_key_arn,
                 secrets_s3_path,
                 vpc_subnet,
                 security_group_ids,
                 region,
                 availability_zones,
                 publicly_accessible,
                 cluster_tags,
                 dart_host,
                 dart_port,
                 dart_api_version=1):
        """Create the Dart client, register the action handlers, store cluster settings and build ``Secrets``."""
        self.dart = Dart(dart_host, dart_port, dart_api_version)

        # (action type, handler) pairs; the mapping is keyed by each action type's ``name``
        handler_pairs = (
            (RedshiftActionTypes.start_datastore, start_datastore),
            (RedshiftActionTypes.stop_datastore, stop_datastore),
            (RedshiftActionTypes.execute_sql, execute_sql),
            (RedshiftActionTypes.load_dataset, load_dataset),
            (RedshiftActionTypes.consume_subscription, consume_subscription),
            (RedshiftActionTypes.copy_to_s3, copy_to_s3),
            (RedshiftActionTypes.create_snapshot, create_snapshot),
            (RedshiftActionTypes.data_check, data_check),
            (RedshiftActionTypes.cluster_maintenance, cluster_maintenance),
        )
        self._action_handlers = dict((action_type.name, handler) for action_type, handler in handler_pairs)

        # plain configuration values, kept as given
        self.region = region
        self.vpc_subnet = vpc_subnet
        self.security_group_ids = security_group_ids
        self.availability_zones = availability_zones
        self.publicly_accessible = publicly_accessible
        self.cluster_tags = cluster_tags

        self.secrets = Secrets(kms_key_arn, secrets_s3_path)
Ejemplo n.º 11
0
 def setUp(self):
     """Save a TEMPLATE datastore and an ACTIVE workflow attached to it via a local Dart client."""
     self.dart = Dart(host='localhost', port=5000)

     datastore_data = DatastoreData(
         'test-datastore',
         'no_op_engine',
         args={'action_sleep_time_in_seconds': 0},
         state=DatastoreState.TEMPLATE,
     )
     unsaved_datastore = Datastore(data=datastore_data)
     self.datastore = self.dart.save_datastore(unsaved_datastore)

     workflow_data = WorkflowData('test-workflow', self.datastore.id, state=WorkflowState.ACTIVE)
     unsaved_workflow = Workflow(data=workflow_data)
     self.workflow = self.dart.save_workflow(unsaved_workflow, self.datastore.id)
Ejemplo n.º 12
0
class TestWorkflowCrud(unittest.TestCase):
    """Create/read/update/delete round trip for workflows through a Dart client on localhost:5000."""

    def setUp(self):
        """Save the ACTIVE datastore that the workflow under test is attached to."""
        self.dart = Dart(host='localhost', port=5000)
        datastore_data = DatastoreData(
            'test-datastore',
            'no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.ACTIVE,
        )
        unsaved_datastore = Datastore(data=datastore_data)
        self.datastore = self.dart.save_datastore(unsaved_datastore)

    def tearDown(self):
        """Delete the datastore saved by ``setUp``."""
        self.dart.delete_datastore(self.datastore.id)

    def test_crud(self):
        """Save, fetch, update and delete a workflow, checking the returned data at each step."""
        # create: the saved workflow's data must match what was posted
        new_workflow = Workflow(data=WorkflowData('test-workflow', self.datastore.id, engine_name='no_op_engine'))
        created = self.dart.save_workflow(new_workflow, self.datastore.id)
        self.assertEqual(created.data.to_dict(), new_workflow.data.to_dict())

        # read: fetching by id returns the same workflow
        fetched = self.dart.get_workflow(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # update
        fetched.data.concurrency = 2
        fetched.data.state = WorkflowState.ACTIVE
        updated = self.dart.save_workflow(fetched)
        # not all properties can be modified
        self.assertEqual(updated.data.concurrency, 1)
        self.assertEqual(updated.data.state, WorkflowState.ACTIVE)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # delete: a later fetch must be answered with a 404
        self.dart.delete_workflow(fetched.id)
        try:
            self.dart.get_workflow(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('workflow should have been missing after delete!')
Ejemplo n.º 13
0
class EmrEngine(object):
    """Engine that dispatches a checked-out Dart action to the matching EMR action handler."""

    def __init__(self, ec2_keyname, instance_profile, service_role, region, core_node_limit,
                 impala_docker_repo_base_url, impala_version, cluster_tags, cluster_availability_zone,
                 dart_host, dart_port, dart_api_version=1):
        """Store the cluster settings, register the action handlers and create the Dart client.

        ``_region`` is only built when ``region`` is truthy; the EMR connection itself is
        created on first access of the ``conn`` property.
        """

        # handlers are keyed by action type name and called as handler(engine, datastore, action)
        self._action_handlers = {
            EmrActionTypes.start_datastore.name: start_datastore,
            EmrActionTypes.terminate_datastore.name: terminate_datastore,
            EmrActionTypes.load_dataset.name: load_dataset,
            EmrActionTypes.consume_subscription.name: consume_subscription,
            EmrActionTypes.run_hive_script_action.name: run_hive_script,
            EmrActionTypes.run_impala_script_action.name: run_impala_script,
            EmrActionTypes.run_pyspark_script_action.name: run_pyspark_script,
            EmrActionTypes.copy_hdfs_to_s3_action.name: copy_hdfs_to_s3,
        }
        self._region = RegionInfo(self, region, 'elasticmapreduce.%s.amazonaws.com' % region) if region else None
        self._conn = None
        self.ec2_keyname = ec2_keyname
        self.core_node_limit = core_node_limit
        self.instance_profile = instance_profile
        self.service_role = service_role
        self.cluster_tags = cluster_tags
        self.cluster_availability_zone = cluster_availability_zone
        self.impala_docker_repo_base_url = impala_docker_repo_base_url
        self.impala_version = impala_version
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    @property
    def conn(self):
        """Return the ``EmrConnection``, creating and caching it on first access."""
        if self._conn is None:
            self._conn = EmrConnection(region=self._region)
        return self._conn

    @staticmethod
    def _format_error(e):
        """Return the error text of ``e`` followed by the current traceback.

        Must be called from inside an ``except`` block so the traceback is available.
        """
        # `e.message` is deprecated (PEP 352), missing on Python 3 and not always a string;
        # fall back to the exception itself so building the message cannot fail
        error_text = getattr(e, 'message', None) or e
        return '%s\n\n\n%s' % (error_text, traceback.format_exc())

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, run its handler and check the result in.

        ``ActionFailedButConsumeSuccessfulException`` yields a FAILURE result that also carries
        a SUCCESS consume-subscription state; any other exception yields a plain FAILURE.
        The check-in happens in ``finally`` so it is attempted on success and failure alike.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        consume_subscription_state = None
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            # explicit check rather than `assert`, which is stripped when running with -O
            if action_type_name not in self._action_handlers:
                raise ValueError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except ActionFailedButConsumeSuccessfulException as e:
            state = ActionResultState.FAILURE
            consume_subscription_state = ConsumeSubscriptionResultState.SUCCESS
            error_message = self._format_error(e)

        except Exception as e:
            state = ActionResultState.FAILURE
            error_message = self._format_error(e)

        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message, consume_subscription_state))
Ejemplo n.º 14
0
 def setUp(self):
     """Save an ACTIVE ``no_op_engine`` datastore through a local Dart client."""
     self.dart = Dart(host='localhost', port=5000)

     datastore_data = DatastoreData(
         name='test-datastore',
         engine_name='no_op_engine',
         args={'action_sleep_time_in_seconds': 0},
         state=DatastoreState.ACTIVE,
     )
     unsaved_datastore = Datastore(data=datastore_data)
     self.datastore = self.dart.save_datastore(unsaved_datastore)
Ejemplo n.º 15
0
 def __init__(self, region, dart_host='localhost', dart_port=5000, dart_api_version=1):
     """Initialise the base class, keep the region and create the Dart client."""
     super(NoOpEngine, self).__init__()
     self.region = region
     dart_client = Dart(dart_host, dart_port, dart_api_version)
     self.dart = dart_client
Ejemplo n.º 16
0
class EmrEngine(object):
    """Engine that dispatches a checked-out Dart action to the matching EMR action handler."""

    def __init__(self, ec2_keyname, instance_profile, service_role, subnet_id, region, core_node_limit,
                 impala_docker_repo_base_url, impala_version, cluster_tags, dart_host, dart_port, dart_api_version=1):
        """Store the cluster settings, register the action handlers and create the Dart client.

        ``_region`` is only built when ``region`` is truthy; the EMR connection itself is
        created on first access of the ``conn`` property.
        """

        # handlers are keyed by action type name and called as handler(engine, datastore, action)
        self._action_handlers = {
            EmrActionTypes.start_datastore.name: start_datastore,
            EmrActionTypes.terminate_datastore.name: terminate_datastore,
            EmrActionTypes.load_dataset.name: load_dataset,
            EmrActionTypes.consume_subscription.name: consume_subscription,
            EmrActionTypes.run_hive_script_action.name: run_hive_script,
            EmrActionTypes.run_impala_script_action.name: run_impala_script,
            EmrActionTypes.run_pyspark_script_action.name: run_pyspark_script,
            EmrActionTypes.copy_hdfs_to_s3_action.name: copy_hdfs_to_s3,
        }
        self._region = RegionInfo(self, region, 'elasticmapreduce.%s.amazonaws.com' % region) if region else None
        self._conn = None
        self.ec2_keyname = ec2_keyname
        self.core_node_limit = core_node_limit
        self.instance_profile = instance_profile
        self.service_role = service_role
        self.subnet_id = subnet_id
        self.cluster_tags = cluster_tags
        self.impala_docker_repo_base_url = impala_docker_repo_base_url
        self.impala_version = impala_version
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    @property
    def conn(self):
        """Return the ``EmrConnection``, creating and caching it on first access."""
        if self._conn is None:
            self._conn = EmrConnection(region=self._region)
        return self._conn

    @staticmethod
    def _format_error(e):
        """Return the error text of ``e`` followed by the current traceback.

        Must be called from inside an ``except`` block so the traceback is available.
        """
        # `e.message` is deprecated (PEP 352), missing on Python 3 and not always a string;
        # fall back to the exception itself so building the message cannot fail
        error_text = getattr(e, 'message', None) or e
        return '%s\n\n\n%s' % (error_text, traceback.format_exc())

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, run its handler and check the result in.

        ``ActionFailedButConsumeSuccessfulException`` yields a FAILURE result that also carries
        a SUCCESS consume-subscription state; any other exception yields a plain FAILURE.
        The check-in happens in ``finally`` so it is attempted on success and failure alike.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        consume_subscription_state = None
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            # explicit check rather than `assert`, which is stripped when running with -O
            if action_type_name not in self._action_handlers:
                raise ValueError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except ActionFailedButConsumeSuccessfulException as e:
            state = ActionResultState.FAILURE
            consume_subscription_state = ConsumeSubscriptionResultState.SUCCESS
            error_message = self._format_error(e)

        except Exception as e:
            state = ActionResultState.FAILURE
            error_message = self._format_error(e)

        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message, consume_subscription_state))
Ejemplo n.º 17
0
class DynamoDBEngine(object):
    """Engine that dispatches a checked-out Dart action to the matching DynamoDB action handler."""

    def __init__(self,
                 emr_ec2_keyname,
                 emr_instance_profile,
                 emr_service_role,
                 emr_region,
                 emr_core_node_limit,
                 emr_impala_docker_repo_base_url,
                 emr_impala_version,
                 emr_cluster_tags,
                 emr_cluster_availability_zone,
                 dart_host,
                 dart_port,
                 dart_api_version=1,
                 emr_release_label='emr-4.2.0',
                 emr_instance_type='m3.2xlarge'):
        """Store the EMR release/instance settings, register handlers and build the EMR engine and Dart client.

        The ``emr_*`` cluster arguments are forwarded positionally to ``EmrEngine``.
        """

        self.emr_release_label = emr_release_label
        self.emr_instance_type = emr_instance_type
        # handlers are keyed by action type name and called as handler(engine, datastore, action)
        self._action_handlers = {
            DynamoDBActionTypes.create_table.name: create_table,
            DynamoDBActionTypes.delete_table.name: delete_table,
            DynamoDBActionTypes.load_dataset.name: load_dataset,
        }
        self.emr_engine = EmrEngine(emr_ec2_keyname, emr_instance_profile,
                                    emr_service_role, emr_region,
                                    emr_core_node_limit,
                                    emr_impala_docker_repo_base_url,
                                    emr_impala_version, emr_cluster_tags,
                                    emr_cluster_availability_zone, dart_host,
                                    dart_port, dart_api_version)
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, run its handler and check the result in.

        Any exception raised while dispatching or handling the action is caught and reported
        as a FAILURE result whose message is the error text followed by the traceback.
        The check-in happens in ``finally`` so it is attempted on success and failure alike.
        """
        action_context = self.dart.engine_action_checkout(
            os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        # never changed in this engine; passed through to ActionResult as None
        consume_subscription_state = None
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            # explicit check rather than `assert`, which is stripped when running with -O
            if action_type_name not in self._action_handlers:
                raise ValueError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except Exception as e:
            state = ActionResultState.FAILURE
            # `e.message` is deprecated (PEP 352), missing on Python 3 and not always a string;
            # fall back to the exception itself so building the message cannot fail here
            error_text = getattr(e, 'message', None) or e
            error_message = '%s\n\n\n%s' % (error_text, traceback.format_exc())

        finally:
            self.dart.engine_action_checkin(
                action.id,
                ActionResult(state, error_message, consume_subscription_state))
Ejemplo n.º 18
0
def add_s3_engine(config):
    """Save the ``s3_engine`` definition to Dart using the settings in ``config``.

    If an engine named ``s3_engine`` is already returned by ``get_engines`` its id is
    reused when saving. No ECS task definition is built when
    ``config['dart']['use_local_engines']`` is truthy.
    """
    engine_config = config['engines']['s3_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving s3 engine')

    # look for an existing s3_engine; if several match, the last one wins
    engine_id = None
    for existing_engine in dart.get_engines():
        if existing_engine.data.name == 's3_engine':
            engine_id = existing_engine.id

    if config['dart']['use_local_engines']:
        ecs_task_definition = None
    else:
        family = 'dart-%s-s3_engine' % config['dart']['env_name']
        container_definition = {
            'name': 'dart-s3_engine',
            'cpu': 64,
            'memory': 256,
            'image': engine_config['docker_image'],
            'logConfiguration': {'logDriver': 'syslog'},
            'environment': [
                {'name': 'DART_ROLE', 'value': 'worker:engine_s3'},
                {'name': 'DART_CONFIG', 'value': engine_config['config']},
                {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']}
            ],
            'mountPoints': [
                {
                    'containerPath': '/mnt/ecs_agent_data',
                    'sourceVolume': 'ecs-agent-data',
                    'readOnly': True
                }
            ],
        }
        agent_data_volume = {
            'host': {'sourcePath': '/var/lib/ecs/data'},
            'name': 'ecs-agent-data'
        }
        ecs_task_definition = {
            'family': family,
            'containerDefinitions': [container_definition],
            'volumes': [agent_data_volume],
        }

    engine_data = EngineData(
        name='s3_engine',
        description='For S3 operations',
        options_json_schema={},
        supported_action_types=[
            S3ActionTypes.copy,
            S3ActionTypes.data_check,
        ],
        ecs_task_definition=ecs_task_definition
    )
    saved_engine = dart.save_engine(Engine(id=engine_id, data=engine_data))
    _logger.info('Saved s3_engine: %s' % saved_engine.id)
Ejemplo n.º 19
0
def add_s3_engine(config):
    """Save the copy-only ``s3_engine`` definition to Dart using the settings in ``config``.

    If an engine named ``s3_engine`` is already returned by ``get_engines`` its id is
    reused when saving. No ECS task definition is built when
    ``config['dart']['use_local_engines']`` is truthy.
    """
    engine_config = config['engines']['s3_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving s3 engine')

    # look for an existing s3_engine; if several match, the last one wins
    engine_id = None
    for existing_engine in dart.get_engines():
        if existing_engine.data.name == 's3_engine':
            engine_id = existing_engine.id

    if config['dart']['use_local_engines']:
        ecs_task_definition = None
    else:
        family = 'dart-%s-s3_engine' % config['dart']['env_name']
        container_definition = {
            'name': 'dart-s3_engine',
            'cpu': 64,
            'memory': 256,
            'image': engine_config['docker_image'],
            'logConfiguration': {'logDriver': 'syslog'},
            'environment': [
                {'name': 'DART_ROLE', 'value': 'worker:engine_s3'},
                {'name': 'DART_CONFIG', 'value': engine_config['config']},
                {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']}
            ],
            'mountPoints': [
                {
                    'containerPath': '/mnt/ecs_agent_data',
                    'sourceVolume': 'ecs-agent-data',
                    'readOnly': True
                }
            ],
        }
        agent_data_volume = {
            'host': {'sourcePath': '/var/lib/ecs/data'},
            'name': 'ecs-agent-data'
        }
        ecs_task_definition = {
            'family': family,
            'containerDefinitions': [container_definition],
            'volumes': [agent_data_volume],
        }

    engine_data = EngineData(
        name='s3_engine',
        description='For S3 FileCopy',
        options_json_schema={},
        supported_action_types=[
            S3ActionTypes.copy
        ],
        ecs_task_definition=ecs_task_definition
    )
    saved_engine = dart.save_engine(engine=Engine(id=engine_id, data=engine_data))
    _logger.info('Saved s3_engine: %s' % saved_engine.id)
Ejemplo n.º 20
0
class RedshiftEngine(ActionRunner):
    """Engine that dispatches a checked-out Dart action to the matching Redshift action handler."""

    def __init__(self, kms_key_arn, secrets_s3_path, vpc_subnet, security_group_ids,
                 region, availability_zones, publicly_accessible, cluster_tags,
                 dart_host, dart_port, dart_api_version=1):
        """Create the Dart client, register the action handlers, store cluster settings and build ``Secrets``."""
        super(RedshiftEngine, self).__init__()

        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # handlers are keyed by action type name and called as handler(engine, datastore, action)
        self._action_handlers = {
            RedshiftActionTypes.start_datastore.name: start_datastore,
            RedshiftActionTypes.stop_datastore.name: stop_datastore,
            RedshiftActionTypes.execute_sql.name: execute_sql,
            RedshiftActionTypes.load_dataset.name: load_dataset,
            RedshiftActionTypes.consume_subscription.name: consume_subscription,
            RedshiftActionTypes.copy_to_s3.name: copy_to_s3,
            RedshiftActionTypes.create_snapshot.name: create_snapshot,
            RedshiftActionTypes.data_check.name: data_check,
            RedshiftActionTypes.cluster_maintenance.name: cluster_maintenance,
        }
        self.vpc_subnet = vpc_subnet
        self.availability_zones = availability_zones
        self.publicly_accessible = publicly_accessible
        self.security_group_ids = security_group_ids
        self.cluster_tags = cluster_tags
        self.region = region
        self.secrets = Secrets(kms_key_arn, secrets_s3_path)

    def random_availability_zone(self):
        """Return one of the configured availability zones, chosen at random."""
        return random.choice(self.availability_zones)

    def run(self):
        """Check out the action named by ``DART_ACTION_ID``, run its handler and report the result.

        Any exception raised while dispatching or handling the action is caught and reported
        as a FAILURE result whose message is the error text followed by the traceback.
        In ``finally`` the result is checked in and then passed to ``publish_sns_message``
        (not defined in this class; presumably inherited from ``ActionRunner``).
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info("**** RedshiftEngine.run_action: %s", action_type_name)
            # explicit check rather than `assert`, which is stripped when running with -O
            if action_type_name not in self._action_handlers:
                raise ValueError('unsupported action: %s' % action_type_name)
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except Exception as e:
            state = ActionResultState.FAILURE
            # `e.message` is deprecated (PEP 352) and missing on Python 3, and str() on it can
            # fail for non-ASCII unicode on Python 2; %-formatting the message (or the exception
            # itself as a fallback) keeps this handler from raising
            error_text = getattr(e, 'message', None) or e
            error_message = '%s\r\r\r%s' % (error_text, traceback.format_exc())

        finally:
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message))
            self.publish_sns_message(action, error_message, state)
Ejemplo n.º 21
0
 def setUp(self):
     """Save a TEMPLATE datastore and a workflow attached to it, and raise ``maxDiff`` for long diffs."""
     self.dart = Dart(host='localhost', port=5000)

     datastore_data = DatastoreData(
         name='test-datastore',
         engine_name='no_op_engine',
         args={'action_sleep_time_in_seconds': 0},
         state=DatastoreState.TEMPLATE,
     )
     unsaved_datastore = Datastore(data=datastore_data)
     self.datastore = self.dart.save_datastore(unsaved_datastore)

     workflow_data = WorkflowData(name='test-workflow', datastore_id=self.datastore.id)
     unsaved_workflow = Workflow(data=workflow_data)
     self.workflow = self.dart.save_workflow(workflow=unsaved_workflow, datastore_id=self.datastore.id)

     self.maxDiff = 99999
Ejemplo n.º 22
0
class ElasticsearchEngine(ActionRunner):
    """Engine that dispatches Elasticsearch actions to their handler functions."""

    def __init__(self,
                 kms_key_arn,
                 secrets_s3_path,
                 dart_host,
                 dart_port,
                 dart_api_version=1,
                 **kwargs):
        """Create the Dart client, the action-handler table, and the secrets helper.

        Args:
            kms_key_arn: passed to ``Secrets`` together with ``secrets_s3_path``.
            secrets_s3_path: passed to ``Secrets``.
            dart_host: host given to the ``Dart`` client.
            dart_port: port given to the ``Dart`` client.
            dart_api_version: API version given to the ``Dart`` client.
            **kwargs: accepted and ignored.
        """
        super(ElasticsearchEngine, self).__init__()

        self.dart = Dart(dart_host, dart_port, dart_api_version)
        # Maps an action type name to the function that implements it.
        self._action_handlers = {
            ElasticsearchActionTypes.data_check.name: data_check,
            ElasticsearchActionTypes.create_index.name: create_index,
            ElasticsearchActionTypes.create_template.name: create_template,
            ElasticsearchActionTypes.create_mapping.name: create_mapping,
            ElasticsearchActionTypes.delete_index.name: delete_index,
            ElasticsearchActionTypes.delete_template.name: delete_template,
            ElasticsearchActionTypes.force_merge_index.name: force_merge_index,
        }
        self.secrets = Secrets(kms_key_arn, secrets_s3_path)

    def run(self):
        """Check out the pending action, dispatch it to its handler, and report the result.

        The action id is read from the ``DART_ACTION_ID`` environment variable.
        Any ``Exception`` raised while dispatching is captured instead of
        propagated: the result state becomes FAILURE and the error message holds
        the exception text followed by the formatted traceback. The action is
        always checked back in, after which ``self.publish_sns_message`` is called.
        """
        action_context = self.dart.engine_action_checkout(
            os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            _logger.info('**** ElasticsearchEngine.run_action: %s',
                         action_type_name)
            assert action_type_name in self._action_handlers, 'unsupported action: %s' % action_type_name
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except Exception as e:
            state = ActionResultState.FAILURE
            # ``e.message`` is a deprecated Python 2 attribute that is empty for
            # multi-argument exceptions and absent on Python 3; prefer it when it
            # carries text, otherwise fall back to str(e).
            error_message = '{m}\r\r\r{t}'.format(
                m=str(getattr(e, 'message', None) or e),
                t=traceback.format_exc(),
            )

        finally:
            # Runs on success and failure alike so the action never stays checked out.
            self.dart.engine_action_checkin(action.id,
                                            ActionResult(state, error_message))
            self.publish_sns_message(action, error_message, state)
Ejemplo n.º 23
0
 def setUp(self):
     """Create a Dart client, a template no-op datastore, and an active workflow on it."""
     self.dart = Dart(host='localhost', port=5000)

     datastore_data = DatastoreData(
         'test-datastore',
         'no_op_engine',
         args={'action_sleep_time_in_seconds': 0},
         state=DatastoreState.TEMPLATE,
     )
     self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

     workflow_data = WorkflowData('test-workflow', self.datastore.id, state=WorkflowState.ACTIVE)
     self.workflow = self.dart.save_workflow(Workflow(data=workflow_data), self.datastore.id)
Ejemplo n.º 24
0
    def __init__(self, ec2_keyname, instance_profile, service_role, region, core_node_limit,
                 impala_docker_repo_base_url, impala_version, cluster_tags, cluster_availability_zone,
                 dart_host, dart_port, dart_api_version=1):
        """Register the EMR action handlers, store the cluster settings, and create the Dart client.

        ``self._region`` becomes a ``RegionInfo`` for the EMR endpoint of ``region``,
        or ``None`` when ``region`` is falsy. ``self._conn`` starts out as ``None``.
        """
        # Maps an action type name to the function that implements it.
        handlers = {}
        handlers[EmrActionTypes.start_datastore.name] = start_datastore
        handlers[EmrActionTypes.terminate_datastore.name] = terminate_datastore
        handlers[EmrActionTypes.load_dataset.name] = load_dataset
        handlers[EmrActionTypes.consume_subscription.name] = consume_subscription
        handlers[EmrActionTypes.run_hive_script_action.name] = run_hive_script
        handlers[EmrActionTypes.run_impala_script_action.name] = run_impala_script
        handlers[EmrActionTypes.run_pyspark_script_action.name] = run_pyspark_script
        handlers[EmrActionTypes.copy_hdfs_to_s3_action.name] = copy_hdfs_to_s3
        self._action_handlers = handlers

        if region:
            emr_endpoint = 'elasticmapreduce.%s.amazonaws.com' % region
            self._region = RegionInfo(self, region, emr_endpoint)
        else:
            self._region = None
        self._conn = None

        # Cluster settings, kept exactly as passed in.
        self.ec2_keyname = ec2_keyname
        self.core_node_limit = core_node_limit
        self.instance_profile = instance_profile
        self.service_role = service_role
        self.cluster_tags = cluster_tags
        self.cluster_availability_zone = cluster_availability_zone
        self.impala_docker_repo_base_url = impala_docker_repo_base_url
        self.impala_version = impala_version

        self.dart = Dart(dart_host, dart_port, dart_api_version)
Ejemplo n.º 25
0
 def setUp(self):
     """Create a Dart client and save an active no-op datastore for the tests."""
     self.dart = Dart(host='localhost', port=5000)

     datastore_data = DatastoreData(
         name='test-datastore',
         engine_name='no_op_engine',
         args={'action_sleep_time_in_seconds': 0},
         state=DatastoreState.ACTIVE,
     )
     self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))
Ejemplo n.º 26
0
 def setUp(self):
     """Create a Dart client, a template no-op datastore, and a workflow bound to it."""
     self.dart = Dart(host='localhost', port=5000)

     datastore_data = DatastoreData(
         name='test-datastore',
         engine_name='no_op_engine',
         args={'action_sleep_time_in_seconds': 0},
         state=DatastoreState.TEMPLATE,
     )
     self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

     workflow_data = WorkflowData(name='test-workflow', datastore_id=self.datastore.id)
     self.workflow = self.dart.save_workflow(
         workflow=Workflow(data=workflow_data),
         datastore_id=self.datastore.id,
     )

     # NOTE(review): presumably unittest's maxDiff, raised so long diffs are shown in full.
     self.maxDiff = 99999
Ejemplo n.º 27
0
class DynamoDBEngine(ActionRunner):
    """Engine that dispatches DynamoDB actions and owns an ``EmrEngine`` built from the emr_* settings."""

    def __init__(self, emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region, emr_core_node_limit,
                 emr_impala_docker_repo_base_url, emr_impala_version, emr_cluster_tags, emr_cluster_availability_zone,
                 dart_host, dart_port, dart_api_version=1, emr_release_label='emr-4.2.0',
                 emr_instance_type='m3.2xlarge'):
        """Store the EMR release/instance settings and build the handler table, EMR engine, and Dart client.

        The ``emr_*`` positional arguments and the ``dart_*`` arguments are forwarded
        unchanged to ``EmrEngine``; ``emr_release_label`` and ``emr_instance_type``
        are only stored on this instance.
        """
        super(DynamoDBEngine, self).__init__()

        self.emr_release_label = emr_release_label
        self.emr_instance_type = emr_instance_type
        # Maps an action type name to the function that implements it.
        self._action_handlers = {
            DynamoDBActionTypes.create_table.name: create_table,
            DynamoDBActionTypes.delete_table.name: delete_table,
            DynamoDBActionTypes.load_dataset.name: load_dataset,
        }
        self.emr_engine = EmrEngine(
            emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region, emr_core_node_limit,
            emr_impala_docker_repo_base_url, emr_impala_version, emr_cluster_tags, emr_cluster_availability_zone,
            dart_host, dart_port, dart_api_version
        )
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    def run(self):
        """Check out the pending action, dispatch it to its handler, and report the result.

        The action id is read from the ``DART_ACTION_ID`` environment variable.
        Any ``Exception`` raised while dispatching is captured instead of
        propagated: the result state becomes FAILURE and the error message holds
        the exception text followed by the formatted traceback. The action is
        always checked back in, after which ``self.publish_sns_message`` is called.
        """
        action_context = self.dart.engine_action_checkout(os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        # Never reassigned in this method, so it is always reported as None.
        consume_subscription_state = None
        error_message = None
        try:
            action_type_name = action.data.action_type_name
            assert action_type_name in self._action_handlers, 'unsupported action: %s' % action_type_name
            handler = self._action_handlers[action_type_name]
            handler(self, datastore, action)

        except Exception as e:
            state = ActionResultState.FAILURE
            # ``e.message`` is a deprecated Python 2 attribute that is empty for
            # multi-argument exceptions and absent on Python 3; prefer it when it
            # carries text, otherwise fall back to str(e).
            error_message = '{m}\r\r\r{t}'.format(
                m=str(getattr(e, 'message', None) or e),
                t=traceback.format_exc(),
            )

        finally:
            # Runs on success and failure alike so the action never stays checked out.
            self.dart.engine_action_checkin(action.id, ActionResult(state, error_message, consume_subscription_state))
            self.publish_sns_message(action, error_message, state)
Ejemplo n.º 28
0
class TestDatasetCrud(unittest.TestCase):
    """Exercise create, read, update, and delete of a dataset through the Dart client."""

    def setUp(self):
        """Connect to the Dart API on localhost:5000."""
        self.dart = Dart(host='localhost', port=5000)

    def test_crud(self):
        """Save a dataset, read it back, change its compression, delete it, and expect a 404."""
        columns = [Column('c1', DataType.VARCHAR, 50), Column('c2', DataType.BIGINT)]
        df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        dataset_data = DatasetData(
            name=NoOpActionTypes.action_that_succeeds.name,
            table_name=NoOpActionTypes.action_that_succeeds.name,
            load_type=LoadType.INSERT,
            location='s3://bucket/prefix',
            data_format=df,
            columns=columns,
            tags=['foo'],
        )
        ds = Dataset(data=dataset_data)
        ds.data.user_id = '*****@*****.**'

        # Create: the saved data must match what was sent.
        created = self.dart.save_dataset(ds)
        self.assertEqual(created.data.to_dict(), ds.data.to_dict())

        # Read: fetching by id returns the same entity.
        fetched = self.dart.get_dataset(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # Update: the compression change is stored and the entity differs from the original.
        fetched.data.compression = Compression.GZIP
        updated = self.dart.save_dataset(fetched)
        self.assertEqual(updated.data.compression, Compression.GZIP)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # Delete: a later lookup must fail with HTTP 404.
        self.dart.delete_dataset(fetched.id)
        try:
            self.dart.get_dataset(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('dataset should have been missing after delete!')
Ejemplo n.º 29
0
    def __init__(self,
                 kms_key_arn,
                 secrets_s3_path,
                 dart_host,
                 dart_port,
                 dart_api_version=1):
        """Create the Dart client, the Elasticsearch action-handler table, and the secrets helper."""
        self.dart = Dart(dart_host, dart_port, dart_api_version)

        # (action type, handler) pairs; the table is keyed by each type's ``name``.
        handler_pairs = (
            (ElasticsearchActionTypes.data_check, data_check),
            (ElasticsearchActionTypes.create_index, create_index),
            (ElasticsearchActionTypes.create_template, create_template),
            (ElasticsearchActionTypes.create_mapping, create_mapping),
            (ElasticsearchActionTypes.delete_index, delete_index),
            (ElasticsearchActionTypes.delete_template, delete_template),
            (ElasticsearchActionTypes.force_merge_index, force_merge_index),
        )
        self._action_handlers = dict(
            (action_type.name, handler) for action_type, handler in handler_pairs)

        self.secrets = Secrets(kms_key_arn, secrets_s3_path)
Ejemplo n.º 30
0
class NoOpEngine(ActionRunner):
    """Engine that only sleeps, optionally fails, or consumes a subscription."""

    def __init__(self,
                 region,
                 dart_host='localhost',
                 dart_port=5000,
                 dart_api_version=1):
        """Store ``region`` and create the Dart client from the ``dart_*`` arguments."""
        super(NoOpEngine, self).__init__()
        self.region = region
        self.dart = Dart(dart_host, dart_port, dart_api_version)

    def run(self):
        """Check out the pending action, simulate it, and report the result.

        The action id is read from the ``DART_ACTION_ID`` environment variable.
        The method sleeps for the datastore's ``action_sleep_time_in_seconds``
        argument, then:

        * reports FAILURE when the action type is ``action_that_fails``;
        * fetches the subscription elements and logs their count when the
          action type is ``consume_subscription``.

        Any ``Exception`` raised along the way is captured instead of propagated
        and reported as FAILURE with the exception text and traceback. The
        action is always checked back in, after which
        ``self.publish_sns_message`` is called.
        """
        action_context = self.dart.engine_action_checkout(
            os.environ.get('DART_ACTION_ID'))
        action = action_context.action
        datastore = action_context.datastore

        state = ActionResultState.SUCCESS
        error_message = None
        try:
            sleep_seconds = datastore.data.args['action_sleep_time_in_seconds']
            # Logger arguments are passed separately so formatting is deferred to the logger.
            _logger.info('sleeping for %s seconds...', sleep_seconds)
            time.sleep(sleep_seconds)

            if action.data.action_type_name == NoOpActionTypes.action_that_fails.name:
                state = ActionResultState.FAILURE
                error_message = '%s failed as expected' % NoOpActionTypes.action_that_fails.name

            if action.data.action_type_name == NoOpActionTypes.consume_subscription.name:
                subscription_elements = self.dart.get_subscription_elements(
                    action.id)
                # list() materializes the elements so they can be counted.
                _logger.info('consuming subscription, size = %s',
                             len(list(subscription_elements)))

        except Exception as e:
            state = ActionResultState.FAILURE
            # ``e.message`` is a deprecated Python 2 attribute that is empty for
            # multi-argument exceptions and absent on Python 3; prefer it when it
            # carries text, otherwise fall back to str(e).
            error_message = '{m}\r\r\r{t}'.format(
                m=str(getattr(e, 'message', None) or e),
                t=traceback.format_exc(),
            )

        finally:
            # Runs on success and failure alike so the action never stays checked out.
            self.dart.engine_action_checkin(action.id,
                                            ActionResult(state, error_message))
            self.publish_sns_message(action, error_message, state)
    def setUp(self):
        """Build two datastore/workflow pairs with two actions each, plus chained triggers.

        A ``workflow_completion`` trigger is configured with workflow0's id, and a
        ``super`` trigger that lists workflow1 is configured with that trigger's id.
        """
        dart = Dart(host='localhost', port=5000)
        """ :type dart: dart.client.python.dart_client.Dart """
        self.dart = dart

        # Shared by both datastores, as a single dict object.
        dst_args = {'action_sleep_time_in_seconds': 0}

        def save_template_datastore(name):
            data = DatastoreData(name, 'no_op_engine', args=dst_args, state=DatastoreState.TEMPLATE)
            return self.dart.save_datastore(Datastore(data=data))

        def save_active_workflow(name, datastore):
            data = WorkflowData(name, datastore.id, state=WorkflowState.ACTIVE)
            return self.dart.save_workflow(Workflow(data=data), datastore.id)

        def succeeding_template_action():
            data = ActionData(NoOpActionTypes.action_that_succeeds.name,
                              NoOpActionTypes.action_that_succeeds.name,
                              state=ActionState.TEMPLATE)
            return Action(data=data)

        self.datastore0 = save_template_datastore('test-datastore0')
        self.datastore1 = save_template_datastore('test-datastore1')

        self.workflow0 = save_active_workflow('test-workflow0', self.datastore0)
        self.workflow1 = save_active_workflow('test-workflow1', self.datastore1)

        self.action00, self.action01 = self.dart.save_actions(
            [succeeding_template_action(), succeeding_template_action()],
            workflow_id=self.workflow0.id)
        self.action10, self.action11 = self.dart.save_actions(
            [succeeding_template_action(), succeeding_template_action()],
            workflow_id=self.workflow1.id)

        completion_args = {'completed_workflow_id': self.workflow0.id}
        completion_data = TriggerData('test-trigger', 'workflow_completion',
                                      None, completion_args, TriggerState.ACTIVE)
        self.trigger = self.dart.save_trigger(Trigger(data=completion_data))

        super_args = {
            'fire_after': 'ALL',
            'completed_trigger_ids': [self.trigger.id]
        }
        super_data = TriggerData('test-super-trigger', 'super',
                                 [self.workflow1.id], super_args,
                                 TriggerState.ACTIVE)
        self.super_trigger = self.dart.save_trigger(Trigger(data=super_data))
Ejemplo n.º 32
0
    def setUp(self):
        """Create a Dart client and save an active no-op datastore for the tests."""
        dart = Dart(host='localhost', port=5000)
        """ :type dart: dart.client.python.dart_client.Dart """
        self.dart = dart

        datastore_data = DatastoreData(
            'test-datastore',
            'no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.ACTIVE,
        )
        self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))
Ejemplo n.º 33
0
Archivo: es.py Proyecto: ophiradi/dart
    def __init__(self, kms_key_arn, secrets_s3_path, dart_host, dart_port, dart_api_version=1):
        """Create the Dart client, the Elasticsearch action-handler table, and the secrets helper."""
        self.dart = Dart(dart_host, dart_port, dart_api_version)

        # Maps an action type name to the function that implements it.
        handlers = {}
        handlers[ElasticsearchActionTypes.data_check.name] = data_check
        handlers[ElasticsearchActionTypes.create_index.name] = create_index
        handlers[ElasticsearchActionTypes.create_template.name] = create_template
        handlers[ElasticsearchActionTypes.create_mapping.name] = create_mapping
        handlers[ElasticsearchActionTypes.delete_index.name] = delete_index
        handlers[ElasticsearchActionTypes.delete_template.name] = delete_template
        handlers[ElasticsearchActionTypes.force_merge_index.name] = force_merge_index
        self._action_handlers = handlers

        self.secrets = Secrets(kms_key_arn, secrets_s3_path)
Ejemplo n.º 34
0
class TestWorkflowCrud(unittest.TestCase):
    """Exercise create, read, update, and delete of a workflow through the Dart client."""

    def setUp(self):
        """Connect to Dart on localhost:5000 and save an active no-op datastore."""
        self.dart = Dart(host='localhost', port=5000)
        datastore_data = DatastoreData(
            name='test-datastore',
            engine_name='no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.ACTIVE,
        )
        self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

    def tearDown(self):
        """Delete the datastore created in ``setUp``."""
        self.dart.delete_datastore(self.datastore.id)

    def test_crud(self):
        """Save a workflow, read it back, update two fields, delete it, and expect a 404."""
        workflow_data = WorkflowData(
            name='test-workflow',
            datastore_id=self.datastore.id,
            engine_name='no_op_engine',
        )
        wf = Workflow(data=workflow_data)

        # Create: the saved data must match what was sent.
        created = self.dart.save_workflow(wf, self.datastore.id)
        self.assertEqual(created.data.to_dict(), wf.data.to_dict())

        # Read: fetching by id returns the same entity.
        fetched = self.dart.get_workflow(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # Update: both changed fields are stored and the entity differs from the original.
        fetched.data.concurrency = 2
        fetched.data.state = WorkflowState.ACTIVE
        updated = self.dart.save_workflow(fetched)
        self.assertEqual(updated.data.concurrency, 2)
        self.assertEqual(updated.data.state, WorkflowState.ACTIVE)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # Delete: a later lookup must fail with HTTP 404.
        self.dart.delete_workflow(fetched.id)
        try:
            self.dart.get_workflow(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('workflow should have been missing after delete!')
Ejemplo n.º 35
0
    def setUp(self):
        """Create a dataset, a subscription on it, a datastore, a workflow, and two actions.

        S3 locations are built from the ``DART_TEST_BUCKET`` environment variable,
        which must be set.
        """
        dart = Dart(host='localhost', port=5000)
        """ :type dart: dart.client.python.dart_client.Dart """
        self.dart = dart

        cs = [
            Column('c1', DataType.VARCHAR, 50),
            Column('c2', DataType.BIGINT)
        ]
        df = DataFormat(FileFormat.TEXTFILE, RowFormat.DELIMITED)
        dataset_location = 's3://' + os.environ['DART_TEST_BUCKET'] + '/impala'
        dataset_data = DatasetData(
            name='test-dataset',
            table_name='test_dataset_table',
            load_type=LoadType.INSERT,
            location=dataset_location,
            data_format=df,
            columns=cs,
            tags=[])
        self.dataset = self.dart.save_dataset(Dataset(data=dataset_data))

        # Start prefix, end prefix, and file-name regex for the subscription.
        start = 's3://' + os.environ['DART_TEST_BUCKET'] + '/impala/impala'
        end = 's3://' + os.environ['DART_TEST_BUCKET'] + '/impala/install'
        regex = '.*\\.rpm'
        subscription_data = SubscriptionData(
            'test-subscription', self.dataset.id, start, end, regex)
        self.subscription = self.dart.save_subscription(Subscription(data=subscription_data))

        datastore_data = DatastoreData(
            'test-datastore',
            'no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.TEMPLATE,
        )
        self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

        workflow_data = WorkflowData(
            'test-workflow', self.datastore.id, state=WorkflowState.ACTIVE)
        self.workflow = self.dart.save_workflow(Workflow(data=workflow_data), self.datastore.id)

        a_args = {'subscription_id': self.subscription.id}
        succeed_data = ActionData(NoOpActionTypes.action_that_succeeds.name,
                                  NoOpActionTypes.action_that_succeeds.name,
                                  state=ActionState.TEMPLATE)
        consume_data = ActionData(NoOpActionTypes.consume_subscription.name,
                                  NoOpActionTypes.consume_subscription.name,
                                  a_args,
                                  state=ActionState.TEMPLATE)
        templates = [Action(data=succeed_data), Action(data=consume_data)]
        self.action0, self.action1 = self.dart.save_actions(
            templates, workflow_id=self.workflow.id)
Ejemplo n.º 36
0
    def __init__(self, emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region, emr_core_node_limit,
                 emr_impala_docker_repo_base_url, emr_impala_version, emr_cluster_tags, emr_cluster_availability_zone,
                 dart_host, dart_port, dart_api_version=1, emr_release_label='emr-4.2.0',
                 emr_instance_type='m3.2xlarge'):
        """Store the EMR release/instance settings and build the handler table, EMR engine, and Dart client.

        The ``emr_*`` positional arguments and the ``dart_*`` arguments are forwarded
        unchanged to ``EmrEngine``.
        """
        self.emr_release_label = emr_release_label
        self.emr_instance_type = emr_instance_type

        # Maps an action type name to the function that implements it.
        self._action_handlers = dict([
            (DynamoDBActionTypes.create_table.name, create_table),
            (DynamoDBActionTypes.delete_table.name, delete_table),
            (DynamoDBActionTypes.load_dataset.name, load_dataset),
        ])

        emr_engine_args = (
            emr_ec2_keyname, emr_instance_profile, emr_service_role, emr_region, emr_core_node_limit,
            emr_impala_docker_repo_base_url, emr_impala_version, emr_cluster_tags, emr_cluster_availability_zone,
            dart_host, dart_port, dart_api_version,
        )
        self.emr_engine = EmrEngine(*emr_engine_args)

        self.dart = Dart(dart_host, dart_port, dart_api_version)
Ejemplo n.º 37
0
    def setUp(self):
        """Create an active datastore and workflow, then attach one ``action_that_fails`` template."""
        dart = Dart(host='localhost', port=5000)
        """ :type dart: dart.client.python.dart_client.Dart """
        self.dart = dart

        datastore_data = DatastoreData(
            'test-datastore',
            'no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.ACTIVE,
        )
        self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))

        workflow_data = WorkflowData(
            'test-workflow', self.datastore.id, state=WorkflowState.ACTIVE)
        self.workflow = self.dart.save_workflow(Workflow(data=workflow_data), self.datastore.id)

        failing_data = ActionData(NoOpActionTypes.action_that_fails.name,
                                  NoOpActionTypes.action_that_fails.name,
                                  state=ActionState.TEMPLATE)
        self.dart.save_actions([Action(data=failing_data)], workflow_id=self.workflow.id)
Ejemplo n.º 38
0
    def __init__(self, kms_key_arn, secrets_s3_path, vpc_subnet, security_group_ids,
                 region, availability_zones, publicly_accessible, cluster_tags,
                 dart_host, dart_port, dart_api_version=1):
        """Create the Dart client, the Redshift handler table, and store the cluster settings."""
        self.dart = Dart(dart_host, dart_port, dart_api_version)

        # (action type, handler) pairs; the table is keyed by each type's ``name``.
        handler_pairs = (
            (RedshiftActionTypes.start_datastore, start_datastore),
            (RedshiftActionTypes.stop_datastore, stop_datastore),
            (RedshiftActionTypes.execute_sql, execute_sql),
            (RedshiftActionTypes.load_dataset, load_dataset),
            (RedshiftActionTypes.consume_subscription, consume_subscription),
            (RedshiftActionTypes.copy_to_s3, copy_to_s3),
            (RedshiftActionTypes.create_snapshot, create_snapshot),
            (RedshiftActionTypes.data_check, data_check),
        )
        self._action_handlers = dict(
            (action_type.name, handler) for action_type, handler in handler_pairs)

        # Cluster settings, kept exactly as passed in.
        self.vpc_subnet = vpc_subnet
        self.availability_zones = availability_zones
        self.publicly_accessible = publicly_accessible
        self.security_group_ids = security_group_ids
        self.cluster_tags = cluster_tags
        self.region = region

        self.secrets = Secrets(kms_key_arn, secrets_s3_path)
Ejemplo n.º 39
0
class TestDatastoreCrud(unittest.TestCase):
    """Exercise create, read, update, and delete of a datastore through the Dart client."""

    def setUp(self):
        """Connect to the Dart API on localhost:5000."""
        self.dart = Dart(host='localhost', port=5000)

    def test_crud(self):
        """Save a datastore, read it back, update it, delete it, and expect a 404."""
        datastore_data = DatastoreData(
            name='test-datastore',
            engine_name='no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            tags=['foo']
        )
        dst = Datastore(data=datastore_data)
        created = self.dart.save_datastore(dst)

        # Mirror the fields that are populated at creation time before comparing.
        dst.data.s3_artifacts_path = created.data.s3_artifacts_path
        dst.data.s3_logs_path = created.data.s3_logs_path
        dst.data.user_id = created.data.user_id
        self.assertEqual(created.data.to_dict(), dst.data.to_dict())

        # Read: fetching by id returns the same entity.
        fetched = self.dart.get_datastore(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # Update: the engine name is expected to stay unchanged, the state to change.
        fetched.data.engine_name = 'not_existing_engine'
        fetched.data.state = DatastoreState.ACTIVE
        updated = self.dart.save_datastore(fetched)
        self.assertEqual(updated.data.engine_name, 'no_op_engine')
        self.assertEqual(updated.data.state, DatastoreState.ACTIVE)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # Delete: a later lookup must fail with HTTP 404.
        self.dart.delete_datastore(fetched.id)
        try:
            self.dart.get_datastore(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('datastore should have been missing after delete!')
Ejemplo n.º 40
0
class TestDatasetCrud(unittest.TestCase):
    """Exercise create, read, update, and delete of a dataset through the Dart client."""

    def setUp(self):
        """Connect to the Dart API on localhost:5000."""
        self.dart = Dart(host='localhost', port=5000)

    def test_crud(self):
        """Save a dataset, read it back, change its compression, delete it, and expect a 404."""
        columns = [
            Column('c1', DataType.VARCHAR, 50),
            Column('c2', DataType.BIGINT)
        ]
        df = DataFormat(FileFormat.PARQUET, RowFormat.NONE)
        dataset_data = DatasetData(
            name=NoOpActionTypes.action_that_succeeds.name,
            table_name=NoOpActionTypes.action_that_succeeds.name,
            load_type=LoadType.INSERT,
            location='s3://bucket/prefix',
            data_format=df,
            columns=columns,
            tags=['foo'])
        ds = Dataset(data=dataset_data)
        ds.data.user_id = '*****@*****.**'

        # Create: the saved data must match what was sent.
        created = self.dart.save_dataset(ds)
        self.assertEqual(created.data.to_dict(), ds.data.to_dict())

        # Read: fetching by id returns the same entity.
        fetched = self.dart.get_dataset(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # Update: the compression change is stored and the entity differs from the original.
        fetched.data.compression = Compression.GZIP
        updated = self.dart.save_dataset(fetched)
        self.assertEqual(updated.data.compression, Compression.GZIP)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # Delete: a later lookup must fail with HTTP 404.
        self.dart.delete_dataset(fetched.id)
        try:
            self.dart.get_dataset(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('dataset should have been missing after delete!')
Ejemplo n.º 41
0
class TestDatastoreCrud(unittest.TestCase):
    """Exercise create, read, update, and delete of a datastore through the Dart client."""

    def setUp(self):
        """Connect to the Dart API on localhost:5000."""
        self.dart = Dart(host='localhost', port=5000)

    def test_crud(self):
        """Save a datastore, read it back, update it, delete it, and expect a 404."""
        datastore_data = DatastoreData(name='test-datastore',
                                       engine_name='no_op_engine',
                                       args={'action_sleep_time_in_seconds': 0},
                                       tags=['foo'])
        dst = Datastore(data=datastore_data)
        created = self.dart.save_datastore(dst)

        # Mirror the fields that are populated at creation time before comparing.
        dst.data.s3_artifacts_path = created.data.s3_artifacts_path
        dst.data.s3_logs_path = created.data.s3_logs_path
        dst.data.user_id = created.data.user_id
        self.assertEqual(created.data.to_dict(), dst.data.to_dict())

        # Read: fetching by id returns the same entity.
        fetched = self.dart.get_datastore(created.id)
        self.assertEqual(created.to_dict(), fetched.to_dict())

        # Update: the engine name is expected to stay unchanged, the state to change.
        fetched.data.engine_name = 'not_existing_engine'
        fetched.data.state = DatastoreState.ACTIVE
        updated = self.dart.save_datastore(fetched)
        self.assertEqual(updated.data.engine_name, 'no_op_engine')
        self.assertEqual(updated.data.state, DatastoreState.ACTIVE)
        self.assertNotEqual(created.to_dict(), updated.to_dict())

        # Delete: a later lookup must fail with HTTP 404.
        self.dart.delete_datastore(fetched.id)
        try:
            self.dart.get_datastore(fetched.id)
        except DartRequestException as e:
            self.assertEqual(e.response.status_code, 404)
        else:
            self.fail('datastore should have been missing after delete!')
Ejemplo n.º 42
0
def add_elasticsearch_engine(config):
    """Create or update the ``elasticsearch_engine`` definition in Dart.

    Looks for an existing engine named ``elasticsearch_engine`` and saves the
    definition with that engine's id, or with ``id=None`` when none is found.

    Args:
        config: nested dict. Reads ``config['engines']['elasticsearch_engine']``
            (keys ``options``, ``docker_image``, ``config``) and ``config['dart']``
            (keys ``use_local_engines``, ``env_name``). ``options`` must provide
            ``dart_host``, ``dart_port`` and ``dart_api_version``; ``region`` is
            read only when ``use_local_engines`` is falsy.
    """
    engine_config = config['engines']['elasticsearch_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving elasticsearch_engine')

    # No early exit: if several engines share the name, the last one wins.
    engine_id = None
    for e in dart.get_engines():
        if e.data.name == 'elasticsearch_engine':
            engine_id = e.id

    # No ECS task definition is built when engines run locally.
    ecs_task_definition = None if config['dart']['use_local_engines'] else {
        'family': 'dart-%s-elasticsearch_engine' % config['dart']['env_name'],
        'containerDefinitions': [
            {
                'name': 'dart-elasticsearch_engine',
                'cpu': 64,
                'memory': 256,
                'image': engine_config['docker_image'],
                'logConfiguration': {'logDriver': 'syslog'},
                'environment': [
                    {'name': 'DART_ROLE', 'value': 'worker:engine_elasticsearch'},
                    {'name': 'DART_CONFIG', 'value': engine_config['config']},
                    {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']}
                ],
                'mountPoints': [
                    {
                        'containerPath': '/mnt/ecs_agent_data',
                        'sourceVolume': 'ecs-agent-data',
                        'readOnly': True
                    }
                ],
            }
        ],
        'volumes': [
            {
                'host': {'sourcePath': '/var/lib/ecs/data'},
                'name': 'ecs-agent-data'
            }
        ],
    }

    e1 = dart.save_engine(Engine(id=engine_id, data=EngineData(
        name='elasticsearch_engine',
        description='For Elasticsearch clusters',
        options_json_schema={
            'type': 'object',
            'properties': {
                'access_key_id': {
                    'type': 'string',
                    'default': '',
                    'minLength': 0,
                    'maxLength': 20,
                    'description': 'the access_key_id for accessing this elasticsearch cluster. '
                                   + 'Leave blank to use Dart\'s instance profile credentials'
                },
                'secret_access_key': {
                    'type': 'string',
                    'default': '',
                    'minLength': 0,
                    'maxLength': 40,
                    'x-dart-secret': True,
                    'description': 'the secret_access_key for accessing this elasticsearch cluster. '
                                   + 'Leave blank to use Dart\'s instance profile credentials'
                },
                'endpoint': {
                    'type': 'string',
                    'minLength': 1,
                    'maxLength': 256,
                    # Raw string: same characters as before, without relying on
                    # invalid escape sequences (\- and \.) being passed through.
                    'pattern': r'^[a-zA-Z0-9]+[a-zA-Z0-9\-\.]*\.es\.amazonaws\.com$',
                    'description': 'The AWS Elasticsearch domain endpoint that you use to submit index and search requests.'
                },
            },
            'additionalProperties': False,
            'required': ['endpoint']
        },
        supported_action_types=[
            ElasticsearchActionTypes.data_check,
            ElasticsearchActionTypes.create_index,
            ElasticsearchActionTypes.create_mapping,
            ElasticsearchActionTypes.create_template,
            ElasticsearchActionTypes.delete_index,
            ElasticsearchActionTypes.delete_template,
            ElasticsearchActionTypes.force_merge_index,
        ],
        ecs_task_definition=ecs_task_definition
    )))
    # Logger arguments are passed separately so formatting is deferred to the logger.
    _logger.info('saved elasticsearch_engine: %s', e1.id)
Ejemplo n.º 43
0
from dart.client.python.dart_client import Dart
from dart.model.action import Action
from dart.model.action import ActionData
from dart.model.dataset import FileFormat

if __name__ == '__main__':
    # Example script: queue a single 'load_dataset' action on an existing
    # datastore through the dart client and report the id of the saved action.
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    # Arguments handed to the load_dataset action.
    load_args = {
        'dataset_id': 'NVVLBI7CWB',
        's3_path_start_prefix_inclusive': 's3://example-bucket/weblogs/www.retailmenot.com/ec2/2014/52',
        's3_path_end_prefix_exclusive': 's3://example-bucket/weblogs/www.retailmenot.com/ec2/2015/00',
        's3_path_regex_filter': 's3://example-bucket/weblogs/www.retailmenot.com/ec2/2014/../www\\.retailmenot\\.com.*',
        'target_file_format': FileFormat.PARQUET,
    }
    load_action = Action(data=ActionData('load_dataset', 'load_dataset', args=load_args))

    # save_actions takes a list; only one action is submitted, so keep the first result.
    saved_actions = dart.save_actions([load_action], datastore_id='IOMUQ5L8AX')
    action = saved_actions[0]
    print('created action: %s' % action.id)
Ejemplo n.º 44
0
def add_emr_engine_sub_graphs(config):
    """Save the emr_engine subgraph definitions through the dart client.

    Looks up the id of the engine named 'emr_engine' and saves one
    SubGraphDefinition ('consume_subscription_workflow') against that engine.

    :param config: configuration mapping; config['engines']['emr_engine']['options']
        must provide 'dart_host', 'dart_port' and 'dart_api_version'.
    :raises RuntimeError: if dart.get_engines() returns no engine named 'emr_engine'.
    """
    engine_config = config['engines']['emr_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving emr_engine sub_graphs')

    engine_id = None
    for e in dart.get_engines():
        if e.data.name == 'emr_engine':
            engine_id = e.id
    if not engine_id:
        # A bare `raise` has no active exception to re-raise at this point, so it
        # only produced an unrelated error; fail with an explicit message instead.
        raise RuntimeError('emr_engine was not found; it must be saved before its sub_graphs can be added')

    subgraph_definitions = [
        SubGraphDefinition(data=SubGraphDefinitionData(
            name='consume_subscription_workflow',
            description='Add to a datastore to create entities for loading a dataset on an ongoing basis',
            engine_name='emr_engine',
            related_type=EntityType.datastore,
            related_is_a=Relationship.PARENT,
            workflows=[
                Workflow(id=Ref.workflow(1), data=WorkflowData(
                    name='emr-workflow-consume_subscription',
                    datastore_id=Ref.parent(),
                    engine_name='emr_engine',
                )),
            ],
            subscriptions=[
                Subscription(id=Ref.subscription(1), data=SubscriptionData(
                    name='emr-subscription',
                    dataset_id=''
                )),
            ],
            triggers=[
                Trigger(id=Ref.trigger(1), data=TriggerData(
                    name='emr-trigger-subscription-1G-batch',
                    trigger_type_name=subscription_batch_trigger.name,
                    workflow_ids=[Ref.workflow(1)],
                    args={
                        'subscription_id': Ref.subscription(1),
                        # 10^9 bytes, matching the '1G' in the trigger name
                        'unconsumed_data_size_in_bytes': 1000*1000*1000
                    }
                )),
            ],
            actions=[
                Action(id=Ref.action(1), data=ActionData(
                    name='emr-action-consume_subscription',
                    action_type_name=EmrActionTypes.consume_subscription.name,
                    engine_name='emr_engine',
                    workflow_id=Ref.workflow(1),
                    state=ActionState.TEMPLATE,
                    args={'subscription_id': Ref.subscription(1)}
                )),
            ]
        ))
    ]

    for e in subgraph_definitions:
        s = dart.save_subgraph_definition(e, engine_id)
        _logger.info('created subgraph_definition: %s' % s.id)
Ejemplo n.º 45
0
class TestActionCrud(unittest.TestCase):
    """Create/read/delete round trips for actions through the Dart client.

    setUp talks to a dart web service at localhost:5000, so these tests need
    that service to be reachable.
    """

    def setUp(self):
        self.dart = Dart(host='localhost', port=5000)
        datastore_data = DatastoreData(
            name='test-datastore',
            engine_name='no_op_engine',
            args={'action_sleep_time_in_seconds': 0},
            state=DatastoreState.TEMPLATE,
        )
        self.datastore = self.dart.save_datastore(Datastore(data=datastore_data))
        workflow_data = WorkflowData(name='test-workflow', datastore_id=self.datastore.id)
        self.workflow = self.dart.save_workflow(workflow=Workflow(data=workflow_data),
                                                datastore_id=self.datastore.id)
        self.maxDiff = 99999

    def tearDown(self):
        self.dart.delete_datastore(self.datastore.id)
        self.dart.delete_workflow(self.workflow.id)

    @staticmethod
    def _succeeding_action(**extra_data):
        """Build an unsaved no_op 'action_that_succeeds' action; extra_data goes to ActionData."""
        type_name = NoOpActionTypes.action_that_succeeds.name
        return Action(data=ActionData(name=type_name,
                                      action_type_name=type_name,
                                      engine_name='no_op_engine',
                                      **extra_data))

    def _assert_action_deleted(self, action_id):
        """Fail unless fetching action_id raises a DartRequestException with status 404."""
        try:
            self.dart.get_action(action_id)
        except DartRequestException as error:
            self.assertEqual(error.response.status_code, 404)
        else:
            self.fail('action should have been missing after delete!')

    def test_crud_datastore(self):
        local_actions = [self._succeeding_action(), self._succeeding_action()]
        posted_actions = self.dart.save_actions(actions=local_actions, datastore_id=self.datastore.id)

        # copy fields that are populated at creation time
        for idx, local in enumerate(local_actions):
            posted_data = posted_actions[idx].data
            local.data.datastore_id = posted_data.datastore_id
            local.data.args = {}
            local.data.order_idx = posted_data.order_idx
            local.data.user_id = posted_data.user_id

        for idx, local in enumerate(local_actions):
            self.assertEqual(posted_actions[idx].data.to_dict(), local.data.to_dict())

        # When retrieving an action, its queue time and state
        # differ from the default values of the locally built actions
        fetched_actions = [self.dart.get_action(posted_actions[idx].id) for idx in range(2)]
        for local, fetched in zip(local_actions, fetched_actions):
            local.data.state = fetched.data.state
            local.data.queued_time = fetched.data.queued_time

        for local, fetched in zip(local_actions, fetched_actions):
            self.assertEqual(fetched.data.to_dict(), local.data.to_dict())

        for fetched in fetched_actions:
            self.dart.delete_action(fetched.id)

        for fetched in fetched_actions:
            self._assert_action_deleted(fetched.id)

    def test_crud_workflow(self):
        local_actions = [
            self._succeeding_action(state=ActionState.TEMPLATE),
            self._succeeding_action(state=ActionState.TEMPLATE),
        ]
        posted_actions = self.dart.save_actions(local_actions, workflow_id=self.workflow.id)

        # copy fields that are populated at creation time
        for idx, local in enumerate(local_actions):
            posted_data = posted_actions[idx].data
            local.data.workflow_id = posted_data.workflow_id
            local.data.order_idx = posted_data.order_idx
            local.data.args = {}
            local.data.user_id = posted_data.user_id

        for idx, local in enumerate(local_actions):
            self.assertEqual(posted_actions[idx].data.to_dict(), local.data.to_dict())

        fetched_actions = [self.dart.get_action(posted_actions[idx].id) for idx in range(2)]
        for local, fetched in zip(local_actions, fetched_actions):
            self.assertEqual(fetched.data.to_dict(), local.data.to_dict())

        for fetched in fetched_actions:
            self.dart.delete_action(fetched.id)

        for fetched in fetched_actions:
            self._assert_action_deleted(fetched.id)
Ejemplo n.º 46
0
Archivo: s3.py Proyecto: karthich/dart
 def __init__(self, dart_host, dart_port, dart_api_version):
     """Create the dart client and the action-type-name -> handler lookup table."""
     self.dart = Dart(dart_host, dart_port, dart_api_version)
     # Only the S3 copy action is handled here.
     handlers = {}
     handlers[S3ActionTypes.copy.name] = s3_copy
     self._action_handlers = handlers
Ejemplo n.º 47
0
def _no_op_template_action(action_idx, action_type, workflow_idx, order_idx):
    """Build one TEMPLATE no_op_engine action named after its action type."""
    return Action(id=Ref.action(action_idx),
                  data=ActionData(
                      name=action_type.name,
                      action_type_name=action_type.name,
                      engine_name='no_op_engine',
                      workflow_id=Ref.workflow(workflow_idx),
                      order_idx=order_idx,
                      state=ActionState.TEMPLATE,
                  ))


def add_no_op_engine_sub_graphs(config):
    """Save the no_op_engine subgraph definitions through the dart client.

    Looks up the id of the engine named 'no_op_engine' and saves the
    'workflow chaining demo' SubGraphDefinition against it: two workflows,
    seven template actions, and a workflow_completion trigger that lists
    workflow 2 with 'completed_workflow_id' set to workflow 1.

    :param config: configuration mapping; config['engines']['no_op_engine']['options']
        must provide 'dart_host', 'dart_port' and 'dart_api_version'.
    :raises RuntimeError: if dart.get_engines() returns no engine named 'no_op_engine'.
    """
    engine_config = config['engines']['no_op_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving no_op_engine sub_graphs')

    engine_id = None
    for e in dart.get_engines():
        if e.data.name == 'no_op_engine':
            engine_id = e.id
    if not engine_id:
        # A bare `raise` has no active exception to re-raise at this point, so it
        # only produced an unrelated error; fail with an explicit message instead.
        raise RuntimeError('no_op_engine was not found; it must be saved before its sub_graphs can be added')

    succeeds = NoOpActionTypes.action_that_succeeds
    fails = NoOpActionTypes.action_that_fails

    subgraph_definitions = [
        SubGraphDefinition(data=SubGraphDefinitionData(
            name='workflow chaining demo',
            description='demonstrate workflow chaining',
            engine_name='no_op_engine',
            related_type=EntityType.datastore,
            related_is_a=Relationship.PARENT,
            workflows=[
                Workflow(id=Ref.workflow(1),
                         data=WorkflowData(
                             name='no-op-workflow-chaining-wf1',
                             datastore_id=Ref.parent(),
                             engine_name='no_op_engine',
                             state=WorkflowState.ACTIVE,
                         )),
                Workflow(id=Ref.workflow(2),
                         data=WorkflowData(
                             name='no-op-workflow-chaining-wf2',
                             datastore_id=Ref.parent(),
                             engine_name='no_op_engine',
                             state=WorkflowState.ACTIVE,
                         )),
            ],
            actions=[
                # workflow 1: four succeeding actions
                _no_op_template_action(1, succeeds, workflow_idx=1, order_idx=1),
                _no_op_template_action(2, succeeds, workflow_idx=1, order_idx=2),
                _no_op_template_action(3, succeeds, workflow_idx=1, order_idx=3),
                _no_op_template_action(4, succeeds, workflow_idx=1, order_idx=4),
                # workflow 2: two succeeding actions followed by a failing one
                _no_op_template_action(5, succeeds, workflow_idx=2, order_idx=1),
                _no_op_template_action(6, succeeds, workflow_idx=2, order_idx=2),
                _no_op_template_action(7, fails, workflow_idx=2, order_idx=3),
            ],
            triggers=[
                Trigger(id=Ref.trigger(1),
                        data=TriggerData(
                            name='no-op-trigger-workflow-completion',
                            trigger_type_name=workflow_completion_trigger.name,
                            workflow_ids=[Ref.workflow(2)],
                            state=TriggerState.ACTIVE,
                            args={'completed_workflow_id': Ref.workflow(1)})),
            ],
        ))
    ]

    for e in subgraph_definitions:
        s = dart.save_subgraph_definition(e, engine_id)
        _logger.info('created subgraph_definition: %s' % s.id)
Ejemplo n.º 48
0
from dart.client.python.dart_client import Dart
from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, FileFormat, DataType, Compression, RowFormat, \
    LoadType

if __name__ == '__main__':
    # Example script: save a tab-delimited text dataset definition through the dart client.
    dart = Dart('localhost', 5000)
    # presumably only a type hint for IDEs; Dart(...) always returns a Dart instance
    assert isinstance(dart, Dart)

    dataset = dart.save_dataset(
        Dataset(data=(DatasetData(
            name='beacon_native_app_parsed_v01',
            table_name='beacon_native_app',
            location='s3://example-bucket/nb.retailmenot.com/parsed_logs',
            load_type=LoadType.INSERT,
            data_format=DataFormat(FileFormat.TEXTFILE,
                                   RowFormat.DELIMITED,
                                   delimited_by='\t',
                                   quoted_by='"',
                                   escaped_by='\\',
                                   null_string='NULL',
                                   num_header_rows=1),
            compression=Compression.NONE,
            partitions=[
                Column('year', DataType.STRING),
                Column('week', DataType.STRING),
            ],
            columns=[
                Column('logFileId', DataType.BIGINT),
                Column('lineNumber', DataType.INT),
                Column('created',
                       DataType.TIMESTAMP,
                       # NOTE(review): the listing ends here in the middle of this Column(...) call;
                       # the remaining columns and closing brackets are missing from this snippet.
Ejemplo n.º 49
0
from dart.client.python.dart_client import Dart

if __name__ == '__main__':
    # Example script: fetch an existing action, patch its order_idx to 5
    # and report the id of the patched action.
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    existing_action = dart.get_action('8U7H6OLHC5')
    action = dart.patch_action(existing_action, order_idx=5)
    message = 'patched action: %s' % action.id
    print(message)
Ejemplo n.º 50
0
def add_emr_engine(config):
    """Save the 'emr_engine' engine definition through the dart client.

    Builds the engine's options JSON schema, its supported action types and,
    unless config['dart']['use_local_engines'] is truthy, an ECS task
    definition, then passes them to dart.save_engine.

    :param config: configuration mapping. Reads config['engines']['emr_engine']
        ('options', and for the ECS task 'docker_image' and 'config') and
        config['dart'] ('use_local_engines', and for the ECS task 'env_name').
    """
    engine_config = config['engines']['emr_engine']
    opts = engine_config['options']
    dart = Dart(opts['dart_host'], opts['dart_port'], opts['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving emr_engine')

    # Pass along the id of an already saved emr_engine, if there is one
    # (presumably so save_engine updates it rather than creating a duplicate).
    engine_id = None
    for e in dart.get_engines():
        if e.data.name == 'emr_engine':
            engine_id = e.id

    ecs_task_definition = None if config['dart']['use_local_engines'] else {
        'family': 'dart-%s-emr_engine' % config['dart']['env_name'],
        'containerDefinitions': [
            {
                'name': 'dart-emr_engine',
                'cpu': 64,
                'memory': 256,
                'image': engine_config['docker_image'],
                'logConfiguration': {'logDriver': 'syslog'},
                'environment': [
                    {'name': 'DART_ROLE', 'value': 'worker:engine_emr'},
                    {'name': 'DART_CONFIG', 'value': engine_config['config']},
                    {'name': 'AWS_DEFAULT_REGION', 'value': opts['region']}
                ],
                'mountPoints': [
                    {
                        'containerPath': '/mnt/ecs_agent_data',
                        'sourceVolume': 'ecs-agent-data',
                        'readOnly': True
                    }
                ],
            }
        ],
        'volumes': [
            {
                'host': {'sourcePath': '/var/lib/ecs/data'},
                'name': 'ecs-agent-data'
            }
        ],
    }

    # The dots in the release label pattern must be escaped: an unescaped '.'
    # matches any character, so labels such as 'emr-4x2x0' used to validate.
    # Each version component also accepts several digits (e.g. 'emr-5.10.0').
    release_label_pattern = r'^emr-[0-9]+\.[0-9]+\.[0-9]+$'

    e1 = dart.save_engine(Engine(id=engine_id, data=EngineData(
        name='emr_engine',
        description='For EMR clusters that use Hive, Impala, Spark, etc.',
        options_json_schema={
            'type': 'object',
            'properties': {
                'release_label': {'type': 'string', 'pattern': release_label_pattern, 'default': 'emr-4.2.0', 'description': 'desired EMR release label'},
                'instance_type': {'readonly': True, 'type': ['string', 'null'], 'default': 'm3.2xlarge', 'description': 'The ec2 instance type of master/core nodes'},
                'instance_count': {'type': ['integer', 'null'], 'default': None, 'minimum': 1, 'maximum': 50, 'description': 'The total number of nodes in this cluster (overrides data_to_freespace_ratio)'},
                'data_to_freespace_ratio': {'type': ['number', 'null'], 'default': 0.5, 'minimum': 0.0, 'maximum': 1.0, 'description': 'Desired ratio of HDFS data/free-space'},
                'dry_run': {'type': ['boolean', 'null'], 'default': False, 'description': 'write extra_data to actions, but do not actually run'},
                # NOTE(review): the default is None while the declared type is only 'string';
                # confirm whether ['string', 'null'] was intended, as for the other nullable options.
                'ec2_keyname': {'type': 'string', 'description': 'The name of the ec2_key_pair for the emr cluster. If this is not defined, the default key-pair from config is chosen.', 'default': None},
            },
            'additionalProperties': False,
            'required': ['release_label'],
        },
        supported_action_types=[
            EmrActionTypes.start_datastore,
            EmrActionTypes.terminate_datastore,
            EmrActionTypes.load_dataset,
            EmrActionTypes.consume_subscription,
            EmrActionTypes.run_hive_script_action,
            EmrActionTypes.run_impala_script_action,
            EmrActionTypes.run_pyspark_script_action,
            EmrActionTypes.copy_hdfs_to_s3_action
        ],
        ecs_task_definition=ecs_task_definition
    )))
    _logger.info('saved emr_engine: %s' % e1.id)
from dart.client.python.dart_client import Dart
from dart.model.action import Action
from dart.model.action import ActionData
from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, FileFormat, RowFormat, DataType, Compression
from dart.model.datastore import Datastore, DatastoreData, DatastoreState

if __name__ == '__main__':
    # Example script: save a regex-parsed weblog dataset definition through the dart client.
    dart = Dart('localhost', 5000)
    # dart = Dart()
    # presumably only a type hint for IDEs; Dart(...) always returns a Dart instance
    assert isinstance(dart, Dart)

    dataset = dart.save_dataset(Dataset(data=DatasetData(
        name='weblogs_v01',
        table_name='weblogs',
        location='s3://example-bucket/weblogs/www.retailmenot.com/ec2/',
        data_format=DataFormat(
            file_format=FileFormat.TEXTFILE,
            row_format=RowFormat.REGEX,
            # One named capture group per log field, from 'ip' through 'cacheHitMiss'.
            regex_input="(?<ip>^(?:(?:unknown(?:,\\s)?|(?:\\d+\\.\\d+\\.\\d+\\.\\d+(?:,\\s)?))+)|\\S*)\\s+\\S+\\s+(?<userIdentifier>(?:[^\\[]+|\\$\\S+\\['\\S+'\\]|\\[username\\]))\\s*\\s+\\[(?<requestDate>[^\\]]+)\\]\\s+\"(?<httpMethod>(?:GET|HEAD|POST|PUT|DELETE|TRACE))\\s(?<urlPath>(?:[^ ?]+))(?:\\?(?<queryString>(?:[^ ]+)))?\\sHTTP/(?<httpVersion>(?:[\\d\\.]+))\"\\s+(?<statusCode>[0-9]+)\\s+(?<bytesSent>\\S+)\\s+\"(?<referrer>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userAgent>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+(?<responseTime>[-0-9]*)\\s+\"(?<hostName>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userFingerprint>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<userId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<sessionId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<requestId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<visitorId>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<vegSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<fruitSlice>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s+\"(?<cacheHitMiss>(?:[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*))\"\\s*\\Z",
            # NOTE(review): the last placeholder is '%21s' while all others use the '%N$s' form;
            # looks like a missing '$' - confirm against the consumer of regex_output.
            regex_output="%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21s",
        ),
        columns=[
            Column('ip', DataType.STRING),
            Column('user', DataType.STRING),
            Column('requestDate', DataType.TIMESTAMP, date_pattern='dd/MMM/yyyy:HH:mm:ss Z'),
            Column('httpMethod', DataType.STRING),
            Column('urlPath', DataType.STRING),
            Column('queryString', DataType.STRING),
            Column('httpVersion', DataType.STRING),
            Column('statusCode', DataType.STRING),
            Column('bytesSent', DataType.INT),
            # NOTE(review): the listing ends here; the remaining columns and the
            # closing brackets of this call are missing from this snippet.
Ejemplo n.º 52
0
def add_no_op_engine(config):
    """Save the 'no_op_engine' engine definition through the dart client.

    Reads config['engines']['no_op_engine'] and config['dart'], builds an ECS
    task definition unless local engines are in use, and passes the engine's
    option schema and supported action types to dart.save_engine.
    """
    no_op_config = config['engines']['no_op_engine']
    options = no_op_config['options']
    dart = Dart(options['dart_host'], options['dart_port'], options['dart_api_version'])
    assert isinstance(dart, Dart)

    _logger.info('saving no_op_engine')

    # Id of the last engine named 'no_op_engine', or None when there is none.
    matching_ids = [engine.id for engine in dart.get_engines() if engine.data.name == 'no_op_engine']
    engine_id = matching_ids[-1] if matching_ids else None

    if config['dart']['use_local_engines']:
        ecs_task_definition = None
    else:
        family = 'dart-%s-no_op_engine' % config['dart']['env_name']
        container_definition = {
            'name': 'dart-no_op_engine',
            'cpu': 64,
            'memory': 256,
            'image': no_op_config['docker_image'],
            'logConfiguration': {'logDriver': 'syslog'},
            'environment': [
                {'name': 'DART_ROLE', 'value': 'worker:engine_no_op'},
                {'name': 'DART_CONFIG', 'value': no_op_config['config']},
                {'name': 'AWS_DEFAULT_REGION', 'value': options['region']}
            ],
            'mountPoints': [
                {
                    'containerPath': '/mnt/ecs_agent_data',
                    'sourceVolume': 'ecs-agent-data',
                    'readOnly': True
                }
            ],
        }
        agent_data_volume = {
            'host': {'sourcePath': '/var/lib/ecs/data'},
            'name': 'ecs-agent-data'
        }
        ecs_task_definition = {
            'family': family,
            'containerDefinitions': [container_definition],
            'volumes': [agent_data_volume],
        }

    # JSON schema describing the options accepted by this engine.
    options_schema = {
        'type': 'object',
        'properties': {
            'action_sleep_time_in_seconds': {
                'type': 'integer',
                'minimum': 0,
                'default': 5,
                'description': 'The time to sleep for each action before completing'
            },
        },
        'additionalProperties': False,
        'required': [],
    }
    action_types = [
        NoOpActionTypes.action_that_succeeds,
        NoOpActionTypes.action_that_fails,
        NoOpActionTypes.copy_hdfs_to_s3_action,
        NoOpActionTypes.load_dataset,
        NoOpActionTypes.consume_subscription
    ]

    engine_data = EngineData(
        name='no_op_engine',
        description='Helps engineering test dart',
        options_json_schema=options_schema,
        supported_action_types=action_types,
        ecs_task_definition=ecs_task_definition
    )
    e1 = dart.save_engine(Engine(id=engine_id, data=engine_data))
    _logger.info('saved no_op_engine: %s' % e1.id)
Ejemplo n.º 53
0
from dart.client.python.dart_client import Dart
from dart.model.workflow import Workflow, WorkflowState

if __name__ == '__main__':
    # Example script: fetch an existing workflow, mark it INACTIVE and save it back.
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    workflow = dart.get_workflow('456SGU4U6T')
    assert isinstance(workflow, Workflow)

    workflow_data = workflow.data
    workflow_data.state = WorkflowState.INACTIVE
    dart.save_workflow(workflow)
from dart.client.python.dart_client import Dart
from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, FileFormat, DataType, Compression, RowFormat, \
    LoadType

if __name__ == '__main__':
    # Example script: save a gzipped, tab-delimited dataset definition under an
    # explicit dataset id through the dart client.
    dart = Dart('localhost', 5000)
    # presumably only a type hint for IDEs; Dart(...) always returns a Dart instance
    assert isinstance(dart, Dart)

    dataset = dart.save_dataset(Dataset(id='PDUZ8EDNOR', data=(DatasetData(
        name='beacon_native_app_parsed_gzipped_v03',
        table_name='beacon_native_app',
        location='s3://example-bucket/prd/beacon/native_app/v3/dwh-delimited/gzipped',
        load_type=LoadType.INSERT,
        distribution_keys=['created'],
        sort_keys=['created', 'eventtype'],
        hive_compatible_partition_folders=True,
        data_format=DataFormat(
            FileFormat.TEXTFILE,
            RowFormat.DELIMITED,
            delimited_by='\t',
            quoted_by='"',
            escaped_by='\\',
            null_string='NULL',
        ),
        compression=Compression.GZIP,
        partitions=[Column('createdpartition', DataType.STRING)],
        columns=[
            Column('logfileid', DataType.INT),
            Column('linenumber', DataType.INT),
            Column('created', DataType.TIMESTAMP, date_pattern="yyyy-MM-dd HH:mm:ss"),
            Column('remoteip', DataType.VARCHAR, 500),
            # NOTE(review): the listing ends here; the remaining columns and the
            # closing brackets of this call are missing from this snippet.
Ejemplo n.º 55
0
 def setUp(self):
     """Create the dart client (localhost:5000) used by the tests."""
     client = Dart(host='localhost', port=5000)
     self.dart = client
from dart.client.python.dart_client import Dart
from dart.model.action import Action, ActionState
from dart.model.action import ActionData
from dart.model.dataset import Column, DatasetData, Dataset, DataFormat, DataType, Compression, LoadType
from dart.model.datastore import Datastore, DatastoreData, DatastoreState
from dart.model.event import Event, EventState
from dart.model.event import EventData
from dart.model.trigger import Trigger, TriggerData
from dart.model.workflow import Workflow, WorkflowState
from dart.model.workflow import WorkflowData

if __name__ == '__main__':
    dart = Dart('localhost', 5000)
    assert isinstance(dart, Dart)

    dataset = dart.save_dataset(Dataset(data=(DatasetData(
        name='beacon_native_app_v02',
        table_name='beacon_native_app',
        location='s3://example-bucket/prd/beacon/native_app/v2/parquet/snappy',
        hive_compatible_partition_folders=True,
        load_type=LoadType.INSERT,
        data_format=DataFormat('parquet'),
        columns=[
            Column('logFileId', DataType.BIGINT),
            Column('lineNumber', DataType.INT),
            Column('created', DataType.BIGINT),
            Column('remoteip', DataType.STRING),
            Column('useragent', DataType.STRING),
            Column('eventType', DataType.STRING),
            Column('appVersion', DataType.STRING),
            Column('advertiserID', DataType.STRING),