Example 1
def data_check(s3_engine, datastore, action):
    """
    :type s3_engine: dart.engine.s3.s3.S3Engine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    """
    action = s3_engine.dart.patch_action(action, progress=.1)
    args = action.data.args
    offset = args.get('date_offset_in_seconds')
    now = datetime.utcnow()
    s3_path_prefix = substitute_date_tokens(args['s3_path_prefix'], now,
                                            offset)
    bucket_name = get_bucket_name(s3_path_prefix)
    prefix = get_key_name(s3_path_prefix)
    last_modified = args.get('s3_file_last_modified')

    s3_paginator = boto3.client('s3').get_paginator('list_objects')
    for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for element in (page.get('Contents') or []):
            path = 's3://' + bucket_name + '/' + element['Key']
            s3_path_regex = args.get('s3_path_regex')
            if s3_path_regex and not re.match(substitute_date_tokens(s3_path_regex, now, offset), path):
                continue
            if args.get('min_file_size_in_bytes') and element['Size'] < args['min_file_size_in_bytes']:
                continue
            if last_modified and element['LastModified'] < now + timedelta(seconds=offset):
                continue
            # at least one object passed every check, so the data check succeeds
            return

    raise Exception('Data check failed')
Example 2
def data_check(s3_engine, datastore, action):
    """
    :type s3_engine: dart.engine.s3.s3.S3Engine
    :type datastore: dart.model.datastore.Datastore
    :type action: dart.model.action.Action
    """
    action = s3_engine.dart.patch_action(action, progress=.1)
    args = action.data.args
    now = datetime.utcnow()
    if args.get('date_offset_in_seconds'):
        now = now + timedelta(seconds=args['date_offset_in_seconds'])
    s3_path_prefix = substitute_date_tokens(args['s3_path_prefix'], now)
    bucket_name = get_bucket_name(s3_path_prefix)
    prefix = get_key_name(s3_path_prefix)

    s3_paginator = boto3.client('s3').get_paginator('list_objects')
    for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for element in (page.get('Contents') or []):
            path = 's3://' + bucket_name + '/' + element['Key']
            if args.get('s3_path_regex') and not re.match(substitute_date_tokens(args['s3_path_regex'], now), path):
                continue
            if args.get('min_file_size_in_bytes') and element['Size'] < args['min_file_size_in_bytes']:
                continue
            return

    raise Exception('Data check failed')
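Both data_check variants above follow the same pattern: page through the S3 prefix and succeed as soon as a single object passes every configured filter, otherwise raise. A minimal standalone sketch of that listing pattern, with a hypothetical bucket, prefix, and helper name:

import boto3

def any_object_matches(bucket_name, prefix, min_size=0):
    # page through the prefix and stop at the first object that is large enough
    paginator = boto3.client('s3').get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for element in page.get('Contents') or []:
            if element['Size'] >= min_size:
                return True
    return False

# hypothetical usage:
# if not any_object_matches('my-bucket', 'incoming/2016/01/01/', min_size=1):
#     raise Exception('Data check failed')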
Example 3
def _upload_s3_copy_manifests_and_create_tracking_sql_files(
        action, dataset, datastore, batch_size, s3_path_and_updated_generator):
    """
    :type action: dart.model.action.Action
    :type dataset: dart.model.dataset.Dataset
    :type datastore: dart.model.datastore.Datastore
    """
    s3_path_and_updated_iterator = iter(s3_path_and_updated_generator)

    if dataset.data.load_type == LoadType.RELOAD_LAST:
        last = None
        for last in s3_path_and_updated_iterator:
            pass
        s3_path_and_updated_iterator = iter([last] if last else [])

    manifests = []
    tracking_sql_files = []
    current_part = 1
    while True:
        batch = list(islice(s3_path_and_updated_iterator, batch_size))
        if not batch:
            break

        with tempfile.NamedTemporaryFile() as f:
            values = (datastore.data.s3_artifacts_path, action.id,
                      current_part)
            s3_manifest_path = '%s/load-manifests/load-manifest-for-action-%s-part-%s.json' % values
            manifests.append(s3_manifest_path)
            # http://docs.aws.amazon.com/redshift/latest/dg/loading-data-files-using-manifest.html
            data = {
                'entries': [{
                    'mandatory': True,
                    'url': e[0]
                } for e in batch]
            }
            json.dump(data, f)
            # now rewind to the beginning of the file so it can be read
            f.seek(0)
            bucket_name = get_bucket_name(datastore.data.s3_artifacts_path)
            key_name = get_key_name(s3_manifest_path)
            boto3.client('s3').upload_file(f.name, bucket_name, key_name)

        with tempfile.NamedTemporaryFile(delete=False) as f:
            tracking_sql_files.append(f.name)
            schema_name, table_name = get_tracking_schema_and_table_name(
                action)
            sql = 'INSERT INTO %s.%s (s3_path, updated) VALUES \n' % (
                schema_name, table_name)
            sql += ',\n'.join([
                "('%s', %s)" %
                (e[0], "'%s'" % e[1].isoformat() if e[1] else 'NULL')
                for e in batch
            ])
            f.write(sql)

        current_part += 1
    return manifests, tracking_sql_files
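For a batch of two (s3_path, updated) tuples, the tracking SQL assembled above comes out roughly as follows; the schema, table, paths, and timestamp are invented for illustration:

from datetime import datetime

batch = [
    ('s3://my-bucket/data/part-0000.gz', datetime(2016, 1, 1, 12, 30)),
    ('s3://my-bucket/data/part-0001.gz', None),
]
sql = 'INSERT INTO %s.%s (s3_path, updated) VALUES \n' % ('dart_tracking', 'tracked_files')
sql += ',\n'.join([
    "('%s', %s)" % (e[0], "'%s'" % e[1].isoformat() if e[1] else 'NULL')
    for e in batch
])
print(sql)
# INSERT INTO dart_tracking.tracked_files (s3_path, updated) VALUES
# ('s3://my-bucket/data/part-0000.gz', '2016-01-01T12:30:00'),
# ('s3://my-bucket/data/part-0001.gz', NULL)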
Example 4
def _get_raw_config_data(config_path):
    if config_path.startswith('s3://'):
        response = boto3.client('s3').get_object(
            Bucket=get_bucket_name(config_path), Key=get_key_name(config_path))
        return response["Body"].read()

    # use as is for absolute paths, else make it relative to the project root
    path = config_path if config_path.startswith('/') else dart_root_relative_path(config_path)
    with open(path) as f:
        return f.read()
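_get_raw_config_data reads straight from S3 when the path carries the s3:// scheme, and from the local filesystem otherwise. Hypothetical usage (all paths invented):

# read a config stored in S3
raw_yaml = _get_raw_config_data('s3://my-config-bucket/dart/dart-prod.yaml')

# read a config via an absolute path, or relative to the project root
raw_yaml = _get_raw_config_data('/etc/dart/dart-prod.yaml')
raw_yaml = _get_raw_config_data('dart/config/dart-local.yaml')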
Example 5
def _upload_s3_json_manifest(action, dataset, datastore):
    # http://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-copy-from-json.html
    data = {'jsonpaths': ['$.%s' % c.path for c in dataset.data.columns]}
    values = (datastore.data.s3_artifacts_path, action.id)
    s3_json_manifest_path = '%s/json-manifests/json-manifest-for-action-%s.json' % values
    with tempfile.NamedTemporaryFile() as f:
        json.dump(data, f)
        # now rewind to the beginning of the file so it can be read
        f.seek(0)
        bucket_name = get_bucket_name(datastore.data.s3_artifacts_path)
        key_name = get_key_name(s3_json_manifest_path)
        boto3.client('s3').upload_file(f.name, bucket_name, key_name)
    return s3_json_manifest_path
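The file uploaded above is the Redshift COPY jsonpaths manifest described in the linked documentation. For a dataset whose column paths are id, created_at, and payload.user_id (invented for illustration), the uploaded JSON body would be:

import json

column_paths = ['id', 'created_at', 'payload.user_id']  # hypothetical column paths
data = {'jsonpaths': ['$.%s' % p for p in column_paths]}
print(json.dumps(data))
# {"jsonpaths": ["$.id", "$.created_at", "$.payload.user_id"]}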
Example 6
def create_nudge_subscription(subscription, dataset):
    """ :type subscription: dart.model.subscription.Subscription
        :type dataset: dart.model.dataset.Dataset
        :rtype str
    """
    host_url = current_app.dart_context.config.get('nudge').get('host_url')
    path = subscription.data.s3_path_start_prefix_inclusive if subscription.data.s3_path_start_prefix_inclusive else dataset.data.location
    json_body = {
        'Bucket': get_bucket_name(path),
        'Prefix': get_key_name(path),
        'Regex': subscription.data.s3_path_regex_filter,
        'Backfill': True
    }
    response = requests.post(url='{host_url}/Subscribe'.format(host_url=host_url),
                             json=json_body)
    return response.json()['SubscriptionId']
Example 7
def _upload_s3_copy_manifests_and_create_tracking_sql_files(action, dataset, datastore, batch_size,
                                                            s3_path_and_updated_generator):
    """
    :type action: dart.model.action.Action
    :type dataset: dart.model.dataset.Dataset
    :type datastore: dart.model.datastore.Datastore
    """
    s3_path_and_updated_iterator = iter(s3_path_and_updated_generator)

    if dataset.data.load_type == LoadType.RELOAD_LAST:
        last = None
        for last in s3_path_and_updated_iterator:
            pass
        s3_path_and_updated_iterator = iter([last] if last else [])

    manifests = []
    tracking_sql_files = []
    current_part = 1
    while True:
        batch = list(islice(s3_path_and_updated_iterator, batch_size))
        if not batch:
            break

        with tempfile.NamedTemporaryFile() as f:
            values = (datastore.data.s3_artifacts_path, action.id, current_part)
            s3_manifest_path = '%s/load-manifests/load-manifest-for-action-%s-part-%s.json' % values
            manifests.append(s3_manifest_path)
            # http://docs.aws.amazon.com/redshift/latest/dg/loading-data-files-using-manifest.html
            data = {'entries': [{'mandatory': True, 'url': e[0]} for e in batch]}
            json.dump(data, f)
            # now rewind to the beginning of the file so it can be read
            f.seek(0)
            bucket_name = get_bucket_name(datastore.data.s3_artifacts_path)
            key_name = get_key_name(s3_manifest_path)
            boto3.client('s3').upload_file(f.name, bucket_name, key_name)

        with tempfile.NamedTemporaryFile(delete=False) as f:
            tracking_sql_files.append(f.name)
            schema_name, table_name = get_tracking_schema_and_table_name(action)
            sql = 'INSERT INTO %s.%s (s3_path, updated) VALUES \n' % (schema_name, table_name)
            sql += ',\n'.join(["('%s', %s)" % (e[0], "'%s'" % e[1].isoformat() if e[1] else 'NULL') for e in batch])
            f.write(sql)

        current_part += 1
    return manifests, tracking_sql_files
Example 8
    def __init__(self, kms_key_arn, secrets_s3_path):
        self._kms_key_arn = kms_key_arn
        self._secrets_s3_path = secrets_s3_path.rstrip('/')
        self._bucket_name = get_bucket_name(self._secrets_s3_path)
        self._s3_prefix = get_key_name(self._secrets_s3_path)
Example 9
def extract_bucket_key(s3_path):
    return get_bucket_name(s3_path), get_key_name(s3_path)
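Every example on this page leans on get_bucket_name and get_key_name from dart's common utilities; their implementations are not shown here. A minimal sketch, assuming they simply split an s3://bucket/key style path (the _sketch names are not the real ones):

def get_bucket_name_sketch(s3_path):
    # 's3://my-bucket/some/key' -> 'my-bucket'
    return s3_path.split('/')[2]

def get_key_name_sketch(s3_path):
    # 's3://my-bucket/some/key' -> 'some/key'
    return '/'.join(s3_path.split('/')[3:])

assert get_bucket_name_sketch('s3://my-bucket/some/key') == 'my-bucket'
assert get_key_name_sketch('s3://my-bucket/some/key') == 'some/key'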
Example 10
    def create_partial(self, output_config):
        _logger.info('updating configuration with trigger queue urls/arns')
        trigger_queue_arn, trigger_queue_url = self._ensure_queue_exists(
            output_config, 'trigger_queue')
        events_params = output_config['cloudformation_stacks']['events'][
            'boto_args']['Parameters']
        _get_element(events_params, 'ParameterKey',
                     'TriggerQueueUrl')['ParameterValue'] = trigger_queue_url
        _get_element(events_params, 'ParameterKey',
                     'TriggerQueueArn')['ParameterValue'] = trigger_queue_arn

        _logger.info('creating initial stacks')
        events_stack_name = self._create_stack('events', self.mode,
                                               output_config)
        rds_stack_name = self._create_stack('rds', self.mode, output_config)
        elb_stack_name = self._create_stack('elb', self.mode, output_config)
        elb_int_stack_name = self._create_stack('elb-internal', self.mode,
                                                output_config)
        engine_taskrunner_stack_name = self._create_stack(
            'engine-taskrunner', self.mode, output_config)

        _logger.info('waiting for stack completion')
        events_outputs = self._wait_for_stack_completion_and_get_outputs(
            events_stack_name, 1)
        rds_outputs = self._wait_for_stack_completion_and_get_outputs(
            rds_stack_name, 1)
        elb_outputs = self._wait_for_stack_completion_and_get_outputs(
            elb_stack_name, 1)
        elb_int_outputs = self._wait_for_stack_completion_and_get_outputs(
            elb_int_stack_name, 1)
        engine_taskrunner_outputs = self._wait_for_stack_completion_and_get_outputs(
            engine_taskrunner_stack_name, 1)

        _logger.info(
            'updating configuration with new cloudwatch scheduled events sns topic name'
        )
        sns_arn = events_outputs[0]['OutputValue']
        output_config['triggers']['scheduled']['cloudwatch_scheduled_events_sns_arn'] = sns_arn

        _logger.info(
            'updating configuration with new rds endpoint and password')
        db_uri_secret_key = 'database-uri-%s' % self.environment_name
        output_config['flask']['SQLALCHEMY_DATABASE_URI'] = '!decrypt %s' % db_uri_secret_key
        secrets_config = get_secrets_config(output_config)
        secrets_service = Secrets(secrets_config['kms_key_arn'],
                                  secrets_config['secrets_s3_path'])
        rds_pwd = os.environ['DART_RDS_PASSWORD']
        rds_host = rds_outputs[0]['OutputValue']
        secrets_service.put(
            db_uri_secret_key,
            'postgresql://*****:*****@%s:5432/dart' % (rds_pwd, rds_host))

        _logger.info('updating configuration with new elb name')
        web_params = output_config['cloudformation_stacks']['web'][
            'boto_args']['Parameters']
        elb_name_param = _get_element(web_params, 'ParameterKey',
                                      'WebEcsServiceLoadBalancerName')
        elb_name = elb_outputs[0]['OutputValue']
        elb_name_param['ParameterValue'] = elb_name

        _logger.info('updating configuration with new internal elb name')
        web_int_params = output_config['cloudformation_stacks'][
            'web-internal']['boto_args']['Parameters']
        elb_int_name_param = _get_element(web_int_params, 'ParameterKey',
                                          'WebEcsServiceLoadBalancerName')
        elb_int_name = elb_int_outputs[0]['OutputValue']
        elb_int_name_param['ParameterValue'] = elb_int_name

        _logger.info(
            'updating configuration with new engine taskrunner ecs cluster name'
        )
        output_config['dart']['engine_taskrunner_ecs_cluster'] = engine_taskrunner_outputs[0]['OutputValue']

        _logger.info(
            'updating configuration with encrypted dart email username/password'
        )
        mailer_options = output_config['email']['mailer']
        mailer_options['usr'] = '******'
        mailer_options['pwd'] = '!decrypt email-password'
        secrets_service.put('email-username', self.dart_email_username)
        secrets_service.put('email-password', self.dart_email_password)

        _logger.info('uploading the output configuration to s3')
        body = yaml.dump(output_config, default_flow_style=False)
        body = re.sub(r"'!decrypt (.+?)'", r"!decrypt \1", body)
        body = re.sub(r"'!env (.+?)'", r"!env \1", body)
        body = re.sub(r"__DARTBANG__", r"!", body)
        body = re.sub(r"__DARTQUOTE__", r"'", body)
        body = re.sub(r"__DARTDOLLAR__", r"$", body)
        boto3.client('s3').put_object(
            Bucket=get_bucket_name(self.output_config_s3_path),
            Key=get_key_name(self.output_config_s3_path),
            Body=body)

        _logger.info('creating and waiting for web stacks')
        web_stack_name = self._create_stack('web', self.mode, output_config)
        web_internal_stack_name = self._create_stack('web-internal', self.mode,
                                                     output_config)
        web_outputs = self._wait_for_stack_completion_and_get_outputs(
            web_stack_name, 2)
        self._wait_for_stack_completion_and_get_outputs(
            web_internal_stack_name)

        _logger.info('waiting for web ecs service to stabilize')
        cluster_name = _get_element(web_outputs, 'OutputKey',
                                    'EcsClusterResourceName')['OutputValue']
        service_name = _get_element(web_outputs, 'OutputKey',
                                    'WebEcsServiceResourceName')['OutputValue']
        boto3.client('ecs').get_waiter('services_stable').wait(
            cluster=cluster_name, services=[service_name])
        _logger.info('done')

        _logger.info('waiting for web app to attach to load balancer')
        self._wait_for_web_app(elb_name)
        time.sleep(5)

        _logger.info('initializing database schema')
        dart_host = _get_dart_host(output_config)
        response = requests.post('http://%s/admin/create_all' % dart_host)
        response.raise_for_status()
        time.sleep(5)

        _logger.info('creating database triggers')
        with open(dart_root_relative_path('src', 'database',
                                          'triggers.sql')) as f:
            engine = sqlalchemy.create_engine(
                'postgresql://*****:*****@%s:5432/dart' % (rds_pwd, rds_host))
            engine.execute(f.read())
        _logger.info('done')
        time.sleep(5)

        _logger.info('adding engines')
        self._with_retries(add_no_op_engine, output_config)
        self._with_retries(add_no_op_engine_sub_graphs, output_config)
        self._with_retries(add_emr_engine, output_config)
        self._with_retries(add_emr_engine_sub_graphs, output_config)
        self._with_retries(add_dynamodb_engine, output_config)
        self._with_retries(add_redshift_engine, output_config)
        self._with_retries(add_s3_engine, output_config)

        _logger.info('creating and waiting for remaining stacks')
        engine_worker_stack_name = self._create_stack('engine-worker',
                                                      self.mode, output_config)
        trigger_worker_stack_name = self._create_stack('trigger-worker',
                                                       self.mode,
                                                       output_config)
        subscription_worker_stack_name = self._create_stack(
            'subscription-worker', self.mode, output_config)
        self._wait_for_stack_completion_and_get_outputs(
            engine_worker_stack_name)
        self._wait_for_stack_completion_and_get_outputs(
            trigger_worker_stack_name)
        self._wait_for_stack_completion_and_get_outputs(
            subscription_worker_stack_name)
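create_partial repeatedly uses _get_element to pick one entry out of a CloudFormation Parameters or Outputs list by key and value. Its body is not shown on this page; a plausible sketch based solely on how it is called above:

def _get_element_sketch(items, key, value):
    # return the first dict in items whose entry for `key` equals `value`
    for item in items:
        if item.get(key) == value:
            return item
    raise ValueError('no element found with %s == %s' % (key, value))

# hypothetical usage mirroring the calls above
params = [{'ParameterKey': 'TriggerQueueUrl', 'ParameterValue': ''}]
_get_element_sketch(params, 'ParameterKey', 'TriggerQueueUrl')['ParameterValue'] = 'https://sqs.us-east-1.amazonaws.com/123456789012/trigger-queue'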
Example 11
    def create_partial(self, output_config):
        _logger.info('updating configuration with trigger queue urls/arns')
        trigger_queue_arn, trigger_queue_url = self._ensure_queue_exists(output_config, 'trigger_queue')
        events_params = output_config['cloudformation_stacks']['events']['boto_args']['Parameters']
        self._get_element(events_params, 'ParameterKey', 'TriggerQueueUrl')['ParameterValue'] = trigger_queue_url
        self._get_element(events_params, 'ParameterKey', 'TriggerQueueArn')['ParameterValue'] = trigger_queue_arn

        _logger.info('creating initial stacks')
        events_stack_name = self._create_stack('events', output_config)
        rds_stack_name = self._create_stack('rds', output_config)
        elb_stack_name = self._create_stack('elb', output_config)
        elb_int_stack_name = self._create_stack('elb-internal', output_config)
        engine_taskrunner_stack_name = self._create_stack('engine-taskrunner', output_config)

        _logger.info('waiting for stack completion')
        events_outputs = self._wait_for_stack_completion_and_get_outputs(events_stack_name, 1)
        rds_outputs = self._wait_for_stack_completion_and_get_outputs(rds_stack_name, 1)
        elb_outputs = self._wait_for_stack_completion_and_get_outputs(elb_stack_name, 1)
        elb_int_outputs = self._wait_for_stack_completion_and_get_outputs(elb_int_stack_name, 1)
        engine_taskrunner_outputs = self._wait_for_stack_completion_and_get_outputs(engine_taskrunner_stack_name, 1)

        _logger.info('updating configuration with new cloudwatch scheduled events sns topic name')
        sns_arn = events_outputs[0]['OutputValue']
        output_config['triggers']['scheduled']['cloudwatch_scheduled_events_sns_arn'] = sns_arn

        _logger.info('updating configuration with new rds endpoint and password')
        db_uri_secret_key = 'database-uri-%s' % self.environment_name
        output_config['flask']['SQLALCHEMY_DATABASE_URI'] = '!decrypt %s' % db_uri_secret_key
        secrets_config = get_secrets_config(output_config)
        secrets_service = Secrets(secrets_config['kms_key_arn'], secrets_config['secrets_s3_path'])
        rds_pwd = os.environ['DART_RDS_PASSWORD']
        rds_host = rds_outputs[0]['OutputValue']
        secrets_service.put(db_uri_secret_key, 'postgresql://*****:*****@%s:5432/dart' % (rds_pwd, rds_host))

        _logger.info('updating configuration with new elb name')
        web_params = output_config['cloudformation_stacks']['web']['boto_args']['Parameters']
        elb_name_param = self._get_element(web_params, 'ParameterKey', 'WebEcsServiceLoadBalancerName')
        elb_name = elb_outputs[0]['OutputValue']
        elb_name_param['ParameterValue'] = elb_name

        _logger.info('updating configuration with new internal elb name')
        web_int_params = output_config['cloudformation_stacks']['web-internal']['boto_args']['Parameters']
        elb_int_name_param = self._get_element(web_int_params, 'ParameterKey', 'WebEcsServiceLoadBalancerName')
        elb_int_name = elb_int_outputs[0]['OutputValue']
        elb_int_name_param['ParameterValue'] = elb_int_name

        _logger.info('updating configuration with new engine taskrunner ecs cluster name')
        output_config['dart']['engine_taskrunner_ecs_cluster'] = engine_taskrunner_outputs[0]['OutputValue']

        _logger.info('updating configuration with encrypted dart email username/password')
        mailer_options = output_config['email']['mailer']
        mailer_options['usr'] = '******'
        mailer_options['pwd'] = '!decrypt email-password'
        secrets_service.put('email-username', self.dart_email_username)
        secrets_service.put('email-password', self.dart_email_password)

        _logger.info('uploading the output configuration to s3')
        body = yaml.dump(output_config, default_flow_style=False)
        body = re.sub(r"'!decrypt (.+?)'", r"!decrypt \1", body)
        body = re.sub(r"'!env (.+?)'", r"!env \1", body)
        body = re.sub(r"__DARTBANG__", r"!", body)
        body = re.sub(r"__DARTQUOTE__", r"'", body)
        body = re.sub(r"__DARTDOLLAR__", r"$", body)
        boto3.client('s3').put_object(
            Bucket=get_bucket_name(self.output_config_s3_path),
            Key=get_key_name(self.output_config_s3_path),
            Body=body
        )

        _logger.info('creating and waiting for web stacks')
        web_stack_name = self._create_stack('web', output_config)
        web_internal_stack_name = self._create_stack('web-internal', output_config)
        web_outputs = self._wait_for_stack_completion_and_get_outputs(web_stack_name, 2)
        self._wait_for_stack_completion_and_get_outputs(web_internal_stack_name)

        _logger.info('waiting for web ecs service to stabilize')
        cluster_name = self._get_element(web_outputs, 'OutputKey', 'EcsClusterResourceName')['OutputValue']
        service_name = self._get_element(web_outputs, 'OutputKey', 'WebEcsServiceResourceName')['OutputValue']
        boto3.client('ecs').get_waiter('services_stable').wait(cluster=cluster_name, services=[service_name])
        _logger.info('done')

        _logger.info('waiting for web app to attach to load balancer')
        self._wait_for_web_app(elb_name)
        time.sleep(5)

        _logger.info('initializing database schema')
        dart_host = self._get_dart_host(output_config)
        response = requests.post('http://%s/admin/create_all' % dart_host)
        response.raise_for_status()
        time.sleep(5)

        _logger.info('creating database triggers')
        with open(dart_root_relative_path('src', 'database', 'triggers.sql')) as f:
            engine = sqlalchemy.create_engine('postgresql://*****:*****@%s:5432/dart' % (rds_pwd, rds_host))
            engine.execute(f.read())
        _logger.info('done')
        time.sleep(5)

        _logger.info('adding engines')
        add_no_op_engine(output_config)
        add_no_op_engine_sub_graphs(output_config)
        add_emr_engine(output_config)
        add_emr_engine_sub_graphs(output_config)
        add_redshift_engine(output_config)

        _logger.info('creating and waiting for remaining stacks')
        engine_worker_stack_name = self._create_stack('engine-worker', output_config)
        trigger_worker_stack_name = self._create_stack('trigger-worker', output_config)
        subscription_worker_stack_name = self._create_stack('subscription-worker', output_config)
        self._wait_for_stack_completion_and_get_outputs(engine_worker_stack_name)
        self._wait_for_stack_completion_and_get_outputs(trigger_worker_stack_name)
        self._wait_for_stack_completion_and_get_outputs(subscription_worker_stack_name)