Beispiel #1
0
    def __init__(self, config_loc, config_override_loc, emailer,
                 num_processes=1, wait_timeout_sec=60):
        """
        Store the configuration locations, read settings, open the table
        connections and create the condition variable used by this object.

        :param config_loc: path of config.yaml
        :type config_loc: string

        :param config_override_loc: path of config-env-dev.yaml
        :type config_override_loc: string

        :param emailer: object kept as ``self.emailer``
            (NOTE(review): presumably used to send notification mails --
            confirm against the callers)

        :param num_processes: number of worker processes to use for sqs request
        :type num_processes: int

        :param wait_timeout_sec: A timeout passed to conditional variable wait
            function.  If thread is woken up on timeout, do some maintenance
            work.  Values below 60 are raised to 60.
        :type wait_timeout_sec: int

        """
        # Values taken straight from the arguments.
        self.emailer = emailer
        self._num_processes = num_processes
        self._config_loc = config_loc
        self._config_override_loc = config_override_loc

        # Run-state flags; both start switched off.
        self._run_once = False
        self._stop_requested = False

        # Settings and collaborators obtained from the environment.
        self.max_error_retries = staticconf.read_int('max_error_retries')
        self.etl_helper = ETLStatusHelper()
        self.jobs_db = TableConnection.get_connection('ScheduledJobs')
        self.runs_db = TableConnection.get_connection('ETLRecords')

        # Condition variable backed by a plain (non-reentrant) lock.
        plain_lock = threading.Lock()
        self._cond = threading.Condition(plain_lock)
        # 60 seconds is the smallest timeout that will ever be used.
        self._wait_timeout_sec = max(wait_timeout_sec, 60)
Beispiel #2
0
def et_scanner_main(args):
    """ Create an instance of ETScanner and call its ``run`` method.

    :param args: parsed arguments; handed to ``setup_config`` and its
        ``run_local`` attribute is passed to ``Mailer``
    """
    setup_config(args, 'ETScanner')

    # Queue names come from configuration, not from the arguments.
    scanner_queue = SQSWrapper(read_string("sqs.et_scanner_queue_name"))
    worker_queue = SQSWrapper(read_string("sqs.et_queue_name"))

    jobs_table = TableConnection.get_connection('ScheduledJobs')
    mailer = Mailer(args.run_local)
    ETScanner(jobs_table, scanner_queue, worker_queue, mailer).run()
Beispiel #3
0
def et_scanner_main(args):
    """ Create an instance of ETScanner and call its ``run`` method.

    :param args: parsed arguments; handed to ``setup_config`` and its
        ``run_local`` attribute is passed to ``Mailer``
    """
    setup_config(args, 'ETScanner')

    # Build the scanner queue first, then the worker queue, each from its
    # configured name.
    scanner_queue, worker_queue = (
        SQSWrapper(read_string(config_key))
        for config_key in ("sqs.et_scanner_queue_name", "sqs.et_queue_name")
    )

    scanner = ETScanner(
        TableConnection.get_connection('ScheduledJobs'),
        scanner_queue,
        worker_queue,
        Mailer(args.run_local),
    )
    scanner.run()
Beispiel #4
0
def post_job(scheduled_jobs_object, et_scanner_sqs, request_body_str):
    r"""
    Validate a JSON job request, store it in ScheduledJobs and, when the
    store succeeds, write a message to the scanner queue.

    the request body should be a dictionary (so \*\*request_body are kwargs),
    with the following required keys (checked by ``_check_required_args``):

    * redshift_id
    * log_name
    * log_schema_version
    * start_date
    * s3_path
    * contact_emails

    ``end_date`` defaults to ``None`` and ``log_format`` to ``"json"`` when
    they are absent (``log_format`` also when it is empty).

    :param scheduled_jobs_object: the ScheduledJobs to which we post jobs
    :type scheduled_jobs_object: an instance of ScheduledJobs
    :param et_scanner_sqs: the scanner sqs to send message to
    :type et_scanner_sqs: SQSWrapper object
    :param request_body_str: a string version of the request body dict
    :type request_body_str: string

    :returns: ``{"post_accepted": {"result": <put result>, "uuid": <uuid>}}``
    :rtype: dict

    :raises ValueError: if a date is not in ``YYYY-MM-DD`` form or if
        ``start_date`` is later than ``end_date``
    """
    # The docstring is raw so that the reST ``\*`` escapes are not read as
    # (invalid) Python string escapes.
    job = simplejson.loads(request_body_str)
    if "contact_emails" in job and job["contact_emails"] is not None:
        # stored as a set, which also drops duplicate addresses
        job["contact_emails"] = set(job["contact_emails"])
    _check_required_args(job, "post")

    date_format = "%Y-%m-%d"
    start = datetime.datetime.strptime(job["start_date"], date_format)
    # A missing end_date is stored explicitly as None.
    end_text = job.setdefault("end_date", None)
    if end_text is not None:
        end = datetime.datetime.strptime(end_text, date_format)
        if start > end:
            raise ValueError("start date should not be greater than end date")

    if not job.get("log_format"):
        job["log_format"] = "json"

    job["additional_arguments"] = _validate_additional_args(job)

    # check that redshift cluster exists, throws ItemNotFound
    list_cluster_by_name(TableConnection.get_connection("RedshiftClusters"), job["redshift_id"])

    job["hash_key"] = _create_hash_key(job)
    job["uuid"] = _get_uuid(scheduled_jobs_object)
    job["et_status"] = NULL
    ret = scheduled_jobs_object.put(**job)
    if ret:
        # TODO: use meaningful message instead of dummy
        et_scanner_sqs.write_message_to_queue({"message": "dummy"})
    return {"post_accepted": {"result": ret, "uuid": job["uuid"]}}
Beispiel #5
0
def runs_filtered(request):
    r"""
    runs_filtered handles requests from the runs endpoint with a job_id.  All
    runs for the job_id requested are returned.

    **GET /v1/runs/**\ *{string: job_id}*

    **Query Parameters:**

    * **job_id** - the job_id of the runs we wish to review

    Example: ``/v1/runs/234332104332``

    *Example Response* ::

        [
            {
             'job_id': <string>,
             'etl_status': <string>,
             'last_updated': timestamp,
             'data_date': 'YYYY-mm-dd',
             'schema_checksum': <string>,
             's3_path': <string>,
             'et_starttime': timestamp,
             'et_runtime': int,
             'load_starttime': timestamp,
             'load_runtime': int,
             'redshift_id': <string>,
             'db_schema': <string>,
             'run_by': <string>,
             'additional_arguments': <string>
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      Invalid job_id
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    :param request: request whose ``matchdict`` may carry ``job_id``
    :returns: ``(status_code, body)``; on failure the body is
        ``{'error': repr(exception)}``
    :rtype: tuple
    """
    # The docstring is raw so that the reST ``\ `` escape is not read as an
    # (invalid) Python string escape.
    job_id = request.matchdict.get('job_id', None)

    try:
        runs = list_runs_by_job_id(
            job_id, TableConnection.get_connection('ETLRecords'))
    except ValueError as e:
        # the lookup signals a bad job_id with ValueError
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
    return 200, runs
Beispiel #6
0
    def __init__(self,
                 config_loc,
                 config_override_loc,
                 emailer,
                 num_processes=1,
                 wait_timeout_sec=60):
        """
        Store the configuration locations, read settings, open the table
        connections and create the condition variable used by this object.

        :param config_loc: path of config.yaml
        :type config_loc: string

        :param config_override_loc: path of config-env-dev.yaml
        :type config_override_loc: string

        :param emailer: object kept as ``self.emailer``
            (NOTE(review): presumably used to send notification mails --
            confirm against the callers)

        :param num_processes: number of worker processes to use for sqs request
        :type num_processes: int

        :param wait_timeout_sec: A timeout passed to conditional variable wait
            function.  If thread is woken up on timeout, do some maintenance
            work.  Values below 60 are raised to 60.
        :type wait_timeout_sec: int

        """
        # Run-state flags; both start switched off.
        self._stop_requested = False
        self._run_once = False

        # Values taken straight from the arguments.
        self._config_loc = config_loc
        self._config_override_loc = config_override_loc
        self._num_processes = num_processes
        self.emailer = emailer

        # Settings and collaborators obtained from the environment.
        self.max_error_retries = staticconf.read_int('max_error_retries')
        self.etl_helper = ETLStatusHelper()
        self.jobs_db = TableConnection.get_connection('ScheduledJobs')
        self.runs_db = TableConnection.get_connection('ETLRecords')

        # Condition variable backed by a plain (non-reentrant) lock.
        self._cond = threading.Condition(threading.Lock())
        # The timeout is never allowed to drop below 60 seconds.
        minimum_timeout_sec = 60
        self._wait_timeout_sec = max(wait_timeout_sec, minimum_timeout_sec)
Beispiel #7
0
def jobs_update_job(request):
    """
    jobs_update_job handles PUT requests from the jobs endpoint by passing
    the request body to ``put_job``.

    **PUT /v1/jobs/job/**

    Example: ``v1/jobs/job/``

    **Query Parameters:**

    * **request.body** -- the json string of job details

    *Example request.body* ::

        "{ 'log_name': 'ad_click',
           'log_schema_version': 'initial',
           'start_date': '2014-04-01',
           'end_date': '',
           'redshift_id': 'rs1',
           'cancel_requested': True,
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key: redshift_id, log_name,
                 log_schema_version and start_date must all be present;
                 also returned when the body is not valid json
    **404**      invalid job parameters
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    try:
        jobs_table = TableConnection.get_connection('ScheduledJobs')
        outcome = put_job(jobs_table, get_scanner_queue('et'), request.body)
    except PrimaryKeyError:
        return 400, {'error': 'bad hash_key'}
    except JSONDecodeError:
        # listed before ValueError so the decode error gets its own message
        return 400, {'error': 'json decode error'}
    except ValueError as e:
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
    return 200, outcome
Beispiel #8
0
def cluster_by_name(request):
    r"""
    cluster_by_name returns a dictionary with key "clusters" and value of a
    list of clusters with one entry -- the cluster specified in the route

    **GET /v1/clusters/**\ *{string: cluster_name}*

    **Query Parameters:**

    * **cluster_name** - the name of the cluster we want to look up

    Example: ``/v1/clusters/cluster-1``

    *Example Response* ::

        { 'clusters': [
            {
                'redshift_id': 'cluster-1',
                'port': 5439,
                'host': 'cluster-1.account.region.redshift.amazonaws.com',
                'db_schema': 'public',
                'groups': ['search_infra', 'biz']
            }]
        }

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid cluster_name
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    :param request: request whose ``matchdict`` may carry ``cluster_name``
    :returns: ``(status_code, body)``; on failure the body is
        ``{'error': repr(exception)}``
    :rtype: tuple
    """
    # The docstring is raw so that the reST ``\ `` escape is not read as an
    # (invalid) Python string escape.
    cluster_name = request.matchdict.get('cluster_name')

    try:
        found = list_cluster_by_name(
            TableConnection.get_connection('RedshiftClusters'), cluster_name)
    except ValueError as e:
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
    return 200, found
Beispiel #9
0
def jobs_update_job(request):
    """
    jobs_update_job handles PUT requests from the jobs endpoint by passing
    the request body to ``put_job``.

    **PUT /v1/jobs/job/**

    Example: ``v1/jobs/job/``

    **Query Parameters:**

    * **request.body** -- the json string of job details

    *Example request.body* ::

        "{ 'log_name': 'ad_click',
           'log_schema_version': 'initial',
           'start_date': '2014-04-01',
           'end_date': '',
           'redshift_id': 'rs1',
           'cancel_requested': True,
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key: redshift_id, log_name,
                 log_schema_version and start_date must all be present;
                 also returned when the body is not valid json
    **404**      invalid job parameters
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """
    try:
        updated = put_job(
            TableConnection.get_connection("ScheduledJobs"),
            get_scanner_queue("et"),
            request.body,
        )
        return 200, updated
    except PrimaryKeyError:
        status, message = 400, "bad hash_key"
    except JSONDecodeError:
        # listed before ValueError so the decode error gets its own message
        status, message = 400, "json decode error"
    except ValueError as e:
        status, message = 404, repr(e)
    except Exception as unknown_exception:
        status, message = 500, repr(unknown_exception)
    return status, {"error": message}
Beispiel #10
0
    def _get_redshift_cluster_details(self, rs_id):
        """ Get the host, port and schema for a particular redshift id
        :param rs_id: redshift id of a cluster e.g., cluster-1
        :type rs_id: string
        :returns: a 3 tuple containing a redshift host name, redshift port
            and db schema, in that order
        :rtype: tuple
        :raises ValueError: if the cluster lookup returns nothing
        """
        clusters_table = TableConnection.get_connection('RedshiftClusters')
        cluster = list_cluster_by_name(clusters_table, rs_id)

        if not cluster:
            raise ValueError("No cluster named: {0}".format(rs_id))

        port, host, schema = (
            cluster[field] for field in ('port', 'host', 'db_schema'))
        return host, port, schema
Beispiel #11
0
def clusters(request):
    """
    clusters handles GET and POST requests from the clusters endpoint

    **GET /v1/clusters/**

    Example: ``/v1/clusters/``

    *Example Response* ::

        [
            {
                'redshift_id': 'cluster-1',
                'port': 5439,
                'host': 'cluster-1.account.region.redshift.amazonaws.com',
                'db_schema': 'public',
                'groups': ['search_infra', 'biz']
            },
            {
                'redshift_id': 'cluster-1-user',
                'port': 5439,
                'host': 'cluster-1-user.account.region.redshift.amazonaws.com',
                'db_schema': 'public',
                'groups': ['search_infra', 'log_infra']
            },
            {
                'redshift_id': 'cluster-2',
                'port': 5439,
                'host': 'cluster-2.account.region.redshift.amazonaws.com',
                'db_schema': 'public',
                'groups': ['mobile', 'log_infra']
            },
        ]


    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    **POST /v1/clusters/**


    Example: ``/v1/clusters``

    **Query Parameters:**

    * **request.body** -- the json string of cluster details

    *Example request.body* ::

        "{
            'redshift_id': 'cluster-2',
            'port': 5439,
            'host': 'cluster-2.account.region.redshift.amazonaws.com'
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key or missing required arguments
    **404**      invalid cluster parameters (including a possible
                 duplicate cluster)
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """

    try:
        if request.method == "POST":
            table = TableConnection.get_connection('RedshiftClusters')
            return 200, post_cluster(table, request.body)
        if request.method == "GET":
            table = TableConnection.get_connection('RedshiftClusters')
            return 200, list_all_clusters(table)
        # NOTE(review): any other method falls through and the function
        # returns None rather than a (status, body) tuple.
    except PrimaryKeyError:
        return 400, {'error': 'bad hash_key or missing required arguments'}
    except ValueError as e:
        message = repr(e)
        if "ConditionalCheckFailedException" in message:
            # replace the raw failure with a hint about its likely cause
            message = "ConditionalCheckFailed; possible duplicate cluster"
        return 404, {'error': message}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
Beispiel #12
0
def jobs_filtered(request):
    r"""
    jobs_filtered handles requests from the jobs endpoint with a log_name and
    optional version.  If there's no version, all jobs for the given
    log_name will be returned, otherwise all jobs for the log name and version
    combination will be returned.

    **GET /v1/jobs/**\ *{string: log_name}*

    **Query Parameters:**

    * **log_name** - the name of the log for which we want to see jobs

    Example: ``/v1/jobs/ad_click``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"load_step": ["--force-load"]}'
            },
            {'log_name': 'ad_click',
             'log_schema_version': 'minimal',
             's3_log_uri': 'http://ad_min/schema.yaml?Signature=b?Expires=b?AccessKeyId=yyy',
             'start_date': '2014-05-01',
             'end_date': '2014-05-07',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"load_step": ["--force-load"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid log_name
    **500**      unknown exception
    ============ ===========


    **GET /v1/jobs/**\ *{string: log_name}/{string: log_schema_version}*

    **Query Parameters:**

    * **log_name** - the name of the log for which we want to see jobs
    * **log_schema_version** - the version of the log for which we want to see jobs

    Example: ``/v1/jobs/ad_click/initial``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid log_name or log_version
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    :param request: request whose ``matchdict`` may carry ``log_name`` and
        ``log_schema_version``
    :returns: ``(status_code, body)``; on failure the body is
        ``{'error': repr(exception)}``
    :rtype: tuple
    """
    # The docstring is raw so that the reST ``\ `` escapes are not read as
    # (invalid) Python string escapes.
    route_values = request.matchdict
    log_name = route_values.get('log_name')
    log_version = route_values.get('log_schema_version', None)

    try:
        if log_version is not None:
            # name + version route
            return 200, list_jobs_by_name_version(
                log_name, log_version,
                TableConnection.get_connection('ScheduledJobs'))
        # name-only route
        return 200, list_jobs_by_name(
            log_name, TableConnection.get_connection('ScheduledJobs'))
    except ValueError as e:
        return 404, {'error': repr(e)}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
Beispiel #13
0
 def __init__(self):
     """Open the ETLRecords table connection and build this worker's id.

     The worker id has the form ``<hostname>:<pid>``.
     """
     self.etl_db = TableConnection.get_connection('ETLRecords')
     host_name = socket.gethostname()
     process_id = os.getpid()
     self.worker_id = '{0}:{1}'.format(host_name, process_id)
Beispiel #14
0
 def __init__(self):
     """Open the ETLRecords table connection and build this worker's id.

     The worker id has the form ``<hostname>:<pid>``.
     """
     records_table = TableConnection.get_connection('ETLRecords')
     self.etl_db = records_table
     # hostname and process id of the current process
     id_parts = (socket.gethostname(), os.getpid())
     self.worker_id = '{0}:{1}'.format(*id_parts)
Beispiel #15
0
 def _table_conn(self):
     """Return the RedshiftClusters table connection, creating it on first use.

     The connection is kept on the instance as ``_cached_table_conn``;
     ``_configure_mycroft`` is called once, just before it is created.
     """
     if hasattr(self, '_cached_table_conn'):
         return self._cached_table_conn

     # First call on this instance: configure, connect and remember.
     self._configure_mycroft()
     self._cached_table_conn = TableConnection.get_connection(
         'RedshiftClusters')
     return self._cached_table_conn
Beispiel #16
0
 def _table_conn(self):
     """Return the RedshiftClusters table connection, creating it on first use.

     The connection is kept on the instance as ``_cached_table_conn``;
     ``_configure_mycroft`` is called once, just before it is created.
     """
     already_connected = hasattr(self, '_cached_table_conn')
     if not already_connected:
         self._configure_mycroft()
         connection = TableConnection.get_connection('RedshiftClusters')
         self._cached_table_conn = connection
     return self._cached_table_conn
Beispiel #17
0
def jobs_filtered(request):
    r"""
    jobs_filtered handles requests from the jobs endpoint with a log_name and
    optional version.  If there's no version, all jobs for the given
    log_name will be returned, otherwise all jobs for the log name and version
    combination will be returned.

    **GET /v1/jobs/**\ *{string: log_name}*

    **Query Parameters:**

    * **log_name** - the name of the log for which we want to see jobs

    Example: ``/v1/jobs/ad_click``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"load_step": ["--force-load"]}'
            },
            {'log_name': 'ad_click',
             'log_schema_version': 'minimal',
             's3_log_uri': 'http://ad_min/schema.yaml?Signature=b?Expires=b?AccessKeyId=yyy',
             'start_date': '2014-05-01',
             'end_date': '2014-05-07',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"load_step": ["--force-load"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid log_name
    **500**      unknown exception
    ============ ===========


    **GET /v1/jobs/**\ *{string: log_name}/{string: log_schema_version}*

    **Query Parameters:**

    * **log_name** - the name of the log for which we want to see jobs
    * **log_schema_version** - the version of the log for which we want to see jobs

    Example: ``/v1/jobs/ad_click/initial``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            }
        ]

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **404**      invalid log_name or log_version
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    :param request: request whose ``matchdict`` may carry ``log_name`` and
        ``log_schema_version``
    :returns: ``(status_code, body)``; on failure the body is
        ``{"error": repr(exception)}``
    :rtype: tuple
    """
    # The docstring is raw so that the reST ``\ `` escapes are not read as
    # (invalid) Python string escapes.
    log_name = request.matchdict.get("log_name")
    log_version = request.matchdict.get("log_schema_version", None)

    try:
        if log_version is None:
            # name-only route
            matches = list_jobs_by_name(log_name, TableConnection.get_connection("ScheduledJobs"))
        else:
            # name + version route
            matches = list_jobs_by_name_version(
                log_name, log_version, TableConnection.get_connection("ScheduledJobs")
            )
    except ValueError as e:
        return 404, {"error": repr(e)}
    except Exception as unknown_exception:
        return 500, {"error": repr(unknown_exception)}
    return 200, matches
Beispiel #18
0
def jobs(request):
    """
    jobs handles GET and POST requests from the jobs endpoint: GET lists
    all jobs, POST creates a job from the request body

    **GET /v1/jobs/**

    Example: ``/v1/jobs/``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            },
            {'log_name': 'ad_click',
             'log_schema_version': 'minimal',
             's3_log_uri': 'http://ad_min/schema.yaml?Signature=b?Expires=b?AccessKeyId=yyy',
             'start_date': '2014-05-01',
             'end_date': '2014-05-07',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            },
            {'log_name': 'bing_geocoder',
             'log_schema_version': 'bing2',
             's3_log_uri': 'http://bing/schema.yaml?Signature=b?Expires=a?AccessKeyId=zzz',
             'start_date': '2014-05-02',
             'end_date': '2014-06-07',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            }
        ]


    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    **POST /v1/jobs/**

    Example: ``v1/jobs``

    **Query Parameters:**

    * **request.body** -- the json string of job details

    *Example request.body* ::

        "{ 'log_name': 'ad_click',
           'log_schema_version': 'initial',
           's3_log_uri': 'llll',
           'start_date': '2014-04-01',
           'end_date': '',
           'contact_emails': ['*****@*****.**', '*****@*****.**'],
           'redshift_id': 'rs1',
           'additional_arguments': '{"load_step": ["--force-load"]}'
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key: redshift_id, log_name,
                 log_schema_version and start_date must all be present
    **404**      invalid job parameters (including a possible duplicate
                 job)
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """

    try:
        if request.method == "POST":
            jobs_table = TableConnection.get_connection('ScheduledJobs')
            return 200, post_job(
                jobs_table, get_scanner_queue('et'), request.body)
        if request.method == "GET":
            jobs_table = TableConnection.get_connection('ScheduledJobs')
            return 200, list_all_jobs(jobs_table)
        # NOTE(review): any other method falls through and the function
        # returns None rather than a (status, body) tuple.
    except PrimaryKeyError:
        return 400, {'error': 'bad hash_key'}
    except ValueError as e:
        message = repr(e)
        if "ConditionalCheckFailedException" in message:
            # replace the raw failure with a hint about its likely cause
            message = ("ConditionalCheckFailed; possible duplicate job.  "
                       "Delete existing job first")
        return 404, {'error': message}
    except Exception as unknown_exception:
        return 500, {'error': repr(unknown_exception)}
Beispiel #19
0
def jobs(request):
    """
    jobs handles GET and POST requests from the jobs endpoint: GET lists
    all jobs, POST creates a job from the request body

    **GET /v1/jobs/**

    Example: ``/v1/jobs/``

    *Example Response* ::

        [
            {'log_name': 'ad_click',
             'log_schema_version': 'initial',
             's3_log_uri': 'http://ad_click/schema.yaml?Signature=b?Expires=c?AccessKeyId=xxx',
             'start_date': '2014-05-01',
             'end_date': '',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            },
            {'log_name': 'ad_click',
             'log_schema_version': 'minimal',
             's3_log_uri': 'http://ad_min/schema.yaml?Signature=b?Expires=b?AccessKeyId=yyy',
             'start_date': '2014-05-01',
             'end_date': '2014-05-07',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            },
            {'log_name': 'bing_geocoder',
             'log_schema_version': 'bing2',
             's3_log_uri': 'http://bing/schema.yaml?Signature=b?Expires=a?AccessKeyId=zzz',
             'start_date': '2014-05-02',
             'end_date': '2014-06-07',
             'contact_emails': ['*****@*****.**', '*****@*****.**'],
             'redshift_id': 'abc123',
             'additional_arguments': '{"et_step": ["--force-et"]}'
            }
        ]


    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*

    **POST /v1/jobs/**

    Example: ``v1/jobs``

    **Query Parameters:**

    * **request.body** -- the json string of job details

    *Example request.body* ::

        "{ 'log_name': 'ad_click',
           'log_schema_version': 'initial',
           's3_log_uri': 'llll',
           'start_date': '2014-04-01',
           'end_date': '',
           'contact_emails': ['*****@*****.**', '*****@*****.**'],
           'redshift_id': 'rs1',
           'additional_arguments': '{"load_step": ["--force-load"]}'
        }"

    ============ ===========
    Status Code  Description
    ============ ===========
    **200**      Success
    **400**      bad hash_key: redshift_id, log_name,
                 log_schema_version and start_date must all be present
    **404**      invalid job parameters (including a possible duplicate
                 job)
    **500**      unknown exception
    ============ ===========

    * **Encoding type:** *application/json*
    """

    try:
        if request.method == "POST":
            created = post_job(
                TableConnection.get_connection("ScheduledJobs"),
                get_scanner_queue("et"),
                request.body,
            )
            return 200, created
        elif request.method == "GET":
            listing = list_all_jobs(TableConnection.get_connection("ScheduledJobs"))
            return 200, listing
        # NOTE(review): any other method falls through and the function
        # returns None rather than a (status, body) tuple.
    except PrimaryKeyError:
        return 400, {"error": "bad hash_key"}
    except ValueError as e:
        if "ConditionalCheckFailedException" not in repr(e):
            return 404, {"error": repr(e)}
        # a failed conditional check most likely means the job already exists
        duplicate_hint = (
            "ConditionalCheckFailed; possible duplicate job.  "
            "Delete existing job first"
        )
        return 404, {"error": duplicate_hint}
    except Exception as unknown_exception:
        return 500, {"error": repr(unknown_exception)}
Beispiel #20
0
def post_job(scheduled_jobs_object, et_scanner_sqs, request_body_str):
    r"""
    Validate a JSON job request, store it in ScheduledJobs and, when the
    store succeeds, write a message to the scanner queue.

    the request body should be a dictionary (so \*\*request_body are kwargs),
    with the following required keys (checked by ``_check_required_args``):

    * redshift_id
    * log_name
    * log_schema_version
    * start_date
    * s3_path
    * contact_emails

    ``end_date`` defaults to ``None`` and ``log_format`` to ``'json'`` when
    they are absent (``log_format`` also when it is empty).

    :param scheduled_jobs_object: the ScheduledJobs to which we post jobs
    :type scheduled_jobs_object: an instance of ScheduledJobs
    :param et_scanner_sqs: the scanner sqs to send message to
    :type et_scanner_sqs: SQSWrapper object
    :param request_body_str: a string version of the request body dict
    :type request_body_str: string

    :returns: ``{'post_accepted': {'result': <put result>, 'uuid': <uuid>}}``
    :rtype: dict

    :raises ValueError: if a date is not in ``YYYY-MM-DD`` form or if
        ``start_date`` is later than ``end_date``
    """
    # The docstring is raw so that the reST ``\*`` escapes are not read as
    # (invalid) Python string escapes.
    body = simplejson.loads(request_body_str)
    if 'contact_emails' in body and body['contact_emails'] is not None:
        # stored as a set, which also drops duplicate addresses
        body['contact_emails'] = set(body['contact_emails'])
    _check_required_args(body, 'post')

    s_date = datetime.datetime.strptime(body['start_date'], "%Y-%m-%d")
    # A missing end_date is stored explicitly as None.
    raw_end_date = body.setdefault('end_date', None)
    if raw_end_date is not None:
        e_date = datetime.datetime.strptime(raw_end_date, "%Y-%m-%d")
        if s_date > e_date:
            raise ValueError("start date should not be greater than end date")

    if not body.get('log_format'):
        body['log_format'] = 'json'

    body['additional_arguments'] = _validate_additional_args(body)

    # check that redshift cluster exists, throws ItemNotFound
    list_cluster_by_name(TableConnection.get_connection('RedshiftClusters'),
                         body['redshift_id'])

    body['hash_key'] = _create_hash_key(body)
    body['uuid'] = _get_uuid(scheduled_jobs_object)
    body['et_status'] = NULL
    ret = scheduled_jobs_object.put(**body)
    if ret:
        # TODO: use meaningful message instead of dummy
        dummy_message = {'message': 'dummy'}
        et_scanner_sqs.write_message_to_queue(dummy_message)
    accepted = {'result': ret, 'uuid': body['uuid']}
    return {'post_accepted': accepted}