def validate_job(self, request):
        """ Gets file for job, validates each row, and sends valid rows to a staging table
        Args:
        request -- HTTP request containing the jobId
        Returns:
        Http response object
        """
        # Create connection to job tracker database
        sess = GlobalDB.db().session

        requestDict = RequestDictionary(request)
        if requestDict.exists('job_id'):
            job_id = requestDict.getValue('job_id')
        else:
            # Request does not have a job ID, can't validate
            validation_error_type = ValidationError.jobError
            raise ResponseException('No job ID specified in request',
                                    StatusCode.CLIENT_ERROR, None,
                                    validation_error_type)

        # Get the job
        job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
        if job is None:
            validation_error_type = ValidationError.jobError
            writeFileError(job_id, None, validation_error_type)
            raise ResponseException(
                'Job ID {} not found in database'.format(job_id),
                StatusCode.CLIENT_ERROR, None, validation_error_type)

        # Make sure job's prerequisites are complete
        if not run_job_checks(job_id):
            validation_error_type = ValidationError.jobError
            writeFileError(job_id, None, validation_error_type)
            raise ResponseException(
                'Prerequisites for Job ID {} are not complete'.format(job_id),
                StatusCode.CLIENT_ERROR, None, validation_error_type)

        # Make sure this is a validation job
        if job.job_type.name in ('csv_record_validation', 'validation'):
            job_type_name = job.job_type.name
        else:
            validation_error_type = ValidationError.jobError
            writeFileError(job_id, None, validation_error_type)
            raise ResponseException(
                'Job ID {} is not a validation job (job type is {})'.format(
                    job_id, job.job_type.name), StatusCode.CLIENT_ERROR, None,
                validation_error_type)

        # set job status to running and do validations
        mark_job_status(job_id, "running")
        if job_type_name == 'csv_record_validation':
            self.runValidation(job)
        elif job_type_name == 'validation':
            self.runCrossValidation(job)
        else:
            raise ResponseException("Bad job type for validator",
                                    StatusCode.INTERNAL_ERROR)

        return JsonResponse.create(StatusCode.OK,
                                   {"message": "Validation complete"})
def start_a_generation(job, start_date, end_date, agency_code):
    """ Validates the start and end dates of the generation and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_code: Agency code for A file generations
    """
    if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
        raise ResponseException("Start or end date cannot be parsed into a date of format MM/DD/YYYY",
                                StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    mark_job_status(job.job_id, "waiting")

    file_type = job.file_type.letter_name
    log_data = {'message': 'Sending {} file generation job {} to Validator in SQS'.format(file_type, job.job_id),
                'message_type': 'BrokerInfo', 'job_id': job.job_id, 'file_type': file_type}
    logger.info(log_data)

    # Set SQS message attributes
    message_attr = {'agency_code': {'DataType': 'String', 'StringValue': agency_code}}

    # Add job_id to the SQS job queue
    queue = sqs_queue()
    msg_response = queue.send_message(MessageBody=str(job.job_id), MessageAttributes=message_attr)

    log_data['message'] = 'SQS message response: {}'.format(msg_response)
    logger.debug(log_data)
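
A minimal consumer-side sketch of reading this message back with boto3; the queue name is hypothetical, and message attributes only arrive when requested, as the polling loop further down does with MessageAttributeNames=['All']:

    import boto3

    queue = boto3.resource('sqs').get_queue_by_name(QueueName='validator-queue')  # hypothetical name
    for message in queue.receive_messages(WaitTimeSeconds=10, MessageAttributeNames=['All']):
        job_id = int(message.body)  # the body carries str(job.job_id)
        attrs = message.message_attributes or {}
        agency_code = attrs.get('agency_code', {}).get('StringValue')
        message.delete()  # remove the message from the queue once handled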
Example #4
def start_e_f_generation(job):
    """ Passes the Job ID for an E or F generation Job to SQS

        Args:
            job: File generation job to start
    """
    mark_job_status(job.job_id, "waiting")

    file_type = job.file_type.letter_name
    log_data = {'message': 'Sending {} file generation job {} to Validator in SQS'.format(file_type, job.job_id),
                'message_type': 'BrokerInfo', 'submission_id': job.submission_id, 'job_id': job.job_id,
                'file_type': file_type}
    logger.info(log_data)

    # Add job_id to the SQS job queue
    queue = sqs_queue()
    msg_response = queue.send_message(MessageBody=str(job.job_id),
                                      MessageAttributes={})

    log_data['message'] = 'SQS message response: {}'.format(msg_response)
    logger.debug(log_data)
    def generate_file(self, agency_code=None):
        """ Generates a file based on the FileGeneration object and updates any Jobs referencing it """
        raw_filename = (GEN_FILENAMES[self.file_type] if not self.file_generation else
                        GEN_FILENAMES[self.file_type].format(self.file_generation.agency_type))
        file_name = S3Handler.get_timestamped_filename(raw_filename)
        if self.is_local:
            file_path = "".join([CONFIG_BROKER['broker_files'], file_name])
        else:
            file_path = "".join(["None/", file_name])

        # Generate the file and upload to S3
        log_data = {
            'message': 'Finished file {} generation'.format(self.file_type),
            'message_type': 'ValidatorInfo',
            'file_type': self.file_type,
            'file_path': file_path
        }
        if self.file_generation:
            self.generate_d_file(file_path)

            log_data.update({
                'agency_code': self.file_generation.agency_code, 'agency_type': self.file_generation.agency_type,
                'start_date': self.file_generation.start_date, 'end_date': self.file_generation.end_date,
                'file_generation_id': self.file_generation.file_generation_id
            })
        elif self.job.file_type.letter_name in ['A', 'E', 'F']:
            log_data['job_id'] = self.job.job_id
            mark_job_status(self.job.job_id, 'running')

            if self.job.file_type.letter_name == 'A':
                if not agency_code:
                    raise ResponseException(
                        'Agency code not provided for an A file generation')

                self.generate_a_file(agency_code, file_path)
            else:
                # Call self.generate_%s_file() where %s is e or f based on the Job's file_type
                file_type_lower = self.job.file_type.letter_name.lower()
                getattr(self, 'generate_%s_file' % file_type_lower)()

            mark_job_status(self.job.job_id, 'finished')
        else:
            e = 'No FileGeneration object for D file generation.' if self.file_type in ['D1', 'D2'] else \
                'Cannot generate file for {} file type.'.format(self.file_type if self.file_type else 'empty')
            raise ResponseException(e)

        logger.info(log_data)
    def generate_from_job(self):
        """ Generates a file for a specified job """
        # Mark Job as running
        mark_job_status(self.job.job_id, 'running')

        # Ensure this is a file generation job
        job_type = self.job.job_type.name
        if job_type != 'file_upload':
            raise ResponseException(
                'Job ID {} is not a file generation job (job type is {})'.format(self.job.job_id, job_type),
                StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

        # Ensure there is an available agency_code
        if not self.agency_code:
            raise ResponseException('An agency_code must be provided to generate a file',
                                    StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

        # Retrieve any FileRequest that may have started since the Broker sent the request to SQS
        skip_generation = None
        if self.job.file_type.letter_name in ['D1', 'D2']:
            skip_generation = retrieve_cached_file_request(self.job, self.agency_type, self.agency_code, self.is_local)

        if not skip_generation:
            # Generate timestamped file names
            raw_filename = CONFIG_BROKER["".join([str(self.job.file_type.name), "_file_name"])]
            self.job.original_filename = S3Handler.get_timestamped_filename(raw_filename)
            if self.is_local:
                self.job.filename = "".join([CONFIG_BROKER['broker_files'], self.job.original_filename])
            else:
                self.job.filename = "".join([str(self.job.submission_id), "/", self.job.original_filename])
            self.sess.commit()

            # Generate the file, and upload to S3
            if self.job.file_type.letter_name in ['D1', 'D2']:
                # Update the validation Job if necessary
                update_validation_job_info(self.sess, self.job)

                self.generate_d_file()
            elif self.job.file_type.letter_name == 'A':
                self.generate_a_file()
            elif self.job.file_type.letter_name == 'E':
                self.generate_e_file()
            else:
                self.generate_f_file()

            mark_job_status(self.job.job_id, 'finished')

        logger.info({
            'message': 'Finished file {} generation'.format(self.job.file_type.letter_name),
            'message_type': 'ValidatorInfo', 'job_id': self.job.job_id, 'agency_code': self.agency_code,
            'file_type': self.job.file_type.letter_name, 'start_date': self.job.start_date,
            'end_date': self.job.end_date, 'filename': self.job.original_filename
        })
    def generate_from_job(self, job_id, agency_code):
        """ Generates a file for a specified job

            Args:
                job_id: ID of the upload Job
                agency_code: FREC or CGAC code to generate data from
        """
        mark_job_status(job_id, 'running')

        with job_context(job_id, self.is_local) as context:
            sess, job = context

            # Ensure this is a file generation job
            if job.job_type.name != 'file_upload':
                raise ResponseException(
                    'Job ID {} is not a file generation job (job type is {})'.format(job.job_id, job.job_type.name),
                    StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

            # Ensure there is an available agency_code
            if not agency_code:
                if job.submission_id:
                    agency_code = job.submission.frec_code if job.submission.frec_code else job.submission.cgac_code
                else:
                    raise ResponseException(
                        'An agency_code must be provided to generate a file',
                        StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

            # Generate timestamped file names
            old_filename = job.original_filename
            job.original_filename = S3Handler.get_timestamped_filename(
                CONFIG_BROKER["".join([str(job.file_type.name),
                                       "_file_name"])])
            if self.is_local:
                job.filename = "".join(
                    [CONFIG_BROKER['broker_files'], job.original_filename])
            else:
                job.filename = "".join(
                    [str(job.submission_id), "/", job.original_filename])

            # Generate the file and upload to S3
            if job.file_type.letter_name in ['D1', 'D2']:
                # Update the validation Job if necessary
                if job.submission_id:
                    self.update_validation_job_info(job)

                generate_d_file(sess, job, agency_code, self.is_local,
                                old_filename)
            elif job.file_type.letter_name == 'E':
                generate_e_file(sess, job, self.is_local)
            else:
                generate_f_file(sess, job, self.is_local)
    def validate_job(self, job_id):
        """ Gets file for job, validates each row, and sends valid rows to a staging table
        Args:
        request -- HTTP request containing the jobId
        Returns:
        Http response object
        """
        # Create connection to job tracker database
        sess = GlobalDB.db().session

        # Get the job
        job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
        if job is None:
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException(
                'Job ID {} not found in database'.format(job_id),
                StatusCode.CLIENT_ERROR, None, validation_error_type)

        # Make sure job's prerequisites are complete
        if not run_job_checks(job_id):
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException(
                'Prerequisites for Job ID {} are not complete'.format(job_id),
                StatusCode.CLIENT_ERROR, None, validation_error_type)

        # Make sure this is a validation job
        if job.job_type.name in ('csv_record_validation', 'validation'):
            job_type_name = job.job_type.name
        else:
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException(
                'Job ID {} is not a validation job (job type is {})'.format(
                    job_id, job.job_type.name), StatusCode.CLIENT_ERROR, None,
                validation_error_type)

        # set job status to running and do validations
        mark_job_status(job_id, "running")
        if job_type_name == 'csv_record_validation':
            self.run_validation(job)
        elif job_type_name == 'validation':
            self.run_cross_validation(job)
        else:
            raise ResponseException("Bad job type for validator",
                                    StatusCode.INTERNAL_ERROR)

        # Update last validated date
        job.last_validated = datetime.utcnow()
        sess.commit()
        return JsonResponse.create(StatusCode.OK,
                                   {"message": "Validation complete"})
@contextmanager
def job_context(job_id, is_local=True):
    """Common context for files D1, D2, E, and F generation. Handles marking the job finished and/or failed"""
    # Flask app context ensures we have access to the global g object
    with Flask(__name__).app_context():
        sess = GlobalDB.db().session
        try:
            yield sess
            logger.info({
                'message': 'Marking job {} as finished'.format(job_id),
                'message_type': 'BrokerInfo',
                'job_id': job_id
            })
            mark_job_status(job_id, "finished")
        except Exception as e:
            # logger.exception() automatically adds traceback info
            logger.exception({
                'message': 'Marking job {} as failed'.format(job_id),
                'message_type': 'BrokerException',
                'job_id': job_id,
                'exception': str(e)
            })
            job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
            if job:
                # mark job as failed
                job.error_message = str(e)
                sess.commit()
                mark_job_status(job_id, "failed")

                # ensure FileRequest from failed job is not cached
                file_request = sess.query(FileRequest).filter_by(
                    job_id=job_id).one_or_none()
                if file_request and file_request.is_cached_file:
                    file_request.is_cached_file = False
                    sess.commit()
        finally:
            file_request = sess.query(FileRequest).filter_by(
                job_id=job_id).one_or_none()
            if file_request and file_request.is_cached_file:
                # copy job data to all child FileRequests
                child_requests = sess.query(FileRequest).filter_by(
                    parent_job_id=job_id).all()
                file_type = FILE_TYPE_DICT_LETTER[
                    file_request.job.file_type_id]
                for child in child_requests:
                    copy_parent_file_request_data(sess, child.job,
                                                  file_request.job, file_type,
                                                  is_local)

            GlobalDB.close()
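
Because job_context yields the database session, call sites consume it as a context manager (generate_from_job above does exactly this). A minimal sketch of such a call site, with the E file generation call standing in for the real work:

    with job_context(job_id, is_local=True) as sess:
        job = sess.query(Job).filter_by(job_id=job_id).one()
        # any exception raised in this block marks the job as failed;
        # falling off the end marks it as finished
        generate_e_file(sess, job, is_local=True)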
def start_d_generation(job, start_date, end_date, agency_type, agency_code=None):
    """ Validates the start and end dates of the generation, updates the submission's publish status and progress (if
        its not detached generation), and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_type: Type of Agency to generate files by: "awarding" or "funding"
            agency_code: Agency code for detached D file generations

        Returns:
            SQS send_message response
    """
    if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
        raise ResponseException("Start or end date cannot be parsed into a date of format MM/DD/YYYY",
                                StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    # Update submission
    if job.submission_id:
        agency_code = update_generation_submission(sess, job)

    mark_job_status(job.job_id, "waiting")

    log_data = {'message': 'Sending {} file generation job {} to SQS'.format(job.file_type.letter_name, job.job_id),
                'message_type': 'BrokerInfo', 'submission_id': job.submission_id, 'job_id': job.job_id,
                'file_type': job.file_type.letter_name}
    logger.info(log_data)

    file_request = retrieve_cached_file_request(job, agency_type, agency_code, g.is_local)
    if file_request:
        log_data['message'] = 'No new file generated, used FileRequest with ID {}'.format(file_request.file_request_id)
        logger.info(log_data)
    else:
        # Set SQS message attributes
        message_attr = {'agency_type': {'DataType': 'String', 'StringValue': agency_type}}
        if not job.submission_id:
            message_attr['agency_code'] = {'DataType': 'String', 'StringValue': agency_code}

        # Add job_id to the SQS job queue
        queue = sqs_queue()
        msg_response = queue.send_message(MessageBody=str(job.job_id), MessageAttributes=message_attr)

        log_data['message'] = 'SQS message response: {}'.format(msg_response)
        logger.debug(log_data)
Example #11
    def handle_response_exception(error):
        """Handle exceptions explicitly raised during validation."""
        logger.error(str(error))

        job = get_current_job()
        if job:
            if job.filename is not None:
                # insert file-level error info to the database
                writeFileError(job.job_id, job.filename, error.errorType,
                               error.extraInfo)
            if error.errorType != ValidationError.jobError:
                # job passed prerequisites for validation, but an error
                # happened somewhere. Mark the job as 'invalid'
                mark_job_status(job.job_id, 'invalid')
        return JsonResponse.error(error, error.status)
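
A handler like this is typically registered on the Flask app so that any ResponseException raised while handling a request gets funneled through it. A sketch of that registration, assuming the app object is in scope (the exact registration point is not shown in the snippet):

    app.register_error_handler(ResponseException, handle_response_exception)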
    def validate_job(self, job_id):
        """ Gets file for job, validates each row, and sends valid rows to a staging table
        Args:
        request -- HTTP request containing the jobId
        Returns:
        Http response object
        """
        # Create connection to job tracker database
        sess = GlobalDB.db().session

        # Get the job
        job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
        if job is None:
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException('Job ID {} not found in database'.format(job_id), StatusCode.CLIENT_ERROR, None,
                                    validation_error_type)

        # Make sure job's prerequisites are complete
        if not run_job_checks(job_id):
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException('Prerequisites for Job ID {} are not complete'.format(job_id),
                                    StatusCode.CLIENT_ERROR, None, validation_error_type)

        # Make sure this is a validation job
        if job.job_type.name in ('csv_record_validation', 'validation'):
            job_type_name = job.job_type.name
        else:
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException(
                'Job ID {} is not a validation job (job type is {})'.format(job_id, job.job_type.name),
                StatusCode.CLIENT_ERROR, None, validation_error_type)

        # set job status to running and do validations
        mark_job_status(job_id, "running")
        if job_type_name == 'csv_record_validation':
            self.run_validation(job)
        elif job_type_name == 'validation':
            self.run_cross_validation(job)
        else:
            raise ResponseException("Bad job type for validator", StatusCode.INTERNAL_ERROR)

        # Update last validated date
        job.last_validated = datetime.utcnow()
        sess.commit()
        return JsonResponse.create(StatusCode.OK, {"message": "Validation complete"})
Example #13
    def handle_validation_exception(error):
        """Handle uncaught exceptions in validation process."""
        logger.error(str(error))

        # csv-specific errors get a different job status and response code
        if isinstance(error, (ValueError, csv.Error)):
            job_status, response_code = 'invalid', 400
        else:
            job_status, response_code = 'failed', 500
        job = get_current_job()
        if job:
            if job.filename is not None:
                writeFileError(job.job_id, job.filename,
                               ValidationError.unknownError)
            mark_job_status(job.job_id, job_status)
        return JsonResponse.error(error, response_code)
    def generate_file(self, agency_code=None):
        """ Generates a file based on the FileGeneration object and updates any Jobs referencing it """
        raw_filename = (GEN_FILENAMES[self.file_type] if not self.file_generation else
                        GEN_FILENAMES[self.file_type].format(self.file_generation.agency_type))
        file_name = S3Handler.get_timestamped_filename(raw_filename)
        if self.is_local:
            file_path = "".join([CONFIG_BROKER['broker_files'], file_name])
        else:
            file_path = "".join(["None/", file_name])

        # Generate the file and upload to S3
        log_data = {'message': 'Finished file {} generation'.format(self.file_type), 'message_type': 'ValidatorInfo',
                    'file_type': self.file_type, 'file_path': file_path}
        if self.file_generation:
            self.generate_d_file(file_path)

            log_data.update({
                'agency_code': self.file_generation.agency_code, 'agency_type': self.file_generation.agency_type,
                'start_date': self.file_generation.start_date, 'end_date': self.file_generation.end_date,
                'file_generation_id': self.file_generation.file_generation_id
            })
        elif self.job.file_type.letter_name in ['A', 'E', 'F']:
            log_data['job_id'] = self.job.job_id
            mark_job_status(self.job.job_id, 'running')

            if self.job.file_type.letter_name == 'A':
                if not agency_code:
                    raise ResponseException('Agency code not provided for an A file generation')

                self.generate_a_file(agency_code, file_path)
            else:
                # Call self.generate_%s_file() where %s is e or f based on the Job's file_type
                file_type_lower = self.job.file_type.letter_name.lower()
                getattr(self, 'generate_%s_file' % file_type_lower)()

            mark_job_status(self.job.job_id, 'finished')
        else:
            e = 'No FileGeneration object for D file generation.' if self.file_type in ['D1', 'D2'] else \
                'Cannot generate file for {} file type.'.format(self.file_type if self.file_type else 'empty')
            raise ResponseException(e)

        logger.info(log_data)
def start_e_f_generation(job):
    """ Passes the Job ID for an E or F generation Job to SQS

        Args:
            job: File generation job to start
    """
    mark_job_status(job.job_id, "waiting")

    file_type = job.file_type.letter_name
    log_data = {'message': 'Sending {} file generation job {} to Validator in SQS'.format(file_type, job.job_id),
                'message_type': 'BrokerInfo', 'submission_id': job.submission_id, 'job_id': job.job_id,
                'file_type': file_type}
    logger.info(log_data)

    # Add job_id to the SQS job queue
    queue = sqs_queue()
    msg_response = queue.send_message(MessageBody=str(job.job_id), MessageAttributes={})

    log_data['message'] = 'SQS message response: {}'.format(msg_response)
    logger.debug(log_data)
@contextmanager
def job_context(job_id):
    """Common context for file E and F generation. Handles marking the job
    finished and/or failed"""
    # Flask app context ensures we have access to the global g object
    with Flask(__name__).app_context():
        sess = GlobalDB.db().session
        try:
            yield sess
            logger.debug('Marking job as finished')
            mark_job_status(job_id, "finished")
        except Exception as e:
            # logger.exception() automatically adds traceback info
            logger.exception('Job %s failed', job_id)
            job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
            if job:
                job.error_message = str(e)
                sess.commit()
                mark_job_status(job_id, "failed")
        finally:
            GlobalDB.close()
@contextmanager
def job_context(task, job_id):
    """Common context for file E and F generation. Handles marking the job
    finished and/or failed"""
    # Flask app context ensures we have access to the global g object
    with Flask(__name__).app_context():
        sess = GlobalDB.db().session
        try:
            yield sess
            mark_job_status(job_id, "finished")
        except Exception as e:
            # logger.exception() automatically adds traceback info
            logger.exception('Job %s failed, retrying', job_id)
            try:
                raise task.retry()
            except MaxRetriesExceededError:
                logger.warning('Job %s completely failed', job_id)
                # Log the error
                job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
                if job:
                    job.error_message = str(e)
                    sess.commit()
                    mark_job_status(job_id, "failed")
def copy_parent_file_request_data(child_job, parent_job, is_local=None):
    """ Parent FileRequest job data to the child FileRequest job data.

        Args:
            child_job: Job object for the child FileRequest
            parent_job: Job object for the parent FileRequest
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    sess = GlobalDB.db().session

    # Do not edit submissions that have successfully completed
    if child_job.job_status_id == lookups.JOB_STATUS_DICT['finished']:
        return

    # Generate file path for child Job's filename
    filepath = CONFIG_BROKER['broker_files'] if is_local else "{}/".format(str(child_job.submission_id))
    filename = '{}{}'.format(filepath, parent_job.original_filename)

    # Copy parent job's data
    child_job.from_cached = True
    child_job.filename = filename
    child_job.original_filename = parent_job.original_filename
    child_job.number_of_errors = parent_job.number_of_errors
    child_job.number_of_warnings = parent_job.number_of_warnings
    child_job.error_message = parent_job.error_message

    # Change the validation job's file data when within a submission
    if child_job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == child_job.submission_id,
                                         Job.file_type_id == parent_job.file_type_id,
                                         Job.job_type_id == lookups.JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = parent_job.original_filename
    sess.commit()

    copy_file_from_parent_to_child(child_job, parent_job, is_local)

    # Mark job status last so the validation job doesn't start until everything is done
    mark_job_status(child_job.job_id, lookups.JOB_STATUS_DICT_ID[parent_job.job_status_id])
    def run_cross_validation(self, job):
        """ Cross file validation job. Test all rules with matching rule_timing.
            Run each cross-file rule and create error report.

            Args:
                job: Current job
        """
        sess = GlobalDB.db().session
        job_id = job.job_id
        # Create File Status object
        create_file_if_needed(job_id)
        # Create list of errors
        error_list = ErrorInterface()

        submission_id = job.submission_id
        bucket_name = CONFIG_BROKER['aws_bucket']
        region_name = CONFIG_BROKER['aws_region']
        job_start = datetime.now()
        logger.info(
            {
                'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job.job_id,
                'action': 'run_cross_validations',
                'start': job_start,
                'status': 'start'})
        # Delete existing cross file errors for this submission
        sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
        sess.commit()

        # get all cross file rules from db
        cross_file_rules = sess.query(RuleSql).filter_by(rule_cross_file_flag=True)

        # for each cross-file combo, run associated rules and create error report
        for c in get_cross_file_pairs():
            first_file = c[0]
            second_file = c[1]
            combo_rules = cross_file_rules.filter(or_(and_(
                RuleSql.file_id == first_file.id,
                RuleSql.target_file_id == second_file.id), and_(
                RuleSql.file_id == second_file.id,
                RuleSql.target_file_id == first_file.id)))
            # send comboRules to validator.crossValidate sql
            failures = cross_validate_sql(combo_rules.all(), submission_id, self.short_to_long_dict, first_file.id,
                                          second_file.id, job)
            # get error file name
            report_filename = self.get_file_name(report_file_name(submission_id, False, first_file.name,
                                                                  second_file.name))
            warning_report_filename = self.get_file_name(report_file_name(submission_id, True, first_file.name,
                                                                          second_file.name))

            # loop through failures to create the error report
            with self.get_writer(region_name, bucket_name, report_filename, self.crossFileReportHeaders) as writer, \
                    self.get_writer(region_name, bucket_name, warning_report_filename, self.crossFileReportHeaders) as \
                    warning_writer:
                for failure in failures:
                    if failure[9] == RULE_SEVERITY_DICT['fatal']:
                        writer.write(failure[0:7])
                    if failure[9] == RULE_SEVERITY_DICT['warning']:
                        warning_writer.write(failure[0:7])
                    error_list.record_row_error(job_id, "cross_file",
                                                failure[0], failure[3], failure[5], failure[6],
                                                failure[7], failure[8], severity_id=failure[9])
                # write the last unfinished batch
                writer.finish_batch()
                warning_writer.finish_batch()

        # write all recorded errors to database
        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)

        # mark job status as "finished"
        mark_job_status(job_id, "finished")
        job_duration = (datetime.now()-job_start).total_seconds()
        logger.info(
            {
                'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job.job_id,
                'action': 'run_cross_validations',
                'status': 'finish',
                'start': job_start,
                'duration': job_duration})
        # set number of errors and warnings for submission.
        submission = populate_submission_error_info(submission_id)
        # TODO: Remove temporary step below
        # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
        # submissions as publishable
        # Publish only if no errors are present
        if submission.number_of_errors == 0:
            submission.publishable = True
        sess.commit()

        # Mark validation complete
        mark_file_complete(job_id)
def start_d_generation(job, start_date, end_date, agency_type, agency_code=None):
    """ Validates the start and end dates of the generation, updates the submission's publish status and progress (if
        its not detached generation), and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_type: Type of Agency to generate files by: "awarding" or "funding"
            agency_code: Agency code for detached D file generations
    """
    if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
        raise ResponseException("Start or end date cannot be parsed into a date of format MM/DD/YYYY",
                                StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    # Update submission
    if job.submission_id:
        agency_code = update_generation_submission(sess, job)

    mark_job_status(job.job_id, 'waiting')

    file_generation = retrieve_cached_file_generation(job, agency_type, agency_code)
    if file_generation:
        try:
            copy_file_generation_to_job(job, file_generation, g.is_local)
        except Exception as e:
            logger.error(traceback.format_exc())

            mark_job_status(job.job_id, 'failed')
            job.error_message = str(e)
            sess.commit()
    else:
        # Create new FileGeneration and reset Jobs
        file_generation = FileGeneration(
            request_date=datetime.now().date(), start_date=job.start_date, end_date=job.end_date,
            file_type=job.file_type.letter_name, agency_code=agency_code, agency_type=agency_type, is_cached_file=True)
        sess.add(file_generation)
        sess.commit()

        try:
            job.file_generation_id = file_generation.file_generation_id
            sess.commit()
            reset_generation_jobs(sess, job)
            logger.info({'message': 'Sending new FileGeneration {} to SQS'.format(file_generation.file_generation_id),
                         'message_type': 'BrokerInfo', 'file_type': job.file_type.letter_name, 'job_id': job.job_id,
                         'submission_id': job.submission_id, 'file_generation_id': file_generation.file_generation_id})

            # Add file_generation_id to the SQS job queue
            queue = sqs_queue()
            message_attr = {"validation_type": {"DataType": "String", "StringValue": "generation"}}
            queue.send_message(MessageBody=str(file_generation.file_generation_id), MessageAttributes=message_attr)
        except Exception as e:
            logger.error(traceback.format_exc())

            mark_job_status(job.job_id, 'failed')
            job.error_message = str(e)
            file_generation.is_cached_file = False
            sess.commit()
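
On the consuming side, a worker can use the 'validation_type' attribute set above to tell generation messages (whose body is a file_generation_id) apart from plain job messages. A sketch of that dispatch inside a receive loop; process_file_generation is a hypothetical handler name:

    attrs = message.message_attributes or {}
    validation_type = attrs.get('validation_type', {}).get('StringValue')
    if validation_type == 'generation':
        # the body carries a FileGeneration ID rather than a Job ID
        process_file_generation(int(message.body))
    else:
        agency_code = attrs.get('agency_code', {}).get('StringValue')
        validator_process_job(int(message.body), agency_code)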
Example #21
def validator_process_job(job_id, agency_code, is_retry=False):
    """ Retrieves a Job based on its ID, and kicks off a validation. Handles errors by ensuring the Job (if exists) is
        no longer running.

        Args:
            job_id: ID of a Job
            agency_code: CGAC or FREC code for agency, only required for file generations by Job
            is_retry: Whether this is a retry of an earlier execution of this job. If True, cleanup is
                      performed before proceeding to retry the job

        Raises:
            Any Exceptions raised by the GenerationManager or ValidationManager, excluding those explicitly handled
    """
    if is_retry:
        if cleanup_validation(job_id):
            log_job_message(logger=logger,
                            message="Attempting a retry of {} after successful retry-cleanup.".format(
                                inspect.stack()[0][3]),
                            job_id=job_id, is_debug=True)
        else:
            log_job_message(logger=logger,
                            message="Retry of {} found to be not necessary after cleanup. "
                                    "Returning from job with success.".format(inspect.stack()[0][3]),
                            job_id=job_id, is_debug=True)
            return

    sess = GlobalDB.db().session
    job = None

    try:
        # Get the job
        job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
        if job is None:
            validation_error_type = ValidationError.jobError
            write_file_error(job_id, None, validation_error_type)
            raise ResponseException(
                'Job ID {} not found in database'.format(job_id),
                StatusCode.CLIENT_ERROR, None, validation_error_type)

        mark_job_status(job_id, 'ready')

        # We can either validate or generate a file based on Job ID
        if job.job_type.name == 'file_upload':
            # Generate A, E, or F file
            file_generation_manager = FileGenerationManager(sess, g.is_local, job=job)
            file_generation_manager.generate_file(agency_code)
        else:
            # Run validations
            validation_manager = ValidationManager(
                g.is_local, CONFIG_SERVICES['error_report_path'])
            validation_manager.validate_job(job.job_id)

    except (ResponseException, csv.Error, UnicodeDecodeError, ValueError) as e:
        # Handle exceptions explicitly raised during validation
        error_data = {
            'message': 'An exception occurred in the Validator',
            'message_type': 'ValidatorInfo',
            'job_id': job_id,
            'traceback': traceback.format_exc()
        }

        if job:
            error_data.update({
                'submission_id': job.submission_id,
                'file_type': job.file_type.name
            })
            logger.error(error_data)

            sess.refresh(job)
            job.error_message = str(e)
            if job.filename is not None:
                error_type = ValidationError.unknownError
                if isinstance(e, UnicodeDecodeError):
                    error_type = ValidationError.encodingError
                elif isinstance(e, ResponseException):
                    error_type = e.errorType

                write_file_error(job.job_id, job.filename, error_type)

            mark_job_status(job.job_id, 'invalid')
        else:
            logger.error(error_data)
            raise e

    except Exception as e:
        # Log uncaught exceptions and fail the Job
        error_data = {
            'message': 'An unhandled exception occurred in the Validator',
            'message_type': 'ValidatorInfo',
            'job_id': job_id,
            'traceback': traceback.format_exc()
        }
        if job:
            error_data.update({
                'submission_id': job.submission_id,
                'file_type': job.file_type.name
            })
        logger.error(error_data)

        # Try to mark the Job as failed, but continue raising the original Exception if not possible
        try:
            mark_job_status(job_id, 'failed')

            sess.refresh(job)
            job.error_message = str(e)
            sess.commit()
        except:
            pass

        raise e
    def run_cross_validation(self, job):
        """ Cross file validation job. Test all rules with matching rule_timing.
            Run each cross-file rule and create error report.

            Args:
                job: Current job
        """
        sess = GlobalDB.db().session
        job_id = job.job_id
        # Create File Status object
        create_file_if_needed(job_id)
        # Create list of errors
        error_list = ErrorInterface()

        submission_id = job.submission_id
        job_start = datetime.now()
        logger.info({
            'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job.job_id,
            'action': 'run_cross_validations',
            'start': job_start,
            'status': 'start'
        })
        # Delete existing cross file errors for this submission
        sess.query(ErrorMetadata).filter(
            ErrorMetadata.job_id == job_id).delete()
        sess.commit()

        # get all cross file rules from db
        cross_file_rules = sess.query(RuleSql).filter_by(
            rule_cross_file_flag=True)

        # for each cross-file combo, run associated rules and create error report
        for c in get_cross_file_pairs():
            first_file = c[0]
            second_file = c[1]
            combo_rules = cross_file_rules.filter(
                or_(
                    and_(RuleSql.file_id == first_file.id,
                         RuleSql.target_file_id == second_file.id),
                    and_(RuleSql.file_id == second_file.id,
                         RuleSql.target_file_id == first_file.id)))

            # get error file name/path
            error_file_name = report_file_name(submission_id, False,
                                               first_file.name,
                                               second_file.name)
            error_file_path = "".join(
                [CONFIG_SERVICES['error_report_path'], error_file_name])
            warning_file_name = report_file_name(submission_id, True,
                                                 first_file.name,
                                                 second_file.name)
            warning_file_path = "".join(
                [CONFIG_SERVICES['error_report_path'], warning_file_name])

            # open error report and gather failed rules within it
            with open(error_file_path, 'w', newline='') as error_file,\
                    open(warning_file_path, 'w', newline='') as warning_file:
                error_csv = csv.writer(error_file,
                                       delimiter=',',
                                       quoting=csv.QUOTE_MINIMAL,
                                       lineterminator='\n')
                warning_csv = csv.writer(warning_file,
                                         delimiter=',',
                                         quoting=csv.QUOTE_MINIMAL,
                                         lineterminator='\n')

                # write headers to file
                error_csv.writerow(self.crossFileReportHeaders)
                warning_csv.writerow(self.crossFileReportHeaders)

                # send comboRules to validator.crossValidate sql
                current_cols_short_to_long = self.short_to_long_dict[
                    first_file.id].copy()
                current_cols_short_to_long.update(
                    self.short_to_long_dict[second_file.id].copy())
                cross_validate_sql(combo_rules.all(), submission_id,
                                   current_cols_short_to_long, first_file.id,
                                   second_file.id, job, error_csv, warning_csv,
                                   error_list, job_id)
            # error_file and warning_file are closed automatically by the with block

            # stream file to S3 when not local
            if not self.is_local:
                # stream error file
                with open(error_file_path, 'rb') as csv_file:
                    with smart_open.smart_open(
                            S3Handler.create_file_path(
                                self.get_file_name(error_file_name)),
                            'w') as writer:
                        while True:
                            chunk = csv_file.read(CHUNK_SIZE)
                            if chunk:
                                writer.write(chunk)
                            else:
                                break
                os.remove(error_file_path)

                # stream warning file
                with open(warning_file_path, 'rb') as warning_csv_file:
                    with smart_open.smart_open(
                            S3Handler.create_file_path(
                                self.get_file_name(warning_file_name)),
                            'w') as warning_writer:
                        while True:
                            chunk = warning_csv_file.read(CHUNK_SIZE)
                            if chunk:
                                warning_writer.write(chunk)
                            else:
                                break
                os.remove(warning_file_path)

        # write all recorded errors to database
        error_list.write_all_row_errors(job_id)
        # Update error info for submission
        populate_job_error_info(job)

        # mark job status as "finished"
        mark_job_status(job_id, "finished")
        job_duration = (datetime.now() - job_start).total_seconds()
        logger.info({
            'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job.job_id,
            'action': 'run_cross_validations',
            'status': 'finish',
            'start': job_start,
            'duration': job_duration
        })
        # set number of errors and warnings for submission.
        submission = populate_submission_error_info(submission_id)
        # TODO: Remove temporary step below
        # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
        # submissions as publishable
        # Publish only if no errors are present
        if submission.number_of_errors == 0:
            submission.publishable = True
        sess.commit()

        # Mark validation complete
        mark_file_complete(job_id)
Example #23
def copy_file_generation_to_job(job, file_generation, is_local):
    """ Copy cached FileGeneration data to a Job requesting a file.

        Args:
            job: Job object to copy the data to
            file_generation: Cached FileGeneration object to copy the data from
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    sess = GlobalDB.db().session
    log_data = {'message': 'Copying FileGeneration {} data to Job {}'.format(file_generation.file_generation_id,
                                                                             job.job_id),
                'message_type': 'BrokerInfo', 'job_id': job.job_id, 'file_type': job.file_type.name,
                'file_generation_id': file_generation.file_generation_id}
    logger.info(log_data)

    # Do not edit submissions that have already successfully completed
    sess.refresh(job)
    if job.job_status_id == lookups.JOB_STATUS_DICT['finished']:
        return

    job.file_generation_id = file_generation.file_generation_id

    # File is still being generated, just mark the FileGeneration ID in the Job and wait
    # FileGeneration will update all child Jobs when it finishes
    if not file_generation.file_path:
        sess.commit()
        return

    # Generate file path for child Job's filename
    filepath = CONFIG_BROKER['broker_files'] if g.is_local else "{}/".format(
        str(job.submission_id))
    original_filename = file_generation.file_path.split('/')[-1]
    filename = '{}{}'.format(filepath, original_filename)

    # Copy parent job's data
    job.filename = filename
    job.original_filename = original_filename
    job.number_of_errors = 0
    job.number_of_warnings = 0

    # Change the validation job's file data when within a submission
    if job.submission_id is not None:
        val_job = sess.query(Job).filter(
            Job.submission_id == job.submission_id,
            Job.file_type_id == job.file_type_id, Job.job_type_id ==
            lookups.JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = original_filename

        # Copy the data to the Submission's bucket
        if not g.is_local and file_generation.file_path != job.filename:
            # Check to see if the same file exists in the child bucket
            s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
            bucket = CONFIG_BROKER['aws_bucket']
            response = s3.list_objects_v2(Bucket=bucket, Prefix=job.filename)
            for obj in response.get('Contents', []):
                if obj['Key'] == job.filename:
                    # The file already exists in this location
                    log_data['message'] = '{} file already exists in this location: {}; not overwriting.'.format(
                        job.file_type.name, job.filename)
                    logger.info(log_data)
                    mark_job_status(job.job_id, 'finished')
                    return

            S3Handler.copy_file(bucket, bucket, file_generation.file_path,
                                job.filename)
    sess.commit()

    # Mark Job status last so the validation job doesn't start until everything is done
    mark_job_status(job.job_id, 'finished')
    def runCrossValidation(self, job):
        """ Cross file validation job, test all rules with matching rule_timing """
        sess = GlobalDB.db().session
        job_id = job.job_id
        # Create File Status object
        createFileIfNeeded(job_id)
        error_list = ErrorInterface()

        submission_id = job.submission_id
        bucketName = CONFIG_BROKER['aws_bucket']
        regionName = CONFIG_BROKER['aws_region']
        _exception_logger.info(
            'VALIDATOR_INFO: Beginning runCrossValidation on submission_id: '
            '%s', submission_id)

        # Delete existing cross file errors for this submission
        sess.query(ErrorMetadata).filter(
            ErrorMetadata.job_id == job_id).delete()
        sess.commit()

        # get all cross file rules from db
        crossFileRules = sess.query(RuleSql).filter(
            RuleSql.rule_cross_file_flag == True)

        # for each cross-file combo, run associated rules and create error report
        for c in get_cross_file_pairs():
            first_file = c[0]
            second_file = c[1]
            comboRules = crossFileRules.filter(
                or_(
                    and_(RuleSql.file_id == first_file.id,
                         RuleSql.target_file_id == second_file.id),
                    and_(RuleSql.file_id == second_file.id,
                         RuleSql.target_file_id == first_file.id)))
            # send comboRules to validator.crossValidate sql
            failures = Validator.crossValidateSql(comboRules.all(),
                                                  submission_id,
                                                  self.short_to_long_dict)
            # get error file name
            reportFilename = self.getFileName(
                get_cross_report_name(submission_id, first_file.name,
                                      second_file.name))
            warningReportFilename = self.getFileName(
                get_cross_warning_report_name(submission_id, first_file.name,
                                              second_file.name))

            # loop through failures to create the error report
            with self.getWriter(regionName, bucketName, reportFilename, self.crossFileReportHeaders) as writer, \
                 self.getWriter(regionName, bucketName, warningReportFilename, self.crossFileReportHeaders) as warningWriter:
                for failure in failures:
                    if failure[9] == RULE_SEVERITY_DICT['fatal']:
                        writer.write(failure[0:7])
                    if failure[9] == RULE_SEVERITY_DICT['warning']:
                        warningWriter.write(failure[0:7])
                    error_list.recordRowError(job_id,
                                              "cross_file",
                                              failure[0],
                                              failure[3],
                                              failure[5],
                                              failure[6],
                                              failure[7],
                                              failure[8],
                                              severity_id=failure[9])
                writer.finishBatch()
                warningWriter.finishBatch()

        error_list.writeAllRowErrors(job_id)
        mark_job_status(job_id, "finished")
        _exception_logger.info(
            'VALIDATOR_INFO: Completed runCrossValidation on submission_id: '
            '%s', submission_id)
        submission = sess.query(Submission).filter_by(
            submission_id=submission_id).one()
        # Update error info for submission
        submission.number_of_errors = sumNumberOfErrorsForJobList(
            submission_id)
        submission.number_of_warnings = sumNumberOfErrorsForJobList(
            submission_id, errorType="warning")
        # TODO: Remove temporary step below
        # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their submissions
        # as publishable
        # Publish only if no errors are present
        if submission.number_of_errors == 0:
            submission.publishable = True
        sess.commit()

        # Mark validation complete
        markFileComplete(job_id)
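# Illustrative restatement of the severity routing in runCrossValidation above:
# each failure row carries a severity id at index 9, and its first seven fields
# go to either the error or the warning report. The severity ids here mirror
# RULE_SEVERITY_DICT in the code above but are assumptions, not a documented
# interface.
import csv

SEVERITY_WARNING, SEVERITY_FATAL = 1, 2  # assumed ids

def route_failures(failures, error_csv_path, warning_csv_path):
    with open(error_csv_path, 'w', newline='') as error_file, \
            open(warning_csv_path, 'w', newline='') as warning_file:
        error_writer = csv.writer(error_file)
        warning_writer = csv.writer(warning_file)
        for failure in failures:
            if failure[9] == SEVERITY_FATAL:
                error_writer.writerow(failure[0:7])
            elif failure[9] == SEVERITY_WARNING:
                warning_writer.writerow(failure[0:7])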
def run_app():
    """Run the application."""
    app = create_app()

    # This is for DataDog (Do Not Delete)
    if USE_DATADOG:
        TraceMiddleware(app,
                        tracer,
                        service="broker-dd",
                        distributed_tracing=False)

    with app.app_context():
        current_app.debug = CONFIG_SERVICES['debug']
        local = CONFIG_BROKER['local']
        g.is_local = local
        error_report_path = CONFIG_SERVICES['error_report_path']
        current_app.config.from_object(__name__)

        # Create connection to job tracker database
        sess = GlobalDB.db().session

        # Future: Override config w/ environment variable, if set
        current_app.config.from_envvar('VALIDATOR_SETTINGS', silent=True)

        queue = sqs_queue()
        messages = []

        logger.info("Starting SQS polling")
        while True:
            # Set current_message to None before every loop to ensure it's never set to the previous message
            current_message = None
            try:
                # Grabs one (or more) messages from the queue
                messages = queue.receive_messages(
                    WaitTimeSeconds=10, MessageAttributeNames=['All'])
                for message in messages:
                    logger.info("Message received: %s", message.body)

                    # Retrieve the job_id from the message body
                    current_message = message
                    g.job_id = message.body
                    mark_job_status(g.job_id, "ready")

                    # Get the job
                    job = sess.query(Job).filter_by(
                        job_id=g.job_id).one_or_none()
                    if job is None:
                        validation_error_type = ValidationError.jobError
                        write_file_error(g.job_id, None, validation_error_type)
                        raise ResponseException(
                            'Job ID {} not found in database'.format(g.job_id),
                            StatusCode.CLIENT_ERROR, None,
                            validation_error_type)

                    # We have two major functionalities in the Validator: validation and file generation
                    if (not job.file_type or job.file_type.letter_name in ['A', 'B', 'C', 'FABS']
                            or job.job_type.name != 'file_upload') and job.submission_id:
                        # Run validations
                        validation_manager = ValidationManager(
                            local, error_report_path)
                        validation_manager.validate_job(job.job_id)
                    else:
                        # Retrieve the agency code data from the message attributes
                        msg_attr = current_message.message_attributes
                        agency_code = msg_attr['agency_code']['StringValue'] if msg_attr and \
                            msg_attr.get('agency_code') else None
                        agency_type = msg_attr['agency_type']['StringValue'] if msg_attr and \
                            msg_attr.get('agency_type') else None

                        file_generation_manager = FileGenerationManager(
                            job, agency_code, agency_type, local)
                        file_generation_manager.generate_from_job()
                        sess.commit()
                        sess.refresh(job)

                    # Delete from SQS once processed
                    message.delete()

            except ResponseException as e:
                # Handle exceptions explicitly raised during validation.
                logger.error(traceback.format_exc())

                job = get_current_job()
                if job:
                    if job.filename is not None:
                        # Insert file-level error info to the database
                        write_file_error(job.job_id, job.filename, e.errorType,
                                         e.extraInfo)
                    if e.errorType != ValidationError.jobError:
                        # Job passed prerequisites for validation but an error happened somewhere: mark job as 'invalid'
                        mark_job_status(job.job_id, 'invalid')
                        if current_message:
                            if e.errorType in [
                                    ValidationError.rowCountError,
                                    ValidationError.headerError,
                                    ValidationError.fileTypeError
                            ]:
                                current_message.delete()
            except Exception as e:
                # Handle uncaught exceptions in validation process.
                logger.error(traceback.format_exc())

                # csv-specific errors get a different job status and response code
                if isinstance(e, (ValueError, csv.Error, UnicodeDecodeError)):
                    job_status = 'invalid'
                else:
                    job_status = 'failed'
                job = get_current_job()
                if job:
                    if job.filename is not None:
                        error_type = ValidationError.unknownError
                        if isinstance(e, UnicodeDecodeError):
                            error_type = ValidationError.encodingError
                            # TODO Is this really the only case where the message should be deleted?
                            if current_message:
                                current_message.delete()
                        write_file_error(job.job_id, job.filename, error_type)
                    mark_job_status(job.job_id, job_status)
            finally:
                GlobalDB.close()
                # Set visibility to 0 so that another attempt can be made to process in SQS immediately,
                # instead of waiting for the timeout window to expire
                for message in messages:
                    try:
                        message.change_visibility(VisibilityTimeout=0)
                    except ClientError:
                        # Deleted messages will throw errors, which is fine because they are handled
                        pass
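# Stripped-down sketch of the polling pattern run_app uses: long-poll the
# queue, process each message, delete it only once fully processed, and reset
# visibility on failure so another attempt can happen immediately. The queue
# name and the process_job hook are illustrative, not part of the code above.
import boto3
from botocore.exceptions import ClientError

def poll_once(process_job, queue_name='validator-queue'):
    queue = boto3.resource('sqs').get_queue_by_name(QueueName=queue_name)
    messages = queue.receive_messages(WaitTimeSeconds=10,
                                      MessageAttributeNames=['All'])
    for message in messages:
        try:
            process_job(message.body, message.message_attributes)
            message.delete()  # only delete once fully processed
        except Exception:
            try:
                # make the message immediately visible for another attempt
                message.change_visibility(VisibilityTimeout=0)
            except ClientError:
                pass  # already-deleted messages raise here, which is fine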
def generate_detached_file(file_type, cgac_code, frec_code, start, end,
                           quarter, agency_type):
    """ Start a file generation job for the specified file type not connected to a submission

        Args:
            file_type: type of file to be generated
            cgac_code: the code of a CGAC agency if generating for a CGAC agency
            frec_code: the code of a FREC agency if generating for a FREC agency
            start: start date in a string, formatted MM/DD/YYYY
            end: end date in a string, formatted MM/DD/YYYY
            quarter: quarter to generate for, formatted Q#/YYYY
            agency_type: The type of agency (awarding or funding) to generate the file for

        Returns:
            JsonResponse object with keys job_id, status, file_type, url, message, start, and end.

        Raises:
            ResponseException: if the start and end strings cannot be parsed into dates
    """
    # Make sure it's a valid request
    if not cgac_code and not frec_code:
        return JsonResponse.error(
            ValueError(
                "Detached file generation requires CGAC or FR Entity Code"),
            StatusCode.CLIENT_ERROR)

    if file_type in ['D1', 'D2']:
        # Make sure we have a start and end date for D1/D2 generation
        if not start or not end:
            return JsonResponse.error(
                ValueError(
                    "Must have a start and end date for D file generation."),
                StatusCode.CLIENT_ERROR)
        # Check if date format is MM/DD/YYYY
        if not (StringCleaner.is_date(start) and StringCleaner.is_date(end)):
            raise ResponseException(
                'Start or end date cannot be parsed into a date',
                StatusCode.CLIENT_ERROR)

        if agency_type not in ('awarding', 'funding'):
            return JsonResponse.error(
                ValueError("agency_type must be either awarding or funding."),
                StatusCode.CLIENT_ERROR)
    else:
        # Make sure we have a quarter for A file generation
        if not quarter:
            return JsonResponse.error(
                ValueError("Must have a quarter for A file generation."),
                StatusCode.CLIENT_ERROR)

        try:
            start, end = generic_helper.quarter_to_dates(quarter)
        except ResponseException as e:
            return JsonResponse.error(e, StatusCode.CLIENT_ERROR)

    # Add job info
    file_type_name = lookups.FILE_TYPE_DICT_LETTER_NAME[file_type]
    new_job = generation_helper.add_generation_job_info(
        file_type_name=file_type_name, start_date=start, end_date=end)

    agency_code = frec_code if frec_code else cgac_code
    log_data = {
        'message': 'Starting detached {} file generation'.format(file_type),
        'message_type': 'BrokerInfo',
        'job_id': new_job.job_id,
        'file_type': file_type,
        'agency_code': agency_code,
        'start_date': start,
        'end_date': end
    }
    logger.info(log_data)

    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(new_job,
                                                 start,
                                                 end,
                                                 agency_type,
                                                 agency_code=agency_code)
        else:
            generation_helper.start_a_generation(new_job, start, end,
                                                 agency_code)
    except Exception as e:
        mark_job_status(new_job.job_id, 'failed')
        new_job.error_message = str(e)
        GlobalDB.db().session.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as check generation route
    return check_detached_generation(new_job.job_id)
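# Hedged sketch of what a quarter_to_dates helper plausibly does with the
# "Q#/YYYY" input above, assuming federal fiscal quarters (Q1 runs Oct 1
# through Dec 31 of the prior calendar year). The real generic_helper
# implementation may differ.
def quarter_to_dates(quarter):
    q, year = quarter.upper().lstrip('Q').split('/')
    q, year = int(q), int(year)
    if not 1 <= q <= 4:
        raise ValueError('Quarter must be Q1-Q4')
    cal_year = year - 1 if q == 1 else year  # fiscal Q1 is in the prior calendar year
    start_month = [10, 1, 4, 7][q - 1]
    end_month, end_day = [(12, 31), (3, 31), (6, 30), (9, 30)][q - 1]
    return ('{:02}/01/{}'.format(start_month, cal_year),
            '{:02}/{:02}/{}'.format(end_month, end_day, cal_year))

# e.g. quarter_to_dates('Q1/2018') -> ('10/01/2017', '12/31/2017')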
def copy_file_generation_to_job(job, file_generation, is_local):
    """ Copy cached FileGeneration data to a Job requesting a file.

        Args:
            job: Job object to copy the data to
            file_generation: Cached FileGeneration object to copy the data from
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    sess = GlobalDB.db().session
    log_data = {
        'message': 'Copying FileGeneration {} data to Job {}'.format(file_generation.file_generation_id, job.job_id),
        'message_type': 'BrokerInfo', 'job_id': job.job_id, 'file_type': job.file_type.name,
        'file_generation_id': file_generation.file_generation_id}
    logger.info(log_data)

    # Do not edit submissions that have already successfully completed
    sess.refresh(job)
    if job.job_status_id == lookups.JOB_STATUS_DICT['finished']:
        return

    job.file_generation_id = file_generation.file_generation_id

    # File is still being generated, just mark the FileGeneration ID in the Job and wait
    # FileGeneration will update all child Jobs when it finishes
    if not file_generation.file_path:
        sess.commit()
        return

    # Generate file path for child Job's filename
    filepath = CONFIG_BROKER['broker_files'] if is_local else "{}/".format(str(job.submission_id))
    original_filename = file_generation.file_path.split('/')[-1]
    filename = '{}{}'.format(filepath, original_filename)

    # Copy parent job's data
    job.filename = filename
    job.original_filename = original_filename
    job.number_of_errors = 0
    job.number_of_warnings = 0

    # Change the validation job's file data when within a submission
    if job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == job.submission_id,
                                         Job.file_type_id == job.file_type_id,
                                         Job.job_type_id == lookups.JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = original_filename

        # Copy the data to the Submission's bucket
        if not is_local and file_generation.file_path != job.filename:
            # Check to see if the same file exists in the child bucket
            s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
            bucket = CONFIG_BROKER['aws_bucket']
            response = s3.list_objects_v2(Bucket=bucket, Prefix=job.filename)
            for obj in response.get('Contents', []):
                if obj['Key'] == job.filename:
                    # The file already exists in this location
                    log_data['message'] = '{} file already exists in this location: {}; not overwriting.'.format(
                        job.file_type.name, job.filename)
                    logger.info(log_data)
                    mark_job_status(job.job_id, 'finished')
                    return

            S3Handler.copy_file(bucket, bucket, file_generation.file_path, job.filename)
    sess.commit()

    # Mark Job status last so the validation job doesn't start until everything is done
    mark_job_status(job.job_id, 'finished')
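# Tiny sketch of the filename derivation above: keep the generated file's base
# name but root it under the requesting submission's path, or under a local
# directory when running locally. The local_dir default is illustrative.
def child_filename(file_path, submission_id, is_local, local_dir='broker_files/'):
    original_filename = file_path.split('/')[-1]
    prefix = local_dir if is_local else '{}/'.format(submission_id)
    return prefix + original_filename, original_filename

# e.g. child_filename('gen/1234/d1_file.csv', 99, False)
# -> ('99/d1_file.csv', 'd1_file.csv')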
def generate_file(submission, file_type, start, end, agency_type):
    """ Start a file generation job for the specified file type within a submission

        Args:
            submission: submission for which we're generating the file
            file_type: type of file to generate the job for
            start: the start date for the file to generate
            end: the end date for the file to generate
            agency_type: The type of agency (awarding or funding) to generate the file for (only used for D file
                generation)

        Returns:
            Results of check_generation or JsonResponse object containing an error if the prerequisite job isn't
            complete.
    """
    error_message = None
    # submission is a FABS submission
    if submission.d2_submission:
        error_message = "Cannot generate files for FABS submissions."

    elif file_type in ['D1', 'D2']:
        # D file generation requires start and end date
        if not start or not end:
            error_message = "Must have a start and end date for D file generation."
        # D files can only be generated by awarding or funding agency
        elif agency_type not in ['awarding', 'funding']:
            error_message = "agency_type must be either awarding or funding for D file generation."

    # Only D1, D2, E, and F files can be generated
    elif file_type not in ['E', 'F']:
        error_message = "File type must be either D1, D2, E, or F"

    # Return any client errors
    if error_message:
        return JsonResponse.error(ValueError(error_message), StatusCode.CLIENT_ERROR)

    sess = GlobalDB.db().session
    job = sess.query(Job).filter(Job.submission_id == submission.submission_id,
                                 Job.file_type_id == lookups.FILE_TYPE_DICT_LETTER_ID[file_type],
                                 Job.job_type_id == lookups.JOB_TYPE_DICT['file_upload']).one()
    logger.info({
        'message': 'Starting {} file generation within submission {}'.format(file_type, submission.submission_id),
        'message_type': 'BrokerInfo',
        'submission_id': submission.submission_id,
        'job_id': job.job_id,
        'file_type': file_type
    })

    # Check prerequisites on upload job
    if not generation_helper.check_generation_prereqs(submission.submission_id, file_type):
        return JsonResponse.error(ResponseException("Must wait for completion of prerequisite validation job",
                                                    StatusCode.CLIENT_ERROR), StatusCode.CLIENT_ERROR)
    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(job, start, end, agency_type)
        else:
            generation_helper.start_e_f_generation(job)
    except Exception as e:
        mark_job_status(job.job_id, 'failed')
        job.error_message = str(e)
        sess.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as check generation route
    return check_generation(submission, file_type)
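# Sketch of the validate-then-queue pattern generate_file follows: accumulate a
# client error message first and short-circuit before touching the database or
# the queue if one was set. The helper name is illustrative.
def validation_error_for_generation(d2_submission, file_type, start, end, agency_type):
    if d2_submission:
        return "Cannot generate files for FABS submissions."
    if file_type in ['D1', 'D2']:
        if not start or not end:
            return "Must have a start and end date for D file generation."
        if agency_type not in ['awarding', 'funding']:
            return "agency_type must be either awarding or funding for D file generation."
    elif file_type not in ['E', 'F']:
        return "File type must be either D1, D2, E, or F"
    return None  # no client error; safe to start the generation job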
def generate_detached_file(file_type, cgac_code, frec_code, start_date, end_date, year, period, agency_type):
    """ Start a file generation job for the specified file type not connected to a submission

        Args:
            file_type: type of file to be generated
            cgac_code: the code of a CGAC agency if generating for a CGAC agency
            frec_code: the code of a FREC agency if generating for a FREC agency
            start_date: start date in a string, formatted MM/DD/YYYY
            end_date: end date in a string, formatted MM/DD/YYYY
            year: year to generate for, integer 4 digits
            period: period to generate for, integer (2-12)
            agency_type: The type of agency (awarding or funding) to generate the file for

        Returns:
            JsonResponse object with keys job_id, status, file_type, url, message, start_date, and end_date.

        Raises:
            ResponseException: if the start_date and end_date strings cannot be parsed into dates
    """
    # Make sure it's a valid request
    if not cgac_code and not frec_code:
        return JsonResponse.error(ValueError("Detached file generation requires CGAC or FR Entity Code"),
                                  StatusCode.CLIENT_ERROR)

    if file_type in ['D1', 'D2']:
        # Make sure we have a start and end date for D1/D2 generation
        if not start_date or not end_date:
            return JsonResponse.error(ValueError("Must have a start and end date for D file generation."),
                                      StatusCode.CLIENT_ERROR)

        # Check if date format is MM/DD/YYYY
        if not (StringCleaner.is_date(start_date) and StringCleaner.is_date(end_date)):
            raise ResponseException('Start or end date cannot be parsed into a date', StatusCode.CLIENT_ERROR)

        if agency_type not in ('awarding', 'funding'):
            return JsonResponse.error(ValueError("agency_type must be either awarding or funding."),
                                      StatusCode.CLIENT_ERROR)
    else:
        # Make sure both year and period are provided
        if not (year and period):
            return JsonResponse.error(ValueError("Must have a year and period for A file generation."),
                                      StatusCode.CLIENT_ERROR)

        try:
            # Convert to real start and end dates
            start_date, end_date = generic_helper.year_period_to_dates(year, period)
        except ResponseException as e:
            return JsonResponse.error(e, StatusCode.CLIENT_ERROR)

    # Add job info
    file_type_name = lookups.FILE_TYPE_DICT_LETTER_NAME[file_type]
    new_job = generation_helper.create_generation_job(file_type_name, start_date, end_date)

    agency_code = frec_code if frec_code else cgac_code
    logger.info({'message': 'Starting detached {} file generation'.format(file_type), 'message_type': 'BrokerInfo',
                 'job_id': new_job.job_id, 'file_type': file_type, 'agency_code': agency_code, 'start_date': start_date,
                 'end_date': end_date})

    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(new_job, start_date, end_date, agency_type, agency_code=agency_code)
        else:
            generation_helper.start_a_generation(new_job, start_date, end_date, agency_code)
    except Exception as e:
        mark_job_status(new_job.job_id, 'failed')
        new_job.error_message = str(e)
        GlobalDB.db().session.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as check generation route
    return check_detached_generation(new_job.job_id)
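# Hedged sketch of a year_period_to_dates helper consistent with the arguments
# above, assuming the federal fiscal calendar (period 2 = November, period 12 =
# September) and that each period maps to a single calendar month. The real
# generic_helper implementation may differ.
import calendar

def year_period_to_dates(year, period):
    if not 2 <= period <= 12:
        raise ValueError('Period must be an integer 2-12')
    month = period - 3 if period > 3 else period + 9  # fiscal month -> calendar month
    cal_year = year - 1 if month >= 10 else year
    last_day = calendar.monthrange(cal_year, month)[1]
    return ('{:02}/01/{}'.format(month, cal_year),
            '{:02}/{:02}/{}'.format(month, last_day, cal_year))

# e.g. year_period_to_dates(2018, 2) -> ('11/01/2017', '11/30/2017')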
def copy_parent_file_request_data(sess, child_job, parent_job, is_local):
    """Parent FileRequest job data to the child FileRequest job data.

        Args:
            sess: current DB session
            child_job: Job ID for the child FileRequest object
            parent_job: Job ID for the parent FileRequest object
            is_local: True if in local development, False otherwise
    """
    file_type = parent_job.file_type.letter_name
    log_data = {
        'message': 'Copying data from parent job with job_id:{}'.format(parent_job.job_id),
        'message_type': 'ValidatorInfo',
        'job_id': child_job.job_id,
        'file_type': parent_job.file_type.name
    }

    # Keep path but update file name
    filename = '{}/{}'.format(
        child_job.filename.rsplit('/', 1)[0], parent_job.original_filename)

    # Copy parent job's data
    child_job.from_cached = True
    child_job.filename = filename
    child_job.original_filename = parent_job.original_filename
    child_job.number_of_errors = parent_job.number_of_errors
    child_job.number_of_warnings = parent_job.number_of_warnings
    child_job.error_message = parent_job.error_message

    # Change the validation job's file data when within a submission
    if child_job.submission_id is not None:
        val_job = sess.query(Job).filter(
            Job.submission_id == child_job.submission_id,
            Job.file_type_id == parent_job.file_type_id,
            Job.job_type_id == JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = parent_job.original_filename
    sess.commit()

    if not is_local and parent_job.filename != child_job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        response = s3.list_objects_v2(Bucket=CONFIG_BROKER['aws_bucket'],
                                      Prefix=child_job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == child_job.filename:
                # The file already exists in this location
                log_data['message'] = 'Cached {} file CSV already exists in this location'.format(file_type)
                logger.info(log_data)
                return

        # Copy the parent file into the child's S3 location
        log_data['message'] = 'Copying the cached {} file from job {}'.format(
            file_type, parent_job.job_id)
        logger.info(log_data)
        with smart_open.smart_open(
                S3Handler.create_file_path(parent_job.filename),
                'r') as reader:
            stream_file_to_s3(child_job.filename, reader)

    # Mark job status last so the validation job doesn't start until everything is done
    mark_job_status(child_job.job_id,
                    JOB_STATUS_DICT_ID[parent_job.job_status_id])
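# Sketch of the chunked copy pattern used above: read the cached file through
# smart_open and write it to the child location in fixed-size chunks instead of
# loading the whole file into memory. CHUNK_SIZE and the URLs are illustrative.
import smart_open

CHUNK_SIZE = 1024 * 1024

def stream_copy(source_url, dest_url):
    with smart_open.open(source_url, 'rb') as reader, \
            smart_open.open(dest_url, 'wb') as writer:
        while True:
            chunk = reader.read(CHUNK_SIZE)
            if not chunk:
                break
            writer.write(chunk)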
def start_d_generation(job, start_date, end_date, agency_type, agency_code=None, file_format='csv'):
    """ Validates the start and end dates of the generation, updates the submission's publish status and progress (if
        its not detached generation), and sends the job information to SQS.

        Args:
            job: File generation job to start
            start_date: String to parse as the start date of the generation
            end_date: String to parse as the end date of the generation
            agency_type: Type of Agency to generate files by: "awarding" or "funding"
            agency_code: Agency code for detached D file generations
            file_format: determines if the file generated is a txt or a csv
    """
    if not (StringCleaner.is_date(start_date)
            and StringCleaner.is_date(end_date)):
        raise ResponseException(
            "Start or end date cannot be parsed into a date of format MM/DD/YYYY",
            StatusCode.CLIENT_ERROR)

    # Update the Job's start and end dates
    sess = GlobalDB.db().session
    job.start_date = start_date
    job.end_date = end_date
    sess.commit()

    # Update submission
    if job.submission_id:
        agency_code = update_generation_submission(sess, job)

    mark_job_status(job.job_id, 'waiting')

    file_generation = retrieve_cached_file_generation(job, agency_type,
                                                      agency_code, file_format)
    if file_generation:
        try:
            copy_file_generation_to_job(job, file_generation, g.is_local)
        except Exception as e:
            logger.error(traceback.format_exc())

            mark_job_status(job.job_id, 'failed')
            job.error_message = str(e)
            sess.commit()
    else:
        # Create new FileGeneration and reset Jobs
        file_generation = FileGeneration(request_date=datetime.now().date(),
                                         start_date=job.start_date,
                                         end_date=job.end_date,
                                         file_type=job.file_type.letter_name,
                                         agency_code=agency_code,
                                         agency_type=agency_type,
                                         file_format=file_format,
                                         is_cached_file=True)
        sess.add(file_generation)
        sess.commit()

        try:
            job.file_generation_id = file_generation.file_generation_id
            sess.commit()
            reset_generation_jobs(sess, job)
            logger.info({
                'message': 'Sending new FileGeneration {} to SQS'.format(file_generation.file_generation_id),
                'message_type': 'BrokerInfo',
                'file_type': job.file_type.letter_name,
                'job_id': job.job_id,
                'submission_id': job.submission_id,
                'file_generation_id': file_generation.file_generation_id
            })

            # Add file_generation_id to the SQS job queue
            queue = sqs_queue()
            message_attr = {
                "validation_type": {
                    "DataType": "String",
                    "StringValue": "generation"
                }
            }
            queue.send_message(MessageBody=str(file_generation.file_generation_id),
                               MessageAttributes=message_attr)
        except Exception as e:
            logger.error(traceback.format_exc())

            mark_job_status(job.job_id, 'failed')
            job.error_message = str(e)
            file_generation.is_cached_file = False
            sess.commit()
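# Companion sketch for the send side above: the file_generation_id rides in the
# message body and a message attribute flags it as a generation request so the
# consumer can route it. The queue name is illustrative.
import boto3

def queue_generation(file_generation_id, queue_name='validator-queue'):
    queue = boto3.resource('sqs').get_queue_by_name(QueueName=queue_name)
    message_attr = {
        'validation_type': {'DataType': 'String', 'StringValue': 'generation'}
    }
    return queue.send_message(MessageBody=str(file_generation_id),
                              MessageAttributes=message_attr)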
def start_generation_job(job, start_date, end_date, agency_code=None):
    """ Validates the dates for a D file generation job and passes the Job ID to SQS

        Args:
            job: File generation job to start
            start_date: Start date of the file generation
            end_date: End date of the file generation
            agency_code: Agency code for detached D file generations

        Returns:
            Tuple of boolean indicating successful start, and error response if False
    """
    sess = GlobalDB.db().session
    file_type = job.file_type.letter_name
    try:
        if file_type in ['D1', 'D2']:
            # Validate and set Job's start and end dates
            if not (StringCleaner.is_date(start_date)
                    and StringCleaner.is_date(end_date)):
                raise ResponseException(
                    "Start or end date cannot be parsed into a date",
                    StatusCode.CLIENT_ERROR)
            job.start_date = start_date
            job.end_date = end_date
            sess.commit()
        elif file_type not in ["E", "F"]:
            raise ResponseException("File type must be either D1, D2, E or F",
                                    StatusCode.CLIENT_ERROR)

    except ResponseException as e:
        return False, JsonResponse.error(e,
                                         e.status,
                                         file_type=file_type,
                                         status='failed')

    mark_job_status(job.job_id, "waiting")

    # Add job_id to the SQS job queue
    logger.info({
        'message_type': 'ValidatorInfo',
        'job_id': job.job_id,
        'message': 'Sending file generation job {} to Validator in SQS'.format(job.job_id)
    })
    queue = sqs_queue()

    message_attr = {
        'agency_code': {
            'DataType': 'String',
            'StringValue': agency_code
        }
    } if agency_code else {}
    response = queue.send_message(MessageBody=str(job.job_id),
                                  MessageAttributes=message_attr)
    logger.debug({
        'message_type': 'ValidatorInfo',
        'job_id': job.job_id,
        'message': 'Send message response: {}'.format(response)
    })

    return True, None
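# Usage sketch for the (success, error_response) contract above: callers
# short-circuit with the prepared JsonResponse when date validation fails and
# otherwise continue to their normal response. The surrounding handler is
# illustrative.
def kick_off_generation(job, start_date, end_date, agency_code=None):
    success, error_response = start_generation_job(job, start_date, end_date,
                                                   agency_code)
    if not success:
        return error_response
    # dates validated and the job is queued; build the normal response here
    return None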
    def run_validation(self, job):
        """ Run validations for specified job
        Args:
            job: Job to be validated
        Returns:
            True if successful
        """

        sess = GlobalDB.db().session
        error_list = ErrorInterface()
        job_id = job.job_id
        submission_id = job.submission_id

        row_number = 1
        file_type = job.file_type.name
        validation_start = datetime.now()

        log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(
            str(submission_id), str(job_id), file_type)
        logger.info({
            'message': 'Beginning run_validation {}'.format(log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': submission_id,
            'job_id': job_id,
            'file_type': file_type,
            'action': 'run_validations',
            'status': 'start',
            'start_time': validation_start
        })
        # Get orm model for this file
        model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]

        # Delete existing file level errors for this submission
        sess.query(ErrorMetadata).filter(
            ErrorMetadata.job_id == job_id).delete()
        sess.commit()

        # Clear existing records for this submission
        sess.query(model).filter_by(submission_id=submission_id).delete()
        sess.commit()

        # Clear existing flex fields for this job
        sess.query(FlexField).filter_by(job_id=job_id).delete()
        sess.commit()

        # If local, make the error report directory
        if self.is_local and not os.path.exists(self.directory):
            os.makedirs(self.directory)
        # Get bucket name and file name
        file_name = job.filename
        bucket_name = CONFIG_BROKER['aws_bucket']
        region_name = CONFIG_BROKER['aws_region']

        error_file_name = report_file_name(job.submission_id, False,
                                           job.file_type.name)
        error_file_path = "".join(
            [CONFIG_SERVICES['error_report_path'], error_file_name])
        warning_file_name = report_file_name(job.submission_id, True,
                                             job.file_type.name)
        warning_file_path = "".join(
            [CONFIG_SERVICES['error_report_path'], warning_file_name])

        # Create File Status object
        create_file_if_needed(job_id, file_name)

        reader = CsvReader()

        # Get file size and write to jobs table
        if CONFIG_BROKER["use_aws"]:
            file_size = S3Handler.get_file_size(file_name)
        else:
            file_size = os.path.getsize(file_name)
        job.file_size = file_size
        sess.commit()

        # Get fields for this file
        fields = sess.query(FileColumn).filter(
            FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()

        for field in fields:
            sess.expunge(field)

        csv_schema = {row.name_short: row for row in fields}

        try:
            extension = os.path.splitext(file_name)[1]
            if not extension or extension.lower() not in ['.csv', '.txt']:
                raise ResponseException("", StatusCode.CLIENT_ERROR, None,
                                        ValidationError.fileTypeError)

            # Count file rows: throws a File Level Error for non-UTF8 characters
            temp_file = open(reader.get_filename(region_name, bucket_name,
                                                 file_name),
                             encoding='utf-8')
            file_row_count = len(list(csv.reader(temp_file)))
            try:
                temp_file.close()
            except AttributeError:
                # File does not exist, and so does not need to be closed
                pass

            # Pull file and return info on whether it's using short or long col headers
            reader.open_file(region_name,
                             bucket_name,
                             file_name,
                             fields,
                             bucket_name,
                             self.get_file_name(error_file_name),
                             self.long_to_short_dict[job.file_type_id],
                             is_local=self.is_local)

            # list to keep track of rows that fail validations
            error_rows = []

            # While not done, pull one row and put it into staging table if it passes
            # the Validator

            loading_start = datetime.now()
            logger.info({
                'message': 'Beginning data loading {}'.format(log_str),
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'data_loading',
                'status': 'start',
                'start_time': loading_start
            })

            with open(error_file_path, 'w', newline='') as error_file,\
                    open(warning_file_path, 'w', newline='') as warning_file:
                error_csv = csv.writer(error_file,
                                       delimiter=',',
                                       quoting=csv.QUOTE_MINIMAL,
                                       lineterminator='\n')
                warning_csv = csv.writer(warning_file,
                                         delimiter=',',
                                         quoting=csv.QUOTE_MINIMAL,
                                         lineterminator='\n')

                required_list = None
                type_list = None
                if file_type == "fabs":
                    # create a list of all required/type labels for FABS
                    labels = sess.query(ValidationLabel).all()
                    required_list = {}
                    type_list = {}
                    for label in labels:
                        if label.label_type == "requirement":
                            required_list[label.column_name] = label.label
                        else:
                            type_list[label.column_name] = label.label

                # write headers to file
                error_csv.writerow(self.reportHeaders)
                warning_csv.writerow(self.reportHeaders)
                while not reader.is_finished:
                    row_number += 1

                    if row_number % 100 == 0:
                        elapsed_time = (datetime.now() - loading_start).total_seconds()
                        logger.info({
                            'message': 'Loading row: {} {}'.format(str(row_number), log_str),
                            'message_type': 'ValidatorInfo',
                            'submission_id': submission_id,
                            'job_id': job_id,
                            'file_type': file_type,
                            'action': 'data_loading',
                            'status': 'loading',
                            'rows_loaded': row_number,
                            'start_time': loading_start,
                            'elapsed_time': elapsed_time
                        })
                    #
                    # first phase of validations: read record and record a
                    # formatting error if there's a problem
                    #
                    (record, reduce_row, skip_row, done_reading, row_error_here, flex_cols) = \
                        self.read_record(reader, error_csv, row_number, job, fields, error_list)
                    if reduce_row:
                        row_number -= 1
                    if row_error_here:
                        error_rows.append(row_number)
                    if done_reading:
                        # Stop reading from input file
                        break
                    elif skip_row:
                        # Do not write this row to staging, but continue processing future rows
                        continue

                    #
                    # second phase of validations: do basic schema checks
                    # (e.g., require fields, field length, data type)
                    #
                    # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                    # validations, so these validations are not repeated here
                    if file_type in ["award", "award_procurement"]:
                        # Skip basic validations for D files, set as valid to trigger write to staging
                        passed_validations = True
                        valid = True
                    else:
                        if file_type == "fabs":
                            record['afa_generated_unique'] = '_'.join([
                                record['award_modification_amendme'] or '-none-',
                                record['awarding_sub_tier_agency_c'] or '-none-',
                                record['fain'] or '-none-',
                                record['uri'] or '-none-'])
                        passed_validations, failures, valid = Validator.validate(
                            record, csv_schema, file_type == "fabs",
                            required_list, type_list)
                    if valid:
                        # todo: update this logic later when we have actual validations
                        if file_type == "fabs":
                            record["is_valid"] = True

                        model_instance = model(job_id=job_id,
                                               submission_id=submission_id,
                                               valid_record=passed_validations,
                                               **record)
                        skip_row = not insert_staging_model(
                            model_instance, job, error_csv, error_list)
                        if flex_cols:
                            sess.add_all(flex_cols)
                            sess.commit()

                        if skip_row:
                            error_rows.append(row_number)
                            continue

                    if not passed_validations:
                        fatal = write_errors(
                            failures, job,
                            self.short_to_long_dict[job.file_type_id],
                            error_csv, warning_csv, row_number, error_list,
                            flex_cols)
                        if fatal:
                            error_rows.append(row_number)

                loading_duration = (datetime.now() - loading_start).total_seconds()
                logger.info({
                    'message': 'Completed data loading {}'.format(log_str),
                    'message_type': 'ValidatorInfo',
                    'submission_id': submission_id,
                    'job_id': job_id,
                    'file_type': file_type,
                    'action': 'data_loading',
                    'status': 'finish',
                    'start_time': loading_start,
                    'end_time': datetime.now(),
                    'duration': loading_duration,
                    'total_rows': row_number
                })

                if file_type in ('appropriations', 'program_activity',
                                 'award_financial'):
                    update_tas_ids(model, submission_id)
                #
                # third phase of validations: run validation rules as specified
                # in the schema guidance. these validations are sql-based.
                #
                sql_error_rows = self.run_sql_validations(
                    job, file_type, self.short_to_long_dict[job.file_type_id],
                    error_csv, warning_csv, row_number, error_list)
                error_rows.extend(sql_error_rows)

            # stream file to S3 when not local
            if not self.is_local:
                # stream error file
                with open(error_file_path, 'rb') as csv_file:
                    with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)), 'w')\
                            as writer:
                        while True:
                            chunk = csv_file.read(CHUNK_SIZE)
                            if chunk:
                                writer.write(chunk)
                            else:
                                break
                os.remove(error_file_path)

                # stream warning file
                with open(warning_file_path, 'rb') as warning_csv_file:
                    with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)), 'w')\
                            as warning_writer:
                        while True:
                            chunk = warning_csv_file.read(CHUNK_SIZE)
                            if chunk:
                                warning_writer.write(chunk)
                            else:
                                break
                os.remove(warning_file_path)

            # Calculate total number of rows in file
            # that passed validations
            error_rows_unique = set(error_rows)
            total_rows_excluding_header = row_number - 1
            valid_rows = total_rows_excluding_header - len(error_rows_unique)

            # Update fabs is_valid rows where applicable
            # Update submission to include action dates where applicable
            if file_type == "fabs":
                sess.query(DetachedAwardFinancialAssistance).\
                    filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                           DetachedAwardFinancialAssistance.submission_id == submission_id).\
                    update({"is_valid": False}, synchronize_session=False)
                sess.commit()
                min_action_date, max_action_date = get_action_dates(
                    submission_id)
                sess.query(Submission).filter(Submission.submission_id == submission_id).\
                    update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                           synchronize_session=False)

            # Ensure validated rows match initial row count
            if file_row_count != row_number:
                raise ResponseException("", StatusCode.CLIENT_ERROR, None,
                                        ValidationError.rowCountError)

            # Update job metadata
            job.number_of_rows = row_number
            job.number_of_rows_valid = valid_rows
            sess.commit()

            error_list.write_all_row_errors(job_id)
            # Update error info for submission
            populate_job_error_info(job)

            if file_type == "fabs":
                # set number of errors and warnings for detached submission
                populate_submission_error_info(submission_id)

            # Mark validation as finished in job tracker
            mark_job_status(job_id, "finished")
            mark_file_complete(job_id, file_name)
        finally:
            # Ensure the files always close
            reader.close()

            validation_duration = (datetime.now() - validation_start).total_seconds()
            logger.info({
                'message': 'Completed run_validation {}'.format(log_str),
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'run_validation',
                'status': 'finish',
                'start_time': validation_start,
                'end_time': datetime.now(),
                'duration': validation_duration
            })

        return True
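# Pure-Python restatement of the row accounting at the end of run_validation:
# de-duplicate the error row numbers, drop the header from the total, and the
# remainder is the count of valid rows.
def count_valid_rows(total_rows_including_header, error_rows):
    error_rows_unique = set(error_rows)
    total_rows_excluding_header = total_rows_including_header - 1
    return total_rows_excluding_header - len(error_rows_unique)

# e.g. count_valid_rows(101, [5, 5, 9]) -> 98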
def job_context(job_id, is_local=True):
    """ Common context for files D1, D2, E, and F generation. Handles marking the job finished and/or failed

        Args:
            job_id: the ID of the submission job
            is_local: a boolean indicating whether this is being run in a local environment or not

        Yields:
            The current DB session
    """
    # Flask context ensures we have access to flask.g
    with Flask(__name__).app_context():
        sess, job = retrieve_job_context_data(job_id)
        try:
            yield sess, job
            if not job.from_cached:
                # only mark completed jobs as done
                logger.info({
                    'message': 'Marking job {} as finished'.format(job.job_id),
                    'job_id': job.job_id,
                    'message_type': 'ValidatorInfo'
                })
                mark_job_status(job.job_id, "finished")
        except Exception as e:
            # logger.exception() automatically adds traceback info
            logger.exception({
                'message': 'Marking job {} as failed'.format(job.job_id),
                'job_id': job.job_id,
                'message_type': 'ValidatorException',
                'exception': str(e)
            })

            # mark job as failed
            job.error_message = str(e)
            mark_job_status(job.job_id, "failed")

            # ensure FileRequest from failed job is not cached
            file_request = sess.query(FileRequest).filter_by(
                job_id=job.job_id).one_or_none()
            if file_request and file_request.is_cached_file:
                file_request.is_cached_file = False

            sess.commit()

        finally:
            file_request = sess.query(FileRequest).filter_by(
                job_id=job.job_id).one_or_none()
            if file_request and file_request.is_cached_file:
                # copy job data to all child FileRequests
                child_requests = sess.query(FileRequest).filter_by(
                    parent_job_id=job.job_id).all()
                if child_requests:
                    logger.info({
                        'message': 'Copying file data from job {} to its children'.format(job.job_id),
                        'message_type': 'ValidatorInfo',
                        'job_id': job.job_id
                    })
                    for child in child_requests:
                        copy_parent_file_request_data(sess, child.job, job,
                                                      is_local)
            GlobalDB.close()
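# Usage sketch for job_context above, assuming it is wrapped with
# contextlib.contextmanager (it yields a (sess, job) pair, so the decorator
# turns it into a context manager). The generation work itself is elided.
from contextlib import contextmanager

job_context = contextmanager(job_context)  # assumed decoration

def generate_e_file(job_id, is_local=True):
    with job_context(job_id, is_local) as (sess, job):
        # produce the file and update the job here; on success the context
        # marks the job finished, on exception it marks it failed
        pass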
    def runValidation(self, job):
        """ Run validations for specified job
        Args:
            job: Job to be validated
        Returns:
            True if successful
        """

        sess = GlobalDB.db().session
        job_id = job.job_id

        error_list = ErrorInterface()

        _exception_logger.info(
            'VALIDATOR_INFO: Beginning runValidation on job_id: %s', job_id)

        submission_id = job.submission_id

        rowNumber = 1
        fileType = job.file_type.name
        # Get orm model for this file
        model = [ft.model for ft in FILE_TYPE if ft.name == fileType][0]

        # Clear existing records for this submission
        sess.query(model).filter_by(submission_id=submission_id).delete()
        sess.commit()

        # If local, make the error report directory
        if self.isLocal and not os.path.exists(self.directory):
            os.makedirs(self.directory)
        # Get bucket name and file name
        fileName = job.filename
        bucketName = CONFIG_BROKER['aws_bucket']
        regionName = CONFIG_BROKER['aws_region']

        errorFileName = self.getFileName(get_report_path(job, 'error'))
        warningFileName = self.getFileName(get_report_path(job, 'warning'))

        # Create File Status object
        createFileIfNeeded(job_id, fileName)

        reader = self.getReader()

        # Get file size and write to jobs table
        if CONFIG_BROKER["use_aws"]:
            fileSize = s3UrlHandler.getFileSize(fileName)
        else:
            fileSize = os.path.getsize(fileName)
        job.file_size = fileSize
        sess.commit()

        # Get fields for this file
        fields = sess.query(FileColumn).filter(
            FileColumn.file_id == FILE_TYPE_DICT[fileType]).all()

        for field in fields:
            sess.expunge(field)

        csvSchema = {row.name_short: row for row in fields}

        try:
            # Pull file and return info on whether it's using short or long col headers
            reader.open_file(regionName, bucketName, fileName, fields,
                             bucketName, errorFileName,
                             self.long_to_short_dict)

            # list to keep track of rows that fail validations
            errorRows = []

            # While not done, pull one row and put it into staging table if it passes
            # the Validator

            with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer, \
                 self.getWriter(regionName, bucketName, warningFileName, self.reportHeaders) as warningWriter:
                while not reader.is_finished:
                    rowNumber += 1

                    if rowNumber % 10 == 0:
                        logger.info('loading row %s', rowNumber)

                    #
                    # first phase of validations: read record and record a
                    # formatting error if there's a problem
                    #
                    (record, reduceRow, skipRow, doneReading, rowErrorHere,
                     flex_cols) = self.readRecord(reader, writer, fileType,
                                                  rowNumber, job, fields,
                                                  error_list)
                    if reduceRow:
                        rowNumber -= 1
                    if rowErrorHere:
                        errorRows.append(rowNumber)
                    if doneReading:
                        # Stop reading from input file
                        break
                    elif skipRow:
                        # Do not write this row to staging, but continue processing future rows
                        continue

                    #
                    # second phase of validations: do basic schema checks
                    # (e.g., required fields, field length, data type)
                    #
                    # D files are obtained from upstream systems (ASP and FPDS) that
                    # perform their own basic validations, so these validations are
                    # not repeated here
                    if fileType in ["award", "award_procurement"]:
                        # Skip basic validations for D files, set as valid to trigger write to staging
                        passedValidations = True
                        valid = True
                    else:
                        passedValidations, failures, valid = Validator.validate(
                            record, csvSchema)
                    if valid:
                        skipRow = self.writeToStaging(record, job,
                                                      submission_id,
                                                      passedValidations,
                                                      writer, rowNumber, model,
                                                      error_list)
                        if flex_cols:
                            self.write_to_flex(flex_cols, job_id,
                                               submission_id, fileType)

                        if skipRow:
                            errorRows.append(rowNumber)
                            continue

                    if not passedValidations:
                        if self.writeErrors(failures, job,
                                            self.short_to_long_dict, writer,
                                            warningWriter, rowNumber,
                                            error_list):
                            errorRows.append(rowNumber)

                _exception_logger.info(
                    'VALIDATOR_INFO: Loading complete on job_id: %s. '
                    'Total rows read (including header): %s', job_id, rowNumber)

                if fileType in ('appropriations', 'program_activity',
                                'award_financial'):
                    update_tas_ids(model, submission_id)
                #
                # third phase of validations: run validation rules as specified
                # in the schema guidance. these validations are sql-based.
                #
                sqlErrorRows = self.runSqlValidations(job, fileType,
                                                      self.short_to_long_dict,
                                                      writer, warningWriter,
                                                      rowNumber, error_list)
                errorRows.extend(sqlErrorRows)

                # Write unfinished batch
                writer.finishBatch()
                warningWriter.finishBatch()

            # Calculate total number of rows in file
            # that passed validations
            errorRowsUnique = set(errorRows)
            totalRowsExcludingHeader = rowNumber - 1
            validRows = totalRowsExcludingHeader - len(errorRowsUnique)

            # Update job metadata
            job.number_of_rows = rowNumber
            job.number_of_rows_valid = validRows
            sess.commit()

            error_list.writeAllRowErrors(job_id)
            # Update error info for submission
            populateSubmissionErrorInfo(submission_id)
            # Mark validation as finished in job tracker
            mark_job_status(job_id, "finished")
            markFileComplete(job_id, fileName)
        finally:
            # Ensure the file always closes
            reader.close()
            _exception_logger.info(
                'VALIDATOR_INFO: Completed L1 and SQL rule validations on '
                'job_id: %s', job_id)
        return True
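
A self-contained sketch of the row accounting in runValidation above: rowNumber starts at 1 so the header row is counted, a failing row can be recorded in errorRows more than once across the three validation phases, and the valid count subtracts the header and the deduplicated failures. The helper name is illustrative only.

def count_valid_rows(last_row_number, error_rows):
    """last_row_number includes the header row; error_rows may contain duplicates."""
    unique_errors = set(error_rows)
    total_rows_excluding_header = last_row_number - 1
    return total_rows_excluding_header - len(unique_errors)


# 10 data rows, failures reported on rows 3, 5 (twice), and 9 -> 7 valid rows
assert count_valid_rows(11, [3, 5, 5, 9]) == 7
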
def generate_detached_file(file_type, cgac_code, frec_code, start_date,
                           end_date, year, period, agency_type, file_format):
    """ Start a file generation job for the specified file type not connected to a submission

        Args:
            file_type: type of file to be generated
            cgac_code: the code of a CGAC agency if generating for a CGAC agency
            frec_code: the code of a FREC agency if generating for a FREC agency
            start_date: start date in a string, formatted MM/DD/YYYY
            end_date: end date in a string, formatted MM/DD/YYYY
            year: 4-digit integer year to generate for
            period: integer fiscal period (2-12) to generate for
            agency_type: The type of agency (awarding or funding) to generate the file for
            file_format: determines if the file generated is a txt or a csv (only used for D file generation)

        Returns:
            JsonResponse object with keys job_id, status, file_type, url, message, start_date, and end_date.

        Raises:
            ResponseException: if the start_date and end_date Strings cannot be parsed into dates
    """
    # Make sure it's a valid request
    if not cgac_code and not frec_code:
        return JsonResponse.error(
            ValueError(
                "Detached file generation requires CGAC or FR Entity Code"),
            StatusCode.CLIENT_ERROR)

    if file_type in ['D1', 'D2']:
        # Make sure we have a start and end date for D1/D2 generation
        if not start_date or not end_date:
            return JsonResponse.error(
                ValueError(
                    'Must have a start and end date for D file generation.'),
                StatusCode.CLIENT_ERROR)

        # Check if date format is MM/DD/YYYY
        if not (StringCleaner.is_date(start_date)
                and StringCleaner.is_date(end_date)):
            raise ResponseException(
                'Start or end date cannot be parsed into a date',
                StatusCode.CLIENT_ERROR)

        if agency_type not in ['awarding', 'funding']:
            return JsonResponse.error(
                ValueError('agency_type must be either awarding or funding.'),
                StatusCode.CLIENT_ERROR)

        if file_format not in ['csv', 'txt']:
            return JsonResponse.error(
                ValueError('file_format must be either csv or txt.'),
                StatusCode.CLIENT_ERROR)
    else:
        # Make sure both year and period are provided
        if not (year and period):
            return JsonResponse.error(
                ValueError(
                    "Must have a year and period for A file generation."),
                StatusCode.CLIENT_ERROR)

        try:
            # Convert to real start and end dates
            start_date, end_date = generic_helper.year_period_to_dates(
                year, period)
        except ResponseException as e:
            return JsonResponse.error(e, StatusCode.CLIENT_ERROR)

    # Add job info
    file_type_name = lookups.FILE_TYPE_DICT_LETTER_NAME[file_type]
    new_job = generation_helper.create_generation_job(file_type_name,
                                                      start_date, end_date)

    agency_code = frec_code if frec_code else cgac_code
    logger.info({
        'message': 'Starting detached {} file generation'.format(file_type),
        'message_type': 'BrokerInfo',
        'job_id': new_job.job_id,
        'file_type': file_type,
        'agency_code': agency_code,
        'start_date': start_date,
        'end_date': end_date
    })

    try:
        if file_type in ['D1', 'D2']:
            generation_helper.start_d_generation(new_job,
                                                 start_date,
                                                 end_date,
                                                 agency_type,
                                                 agency_code=agency_code,
                                                 file_format=file_format)
        else:
            generation_helper.start_a_generation(new_job, start_date, end_date,
                                                 agency_code)
    except Exception as e:
        mark_job_status(new_job.job_id, 'failed')
        new_job.error_message = str(e)
        GlobalDB.db().session.commit()
        return JsonResponse.error(e, StatusCode.INTERNAL_ERROR)

    # Return same response as check generation route
    return check_detached_generation(new_job.job_id)
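
For A file generation above, generic_helper.year_period_to_dates converts the year/period pair into a start and end date. A sketch of that conversion, assuming the standard federal fiscal calendar (period 1 = October of the prior calendar year); the broker's actual helper may differ in details.

import calendar


def year_period_to_dates_sketch(year, period):
    """Map a fiscal year and period (1-12) to MM/DD/YYYY start and end strings."""
    if not 1 <= period <= 12:
        raise ValueError('period must be an integer between 1 and 12')
    # Periods 1-3 fall in Oct-Dec of the prior calendar year
    month = period + 9 if period <= 3 else period - 3
    cal_year = year - 1 if period <= 3 else year
    last_day = calendar.monthrange(cal_year, month)[1]
    return ('{:02d}/01/{}'.format(month, cal_year),
            '{:02d}/{:02d}/{}'.format(month, last_day, cal_year))


# Fiscal year 2020, period 2 is November 2019
assert year_period_to_dates_sketch(2020, 2) == ('11/01/2019', '11/30/2019')
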
def validator_process_file_generation(file_gen_id, is_retry=False):
    """ Retrieves a FileGeneration object based on its ID, and kicks off a file generation. Handles errors by ensuring
        the FileGeneration (if exists) is no longer cached.

        Args:
            file_gen_id: ID of a FileGeneration object
            is_retry: If this is not the very first time handling execution of this job. If True, cleanup is
                      performed before proceeding to retry the job

        Raises:
            Any Exceptions raised by the FileGenerationManager
    """
    if is_retry:
        if cleanup_generation(file_gen_id):
            log_job_message(
                logger=logger,
                message="Attempting a retry of {} after successful retry-cleanup.".format(
                    inspect.stack()[0][3]),
                job_id=file_gen_id,
                is_debug=True)
        else:
            log_job_message(
                logger=logger,
                message="Retry of {} found to be not necessary after cleanup. "
                        "Returning from job with success.".format(inspect.stack()[0][3]),
                job_id=file_gen_id,
                is_debug=True)
            return

    sess = GlobalDB.db().session
    file_generation = None

    try:
        file_generation = sess.query(FileGeneration).filter_by(
            file_generation_id=file_gen_id).one_or_none()
        if file_generation is None:
            raise ResponseException(
                'FileGeneration ID {} not found in database'.format(
                    file_gen_id), StatusCode.CLIENT_ERROR, None)

        file_generation_manager = FileGenerationManager(
            sess, g.is_local, file_generation=file_generation)
        file_generation_manager.generate_file()

    except Exception as e:
        # Log uncaught exceptions and fail all Jobs referencing this FileGeneration
        error_data = {
            'message': 'An unhandled exception occurred in the Validator during file generation',
            'message_type': 'ValidatorInfo',
            'file_generation_id': file_gen_id,
            'traceback': traceback.format_exc()
        }
        if file_generation:
            error_data.update({
                'agency_code': file_generation.agency_code,
                'agency_type': file_generation.agency_type,
                'start_date': file_generation.start_date,
                'end_date': file_generation.end_date,
                'file_type': file_generation.file_type,
                'file_path': file_generation.file_path,
            })
        logger.error(error_data)

        # Try to mark the Jobs as failed, but continue raising the original Exception if not possible
        try:
            if file_generation:
                # Uncache the FileGeneration
                sess.refresh(file_generation)
                file_generation.is_cached_file = False

                # Mark all Jobs waiting on this FileGeneration as failed
                generation_jobs = sess.query(Job).filter_by(
                    file_generation_id=file_gen_id).all()
                for job in generation_jobs:
                    if job.job_status in [
                            JOB_STATUS_DICT['waiting'],
                            JOB_STATUS_DICT['ready'],
                            JOB_STATUS_DICT['running']
                    ]:
                        mark_job_status(job.job_id, 'failed')
                        sess.refresh(job)
                        job.file_generation_id = None
                        job.error_message = str(e)
                sess.commit()
        except Exception:
            # Swallow secondary failures so handling of the original exception continues
            pass

        # ResponseExceptions only occur at very specific times, and should not affect the Validator's future attempts
        # at handling messages from SQS
        if not isinstance(e, ResponseException):
            raise e
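
validator_process_file_generation is written to be safe to re-run: cleanup only happens on retries, and only non-ResponseExceptions propagate so the message can be redelivered. A hypothetical driver (not the broker's actual SQS consumer) showing that contract:

def handle_generation_message(file_gen_id, max_attempts=3):
    """Retry file generation, passing is_retry=True on every attempt after the first."""
    for attempt in range(max_attempts):
        try:
            validator_process_file_generation(file_gen_id, is_retry=attempt > 0)
            return
        except Exception:
            # Out of attempts: let the exception propagate to the caller/queue
            if attempt == max_attempts - 1:
                raise
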
    def run_validation(self, job):
        """ Run validations for specified job
        Args:
            job: Job to be validated
        Returns:
            True if successful
        """

        sess = GlobalDB.db().session
        job_id = job.job_id

        error_list = ErrorInterface()

        submission_id = job.submission_id

        row_number = 1
        file_type = job.file_type.name
        validation_start = datetime.now()

        logger.info(
            {
                'message': 'Beginning run_validation on submission_id: ' + str(submission_id) +
                ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                'message_type': 'ValidatorInfo',
                'submission_id': submission_id,
                'job_id': job_id,
                'file_type': file_type,
                'action': 'run_validations',
                'status': 'start',
                'start_time': validation_start})
        # Get orm model for this file
        model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0]

        # Delete existing file level errors for this submission
        sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
        sess.commit()

        # Clear existing records for this submission
        sess.query(model).filter_by(submission_id=submission_id).delete()
        sess.commit()

        # Clear existing flex fields for this job
        sess.query(FlexField).filter_by(job_id=job_id).delete()
        sess.commit()

        # If local, make the error report directory
        if self.isLocal and not os.path.exists(self.directory):
            os.makedirs(self.directory)
        # Get bucket name and file name
        file_name = job.filename
        bucket_name = CONFIG_BROKER['aws_bucket']
        region_name = CONFIG_BROKER['aws_region']

        error_file_name = self.get_file_name(report_file_name(job.submission_id, False, job.file_type.name))
        warning_file_name = self.get_file_name(report_file_name(job.submission_id, True, job.file_type.name))

        # Create File Status object
        create_file_if_needed(job_id, file_name)

        reader = self.get_reader()

        # Get file size and write to jobs table
        if CONFIG_BROKER["use_aws"]:
            file_size = S3Handler.get_file_size(file_name)
        else:
            file_size = os.path.getsize(file_name)
        job.file_size = file_size
        sess.commit()

        # Get fields for this file
        fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all()

        for field in fields:
            sess.expunge(field)

        csv_schema = {row.name_short: row for row in fields}

        try:
            extension = os.path.splitext(file_name)[1]
            if not extension or extension not in ['.csv', '.txt']:
                raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError)

            # Count file rows: throws a File Level Error for non-UTF8 characters
            with open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8') as temp_file:
                file_row_count = len(list(csv.reader(temp_file)))

            # Pull file and return info on whether it's using short or long col headers
            reader.open_file(region_name, bucket_name, file_name, fields, bucket_name, error_file_name,
                             self.long_to_short_dict, is_local=self.isLocal)

            # list to keep track of rows that fail validations
            error_rows = []

            # While not done, pull one row and put it into staging table if it passes
            # the Validator

            loading_start = datetime.now()
            logger.info(
                {
                    'message': 'Beginning data loading on submission_id: ' + str(submission_id) +
                    ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                    'message_type': 'ValidatorInfo',
                    'submission_id': submission_id,
                    'job_id': job_id,
                    'file_type': file_type,
                    'action': 'data_loading',
                    'status': 'start',
                    'start_time': loading_start})

            with self.get_writer(region_name, bucket_name, error_file_name, self.reportHeaders) as writer, \
                    self.get_writer(region_name, bucket_name, warning_file_name, self.reportHeaders) as warning_writer:
                while not reader.is_finished:
                    row_number += 1

                    if row_number % 100 == 0:
                        elapsed_time = (datetime.now()-loading_start).total_seconds()
                        logger.info(
                            {
                                'message': 'Loading row: ' + str(row_number) + ' on submission_id: ' +
                                str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                                'message_type': 'ValidatorInfo',
                                'submission_id': submission_id,
                                'job_id': job_id,
                                'file_type': file_type,
                                'action': 'data_loading',
                                'status': 'loading',
                                'rows_loaded': row_number,
                                'start_time': loading_start,
                                'elapsed_time': elapsed_time})
                    #
                    # first phase of validations: read record and record a
                    # formatting error if there's a problem
                    #
                    (record, reduce_row, skip_row, done_reading, row_error_here, flex_cols) = \
                        self.read_record(reader, writer, row_number, job, fields, error_list)
                    if reduce_row:
                        row_number -= 1
                    if row_error_here:
                        error_rows.append(row_number)
                    if done_reading:
                        # Stop reading from input file
                        break
                    elif skip_row:
                        # Do not write this row to staging, but continue processing future rows
                        continue

                    #
                    # second phase of validations: do basic schema checks
                    # (e.g., required fields, field length, data type)
                    #
                    # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic
                    # validations, so these validations are not repeated here
                    if file_type in ["award", "award_procurement"]:
                        # Skip basic validations for D files, set as valid to trigger write to staging
                        passed_validations = True
                        valid = True
                    else:
                        if file_type in ["detached_award"]:
                            record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + \
                                  (record['awarding_sub_tier_agency_c'] or '-none-') + \
                                  (record['fain'] or '-none-') + (record['uri'] or '-none-')
                        passed_validations, failures, valid = Validator.validate(record, csv_schema,
                                                                                 file_type in ["detached_award"])
                    if valid:
                        # todo: update this logic later when we have actual validations
                        if file_type in ["detached_award"]:
                            record["is_valid"] = True

                        model_instance = model(job_id=job_id, submission_id=submission_id,
                                               valid_record=passed_validations, **record)
                        skip_row = not insert_staging_model(model_instance, job, writer, error_list)
                        if flex_cols:
                            sess.add_all(flex_cols)
                            sess.commit()

                        if skip_row:
                            error_rows.append(row_number)
                            continue

                    if not passed_validations:
                        fatal = write_errors(failures, job, self.short_to_long_dict, writer, warning_writer,
                                             row_number, error_list)
                        if fatal:
                            error_rows.append(row_number)

                loading_duration = (datetime.now()-loading_start).total_seconds()
                logger.info(
                    {
                        'message': 'Completed data loading on submission_id: ' + str(submission_id) +
                        ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                        'message_type': 'ValidatorInfo',
                        'submission_id': submission_id,
                        'job_id': job_id,
                        'file_type': file_type,
                        'action': 'data_loading',
                        'status': 'finish',
                        'start_time': loading_start,
                        'end_time': datetime.now(),
                        'duration': loading_duration,
                        'total_rows': row_number
                    })

                if file_type in ('appropriations', 'program_activity', 'award_financial'):
                    update_tas_ids(model, submission_id)
                #
                # third phase of validations: run validation rules as specified
                # in the schema guidance. these validations are sql-based.
                #
                sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict, writer,
                                                          warning_writer, row_number, error_list)
                error_rows.extend(sql_error_rows)

                # Write unfinished batch
                writer.finish_batch()
                warning_writer.finish_batch()

            # Calculate total number of rows in file
            # that passed validations
            error_rows_unique = set(error_rows)
            total_rows_excluding_header = row_number - 1
            valid_rows = total_rows_excluding_header - len(error_rows_unique)

            # Update detached_award is_valid rows where applicable
            # Update submission to include action dates where applicable
            if file_type in ["detached_award"]:
                sess.query(DetachedAwardFinancialAssistance).\
                    filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                           DetachedAwardFinancialAssistance.submission_id == submission_id).\
                    update({"is_valid": False}, synchronize_session=False)
                sess.commit()
                min_action_date, max_action_date = get_action_dates(submission_id)
                sess.query(Submission).filter(Submission.submission_id == submission_id).\
                    update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date},
                           synchronize_session=False)

            # Ensure validated rows match initial row count
            if file_row_count != row_number:
                raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError)

            # Update job metadata
            job.number_of_rows = row_number
            job.number_of_rows_valid = valid_rows
            sess.commit()

            error_list.write_all_row_errors(job_id)
            # Update error info for submission
            populate_job_error_info(job)

            if file_type in ["detached_award"]:
                # set number of errors and warnings for detached submission
                populate_submission_error_info(submission_id)

            # Mark validation as finished in job tracker
            mark_job_status(job_id, "finished")
            mark_file_complete(job_id, file_name)
        finally:
            # Ensure the file always closes
            reader.close()

            validation_duration = (datetime.now()-validation_start).total_seconds()
            logger.info(
                {
                    'message': 'Completed run_validation on submission_id: ' + str(submission_id) +
                    ', job_id: ' + str(job_id) + ', file_type: ' + file_type,
                    'message_type': 'ValidatorInfo',
                    'submission_id': submission_id,
                    'job_id': job_id,
                    'file_type': file_type,
                    'action': 'run_validation',
                    'status': 'finish',
                    'start_time': validation_start,
                    'end_time': datetime.now(),
                    'duration': validation_duration
                })

        return True
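
The detached_award branch above derives afa_generated_unique by concatenating four fields, substituting '-none-' for empty values so blanks still yield a comparable key. A standalone sketch of that construction:

def afa_generated_unique(record):
    """Build the derived uniqueness key from a record dict, as in run_validation above."""
    parts = ['award_modification_amendme', 'awarding_sub_tier_agency_c', 'fain', 'uri']
    return ''.join((record.get(field) or '-none-') for field in parts)


assert afa_generated_unique({'award_modification_amendme': '0', 'awarding_sub_tier_agency_c': '1234',
                             'fain': None, 'uri': 'ABC'}) == '01234-none-ABC'
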
    def generate_file(self, agency_code=None):
        """ Generates a file based on the FileGeneration object and updates any Jobs referencing it """
        fillin_vals = {'timestamp': get_timestamp()}
        if self.file_generation:
            fillin_vals.update({
                'start': self.file_generation.start_date.strftime('%Y%m%d'),
                'end': self.file_generation.end_date.strftime('%Y%m%d'),
                'agency_type': self.file_generation.agency_type,
                'ext': '.{}'.format(self.file_generation.file_format),
            })
        if self.job and self.job.submission:
            # Submission Files
            fillin_vals.update({
                'submission_id': self.job.submission_id,
                'FYP': filename_fyp_sub_format(self.job.submission),
            })
            file_name = SUBMISSION_FILENAMES[self.file_type].format(
                **fillin_vals)
        else:
            # Detached Files
            if self.job and self.job.file_type.letter_name == 'A':
                period_date = self.job.end_date + relativedelta(months=3)
                fillin_vals['FYP'] = filename_fyp_format(
                    period_date.year, period_date.month, False)
            file_name = DETACHED_FILENAMES[self.file_type].format(
                **fillin_vals)
        if self.is_local:
            file_path = "".join([CONFIG_BROKER['broker_files'], file_name])
        else:
            file_path = "".join(["None/", file_name])

        # Generate the file and upload to S3
        log_data = {
            'message': 'Finished file {} generation'.format(self.file_type),
            'message_type': 'ValidatorInfo',
            'file_type': self.file_type,
            'file_path': file_path
        }
        if self.file_generation:
            self.generate_d_file(file_path)

            log_data.update({
                'agency_code': self.file_generation.agency_code,
                'agency_type': self.file_generation.agency_type,
                'start_date': self.file_generation.start_date,
                'end_date': self.file_generation.end_date,
                'file_generation_id': self.file_generation.file_generation_id
            })
        elif self.job.file_type.letter_name in ['A', 'E', 'F']:
            log_data['job_id'] = self.job.job_id
            mark_job_status(self.job.job_id, 'running')

            if self.job.file_type.letter_name == 'A':
                if not agency_code:
                    raise ResponseException(
                        'Agency code not provided for an A file generation')

                self.generate_a_file(agency_code, file_path)
            else:
                # Call self.generate_%s_file() where %s is e or f based on the Job's file_type
                file_type_lower = self.job.file_type.letter_name.lower()
                getattr(self, 'generate_%s_file' % file_type_lower)()

            mark_job_status(self.job.job_id, 'finished')
        else:
            e = 'No FileGeneration object for D file generation.' if self.file_type in ['D1', 'D2'] else \
                'Cannot generate file for {} file type.'.format(self.file_type if self.file_type else 'empty')
            raise ResponseException(e)

        logger.info(log_data)
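
generate_file above dispatches E and F generation by building a method name from the file type's letter and resolving it with getattr. A minimal illustration of that pattern; the class here is illustrative, not the broker's FileGenerationManager.

class GeneratorSketch:
    def generate_e_file(self):
        return 'E generated'

    def generate_f_file(self):
        return 'F generated'

    def generate(self, letter_name):
        # Resolve generate_<letter>_file dynamically, mirroring the getattr call above
        method = getattr(self, 'generate_%s_file' % letter_name.lower(), None)
        if method is None:
            raise ValueError('Cannot generate file for {} file type'.format(letter_name))
        return method()


assert GeneratorSketch().generate('E') == 'E generated'
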
    def run_validation(self, job):
        """ Run validations for specified job

            Args:
                job: Job to be validated

            Returns:
                True if successful
        """

        sess = GlobalDB.db().session
        self.job = job
        self.submission_id = job.submission_id
        self.file_type = job.file_type
        self.file_name = job.filename
        self.is_fabs = (self.file_type.name == 'fabs')

        # initializing processing metadata vars for a new validation
        self.reader = CsvReader()
        self.error_list = ErrorInterface()
        self.error_rows = []
        self.max_row_number = 1
        self.total_rows = 0
        self.short_rows = []
        self.long_rows = []

        validation_start = datetime.now()
        bucket_name = CONFIG_BROKER['aws_bucket']
        region_name = CONFIG_BROKER['aws_region']

        self.log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format(
            str(self.submission_id), str(self.job.job_id), self.file_type.name)
        logger.info({
            'message': 'Beginning run_validation {}'.format(self.log_str),
            'message_type': 'ValidatorInfo',
            'submission_id': self.submission_id,
            'job_id': self.job.job_id,
            'file_type': self.file_type.name,
            'action': 'run_validations',
            'status': 'start',
            'start_time': validation_start
        })
        # Get orm model for this file
        self.model = [ft.model for ft in FILE_TYPE if ft.name == self.file_type.name][0]

        # Delete existing file level errors for this submission
        sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == self.job.job_id).delete()
        sess.commit()
        # Clear existing records for this submission
        sess.query(self.model).filter_by(submission_id=self.submission_id).delete()
        sess.commit()
        # Clear existing flex fields for this job
        sess.query(FlexField).filter_by(job_id=self.job.job_id).delete()
        sess.commit()

        # If local, make the error report directory
        if self.is_local and not os.path.exists(self.directory):
            os.makedirs(self.directory)
        create_file_if_needed(self.job.job_id, self.file_name)

        # Get file size and write to jobs table
        if CONFIG_BROKER['use_aws']:
            file_size = S3Handler.get_file_size(self.file_name)
        else:
            file_size = os.path.getsize(self.file_name)
        self.job.file_size = file_size
        sess.commit()

        # Get fields for this file
        self.fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[self.file_type.name])\
            .order_by(FileColumn.daims_name.asc()).all()
        self.expected_headers, self.parsed_fields = parse_fields(sess, self.fields)
        self.csv_schema = {row.name_short: row for row in self.fields}

        try:
            # Loading data and initial validations
            self.load_file_data(sess, bucket_name, region_name)

            if self.file_type.name in ('appropriations', 'program_activity', 'award_financial'):
                update_tas_ids(self.model, self.submission_id)

            # SQL Validations
            with open(self.error_file_path, 'a', newline='') as error_file, \
                    open(self.warning_file_path, 'a', newline='') as warning_file:
                error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
                warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

                # third phase of validations: run validation rules as specified in the schema guidance. These
                # validations are sql-based.
                sql_error_rows = self.run_sql_validations(self.short_to_long_dict[self.file_type.file_type_id],
                                                          error_csv, warning_csv)
                self.error_rows.extend(sql_error_rows)

            # stream file to S3 when not local
            if not self.is_local:
                s3_resource = boto3.resource('s3', region_name=region_name)
                # stream error file
                with open(self.error_file_path, 'rb') as csv_file:
                    s3_resource.Object(bucket_name, self.get_file_name(self.error_file_name)).put(Body=csv_file)
                os.remove(self.error_file_path)

                # stream warning file
                with open(self.warning_file_path, 'rb') as warning_csv_file:
                    s3_resource.Object(bucket_name,
                                       self.get_file_name(self.warning_file_name)).put(Body=warning_csv_file)
                os.remove(self.warning_file_path)

            # Calculate total number of rows in file that passed validations
            error_rows_unique = set(self.error_rows)
            total_rows_excluding_header = self.total_rows - 1
            valid_rows = total_rows_excluding_header - len(error_rows_unique)

            # Update fabs is_valid rows where applicable
            # Update submission to include action dates where applicable
            if self.is_fabs:
                sess.query(DetachedAwardFinancialAssistance). \
                    filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique),
                           DetachedAwardFinancialAssistance.submission_id == self.submission_id). \
                    update({'is_valid': False}, synchronize_session=False)
                sess.commit()
                min_action_date, max_action_date = get_action_dates(self.submission_id)
                sess.query(Submission).filter(Submission.submission_id == self.submission_id). \
                    update({'reporting_start_date': min_action_date, 'reporting_end_date': max_action_date},
                           synchronize_session=False)

            # Update job metadata
            self.job.number_of_rows = self.total_rows
            self.job.number_of_rows_valid = valid_rows
            sess.commit()

            self.error_list.write_all_row_errors(self.job.job_id)
            # Update error info for submission
            populate_job_error_info(self.job)

            if self.is_fabs:
                # set number of errors and warnings for detached submission
                populate_submission_error_info(self.submission_id)

            # Mark validation as finished in job tracker
            mark_job_status(self.job.job_id, 'finished')
            mark_file_complete(self.job.job_id, self.file_name)

        except Exception:
            logger.error({
                'message': 'An exception occurred during validation',
                'message_type': 'ValidatorInfo',
                'submission_id': self.submission_id,
                'job_id': self.job.job_id,
                'file_type': self.file_type.name,
                'traceback': traceback.format_exc()
            })
            raise

        finally:
            # Ensure the files always close
            self.reader.close()

            validation_duration = (datetime.now()-validation_start).total_seconds()
            logger.info({
                'message': 'Completed run_validation {}'.format(self.log_str),
                'message_type': 'ValidatorInfo',
                'submission_id': self.submission_id,
                'job_id': self.job.job_id,
                'file_type': self.file_type.name,
                'action': 'run_validation',
                'status': 'finish',
                'start_time': validation_start,
                'end_time': datetime.now(),
                'duration': validation_duration
            })

        return True
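
When not local, run_validation above streams the finished report files to S3 with boto3's Object(...).put(Body=fileobj) and removes the local copy. A minimal sketch with placeholder bucket and key names:

import os

import boto3


def upload_report(local_path, bucket_name, key, region_name='us-east-1'):
    """Stream a local report file to S3, then delete the local copy."""
    s3_resource = boto3.resource('s3', region_name=region_name)
    with open(local_path, 'rb') as report_file:
        # put(Body=<file object>) streams the file without buffering it all in memory
        s3_resource.Object(bucket_name, key).put(Body=report_file)
    os.remove(local_path)
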