def stream_file_to_s3(upload_name, reader, is_certified=False):
    """ Stream file to S3

        Args:
            upload_name - file name to be used as S3 key
            reader - reader object to read data from
            is_certified - True if writing to the certified bucket, False otherwise (default False)
    """
    path, file_name = upload_name.rsplit('/', 1)
    logger.debug({
        'message': 'Streaming file to S3',
        'message_type': 'ValidatorDebug',
        'file_name': file_name if file_name else path
    })

    if is_certified:
        handler = S3Handler.create_file_path(upload_name, CONFIG_BROKER["certified_bucket"])
    else:
        handler = S3Handler.create_file_path(upload_name)

    with smart_open.smart_open(handler, 'w') as writer:
        while True:
            chunk = reader.read(CHUNK_SIZE)
            if chunk:
                writer.write(chunk)
            else:
                break
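
# Illustrative usage sketch (not part of the original module): pushing a locally generated
# report to S3 with the helper above. The local path and S3 key below are hypothetical.
def _example_stream_report(local_path='/tmp/1234/file_A_report.csv', s3_key='1234/file_A_report.csv'):
    # Open in binary mode so chunks pass through to the S3 writer unchanged
    with open(local_path, 'rb') as reader:
        stream_file_to_s3(s3_key, reader)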
def copy_file_from_parent_to_child(child_job, parent_job, is_local):
    """ Copy the file from the parent job's bucket to the child job's bucket.

        Args:
            child_job: Job object for the child FileRequest
            parent_job: Job object for the parent FileRequest
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    file_type = parent_job.file_type.letter_name
    log_data = {'message': 'Copying data from parent job with job_id:{}'.format(parent_job.job_id),
                'message_type': 'ValidatorInfo', 'job_id': child_job.job_id, 'file_type': parent_job.file_type.name}

    if not is_local:
        is_local = g.is_local
    if not is_local and parent_job.filename != child_job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        response = s3.list_objects_v2(Bucket=CONFIG_BROKER['aws_bucket'], Prefix=child_job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == child_job.filename:
                # The file already exists in this location
                log_data['message'] = 'Cached {} file CSV already exists in this location'.format(file_type)
                logger.info(log_data)
                return

        # Copy the parent file into the child's S3 location
        log_data['message'] = 'Copying the cached {} file from job {}'.format(file_type, parent_job.job_id)
        logger.info(log_data)

        S3Handler.copy_file(CONFIG_BROKER['aws_bucket'], CONFIG_BROKER['aws_bucket'], parent_job.filename,
                            child_job.filename)
def check_file_generation(job_id):
    """ Check the status of a file generation

        Args:
            job_id: upload Job ID
        Return:
            Dict with keys: job_id, status, file_type, message, url, start, end
    """
    sess = GlobalDB.db().session

    # We want to use one_or_none() here so we can see if the job is None and mark the status as invalid to
    # indicate that a status request was invoked for a job that isn't created yet
    upload_job = sess.query(Job).filter_by(job_id=job_id).one_or_none()
    response_dict = {'job_id': job_id, 'status': '', 'file_type': '', 'message': '', 'url': '#', 'size': None}

    if upload_job is None:
        response_dict['start'] = ''
        response_dict['end'] = ''
        response_dict['status'] = 'invalid'
        response_dict['message'] = 'No generation job found with the specified ID'
        return response_dict

    response_dict['file_type'] = lookups.FILE_TYPE_DICT_LETTER[upload_job.file_type_id]
    response_dict['size'] = upload_job.file_size
    response_dict['status'] = map_generate_status(sess, upload_job)
    response_dict['message'] = upload_job.error_message or ''

    # Generate the URL (or path) to the file
    if CONFIG_BROKER['use_aws'] and response_dict['status'] == 'finished' and upload_job.filename:
        path, file_name = upload_job.filename.split('/')
        response_dict['url'] = S3Handler().get_signed_url(path=path, file_name=file_name, bucket_route=None,
                                                          url_mapping=CONFIG_BROKER["submission_bucket_mapping"],
                                                          method='get_object')
    elif response_dict['status'] == 'finished' and upload_job.filename:
        response_dict['url'] = upload_job.filename

    # Only D file generations have start and end dates
    if response_dict['file_type'] in ['D1', 'D2']:
        response_dict['start'] = upload_job.start_date.strftime("%m/%d/%Y") if upload_job.start_date is not None \
            else ""
        response_dict['end'] = upload_job.end_date.strftime("%m/%d/%Y") if upload_job.end_date is not None else ""

    return response_dict
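
# For reference, a hypothetical response from check_file_generation for a finished D1 job
# (all values below are illustrative only, not taken from real data):
#
#     {
#         'job_id': 1234, 'status': 'finished', 'file_type': 'D1', 'message': '',
#         'url': 'https://.../d1_report.csv?signature=...', 'size': 1048576,
#         'start': '01/01/2017', 'end': '03/31/2017'
#     }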
def generate_file(self, agency_code=None):
    """ Generates a file based on the FileGeneration object and updates any Jobs referencing it """
    raw_filename = (GEN_FILENAMES[self.file_type] if not self.file_generation else
                    GEN_FILENAMES[self.file_type].format(self.file_generation.agency_type))
    file_name = S3Handler.get_timestamped_filename(raw_filename)
    if self.is_local:
        file_path = "".join([CONFIG_BROKER['broker_files'], file_name])
    else:
        file_path = "".join(["None/", file_name])

    # Generate the file and upload to S3
    log_data = {'message': 'Finished file {} generation'.format(self.file_type), 'message_type': 'ValidatorInfo',
                'file_type': self.file_type, 'file_path': file_path}
    if self.file_generation:
        self.generate_d_file(file_path)

        log_data.update({
            'agency_code': self.file_generation.agency_code, 'agency_type': self.file_generation.agency_type,
            'start_date': self.file_generation.start_date, 'end_date': self.file_generation.end_date,
            'file_generation_id': self.file_generation.file_generation_id
        })
    elif self.job.file_type.letter_name in ['A', 'E', 'F']:
        log_data['job_id'] = self.job.job_id
        mark_job_status(self.job.job_id, 'running')

        if self.job.file_type.letter_name == 'A':
            if not agency_code:
                raise ResponseException('Agency code not provided for an A file generation')

            self.generate_a_file(agency_code, file_path)
        else:
            # Call self.generate_%s_file() where %s is e or f based on the Job's file_type
            file_type_lower = self.job.file_type.letter_name.lower()
            getattr(self, 'generate_%s_file' % file_type_lower)()

        mark_job_status(self.job.job_id, 'finished')
    else:
        e = 'No FileGeneration object for D file generation.' if self.file_type in ['D1', 'D2'] else \
            'Cannot generate file for {} file type.'.format(self.file_type if self.file_type else 'empty')
        raise ResponseException(e)

    logger.info(log_data)
def generate_from_job(self, job_id, agency_code):
    """ Generates a file for a specified job

        Args:
            job_id: ID of the upload Job
            agency_code: FREC or CGAC code to generate data from
    """
    mark_job_status(job_id, 'running')

    with job_context(job_id, self.is_local) as context:
        sess, job = context

        # Ensure this is a file generation job
        if job.job_type.name != 'file_upload':
            raise ResponseException(
                'Job ID {} is not a file generation job (job type is {})'.format(job.job_id, job.job_type.name),
                StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

        # Ensure there is an available agency_code
        if not agency_code:
            if job.submission_id:
                agency_code = job.submission.frec_code if job.submission.frec_code else job.submission.cgac_code
            else:
                raise ResponseException('An agency_code must be provided to generate a file',
                                        StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

        # Generate timestamped file names
        old_filename = job.original_filename
        job.original_filename = S3Handler.get_timestamped_filename(
            CONFIG_BROKER["".join([str(job.file_type.name), "_file_name"])])
        if self.is_local:
            job.filename = "".join([CONFIG_BROKER['broker_files'], job.original_filename])
        else:
            job.filename = "".join([str(job.submission_id), "/", job.original_filename])

        # Generate the file and upload to S3
        if job.file_type.letter_name in ['D1', 'D2']:
            # Update the validation Job if necessary
            if job.submission_id:
                self.update_validation_job_info(job)

            generate_d_file(sess, job, agency_code, self.is_local, old_filename)
        elif job.file_type.letter_name == 'E':
            generate_e_file(sess, job, self.is_local)
        else:
            generate_f_file(sess, job, self.is_local)
def generate_from_job(self):
    """ Generates a file for a specified job """
    # Mark Job as running
    mark_job_status(self.job.job_id, 'running')

    # Ensure this is a file generation job
    job_type = self.job.job_type.name
    if job_type != 'file_upload':
        raise ResponseException(
            'Job ID {} is not a file generation job (job type is {})'.format(self.job.job_id, job_type),
            StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

    # Ensure there is an available agency_code
    if not self.agency_code:
        raise ResponseException('An agency_code must be provided to generate a file',
                                StatusCode.CLIENT_ERROR, None, ValidationError.jobError)

    # Retrieve any FileRequest that may have started since the Broker sent the request to SQS
    skip_generation = None
    if self.job.file_type.letter_name in ['D1', 'D2']:
        skip_generation = retrieve_cached_file_request(self.job, self.agency_type, self.agency_code, self.is_local)

    if not skip_generation:
        # Generate timestamped file names
        raw_filename = CONFIG_BROKER["".join([str(self.job.file_type.name), "_file_name"])]
        self.job.original_filename = S3Handler.get_timestamped_filename(raw_filename)
        if self.is_local:
            self.job.filename = "".join([CONFIG_BROKER['broker_files'], self.job.original_filename])
        else:
            self.job.filename = "".join([str(self.job.submission_id), "/", self.job.original_filename])
        self.sess.commit()

        # Generate the file, and upload to S3
        if self.job.file_type.letter_name in ['D1', 'D2']:
            # Update the validation Job if necessary
            update_validation_job_info(self.sess, self.job)

            self.generate_d_file()
        elif self.job.file_type.letter_name == 'A':
            self.generate_a_file()
        elif self.job.file_type.letter_name == 'E':
            self.generate_e_file()
        else:
            self.generate_f_file()

        mark_job_status(self.job.job_id, 'finished')

    logger.info({
        'message': 'Finished file {} generation'.format(self.job.file_type.letter_name),
        'message_type': 'ValidatorInfo', 'job_id': self.job.job_id, 'agency_code': self.agency_code,
        'file_type': self.job.file_type.letter_name, 'start_date': self.job.start_date,
        'end_date': self.job.end_date, 'filename': self.job.original_filename
    })
def get_fabs_meta(submission_id):
    """ Return the total rows, valid rows, publish date, and publish file for FABS submissions """
    sess = GlobalDB.db().session

    # get row counts from the DetachedAwardFinancialAssistance table
    dafa = DetachedAwardFinancialAssistance
    total_rows = sess.query(dafa).filter(dafa.submission_id == submission_id)
    valid_rows = total_rows.filter(dafa.is_valid)

    # retrieve the published data and file
    submission = sess.query(Submission).filter(Submission.submission_id == submission_id).one()
    publish_date, published_file = None, None
    certify_data = get_lastest_certified_date(submission, is_fabs=True)

    try:
        iter(certify_data)
    except TypeError:
        publish_date = certify_data
    else:
        publish_date, file_path = certify_data
        if CONFIG_BROKER["use_aws"] and file_path:
            path, file_name = file_path.rsplit('/', 1)  # split by last instance of /
            published_file = S3Handler().get_signed_url(
                path=path, file_name=file_name, bucket_route=CONFIG_BROKER['certified_bucket'],
                url_mapping=CONFIG_BROKER["certified_bucket_mapping"], method="get_object")
        elif file_path:
            published_file = file_path

    return {
        'valid_rows': valid_rows.count(),
        'total_rows': total_rows.count(),
        'publish_date': publish_date.strftime('%-I:%M%p %m/%d/%Y') if publish_date else None,
        'published_file': published_file
    }
def get_fabs_meta(submission_id):
    """ Return the total rows, valid rows, publish date, and publish file for FABS submissions """
    sess = GlobalDB.db().session

    # get row counts from the FABS table
    total_rows = sess.query(FABS).filter(FABS.submission_id == submission_id)
    valid_rows = total_rows.filter(FABS.is_valid)

    # retrieve the published data and file
    submission = sess.query(Submission).filter(Submission.submission_id == submission_id).one()
    publish_date, published_file = None, None
    publish_data = get_latest_published_date(submission, is_fabs=True)

    try:
        iter(publish_data)
    except TypeError:
        publish_date = publish_data
    else:
        publish_date, file_path = publish_data
        if CONFIG_BROKER['use_aws'] and file_path:
            path, file_name = file_path.rsplit('/', 1)  # split by last instance of /
            published_file = S3Handler().get_signed_url(
                path=path, file_name=file_name, bucket_route=CONFIG_BROKER['certified_bucket'],
                url_mapping=CONFIG_BROKER['certified_bucket_mapping'])
        elif file_path:
            published_file = file_path

    return {
        'valid_rows': valid_rows.count(),
        'total_rows': total_rows.count(),
        'publish_date': publish_date.strftime('%Y-%m-%dT%H:%M:%S') if publish_date else None,
        'published_file': published_file
    }
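
# For reference, a hypothetical return value from get_fabs_meta (illustrative values only):
#
#     {
#         'valid_rows': 95,
#         'total_rows': 100,
#         'publish_date': '2021-07-01T14:30:00',
#         'published_file': 'https://.../published_fabs.csv?signature=...'
#     }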
def revert_to_certified(submission, file_manager): """ Revert an updated DABS submission to its last certified state Args: submission: the submission to be reverted file_manager: a FileHandler object to be used to call revert_certified_error_files and determine is_local Returns: A JsonResponse containing a success message Raises: ResponseException: if submission provided is a FABS submission or is not in an "updated" status """ if submission.d2_submission: raise ResponseException('Submission must be a DABS submission.', status=StatusCode.CLIENT_ERROR) if submission.publish_status_id != PUBLISH_STATUS_DICT['updated']: raise ResponseException('Submission has not been certified or has not been updated since certification.', status=StatusCode.CLIENT_ERROR) sess = GlobalDB.db().session move_certified_data(sess, submission.submission_id, direction='revert') # Copy file paths from certified_files_history max_cert_history = sess.query(func.max(CertifyHistory.certify_history_id), func.max(CertifyHistory.updated_at)).\ filter(CertifyHistory.submission_id == submission.submission_id).one() remove_timestamp = [str(FILE_TYPE_DICT['appropriations']), str(FILE_TYPE_DICT['program_activity']), str(FILE_TYPE_DICT['award_financial'])] if file_manager.is_local: filepath = CONFIG_BROKER['broker_files'] ef_path = '' else: filepath = '{}/'.format(submission.submission_id) ef_path = filepath remove_timestamp.extend([str(FILE_TYPE_DICT['executive_compensation']), str(FILE_TYPE_DICT['sub_award'])]) # Certified filename -> Job filename, original filename # Local: # A/B/C: # filename -> '[broker_files dir][certified file base name]' # original_filename -> '[certified file base name without the timestamp]' # D1/D2: # filename -> '[broker_files dir][certified file base name]' # original_filename -> '[certified file base name]' # E/F: # filename -> '[certified file base name]' # original_filename -> '[certified file base name]' # Remote: # A/B/C/E/F: # filename -> '[submission_id]/[certified file base name]' # original_filename -> '[certified file base name without the timestamp]' # D1/D2: # filename -> '[submission_id dir][certified file base name]' # original_filename -> '[certified file base name]' update_string = """ WITH filenames AS ( SELECT REVERSE(SPLIT_PART(REVERSE(filename), '/', 1)) AS simple_name, file_type_id FROM certified_files_history WHERE certify_history_id = {history_id} ) UPDATE job SET filename = CASE WHEN job.file_type_id NOT IN (6, 7) THEN '{filepath}' ELSE '{ef_path}' END || simple_name, original_filename = CASE WHEN job.file_type_id NOT IN ({remove_timestamp}) THEN simple_name ELSE substring(simple_name, position('_' in simple_name) + 1) END FROM filenames WHERE job.file_type_id = filenames.file_type_id AND job.submission_id = {submission_id}; """.format(history_id=max_cert_history[0], filepath=filepath, ef_path=ef_path, remove_timestamp=', '.join(remove_timestamp), submission_id=submission.submission_id) sess.execute(update_string) # Set errors/warnings for the submission submission.number_of_errors = 0 submission.number_of_warnings =\ sess.query(func.coalesce(func.sum(CertifiedErrorMetadata.occurrences), 0).label('total_warnings')).\ join(Job, CertifiedErrorMetadata.job_id == Job.job_id).\ filter(Job.submission_id == submission.submission_id).one().total_warnings submission.publishable = True # Set default numbers/status/last validation date for jobs then update warnings sess.query(Job).filter_by(submission_id=submission.submission_id).\ update({'number_of_errors': 0, 'number_of_warnings': 0, 
'job_status_id': JOB_STATUS_DICT['finished'], 'last_validated': max_cert_history[1], 'error_message': None, 'file_generation_id': None}) # Get list of jobs so we can update them job_list = sess.query(Job).\ filter(Job.submission_id == submission.submission_id, Job.job_type_id.in_([JOB_TYPE_DICT['csv_record_validation'], JOB_TYPE_DICT['validation']]), Job.file_type_id.notin_([FILE_TYPE_DICT['sub_award'], FILE_TYPE_DICT['executive_compensation']])).all() # Fixing File table job_ids = [str(job.job_id) for job in job_list] update_string = """ UPDATE file SET filename = job.filename, file_status_id = 1, headers_missing = NULL, headers_duplicated = NULL FROM job WHERE job.job_id = file.job_id AND job.job_id IN ({job_ids}); """.format(job_ids=', '.join(job_ids)) sess.execute(update_string) file_type_mapping = { FILE_TYPE_DICT['appropriations']: CertifiedAppropriation, FILE_TYPE_DICT['program_activity']: CertifiedObjectClassProgramActivity, FILE_TYPE_DICT['award_financial']: CertifiedAwardFinancial, FILE_TYPE_DICT['award']: CertifiedAwardFinancialAssistance, FILE_TYPE_DICT['award_procurement']: CertifiedAwardProcurement } # Update the number of warnings for each job in the list for job in job_list: job.number_of_warnings = sess.query(func.coalesce(func.sum(CertifiedErrorMetadata.occurrences), 0). label('total_warnings')). \ filter_by(job_id=job.job_id).one().total_warnings # For non-cross-file jobs, also update the row count and file size if job.job_type_id != JOB_TYPE_DICT['validation']: file_type_model = file_type_mapping[job.file_type_id] total_rows = sess.query(file_type_model).filter_by(submission_id=submission.submission_id).count() job.number_of_rows = total_rows + 1 job.number_of_rows_valid = total_rows if file_manager.is_local: # local file size try: job.file_size = os.path.getsize(job.filename) except: logger.warning("File doesn't exist locally: %s", job.filename) job.file_size = 0 else: # boto file size job.file_size = S3Handler.get_file_size(job.filename) # Set submission to certified status submission.publish_status_id = PUBLISH_STATUS_DICT['published'] sess.commit() # Move warning files back non-locally and clear out error files for all environments file_manager.revert_certified_error_files(sess, max_cert_history[0]) return JsonResponse.create(StatusCode.OK, {'message': 'Submission {} successfully reverted to certified status.'. format(submission.submission_id)})
def run_cross_validation(self, job):
    """ Cross file validation job. Test all rules with matching rule_timing. Run each cross-file rule and
        create error report.

        Args:
            job: Current job
    """
    sess = GlobalDB.db().session
    job_id = job.job_id
    # Create File Status object
    create_file_if_needed(job_id)
    # Create list of errors
    error_list = ErrorInterface()

    submission_id = job.submission_id
    job_start = datetime.now()
    logger.info({
        'message': 'Beginning cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job.job_id,
        'action': 'run_cross_validations', 'start': job_start, 'status': 'start'
    })

    # Delete existing cross file errors for this submission
    sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete()
    sess.commit()

    # get all cross file rules from db
    cross_file_rules = sess.query(RuleSql).filter_by(rule_cross_file_flag=True)

    # for each cross-file combo, run associated rules and create error report
    for c in get_cross_file_pairs():
        first_file = c[0]
        second_file = c[1]
        combo_rules = cross_file_rules.filter(or_(and_(RuleSql.file_id == first_file.id,
                                                       RuleSql.target_file_id == second_file.id),
                                                  and_(RuleSql.file_id == second_file.id,
                                                       RuleSql.target_file_id == first_file.id)))

        # get error file name/path
        error_file_name = report_file_name(submission_id, False, first_file.name, second_file.name)
        error_file_path = "".join([CONFIG_SERVICES['error_report_path'], error_file_name])
        warning_file_name = report_file_name(submission_id, True, first_file.name, second_file.name)
        warning_file_path = "".join([CONFIG_SERVICES['error_report_path'], warning_file_name])

        # open error report and gather failed rules within it
        with open(error_file_path, 'w', newline='') as error_file,\
                open(warning_file_path, 'w', newline='') as warning_file:
            error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

            # write headers to file
            error_csv.writerow(self.crossFileReportHeaders)
            warning_csv.writerow(self.crossFileReportHeaders)

            # send comboRules to validator.crossValidate sql
            current_cols_short_to_long = self.short_to_long_dict[first_file.id].copy()
            current_cols_short_to_long.update(self.short_to_long_dict[second_file.id].copy())
            cross_validate_sql(combo_rules.all(), submission_id, current_cols_short_to_long, first_file.id,
                               second_file.id, job, error_csv, warning_csv, error_list, job_id)
        # close files
        error_file.close()
        warning_file.close()

        # stream file to S3 when not local
        if not self.is_local:
            # stream error file
            with open(error_file_path, 'rb') as csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)),
                                           'w') as writer:
                    while True:
                        chunk = csv_file.read(CHUNK_SIZE)
                        if chunk:
                            writer.write(chunk)
                        else:
                            break
            csv_file.close()
            os.remove(error_file_path)

            # stream warning file
            with open(warning_file_path, 'rb') as warning_csv_file:
                with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)),
                                           'w') as warning_writer:
                    while True:
                        chunk = warning_csv_file.read(CHUNK_SIZE)
                        if chunk:
                            warning_writer.write(chunk)
                        else:
                            break
            warning_csv_file.close()
            os.remove(warning_file_path)

    # write all recorded errors to database
    error_list.write_all_row_errors(job_id)
    # Update error info for submission
    populate_job_error_info(job)

    # mark job status as "finished"
    mark_job_status(job_id, "finished")
    job_duration = (datetime.now() - job_start).total_seconds()
    logger.info({
        'message': 'Completed cross-file validations on submission_id: ' + str(submission_id),
        'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job.job_id,
        'action': 'run_cross_validations', 'status': 'finish', 'start': job_start, 'duration': job_duration
    })
    # set number of errors and warnings for submission.
    submission = populate_submission_error_info(submission_id)
    # TODO: Remove temporary step below
    # Temporarily set publishable flag at end of cross file, remove this once users are able to mark their
    # submissions as publishable
    # Publish only if no errors are present
    if submission.number_of_errors == 0:
        submission.publishable = True
    sess.commit()

    # Mark validation complete
    mark_file_complete(job_id)
def run_validation(self, job): """ Run validations for specified job Args: job: Job to be validated Returns: True if successful """ sess = GlobalDB.db().session error_list = ErrorInterface() job_id = job.job_id submission_id = job.submission_id row_number = 1 file_type = job.file_type.name validation_start = datetime.now() log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format( str(submission_id), str(job_id), file_type) logger.info({ 'message': 'Beginning run_validation {}'.format(log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'run_validations', 'status': 'start', 'start_time': validation_start }) # Get orm model for this file model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0] # Delete existing file level errors for this submission sess.query(ErrorMetadata).filter( ErrorMetadata.job_id == job_id).delete() sess.commit() # Clear existing records for this submission sess.query(model).filter_by(submission_id=submission_id).delete() sess.commit() # Clear existing flex fields for this job sess.query(FlexField).filter_by(job_id=job_id).delete() sess.commit() # If local, make the error report directory if self.is_local and not os.path.exists(self.directory): os.makedirs(self.directory) # Get bucket name and file name file_name = job.filename bucket_name = CONFIG_BROKER['aws_bucket'] region_name = CONFIG_BROKER['aws_region'] error_file_name = report_file_name(job.submission_id, False, job.file_type.name) error_file_path = "".join( [CONFIG_SERVICES['error_report_path'], error_file_name]) warning_file_name = report_file_name(job.submission_id, True, job.file_type.name) warning_file_path = "".join( [CONFIG_SERVICES['error_report_path'], warning_file_name]) # Create File Status object create_file_if_needed(job_id, file_name) reader = CsvReader() # Get file size and write to jobs table if CONFIG_BROKER["use_aws"]: file_size = S3Handler.get_file_size(file_name) else: file_size = os.path.getsize(file_name) job.file_size = file_size sess.commit() # Get fields for this file fields = sess.query(FileColumn).filter( FileColumn.file_id == FILE_TYPE_DICT[file_type]).all() for field in fields: sess.expunge(field) csv_schema = {row.name_short: row for row in fields} try: extension = os.path.splitext(file_name)[1] if not extension or extension.lower() not in ['.csv', '.txt']: raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError) # Count file rows: throws a File Level Error for non-UTF8 characters temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8') file_row_count = len(list(csv.reader(temp_file))) try: temp_file.close() except AttributeError: # File does not exist, and so does not need to be closed pass # Pull file and return info on whether it's using short or long col headers reader.open_file(region_name, bucket_name, file_name, fields, bucket_name, self.get_file_name(error_file_name), self.long_to_short_dict[job.file_type_id], is_local=self.is_local) # list to keep track of rows that fail validations error_rows = [] # While not done, pull one row and put it into staging table if it passes # the Validator loading_start = datetime.now() logger.info({ 'message': 'Beginning data loading {}'.format(log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'start', 'start_time': loading_start }) with open(error_file_path, 'w', newline='') as 
error_file,\ open(warning_file_path, 'w', newline='') as warning_file: error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') required_list = None type_list = None if file_type == "fabs": # create a list of all required/type labels for FABS labels = sess.query(ValidationLabel).all() required_list = {} type_list = {} for label in labels: if label.label_type == "requirement": required_list[label.column_name] = label.label else: type_list[label.column_name] = label.label # write headers to file error_csv.writerow(self.reportHeaders) warning_csv.writerow(self.reportHeaders) while not reader.is_finished: row_number += 1 if row_number % 100 == 0: elapsed_time = (datetime.now() - loading_start).total_seconds() logger.info({ 'message': 'Loading row: {} {}'.format( str(row_number), log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'loading', 'rows_loaded': row_number, 'start_time': loading_start, 'elapsed_time': elapsed_time }) # # first phase of validations: read record and record a # formatting error if there's a problem # (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \ self.read_record(reader, error_csv, row_number, job, fields, error_list) if reduceRow: row_number -= 1 if rowErrorHere: error_rows.append(row_number) if doneReading: # Stop reading from input file break elif skip_row: # Do not write this row to staging, but continue processing future rows continue # # second phase of validations: do basic schema checks # (e.g., require fields, field length, data type) # # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic # validations, so these validations are not repeated here if file_type in ["award", "award_procurement"]: # Skip basic validations for D files, set as valid to trigger write to staging passed_validations = True valid = True else: if file_type == "fabs": record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + "_" +\ (record['awarding_sub_tier_agency_c'] or '-none-') + \ "_" + (record['fain'] or '-none-') + "_" + \ (record['uri'] or '-none-') passed_validations, failures, valid = Validator.validate( record, csv_schema, file_type == "fabs", required_list, type_list) if valid: # todo: update this logic later when we have actual validations if file_type == "fabs": record["is_valid"] = True model_instance = model(job_id=job_id, submission_id=submission_id, valid_record=passed_validations, **record) skip_row = not insert_staging_model( model_instance, job, error_csv, error_list) if flex_cols: sess.add_all(flex_cols) sess.commit() if skip_row: error_rows.append(row_number) continue if not passed_validations: fatal = write_errors( failures, job, self.short_to_long_dict[job.file_type_id], error_csv, warning_csv, row_number, error_list, flex_cols) if fatal: error_rows.append(row_number) loading_duration = (datetime.now() - loading_start).total_seconds() logger.info({ 'message': 'Completed data loading {}'.format(log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'finish', 'start_time': loading_start, 'end_time': datetime.now(), 'duration': loading_duration, 'total_rows': row_number }) if file_type in ('appropriations', 'program_activity', 'award_financial'): 
update_tas_ids(model, submission_id) # # third phase of validations: run validation rules as specified # in the schema guidance. these validations are sql-based. # sql_error_rows = self.run_sql_validations( job, file_type, self.short_to_long_dict[job.file_type_id], error_csv, warning_csv, row_number, error_list) error_rows.extend(sql_error_rows) error_file.close() warning_file.close() # stream file to S3 when not local if not self.is_local: # stream error file with open(error_file_path, 'rb') as csv_file: with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(error_file_name)), 'w')\ as writer: while True: chunk = csv_file.read(CHUNK_SIZE) if chunk: writer.write(chunk) else: break csv_file.close() os.remove(error_file_path) # stream warning file with open(warning_file_path, 'rb') as warning_csv_file: with smart_open.smart_open(S3Handler.create_file_path(self.get_file_name(warning_file_name)), 'w')\ as warning_writer: while True: chunk = warning_csv_file.read(CHUNK_SIZE) if chunk: warning_writer.write(chunk) else: break warning_csv_file.close() os.remove(warning_file_path) # Calculate total number of rows in file # that passed validations error_rows_unique = set(error_rows) total_rows_excluding_header = row_number - 1 valid_rows = total_rows_excluding_header - len(error_rows_unique) # Update fabs is_valid rows where applicable # Update submission to include action dates where applicable if file_type == "fabs": sess.query(DetachedAwardFinancialAssistance).\ filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique), DetachedAwardFinancialAssistance.submission_id == submission_id).\ update({"is_valid": False}, synchronize_session=False) sess.commit() min_action_date, max_action_date = get_action_dates( submission_id) sess.query(Submission).filter(Submission.submission_id == submission_id).\ update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date}, synchronize_session=False) # Ensure validated rows match initial row count if file_row_count != row_number: raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError) # Update job metadata job.number_of_rows = row_number job.number_of_rows_valid = valid_rows sess.commit() error_list.write_all_row_errors(job_id) # Update error info for submission populate_job_error_info(job) if file_type == "fabs": # set number of errors and warnings for detached submission populate_submission_error_info(submission_id) # Mark validation as finished in job tracker mark_job_status(job_id, "finished") mark_file_complete(job_id, file_name) finally: # Ensure the files always close reader.close() validation_duration = (datetime.now() - validation_start).total_seconds() logger.info({ 'message': 'Completed run_validation {}'.format(log_str), 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'run_validation', 'status': 'finish', 'start_time': validation_start, 'end_time': datetime.now(), 'duration': validation_duration }) return True
def copy_file_generation_to_job(job, file_generation, is_local):
    """ Copy cached FileGeneration data to a Job requesting a file.

        Args:
            job: Job object to copy the data to
            file_generation: Cached FileGeneration object to copy the data from
            is_local: A boolean flag indicating whether the application is being run locally or not
    """
    sess = GlobalDB.db().session
    log_data = {
        'message': 'Copying FileGeneration {} data to Job {}'.format(file_generation.file_generation_id, job.job_id),
        'message_type': 'BrokerInfo', 'job_id': job.job_id, 'file_type': job.file_type.name,
        'file_generation_id': file_generation.file_generation_id}
    logger.info(log_data)

    # Do not edit submissions that have already successfully completed
    sess.refresh(job)
    if job.job_status_id == lookups.JOB_STATUS_DICT['finished']:
        return

    job.file_generation_id = file_generation.file_generation_id

    # File is still being generated, just mark the FileGeneration ID in the Job and wait
    # FileGeneration will update all child Jobs when it finishes
    if not file_generation.file_path:
        sess.commit()
        return

    # Generate file path for child Job's filename
    filepath = CONFIG_BROKER['broker_files'] if g.is_local else "{}/".format(str(job.submission_id))
    original_filename = file_generation.file_path.split('/')[-1]
    filename = '{}{}'.format(filepath, original_filename)

    # Copy parent job's data
    job.filename = filename
    job.original_filename = original_filename
    job.number_of_errors = 0
    job.number_of_warnings = 0

    # Change the validation job's file data when within a submission
    if job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == job.submission_id,
                                         Job.file_type_id == job.file_type_id,
                                         Job.job_type_id == lookups.JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = original_filename

    # Copy the data to the Submission's bucket
    if not g.is_local and file_generation.file_path != job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        bucket = CONFIG_BROKER['aws_bucket']
        response = s3.list_objects_v2(Bucket=bucket, Prefix=job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == job.filename:
                # The file already exists in this location
                log_data['message'] = '{} file already exists in this location: {}; not overwriting.'.format(
                    job.file_type.name, job.filename)
                logger.info(log_data)
                mark_job_status(job.job_id, 'finished')
                return

        S3Handler.copy_file(bucket, bucket, file_generation.file_path, job.filename)
    sess.commit()

    # Mark Job status last so the validation job doesn't start until everything is done
    mark_job_status(job.job_id, 'finished')
def run_validation(self, job): """ Run validations for specified job Args: job: Job to be validated Returns: True if successful """ sess = GlobalDB.db().session self.job = job self.submission_id = job.submission_id self.file_type = job.file_type self.file_name = job.filename self.is_fabs = (self.file_type.name == 'fabs') # initializing processing metadata vars for a new validation self.reader = CsvReader() self.error_list = ErrorInterface() self.error_rows = [] self.max_row_number = 1 self.total_rows = 0 self.short_rows = [] self.long_rows = [] validation_start = datetime.now() bucket_name = CONFIG_BROKER['aws_bucket'] region_name = CONFIG_BROKER['aws_region'] self.log_str = 'on submission_id: {}, job_id: {}, file_type: {}'.format( str(self.submission_id), str(self.job.job_id), self.file_type.name) logger.info({ 'message': 'Beginning run_validation {}'.format(self.log_str), 'message_type': 'ValidatorInfo', 'submission_id': self.submission_id, 'job_id': self.job.job_id, 'file_type': self.file_type.name, 'action': 'run_validations', 'status': 'start', 'start_time': validation_start }) # Get orm model for this file self.model = [ft.model for ft in FILE_TYPE if ft.name == self.file_type.name][0] # Delete existing file level errors for this submission sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == self.job.job_id).delete() sess.commit() # Clear existing records for this submission sess.query(self.model).filter_by(submission_id=self.submission_id).delete() sess.commit() # Clear existing flex fields for this job sess.query(FlexField).filter_by(job_id=self.job.job_id).delete() sess.commit() # If local, make the error report directory if self.is_local and not os.path.exists(self.directory): os.makedirs(self.directory) create_file_if_needed(self.job.job_id, self.file_name) # Get file size and write to jobs table if CONFIG_BROKER['use_aws']: file_size = S3Handler.get_file_size(self.file_name) else: file_size = os.path.getsize(self.file_name) self.job.file_size = file_size sess.commit() # Get fields for this file self.fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[self.file_type.name])\ .order_by(FileColumn.daims_name.asc()).all() self.expected_headers, self.parsed_fields = parse_fields(sess, self.fields) self.csv_schema = {row.name_short: row for row in self.fields} try: # Loading data and initial validations self.load_file_data(sess, bucket_name, region_name) if self.file_type.name in ('appropriations', 'program_activity', 'award_financial'): update_tas_ids(self.model, self.submission_id) # SQL Validations with open(self.error_file_path, 'a', newline='') as error_file, \ open(self.warning_file_path, 'a', newline='') as warning_file: error_csv = csv.writer(error_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') warning_csv = csv.writer(warning_file, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n') # third phase of validations: run validation rules as specified in the schema guidance. These # validations are sql-based. 
sql_error_rows = self.run_sql_validations(self.short_to_long_dict[self.file_type.file_type_id], error_csv, warning_csv) self.error_rows.extend(sql_error_rows) error_file.close() warning_file.close() # stream file to S3 when not local if not self.is_local: s3_resource = boto3.resource('s3', region_name=region_name) # stream error file with open(self.error_file_path, 'rb') as csv_file: s3_resource.Object(bucket_name, self.get_file_name(self.error_file_name)).put(Body=csv_file) csv_file.close() os.remove(self.error_file_path) # stream warning file with open(self.warning_file_path, 'rb') as warning_csv_file: s3_resource.Object(bucket_name, self.get_file_name(self.warning_file_name)).put(Body=warning_csv_file) warning_csv_file.close() os.remove(self.warning_file_path) # Calculate total number of rows in file that passed validations error_rows_unique = set(self.error_rows) total_rows_excluding_header = self.total_rows - 1 valid_rows = total_rows_excluding_header - len(error_rows_unique) # Update fabs is_valid rows where applicable # Update submission to include action dates where applicable if self.is_fabs: sess.query(DetachedAwardFinancialAssistance). \ filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique), DetachedAwardFinancialAssistance.submission_id == self.submission_id). \ update({'is_valid': False}, synchronize_session=False) sess.commit() min_action_date, max_action_date = get_action_dates(self.submission_id) sess.query(Submission).filter(Submission.submission_id == self.submission_id). \ update({'reporting_start_date': min_action_date, 'reporting_end_date': max_action_date}, synchronize_session=False) # Update job metadata self.job.number_of_rows = self.total_rows self.job.number_of_rows_valid = valid_rows sess.commit() self.error_list.write_all_row_errors(self.job.job_id) # Update error info for submission populate_job_error_info(self.job) if self.is_fabs: # set number of errors and warnings for detached submission populate_submission_error_info(self.submission_id) # Mark validation as finished in job tracker mark_job_status(self.job.job_id, 'finished') mark_file_complete(self.job.job_id, self.file_name) except Exception: logger.error({ 'message': 'An exception occurred during validation', 'message_type': 'ValidatorInfo', 'submission_id': self.submission_id, 'job_id': self.job.job_id, 'file_type': self.file_type.name, 'traceback': traceback.format_exc() }) raise finally: # Ensure the files always close self.reader.close() validation_duration = (datetime.now()-validation_start).total_seconds() logger.info({ 'message': 'Completed run_validation {}'.format(self.log_str), 'message_type': 'ValidatorInfo', 'submission_id': self.submission_id, 'job_id': self.job.job_id, 'file_type': self.file_type.name, 'action': 'run_validation', 'status': 'finish', 'start_time': validation_start, 'end_time': datetime.now(), 'duration': validation_duration }) return True
def run_validation(self, job): """ Run validations for specified job Args: job: Job to be validated Returns: True if successful """ sess = GlobalDB.db().session job_id = job.job_id error_list = ErrorInterface() submission_id = job.submission_id row_number = 1 file_type = job.file_type.name validation_start = datetime.now() logger.info( { 'message': 'Beginning run_validation on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'run_validations', 'status': 'start', 'start_time': validation_start}) # Get orm model for this file model = [ft.model for ft in FILE_TYPE if ft.name == file_type][0] # Delete existing file level errors for this submission sess.query(ErrorMetadata).filter(ErrorMetadata.job_id == job_id).delete() sess.commit() # Clear existing records for this submission sess.query(model).filter_by(submission_id=submission_id).delete() sess.commit() # Clear existing flex fields for this job sess.query(FlexField).filter_by(job_id=job_id).delete() sess.commit() # If local, make the error report directory if self.isLocal and not os.path.exists(self.directory): os.makedirs(self.directory) # Get bucket name and file name file_name = job.filename bucket_name = CONFIG_BROKER['aws_bucket'] region_name = CONFIG_BROKER['aws_region'] error_file_name = self.get_file_name(report_file_name(job.submission_id, False, job.file_type.name)) warning_file_name = self.get_file_name(report_file_name(job.submission_id, True, job.file_type.name)) # Create File Status object create_file_if_needed(job_id, file_name) reader = self.get_reader() # Get file size and write to jobs table if CONFIG_BROKER["use_aws"]: file_size = S3Handler.get_file_size(file_name) else: file_size = os.path.getsize(file_name) job.file_size = file_size sess.commit() # Get fields for this file fields = sess.query(FileColumn).filter(FileColumn.file_id == FILE_TYPE_DICT[file_type]).all() for field in fields: sess.expunge(field) csv_schema = {row.name_short: row for row in fields} try: extension = os.path.splitext(file_name)[1] if not extension or extension not in ['.csv', '.txt']: raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.fileTypeError) # Count file rows: throws a File Level Error for non-UTF8 characters temp_file = open(reader.get_filename(region_name, bucket_name, file_name), encoding='utf-8') file_row_count = len(list(csv.reader(temp_file))) try: temp_file.close() except AttributeError: # File does not exist, and so does not need to be closed pass # Pull file and return info on whether it's using short or long col headers reader.open_file(region_name, bucket_name, file_name, fields, bucket_name, error_file_name, self.long_to_short_dict, is_local=self.isLocal) # list to keep track of rows that fail validations error_rows = [] # While not done, pull one row and put it into staging table if it passes # the Validator loading_start = datetime.now() logger.info( { 'message': 'Beginning data loading on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'start', 'start_time': loading_start}) with self.get_writer(region_name, bucket_name, error_file_name, self.reportHeaders) as writer, \ self.get_writer(region_name, bucket_name, warning_file_name, self.reportHeaders) 
as warning_writer: while not reader.is_finished: row_number += 1 if row_number % 100 == 0: elapsed_time = (datetime.now()-loading_start).total_seconds() logger.info( { 'message': 'Loading row: ' + str(row_number) + ' on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'loading', 'rows_loaded': row_number, 'start_time': loading_start, 'elapsed_time': elapsed_time}) # # first phase of validations: read record and record a # formatting error if there's a problem # (record, reduceRow, skip_row, doneReading, rowErrorHere, flex_cols) = \ self.read_record(reader, writer, row_number, job, fields, error_list) if reduceRow: row_number -= 1 if rowErrorHere: error_rows.append(row_number) if doneReading: # Stop reading from input file break elif skip_row: # Do not write this row to staging, but continue processing future rows continue # # second phase of validations: do basic schema checks # (e.g., require fields, field length, data type) # # D files are obtained from upstream systems (ASP and FPDS) that perform their own basic # validations, so these validations are not repeated here if file_type in ["award", "award_procurement"]: # Skip basic validations for D files, set as valid to trigger write to staging passed_validations = True valid = True else: if file_type in ["detached_award"]: record['afa_generated_unique'] = (record['award_modification_amendme'] or '-none-') + \ (record['awarding_sub_tier_agency_c'] or '-none-') + \ (record['fain'] or '-none-') + (record['uri'] or '-none-') passed_validations, failures, valid = Validator.validate(record, csv_schema, file_type in ["detached_award"]) if valid: # todo: update this logic later when we have actual validations if file_type in ["detached_award"]: record["is_valid"] = True model_instance = model(job_id=job_id, submission_id=submission_id, valid_record=passed_validations, **record) skip_row = not insert_staging_model(model_instance, job, writer, error_list) if flex_cols: sess.add_all(flex_cols) sess.commit() if skip_row: error_rows.append(row_number) continue if not passed_validations: fatal = write_errors(failures, job, self.short_to_long_dict, writer, warning_writer, row_number, error_list) if fatal: error_rows.append(row_number) loading_duration = (datetime.now()-loading_start).total_seconds() logger.info( { 'message': 'Completed data loading on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'data_loading', 'status': 'finish', 'start_time': loading_start, 'end_time': datetime.now(), 'duration': loading_duration, 'total_rows': row_number }) if file_type in ('appropriations', 'program_activity', 'award_financial'): update_tas_ids(model, submission_id) # # third phase of validations: run validation rules as specified # in the schema guidance. these validations are sql-based. 
# sql_error_rows = self.run_sql_validations(job, file_type, self.short_to_long_dict, writer, warning_writer, row_number, error_list) error_rows.extend(sql_error_rows) # Write unfinished batch writer.finish_batch() warning_writer.finish_batch() # Calculate total number of rows in file # that passed validations error_rows_unique = set(error_rows) total_rows_excluding_header = row_number - 1 valid_rows = total_rows_excluding_header - len(error_rows_unique) # Update detached_award is_valid rows where applicable # Update submission to include action dates where applicable if file_type in ["detached_award"]: sess.query(DetachedAwardFinancialAssistance).\ filter(DetachedAwardFinancialAssistance.row_number.in_(error_rows_unique), DetachedAwardFinancialAssistance.submission_id == submission_id).\ update({"is_valid": False}, synchronize_session=False) sess.commit() min_action_date, max_action_date = get_action_dates(submission_id) sess.query(Submission).filter(Submission.submission_id == submission_id).\ update({"reporting_start_date": min_action_date, "reporting_end_date": max_action_date}, synchronize_session=False) # Ensure validated rows match initial row count if file_row_count != row_number: raise ResponseException("", StatusCode.CLIENT_ERROR, None, ValidationError.rowCountError) # Update job metadata job.number_of_rows = row_number job.number_of_rows_valid = valid_rows sess.commit() error_list.write_all_row_errors(job_id) # Update error info for submission populate_job_error_info(job) if file_type in ["detached_award"]: # set number of errors and warnings for detached submission populate_submission_error_info(submission_id) # Mark validation as finished in job tracker mark_job_status(job_id, "finished") mark_file_complete(job_id, file_name) finally: # Ensure the file always closes reader.close() validation_duration = (datetime.now()-validation_start).total_seconds() logger.info( { 'message': 'Completed run_validation on submission_id: ' + str(submission_id) + ', job_id: ' + str(job_id) + ', file_type: ' + file_type, 'message_type': 'ValidatorInfo', 'submission_id': submission_id, 'job_id': job_id, 'file_type': file_type, 'action': 'run_validation', 'status': 'finish', 'start_time': validation_start, 'end_time': datetime.now(), 'duration': validation_duration }) return True
def copy_parent_file_request_data(sess, child_job, parent_job, is_local):
    """ Copy parent FileRequest job data to the child FileRequest job data.

        Args:
            sess: current DB session
            child_job: Job object for the child FileRequest
            parent_job: Job object for the parent FileRequest
            is_local: True if in local development, False otherwise
    """
    file_type = parent_job.file_type.letter_name
    log_data = {'message': 'Copying data from parent job with job_id:{}'.format(parent_job.job_id),
                'message_type': 'ValidatorInfo', 'job_id': child_job.job_id, 'file_type': parent_job.file_type.name}

    # Keep path but update file name
    filename = '{}/{}'.format(child_job.filename.rsplit('/', 1)[0], parent_job.original_filename)

    # Copy parent job's data
    child_job.from_cached = True
    child_job.filename = filename
    child_job.original_filename = parent_job.original_filename
    child_job.number_of_errors = parent_job.number_of_errors
    child_job.number_of_warnings = parent_job.number_of_warnings
    child_job.error_message = parent_job.error_message

    # Change the validation job's file data when within a submission
    if child_job.submission_id is not None:
        val_job = sess.query(Job).filter(Job.submission_id == child_job.submission_id,
                                         Job.file_type_id == parent_job.file_type_id,
                                         Job.job_type_id == JOB_TYPE_DICT['csv_record_validation']).one()
        val_job.filename = filename
        val_job.original_filename = parent_job.original_filename
    sess.commit()

    if not is_local and parent_job.filename != child_job.filename:
        # Check to see if the same file exists in the child bucket
        s3 = boto3.client('s3', region_name=CONFIG_BROKER["aws_region"])
        response = s3.list_objects_v2(Bucket=CONFIG_BROKER['aws_bucket'], Prefix=child_job.filename)
        for obj in response.get('Contents', []):
            if obj['Key'] == child_job.filename:
                # The file already exists in this location
                log_data['message'] = 'Cached {} file CSV already exists in this location'.format(file_type)
                logger.info(log_data)
                return

        # Copy the parent file into the child's S3 location
        log_data['message'] = 'Copying the cached {} file from job {}'.format(file_type, parent_job.job_id)
        logger.info(log_data)

        with smart_open.smart_open(S3Handler.create_file_path(parent_job.filename), 'r') as reader:
            stream_file_to_s3(child_job.filename, reader)

    # Mark job status last so the validation job doesn't start until everything is done
    mark_job_status(child_job.job_id, JOB_STATUS_DICT_ID[parent_job.job_status_id])
max_cert_id = sess.query(func.max(CertifyHistory.certify_history_id).label('cert_id')). \
    filter_by(submission_id=submission_id).one()
route_vars = [agency_code, submission.reporting_fiscal_year, submission.reporting_fiscal_period // 3,
              max_pub_id.pub_id]
new_route = '/'.join([str(var) for var in route_vars]) + '/'
if not is_local:
    old_path = '{}/{}'.format(str(submission_id), filename)
    new_path = new_route + filename
    # Copy the file if it's a non-local submission
    S3Handler().copy_file(original_bucket=CONFIG_BROKER['aws_bucket'],
                          new_bucket=CONFIG_BROKER['certified_bucket'],
                          original_path=old_path, new_path=new_path)
else:
    new_path = "".join([CONFIG_BROKER['broker_files'], filename])

# add published history
file_history = PublishedFilesHistory(publish_history_id=max_pub_id.pub_id, certify_history_id=max_cert_id.cert_id,
                                     submission_id=submission_id, filename=new_path, file_type_id=None,
                                     comment=None, warning_filename=None)
sess.add(file_history)
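
# For reference, a hypothetical key layout produced by the route built above (illustrative values only):
# with agency_code='020', reporting_fiscal_year=2021, reporting_fiscal_period=6 (6 // 3 = quarter 2),
# and publish_history_id=45, the route would be
#     new_route = '020/2021/2/45/'
# so a file named 'file_A.csv' would be copied to '020/2021/2/45/file_A.csv' in the certified bucket.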