예제 #1
0
파일: main.py 프로젝트: rfrancis1/curation
def perform_validation_on_file(file_name, found_file_names, hpo_id,
                               folder_prefix, bucket):
    """
    Attempts to load a csv file into BigQuery

    :param file_name: name of the file to validate
    :param found_file_names: files found in the submission folder
    :param hpo_id: identifies the hpo site
    :param folder_prefix: directory containing the submission
    :param bucket: bucket containing the submission
    :return: tuple (results, errors) where
     results is list of tuples (file_name, found, parsed, loaded)
     errors is list of tuples (file_name, message)
    """
    errors = []
    results = []
    logging.info(f"Validating file '{file_name}'")
    found = parsed = loaded = 0
    table_name = file_name.split('.')[0]

    if file_name in found_file_names:
        found = 1
        load_results = bq_utils.load_from_csv(hpo_id, table_name,
                                              folder_prefix)
        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

        if not incomplete_jobs:
            job_resource = bq_utils.get_job_details(job_id=load_job_id)
            job_status = job_resource['status']
            if 'errorResult' in job_status:
                # These are issues (which we report back) as opposed to internal errors
                issues = [item['message'] for item in job_status['errors']]
                errors.append((file_name, ' || '.join(issues)))
                logging.info(
                    f"Issues found in gs://{bucket}/{folder_prefix}/{file_name}"
                )
                for issue in issues:
                    logging.info(issue)
            else:
                # Processed ok
                parsed = loaded = 1
        else:
            # Incomplete jobs are internal unrecoverable errors.
            # Aborting the process allows for this submission to be validated when system recovers.
            message = (
                f"Loading hpo_id '{hpo_id}' table '{table_name}' failed because "
                f"job id '{load_job_id}' did not complete.\n")
            message += f"Aborting processing 'gs://{bucket}/{folder_prefix}'."
            logging.error(message)
            raise InternalValidationError(message)

    if file_name in common.SUBMISSION_FILES:
        results.append((file_name, found, parsed, loaded))

    return results, errors
예제 #2
0
def perform_validation_on_file(file_name, found_file_names, hpo_id,
                               folder_prefix, bucket):
    errors = []
    results = []
    logging.info('Validating file `{file_name}`'.format(file_name=file_name))
    found = parsed = loaded = 0
    table_name = file_name.split('.')[0]

    if file_name in found_file_names:
        found = 1
        load_results = bq_utils.load_from_csv(hpo_id, table_name,
                                              folder_prefix)
        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

        if len(incomplete_jobs) == 0:
            job_resource = bq_utils.get_job_details(job_id=load_job_id)
            job_status = job_resource['status']
            if 'errorResult' in job_status:
                # These are issues (which we report back) as opposed to internal errors
                issues = [item['message'] for item in job_status['errors']]
                errors.append((file_name, ' || '.join(issues)))
                logging.info(
                    'Issues found in gs://{bucket}/{folder_prefix}/{file_name}'
                    .format(bucket=bucket,
                            folder_prefix=folder_prefix,
                            file_name=file_name))
                for issue in issues:
                    logging.info(issue)
            else:
                # Processed ok
                parsed = loaded = 1
        else:
            # Incomplete jobs are internal unrecoverable errors.
            # Aborting the process allows for this submission to be validated when system recovers.
            message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
            message = message_fmt % (hpo_id, table_name, load_job_id)
            message += ' Aborting processing `gs://%s/%s`.' % (bucket,
                                                               folder_prefix)
            logging.error(message)
            raise InternalValidationError(message)

    if file_name in common.REQUIRED_FILES or found:
        results.append((file_name, found, parsed, loaded))

    return results, errors