def perform_validation_on_file(file_name, found_file_names, hpo_id,
                               folder_prefix, bucket):
    """
    Attempts to load a csv file into BigQuery

    :param file_name: name of the file to validate
    :param found_file_names: files found in the submission folder
    :param hpo_id: identifies the hpo site
    :param folder_prefix: directory containing the submission
    :param bucket: bucket containing the submission
    :return: tuple (results, errors) where
        results is a list of tuples (file_name, found, parsed, loaded)
        errors is a list of tuples (file_name, message)
    """
    errors = []
    results = []
    logging.info(f"Validating file '{file_name}'")
    found = parsed = loaded = 0
    table_name = file_name.split('.')[0]
    if file_name in found_file_names:
        found = 1
        load_results = bq_utils.load_from_csv(hpo_id, table_name,
                                              folder_prefix)
        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

        if not incomplete_jobs:
            job_resource = bq_utils.get_job_details(job_id=load_job_id)
            job_status = job_resource['status']
            if 'errorResult' in job_status:
                # These are issues (which we report back) as opposed to internal errors
                issues = [item['message'] for item in job_status['errors']]
                errors.append((file_name, ' || '.join(issues)))
                logging.info(
                    f"Issues found in gs://{bucket}/{folder_prefix}/{file_name}"
                )
                for issue in issues:
                    logging.info(issue)
            else:
                # Processed ok
                parsed = loaded = 1
        else:
            # Incomplete jobs are internal unrecoverable errors.
            # Aborting the process allows for this submission to be validated when the system recovers.
            message = (
                f"Loading hpo_id '{hpo_id}' table '{table_name}' failed because "
                f"job id '{load_job_id}' did not complete.\n")
            message += f"Aborting processing 'gs://{bucket}/{folder_prefix}'."
            logging.error(message)
            raise InternalValidationError(message)

    if file_name in common.SUBMISSION_FILES:
        results.append((file_name, found, parsed, loaded))

    return results, errors
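
# Illustrative driver (a minimal sketch, not part of the module above): one
# way this function could be invoked for each expected file in a submission.
# The wrapper name `_validate_submission` and the assumption that the caller
# has already listed the bucket contents into `found_file_names` are
# hypothetical; only `perform_validation_on_file` and
# `common.SUBMISSION_FILES` come from the code above.
def _validate_submission(hpo_id, bucket, folder_prefix, found_file_names):
    all_results = []
    all_errors = []
    for file_name in sorted(common.SUBMISSION_FILES):
        # Each call returns per-file (found, parsed, loaded) results and any
        # load issues; accumulate both across the whole submission.
        results, errors = perform_validation_on_file(
            file_name, found_file_names, hpo_id, folder_prefix, bucket)
        all_results.extend(results)
        all_errors.extend(errors)
    return all_results, all_errors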