def populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True):
    """
    Load achilles analysis, results, and (optionally) heel test data from
    local csv files into BigQuery tables for the given HPO bucket and id.
    """
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()

    # load the achilles_analysis table from the export test data
    test_file_name = achilles.ACHILLES_ANALYSIS + '.csv'
    achilles_analysis_file_path = os.path.join(TEST_DATA_EXPORT_PATH,
                                               test_file_name)
    schema_name = achilles.ACHILLES_ANALYSIS
    write_cloud_file(hpo_bucket, achilles_analysis_file_path)
    gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, achilles.ACHILLES_ANALYSIS)
    bq_utils.load_csv(schema_name, gcs_path, app_id, dataset_id, table_id)

    # load the remaining achilles tables from the synpuf export test data
    table_names = [achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        test_file_name = table_name + '.csv'
        test_file_path = os.path.join(TEST_DATA_EXPORT_SYNPUF_PATH,
                                      test_file_name)
        write_cloud_file(hpo_bucket, test_file_path)
        gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)
def test_load_csv(self):
    from google.appengine.api import app_identity

    app_id = app_identity.get_application_id()
    table_name = 'achilles_analysis'
    schema_file_name = table_name + '.json'
    csv_file_name = table_name + '.csv'
    schema_path = os.path.join(resources.fields_path, schema_file_name)
    local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                  csv_file_name)
    with open(local_csv_path, 'r') as fp:
        response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name, fp)
    hpo_bucket = self.hpo_bucket
    gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
    dataset_id = bq_utils.get_dataset_id()
    load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                     dataset_id, table_name)
    load_job_id = load_results['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_name))
    query_response = bq_utils.query(
        'SELECT COUNT(1) FROM %(table_name)s' % locals())
    self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
def load_test_data(self, hpo_id: str = None):
    """
    Load test achilles heel results data from a csv file into BigQuery.

    :param hpo_id: if specified, prefix to use on the csv test file and bq
        table, otherwise no prefix is used
    :return: contents of the file as a list of objects
    """
    table_name: str = common.ACHILLES_HEEL_RESULTS
    if hpo_id is not None:
        table_id: str = bq_utils.get_table_id(hpo_id, table_name)
    else:
        table_id: str = table_name
    test_file_name: str = f'{table_id}.csv'
    test_file_path: str = os.path.join(test_util.TEST_DATA_PATH,
                                       test_file_name)

    # upload the csv to the test bucket, then load it into BigQuery
    target_bucket = self.storage_client.get_bucket(self.bucket)
    test_blob = target_bucket.blob(test_file_name)
    test_blob.upload_from_filename(test_file_path)

    gcs_path: str = f'gs://{self.bucket}/{test_file_name}'
    load_results = bq_utils.load_csv(table_name, gcs_path, self.project_id,
                                     self.dataset_id, table_id)
    job_id = load_results['jobReference']['jobId']
    bq_utils.wait_on_jobs([job_id])
    return resources.csv_to_list(test_file_path)
def load_dataset_from_files(dataset_id, path):
    """
    Load each CDM table's csv file from a local directory into the dataset,
    uploading a placeholder newline for any table whose csv is missing.
    """
    app_id = bq_utils.app_identity.get_application_id()
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    test_util.empty_bucket(bucket)
    job_ids = []
    for table in common.CDM_TABLES:
        filename = table + '.csv'
        schema = os.path.join(resources.fields_path, table + '.json')
        f = os.path.join(path, filename)
        if os.path.exists(f):
            with open(f, 'r') as fp:
                gcs_utils.upload_object(bucket, filename, fp)
        else:
            test_util.write_cloud_str(bucket, filename, '\n')
        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(schema,
                                         gcs_path,
                                         app_id,
                                         dataset_id,
                                         table,
                                         allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        job_ids.append(load_job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    test_util.empty_bucket(bucket)
def _upload_file_to_bucket(bucket, dataset_id, path, table):
    app_id = bq_utils.app_identity.get_application_id()
    filename = table + '.csv'
    schema = os.path.join(resources.fields_path, table + '.json')
    f = os.path.join(path, filename)
    if os.path.exists(f):
        with open(f, 'r') as fp:
            gcs_utils.upload_object(bucket, filename, fp)
    else:
        test_util.write_cloud_str(bucket, filename, '\n')
    gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                 filename=filename)
    load_results = bq_utils.load_csv(schema,
                                     gcs_path,
                                     app_id,
                                     dataset_id,
                                     table,
                                     allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
def _upload_file_to_bucket(bucket, dataset_id, path, table):
    app_id = bq_utils.app_identity.get_application_id()
    filename = table + '.csv'
    file_path = os.path.join(path, filename)
    try:
        with open(file_path, 'rb') as filepath:
            gcs_utils.upload_object(bucket, filename, filepath)
    except OSError:
        # no local csv for this table: upload a placeholder newline instead
        test_util.write_cloud_str(bucket, filename, '\n')
    gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                 filename=filename)
    load_results = bq_utils.load_csv(table,
                                     gcs_path,
                                     app_id,
                                     dataset_id,
                                     table,
                                     allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
def load_test_data(self, hpo_id=None):
    """
    Load test achilles heel results data from a csv file into BigQuery.

    :param hpo_id: if specified, prefix to use on the csv test file and bq
        table, otherwise no prefix is used
    :return: contents of the file as a list of objects
    """
    schema_path = os.path.join(resources.fields_path,
                               common.ACHILLES_HEEL_RESULTS + '.json')
    table_id = common.ACHILLES_HEEL_RESULTS
    if hpo_id is not None:
        table_id = bq_utils.get_table_id(hpo_id, common.ACHILLES_HEEL_RESULTS)
    test_file_name = table_id + '.csv'
    test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
    test_util.write_cloud_file(self.bucket, test_file_path)
    gcs_path = 'gs://' + self.bucket + '/' + test_file_name
    load_results = bq_utils.load_csv(schema_path, gcs_path, self.app_id,
                                     self.dataset_id, table_id)
    job_id = load_results['jobReference']['jobId']
    bq_utils.wait_on_jobs([job_id])
    return resources._csv_to_list(test_file_path)
def populate_achilles(hpo_id=FAKE_HPO_ID, include_heel=True):
    """
    Load achilles analysis, results, and (optionally) heel test data from the
    shared test resources bucket into BigQuery tables for the given hpo_id.
    """
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()
    test_resources_bucket = RESOURCES_BUCKET_FMT.format(project_id=app_id)

    table_names = [
        achilles.ACHILLES_ANALYSIS, achilles.ACHILLES_RESULTS,
        achilles.ACHILLES_RESULTS_DIST
    ]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        gcs_path = f'gs://{test_resources_bucket}/{table_name}.csv'
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)
def _upload_file_to_bucket(cls, bucket: str, dataset_id: str, path: str,
                           table: str):
    app_id: str = bq_utils.app_identity.get_application_id()
    filename: str = f'{table}.csv'
    file_path: str = os.path.join(path, filename)
    target_bucket = cls.storage_client.get_bucket(bucket)
    blob = target_bucket.blob(filename)
    try:
        with open(file_path, 'rb') as filepath:
            blob.upload_from_file(filepath)
    except OSError:
        # no local csv for this table: upload a placeholder newline instead
        blob.upload_from_string('\n')
    gcs_path: str = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                      filename=filename)
    load_results = bq_utils.load_csv(table,
                                     gcs_path,
                                     app_id,
                                     dataset_id,
                                     table,
                                     allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
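# A minimal sketch (not part of the original module) of how the helper above
# could be driven for every CDM table, mirroring load_dataset_from_files; the
# helper name _load_all_cdm_tables is hypothetical, while common.CDM_TABLES,
# bq_utils.wait_on_jobs, and _upload_file_to_bucket are assumed to be
# available exactly as in the surrounding code.
@classmethod
def _load_all_cdm_tables(cls, bucket: str, dataset_id: str, path: str):
    """Upload and load every CDM table csv, then wait for the load jobs."""
    job_ids = [
        cls._upload_file_to_bucket(bucket, dataset_id, path, table)
        for table in common.CDM_TABLES
    ]
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if incomplete_jobs:
        raise RuntimeError(f'Job id(s) {incomplete_jobs} failed to complete')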
def test_load_csv(self):
    app_id = app_identity.get_application_id()
    table_name = 'achilles_analysis'
    csv_file_name = table_name + '.csv'
    local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                  csv_file_name)
    sc_bucket = self.client.get_bucket(self.hpo_bucket)
    bucket_blob = sc_bucket.blob(csv_file_name)
    with open(local_csv_path, 'rb') as fp:
        bucket_blob.upload_from_file(fp)
    hpo_bucket = self.hpo_bucket
    gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
    dataset_id = self.dataset_id
    load_results = bq_utils.load_csv(table_name, gcs_object_path, app_id,
                                     dataset_id, table_name)
    load_job_id = load_results['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_name))
    query_response = bq_utils.query(
        'SELECT COUNT(1) FROM %(table_name)s' % locals())
    self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
def write_to_result_table(project, dataset, site, match_values):
    """
    Append items in match_values to the table generated from the site name.

    Attempts to limit the insert query to less than 1MB.

    :param project: the BigQuery project name
    :param dataset: name of the dataset containing the table to append to
    :param site: string identifier for the hpo site
    :param match_values: dictionary of person_ids and match values for a field

    :return: query results value
    :raises: oauth2client.client.HttpAccessTokenRefreshError,
        googleapiclient.errors.HttpError
    """
    if not match_values:
        LOGGER.info(f"No values to insert for site: {site}")
        return None

    result_table = site + consts.VALIDATION_TABLE_SUFFIX
    bucket = gcs_utils.get_drc_bucket()
    path = dataset + '/intermediate_results/' + site + '.csv'

    field_list = [consts.PERSON_ID_FIELD]
    field_list.extend(consts.VALIDATION_FIELDS)
    field_list.append(consts.ALGORITHM_FIELD)

    results = StringIO()
    field_list_str = ','.join(field_list) + '\n'
    results.write(field_list_str)

    LOGGER.info(f"Generating csv values to write to storage for site: {site}")

    for person_key, person_values in match_values.items():
        str_list = [str(person_key)]
        for field in consts.VALIDATION_FIELDS:
            value = str(person_values.get(field, consts.MISSING))
            str_list.append(value)

        str_list.append(consts.YES)
        val_str = ','.join(str_list)
        results.write(val_str + '\n')

    LOGGER.info(f"Writing csv file to cloud storage for site: {site}")

    # write results
    results.seek(0)
    gcs_utils.upload_object(bucket, path, results)
    results.close()
    LOGGER.info(
        f"Wrote {len(match_values)} items to cloud storage for site: {site}")

    # wait on results to be written
    schema_path = os.path.join(fields_path, 'identity_match.json')

    LOGGER.info(
        f"Beginning load of identity match values from csv into BigQuery "
        f"for site: {site}")
    try:
        # load csv file into bigquery
        results = bq_utils.load_csv(schema_path,
                                    'gs://' + bucket + '/' + path,
                                    project,
                                    dataset,
                                    result_table,
                                    write_disposition=consts.WRITE_TRUNCATE)

        # ensure the load job finishes
        query_job_id = results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
        if incomplete_jobs:
            raise bq_utils.BigQueryJobWaitError(incomplete_jobs)
    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        LOGGER.exception(
            f"Encountered an exception when loading records from csv for site: {site}"
        )
        raise

    LOGGER.info(f"Loaded match values for site: {site}")

    return results
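# A minimal usage sketch for write_to_result_table (illustrative only; the
# project id, dataset name, site name, and field keys below are hypothetical,
# and the real per-person keys come from consts.VALIDATION_FIELDS):
#
#   match_values = {
#       1: {'first_name': 'match', 'last_name': 'missing'},
#       2: {'first_name': 'match', 'last_name': 'match'},
#   }
#   write_to_result_table(project='my-project-id',
#                         dataset='validation_dataset',
#                         site='fake_hpo_site',
#                         match_values=match_values)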