def test_get_full_result_log_when_all_exist(self, mock_hpo_csv):
    self._empty_hpo_buckets()
    hpos = resources.hpo_csv()
    hpo_0 = hpos[0]
    hpo_0_bucket = gcs_utils.get_hpo_bucket(hpo_0['hpo_id'])
    with open(FIVE_PERSONS_SUCCESS_RESULT_CSV, 'r') as fp:
        gcs_utils.upload_object(hpo_0_bucket, common.RESULT_CSV, fp)
    with open(FIVE_PERSONS_SUCCESS_RESULT_NO_HPO_JSON, 'r') as fp:
        hpo_0_expected_items = json.load(fp)
    for item in hpo_0_expected_items:
        item['hpo_id'] = hpo_0['hpo_id']
    hpo_1 = hpos[1]
    hpo_1_bucket = gcs_utils.get_hpo_bucket(hpo_1['hpo_id'])
    with open(ALL_FILES_UNPARSEABLE_VALIDATION_RESULT, 'r') as fp:
        gcs_utils.upload_object(hpo_1_bucket, common.RESULT_CSV, fp)
    with open(ALL_FILES_UNPARSEABLE_VALIDATION_RESULT_NO_HPO_JSON, 'r') as fp:
        hpo_1_expected_items = json.load(fp)
    for item in hpo_1_expected_items:
        item['hpo_id'] = hpo_1['hpo_id']
    expected = hpo_0_expected_items + hpo_1_expected_items
    actual = main.get_full_result_log()
    self.assertResultLogItemsEqual(expected, actual)
def test_get_metadata_on_existing_file(self):
    expected_file_name = 'person.csv'
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, expected_file_name, fp)
    metadata = gcs_utils.get_metadata(self.hpo_bucket, expected_file_name)
    self.assertIsNotNone(metadata)
    self.assertEqual(metadata['name'], expected_file_name)
def test_merge_with_unmatched_schema(self):
    running_jobs = []
    with open(NYC_FIVE_PERSONS_MEASUREMENT_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                'measurement.csv', fp)
    result = bq_utils.load_cdm_csv('nyc', 'measurement')
    running_jobs.append(result['jobReference']['jobId'])
    with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('pitt', 'person')
    running_jobs.append(result['jobReference']['jobId'])
    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    self.assertEqual(
        len(incomplete_jobs), 0,
        'loading tables {},{} timed out'.format('nyc_measurement', 'pitt_person'))
    table_names = ['nyc_measurement', 'pitt_person']
    success, error = bq_utils.merge_tables(bq_utils.get_dataset_id(),
                                           table_names,
                                           bq_utils.get_dataset_id(),
                                           'merged_nyc_pitt')
    self.assertFalse(success)
def test_load_ehr_observation(self):
    hpo_id = 'pitt'
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, table_name='observation')
    q = 'SELECT observation_id FROM {dataset_id}.{table_id} ORDER BY observation_id'.format(
        dataset_id=dataset_id, table_id=table_id)
    expected_observation_ids = [
        int(row['observation_id'])
        for row in resources._csv_to_list(PITT_FIVE_PERSONS_OBSERVATION_CSV)
    ]
    with open(PITT_FIVE_PERSONS_OBSERVATION_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket(hpo_id),
                                'observation.csv', fp)
    result = bq_utils.load_cdm_csv(hpo_id, 'observation')
    job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'pitt_observation load job did not complete')
    load_job_result = bq_utils.get_job_details(job_id)
    load_job_result_status = load_job_result['status']
    load_job_errors = load_job_result_status.get('errors')
    self.assertIsNone(load_job_errors,
                      msg='pitt_observation load job failed: ' + str(load_job_errors))
    query_results_response = bq_utils.query(q)
    query_job_errors = query_results_response.get('errors')
    self.assertIsNone(query_job_errors)
    actual_result = [
        int(row['f'][0]['v']) for row in query_results_response['rows']
    ]
    self.assertListEqual(actual_result, expected_observation_ids)
def load_dataset_from_files(dataset_id, path):
    app_id = bq_utils.app_identity.get_application_id()
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    test_util.empty_bucket(bucket)
    job_ids = []
    for table in common.CDM_TABLES:
        filename = table + '.csv'
        schema = os.path.join(resources.fields_path, table + '.json')
        f = os.path.join(path, filename)
        if os.path.exists(f):
            with open(f, 'r') as fp:
                gcs_utils.upload_object(bucket, filename, fp)
        else:
            test_util.write_cloud_str(bucket, filename, '\n')
        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(schema,
                                         gcs_path,
                                         app_id,
                                         dataset_id,
                                         table,
                                         allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        job_ids.append(load_job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    test_util.empty_bucket(bucket)
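# Usage sketch for load_dataset_from_files, not part of the original module.
# Assumptions: the GAE/BigQuery test environment is configured and the local
# path below is purely illustrative.
def _example_load_dataset_from_files(path='/tmp/five_persons_csvs'):
    # loads every CDM table found under `path` into the test dataset
    dataset_id = bq_utils.get_dataset_id()
    load_dataset_from_files(dataset_id, path)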
def test_get_object(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'r') as fp:
        expected = fp.read()
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
    result = gcs_utils.get_object(self.hpo_bucket, 'person.csv')
    self.assertEqual(expected, result)
def test_upload_object(self):
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    self.assertEqual(len(bucket_items), 0)
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
    bucket_items = gcs_utils.list_bucket(self.hpo_bucket)
    self.assertEqual(len(bucket_items), 1)
    bucket_item = bucket_items[0]
    self.assertEqual(bucket_item['name'], 'person.csv')
def test_integration_queries_to_retract_from_fake_dataset(self, mock_list_datasets, mock_is_ehr_dataset):
    mock_list_datasets.return_value = [{'id': self.project_id + ':' + self.bq_dataset_id}]
    mock_is_ehr_dataset.return_value = True

    job_ids = []
    row_count_queries = {}
    # load the cdm files into dataset
    for cdm_file in test_util.NYC_FIVE_PERSONS_FILES:
        cdm_file_name = os.path.basename(cdm_file)
        cdm_table = cdm_file_name.split('.')[0]
        hpo_table = bq_utils.get_table_id(self.hpo_id, cdm_table)
        # store query for checking number of rows to delete
        row_count_queries[hpo_table] = EXPECTED_ROWS_QUERY.format(
            dataset_id=self.bq_dataset_id,
            table_id=hpo_table,
            pids=retract_data_bq.int_list_to_bq(self.person_ids))
        retract_data_bq.logger.debug('Preparing to load table %s.%s'
                                     % (self.bq_dataset_id, hpo_table))
        with open(cdm_file, 'rb') as f:
            gcs_utils.upload_object(gcs_utils.get_hpo_bucket(self.hpo_id),
                                    cdm_file_name, f)
        result = bq_utils.load_cdm_csv(self.hpo_id, cdm_table,
                                       dataset_id=self.bq_dataset_id)
        retract_data_bq.logger.debug('Loading table %s.%s'
                                     % (self.bq_dataset_id, hpo_table))
        job_id = result['jobReference']['jobId']
        job_ids.append(job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    self.assertEqual(len(incomplete_jobs), 0,
                     'NYC five person load job did not complete')
    retract_data_bq.logger.debug('All tables loaded successfully')

    # use query results to count number of expected row deletions
    expected_row_count = {}
    for table in row_count_queries:
        result = bq_utils.query(row_count_queries[table])
        expected_row_count[table] = retract_data_bq.to_int(result['totalRows'])

    # separate check to find number of actual deleted rows
    q = TABLE_ROWS_QUERY.format(dataset_id=self.bq_dataset_id)
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_before_retraction = {}
    for row in result:
        row_count_before_retraction[row['table_id']] = row['row_count']

    deid_flag = False
    # perform retraction
    retract_data_bq.run_retraction(self.test_project_id, self.person_ids,
                                   self.hpo_id, deid_flag)

    # find actual deleted rows
    q_result = bq_utils.query(q)
    result = bq_utils.response2rows(q_result)
    row_count_after_retraction = {}
    for row in result:
        row_count_after_retraction[row['table_id']] = row['row_count']

    for table in expected_row_count:
        self.assertEqual(
            expected_row_count[table],
            row_count_before_retraction[table] - row_count_after_retraction[table])
def test_query_result(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
    load_job_id = result['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(PERSON))
    table_id = bq_utils.get_table_id(FAKE_HPO_ID, PERSON)
    q = 'SELECT person_id FROM %s' % table_id
    result = bq_utils.query(q)
    self.assertEqual(5, int(result['totalRows']))
def test_get_full_result_log_when_one_does_not_exist(self, mock_hpo_csv):
    self._empty_hpo_buckets()
    hpos = resources.hpo_csv()
    hpo_0 = hpos[0]
    hpo_0_bucket = gcs_utils.get_hpo_bucket(hpo_0['hpo_id'])
    with open(FIVE_PERSONS_SUCCESS_RESULT_CSV, 'r') as fp:
        gcs_utils.upload_object(hpo_0_bucket, common.RESULT_CSV, fp)
    with open(FIVE_PERSONS_SUCCESS_RESULT_NO_HPO_JSON, 'r') as fp:
        expected = json.load(fp)
    for item in expected:
        item['hpo_id'] = hpo_0['hpo_id']
    actual = main.get_full_result_log()
    self.assertResultLogItemsEqual(expected, actual)
def test_load_cdm_csv(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
    self.assertEqual(result['status']['state'], 'RUNNING')
    load_job_id = result['jobReference']['jobId']
    table_id = result['configuration']['load']['destinationTable']['tableId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_id))
    table_info = bq_utils.get_table_info(table_id)
    num_rows = table_info.get('numRows')
    self.assertEqual(num_rows, '5')
def test_merge_with_good_data(self):
    running_jobs = []
    with open(NYC_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('nyc'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('nyc', 'person')
    running_jobs.append(result['jobReference']['jobId'])
    with open(PITT_FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(gcs_utils.get_hpo_bucket('pitt'),
                                'person.csv', fp)
    result = bq_utils.load_cdm_csv('pitt', 'person')
    running_jobs.append(result['jobReference']['jobId'])

    nyc_person_ids = [
        int(row['person_id'])
        for row in resources._csv_to_list(NYC_FIVE_PERSONS_PERSON_CSV)
    ]
    pitt_person_ids = [
        int(row['person_id'])
        for row in resources._csv_to_list(PITT_FIVE_PERSONS_PERSON_CSV)
    ]
    expected_result = nyc_person_ids + pitt_person_ids
    expected_result.sort()

    incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
    self.assertEqual(
        len(incomplete_jobs), 0,
        'loading tables {},{} timed out'.format('nyc_person', 'pitt_person'))

    dataset_id = bq_utils.get_dataset_id()
    table_ids = ['nyc_person', 'pitt_person']
    merged_table_id = 'merged_nyc_pitt'
    success_flag, error = bq_utils.merge_tables(dataset_id, table_ids,
                                                dataset_id, merged_table_id)
    self.assertTrue(success_flag)
    self.assertEqual(error, "")

    query_string = 'SELECT person_id FROM {dataset_id}.{table_id}'.format(
        dataset_id=dataset_id, table_id=merged_table_id)
    merged_query_job_result = bq_utils.query(query_string)
    self.assertIsNone(merged_query_job_result.get('errors', None))
    actual_result = [
        int(row['f'][0]['v']) for row in merged_query_job_result['rows']
    ]
    actual_result.sort()
    self.assertListEqual(expected_result, actual_result)
def _upload_file_to_bucket(bucket, dataset_id, path, table):
    app_id = bq_utils.app_identity.get_application_id()
    filename = table + '.csv'
    schema = os.path.join(resources.fields_path, table + '.json')
    f = os.path.join(path, filename)
    if os.path.exists(f):
        with open(f, 'r') as fp:
            gcs_utils.upload_object(bucket, filename, fp)
    else:
        test_util.write_cloud_str(bucket, filename, '\n')
    gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                 filename=filename)
    load_results = bq_utils.load_csv(schema,
                                     gcs_path,
                                     app_id,
                                     dataset_id,
                                     table,
                                     allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
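# Usage sketch for _upload_file_to_bucket, not part of the original module.
# Assumptions: the FAKE_HPO_ID bucket and a writable test dataset exist, and
# 'person' is just an illustrative table name. Upload each table's CSV, then
# wait for the resulting load jobs before querying.
def _example_upload_and_wait(path, tables=('person',)):
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    dataset_id = bq_utils.get_dataset_id()
    job_ids = [
        _upload_file_to_bucket(bucket, dataset_id, path, table)
        for table in tables
    ]
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if incomplete_jobs:
        raise RuntimeError('Job id(s) %s failed to complete' % incomplete_jobs)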
def test_load_cdm_csv(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
    result = bq_utils.load_cdm_csv(FAKE_HPO_ID, PERSON)
    self.assertEqual(result['status']['state'], 'RUNNING')
    load_job_id = result['jobReference']['jobId']
    table_id = result['configuration']['load']['destinationTable']['tableId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_id))
    query_response = bq_utils.query('SELECT 1 FROM %(table_id)s' % locals())
    self.assertEqual(query_response['totalRows'], '5')
def test_load_csv(self):
    from google.appengine.api import app_identity

    app_id = app_identity.get_application_id()
    table_name = 'achilles_analysis'
    schema_file_name = table_name + '.json'
    csv_file_name = table_name + '.csv'
    schema_path = os.path.join(resources.fields_path, schema_file_name)
    local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH, csv_file_name)
    with open(local_csv_path, 'r') as fp:
        response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name, fp)
    hpo_bucket = self.hpo_bucket
    gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
    dataset_id = bq_utils.get_dataset_id()
    load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                     dataset_id, table_name)
    load_job_id = load_results['jobReference']['jobId']
    incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
    self.assertEqual(len(incomplete_jobs), 0,
                     'loading table {} timed out'.format(table_name))
    query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' % locals())
    self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
def save_datasources_json(datasource_id=None, folder_prefix="", target_bucket=None):
    """
    Generate and save datasources.json (from curation report) in a GCS bucket

    :param datasource_id: the ID of the HPO aggregate dataset that report should go to
    :param folder_prefix: relative path in GCS to save to (without 'gs://')
    :param target_bucket: GCS bucket to save to. If not supplied, uses the
        bucket assigned to datasource_id.
    :return:
    """
    if datasource_id is None:
        if target_bucket is None:
            raise RuntimeError(
                'Cannot save datasources.json if neither datasource_id '
                'nor target_bucket are specified.')
        datasource_id = 'default'
    else:
        if target_bucket is None:
            target_bucket = gcs_utils.get_hpo_bucket(datasource_id)

    datasource = dict(name=datasource_id, folder=datasource_id, cdmVersion=5)
    datasources = dict(datasources=[datasource])
    datasources_fp = StringIO(json.dumps(datasources))
    result = gcs_utils.upload_object(
        target_bucket, folder_prefix + ACHILLES_EXPORT_DATASOURCES_JSON,
        datasources_fp)
    return result
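# Usage sketch for save_datasources_json, not part of the original module.
# Assumptions: 'fake' is an HPO/datasource ID with a bucket configured in the
# test environment, and the folder prefix is illustrative.
def _example_save_datasources_json():
    # writes datasources.json under the given prefix in the 'fake' HPO bucket
    return save_datasources_json(datasource_id='fake',
                                 folder_prefix='curation_report/')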
def _upload_achilles_files(hpo_id=None, folder_prefix='', target_bucket=None):
    """
    Uploads achilles web files to the corresponding hpo bucket

    :param hpo_id: which hpo bucket these files go into
    :param folder_prefix: relative path in the bucket to upload under
    :param target_bucket: bucket to upload to; if supplied, overrides the hpo_id bucket
    :return: list of upload results
    """
    results = []
    if target_bucket is not None:
        bucket = target_bucket
    else:
        if hpo_id is None:
            raise RuntimeError('either hpo_id or target_bucket must be specified')
        bucket = gcs_utils.get_hpo_bucket(hpo_id)
    logging.info('Uploading achilles index files to `gs://%s/%s`...', bucket,
                 folder_prefix)
    for filename in resources.ACHILLES_INDEX_FILES:
        logging.info('Uploading achilles file `%s` to bucket `%s`' % (filename, bucket))
        bucket_file_name = filename.split(
            resources.resource_path + os.sep)[1].strip().replace('\\', '/')
        with open(filename, 'rb') as fp:
            upload_result = gcs_utils.upload_object(
                bucket, folder_prefix + bucket_file_name, fp)
            results.append(upload_result)
    return results
def save_datasources_json(hpo_id, folder_prefix=""):
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    datasource = dict(name=hpo_id, folder=hpo_id, cdmVersion=5)
    datasources = dict(datasources=[datasource])
    datasources_fp = StringIO.StringIO(json.dumps(datasources))
    result = gcs_utils.upload_object(
        hpo_bucket, folder_prefix + ACHILLES_EXPORT_DATASOURCES_JSON,
        datasources_fp)
    return result
def retract(pids, bucket, found_files, folder_prefix, force_flag):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    :param pids: person_ids to retract
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force_flag: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        table_name = file_name.split(".")[0]
        lines_removed = 0
        if force_flag:
            logger.debug("Attempting to force retract for person_ids %s in path %s/%s%s"
                         % (pids, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            logger.debug("Are you sure you want to retract rows for person_ids %s from path %s/%s%s?"
                         % (pids, bucket, folder_prefix, file_name))
            response = get_response()
        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = StringIO.StringIO()
            input_file_string = gcs_utils.get_object(bucket, folder_prefix + file_name)
            input_contents = input_file_string.split('\n')
            modified_flag = False

            logger.debug("Checking for person_ids %s in path %s/%s%s"
                         % (pids, bucket, folder_prefix, file_name))

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                if input_line != '':
                    if (table_name in PID_IN_COL1 and get_integer(input_line.split(",")[0]) in pids) or \
                            (table_name in PID_IN_COL2 and get_integer(input_line.split(",")[1]) in pids):
                        lines_removed += 1
                        modified_flag = True
                    else:
                        retracted_file_string.write(input_line + '\n')

            # Write result back to bucket
            if modified_flag:
                logger.debug("Retracted %d rows from %s/%s%s"
                             % (lines_removed, bucket, folder_prefix, file_name))
                logger.debug("Overwriting file %s/%s%s"
                             % (bucket, folder_prefix, file_name))
                upload_result = gcs_utils.upload_object(bucket, folder_prefix + file_name,
                                                        retracted_file_string)
                result_list.append(upload_result)
                logger.debug("Retraction successful for file %s/%s%s"
                             % (bucket, folder_prefix, file_name))
            else:
                logger.debug("Skipping file %s/%s%s since pids %s not found"
                             % (bucket, folder_prefix, file_name, pids))
        elif response.lower() == "n":
            logger.debug("Ignoring file %s" % file_name)
    return result_list
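# Usage sketch for retract, not part of the original module. Assumptions: the
# person_ids, bucket, folder, and file list below are illustrative;
# PID_IN_COL1/PID_IN_COL2, get_integer, and get_response come from this module.
def _example_retract_folder():
    pids = [1, 2]  # hypothetical person_ids
    bucket = gcs_utils.get_hpo_bucket('fake')
    folder_prefix = '2019-01-01/'  # hypothetical submission folder
    found_files = ['person.csv', 'observation.csv']
    # force_flag=True skips the interactive confirmation for each file
    return retract(pids, bucket, found_files, folder_prefix, force_flag=True)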
def retract(pid, bucket, found_files, folder_prefix, force):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    :param pid: person_id
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        if force:
            print("Force retracting rows for person_id %s from path %s/%s%s"
                  % (pid, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            print("Are you sure you want to retract rows for person_id %s from path %s/%s%s?"
                  % (pid, bucket, folder_prefix, file_name))
            response = get_response()
        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = StringIO.StringIO()
            input_file_string = gcs_utils.get_object(bucket, folder_prefix + file_name)
            input_contents = input_file_string.split('\n')
            modified_flag = False

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                if input_line != '':
                    if (file_name in PID_IN_COL1 and get_integer(input_line.split(",")[0]) != pid) or \
                            (file_name in PID_IN_COL2 and get_integer(input_line.split(",")[1]) != pid):
                        retracted_file_string.write(input_line + '\n')
                    else:
                        modified_flag = True

            # TODO: return number of lines removed, message if no file in the folder was updated
            # Write result back to bucket
            if modified_flag:
                print("Overwriting file %s/%s%s" % (bucket, folder_prefix, file_name))
                upload_result = gcs_utils.upload_object(bucket, folder_prefix + file_name,
                                                        retracted_file_string)
                result_list.append(upload_result)
            else:
                print("Skipping file %s/%s%s since pid %s not found"
                      % (bucket, folder_prefix, file_name, pid))
        elif response.lower() == "n":
            print("Ignoring file %s" % file_name)
    return result_list
def _upload_file_to_bucket(bucket, dataset_id, path, table):
    app_id = bq_utils.app_identity.get_application_id()
    filename = table + '.csv'
    file_path = os.path.join(path, filename)
    try:
        with open(file_path, 'rb') as fp:
            gcs_utils.upload_object(bucket, filename, fp)
    except OSError:
        test_util.write_cloud_str(bucket, filename, '\n')
    gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                 filename=filename)
    load_results = bq_utils.load_csv(table,
                                     gcs_path,
                                     app_id,
                                     dataset_id,
                                     table,
                                     allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
def _write_string_to_file(bucket, name, string):
    """
    Save the validation results in GCS
    :param bucket: bucket to save to
    :param name: name of the file (object) to save to in GCS
    :param string: the string to write
    :return:
    """
    f = StringIO.StringIO()
    f.write(string)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
def _write_string_to_file(bucket, name, string):
    """
    Save the validation results in GCS
    :param bucket: bucket to save to
    :param name: name of the file (object) to save to in GCS
    :param string: string to write
    :return:
    """
    f = StringIO()
    f.write(string)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
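# Usage sketch for _write_string_to_file, not part of the original module.
# Assumptions: the bucket and object name are illustrative, and StringIO is the
# same in-memory text buffer imported at module level above.
def _example_write_results_header():
    bucket = gcs_utils.get_hpo_bucket('fake')
    # writes a one-line CSV header as the object 'results.csv'
    return _write_string_to_file(bucket, 'results.csv',
                                 '"file_name","found","parsed","loaded"\n')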
def _generate_site():
    """
    Construct html pages for each report, data_model, file_transfer and index.

    :return: 'okay' if successful. Logs a critical error otherwise.
    """
    bucket = gcs_utils.get_drc_bucket()
    for page_name in PAGE_NAMES:
        # generate the page
        html = to_html(page_name)
        # write it to the drc shared bucket
        file_name = page_name + '.html'
        fp = StringIO.StringIO(html)
        gcs_utils.upload_object(bucket, file_name, fp)
    # aggregate result logs and write to bucket
    full_result_log = get_full_result_log()
    content = json.dumps(full_result_log)
    fp = StringIO.StringIO(content)
    gcs_utils.upload_object(bucket, LOG_JSON, fp)
    return 'okay'
def _save_warnings_in_gcs(bucket, name, warnings):
    """
    Save the warnings in GCS
    :param bucket: bucket to save to
    :param name: name of the file (object) to save to in GCS
    :param warnings: list of tuples (<file_name>, <message>)
    :return:
    """
    f = StringIO.StringIO()
    f.write('"file_name","message"\n')
    for (file_name, message) in warnings:
        line = '"%(file_name)s","%(message)s"\n' % locals()
        f.write(line)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
def _save_errors_in_gcs(bucket, name, errors):
    """
    Save errors.csv into hpo bucket

    :bucket: bucket to save in
    :name: file_name to save to
    :errors: list of errors of form (file_name, errors)
    :returns: result of upload operation. not being used for now.
    """
    f = StringIO.StringIO()
    f.write('"file_name","errors"\n')
    for (file_name, message) in errors:
        line = '"%(file_name)s","%(message)s"\n' % locals()
        f.write(line)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
def _save_result_in_gcs(bucket, name, results):
    """
    Save the validation results in GCS
    :param bucket: bucket to save to
    :param name: name of the file (object) to save to in GCS
    :param results: list of tuples (<file_name>, <found>, <parsed>, <loaded>)
    :return:
    """
    f = StringIO.StringIO()
    f.write('"file_name","found","parsed","loaded"\n')
    for (file_name, found, parsed, loaded) in results:
        line = '"%(file_name)s","%(found)s","%(parsed)s","%(loaded)s"\n' % locals()
        f.write(line)
    f.seek(0)
    result = gcs_utils.upload_object(bucket, name, f)
    f.close()
    return result
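# Usage sketch for _save_result_in_gcs, not part of the original module.
# Assumptions: the bucket, object name, and result tuples are illustrative.
# Each tuple mirrors the header written above: (file_name, found, parsed, loaded).
def _example_save_result():
    bucket = gcs_utils.get_hpo_bucket('fake')
    results = [('person.csv', 1, 1, 1), ('visit_occurrence.csv', 0, 0, 0)]
    return _save_result_in_gcs(bucket, 'result.csv', results)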
def _upload_achilles_files(hpo_id, folder_prefix):
    """
    uploads achilles web files to the corresponding hpo bucket

    :hpo_id: which hpo bucket do these files go into
    :returns:
    """
    results = []
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    for filename in common.ACHILLES_INDEX_FILES:
        logging.debug('Uploading achilles file `%s` to bucket `%s`' % (filename, bucket))
        bucket_file_name = filename.split(resources.resource_path + os.sep)[1].strip()
        with open(filename, 'r') as fp:
            upload_result = gcs_utils.upload_object(
                bucket, folder_prefix + bucket_file_name, fp)
            results.append(upload_result)
    return results
def run_export(hpo_id=None, folder_prefix="", target_bucket=None):
    """
    Run export queries for an HPO and store JSON payloads in specified folder in (optional) target bucket

    :param hpo_id: ID of the HPO to run export for. This is the data source name in the report.
    :param folder_prefix: Relative base path to store report. Empty by default.
    :param target_bucket: Bucket to save report. If None, use bucket associated with hpo_id.
    """
    results = []
    # Using separate var rather than hpo_id here because hpo_id None needed in calls below
    datasource_name = 'default'
    if hpo_id is None:
        if target_bucket is None:
            raise RuntimeError(
                'Cannot export if neither hpo_id nor target_bucket is specified.')
    else:
        datasource_name = hpo_id
        if target_bucket is None:
            target_bucket = gcs_utils.get_hpo_bucket(hpo_id)

    logging.info('Exporting %s report to bucket %s', datasource_name,
                 target_bucket)

    # Run export queries and store json payloads in specified folder in the target bucket
    reports_prefix = folder_prefix + ACHILLES_EXPORT_PREFIX_STRING + datasource_name + '/'
    for export_name in common.ALL_REPORTS:
        sql_path = os.path.join(export.EXPORT_PATH, export_name)
        result = export.export_from_path(sql_path, hpo_id)
        content = json.dumps(result)
        fp = StringIO(content)
        result = gcs_utils.upload_object(target_bucket,
                                         reports_prefix + export_name + '.json',
                                         fp)
        results.append(result)
    result = save_datasources_json(hpo_id=hpo_id,
                                   folder_prefix=folder_prefix,
                                   target_bucket=target_bucket)
    results.append(result)
    return results
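# Usage sketch for run_export, not part of the original module. Assumptions:
# 'fake' is an HPO ID configured in the test environment and the folder prefix
# is illustrative. Leaving target_bucket as None exports to the HPO's own bucket.
def _example_run_export():
    return run_export(hpo_id='fake', folder_prefix='curation_report/')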
def run_export(hpo_id, folder_prefix):
    """
    Run export queries for the given HPO and upload each JSON payload to its
    bucket; this also updates the datasources.json file.
    """
    results = []
    logging.info('running export for hpo_id %s' % hpo_id)
    # TODO : add check for required tables
    hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
    _reports_prefix = ACHILLES_EXPORT_PREFIX_STRING + hpo_id + "/"
    for export_name in common.ALL_REPORTS:
        sql_path = os.path.join(export.EXPORT_PATH, export_name)
        result = export.export_from_path(sql_path, hpo_id)
        content = json.dumps(result)
        fp = StringIO.StringIO(content)
        result = gcs_utils.upload_object(
            hpo_bucket, folder_prefix + _reports_prefix + export_name + '.json',
            fp)
        results.append(result)
    datasources_json_result = save_datasources_json(hpo_id, folder_prefix)
    results.append(datasources_json_result)
    return results