def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_rdr_dataset_id()
    self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
    sandbox.check_and_create_sandbox_dataset(self.project_id, self.dataset_id)
def setUp(self):
    self.app_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.bucket = gcs_utils.get_drc_bucket()
    test_util.empty_bucket(self.bucket)
    test_util.delete_all_tables(self.dataset_id)
    self.load_test_data(hpo_id=HPO_NYC)
def mapping_query(table_name, hpo_ids, dataset_id=None, project_id=None):
    """
    Get query used to generate new ids for a CDM table

    :param table_name: name of CDM table
    :param hpo_ids: identifies the HPOs
    :param dataset_id: identifies the BQ dataset containing the input table
    :param project_id: identifies the GCP project containing the dataset
    :return: the query
    """
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if project_id is None:
        project_id = app_identity.get_application_id()
    subqueries = _mapping_subqueries(table_name, hpo_ids, dataset_id,
                                     project_id)
    union_all_query = UNION_ALL.join(subqueries)
    return '''
    WITH all_{table_name} AS (
    {union_all_query}
    )
    SELECT DISTINCT
        src_table_id,
        src_{table_name}_id,
        {table_name}_id,
        SUBSTR(src_table_id, 1, STRPOS(src_table_id, "_{table_name}")-1) AS src_hpo_id
    FROM all_{table_name}
    '''.format(union_all_query=union_all_query, table_name=table_name)
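# Minimal usage sketch for mapping_query. The HPO ids, dataset, and project
# below are placeholders for illustration only; the function simply returns
# SQL text, which the caller can run through whatever BigQuery query helper
# the surrounding module provides.
mapping_sql = mapping_query(table_name='person',
                            hpo_ids=['fake_1', 'fake_2'],
                            dataset_id='fake_dataset',
                            project_id='fake-project-id')
print(mapping_sql)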
def test_update_site_masking_table(self, mock_get_client):
    # Preconditions
    project_id = app_identity.get_application_id()
    sandbox_id = PIPELINE_TABLES + '_sandbox'
    mock_query = mock_get_client.return_value.query

    # Mocks the job return
    query_job_reference_results = mock.MagicMock(
        name="query_job_reference_results")
    query_job_reference_results.return_value = query_job_reference_results
    mock_query.side_effect = query_job_reference_results

    # Test
    actual_job = add_hpo.update_site_masking_table()

    # Post conditions
    update_site_masking_query = add_hpo.UPDATE_SITE_MASKING_QUERY.render(
        project_id=project_id,
        dataset_id=PIPELINE_TABLES,
        sandbox_id=sandbox_id,
        table_id=SITE_MASKING_TABLE_ID,
        lookup_tables_dataset=bq_consts.LOOKUP_TABLES_DATASET_ID,
        hpo_site_id_mappings_table=bq_consts.HPO_SITE_ID_MAPPINGS_TABLE_ID)

    expected_job = query_job_reference_results

    mock_query.assert_any_call(update_site_masking_query)
    self.assertEqual(actual_job, expected_job)
def setUp(self):
    self.project_id = get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.sandbox_dataset_id = sandbox.check_and_create_sandbox_dataset(
        self.project_id, self.dataset_id)
    self.client = bq.get_client(self.project_id)
    self.delete_sandbox()
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = 'fake_dataset'
    self.description = 'Test dataset created for testing BQ'
    self.label_or_tag = {'test': 'bq'}
    # Remove dataset if it already exists
    bq.delete_dataset(self.project_id, self.dataset_id)
def get_lab_concept_summary_query(hpo_id):
    """
    Get the query that checks if the HPO site has submitted the required labs

    :param hpo_id: identifies the HPO site
    :return: the query
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    hpo_measurement_table = bq_utils.get_table_id(hpo_id, common.MEASUREMENT)

    # Create measurement_concept_sets_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_TABLE, dataset_id):
        load_measurement_concept_sets_table(project_id, dataset_id)

    # Create measurement_concept_sets_descendants_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                                 dataset_id):
        load_measurement_concept_sets_descendants_table(project_id, dataset_id)

    return CHECK_REQUIRED_LAB_QUERY.format(
        project_id=project_id,
        ehr_ops_dataset_id=dataset_id,
        hpo_measurement_table=hpo_measurement_table,
        measurement_concept_sets_descendants=
        MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE)
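# Usage sketch, assuming 'fake' is a placeholder HPO id as in the tests above.
# The return value is a formatted BigQuery query string; nothing is executed here.
required_lab_query = get_lab_concept_summary_query('fake')
print(required_lab_query)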
def large_response_to_rowlist(query_response):
    """
    Convert a query response to a list of dictionary objects

    This automatically uses the pageToken feature to iterate through a
    large result set. Use cautiously.

    :param query_response: the query response object to iterate
    :return: list of dictionaries
    """
    bq_service = create_service()
    app_id = app_identity.get_application_id()

    page_token = query_response.get(bq_consts.PAGE_TOKEN)
    job_ref = query_response.get(bq_consts.JOB_REFERENCE)
    job_id = job_ref.get(bq_consts.JOB_ID)

    result_list = response2rows(query_response)
    while page_token:
        next_grouping = bq_service.jobs() \
            .getQueryResults(projectId=app_id, jobId=job_id, pageToken=page_token) \
            .execute(num_retries=bq_consts.BQ_DEFAULT_RETRY_COUNT)
        page_token = next_grouping.get(bq_consts.PAGE_TOKEN)
        intermediate_rows = response2rows(next_grouping)
        result_list.extend(intermediate_rows)

    return result_list
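# Usage sketch. This assumes a query() helper in the same module that returns
# the raw jobs.query response dict (the object large_response_to_rowlist
# expects); substitute whatever produces that response in your codebase, and
# treat the dataset/table names as placeholders.
response = query('SELECT person_id FROM fake_dataset.person')  # assumed helper
rows = large_response_to_rowlist(response)
for row in rows:
    print(row)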
def load_cdm_csv(hpo_id, cdm_table_name, source_folder_prefix="",
                 dataset_id=None):
    """
    Load CDM file from a bucket into a table in bigquery

    :param hpo_id: ID for the HPO site
    :param cdm_table_name: name of the CDM table
    :param source_folder_prefix: folder prefix within the bucket prepended to
        the CSV object name (optional)
    :param dataset_id: identifies the BQ dataset to load into (EHR dataset by
        default)
    :return: an object describing the associated bigquery job
    """
    if cdm_table_name not in resources.CDM_TABLES:
        raise ValueError(
            '{} is not a valid table to load'.format(cdm_table_name))

    app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = get_dataset_id()
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    fields_filename = os.path.join(resources.fields_path,
                                   cdm_table_name + '.json')
    gcs_object_path = 'gs://%s/%s%s.csv' % (bucket, source_folder_prefix,
                                            cdm_table_name)
    table_id = get_table_id(hpo_id, cdm_table_name)
    allow_jagged_rows = cdm_table_name == 'observation'
    return load_csv(fields_filename,
                    gcs_object_path,
                    app_id,
                    dataset_id,
                    table_id,
                    allow_jagged_rows=allow_jagged_rows)
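# Usage sketch, assuming 'fake' and 'person' mirror the placeholder HPO id and
# CDM table name used in the tests above, and that the returned object is the
# raw job-insert response (the same shape merge_tables reads below).
load_job = load_cdm_csv('fake', 'person')
job_id = load_job[bq_consts.JOB_REFERENCE][bq_consts.JOB_ID]
incomplete_jobs = wait_on_jobs([job_id])  # wait_on_jobs as used in merge_tables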
def list_tables(dataset_id=None,
                max_results=bq_consts.LIST_TABLES_MAX_RESULTS,
                project_id=None):
    """
    List all the tables in the dataset

    :param dataset_id: dataset to list tables for (EHR dataset by default)
    :param max_results: maximum number of results to return
    :param project_id: identifies the GCP project containing the dataset
    :return: a list of table resources; for the structure of each item see
        https://goo.gl/Z17MWs

    Example:
        for table in list_tables():
            print(table['id'])
    """
    bq_service = create_service()
    if project_id is None:
        app_id = app_identity.get_application_id()
    else:
        app_id = project_id
    if dataset_id is None:
        dataset_id = get_dataset_id()
    results = []
    request = bq_service.tables().list(projectId=app_id,
                                       datasetId=dataset_id,
                                       maxResults=max_results)
    while request is not None:
        response = request.execute(
            num_retries=bq_consts.BQ_DEFAULT_RETRY_COUNT)
        tables = response.get('tables', [])
        results.extend(tables or [])
        request = bq_service.tables().list_next(request, response)
    return results
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.description = 'Unioned test dataset'
    self.label_or_tag = {'test': 'bq'}
    # Remove dataset if it already exists
    bq.delete_dataset(self.project_id, self.dataset_id)
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.hpo_id_1 = 'fake_1'
    self.site_name_1 = 'Fake Site Name 1'
    self.email_1 = [
        'fake.email.1@site_1.fakedomain', 'fake.email.2@site_1.fakedomain'
    ]
    self.hpo_id_2 = 'fake_2'
    self.hpo_id_3 = 'fake_3'
    self.hpo_id_4 = 'fake_4'
    self.hpo_id_5 = 'fake_5'
    self.bucket = 'fake'
    self.folder = 'fake_folder'
    self.fake_html_path = f"gs://{self.bucket}/{self.folder}/results.html"
    self.report_data = {
        'folder': self.folder,
        'timestamp': get_eastern_time(),
        'submission_error': False
    }
    self.expected_mail_to_1 = [{
        'email': self.email_1[0],
        'type': 'to'
    }, {
        'email': self.email_1[1],
        'type': 'to'
    }, {
        'email': consts.DATA_CURATION_LISTSERV,
        'type': 'cc'
    }]
def setup_from_request(self, _request, initial=False):
    """
    Gather everything we need to log from the request object.

    :param _request: Flask request object
    :param initial: Is this the beginning of a request? If not, the Flask
        'begin_request' call failed.
    """
    # Send any pending log entries in case 'end_request' was not called.
    if len(self._buffer) and initial:
        self.finalize()

    self._start_time = datetime.now(timezone.utc).isoformat()
    self._request_method = _request.method
    self._request_resource = _request.full_path
    if self._request_resource and self._request_resource.endswith('?'):
        self._request_resource = self._request_resource[:-1]
    self._request_agent = str(_request.user_agent)
    self._request_remote_addr = _request.headers.get(
        'X-Appengine-User-Ip', _request.remote_addr)
    self._request_host = _request.headers.get(
        'X-Appengine-Default-Version-Hostname', _request.host)
    self._request_log_id = _request.headers.get('X-Appengine-Request-Log-Id',
                                                'None')
    self._request_taskname = _request.headers.get('X-Appengine-Taskname', None)
    self._request_queue = _request.headers.get('X-Appengine-Queuename', None)

    trace_id = _request.headers.get('X-Cloud-Trace-Context', '')
    if trace_id:
        trace_id = trace_id.split('/')[0]
        trace = 'projects/{0}/traces/{1}'.format(
            app_identity.get_application_id(), trace_id)
        self._trace = trace
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset = os.environ.get('UNIONED_DATASET_ID')
    self.client = get_client(self.project_id)
    self.table_id = 'fake'
    self.final_table = 'steps_intraday'
    self.view_id = f'view_{self.final_table}'
    self.test_tables = [self.table_id, self.view_id, self.final_table]
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.bucket: str = gcs_utils.get_drc_bucket()
    self.storage_client = StorageClient(self.project_id)
    self.storage_client.empty_bucket(self.bucket)
    test_util.delete_all_tables(self.dataset_id)
    self.load_test_data(hpo_id=HPO_NYC)
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.sandbox_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
    # Removing any existing datasets that might interfere with the test
    self.client = get_client(self.project_id)
    self.client.delete_dataset(f'{self.project_id}.{self.sandbox_id}',
                               delete_contents=True,
                               not_found_ok=True)
def setUp(self):
    self.hpo_id = 'fake'
    self.project_id = 'fake-project-id'
    self.test_project_id = app_identity.get_application_id()
    self.pid_table_id = 'pid_table'
    self.bq_dataset_id = bq_utils.get_unioned_dataset_id()
    self.dataset_ids = 'all_datasets'
    self.person_research_ids = [(1, 6890173), (2, 858761), (1234567, 4589763)]
def wrapped(*args, **kwargs):
    appid = app_identity.get_application_id()
    # Only enforce HTTPS and auth for external requests; requests made for
    # data generation are allowed through (when enabled).
    if not _is_self_request():
        if request.scheme.lower() != 'https' and appid not in (
                'None', 'testbed-test', 'testapp'):
            raise Unauthorized('HTTPS is required for %r' % appid)
        check_auth(role_whitelist)
    return func(*args, **kwargs)
def get_job_details(job_id):
    """
    Get job resource corresponding to job_id

    :param job_id: id of the job to get (i.e. `jobReference.jobId` in response
        body of insert request)
    :returns: the job resource (for details see https://goo.gl/bUE49Z)
    """
    bq_service = create_service()
    app_id = app_identity.get_application_id()
    return bq_service.jobs().get(
        projectId=app_id,
        jobId=job_id).execute(num_retries=bq_consts.BQ_DEFAULT_RETRY_COUNT)
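# Usage sketch: check the status of a previously submitted job. Assumes job_id
# came from an earlier insert (e.g. the load job returned by load_cdm_csv);
# the 'status'/'errorResult'/'errors' keys are the same ones merge_tables
# inspects below.
job = get_job_details(job_id)
if 'errorResult' in job['status']:
    print(job['status']['errors'])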
def setUp(self):
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.person_table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON)
    self.dataset_id = bq_utils.get_dataset_id()
    test_util.delete_all_tables(self.dataset_id)
    self.project_id = app_identity.get_application_id()
    self.TEST_FIELDS = [
        {
            "type": "integer",
            "name": "integer_field",
            "mode": "required",
            "description": "An integer field"
        },
        # DC-586 Import RDR rules should support null fields
        {
            "type": "integer",
            "name": "nullable_integer_field",
            "mode": "nullable",
            "description": "A nullable integer field"
        },
        {
            "type": "string",
            "name": "string_field",
            "mode": "required",
            "description": "A string field"
        },
        {
            "type": "date",
            "name": "date_field",
            "mode": "required",
            "description": "A date field"
        },
        {
            "type": "timestamp",
            "name": "timestamp_field",
            "mode": "required",
            "description": "A timestamp field"
        },
        {
            "type": "boolean",
            "name": "boolean_field",
            "mode": "required",
            "description": "A boolean field"
        },
        {
            "type": "float",
            "name": "float_field",
            "mode": "required",
            "description": "A float field"
        }
    ]
    self.DT_FORMAT = '%Y-%m-%d %H:%M:%S'
    self.client = StorageClient(self.project_id)
    self.client.empty_bucket(self.hpo_bucket)
def setUp(self):
    self.hpo_id = 'fake'
    self.project_id = 'fake-project-id'
    self.test_project_id = app_identity.get_application_id()
    self.pid_table_id = 'pid_table'
    self.bq_dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.client = bq.get_client(self.test_project_id)
    self.dataset_ids = 'all_datasets'
    self.retraction_type = 'only_ehr'
    self.person_research_ids = [(1, 6890173), (2, 858761), (1234567, 4589763)]
def list_dataset_contents(dataset_id):
    """
    List the ids of the tables contained in a dataset

    :param dataset_id: identifies the BQ dataset
    :return: list of table ids in the dataset
    """
    project_id = app_identity.get_application_id()
    service = create_service()
    req = service.tables().list(projectId=project_id, datasetId=dataset_id)
    all_tables = []
    while req:
        resp = req.execute()
        items = [item['id'].split('.')[-1] for item in resp.get('tables', [])]
        all_tables.extend(items or [])
        req = service.tables().list_next(req, resp)
    return all_tables
def setUp(self):
    self.project_id = app_identity.get_application_id()
    # this ensures the dataset is scoped appropriately in test and also
    # can be dropped in teardown (tests should not delete env resources)
    unioned_dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.dataset_id = f'{unioned_dataset_id}_bq_test'
    self.description = f'Dataset for {__name__} integration tests'
    self.label_or_tag = {'test': 'bq'}
    self.client = bq.get_client(self.project_id)
    self.dataset_ref = bigquery.dataset.DatasetReference(
        self.project_id, self.dataset_id)
def setUp(self):
    self.project_id = app_identity.get_application_id()
    if 'test' not in self.project_id:
        raise RuntimeError(
            f"Make sure the project_id is set to test. Project_id is {self.project_id}"
        )
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.deact_dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.client = bq.get_client(self.project_id)
    self.bq_sandbox_dataset_id = sb.get_sandbox_dataset_id(self.dataset_id)
    self.tables = {**TABLE_ROWS, **MAPPING_TABLE_ROWS, **EXT_TABLE_ROWS}
    self.setup_data()
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.client = StorageClient(self.project_id)
    self.bucket_name: str = os.environ.get('BUCKET_NAME_FAKE')
    self.prefix: str = 'prefix'
    self.data: bytes = b'bytes'

    # NOTE: this needs to be in sorted order
    self.sub_prefixes: tuple = (f'{self.prefix}/a', f'{self.prefix}/b',
                                f'{self.prefix}/c', f'{self.prefix}/d')
    self.client.empty_bucket(self.bucket_name)
    self._stage_bucket()
def bucket_access_configured(bucket_name: str) -> bool:
    """
    Determine if the service account has appropriate permissions on the bucket

    :param bucket_name: identifies the GCS bucket
    :return: True if the service account has appropriate permissions, False otherwise
    """
    project_id = app_identity.get_application_id()
    sc = StorageClient(project_id)
    bucket = sc.get_bucket(bucket_name)
    permissions: list = bucket.test_iam_permissions("storage.objects.create")
    return len(permissions) >= 1
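# Usage sketch, assuming 'fake' matches the placeholder bucket name used in the
# tests above. Callers can gate uploads on the result before copying files.
if not bucket_access_configured('fake'):
    logging.warning("Service account cannot create objects in bucket 'fake'")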
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.assertIn('test', self.project_id)
    self.hpo_id = 'fake'
    self.site_name = 'Fake Site Name'
    self.bucket = 'fake'
    self.folder = 'fake_folder'
    self.fake_uri_path = f"https://console.cloud.google.com/storage/{self.bucket}/{self.folder}"
    self.report_data = {
        'folder': self.folder,
        'timestamp': get_eastern_time(),
        'submission_error': False
    }
def merge_tables(source_dataset_id, source_table_id_list,
                 destination_dataset_id, destination_table_id):
    """
    Takes a list of table names and runs a copy job

    :param source_dataset_id: dataset where all the source tables reside
    :param source_table_id_list: list of tables to merge
    :param destination_dataset_id: dataset where the destination table resides
    :param destination_table_id: data goes into this table
    :return: (True, "") if successful, or (False, error message) if the job
        fails or takes too long
    """
    app_id = app_identity.get_application_id()
    source_tables = [{
        "projectId": app_id,
        "datasetId": source_dataset_id,
        "tableId": table_name
    } for table_name in source_table_id_list]
    job_body = {
        'configuration': {
            "copy": {
                "sourceTables": source_tables,
                "destinationTable": {
                    "projectId": app_id,
                    "datasetId": destination_dataset_id,
                    "tableId": destination_table_id
                },
                "writeDisposition": "WRITE_TRUNCATE",
            }
        }
    }
    bq_service = create_service()
    insert_result = bq_service.jobs().insert(
        projectId=app_id,
        body=job_body).execute(num_retries=bq_consts.BQ_DEFAULT_RETRY_COUNT)
    job_id = insert_result[bq_consts.JOB_REFERENCE][bq_consts.JOB_ID]
    incomplete_jobs = wait_on_jobs([job_id])

    if len(incomplete_jobs) == 0:
        job_status = get_job_details(job_id)['status']
        if 'errorResult' in job_status:
            error_messages = [
                '{}'.format(item['message']) for item in job_status['errors']
            ]
            logging.info(' || '.join(error_messages))
            return False, ' || '.join(error_messages)
    else:
        logging.info("Wait timeout exceeded before copy job with id '%s' was "
                     "done" % job_id)
        return False, "Job timeout"
    return True, ""
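# Usage sketch for merge_tables; the dataset and table ids below are
# placeholders. The function reports failure through its (success, message)
# return value rather than by raising.
success, message = merge_tables('fake_source_dataset',
                                ['fake_1_person', 'fake_2_person'],
                                'fake_destination_dataset', 'merged_person')
if not success:
    logging.error(f"Merge failed: {message}")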
def process_hpo_copy(hpo_id):
    """
    Copies over files from an HPO bucket to the DRC bucket

    :param hpo_id: hpo from which to copy
    """
    try:
        project_id = app_identity.get_application_id()
        storage_client = StorageClient(project_id)

        hpo_bucket = storage_client.get_hpo_bucket(hpo_id)
        drc_private_bucket = storage_client.get_drc_bucket()
        source_bucket = storage_client.bucket(hpo_bucket)
        destination_bucket = storage_client.bucket(drc_private_bucket)
        bucket_items = list_bucket(hpo_bucket)

        ignored_items = 0
        filtered_bucket_items = []
        for item in bucket_items:
            item_root = item['name'].split('/')[0] + '/'
            if item_root.lower() in common.IGNORE_DIRECTORIES:
                ignored_items += 1
            else:
                filtered_bucket_items.append(item)

        logging.info(f"Ignoring {ignored_items} items in {hpo_bucket}")

        prefix = f'{hpo_id}/{hpo_bucket}/'

        for item in filtered_bucket_items:
            item_name = item['name']
            source_blob = source_bucket.get_blob(item_name)
            destination_blob_name = f'{prefix}{item_name}'
            source_bucket.copy_blob(source_blob, destination_bucket,
                                    destination_blob_name)
    except BucketDoesNotExistError as bucket_error:
        bucket = bucket_error.bucket
        # App engine converts an env var that is set but left empty to the
        # string 'None'
        if bucket and bucket.lower() != 'none':
            logging.warning(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' does not exist"
            )
        else:
            logging.info(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' is empty/unset"
            )
    except HttpError as http_error:
        message = (
            f"Failed to copy files for hpo_id '{hpo_id}' due to the following "
            f"HTTP error: {http_error.content.decode()}")
        logging.exception(message)
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.staging_dataset_id = f'{self.dataset_id}_staging'
    self.bucket = os.environ.get('BUCKET_NAME_FAKE')
    self.bq_client = bigquery.Client(project=self.project_id)
    self.gcs_client = storage.Client(project=self.project_id)

    self.test_vocab_folder_path = Path(TEST_VOCABULARY_PATH)
    self.test_vocabs = [CONCEPT, VOCABULARY]

    self.contents = {}
    for vocab in self.test_vocabs:
        vocab_path = self.test_vocab_folder_path / lv._table_name_to_filename(
            vocab)
        with vocab_path.open('r') as f:
            self.contents[vocab] = f.read()