def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_rdr_dataset_id()
    self.sandbox_dataset_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
    sandbox.check_and_create_sandbox_dataset(self.project_id, self.dataset_id)
Example #2
def setUp(self):
    self.app_id = app_identity.get_application_id()
    self.dataset_id = bq_utils.get_dataset_id()
    self.bucket = gcs_utils.get_drc_bucket()
    test_util.empty_bucket(self.bucket)
    test_util.delete_all_tables(self.dataset_id)
    self.load_test_data(hpo_id=HPO_NYC)
Example #3
def mapping_query(table_name, hpo_ids, dataset_id=None, project_id=None):
    """
    Get query used to generate new ids for a CDM table

    :param table_name: name of CDM table
    :param hpo_ids: identifies the HPOs
    :param dataset_id: identifies the BQ dataset containing the input table
    :param project_id: identifies the GCP project containing the dataset
    :return: the query
    """
    if dataset_id is None:
        dataset_id = bq_utils.get_dataset_id()
    if project_id is None:
        project_id = app_identity.get_application_id()
    subqueries = _mapping_subqueries(table_name, hpo_ids, dataset_id,
                                     project_id)
    union_all_query = UNION_ALL.join(subqueries)
    return '''
    WITH all_{table_name} AS (
      {union_all_query}
    )
    SELECT DISTINCT
        src_table_id,
        src_{table_name}_id,
        {table_name}_id,
        SUBSTR(src_table_id, 1, STRPOS(src_table_id, "_{table_name}")-1) AS src_hpo_id
    FROM all_{table_name}
    '''.format(union_all_query=union_all_query, table_name=table_name)
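A usage sketch (the table name and HPO ids are illustrative, and bq_utils.query is assumed to execute a query string):

# Hypothetical usage of mapping_query() for the visit_occurrence table.
hpo_ids = ['fake_site_a', 'fake_site_b']  # illustrative site ids
q = mapping_query('visit_occurrence', hpo_ids)
response = bq_utils.query(q)  # assumed helper that runs the query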
Example #4
    def test_update_site_masking_table(self, mock_get_client):
        # Preconditions
        project_id = app_identity.get_application_id()
        sandbox_id = PIPELINE_TABLES + '_sandbox'

        mock_query = mock_get_client.return_value.query

        # Mocks the job return
        query_job_reference_results = mock.MagicMock(
            name="query_job_reference_results")
        query_job_reference_results.return_value = query_job_reference_results
        mock_query.side_effect = query_job_reference_results

        # Test
        actual_job = add_hpo.update_site_masking_table()

        # Post conditions
        update_site_masking_query = add_hpo.UPDATE_SITE_MASKING_QUERY.render(
            project_id=project_id,
            dataset_id=PIPELINE_TABLES,
            sandbox_id=sandbox_id,
            table_id=SITE_MASKING_TABLE_ID,
            lookup_tables_dataset=bq_consts.LOOKUP_TABLES_DATASET_ID,
            hpo_site_id_mappings_table=bq_consts.HPO_SITE_ID_MAPPINGS_TABLE_ID)

        expected_job = query_job_reference_results

        mock_query.assert_any_call(update_site_masking_query)

        self.assertEqual(actual_job, expected_job)
Example #5
def setUp(self):
    self.project_id = get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.sandbox_dataset_id = sandbox.check_and_create_sandbox_dataset(
        self.project_id, self.dataset_id)
    self.client = bq.get_client(self.project_id)
    self.delete_sandbox()
Example #6
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = 'fake_dataset'
    self.description = 'Test dataset created for testing BQ'
    self.label_or_tag = {'test': 'bq'}
    # Remove dataset if it already exists
    bq.delete_dataset(self.project_id, self.dataset_id)
Example #7
def get_lab_concept_summary_query(hpo_id):
    """
    Get the query that checks if the HPO site has submitted the required labs
    :param hpo_id: identifies the HPO site
    :return: the query string
    """
    project_id = app_identity.get_application_id()
    dataset_id = bq_utils.get_dataset_id()
    hpo_measurement_table = bq_utils.get_table_id(hpo_id, common.MEASUREMENT)

    # Create measurement_concept_sets_table if not exist
    # Create measurement_concept_sets_table if it does not exist
        load_measurement_concept_sets_table(project_id, dataset_id)

    # Create measurement_concept_sets_descendants_table if it does not exist
    if not bq_utils.table_exists(MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE,
                                 dataset_id):
        load_measurement_concept_sets_descendants_table(project_id, dataset_id)

    return CHECK_REQUIRED_LAB_QUERY.format(
        project_id=project_id,
        ehr_ops_dataset_id=dataset_id,
        hpo_measurement_table=hpo_measurement_table,
        measurement_concept_sets_descendants=
        MEASUREMENT_CONCEPT_SETS_DESCENDANTS_TABLE)
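For illustration, the returned query can be run and paginated with helpers shown elsewhere in this section (the hpo_id is made up, and bq_utils.query is assumed to execute a query string):

# Hypothetical: check the required-lab submission status for one site.
query = get_lab_concept_summary_query('fake_hpo')
response = bq_utils.query(query)  # assumed query helper
rows = large_response_to_rowlist(response)  # paginates; see Example #8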
Example #8
def large_response_to_rowlist(query_response):
    """
    Convert a query response to a list of dictionary objects

    This automatically uses the pageToken feature to iterate through a
    large result set.  Use cautiously.

    :param query_response: the query response object to iterate
    :return: list of dictionaries
    """
    bq_service = create_service()
    app_id = app_identity.get_application_id()

    page_token = query_response.get(bq_consts.PAGE_TOKEN)
    job_ref = query_response.get(bq_consts.JOB_REFERENCE)
    job_id = job_ref.get(bq_consts.JOB_ID)

    result_list = response2rows(query_response)
    while page_token:
        next_grouping = bq_service.jobs() \
            .getQueryResults(projectId=app_id, jobId=job_id, pageToken=page_token) \
            .execute(num_retries=bq_consts.BQ_DEFAULT_RETRY_COUNT)
        page_token = next_grouping.get(bq_consts.PAGE_TOKEN)
        intermediate_rows = response2rows(next_grouping)
        result_list.extend(intermediate_rows)

    return result_list
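A sketch of the intended call pattern, assuming bq_utils.query() returns the raw response dictionary this function expects (the query is illustrative):

# Run a query whose result may span several pages, then let
# large_response_to_rowlist() follow pageToken until the rows are exhausted.
response = bq_utils.query('SELECT person_id FROM person')  # illustrative
rows = large_response_to_rowlist(response)
print(len(rows))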
Example #9
def load_cdm_csv(hpo_id,
                 cdm_table_name,
                 source_folder_prefix="",
                 dataset_id=None):
    """
    Load CDM file from a bucket into a table in bigquery
    :param hpo_id: ID for the HPO site
    :param cdm_table_name: name of the CDM table
    :return: an object describing the associated bigquery job
    """
    if cdm_table_name not in resources.CDM_TABLES:
        raise ValueError(
            '{} is not a valid table to load'.format(cdm_table_name))

    app_id = app_identity.get_application_id()
    if dataset_id is None:
        dataset_id = get_dataset_id()
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    fields_filename = os.path.join(resources.fields_path,
                                   cdm_table_name + '.json')
    gcs_object_path = 'gs://%s/%s%s.csv' % (bucket, source_folder_prefix,
                                            cdm_table_name)
    table_id = get_table_id(hpo_id, cdm_table_name)
    allow_jagged_rows = cdm_table_name == 'observation'
    return load_csv(fields_filename,
                    gcs_object_path,
                    app_id,
                    dataset_id,
                    table_id,
                    allow_jagged_rows=allow_jagged_rows)
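A hedged usage sketch, assuming load_csv() returns the job resource from the insert request; the hpo_id is illustrative and wait_on_jobs mirrors its use in Example #28:

# Hypothetical: load the staged person.csv for the fake site and wait
# for the resulting load job to finish.
job = load_cdm_csv('fake', 'person')
job_id = job[bq_consts.JOB_REFERENCE][bq_consts.JOB_ID]
incomplete = wait_on_jobs([job_id])  # assumed importable from the same module
assert not incomplete, 'load job timed out'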
Example #10
def list_tables(dataset_id=None,
                max_results=bq_consts.LIST_TABLES_MAX_RESULTS,
                project_id=None):
    """
    List all the tables in the dataset

    :param dataset_id: dataset to list tables for (EHR dataset by default)
    :param max_results: maximum number of results to return
    :return: an object with the structure described at https://goo.gl/Z17MWs

    Example:
      result = list_tables()
      for table in result['tables']:
          print table['id']
    """
    bq_service = create_service()
    if project_id is None:
        app_id = app_identity.get_application_id()
    else:
        app_id = project_id
    if dataset_id is None:
        dataset_id = get_dataset_id()
    results = []
    request = bq_service.tables().list(projectId=app_id,
                                       datasetId=dataset_id,
                                       maxResults=max_results)
    while request is not None:
        response = request.execute(
            num_retries=bq_consts.BQ_DEFAULT_RETRY_COUNT)
        tables = response.get('tables', [])
        results.extend(tables or [])
        request = bq_service.tables().list_next(request, response)
    return results
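Usage sketch against a named dataset (the dataset name is illustrative):

# List the tables of a specific dataset instead of the EHR default.
for table in list_tables(dataset_id='unioned_ehr'):
    print(table['id'])  # e.g. 'my-project:unioned_ehr.person'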
Example #11
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.description = 'Unioned test dataset'
    self.label_or_tag = {'test': 'bq'}
    # Remove dataset if it already exists
    bq.delete_dataset(self.project_id, self.dataset_id)
Example #12
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.hpo_id_1 = 'fake_1'
    self.site_name_1 = 'Fake Site Name 1'
    self.email_1 = [
        'fake.email.1@site_1.fakedomain', 'fake.email.2@site_1.fakedomain'
    ]
    self.hpo_id_2 = 'fake_2'
    self.hpo_id_3 = 'fake_3'
    self.hpo_id_4 = 'fake_4'
    self.hpo_id_5 = 'fake_5'
    self.bucket = 'fake'
    self.folder = 'fake_folder'
    self.fake_html_path = f"gs://{self.bucket}/{self.folder}/results.html"
    self.report_data = {
        'folder': self.folder,
        'timestamp': get_eastern_time(),
        'submission_error': False
    }
    self.expected_mail_to_1 = [{
        'email': self.email_1[0],
        'type': 'to'
    }, {
        'email': self.email_1[1],
        'type': 'to'
    }, {
        'email': consts.DATA_CURATION_LISTSERV,
        'type': 'cc'
    }]
Example #13
    def setup_from_request(self, _request, initial=False):
        """
        Gather everything we need to log from the request object.
        :param _request: Flask request object
        :param initial: Is this the beginning of a request? If not, the flask 'begin_request' hook failed to run.
        """
        # send any pending log entries in case 'end_request' was not called.
        if self._buffer and initial:
            self.finalize()

        self._start_time = datetime.now(timezone.utc).isoformat()
        self._request_method = _request.method
        self._request_resource = _request.full_path
        if self._request_resource and self._request_resource.endswith('?'):
            self._request_resource = self._request_resource[:-1]
        self._request_agent = str(_request.user_agent)
        self._request_remote_addr = _request.headers.get(
            'X-Appengine-User-Ip', _request.remote_addr)
        self._request_host = _request.headers.get(
            'X-Appengine-Default-Version-Hostname', _request.host)
        self._request_log_id = _request.headers.get(
            'X-Appengine-Request-Log-Id', 'None')

        self._request_taskname = _request.headers.get('X-Appengine-Taskname',
                                                      None)
        self._request_queue = _request.headers.get('X-Appengine-Queuename',
                                                   None)

        trace_id = _request.headers.get('X-Cloud-Trace-Context', '')
        if trace_id:
            trace_id = trace_id.split('/')[0]
            trace = 'projects/{0}/traces/{1}'.format(
                app_identity.get_application_id(), trace_id)
            self._trace = trace
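A minimal sketch of how this method might be wired into Flask's request lifecycle; the app and the logger class name are assumptions, not part of the source:

from flask import Flask, request

app = Flask(__name__)
request_logger = GCPRequestLogger()  # hypothetical class owning setup_from_request()

@app.before_request
def begin_request():
    # Capture method, path, agent, trace id, etc. at the start of each request.
    request_logger.setup_from_request(request, initial=True)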
Example #14
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset = os.environ.get('UNIONED_DATASET_ID')
    self.client = get_client(self.project_id)
    self.table_id = 'fake'
    self.final_table = 'steps_intraday'
    self.view_id = f'view_{self.final_table}'
    self.test_tables = [self.table_id, self.view_id, self.final_table]
Example #15
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.dataset_id = bq_utils.get_dataset_id()
        self.bucket: str = gcs_utils.get_drc_bucket()
        self.storage_client = StorageClient(self.project_id)

        self.storage_client.empty_bucket(self.bucket)
        test_util.delete_all_tables(self.dataset_id)
        self.load_test_data(hpo_id=HPO_NYC)
Example #16
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.sandbox_id = sandbox.get_sandbox_dataset_id(self.dataset_id)
    # Removing any existing datasets that might interfere with the test
    self.client = get_client(self.project_id)
    self.client.delete_dataset(f'{self.project_id}.{self.sandbox_id}',
                               delete_contents=True,
                               not_found_ok=True)
Example #17
def setUp(self):
    self.hpo_id = 'fake'
    self.project_id = 'fake-project-id'
    self.test_project_id = app_identity.get_application_id()
    self.pid_table_id = 'pid_table'
    self.bq_dataset_id = bq_utils.get_unioned_dataset_id()
    self.dataset_ids = 'all_datasets'
    self.person_research_ids = [(1, 6890173), (2, 858761),
                                (1234567, 4589763)]
Example #18
def wrapped(*args, **kwargs):
    appid = app_identity.get_application_id()
    # Only enforce HTTPS and auth for external requests; requests made for data generation
    # are allowed through (when enabled).
    if not _is_self_request():
        if request.scheme.lower() != 'https' and appid not in (
                'None', 'testbed-test', 'testapp'):
            raise Unauthorized('HTTPS is required for %r' % appid)
        check_auth(role_whitelist)
    return func(*args, **kwargs)
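The snippet above is the inner closure of an auth-enforcing decorator; a minimal sketch of the enclosing factory it implies (everything outside wrapped() is an assumption):

import functools

def auth_required(role_whitelist):
    """Hypothetical decorator factory wrapping the closure shown above."""

    def decorator(func):

        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            appid = app_identity.get_application_id()
            if not _is_self_request():
                if request.scheme.lower() != 'https' and appid not in (
                        'None', 'testbed-test', 'testapp'):
                    raise Unauthorized('HTTPS is required for %r' % appid)
                check_auth(role_whitelist)
            return func(*args, **kwargs)

        return wrapped

    return decorator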
Example #19
def get_job_details(job_id):
    """Get job resource corresponding to job_id
    :param job_id: id of the job to get (i.e. `jobReference.jobId` in response body of insert request)
    :returns: the job resource (for details see https://goo.gl/bUE49Z)
    """
    bq_service = create_service()
    app_id = app_identity.get_application_id()
    return bq_service.jobs().get(
        projectId=app_id,
        jobId=job_id).execute(num_retries=bq_consts.BQ_DEFAULT_RETRY_COUNT)
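A small polling sketch built on the returned job resource; the job id and sleep interval are illustrative:

import time

# Poll a job until BigQuery marks it DONE, then surface any error.
job_id = 'job_abc123'  # illustrative
job = get_job_details(job_id)
while job['status']['state'] != 'DONE':
    time.sleep(5)
    job = get_job_details(job_id)
if 'errorResult' in job['status']:
    print(job['status']['errorResult']['message'])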
Example #20
def setUp(self):
    self.hpo_bucket = gcs_utils.get_hpo_bucket(FAKE_HPO_ID)
    self.person_table_id = bq_utils.get_table_id(FAKE_HPO_ID, common.PERSON)
    self.dataset_id = bq_utils.get_dataset_id()
    test_util.delete_all_tables(self.dataset_id)
    self.project_id = app_identity.get_application_id()
    self.TEST_FIELDS = [
        {
            "type": "integer",
            "name": "integer_field",
            "mode": "required",
            "description": "An integer field"
        },
        # DC-586 Import RDR rules should support null fields
        {
            "type": "integer",
            "name": "nullable_integer_field",
            "mode": "nullable",
            "description": "A nullable integer field"
        },
        {
            "type": "string",
            "name": "string_field",
            "mode": "required",
            "description": "A string field"
        },
        {
            "type": "date",
            "name": "date_field",
            "mode": "required",
            "description": "A date field"
        },
        {
            "type": "timestamp",
            "name": "timestamp_field",
            "mode": "required",
            "description": "A timestamp field"
        },
        {
            "type": "boolean",
            "name": "boolean_field",
            "mode": "required",
            "description": "A boolean field"
        },
        {
            "type": "float",
            "name": "float_field",
            "mode": "required",
            "description": "A float field"
        }
    ]
    self.DT_FORMAT = '%Y-%m-%d %H:%M:%S'
    self.client = StorageClient(self.project_id)
    self.client.empty_bucket(self.hpo_bucket)
Example #21
def setUp(self):
    self.hpo_id = 'fake'
    self.project_id = 'fake-project-id'
    self.test_project_id = app_identity.get_application_id()
    self.pid_table_id = 'pid_table'
    self.bq_dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.client = bq.get_client(self.test_project_id)
    self.dataset_ids = 'all_datasets'
    self.retraction_type = 'only_ehr'
    self.person_research_ids = [(1, 6890173), (2, 858761),
                                (1234567, 4589763)]
Example #22
def list_dataset_contents(dataset_id):
    project_id = app_identity.get_application_id()
    service = create_service()
    req = service.tables().list(projectId=project_id, datasetId=dataset_id)
    all_tables = []
    while req:
        resp = req.execute()
        items = [item['id'].split('.')[-1] for item in resp.get('tables', [])]
        all_tables.extend(items or [])
        req = service.tables().list_next(req, resp)
    return all_tables
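Usage sketch (the dataset name is illustrative):

# Print every table id in a dataset.
for table_id in list_dataset_contents('unioned_ehr'):
    print(table_id)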
Example #23
def setUp(self):
    self.project_id = app_identity.get_application_id()
    # this ensures the dataset is scoped appropriately in test and also
    # can be dropped in teardown (tests should not delete env resources)
    unioned_dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.dataset_id = f'{unioned_dataset_id}_bq_test'
    self.description = f'Dataset for {__name__} integration tests'
    self.label_or_tag = {'test': 'bq'}
    self.client = bq.get_client(self.project_id)
    self.dataset_ref = bigquery.dataset.DatasetReference(
        self.project_id, self.dataset_id)
Example #24
def setUp(self):
    self.project_id = app_identity.get_application_id()
    if 'test' not in self.project_id:
        raise RuntimeError(
            f"Make sure the project_id is set to test. Project_id is {self.project_id}"
        )
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.deact_dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.client = bq.get_client(self.project_id)
    self.bq_sandbox_dataset_id = sb.get_sandbox_dataset_id(self.dataset_id)
    self.tables = {**TABLE_ROWS, **MAPPING_TABLE_ROWS, **EXT_TABLE_ROWS}
    self.setup_data()
Example #25
    def setUp(self):
        self.project_id = app_identity.get_application_id()
        self.client = StorageClient(self.project_id)
        self.bucket_name: str = os.environ.get('BUCKET_NAME_FAKE')
        self.prefix: str = 'prefix'
        self.data: bytes = b'bytes'

        # NOTE: this needs to be in sorted order
        self.sub_prefixes: tuple = (f'{self.prefix}/a', f'{self.prefix}/b',
                                    f'{self.prefix}/c', f'{self.prefix}/d')
        self.client.empty_bucket(self.bucket_name)
        self._stage_bucket()
Example #26
def bucket_access_configured(bucket_name: str) -> bool:
    """
    Determine if the service account has appropriate permissions on the bucket

    :param bucket_name: identifies the GCS bucket
    :return: True if the service account has appropriate permissions, False otherwise
    """
    project_id = app_identity.get_application_id()
    sc = StorageClient(project_id)
    bucket = sc.get_bucket(bucket_name)
    permissions: list = bucket.test_iam_permissions(["storage.objects.create"])
    return len(permissions) >= 1
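A hedged pre-flight check built on this helper; the bucket name is made up:

# Fail fast if the service account cannot write objects to the bucket.
if not bucket_access_configured('fake-hpo-bucket'):
    raise RuntimeError("cannot create objects in bucket 'fake-hpo-bucket'")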
Example #27
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.assertIn('test', self.project_id)
    self.hpo_id = 'fake'
    self.site_name = 'Fake Site Name'
    self.bucket = 'fake'
    self.folder = 'fake_folder'
    self.fake_uri_path = f"https://console.cloud.google.com/storage/{self.bucket}/{self.folder}"
    self.report_data = {
        'folder': self.folder,
        'timestamp': get_eastern_time(),
        'submission_error': False
    }
Example #28
def merge_tables(source_dataset_id, source_table_id_list,
                 destination_dataset_id, destination_table_id):
    """Takes a list of table names and runs a copy job

    :source_table_name_list: list of tables to merge
    :source_dataset_name: dataset where all the source tables reside
    :destination_table_name: data goes into this table
    :destination_dataset_name: dataset where the destination table resides
    :returns: True if successfull. Or False if error or taking too long.

    """
    app_id = app_identity.get_application_id()
    source_tables = [{
        "projectId": app_id,
        "datasetId": source_dataset_id,
        "tableId": table_name
    } for table_name in source_table_id_list]
    job_body = {
        'configuration': {
            "copy": {
                "sourceTables": source_tables,
                "destinationTable": {
                    "projectId": app_id,
                    "datasetId": destination_dataset_id,
                    "tableId": destination_table_id
                },
                "writeDisposition": "WRITE_TRUNCATE",
            }
        }
    }

    bq_service = create_service()
    insert_result = bq_service.jobs().insert(
        projectId=app_id,
        body=job_body).execute(num_retries=bq_consts.BQ_DEFAULT_RETRY_COUNT)
    job_id = insert_result[bq_consts.JOB_REFERENCE][bq_consts.JOB_ID]
    incomplete_jobs = wait_on_jobs([job_id])

    if len(incomplete_jobs) == 0:
        job_status = get_job_details(job_id)['status']
        if 'errorResult' in job_status:
            error_messages = [
                '{}'.format(item['message']) for item in job_status['errors']
            ]
            logging.info(' || '.join(error_messages))
            return False, ' || '.join(error_messages)
    else:
        logging.info(
            "Wait timeout exceeded before copy job with id '%s' was done",
            job_id)
        return False, "Job timeout"
    return True, ""
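Usage sketch with illustrative dataset and table names:

# Hypothetical: merge two per-site person tables into one unioned table.
success, error = merge_tables('ehr_dataset',
                              ['fake_a_person', 'fake_b_person'],
                              'unioned_dataset', 'unioned_person')
if not success:
    logging.error('merge failed: %s', error)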
Example #29
def process_hpo_copy(hpo_id):
    """copies over files from hpo bucket to drc bucket

    :hpo_id: hpo from which to copy
    """
    try:
        project_id = app_identity.get_application_id()
        storage_client = StorageClient(project_id)
        hpo_bucket = storage_client.get_hpo_bucket(hpo_id)
        drc_private_bucket = storage_client.get_drc_bucket()
        source_bucket = storage_client.bucket(hpo_bucket)
        destination_bucket = storage_client.bucket(drc_private_bucket)
        bucket_items = list_bucket(hpo_bucket)

        ignored_items = 0
        filtered_bucket_items = []
        for item in bucket_items:
            item_root = item['name'].split('/')[0] + '/'
            if item_root.lower() in common.IGNORE_DIRECTORIES:
                ignored_items += 1
            else:
                filtered_bucket_items.append(item)

        logging.info(f"Ignoring {ignored_items} items in {hpo_bucket}")

        prefix = f'{hpo_id}/{hpo_bucket}/'

        for item in filtered_bucket_items:
            item_name = item['name']
            source_blob = source_bucket.get_blob(item_name)
            destination_blob_name = f'{prefix}{item_name}'
            source_bucket.copy_blob(source_blob, destination_bucket,
                                    destination_blob_name)
    except BucketDoesNotExistError as bucket_error:
        bucket = bucket_error.bucket
        # App engine converts an env var set but left empty to be the string 'None'
        if bucket and bucket.lower() != 'none':
            logging.warning(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' does not exist"
            )
        else:
            logging.info(
                f"Bucket '{bucket}' configured for hpo_id '{hpo_id}' is empty/unset"
            )
    except HttpError as http_error:
        message = (
            f"Failed to copy files for hpo_id '{hpo_id}' due to the following "
            f"HTTP error: {http_error.content.decode()}")
        logging.exception(message)
Example #30
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.staging_dataset_id = f'{self.dataset_id}_staging'
    self.bucket = os.environ.get('BUCKET_NAME_FAKE')
    self.bq_client = bigquery.Client(project=self.project_id)
    self.gcs_client = storage.Client(project=self.project_id)
    self.test_vocab_folder_path = Path(TEST_VOCABULARY_PATH)
    self.test_vocabs = [CONCEPT, VOCABULARY]
    self.contents = {}
    for vocab in self.test_vocabs:
        vocab_path = self.test_vocab_folder_path / lv._table_name_to_filename(
            vocab)
        with vocab_path.open('r') as f:
            self.contents[vocab] = f.read()