    def test_execute_queries(self):
        project_id = bq_utils.app_identity.get_application_id()
        dataset_id = bq_utils.get_combined_dataset_id()
        test_util.delete_all_tables(dataset_id)

        create_tables = (
            ['person'] + common.CLINICAL_DATA_TABLES +
            ['_mapping_' + t for t in common.MAPPED_CLINICAL_DATA_TABLES])
        # TODO(calbach): Make the setup/teardown of these concept tables hermetic.
        for tbl in ['concept', 'concept_ancestor']:
            if not bq_utils.table_exists(tbl, dataset_id=dataset_id):
                create_tables.append(tbl)
        for tbl in create_tables:
            bq_utils.create_standard_table(tbl,
                                           tbl,
                                           dataset_id=dataset_id,
                                           force_all_nullable=True)

        for tmpl in INSERT_FAKE_PARTICIPANTS_TMPLS:
            resp = bq_utils.query(
                tmpl.render(project_id=project_id,
                            dataset_id=dataset_id,
                            rdr_basics_concept_id=123,
                            rdr_consent_concept_id=345,
                            ehr_obs_concept_id=567,
                            rdr_basics_module_concept_id=
                            drop_participants_without_ppi_or_ehr.
                            BASICS_MODULE_CONCEPT_ID))
            self.assertTrue(resp["jobComplete"])

        queries = drop_participants_without_ppi_or_ehr.get_queries(
            project_id, dataset_id)
        clean_cdr_engine.clean_dataset(project_id, queries)

        def table_to_person_ids(t):
            rows = bq_utils.response2rows(
                bq_utils.query("SELECT person_id FROM `{}.{}.{}`".format(
                    project_id, dataset_id, t)))
            return set([r["person_id"] for r in rows])

        # We expect participants 1, 5 to have been removed from all tables.
        self.assertEqual(set([2, 3, 4, 6]), table_to_person_ids("person"))
        self.assertEqual(set([2, 4, 6]), table_to_person_ids("observation"))
        self.assertEqual(set([3, 4]), table_to_person_ids("drug_exposure"))

        test_util.delete_all_tables(dataset_id)
Example #2
def validate_submission(hpo_id, bucket, bucket_items, folder_prefix):
    logging.info('Validating %s submission in gs://%s/%s', hpo_id, bucket,
                 folder_prefix)
    # separate cdm from the unknown (unexpected) files
    found_cdm_files = []
    unknown_files = []
    found_pii_files = []
    folder_items = [
        item['name'][len(folder_prefix):]
        for item in bucket_items
        if item['name'].startswith(folder_prefix)
    ]
    for item in folder_items:
        if _is_cdm_file(item):
            found_cdm_files.append(item)
        elif _is_pii_file(item):
            found_pii_files.append(item)
        else:
            if not (_is_known_file(item) or _is_string_excluded_file(item)):
                unknown_files.append(item)

    errors = []
    results = []

    # Create all tables first to simplify downstream processes
    # (e.g. ehr_union doesn't have to check if tables exist)
    for file_name in resources.CDM_FILES + common.PII_FILES:
        table_name = file_name.split('.')[0]
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        bq_utils.create_standard_table(table_name,
                                       table_id,
                                       drop_existing=True)

    for cdm_file_name in sorted(resources.CDM_FILES):
        file_results, file_errors = perform_validation_on_file(
            cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    for pii_file_name in sorted(common.PII_FILES):
        file_results, file_errors = perform_validation_on_file(
            pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    # (filename, message) for each unknown file
    warnings = [(unknown_file, common.UNKNOWN_FILE)
                for unknown_file in unknown_files]
    return dict(results=results, errors=errors, warnings=warnings)
Example #3
def validate_submission(hpo_id, bucket, folder_items, folder_prefix):
    """
    Load submission in BigQuery and summarize outcome

    :param hpo_id: identifier of the HPO whose submission is being validated
    :param bucket: bucket containing the submission
    :param folder_items: list of item names in the submission folder
    :param folder_prefix: prefix (folder) within the bucket containing the submission
    :return: a dict with keys results, errors, warnings
      results is list of tuples (file_name, found, parsed, loaded)
      errors and warnings are both lists of tuples (file_name, message)
    """
    logging.info(
        f"Validating {hpo_id} submission in gs://{bucket}/{folder_prefix}")
    # separate cdm from the unknown (unexpected) files
    found_cdm_files, found_pii_files, unknown_files = categorize_folder_items(
        folder_items)

    errors = []
    results = []

    # Create all tables first to simplify downstream processes
    # (e.g. ehr_union doesn't have to check if tables exist)
    for file_name in resources.CDM_FILES + common.PII_FILES:
        table_name = file_name.split('.')[0]
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        bq_utils.create_standard_table(table_name,
                                       table_id,
                                       drop_existing=True)

    for cdm_file_name in sorted(resources.CDM_FILES):
        file_results, file_errors = perform_validation_on_file(
            cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    for pii_file_name in sorted(common.PII_FILES):
        file_results, file_errors = perform_validation_on_file(
            pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
        results.extend(file_results)
        errors.extend(file_errors)

    # (filename, message) for each unknown file
    warnings = [(unknown_file, common.UNKNOWN_FILE)
                for unknown_file in unknown_files]
    return dict(results=results, errors=errors, warnings=warnings)
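Example #3 delegates the file triage to categorize_folder_items, which is not included in this listing. A minimal sketch of what it might look like, reusing the predicate helpers (_is_cdm_file, _is_pii_file, _is_known_file, _is_string_excluded_file) seen inline in Example #2; the real helper may differ.

def categorize_folder_items(folder_items):
    """Split folder items into CDM files, PII files, and unknown files (sketch).

    Hypothetical reconstruction based on the inline logic in Example #2;
    assumes the _is_* predicates from that example are importable.
    """
    found_cdm_files = []
    found_pii_files = []
    unknown_files = []
    for item in folder_items:
        if _is_cdm_file(item):
            found_cdm_files.append(item)
        elif _is_pii_file(item):
            found_pii_files.append(item)
        elif not (_is_known_file(item) or _is_string_excluded_file(item)):
            unknown_files.append(item)
    return found_cdm_files, found_pii_files, unknown_files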
Example #4
def get_domain_mapping_queries(project_id, dataset_id):
    """
    This function generates a list of query dicts for creating id mappings in _logging_domain_alignment.
    The list will be consumed by clean_engine.

    :param project_id: the project_id in which the query is run
    :param dataset_id: the dataset_id in which the query is run
    :return: a list of query dicts for creating id mappings in _logging_domain_alignment
    """
    # Create _logging_domain_alignment
    bq_utils.create_standard_table(DOMAIN_ALIGNMENT_TABLE_NAME,
                                   DOMAIN_ALIGNMENT_TABLE_NAME,
                                   drop_existing=True,
                                   dataset_id=dataset_id)

    queries = []

    for domain_table in domain_mapping.DOMAIN_TABLE_NAMES:
        query = dict()
        query[cdr_consts.QUERY] = parse_domain_mapping_query_cross_domain(
            project_id, dataset_id, domain_table)
        query[cdr_consts.DESTINATION_TABLE] = DOMAIN_ALIGNMENT_TABLE_NAME
        query[cdr_consts.DISPOSITION] = bq_consts.WRITE_APPEND
        query[cdr_consts.DESTINATION_DATASET] = dataset_id
        queries.append(query)

    # Create the query for creating field_mappings for the records moving between the same domain
    query = dict()
    query[cdr_consts.QUERY] = parse_domain_mapping_query_for_same_domains(
        project_id, dataset_id)
    query[cdr_consts.DESTINATION_TABLE] = DOMAIN_ALIGNMENT_TABLE_NAME
    query[cdr_consts.DISPOSITION] = bq_consts.WRITE_APPEND
    query[cdr_consts.DESTINATION_DATASET] = dataset_id
    queries.append(query)

    # Create the query for the records that are in the wrong domain but will not be moved
    query = dict()
    query[cdr_consts.QUERY] = parse_domain_mapping_query_for_excluded_records(
        project_id, dataset_id)
    query[cdr_consts.DESTINATION_TABLE] = DOMAIN_ALIGNMENT_TABLE_NAME
    query[cdr_consts.DISPOSITION] = bq_consts.WRITE_APPEND
    query[cdr_consts.DESTINATION_DATASET] = dataset_id
    queries.append(query)

    return queries
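The three blocks in get_domain_mapping_queries assemble identical destination settings around different SQL. A small helper along these lines could remove the repetition; this is only a sketch and assumes the same cdr_consts and bq_consts constants imported by the surrounding module.

def _domain_alignment_query(sql, dataset_id):
    """Wrap generated SQL in the query dict shape consumed by clean_engine (sketch).

    Hypothetical helper; the key names mirror the dicts built above.
    """
    return {
        cdr_consts.QUERY: sql,
        cdr_consts.DESTINATION_TABLE: DOMAIN_ALIGNMENT_TABLE_NAME,
        cdr_consts.DISPOSITION: bq_consts.WRITE_APPEND,
        cdr_consts.DESTINATION_DATASET: dataset_id,
    }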
Example #5
    def test_create_standard_table(self):
        standard_tables = list(resources.CDM_TABLES) + ACHILLES_TABLES
        for standard_table in standard_tables:
            table_id = f'prefix_for_test_{standard_table}'
            result = bq_utils.create_standard_table(standard_table, table_id)
            self.assertTrue('kind' in result)
            self.assertEqual(result['kind'], 'bigquery#table')
            # sanity check
            self.assertTrue(bq_utils.table_exists(table_id))
Example #6
    def _load_dataset(self, hpo_id):
        for cdm_table in resources.CDM_TABLES:
            cdm_file_name = os.path.join(test_util.FIVE_PERSONS_PATH,
                                         cdm_table + '.csv')
            if os.path.exists(cdm_file_name):
                test_util.write_cloud_file(self.hpo_bucket, cdm_file_name)
            else:
                test_util.write_cloud_str(self.hpo_bucket, cdm_table + '.csv',
                                          'dummy\n')
            bq_utils.load_cdm_csv(hpo_id, cdm_table)

        # ensure concept table exists
        if not bq_utils.table_exists(common.CONCEPT):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
            q = """INSERT INTO {dataset}.concept
            SELECT * FROM {vocab}.concept""".format(
                dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
            bq_utils.query(q)
Example #7
def create_empty_cdm_tables(snapshot_dataset_id, dataset_id):
    """
    Create empty CDM tables in the snapshot dataset, mirroring the schemas of the current dataset
    :param snapshot_dataset_id: dataset in which the empty tables are created
    :param dataset_id: dataset whose person table is checked for the at-birth column
    :return:
    """
    for table in resources.CDM_TABLES:
        if table == PERSON and has_at_birth_column(dataset_id):
            table_id = table
            table_name = 'post_deid_person'
        else:
            table_id = table
            table_name = table
        create_standard_table(table_name,
                              table_id,
                              drop_existing=True,
                              dataset_id=snapshot_dataset_id)
    cdm.create_vocabulary_tables(snapshot_dataset_id)
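has_at_birth_column is not part of this listing. One plausible sketch inspects the person table schema with the public BigQuery client; the column-name check and the default-project assumption are guesses, not the project's actual implementation.

from google.cloud import bigquery


def has_at_birth_column(dataset_id):
    """Return True if the person table exposes any *at_birth field (sketch)."""
    client = bigquery.Client()  # assumes default project credentials
    table = client.get_table(f'{dataset_id}.person')
    return any('at_birth' in field.name for field in table.schema)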
Example #8
    def _load_dataset(self, hpo_id):
        for cdm_table in resources.CDM_TABLES:

            cdm_filename: str = f'{cdm_table}.csv'
            cdm_filepath: str = os.path.join(test_util.FIVE_PERSONS_PATH,
                                             cdm_filename)

            bucket = self.storage_client.get_bucket(self.hpo_bucket)
            cdm_blob = bucket.blob(cdm_filename)
            if os.path.exists(cdm_filepath):
                cdm_blob.upload_from_filename(cdm_filepath)
            else:
                cdm_blob.upload_from_string('dummy\n')

            bq_utils.load_cdm_csv(hpo_id, cdm_table)

        # ensure concept table exists
        if not bq_utils.table_exists(common.CONCEPT):
            bq_utils.create_standard_table(common.CONCEPT, common.CONCEPT)
            q = """INSERT INTO {dataset}.concept
            SELECT * FROM {vocab}.concept""".format(
                dataset=self.dataset, vocab=common.VOCABULARY_DATASET)
            bq_utils.query(q)
Example #9
    def _create_drug_class_table(bigquery_dataset_id):

        table_name = 'drug_class'
        fields = [{
            "type": "integer",
            "name": "concept_id",
            "mode": "required"
        }, {
            "type": "string",
            "name": "concept_name",
            "mode": "required"
        }, {
            "type": "string",
            "name": "drug_class_name",
            "mode": "required"
        }]
        bq_utils.create_table(table_id=table_name,
                              fields=fields,
                              drop_existing=True,
                              dataset_id=bigquery_dataset_id)

        bq_utils.query(q=main_consts.DRUG_CLASS_QUERY.format(
            dataset_id=bigquery_dataset_id),
                       use_legacy_sql=False,
                       destination_table_id='drug_class',
                       retry_count=bq_consts.BQ_DEFAULT_RETRY_COUNT,
                       write_disposition='WRITE_TRUNCATE',
                       destination_dataset_id=bigquery_dataset_id)

        # ensure concept ancestor table exists
        if not bq_utils.table_exists(common.CONCEPT_ANCESTOR):
            bq_utils.create_standard_table(common.CONCEPT_ANCESTOR,
                                           common.CONCEPT_ANCESTOR)
            q = """INSERT INTO {dataset}.concept_ancestor
            SELECT * FROM {vocab}.concept_ancestor""".format(
                dataset=bigquery_dataset_id, vocab=common.VOCABULARY_DATASET)
            bq_utils.query(q)
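The fields list above is the plain-dict schema format that bq_utils.create_table accepts. For comparison, the same schema expressed with google.cloud.bigquery SchemaField objects; this is illustrative only and not what bq_utils does internally.

from google.cloud import bigquery

# Equivalent schema for the drug_class table using SchemaField objects.
drug_class_schema = [
    bigquery.SchemaField('concept_id', 'INTEGER', mode='REQUIRED'),
    bigquery.SchemaField('concept_name', 'STRING', mode='REQUIRED'),
    bigquery.SchemaField('drug_class_name', 'STRING', mode='REQUIRED'),
]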
Example #10
def main(input_dataset_id, output_dataset_id, project_id, hpo_ids=None):
    """
    Create a new CDM which is the union of all EHR datasets submitted by HPOs

    :param input_dataset_id: identifies a dataset containing multiple CDMs, one for each HPO submission
    :param output_dataset_id: identifies the dataset to store the new CDM in
    :param project_id: project containing the datasets
    :param hpo_ids: (optional) identifies HPOs to process, by default process all
    :returns: list of tables generated successfully
    """
    logging.info('EHR union started')
    if hpo_ids is None:
        hpo_ids = [item['hpo_id'] for item in resources.hpo_csv()]

    # Create empty output tables to ensure proper schema, clustering, etc.
    for table in common.CDM_TABLES:
        result_table = output_table_for(table)
        logging.info('Creating {dataset_id}.{table_id}...'.format(
            dataset_id=output_dataset_id, table_id=result_table))
        bq_utils.create_standard_table(table,
                                       result_table,
                                       drop_existing=True,
                                       dataset_id=output_dataset_id)

    # Create mapping tables
    for domain_table in tables_to_map():
        logging.info(
            'Mapping {domain_table}...'.format(domain_table=domain_table))
        mapping(domain_table, hpo_ids, input_dataset_id, output_dataset_id,
                project_id)

    # Load all tables with union of submitted tables
    for table_name in common.CDM_TABLES:
        logging.info(
            'Creating union of table {table}...'.format(table=table_name))
        load(table_name, hpo_ids, input_dataset_id, output_dataset_id)
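output_table_for and the mapping/load helpers are not shown in this listing. A minimal sketch of output_table_for under the assumption that unioned tables simply carry a fixed prefix; the actual naming convention may differ.

UNIONED_EHR_PREFIX = 'unioned_ehr_'  # assumed prefix, not confirmed by this listing


def output_table_for(table_id):
    """Name of the output table in the union dataset for a given CDM table (sketch)."""
    return UNIONED_EHR_PREFIX + table_id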
Example #11
    def submit(self, sql, create, dml=None):
        """
        Submit the sql query to create a de-identified table.

        :param sql:  The sql to send.
        :param create: a flag to identify if this query should create a new
            table or append to an existing table.
        :param dml:  boolean flag identifying if a statement is a dml statement
        """
        dml = False if dml is None else dml
        table_name = self.get_tablename()
        client = bq.Client.from_service_account_json(self.private_key)
        #
        # Let's make sure the out dataset exists
        datasets = list(client.list_datasets())
        found = np.sum(
            [1 for dataset in datasets if dataset.dataset_id == self.odataset])
        if not found:
            dataset = bq.Dataset(client.dataset(self.odataset))
            client.create_dataset(dataset)

        # create the output table
        if create:
            LOGGER.info(f"creating new table:\t{self.tablename}")
            bq_utils.create_standard_table(self.tablename,
                                           self.tablename,
                                           drop_existing=True,
                                           dataset_id=self.odataset)
            write_disposition = bq_consts.WRITE_EMPTY
        else:
            write_disposition = bq_consts.WRITE_APPEND
            LOGGER.info(f"appending results to table:\t{self.tablename}")

        job = bq.QueryJobConfig()
        job.priority = self.priority
        job.dry_run = True

        dml_job = None
        if not dml:
            job.destination = client.dataset(self.odataset).table(
                self.tablename)
            job.use_query_cache = True
            job.allow_large_results = True
            job.write_disposition = write_disposition
            if self.partition:
                job._properties['timePartitioning'] = {'type': 'DAY'}
                job._properties['clustering'] = {'field': 'person_id'}
        else:
            # create a copy of the job config to use if the dry-run passes
            dml_job = copy(job)

        LOGGER.info(
            f"submitting a dry-run for:\t{self.get_tablename()}\t\t"
            f"priority:\t{self.priority}\t\tpartition:\t{self.partition}")

        logpath = os.path.join(self.logpath, self.idataset)
        try:
            os.makedirs(logpath)
        except OSError:
            # log path already exists and we don't care
            pass

        try:
            response = client.query(sql, location='US', job_config=job)
        except Exception:
            LOGGER.exception(
                f"dry run query failed for:\t{self.get_tablename()}\n"
                f"\t\tSQL:\t{sql}\n"
                f"\t\tjob config:\t{job}")
        else:

            if response.state == 'DONE':
                if dml_job:
                    job = dml_job

                job.dry_run = False

                LOGGER.info('dry-run passed.  submitting query for execution.')

                response = client.query(sql, location='US', job_config=job)
                LOGGER.info(
                    f"submitted a bigquery job for table:\t{table_name}\t\t"
                    f"status:\t'pending'\t\tvalue:\t{response.job_id}")
                self.wait(client, response.job_id)
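The submit method relies on a dry-run-first pattern: the query is validated with dry_run=True and only re-submitted for real execution if the dry run succeeds. Stripped of the class context, the same pattern looks roughly like this; a sketch using the public google.cloud.bigquery API, not the project's wrapper.

from google.cloud import bigquery


def run_after_dry_run(client, sql):
    """Validate a query with a dry run, then execute it (sketch)."""
    dry_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    dry_job = client.query(sql, job_config=dry_config)  # returns immediately, nothing is billed
    print(f'dry run ok, would process {dry_job.total_bytes_processed} bytes')

    real_job = client.query(sql, job_config=bigquery.QueryJobConfig())
    return real_job.result()  # blocks until the query finishes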
Example #12
def run_validation(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it has already been processed before
    :raises
    BucketDoesNotExistError:
      Raised when a configured bucket does not exist
    InternalValidationError:
      Raised when an internal error is encountered during validation
    """
    logging.info('Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        found_pii_files = []
        folder_items = [
            item['name'].split('/')[1] for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            elif _is_pii_file(item):
                found_pii_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for file_name in common.CDM_FILES + common.PII_FILES:
            table_name = file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, table_name)
            bq_utils.create_standard_table(table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            file_results, file_errors = perform_validation_on_file(
                cdm_file_name, found_cdm_files, hpo_id, folder_prefix, bucket)
            results.extend(file_results)
            errors.extend(file_errors)

        for pii_file_name in common.PII_FILES:
            file_results, file_errors = perform_validation_on_file(
                pii_file_name, found_pii_files, hpo_id, folder_prefix, bucket)
            results.extend(file_results)
            errors.extend(file_errors)

        # (filename, message) for each unknown file
        warnings = [(unknown_file, UNKNOWN_FILE)
                    for unknown_file in unknown_files]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_errors_warnings_in_gcs(bucket, folder_prefix + ERRORS_CSV,
                                     errors, warnings)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)

        logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                     (bucket, folder_prefix))
        _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
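The results rows written by _save_result_in_gcs have the shape (file_name, found, parsed, loaded) described in the Example #3 docstring. Serializing them to CSV before upload could look like this; the real GCS helper is not shown, so this sketch only covers the CSV step.

import csv
import io


def results_to_csv(results):
    """Render (file_name, found, parsed, loaded) rows as a CSV string (sketch)."""
    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(['file_name', 'found', 'parsed', 'loaded'])
    writer.writerows(results)
    return buffer.getvalue()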
Example #13
def run_validation(hpo_id, force_run=False):
    """
    runs validation for a single hpo_id

    :param hpo_id: which hpo_id to run for
    :param force_run: if True, process the latest submission whether or not it has already been processed before
    :raises
    BucketDoesNotExistError:
      Raised when a configured bucket does not exist
    InternalValidationError:
      Raised when an internal error is encountered during validation
    """
    logging.info('Validating hpo_id %s' % hpo_id)
    bucket = gcs_utils.get_hpo_bucket(hpo_id)
    bucket_items = list_bucket(bucket)
    to_process_folder_list = _get_to_process_list(bucket, bucket_items,
                                                  force_run)

    for folder_prefix in to_process_folder_list:
        logging.info('Processing gs://%s/%s' % (bucket, folder_prefix))
        # separate cdm from the unknown (unexpected) files
        found_cdm_files = []
        unknown_files = []
        folder_items = [
            item['name'].split('/')[1] for item in bucket_items
            if item['name'].startswith(folder_prefix)
        ]
        for item in folder_items:
            if _is_cdm_file(item):
                found_cdm_files.append(item)
            else:
                is_known_file = item in common.IGNORE_LIST or is_pii(item)
                if not is_known_file:
                    unknown_files.append(item)

        errors = []
        results = []
        found_cdm_file_names = found_cdm_files

        # Create all tables first to simplify downstream processes
        # (e.g. ehr_union doesn't have to check if tables exist)
        for cdm_file_name in common.CDM_FILES:
            cdm_table_name = cdm_file_name.split('.')[0]
            table_id = bq_utils.get_table_id(hpo_id, cdm_table_name)
            bq_utils.create_standard_table(cdm_table_name,
                                           table_id,
                                           drop_existing=True)

        for cdm_file_name in common.CDM_FILES:
            logging.info('Validating file `{file_name}`'.format(
                file_name=cdm_file_name))
            found = parsed = loaded = 0
            cdm_table_name = cdm_file_name.split('.')[0]

            if cdm_file_name in found_cdm_file_names:
                found = 1
                load_results = bq_utils.load_cdm_csv(hpo_id, cdm_table_name,
                                                     folder_prefix)
                load_job_id = load_results['jobReference']['jobId']
                incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])

                if len(incomplete_jobs) == 0:
                    job_resource = bq_utils.get_job_details(job_id=load_job_id)
                    job_status = job_resource['status']
                    if 'errorResult' in job_status:
                        # These are issues (which we report back) as opposed to internal errors
                        issues = [
                            item['message'] for item in job_status['errors']
                        ]
                        errors.append((cdm_file_name, ' || '.join(issues)))
                        logging.info(
                            'Issues found in gs://{bucket}/{folder_prefix}/{cdm_file_name}'
                            .format(bucket=bucket,
                                    folder_prefix=folder_prefix,
                                    cdm_file_name=cdm_file_name))
                        for issue in issues:
                            logging.info(issue)
                    else:
                        # Processed ok
                        parsed = loaded = 1
                else:
                    # Incomplete jobs are internal unrecoverable errors.
                    # Aborting the process allows for this submission to be validated when system recovers.
                    message_fmt = 'Loading hpo_id `%s` table `%s` failed because job id `%s` did not complete.'
                    message = message_fmt % (hpo_id, cdm_table_name,
                                             load_job_id)
                    message += ' Aborting processing `gs://%s/%s`.' % (
                        bucket, folder_prefix)
                    logging.error(message)
                    raise InternalValidationError(message)

            if cdm_file_name in common.REQUIRED_FILES or found:
                results.append((cdm_file_name, found, parsed, loaded))

        # (filename, message) for each unknown file
        warnings = [(unknown_file, UNKNOWN_FILE)
                    for unknown_file in unknown_files]

        # output to GCS
        _save_result_in_gcs(bucket, folder_prefix + RESULT_CSV, results)
        _save_warnings_in_gcs(bucket, folder_prefix + WARNINGS_CSV, warnings)
        _save_errors_in_gcs(bucket, folder_prefix + ERRORS_CSV, errors)

        if all_required_files_loaded(hpo_id, folder_prefix=folder_prefix):
            run_achilles(hpo_id)
            run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)

        logging.info('Uploading achilles index files to `gs://%s/%s`.' %
                     (bucket, folder_prefix))
        _upload_achilles_files(hpo_id, folder_prefix)

        now_datetime_string = datetime.datetime.now().strftime(
            '%Y-%m-%dT%H:%M:%S')
        logging.info(
            'Processing complete. Saving timestamp %s to `gs://%s/%s`.' %
            (now_datetime_string, bucket,
             folder_prefix + common.PROCESSED_TXT))
        _write_string_to_file(bucket, folder_prefix + common.PROCESSED_TXT,
                              now_datetime_string)
Example #14
    def submit(self, sql):
        """
        """
        table_name = self.get_tablename()
        client = bq.Client.from_service_account_json(self.private_key)
        #
        # Let's make sure the out dataset exists
        datasets = list(client.list_datasets())
        found = np.sum(
            [1 for dataset in datasets if dataset.dataset_id == self.odataset])
        if not found:
            dataset = bq.Dataset(client.dataset(self.odataset))
            client.create_dataset(dataset)

        # create the output table
        bq_utils.create_standard_table(self.tablename, self.tablename, drop_existing=True, dataset_id=self.odataset)

        job = bq.QueryJobConfig()
        job.destination = client.dataset(self.odataset).table(self.tablename)
        job.use_query_cache = True
        job.allow_large_results = True
        if self.partition:
            job._properties['timePartitioning'] = {'type': 'DAY'}
            job._properties['clustering'] = {'field': 'person_id'}

        job.priority = self.priority
        job.dry_run = True
        self.log(module='submit-job',
                 subject=self.get_tablename(),
                 action='dry-run',
                 value={'priority': self.priority, 'partition': self.partition})

        logpath = os.path.join(self.logpath, self.idataset)
        try:
            os.makedirs(logpath)
        except OSError:
            # log path already exists and we don't care
            pass

        r = client.query(sql, location='US', job_config=job)
        if r.errors is None and r.state == 'DONE':
            job.dry_run = False

            r = client.query(sql, location='US', job_config=job)
            self.log(module='submit',
                     subject=self.get_tablename(),
                     action='submit-job',
                     table=table_name,
                     status='pending',
                     value=r.job_id,
                     object='bigquery')
            self.wait(client, r.job_id)
#            self.finalize(client)
            #
            # At this point we must try to partition the table
        else:
            self.log(module='submit',
                     subject=self.get_tablename(),
                     action='submit-job',
                     table=table_name,
                     status='error',
                     value=r.errors)
            print(r.errors)