Example #1
def do_query(project_id, project_name, dataset_name, table_name, gene_label,
             value_field, cohort_dataset, cohort_table, cohort_id_array):
    bigquery_service = get_bigquery_service()

    query = build_query(project_name, dataset_name, table_name, gene_label,
                        value_field, cohort_dataset, cohort_table,
                        cohort_id_array)
    query_body = {'query': query}

    table_data = bigquery_service.jobs()
    query_response = table_data.query(projectId=project_id,
                                      body=query_body).execute()

    result = []
    num_result_rows = int(query_response['totalRows'])
    if num_result_rows == 0:
        return result

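    # Each BigQuery result row yields two output records: fields 1-2 and
    # fields 3-4 hold two different sample/aliquot ID pairs, while field 0
    # (case_id) and field 5 (value) are shared by both records.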
    for row in query_response['rows']:
        result.append({
            'case_id': row['f'][0]['v'],
            'sample_id': row['f'][1]['v'],
            'aliquot_id': row['f'][2]['v'],
            'value': row['f'][5]['v'],
        })
        result.append({
            'case_id': row['f'][0]['v'],
            'sample_id': row['f'][3]['v'],
            'aliquot_id': row['f'][4]['v'],
            'value': row['f'][5]['v'],
        })

    return result
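
All of these examples rely on a get_bigquery_service() helper that is not shown on this page. A minimal sketch of such a helper, assuming the google-api-python-client discovery client and application-default credentials (the real project may construct the service differently, e.g. from a service-account key file), could look like this:

from google.auth import default as google_auth_default
from googleapiclient.discovery import build


def get_bigquery_service():
    # Hypothetical helper: obtain application-default credentials scoped for
    # BigQuery and build a v2 discovery client for the REST API used above.
    credentials, _ = google_auth_default(
        scopes=['https://www.googleapis.com/auth/bigquery'])
    return build('bigquery', 'v2', credentials=credentials,
                 cache_discovery=False)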
Example #2
    def _streaming_insert(self, rows):
        bigquery_service = get_bigquery_service()
        table_data = bigquery_service.tabledata()

        index = 0
        next_index = 0
        response = None

        logger.info("[STATUS] Beginning row stream...")
        # Insert the rows in batches of at most MAX_INSERT entries per request.
        while index < len(rows) and next_index is not None:
            next_index = MAX_INSERT + index
            if next_index > len(rows):
                next_index = None
                body = self._build_request_body_from_rows(rows[index:])
            else:
                body = self._build_request_body_from_rows(rows[index:next_index])

            response = table_data.insertAll(projectId=self.project_id,
                                            datasetId=self.dataset_id,
                                            tableId=self.table_id,
                                            body=body).execute()
            index = next_index
        logger.info("[STATUS] ...done.")

        return response
Example #3
def submit_jobs_with_user_data(params_array):
    bigquery_service = get_bigquery_service()
    provider_array = []

    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # Submit jobs
    for parameter_object in params_array:
        feature_id = parameter_object.feature_id
        cohort_id_array = parameter_object.cohort_id_array

        user_data = user_feature_handler(feature_id, cohort_id_array)

        if user_data['include_tcga']:
            job_item = submit_tcga_job(parameter_object, bigquery_service,
                                       cohort_settings)
            provider_array.append(job_item)

        if len(user_data['user_studies']) > 0:
            converted_feature_id = user_data['converted_feature_id']
            user_feature_id = user_data['user_feature_id']
            logging.debug("user_feature_id: {0}".format(user_feature_id))
            provider = UserDataQueryHandler(converted_feature_id,
                                            user_feature_id=user_feature_id)

            # The UserDataQueryHandler instance might not generate a BigQuery query and job at all given the combination
            # of cohort(s) and feature identifiers. The provider is not added to the array, and therefore to the
            # polling loop below, if it would not submit a BigQuery job.
            if provider.is_queryable(cohort_id_array):
                job_reference = provider.get_data_job_reference(
                    cohort_id_array, cohort_settings.dataset_id,
                    cohort_settings.table_id)

                logging.info(
                    "Submitted USER {job_id}: {fid} - {cohorts}".format(
                        job_id=job_reference['jobId'],
                        fid=feature_id,
                        cohorts=str(cohort_id_array)))
                provider_array.append({
                    'feature_id': feature_id,
                    'provider': provider,
                    'ready': False,
                    'job_reference': job_reference['job_reference'],
                    'tables_used': job_reference['tables_queried']
                })
            else:
                logging.debug("No UserFeatureDefs for '{0}'".format(
                    converted_feature_id))

    return provider_array
Example #4
    def __init__(self, project_id, dataset_id, table_id, executing_project=None, table_schema=None):
        # Project which will execute any jobs run by this class
        self.executing_project = executing_project or settings.BIGQUERY_PROJECT_ID
        # Destination project
        self.project_id = project_id
        # Destination dataset
        self.dataset_id = dataset_id
        # Destination table
        self.table_id = table_id
        self.bq_service = get_bigquery_service()
        self.table_schema = table_schema
Example #5
def get_feature_vectors_tcga_only(params_array, poll_retry_limit=20, skip_formatting_for_plot=False):
    bigquery_service = get_bigquery_service()
    provider_array = []

    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # Submit jobs
    for parameter_object in params_array:
        job_item = submit_tcga_job(parameter_object, bigquery_service, cohort_settings)
        provider_array.append(job_item)

    project_id = settings.BQ_PROJECT_ID
    result = get_submitted_job_results(provider_array, project_id, poll_retry_limit, skip_formatting_for_plot)

    return result
Example #6
def run_with_schema_object(project_id,
                           dataset_id,
                           table_name,
                           schema,
                           data_path,
                           source_format='NEWLINE_DELIMITED_JSON',
                           write_disposition='WRITE_EMPTY',
                           num_retries=5):

    bigquery_service = get_bigquery_service()

    job = load_table(bigquery_service, project_id, dataset_id, table_name,
                     schema, data_path, source_format, num_retries,
                     write_disposition)

    poll_job(bigquery_service, job)
Example #7
def run(project_id,
        dataset_id,
        table_name,
        schema_file,
        data_path,
        source_format='NEWLINE_DELIMITED_JSON',
        write_disposition='WRITE_EMPTY',
        num_retries=5,
        poll_interval=1):

    bigquery_service = get_bigquery_service()

    with open(schema_file, 'r') as f:
        schema = json.load(f)

    job = load_table(bigquery_service, project_id, dataset_id, table_name,
                     schema, data_path, source_format, num_retries,
                     write_disposition)

    poll_job(bigquery_service, job)
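
Examples #6 and #7 also delegate waiting on the load job to a poll_job() helper that is not shown here. A minimal sketch, assuming poll_job receives the service object and the job resource returned by load_table() and simply blocks until the job reports DONE, might be:

import time


def poll_job(bigquery_service, job, poll_interval=1):
    # Hypothetical helper: poll jobs().get() until the job finishes,
    # raising if BigQuery reports an error result.
    request = bigquery_service.jobs().get(
        projectId=job['jobReference']['projectId'],
        jobId=job['jobReference']['jobId'])
    while True:
        result = request.execute(num_retries=2)
        if result['status']['state'] == 'DONE':
            if 'errorResult' in result['status']:
                raise RuntimeError(result['status']['errorResult'])
            return
        time.sleep(poll_interval)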
Example #8
    def do_query(self, project_id, project_name, uniprot_id):
        bigquery_service = get_bigquery_service()

        query = self.build_query(project_name, uniprot_id)
        query_body = {'query': query}

        table_data = bigquery_service.jobs()
        query_response = table_data.query(projectId=project_id,
                                          body=query_body).execute()

        num_result_rows = int(query_response['totalRows'])
        if num_result_rows == 0:
            return None

        row = query_response['rows'][0]
        interpro_literal = row['f'][1]['v']
        interpro_literal = interpro_literal.replace('\'', '"')
        interpro_literal = json_loads(interpro_literal)

        return interpro_literal
Example #9
    def get_bq_service(self):
        if self.bigquery_service is None:
            self.bigquery_service = get_bigquery_service()

        return self.bigquery_service
Example #10
    def _table_to_gcs(self,
                      file_format,
                      dataset_and_table,
                      export_type,
                      table_job_id=None):

        bq_service = get_bigquery_service()

        result = {'status': None, 'message': None}

        # presence of a table_job_id means the export query was still running when this
        # method was called; give it another round of checks
        if table_job_id:
            job_is_done = bq_service.jobs().get(
                projectId=settings.BIGQUERY_PROJECT_ID,
                jobId=table_job_id).execute()
            retries = 0
            while (job_is_done and job_is_done['status']['state'] != 'DONE'
                   and retries < BQ_ATTEMPT_MAX):
                retries += 1
                sleep(1)
                job_is_done = bq_service.jobs().get(
                    projectId=settings.BIGQUERY_PROJECT_ID,
                    jobId=table_job_id).execute()

            if job_is_done and job_is_done['status']['state'] != 'DONE':
                logger.debug(str(job_is_done))
                msg = "Export of {} to gs://{}/{} did not complete in the time allowed".format(
                    export_type, self.bucket_path, self.file_name)
                logger.error("[ERROR] {}.".format(msg))
                result['status'] = 'error'
                result['message'] = msg + "--please contact the administrator."
                return result
            else:
                dataset_and_table = {
                    'dataset_id': job_is_done['configuration']['query']['destinationTable']['datasetId'],
                    'table_id': job_is_done['configuration']['query']['destinationTable']['tableId']
                }

        job_id = str(uuid4())

        export_config = {
            'jobReference': {
                'projectId': self.project_id,
                'jobId': job_id
            },
            'configuration': {
                'extract': {
                    'sourceTable': {
                        'projectId': self.project_id,
                        'datasetId': dataset_and_table['dataset_id'],
                        'tableId': dataset_and_table['table_id']
                    },
                    'destinationUris': ['gs://{}/{}'.format(self.bucket_path, self.file_name)],
                    'destinationFormat': file_format,
                    'compression': 'GZIP'
                }
            }
        }

        export_job = bq_service.jobs().insert(
            projectId=settings.BIGQUERY_PROJECT_ID,
            body=export_config).execute(num_retries=5)

        job_is_done = bq_service.jobs().get(
            projectId=settings.BIGQUERY_PROJECT_ID, jobId=job_id).execute()

        retries = 0

        while (job_is_done and job_is_done['status']['state'] != 'DONE'
               and retries < BQ_ATTEMPT_MAX):
            retries += 1
            sleep(1)
            job_is_done = bq_service.jobs().get(
                projectId=settings.BIGQUERY_PROJECT_ID,
                jobId=job_id).execute()

        logger.debug("[STATUS] extraction job_is_done: {}".format(
            str(job_is_done)))

        if job_is_done and job_is_done['status']['state'] == 'DONE':
            if 'status' in job_is_done and 'errors' in job_is_done['status']:
                msg = "Export of {} to GCS bucket {} was unsuccessful, reason: {}".format(
                    export_type, self.bucket,
                    job_is_done['status']['errors'][0]['message'])
                logger.error("[ERROR] {}".format(msg))
                result['status'] = 'error'
                result['message'] = "Unable to export {} to bucket {}--please contact the administrator.".format(
                    export_type, self.bucket)
            else:
                # Check the file
                exported_file = get_storage_resource().objects().get(
                    bucket=self.bucket_path, object=self.file_name).execute()
                if not exported_file:
                    msg = "Export file {}/{} not found".format(
                        self.bucket_path, self.file_name)
                    logger.error("[ERROR] ".format({msg}))
                    export_result = bq_service.jobs().get(
                        projectId=settings.BIGQUERY_PROJECT_ID,
                        jobId=job_id).execute()
                    if 'errors' in export_result:
                        logger.error('[ERROR] Errors seen: {}'.format(
                            export_result['errors'][0]['message']))
                    result['status'] = 'error'
                    result['message'] = "Unable to export {} to file {}/{}--please contact the administrator.".format(
                        export_type, self.bucket_path, self.file_name)
                else:
                    if int(exported_file['size']) > 0:
                        logger.info(
                            "[STATUS] Successfully exported {} into GCS file gs://{}/{}".format(
                                export_type, self.bucket_path, self.file_name))
                        result['status'] = 'success'
                        result['message'] = "{}MB".format(
                            str(round(float(exported_file['size']) / 1000000, 2)))
                    else:
                        msg = "File gs://{}/{} created, but appears empty. Export of {} may not have succeeded".format(
                            self.bucket_path, self.file_name, export_type)
                        logger.warning("[WARNING] {}.".format(msg))
                        result['status'] = 'error'
                        result['message'] = msg + "--please contact the administrator."
        else:
            logger.debug(str(job_is_done))
            msg = "Export of {} to gs://{}/{} did not complete in the time allowed".format(
                export_type, self.bucket_path, self.file_name)
            logger.error("[ERROR] {}.".format(msg))
            result['status'] = 'error'
            result['message'] = msg + "--please contact the administrator."

        return result