Beispiel #1
0
def run_query(gcp_project_id, x, y, cohort_id_array, program_array, project_id_array, cohort_table_id, log_transform):
    """
    Fetch merged feature vectors for the x/y features over the given cohorts.

    Validates the program, cohort and project inputs, builds a BigQuery-backed
    feature vector builder, and returns the merged feature vectors.

    Args:
        gcp_project_id: GCP project ID to bill/run queries under.
        x, y: feature identifiers for the two plot axes.
        cohort_id_array: list of cohort IDs to query (must be non-empty).
        program_array: program names to resolve into a BigQuery program set.
        project_id_array: project IDs the cohorts' samples come from (must be non-empty).
        cohort_table_id: full table ID of the cohort storage table.
        log_transform: log-transform settings passed through to the vector merge.

    Returns:
        The merged feature vector structure from get_merged_feature_vectors.

    Note: exits the process (sys_exit) when any of the input arrays is empty.
    """
    from bq_data_access.v2.data_access import FeatureVectorBigQueryBuilder
    from bq_data_access.bigquery_cohorts import BigQueryCohortStorageSettings
    from bq_data_access.v2.plot_data_support import get_merged_feature_vectors
    from google_helpers.bigquery.service_v2 import BigQueryServiceSupport

    # Verify the program set
    # ----------------------
    program_set = get_bq_program_set(program_array)
    logger.info("Selected programs: {}".format(program_set))
    if len(program_set) == 0:
        logger.info("No programs set. Please include at least one program.")
        sys_exit(0)

    # Verify the cohort ID array
    # --------------------------
    if len(cohort_id_array) == 0:
        logger.info("No cohort IDs set. Please include at least one cohort ID.")
        sys_exit(0)

    # Verify the project ID array
    # ---------------------------
    if len(project_id_array) == 0:
        logger.info("No project IDs set. Please include at least one project ID.")
        sys_exit(0)

    cohort_settings = BigQueryCohortStorageSettings.build_from_full_table_id(cohort_table_id)
    bqss = BigQueryServiceSupport.build_from_application_default()
    fvb = FeatureVectorBigQueryBuilder(gcp_project_id, cohort_settings, bqss)

    data = get_merged_feature_vectors(fvb, x, y, None, cohort_id_array, log_transform, project_id_array, program_set=program_set)
    # Bug fix: the merged vectors were previously computed and then silently
    # discarded; return them so callers can actually use the result.
    return data
Beispiel #2
0
def oncoprint_view_data(request):
    """
    Build the data payload for an OncoPrint visualization.

    Combines somatic-mutation rows (queried from a BigQuery MAF table) with
    copy-number (CNVR) feature vectors for the requested genes and cohorts,
    returning tab-separated rows in a JsonResponse.

    Expected GET parameters:
        gene_list     -- comma-separated gene symbols (required)
        genomic_build -- genomic build name, validated by is_valid_genomic_build
        cohort_id     -- one or more integer cohort IDs (required)

    Returns:
        JsonResponse with 'plot_data', 'gene_list', 'bq_tables' and
        'plot_message' on success; a 'message' when no data matches; an
        'error'/'Error' payload with a 4xx/500 status on failure.
    """
    try:
        gene_list_str = request.GET.get('gene_list', None)
        # Bug fix: a missing gene_list previously caused .split() on None
        # (AttributeError -> 500); reject it as a client error instead.
        if not gene_list_str:
            return JsonResponse({'error': 'No genes specified'}, status=400)
        gene_array = gene_list_str.split(',')
        genomic_build = request.GET.get('genomic_build', None)
        cohort_id_param_array = request.GET.getlist('cohort_id', None)

        if not is_valid_genomic_build(genomic_build):
            return JsonResponse({'error': 'Invalid genomic build'}, status=400)

        # Cohort IDs must all be integers; reject the request on any bad value.
        cohort_id_array = []
        for cohort_id in cohort_id_param_array:
            try:
                cohort_id = int(cohort_id)
                cohort_id_array.append(cohort_id)
            except Exception as e:
                return JsonResponse({'error': 'Invalid cohort parameter'}, status=400)
        if len(cohort_id_array) == 0:
            return JsonResponse({'error': 'No cohorts specified'}, status=400)

        program_set = get_program_set_for_oncoprint(cohort_id_array)
        confirmed_project_ids, user_only_study_ids = get_confirmed_project_ids_for_cohorts(cohort_id_array)
        # Only samples in projects from a data type's valid programs should be queried
        projects_this_program_set = Project.objects.filter(id__in=confirmed_project_ids,program__in=program_set).values_list('id', flat=True)

        if not len(program_set):
            return JsonResponse(
                {'message': "The chosen cohorts do not contain samples from programs with Gene Mutation data."})

        query_template = """
                    #standardSQL
                    SELECT cs.case_barcode, sm.Hugo_Symbol, sm.Alteration, sm.Type
                    FROM (
                          SELECT case_barcode
                          FROM `{cohort_table}`
                          WHERE cohort_id IN ({cohort_id_list})
                          AND (project_id IS NULL{project_clause})
                          GROUP BY case_barcode
                    ) cs
                    LEFT JOIN (
                        SELECT
                          case_barcode, Hugo_Symbol,
                          CASE
                            WHEN Protein_position IS NOT NULL AND Protein_position NOT LIKE '-/%' THEN
                              CONCAT(
                                COALESCE(REGEXP_EXTRACT(Amino_acids,r'^([A-Za-z*\-]+)'),'-'),
                                COALESCE(REGEXP_EXTRACT(Protein_position,r'^([0-9]+)'), '-'),
                                CASE
                                  WHEN Variant_Classification IN ('Frame_Shift_Del', 'Frame_Shift_Ins') OR {conseq_col} LIKE '%frameshift%' THEN '_fs'
                                  WHEN Variant_Classification IN ('Splice_Site', 'Splice_Region') THEN '_splice'
                                  WHEN Amino_acids LIKE '%/%' THEN REGEXP_EXTRACT(Amino_acids,r'^.*/([A-Za-z*-]+)')
                                  ELSE '-'
                                END
                              )
                            ELSE
                              CASE
                                WHEN {conseq_col} LIKE '%splice_%_variant%' THEN REGEXP_EXTRACT({conseq_col},r'^(splice_[^_]+_variant)')
                                WHEN {conseq_col} LIKE '%intron_variant%' THEN 'intron_variant'
                                WHEN Variant_Classification = 'IGR' THEN 'Intergenic'
                                ELSE Variant_Classification
                              END
                          END AS Alteration,
                          CASE
                            WHEN (Amino_acids IS NOT NULL AND REGEXP_EXTRACT(Amino_acids,r'^.*/([A-Za-z*-]+)$') = '*') OR Variant_Classification IN ('Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site', 'Splice_Region') THEN 'TRUNC'
                            WHEN Variant_Classification = 'Nonsense_Mutation' AND {conseq_col} LIKE 'stop_gained%' THEN 'TRUNC'
                            WHEN Variant_Classification = 'Nonstop_Mutation' OR (Variant_Classification = 'Missense_Mutation' AND Variant_Type IN ('DEL','INS')) OR (Variant_Classification = 'Translation_Start_Site') THEN 'MISSENSE'
                            WHEN (Variant_Classification = 'Missense_Mutation' AND Variant_Type IN ('ONP','SNP', 'TNP')) OR (Variant_Classification IN ('In_Frame_Del','In_Frame_Ins')) OR {conseq_col} LIKE '%inframe%' THEN 'INFRAME'
                            WHEN Variant_Classification IN ("RNA","IGR", "3\'UTR","3\'Flank","5\'UTR","5\'Flank") THEN
                              CASE
                                WHEN {conseq_col} LIKE '%intergenic%' THEN 'INTERGENIC'
                                WHEN {conseq_col} LIKE '%regulatory%' THEN 'REGULATORY'
                                WHEN {conseq_col} LIKE '%miRNA%' THEN 'miRNA'
                                WHEN {conseq_col} LIKE '%transcript%' THEN 'TRANSCRIPT'
                                WHEN {conseq_col} LIKE '%downstream%' THEN 'DOWNSTREAM'
                                WHEN {conseq_col} LIKE '%upstream%' THEN 'UPSTREAM'
                                ELSE UPPER(Variant_Classification)
                              END
                            ELSE UPPER(Variant_Classification)
                          END AS Type
                        FROM `{bq_data_project_id}.{dataset_name}.{table_name}`
                        WHERE Variant_Classification NOT IN ('Silent') {filter_clause}
                        AND case_barcode IN (
                          SELECT case_barcode
                          FROM `{cohort_table}`
                          WHERE cohort_id IN ({cohort_id_list})
                          AND (project_id IS NULL{project_clause})
                          GROUP BY case_barcode
                        )
                        GROUP BY case_barcode, Hugo_Symbol, Alteration, Type
                        ORDER BY case_barcode
                    ) sm
                    ON sm.case_barcode = cs.case_barcode
                    ;
                """
        project_id_stmt = ""
        if projects_this_program_set and len(projects_this_program_set):
            project_id_stmt = ', '.join([str(project_id) for project_id in projects_this_program_set])
        project_clause = " OR project_id IN ({})".format(project_id_stmt) if projects_this_program_set else ""

        # NOTE(review): gene symbols from the request are interpolated directly
        # into the SQL text below. Consider BigQuery query parameters instead
        # of string formatting to rule out injection via gene_list.
        gene_list_stm = ''
        if gene_array is not None:
            gene_list_stm = ', '.join('\'{0}\''.format(gene) for gene in gene_array)
        filter_clause = "AND Hugo_Symbol IN ({})".format(gene_list_stm) if gene_list_stm != "" else ""
        cohort_id_list = ', '.join([str(cohort_id) for cohort_id in cohort_id_array])

        cohort_table_id = "{project_name}.{dataset_id}.{table_id}".format(
            project_name=settings.BIGQUERY_PROJECT_ID,
            dataset_id=settings.BIGQUERY_COHORT_DATASET_ID,
            table_id=settings.BIGQUERY_COHORT_TABLE_ID)

        bq_table_info = BQ_MOLECULAR_ATTR_TABLES['TCGA'][genomic_build]
        somatic_mut_query = query_template.format(
            bq_data_project_id = settings.BIGQUERY_DATA_PROJECT_ID,
            dataset_name=bq_table_info['dataset'],
            table_name=bq_table_info['table'],
            conseq_col=("one_consequence" if genomic_build == "hg38" else 'consequence'),
            cohort_table=cohort_table_id,
            filter_clause=filter_clause,
            cohort_id_list=cohort_id_list,
            project_clause=project_clause
        )

        # Kick off the mutation query asynchronously; the CNVR work below
        # overlaps with the BigQuery job's execution time.
        somatic_mut_query_job = BigQuerySupport.insert_query_job(somatic_mut_query)

        plot_data = []
        genes_with_no_cnvr = []
        # Build the CNVR features
        for gene in gene_array:
            feature = build_feature_ids(
                "CNVR", {'value_field': 'segment_mean', 'gene_name': gene, 'genomic_build': genomic_build}
            )
            if not feature or not len(feature):
                logger.warn("[WARNING] No internal feature ID found for CNVR, gene {}, build {}.".format(gene,genomic_build))
                genes_with_no_cnvr.append(gene)
                continue

            feature = feature[0]['internal_feature_id']

            fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(BigQueryServiceSupport.build_from_django_settings())
            data = get_merged_feature_vectors(fvb, feature, None, None, cohort_id_array, None, projects_this_program_set, program_set=program_set)['items']

            if data and len(data):
                for item in data:
                    # 01A are tumor samples, which is what we want
                    if item['sample_id'].split('-')[-1] == '01A':
                        seg_mean = float(item['x'])
                        if seg_mean > 0.112 or seg_mean < -0.112:
                            cnvr_result = "AMP" if seg_mean > 1 else "GAIN" if seg_mean > 0.62 else "HOMDEL" if seg_mean < -1 else "HETLOSS"
                            plot_data.append("{}\t{}\t{}\t{}".format(item['case_id'],gene,cnvr_result,"CNA"))

        # Bug fix: 'results' was only bound inside the job_is_done branch, so a
        # job that never finished within BQ_MAX_ATTEMPTS raised a NameError
        # below. Initialize it up front and fall through gracefully.
        results = None
        attempts = 0
        job_is_done = BigQuerySupport.check_job_is_done(somatic_mut_query_job)
        while attempts < settings.BQ_MAX_ATTEMPTS and not job_is_done:
            job_is_done = BigQuerySupport.check_job_is_done(somatic_mut_query_job)
            sleep(1)
            attempts += 1

        if job_is_done:
            results = BigQuerySupport.get_job_results(somatic_mut_query_job['jobReference'])
        else:
            logger.warn("[WARNING] Somatic mutation query job did not complete within the allowed attempts.")

        #Only add plot_data if gene info is not missing
        if results and len(results) > 0:
            for row in results:
                if row['f'][1]['v']:
                    plot_data.append("{}\t{}\t{}\t{}".format(str(row['f'][0]['v']),str(row['f'][1]['v']),str(row['f'][2]['v']),str(row['f'][3]['v'])))

        if len(plot_data):
            plot_message = \
                '' if not genes_with_no_cnvr \
                    else "No internal feature ID found for CNVR, gene [{}], build {}."\
                        .format(', '.join(genes_with_no_cnvr), genomic_build)

            return JsonResponse({
                'plot_data': plot_data,
                'gene_list': gene_array,
                'bq_tables': ["{bq_data_project_id}:{dataset_name}.{table_name}".format(
                    bq_data_project_id=settings.BIGQUERY_DATA_PROJECT_ID,
                    dataset_name=bq_table_info['dataset'],
                    table_name=bq_table_info['table'])],
                'plot_message': plot_message,
            })

        else:
            return JsonResponse(
                {'message': "The chosen genes and cohorts do not contain any samples with Gene Mutation data."})

    except Exception as e:
        logger.error("[ERROR] In oncoprint_view_data: ")
        logger.exception(e)
        return JsonResponse({'Error': str(e)}, status=500)
def data_access_for_plot(request):
    """
    Used by the web application.

    v2 plot-data endpoint: validates the requested feature IDs (x/y/c),
    resolves the cohorts' project IDs and program names (public, extended,
    and user-only), fetches the merged feature vectors, annotates them with
    cohort info, remaps categorical values to display strings, and returns
    the result as JSON.

    GET parameters (per the reads below): 'ver', 'x_id', 'y_id', 'c_id',
    'log_transform' (JSON), and one or more 'cohort_id' values.
    """
    try:
        logTransform = None
        ver = request.GET.get('ver', '1')
        x_id = request.GET.get('x_id', None)
        y_id = request.GET.get('y_id', None)
        c_id = request.GET.get('c_id', None)
        try:
            # TODO Use jsonschema to validate logTransform object
            # A missing 'log_transform' makes json.loads(None) raise; that is
            # treated as "not supplied" rather than an error.
            logTransform = json.loads(request.GET.get('log_transform', None))
        except Exception as e:
            logger.warn("[WARNING] Log transform parameter not supplied")
            logTransform = None

        cohort_id_array = request.GET.getlist('cohort_id', None)

        # Check that all requested feature identifiers are valid. Do not check for y_id if it is not
        # supplied in the request.
        feature_ids_to_check = [x_id]
        if c_id is not None:
            feature_ids_to_check.append(c_id)
        if y_id is not None:
            feature_ids_to_check.append(y_id)

        valid_features = get_feature_id_validity_for_array(
            feature_ids_to_check)

        # Any invalid feature ID aborts the request with a 500 via the outer
        # exception handler.
        for feature_id, is_valid in valid_features:
            if not is_valid:
                logging.error(
                    "Invalid internal feature ID '{}'".format(feature_id))
                raise Exception('Feature Not Found')

        # Gives the user data handler a chance to map e.g. "v2:USER:343:58901" to "v2:CLIN:case_barcode"
        x_id = _feature_converter(x_id)
        y_id = _feature_converter(y_id)
        c_id = _feature_converter(c_id)

        # Get the project IDs these cohorts' samples come from
        confirmed_study_ids, user_only_study_ids = get_confirmed_project_ids_for_cohorts(
            cohort_id_array)

        bqss = BigQueryServiceSupport.build_from_django_settings()
        fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(bqss)

        # By extracting info from the cohort, we get the NAMES of the public projects
        # we need to access (public projects have unique name tags, e.g. tcga).
        program_set = get_public_program_name_set_for_cohorts(cohort_id_array)

        # We need to do this for cohorts that contain samples found in user data projects,
        # where those projects are extension of public data. This is because the cohorts
        # only reference the user project, but if we are plotting against pubic data, we
        # have to know which public programs we need to look at.

        prog_set_extended = get_extended_public_program_name_set_for_user_extended_projects(
            confirmed_study_ids)
        program_set.update(prog_set_extended)

        # Check to see if these programs have data for the requested vectors; if not, there's no reason to plot
        features_without_program_data = []
        for id in [x_id, y_id, c_id]:
            if id:
                # Feature IDs look like "v2:<TYPE>:..."; the type segment maps
                # to a plot data type via FEATURE_ID_TO_TYPE_MAP.
                type = id.split(':')[1].lower()
                plot_type = FEATURE_ID_TO_TYPE_MAP[
                    type] if type in FEATURE_ID_TO_TYPE_MAP else None
                if plot_type:
                    programs = FeatureDataTypeHelper.get_supported_programs_from_data_type(
                        plot_type)
                    valid_programs = set(programs).intersection(program_set)

                    if not len(valid_programs):
                        features_without_program_data.append(
                            FeatureDataTypeHelper.get_proper_feature_type_name(
                                plot_type))

        if len(features_without_program_data):
            return JsonResponse({
                'message':
                "The chosen cohorts do not contain programs with data for these features: {}."
                .format(", ".join(features_without_program_data))
            })

        user_programs = get_user_program_id_set_for_user_only_projects(
            user_only_study_ids)

        # Fix for #2381: confirmed_study_ids MUST ALWAYS contain the public dataset project IDs, because that's how we
        # enable older cohorts which didn't store project IDs (check for NULL) against ones where we did require the
        # project ID
        if len(user_programs):
            program_set.update(user_programs)
            confirmed_study_ids += user_only_study_ids

        data = get_merged_feature_vectors(fvb,
                                          x_id,
                                          y_id,
                                          c_id,
                                          cohort_id_array,
                                          logTransform,
                                          confirmed_study_ids,
                                          program_set=program_set)

        # Annotate each data point with cohort information
        add_cohort_info_to_merged_vectors(data, x_id, y_id, c_id,
                                          cohort_id_array)

        # convert to display strings where needed (eg. categoricals stored as indicies rather than strings)
        # programs_by_project caches project->program lookups; preformatted_vals
        # caches each program's metadata value set, so the DB is hit at most
        # once per distinct project/program across all items.
        programs_by_project = {}
        preformatted_vals = {}
        for item in data['items']:
            programs = []
            for project in item['project']:
                # Fetch the program if we don't know it already
                if project not in programs_by_project:
                    programs_by_project[project] = Project.objects.get(
                        id=project).program.id
                programs.append(programs_by_project[project])

            for program in programs:
                if program not in preformatted_vals:
                    preformatted_vals[program] = fetch_metadata_value_set(
                        program)
                # For each axis, replace the stored (index) value with its
                # display value when the feature's attribute has a mapping.
                if x_id is not None and x_id.split(':')[-1] in preformatted_vals[program] and item['x'] in \
                        preformatted_vals[program][x_id.split(':')[-1]]['values']:
                    item['x'] = preformatted_vals[program][x_id.split(
                        ':')[-1]]['values'][item['x']]['displ_value']
                if y_id is not None and y_id.split(':')[-1] in preformatted_vals[program] and item['y'] in \
                        preformatted_vals[program][y_id.split(':')[-1]]['values']:
                    item['y'] = preformatted_vals[program][y_id.split(
                        ':')[-1]]['values'][item['y']]['displ_value']
                if c_id is not None and c_id.split(':')[-1] in preformatted_vals[program] and item['c'] in \
                        preformatted_vals[program][c_id.split(':')[-1]]['values']:
                    item['c'] = preformatted_vals[program][c_id.split(
                        ':')[-1]]['values'][item['c']]['displ_value']

        return JsonResponse(data)

    except Exception as e:
        logger.error("[ERROR] In data access for plot: ")
        logger.exception(e)
        return JsonResponse({'error': str(e)}, status=500)
Beispiel #4
0
def data_access_for_plot(request):
    """
    Used by the web application.

    v1 plot-data endpoint: validates the requested feature IDs, resolves the
    cohorts' project IDs and public program names, fetches the merged x/y
    feature vectors, annotates them with cohort info, and returns JSON.
    """
    try:
        logTransform = None
        ver = request.GET.get('ver', '1')
        x_id = request.GET.get('x_id', None)
        y_id = request.GET.get('y_id', None)
        c_id = request.GET.get('c_id', None)
        try:
            # A missing 'log_transform' makes json.loads(None) raise; that is
            # treated as "not supplied" rather than an error.
            logTransform = json.loads(request.GET.get('log_transform', None))
        except Exception as e:
            # Bug fix: removed Python-2-only `print >> sys.stdout` (a syntax
            # error under Python 3); the module logger already records this.
            logger.warn("[WARNING] Log transform parameter not supplied")
            logTransform = None

        cohort_id_array = request.GET.getlist('cohort_id', None)

        # Check that all requested feature identifiers are valid. Do not check for y_id if it is not
        # supplied in the request.
        feature_ids_to_check = [x_id]
        if c_id is not None:
            feature_ids_to_check.append(c_id)
        if y_id is not None:
            feature_ids_to_check.append(y_id)

        valid_features = get_feature_id_validity_for_array(
            feature_ids_to_check)

        # Any invalid feature ID aborts the request with a 500 via the outer
        # exception handler. Use the module logger consistently instead of
        # mixing in the root `logging` module.
        for feature_id, is_valid in valid_features:
            logger.info("{}: {}".format(feature_id, is_valid))
            if not is_valid:
                logger.error(
                    "Invalid internal feature ID '{}'".format(feature_id))
                raise Exception('Feature Not Found')

        # Get the project IDs these cohorts' samples come from
        confirmed_study_ids = get_confirmed_project_ids_for_cohorts(
            cohort_id_array)

        bqss = BigQueryServiceSupport.build_from_django_settings()
        fvb = FeatureVectorBigQueryBuilder.build_from_django_settings(bqss)

        program_set = get_public_program_name_set_for_cohorts(cohort_id_array)
        data = get_merged_feature_vectors(fvb,
                                          x_id,
                                          y_id,
                                          None,
                                          cohort_id_array,
                                          logTransform,
                                          confirmed_study_ids,
                                          program_set=program_set)

        # Annotate each data point with cohort information
        add_cohort_info_to_merged_vectors(data, x_id, y_id, c_id,
                                          cohort_id_array)

        return JsonResponse(data)

    except Exception as e:
        # logger.exception records the full traceback; the old py2-style
        # `print >> sys.stdout, traceback.format_exc()` was redundant.
        logger.exception(e)
        return JsonResponse({'error': str(e)}, status=500)