def seqpeek_view_data(self, request):
        """
        Builds the SeqPeek view response for one gene across the requested cohorts.

        The GNAB feature ID is built from ``request.hugo_symbol``, the matching MAF data vector is
        fetched for ``request.cohort_id`` through SeqPeekDataProvider, and the vector is formatted
        and assembled into view data.

        :param request: Request message; ``hugo_symbol`` and ``cohort_id`` are read from it.

        :return: The result of ``self.create_response`` when MAF data was found. Otherwise a
                 SeqPeekViewRecord with empty tracks/regions, no protein and empty removed-row
                 statistics.
        :raises InternalServerErrorException: On any failure; the original exception is logged first.
        """
        try:
            hugo_symbol = request.hugo_symbol
            cohort_id_array = request.cohort_id

            gnab_feature_id = self.build_gnab_feature_id(hugo_symbol)
            logging.debug("GNAB feature ID for SeqPeek: %s", gnab_feature_id)

            async_params = [ProviderClassQueryDescription(SeqPeekDataProvider, gnab_feature_id, cohort_id_array)]
            maf_data_result = get_feature_vectors_tcga_only(async_params, skip_formatting_for_plot=True)
            maf_data_vector = maf_data_result[gnab_feature_id]['data']

            if len(maf_data_vector) == 0:
                # No data found: answer with an empty plot instead of running the formatter.
                empty_plot = SeqPeekViewPlotDataRecord(tracks=[], protein=None, regions=[])
                return SeqPeekViewRecord(plot_data=empty_plot,
                                         hugo_symbol=hugo_symbol,
                                         cohort_id_list=[str(i) for i in cohort_id_array],
                                         removed_row_statistics=[])

            # Since the gene (hugo_symbol) parameter is part of the GNAB feature ID,
            # it will be sanity-checked in the SeqPeekMAFDataAccess instance.
            seqpeek_data = SeqPeekMAFDataFormatter().format_maf_vector_for_view(maf_data_vector, cohort_id_array)

            view_data = SeqPeekViewDataBuilder().build_view_data(hugo_symbol,
                                                                 seqpeek_data.maf_vector,
                                                                 seqpeek_data.cohort_info,
                                                                 cohort_id_array,
                                                                 seqpeek_data.removed_row_statistics)

            return self.create_response(view_data)
        except Exception as e:
            # Endpoint boundary: log the full traceback, surface a generic server error to the client.
            logging.exception(e)
            raise InternalServerErrorException()
Beispiel #2
0
    def get_merged_feature_vectors(self, x_id, y_id, c_id, cohort_id_array, logTransform, study_id_array):
        """
        Fetches and merges data for up to three feature vectors (see parameter documentation below).
        The vectors have to be an array of dictionaries, with each dictionary containing a 'value' field
        (other fields are ignored):
        [
            {
                'value': 0.5
            },
            {
                'value': 1.0
            }
        ]
        The merged result (one entry per sample):
        [
            {
                'sample_id': <sample ID #0>
                'x': <value for x for sample ID #0>
                'y': <value for y for sample ID #0>
                'c': <value for c for sample ID #0>
            },
            {
                'sample_id': <sample ID #1>
                'x': <value for x for sample ID #1>
                'y': <value for y for sample ID #1>
                'c': <value for c for sample ID #1>
            }
            ...
        ]

        :param x_id: Feature identifier for x-axis e.g. 'CLIN:age_at_initial_pathologic_diagnosis'
        :param y_id: Feature identifier for y-axis. If None, values for 'y' in the response will be marked as missing.
        :param c_id: Feature identifier for color-by. If None, values for 'c' in the response will be marked as missing.
        :param cohort_id_array: Cohort identifier array.
        :param logTransform: None to disable log transformation, otherwise a mapping with the flags 'x' and 'y'
                             and the bases 'xBase' and 'yBase'. A base may be 10, 'e' or an integer greater
                             than 1. Transformed values are replaced in place by str(log(value + 1)); negative
                             values are replaced by "NA".
        :param study_id_array: Study identifier array, passed through to every feature query.

        :return: PlotDataResponse
        """

        async_params = [FeatureIdQueryDescription(x_id, cohort_id_array, study_id_array)]

        c_type, c_vec = ValueType.STRING, []
        y_type, y_vec = ValueType.STRING, []

        units = get_axis_units(x_id, y_id)

        if c_id is not None:
            async_params.append(FeatureIdQueryDescription(c_id, cohort_id_array, study_id_array))
        if y_id is not None:
            async_params.append(FeatureIdQueryDescription(y_id, cohort_id_array, study_id_array))

        async_result = get_feature_vectors_tcga_only(async_params)

        if c_id is not None:
            c_type, c_vec = async_result[c_id]['type'], async_result[c_id]['data']
        if y_id is not None:
            y_type, y_vec = async_result[y_id]['type'], async_result[y_id]['data']
            self._log_transform_values(y_vec, y_type, logTransform, 'y')

        x_type, x_vec = async_result[x_id]['type'], async_result[x_id]['data']
        self._log_transform_values(x_vec, x_type, logTransform, 'x')

        # Merge per sample rather than per patient.
        vms = VectorMergeSupport('NA', 'sample_id', 'case_id', ['x', 'y', 'c'])
        vms.add_dict_array(x_vec, 'x', 'value')
        vms.add_dict_array(y_vec, 'y', 'value')
        vms.add_dict_array(c_vec, 'c', 'value')
        merged = self.get_merged_dict_timed(vms)

        # Resolve which (requested) cohorts each datapoint belongs to.
        cohort_set_dict = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

        # Get the name and ID for every requested cohort.
        cohort_info_obj_array = [
            PlotDataCohortInfo(id=item['id'], name=item['name'])
            for item in CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
        ]

        items = []
        for value_bundle in merged:
            sample_id = value_bundle['sample_id']

            # TODO FIX - this membership check shouldn't be needed
            cohort_set = cohort_set_dict[sample_id] if sample_id in cohort_set_dict else []

            # Attach the cohort array only if the number of containing cohorts
            # meets or exceeds the configured threshold.
            if len(cohort_set) >= DATAPOINT_COHORT_THRESHOLD:
                value_bundle['cohort'] = cohort_set

            items.append(PlotDataPoint(**value_bundle))

        counts = self.get_counts(merged)
        count_message = PlotDatapointCount(**counts)

        type_message = PlotDataTypes(x=x_type, y=y_type, c=c_type)

        # TODO assign label for y if y_id is None, as in that case the y-field will be missing from the response
        label_message = PlotDataFeatureLabels(x=x_id, y=y_id, c=c_id)

        # TODO Refactor pairwise call to separate function
        # Include pairwise results
        input_vectors = [PairwiseInputVector(x_id, x_type, x_vec)]
        if c_id is not None:
            input_vectors.append(PairwiseInputVector(c_id, c_type, c_vec))
        if y_id is not None:
            input_vectors.append(PairwiseInputVector(y_id, y_type, y_vec))

        # The pairwise call is only made when at least two vectors were requested.
        pairwise_result = None
        if len(input_vectors) > 1:
            pairwise_result = self.get_pairwise_result(input_vectors)

        if pairwise_result is None:
            logger.warning("[WARNING] Pairwise results not included in returned object")

        return PlotDataResponse(types=type_message, labels=label_message, items=items,
                                cohort_set=cohort_info_obj_array,
                                counts=count_message, pairwise_result=pairwise_result,
                                xUnits=units['x'], yUnits=units['y'])

    @staticmethod
    def _log_transform_values(data_vector, value_type, log_transform, axis):
        """
        Applies the log(value + 1) transformation requested for one axis, in place.

        Nothing happens unless log_transform is not None, its flag for the axis is truthy, the vector is
        non-empty and is_log_transformable accepts the value type. Entries without a 'value' key, or whose
        value is None, "NA" or "None", are left untouched. Negative values become "NA" (also when the base
        is invalid). All other values become str(log(value + 1)) in the requested base.

        :param data_vector: Array of dictionaries; their 'value' fields are rewritten in place.
        :param value_type: Value type of the vector, checked with is_log_transformable.
        :param log_transform: None, or a mapping holding the '<axis>' flag and the '<axis>Base' log base.
        :param axis: 'x' or 'y'.
        """
        if (log_transform is None or not log_transform[axis] or not data_vector
                or not is_log_transformable(value_type)):
            return

        # A missing base entry is handled like any other unusable base.
        try:
            base = log_transform[axis + 'Base']
        except KeyError:
            base = None

        if base == 10:
            log_fn = math.log10
        elif base == 'e':
            log_fn = math.log
        elif type(base) is int and base > 1:
            # Exact int check keeps booleans out; bases below 2 would make math.log raise
            # (division by zero for base 1, math domain error for base <= 0).
            def log_fn(number):
                return math.log(number, base)
        else:
            log_fn = None
            # Reported once per vector rather than once per data point.
            logger.warning(
                "[WARNING] No valid log base was supplied - log transformation will not be applied!"
            )

        for point in data_vector:
            if 'value' not in point:
                continue
            raw_value = point['value']
            if raw_value is None or raw_value == "NA" or raw_value == "None":
                continue

            number = float(raw_value)
            if number < 0:
                # A negative value is marked as missing rather than transformed.
                point['value'] = "NA"
            elif log_fn is not None:
                point['value'] = str(log_fn(number + 1))
    def seqpeek_view_data(self, request):
        """
        Builds the SeqPeek view response for one gene across the requested cohorts.

        The study IDs of the cohorts' samples are read from the cohorts_samples table. Only studies
        that are in the set returned by fetch_isbcgc_project_set(), or whose root study is in that
        set, are passed on to the SeqPeek data query.

        :param request: Request message; ``hugo_symbol`` and ``cohort_id`` are read from it.

        :return: The result of ``self.create_response`` when MAF data was found. Otherwise a
                 SeqPeekViewRecord with empty tracks/regions, no protein and empty removed-row
                 statistics.
        :raises InternalServerErrorException: On any failure; the original exception is logged first.
        """
        try:
            hugo_symbol = request.hugo_symbol
            cohort_id_array = request.cohort_id

            gnab_feature_id = self.build_gnab_feature_id(hugo_symbol)
            logging.debug("GNAB feature ID for SeqPeek: %s", gnab_feature_id)

            tcga_studies = fetch_isbcgc_project_set()

            # Lifted from api/data_access.py line 509+
            # Get the study IDs these cohorts' samples come from.
            # One "%s" placeholder per cohort; the values themselves are bound by the driver.
            cohort_vals = tuple(cohort_id_array)
            cohort_params = ",".join(["%s"] * len(cohort_vals))

            # Close the cursor and the connection even if the query fails.
            db = sql_connection()
            try:
                cursor = db.cursor()
                try:
                    cursor.execute(
                        "SELECT DISTINCT study_id FROM cohorts_samples WHERE cohort_id IN (" + cohort_params + ");",
                        cohort_vals)
                    study_rows = cursor.fetchall()
                finally:
                    cursor.close()
            finally:
                db.close()

            # Only samples whose source studies are TCGA studies, or extended from them, should be used
            confirmed_study_ids = []
            unconfirmed_study_ids = []

            for row in study_rows:
                study_id = row[0]
                bucket = confirmed_study_ids if study_id in tcga_studies else unconfirmed_study_ids
                if study_id not in bucket:
                    bucket.append(study_id)

            if unconfirmed_study_ids:
                # A study outside the set still qualifies when its root study is in the set.
                for study in Study.objects.filter(id__in=unconfirmed_study_ids):
                    if study.get_my_root_and_depth()['root'] in tcga_studies:
                        confirmed_study_ids.append(study.id)

            async_params = [ProviderClassQueryDescription(SeqPeekDataProvider, gnab_feature_id,
                                                          cohort_id_array, confirmed_study_ids)]
            maf_data_result = get_feature_vectors_tcga_only(async_params, skip_formatting_for_plot=True)
            maf_data_vector = maf_data_result[gnab_feature_id]['data']

            if len(maf_data_vector) == 0:
                # No data found: answer with an empty plot instead of running the formatter.
                empty_plot = SeqPeekViewPlotDataRecord(tracks=[], protein=None, regions=[])
                return SeqPeekViewRecord(plot_data=empty_plot,
                                         hugo_symbol=hugo_symbol,
                                         cohort_id_list=[str(i) for i in cohort_id_array],
                                         removed_row_statistics=[])

            # Since the gene (hugo_symbol) parameter is part of the GNAB feature ID,
            # it will be sanity-checked in the SeqPeekMAFDataAccess instance.
            seqpeek_data = SeqPeekMAFDataFormatter().format_maf_vector_for_view(maf_data_vector, cohort_id_array)

            view_data = SeqPeekViewDataBuilder().build_view_data(hugo_symbol,
                                                                 seqpeek_data.maf_vector,
                                                                 seqpeek_data.cohort_info,
                                                                 cohort_id_array,
                                                                 seqpeek_data.removed_row_statistics)

            return self.create_response(view_data)
        except Exception as e:
            # Endpoint boundary: log the full traceback, surface a generic server error to the client.
            logging.exception(e)
            raise InternalServerErrorException()
    def get_merged_feature_vectors(self, x_id, y_id, c_id, cohort_id_array):
        """
        Fetches and merges data for up to three feature vectors (see parameter documentation below).
        The vectors have to be an array of dictionaries, with each dictionary containing a 'value' field
        (other fields are ignored):
        [
            {
                'value': 0.5
            },
            {
                'value': 1.0
            }
        ]
        The merged result (one entry per sample):
        [
            {
                'sample_id': <sample ID #0>
                'x': <value for x for sample ID #0>
                'y': <value for y for sample ID #0>
                'c': <value for c for sample ID #0>
            },
            {
                'sample_id': <sample ID #1>
                'x': <value for x for sample ID #1>
                'y': <value for y for sample ID #1>
                'c': <value for c for sample ID #1>
            }
            ...
        ]

        :param x_id: Feature identifier for x-axis e.g. 'CLIN:age_at_initial_pathologic_diagnosis'
        :param y_id: Feature identifier for y-axis. If None, values for 'y' in the response will be marked as missing.
        :param c_id: Feature identifier for color-by. If None, values for 'c' in the response will be marked as missing.
        :param cohort_id_array: Cohort identifier array.

        :return: PlotDataResponse
        """

        async_params = [FeatureIdQueryDescription(x_id, cohort_id_array)]

        c_type, c_vec = ValueType.STRING, []
        y_type, y_vec = ValueType.STRING, []

        if c_id is not None:
            async_params.append(FeatureIdQueryDescription(c_id, cohort_id_array))
        if y_id is not None:
            async_params.append(FeatureIdQueryDescription(y_id, cohort_id_array))

        async_result = get_feature_vectors_tcga_only(async_params)

        if c_id is not None:
            c_type, c_vec = async_result[c_id]['type'], async_result[c_id]['data']
        if y_id is not None:
            y_type, y_vec = async_result[y_id]['type'], async_result[y_id]['data']

        x_type, x_vec = async_result[x_id]['type'], async_result[x_id]['data']

        # Merge per sample rather than per patient.
        vms = VectorMergeSupport('NA', 'sample_id', ['x', 'y', 'c'])
        vms.add_dict_array(x_vec, 'x', 'value')
        vms.add_dict_array(y_vec, 'y', 'value')
        vms.add_dict_array(c_vec, 'c', 'value')
        merged = self.get_merged_dict_timed(vms)

        # Resolve which (requested) cohorts each datapoint belongs to.
        cohort_set_dict = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

        # Get the name and ID for every requested cohort.
        cohort_info_obj_array = [
            PlotDataCohortInfo(id=item['id'], name=item['name'])
            for item in CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
        ]

        items = []
        for value_bundle in merged:
            sample_id = value_bundle['sample_id']

            # TODO FIX - this membership check shouldn't be needed
            cohort_set = cohort_set_dict[sample_id] if sample_id in cohort_set_dict else []

            # Attach the cohort array only if the number of containing cohorts
            # meets or exceeds the configured threshold.
            if len(cohort_set) >= DATAPOINT_COHORT_THRESHOLD:
                value_bundle['cohort'] = cohort_set
            items.append(PlotDataPoint(**value_bundle))

        counts = self.get_counts(merged)
        count_message = PlotDatapointCount(**counts)

        type_message = PlotDataTypes(x=x_type, y=y_type, c=c_type)

        # TODO assign label for y if y_id is None, as in that case the y-field will be missing from the response
        label_message = PlotDataFeatureLabels(x=x_id, y=y_id, c=c_id)

        # TODO Refactor pairwise call to separate function
        # Include pairwise results. Only features that were actually requested take part, so a
        # missing color-by feature no longer contributes a vector with a None identifier.
        input_vectors = [PairwiseInputVector(x_id, x_type, x_vec)]
        if c_id is not None:
            input_vectors.append(PairwiseInputVector(c_id, c_type, c_vec))
        if y_id is not None:
            input_vectors.append(PairwiseInputVector(y_id, y_type, y_vec))

        pairwise_result = None
        if len(input_vectors) > 1:
            try:
                pairwise_result = self.get_pairwise_result(input_vectors)
            except Exception as e:
                # Best effort: the plot data is still returned without pairwise results.
                logging.warning("Pairwise results not included in returned object")
                logging.exception(e)
        else:
            # The pairwise call is skipped when only the x feature was requested.
            logging.warning("Pairwise results not included in returned object")

        return PlotDataResponse(types=type_message, labels=label_message, items=items,
                                cohort_set=cohort_info_obj_array,
                                counts=count_message, pairwise_result=pairwise_result)