def annotate_vector_with_cohorts(self, cohort_id_array, result):
        """
        Add a 'cohort' entry to every row of ``result``, in place.

        The cohort membership of each datapoint is looked up once through
        ``CloudSQLCohortAccess.get_cohorts_for_datapoints`` and indexed by the
        row's 'sample_id'. No threshold is applied here: every row receives a
        'cohort' value, which is an empty list when the sample is not present
        in the lookup.

        :param cohort_id_array: Cohort identifier array.
        :param result: Iterable of dict-like rows, each carrying a 'sample_id' key.
        """
        membership_by_sample = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

        for entry in result:
            # TODO FIX - the fallback shouldn't be needed; every sample is
            # expected to appear in the lookup.
            entry['cohort'] = membership_by_sample.get(entry['sample_id'], [])
예제 #2
0
    def annotate_vector_with_cohorts(self, cohort_id_array, merged):
        """
        Annotate each merged value bundle, in place, with the cohorts containing it.

        ``CloudSQLCohortAccess.get_cohorts_for_datapoints`` is queried once and
        then indexed by each bundle's "sample_id". The "cohort" key is always
        written (no threshold is applied in this method); bundles whose sample
        is missing from the lookup get an empty list.

        :param cohort_id_array: Cohort identifier array.
        :param merged: Iterable of dict-like value bundles, each with a "sample_id" key.
        """
        cohorts_by_sample = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

        for bundle in merged:
            sample_key = bundle["sample_id"]
            # TODO FIX - the empty-list default shouldn't be needed.
            bundle["cohort"] = cohorts_by_sample.get(sample_key, [])
    def get_cohort_information(self, cohort_id_array):
        """
        Look up descriptive information for every requested cohort.

        Thin wrapper that returns the result of
        ``CloudSQLCohortAccess.get_cohort_info`` unchanged (per the original
        note: the name, size and ID of each cohort).

        :param cohort_id_array: Cohort identifier array.
        :return: Whatever ``CloudSQLCohortAccess.get_cohort_info`` returns for the given IDs.
        """
        return CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
예제 #4
0
    def get_merged_feature_vectors(self, x_id, y_id, c_id, cohort_id_array, logTransform, study_id_array):
        """
        Fetch, optionally log-transform, and merge data for up to three feature vectors.

        Each fetched vector is an array of dictionaries; only the 'value' field
        of each dictionary is merged:
        [
            {
                'value': 0.5
            },
            {
                'value': 1.0
            }
        ]
        The merged result has one entry per sample (not per patient):
        [
            {
                'sample_id': <sample ID #0>
                'x': <value for x for sample ID #0>
                'y': <value for y for sample ID #0>
                'c': <value for c for sample ID #0>
            },
            ...
        ]
        An entry additionally receives a 'cohort' field when the number of
        requested cohorts containing the sample is at least
        DATAPOINT_COHORT_THRESHOLD.

        :param x_id: Feature identifier for x-axis e.g. 'CLIN:age_at_initial_pathologic_diagnosis'
        :param y_id: Feature identifier for y-axis. If None, no y data is fetched.
        :param c_id: Feature identifier for color-by. If None, no color-by data is fetched.
        :param cohort_id_array: Cohort identifier array.
        :param logTransform: None, or a subscriptable settings object. Truthy 'x' / 'y' entries
                             enable a log(value + 1) transform of that axis (only when the axis
                             type passes is_log_transformable); 'xBase' / 'yBase' select the base:
                             10, 'e', or an integer greater than 1.
        :param study_id_array: Study identifier array, forwarded to every feature query.

        :return: PlotDataResponse
        """

        async_params = [FeatureIdQueryDescription(x_id, cohort_id_array, study_id_array)]

        c_type, c_vec = ValueType.STRING, []
        y_type, y_vec = ValueType.STRING, []

        units = get_axis_units(x_id, y_id)

        if c_id is not None:
            async_params.append(FeatureIdQueryDescription(c_id, cohort_id_array, study_id_array))
        if y_id is not None:
            async_params.append(FeatureIdQueryDescription(y_id, cohort_id_array, study_id_array))

        async_result = get_feature_vectors_tcga_only(async_params)

        if c_id is not None:
            c_type, c_vec = async_result[c_id]['type'], async_result[c_id]['data']
        if y_id is not None:
            y_type, y_vec = async_result[y_id]['type'], async_result[y_id]['data']
            if logTransform is not None and logTransform['y'] and y_vec and is_log_transformable(y_type):
                self._log_transform_vector(y_vec, logTransform, 'yBase')

        x_type, x_vec = async_result[x_id]['type'], async_result[x_id]['data']

        if logTransform is not None and logTransform['x'] and x_vec and is_log_transformable(x_type):
            self._log_transform_vector(x_vec, logTransform, 'xBase')

        # Merge per sample rather than per patient.
        vms = VectorMergeSupport('NA', 'sample_id', 'case_id', ['x', 'y', 'c'])
        vms.add_dict_array(x_vec, 'x', 'value')
        vms.add_dict_array(y_vec, 'y', 'value')
        vms.add_dict_array(c_vec, 'c', 'value')
        merged = self.get_merged_dict_timed(vms)

        # Resolve which (requested) cohorts each datapoint belongs to.
        cohort_set_dict = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

        # Get the name and ID for every requested cohort.
        cohort_info_array = CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
        cohort_info_obj_array = [
            PlotDataCohortInfo(id=item['id'], name=item['name'])
            for item in cohort_info_array
        ]

        items = []
        for value_bundle in merged:
            # TODO FIX - the empty-list default shouldn't be needed.
            cohort_set = cohort_set_dict.get(value_bundle['sample_id'], [])

            # Only attach the cohort array when the number of containing
            # cohorts reaches the configured threshold.
            if len(cohort_set) >= DATAPOINT_COHORT_THRESHOLD:
                value_bundle['cohort'] = cohort_set

            items.append(PlotDataPoint(**value_bundle))

        counts = self.get_counts(merged)
        count_message = PlotDatapointCount(**counts)

        type_message = PlotDataTypes(x=x_type, y=y_type, c=c_type)

        # TODO assign label for y if y_id is None, as in that case the y-field will be missing from the response
        label_message = PlotDataFeatureLabels(x=x_id, y=y_id, c=c_id)

        # TODO Refactor pairwise call to separate function
        # Include pairwise results
        input_vectors = [PairwiseInputVector(x_id, x_type, x_vec)]
        if c_id is not None:
            input_vectors.append(PairwiseInputVector(c_id, c_type, c_vec))
        if y_id is not None:
            input_vectors.append(PairwiseInputVector(y_id, y_type, y_vec))

        pairwise_result = None

        # A pairwise comparison needs at least two vectors.
        if len(input_vectors) > 1:
            pairwise_result = self.get_pairwise_result(input_vectors)

        if pairwise_result is None:
            logger.warning("[WARNING] Pairwise results not included in returned object")

        return PlotDataResponse(types=type_message, labels=label_message, items=items,
                                cohort_set=cohort_info_obj_array,
                                counts=count_message, pairwise_result=pairwise_result,
                                xUnits=units['x'], yUnits=units['y'])

    def _log_transform_vector(self, data_vec, log_settings, base_key):
        """
        Apply a log(value + 1) transform, in place, to the 'value' of every entry in ``data_vec``.

        Entries without a 'value' key, or whose value is None, "NA" or "None",
        are left untouched. Negative values are replaced by "NA". Every other
        value is replaced by the string form of its transformed value. When the
        configured base is not usable the values are left as they are and a
        single warning is logged for the whole vector.

        :param data_vec: Iterable of dicts to update in place.
        :param log_settings: Subscriptable settings object holding the log base.
        :param base_key: Key of the base inside ``log_settings`` ('xBase' or 'yBase').
        """
        log_fn = None
        base_resolved = False

        for datum in data_vec:
            if 'value' not in datum:
                continue
            raw_value = datum['value']
            if raw_value is None or raw_value == "NA" or raw_value == "None":
                continue

            number = float(raw_value)
            if number < 0:
                datum['value'] = "NA"
                continue

            # The base is looked up lazily, on the first value that actually
            # needs it, and only once so an invalid base warns a single time.
            if not base_resolved:
                log_fn = self._resolve_log_function(log_settings[base_key])
                base_resolved = True

            if log_fn is not None:
                datum['value'] = str(log_fn(number + 1))

    @staticmethod
    def _resolve_log_function(base):
        """
        Translate a log base setting into a one-argument logarithm function.

        :param base: 10, 'e', or an integer greater than 1.
        :return: The matching function, or None (after logging a warning) if the base is unusable.
        """
        if base == 10:
            return math.log10
        if base == 'e':
            return math.log
        # bool is an int subclass but never a meaningful base, and integer
        # bases <= 1 make math.log raise instead of returning a value.
        if isinstance(base, int) and not isinstance(base, bool) and base > 1:
            return lambda number: math.log(number, base)

        logger.warning(
            "[WARNING] No valid log base was supplied - log transformation will not be applied!"
        )
        return None
예제 #5
0
    def get_merged_feature_vectors(self, x_id, y_id, c_id, cohort_id_array):
        """
        Fetch and merge data for up to three feature vectors into a plot response.

        Each fetched vector is an array of dictionaries; only the 'value' field
        of each dictionary is merged:
        [
            {
                'value': 0.5
            },
            {
                'value': 1.0
            }
        ]
        The merged result has one entry per sample (not per patient):
        [
            {
                'sample_id': <sample ID #0>
                'x': <value for x for sample ID #0>
                'y': <value for y for sample ID #0>
                'c': <value for c for sample ID #0>
            },
            ...
        ]
        An entry additionally receives a 'cohort' field when the number of
        requested cohorts containing the sample is at least
        DATAPOINT_COHORT_THRESHOLD.

        :param x_id: Feature identifier for x-axis e.g. 'CLIN:age_at_initial_pathologic_diagnosis'
        :param y_id: Feature identifier for y-axis. If None, no y data is fetched.
        :param c_id: Feature identifier for color-by. If None, no color-by data is fetched.
        :param cohort_id_array: Cohort identifier array.

        :return: PlotDataResponse
        """

        async_params = [FeatureIdQueryDescription(x_id, cohort_id_array)]

        c_type, c_vec = ValueType.STRING, []
        y_type, y_vec = ValueType.STRING, []

        if c_id is not None:
            async_params.append(FeatureIdQueryDescription(c_id, cohort_id_array))
        if y_id is not None:
            async_params.append(FeatureIdQueryDescription(y_id, cohort_id_array))

        async_result = get_feature_vectors_tcga_only(async_params)

        if c_id is not None:
            c_type, c_vec = async_result[c_id]['type'], async_result[c_id]['data']
        if y_id is not None:
            y_type, y_vec = async_result[y_id]['type'], async_result[y_id]['data']

        x_type, x_vec = async_result[x_id]['type'], async_result[x_id]['data']

        # Merge per sample rather than per patient.
        vms = VectorMergeSupport('NA', 'sample_id', ['x', 'y', 'c'])
        vms.add_dict_array(x_vec, 'x', 'value')
        vms.add_dict_array(y_vec, 'y', 'value')
        vms.add_dict_array(c_vec, 'c', 'value')
        merged = self.get_merged_dict_timed(vms)

        # Resolve which (requested) cohorts each datapoint belongs to.
        cohort_set_dict = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

        # Get the name and ID for every requested cohort.
        cohort_info_array = CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
        cohort_info_obj_array = [
            PlotDataCohortInfo(id=item['id'], name=item['name'])
            for item in cohort_info_array
        ]

        items = []
        for value_bundle in merged:
            # TODO FIX - the empty-list default shouldn't be needed.
            cohort_set = cohort_set_dict.get(value_bundle['sample_id'], [])

            # Only attach the cohort array when the number of containing
            # cohorts reaches the configured threshold.
            if len(cohort_set) >= DATAPOINT_COHORT_THRESHOLD:
                value_bundle['cohort'] = cohort_set
            items.append(PlotDataPoint(**value_bundle))

        counts = self.get_counts(merged)
        count_message = PlotDatapointCount(**counts)

        type_message = PlotDataTypes(x=x_type, y=y_type, c=c_type)

        # TODO assign label for y if y_id is None, as in that case the y-field will be missing from the response
        label_message = PlotDataFeatureLabels(x=x_id, y=y_id, c=c_id)

        # TODO Refactor pairwise call to separate function
        # Include pairwise results. Only features that were actually requested
        # take part; a vector with a None identifier and no data is not sent.
        input_vectors = [PairwiseInputVector(x_id, x_type, x_vec)]
        if c_id is not None:
            input_vectors.append(PairwiseInputVector(c_id, c_type, c_vec))
        if y_id is not None:
            input_vectors.append(PairwiseInputVector(y_id, y_type, y_vec))

        pairwise_result = None
        # A pairwise comparison needs at least two vectors.
        if len(input_vectors) > 1:
            try:
                pairwise_result = self.get_pairwise_result(input_vectors)
            except Exception as e:
                # Best effort: the plot data is still returned without pairwise results.
                logging.warning("Pairwise results not included in returned object")
                logging.exception(e)
        else:
            logging.warning("Pairwise results not included in returned object")

        return PlotDataResponse(types=type_message, labels=label_message, items=items,
                                cohort_set=cohort_info_obj_array,
                                counts=count_message, pairwise_result=pairwise_result)