def get_feature_vector(self, feature_id, cohort_id_array):
        """
        Fetch the vector for a single feature over the given cohorts and merge
        it on sample identity.

        :param feature_id: Feature identifier string.
        :param cohort_id_array: Cohort identifier array passed to the fetch.
        :return: Tuple (feature_type, merged) where merged is the dict produced
                 by VectorMergeSupport.get_merged_dict().
        """
        start = time.time()

        async_params = [(feature_id, cohort_id_array)]
        async_result = get_feature_vectors_with_user_data(async_params)

        feature_type, feature_vec = async_result[feature_id]["type"], async_result[feature_id]["data"]

        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logging.info("Time elapsed: %s", time.time() - start)

        # 'NA' is the missing-value marker; presumably [feature_id] plays the
        # row_ids role here (other call sites pass row_ids=) — TODO confirm.
        vms = VectorMergeSupport("NA", "sample_id", [feature_id])
        vms.add_dict_array(feature_vec, feature_id, "value")

        return feature_type, vms.get_merged_dict()
# Example #2
# 0
    def prepare_feature_vector(self, input_vectors):
        """
        Render a set of pairwise input vectors as tab-separated rows, one row
        per feature, each prefixed with "<type-code>:<feature_id>".

        :param input_vectors: Iterable of objects carrying feature_id,
                              value_type (a ValueType) and data (dict array).
        :return: List of tab-joined row strings.
        """
        feature_vector_mapping = {}
        for item in input_vectors:
            # Collapse the ValueType enum to the single-letter code used in
            # the row header: N = numeric, C = categorical, B = everything else.
            if item.value_type == ValueType.INTEGER or item.value_type == ValueType.FLOAT:
                type_code = "N"
            elif item.value_type == ValueType.STRING:
                type_code = "C"
            else:
                type_code = "B"

            feature_vector_mapping[item.feature_id] = (type_code, item.data)

        # Create merged feature vectors, keyed on sample/case identity.
        feature_ids = [v.feature_id for v in input_vectors]

        vms = VectorMergeSupport('NA', 'sample_id', 'case_id', row_ids=feature_ids)

        for feature_id, (_, vector) in feature_vector_mapping.items():
            vms.add_dict_array(vector, feature_id, 'value')

        merged = vms.get_merged_dict()

        rows = []
        for feature_id, (type_code, _) in feature_vector_mapping.items():
            current_row = [type_code + ":" + feature_id]
            current_row.extend(item[feature_id] for item in merged)
            rows.append("\t".join(current_row))

        return rows
# Example #3
# 0
    def prepare_features(self, cohort_id, features):
        """
        Fetch, type-tag and merge the given features for one cohort, rendered
        as tab-separated rows (one row per feature, prefixed "<type-code>:<id>").

        :param cohort_id: Cohort identifier forwarded to the feature fetch.
        :param features: Iterable of feature identifier strings.
        :return: List of tab-joined row strings.
        """
        # NOTE(review): get_feature_vector is called here as a bare name even
        # though a method of the same name exists on this class — confirm a
        # module-level function is intended.
        feature_vector_mapping = {}
        for feature in features:
            value_type, vector = get_feature_vector(feature, cohort_id)

            # Collapse ValueType to the single-letter code used in the row
            # header: N = numeric, C = categorical, B = everything else.
            if value_type == ValueType.INTEGER or value_type == ValueType.FLOAT:
                type_code = "N"
            elif value_type == ValueType.STRING:
                type_code = "C"
            else:
                type_code = "B"

            feature_vector_mapping[feature] = (type_code, vector)

        # Create merged feature vectors keyed on sample identity.
        vms = VectorMergeSupport('NA', 'sample_id', row_ids=features)

        for feature, (_, vector) in feature_vector_mapping.items():
            vms.add_dict_array(vector, feature, 'value')

        merged = vms.get_merged_dict()

        rows = []
        for feature, (type_code, _) in feature_vector_mapping.items():
            current_row = [type_code + ":" + feature]
            current_row.extend(item[feature] for item in merged)
            rows.append("\t".join(current_row))

        return rows
# Example #4
# 0
    def get_merged_feature_vectors(self, x_id, y_id, c_id, cohort_id_array, logTransform, study_id_array):
        """
        Fetches and merges data for two or three feature vectors (see parameter documentation below).
        The vectors have to be an array of dictionaries, with each dictionary containing a 'value' field
        (other fields are ignored):
        [
            {'value': 0.5},
            {'value': 1.0}
        ]
        The merged result:
        [
            {
                'patient_id': <patient ID #0>
                'x': <value for x for patient ID #0>
                'y': <value for y for patient ID #0>
                'c': <value for c for patient ID #0>
            },
            ...
        ]

        :param x_id: Feature identifier for x-axis e.g. 'CLIN:age_at_initial_pathologic_diagnosis'
        :param y_id: Feature identifier for y-axis. If None, values for 'y' in the response will be marked as missing.
        :param c_id: Feature identifier for color-by. If None, values for 'c' in the response will be marked as missing.
        :param cohort_id_array: Cohort identifier array.
        :param logTransform: Dict describing the requested log transform ('x'/'y' flags plus 'xBase'/'yBase'), or None.
        :param study_id_array: Study identifier array forwarded to each feature query.

        :return: PlotDataResponse
        """

        async_params = [FeatureIdQueryDescription(x_id, cohort_id_array, study_id_array)]

        # Defaults used when the optional y/c axes were not requested.
        c_type, c_vec = ValueType.STRING, []
        y_type, y_vec = ValueType.STRING, []

        units = get_axis_units(x_id, y_id)

        if c_id is not None:
            async_params.append(FeatureIdQueryDescription(c_id, cohort_id_array, study_id_array))
        if y_id is not None:
            async_params.append(FeatureIdQueryDescription(y_id, cohort_id_array, study_id_array))

        async_result = get_feature_vectors_tcga_only(async_params)

        if c_id is not None:
            c_type, c_vec = async_result[c_id]['type'], async_result[c_id]['data']
        if y_id is not None:
            y_type, y_vec = async_result[y_id]['type'], async_result[y_id]['data']
            if logTransform is not None and logTransform['y'] and y_vec and is_log_transformable(y_type):
                self._log_transform_values(y_vec, logTransform['yBase'])

        x_type, x_vec = async_result[x_id]['type'], async_result[x_id]['data']

        if logTransform is not None and logTransform['x'] and x_vec and is_log_transformable(x_type):
            self._log_transform_values(x_vec, logTransform['xBase'])

        vms = VectorMergeSupport('NA', 'sample_id', 'case_id', ['x', 'y', 'c']) # changed so that it plots per sample not patient
        vms.add_dict_array(x_vec, 'x', 'value')
        vms.add_dict_array(y_vec, 'y', 'value')
        vms.add_dict_array(c_vec, 'c', 'value')
        merged = self.get_merged_dict_timed(vms)

        # Resolve which (requested) cohorts each datapoint belongs to.
        cohort_set_dict = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

        # Get the name and ID for every requested cohort.
        cohort_info_array = CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
        cohort_info_obj_array = []
        for item in cohort_info_array:
            cohort_info_obj_array.append(PlotDataCohortInfo(id=item['id'], name=item['name']))

        items = []
        for value_bundle in merged:
            sample_id = value_bundle['sample_id']

            # Attach the array of containing cohorts only when their number
            # reaches the configured threshold.
            cohort_set = []
            # TODO FIX - this check shouldn't be needed
            if sample_id in cohort_set_dict:
                cohort_set = cohort_set_dict[sample_id]

            if len(cohort_set) >= DATAPOINT_COHORT_THRESHOLD:
                value_bundle['cohort'] = cohort_set

            items.append(PlotDataPoint(**value_bundle))

        counts = self.get_counts(merged)
        count_message = PlotDatapointCount(**counts)

        type_message = PlotDataTypes(x=x_type, y=y_type, c=c_type)

        # TODO assign label for y if y_id is None, as in that case the y-field will be missing from the response
        label_message = PlotDataFeatureLabels(x=x_id, y=y_id, c=c_id)

        # TODO Refactor pairwise call to separate function
        # Include pairwise results only when more than one vector was requested.
        input_vectors = [PairwiseInputVector(x_id, x_type, x_vec)]
        if c_id is not None:
            input_vectors.append(PairwiseInputVector(c_id, c_type, c_vec))
        if y_id is not None:
            input_vectors.append(PairwiseInputVector(y_id, y_type, y_vec))

        pairwise_result = None

        if len(input_vectors) > 1:
            pairwise_result = self.get_pairwise_result(input_vectors)

        if pairwise_result is None:
            # logger.warn is a deprecated alias of logger.warning.
            logger.warning("[WARNING] Pairwise results not included in returned object")

        return PlotDataResponse(types=type_message, labels=label_message, items=items,
                                cohort_set=cohort_info_obj_array,
                                counts=count_message, pairwise_result=pairwise_result, xUnits=units['x'], yUnits=units['y'])

    def _log_transform_values(self, vec, base):
        """
        Apply log(value + 1) in-place to the 'value' field of each dict in vec.

        Entries whose value is missing, None, "NA" or "None" are left untouched.
        Negative values cannot be log-transformed and are replaced with "NA".
        Base may be 10, 'e', or any other int; anything else leaves the value
        unchanged and logs a warning (once per affected entry, matching the
        original inline loops this helper replaces).

        :param vec: List of dicts, each optionally carrying a 'value' field.
        :param base: Log base taken from logTransform['xBase'] / ['yBase'].
        """
        for data in vec:
            if 'value' not in data or data['value'] is None or data['value'] == "NA" or data['value'] == "None":
                continue
            if float(data['value']) < 0:
                data['value'] = "NA"
            elif base == 10:
                data['value'] = str(math.log10((float(data['value']) + 1)))
            elif base == 'e':
                data['value'] = str(math.log((float(data['value']) + 1)))
            elif type(base) is int:
                data['value'] = str(math.log((float(data['value']) + 1), base))
            else:
                logger.warning(
                    "[WARNING] No valid log base was supplied - log transformation will not be applied!"
                )
    def get_merged_feature_vectors(self, x_id, y_id, c_id, cohort_id_array):
        """
        Fetches and merges data for two or three feature vectors (see parameter documentation below).
        The vectors have to be an array of dictionaries, with each dictionary containing a 'value' field
        (other fields are ignored):
        [
            {'value': 0.5},
            {'value': 1.0}
        ]
        The merged result:
        [
            {
                'patient_id': <patient ID #0>
                'x': <value for x for patient ID #0>
                'y': <value for y for patient ID #0>
                'c': <value for c for patient ID #0>
            },
            ...
        ]

        :param x_id: Feature identifier for x-axis e.g. 'CLIN:age_at_initial_pathologic_diagnosis'
        :param y_id: Feature identifier for y-axis. If None, values for 'y' in the response will be marked as missing.
        :param c_id: Feature identifier for color-by. If None, values for 'c' in the response will be marked as missing.
        :param cohort_id_array: Cohort identifier array.

        :return: PlotDataResponse
        """
        # NOTE(review): this definition has the same name as the earlier
        # six-argument get_merged_feature_vectors; in Python the later
        # definition shadows the earlier one on the class — confirm both are
        # meant to exist.

        async_params = [FeatureIdQueryDescription(x_id, cohort_id_array)]

        # Defaults used when the optional y/c axes were not requested.
        c_type, c_vec = ValueType.STRING, []
        y_type, y_vec = ValueType.STRING, []

        if c_id is not None:
            async_params.append(FeatureIdQueryDescription(c_id, cohort_id_array))
        if y_id is not None:
            async_params.append(FeatureIdQueryDescription(y_id, cohort_id_array))

        async_result = get_feature_vectors_tcga_only(async_params)

        if c_id is not None:
            c_type, c_vec = async_result[c_id]['type'], async_result[c_id]['data']
        if y_id is not None:
            y_type, y_vec = async_result[y_id]['type'], async_result[y_id]['data']

        x_type, x_vec = async_result[x_id]['type'], async_result[x_id]['data']

        vms = VectorMergeSupport('NA', 'sample_id', ['x', 'y', 'c']) # changed so that it plots per sample not patient
        vms.add_dict_array(x_vec, 'x', 'value')
        vms.add_dict_array(y_vec, 'y', 'value')
        vms.add_dict_array(c_vec, 'c', 'value')
        merged = self.get_merged_dict_timed(vms)

        # Resolve which (requested) cohorts each datapoint belongs to.
        cohort_set_dict = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

        # Get the name and ID for every requested cohort.
        cohort_info_array = CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
        cohort_info_obj_array = []
        for item in cohort_info_array:
            cohort_info_obj_array.append(PlotDataCohortInfo(id=item['id'], name=item['name']))

        items = []
        for value_bundle in merged:
            sample_id = value_bundle['sample_id']

            # Attach the array of containing cohorts only when their number
            # reaches the configured threshold.
            cohort_set = []
            # TODO FIX - this check shouldn't be needed
            if sample_id in cohort_set_dict:
                cohort_set = cohort_set_dict[sample_id]
            if len(cohort_set) >= DATAPOINT_COHORT_THRESHOLD:
                value_bundle['cohort'] = cohort_set
            items.append(PlotDataPoint(**value_bundle))

        counts = self.get_counts(merged)
        count_message = PlotDatapointCount(**counts)

        type_message = PlotDataTypes(x=x_type, y=y_type, c=c_type)

        # TODO assign label for y if y_id is None, as in that case the y-field will be missing from the response
        label_message = PlotDataFeatureLabels(x=x_id, y=y_id, c=c_id)

        # TODO Refactor pairwise call to separate function
        # Include pairwise results
        # NOTE(review): unlike the six-argument sibling, c_id is included even
        # when it is None (with an empty vector) — presumably
        # get_pairwise_result tolerates that; verify.
        input_vectors = [PairwiseInputVector(x_id, x_type, x_vec),
                         PairwiseInputVector(c_id, c_type, c_vec)]

        if y_id is not None:
            input_vectors.append(PairwiseInputVector(y_id, y_type, y_vec))

        pairwise_result = None
        try:
            pairwise_result = self.get_pairwise_result(input_vectors)
        except Exception as e:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning("Pairwise results not included in returned object")
            logging.exception(e)

        return PlotDataResponse(types=type_message, labels=label_message, items=items,
                                cohort_set=cohort_info_obj_array,
                                counts=count_message, pairwise_result=pairwise_result)