# Example #1
# 0
    def prepare_feature_vector(self, input_vectors):
        """Build tab-separated output rows from a list of input vectors.

        Each input item must expose ``feature_id``, ``value_type`` and
        ``data`` attributes. The per-feature vectors are merged on
        'sample_id'/'case_id' (missing values filled with 'NA'), and one
        row per feature is produced in the form
        ``"<type_code>:<feature_id>\t<value>\t<value>..."``.

        :param input_vectors: iterable of vector objects (e.g. PairwiseInputVector).
        :return: list of tab-separated row strings, one per feature.
        """
        feature_vector_mapping = {}
        for item in input_vectors:
            # Collapse ValueType members to the single-character type codes
            # used in the row header: numeric -> "N", string -> "C",
            # anything else -> "B".
            if item.value_type == ValueType.INTEGER or item.value_type == ValueType.FLOAT:
                type_code = "N"
            elif item.value_type == ValueType.STRING:
                type_code = "C"
            else:
                type_code = "B"

            feature_vector_mapping[item.feature_id] = (type_code, item.data)

        # Create merged feature vectors
        feature_ids = [v.feature_id for v in input_vectors]

        vms = VectorMergeSupport('NA',
                                 'sample_id',
                                 'case_id',
                                 row_ids=feature_ids)

        for feature_id, (_, vector) in feature_vector_mapping.items():
            vms.add_dict_array(vector, feature_id, 'value')

        merged = vms.get_merged_dict()

        rows = []

        for feature_id, (type_code, _) in feature_vector_mapping.items():
            current_row = [type_code + ":" + feature_id]

            for item in merged:
                current_row.append(item[feature_id])

            rows.append("\t".join(current_row))

        return rows
# Example #2
# 0
    def prepare_features(self, cohort_id, features):
        """Fetch feature vectors for a cohort and render them as TSV rows.

        For each feature identifier, the vector is fetched via
        ``get_feature_vector``, the per-feature vectors are merged on
        'sample_id' (missing values filled with 'NA'), and one row per
        feature is produced in the form
        ``"<type_code>:<feature_id>\t<value>\t<value>..."``.

        :param cohort_id: Cohort identifier passed to ``get_feature_vector``.
        :param features: list of feature identifier strings.
        :return: list of tab-separated row strings, one per feature.
        """
        # Get the feature data
        feature_vector_mapping = {}
        for feature in features:
            value_type, vector = get_feature_vector(feature, cohort_id)

            # Collapse ValueType members to the single-character type codes
            # used in the row header: numeric -> "N", string -> "C",
            # anything else -> "B".
            if value_type == ValueType.INTEGER or value_type == ValueType.FLOAT:
                type_code = "N"
            elif value_type == ValueType.STRING:
                type_code = "C"
            else:
                type_code = "B"

            feature_vector_mapping[feature] = (type_code, vector)

        # Create merged feature vectors
        vms = VectorMergeSupport('NA', 'sample_id', row_ids=features)

        for feature, (_, vector) in feature_vector_mapping.items():
            vms.add_dict_array(vector, feature, 'value')

        merged = vms.get_merged_dict()

        rows = []

        for feature, (type_code, _) in feature_vector_mapping.items():
            current_row = [type_code + ":" + feature]

            for item in merged:
                current_row.append(item[feature])

            rows.append("\t".join(current_row))

        return rows
def _apply_log_transform(vec, base):
    """Log-transform the 'value' entries of *vec* in place using log(v + 1).

    Entries whose 'value' is missing, None, "NA" or "None" are left
    untouched. Negative values are replaced with "NA" (log undefined).
    *base* may be 10, the string 'e' (natural log), or any other int;
    anything else logs a warning and leaves the remaining values unchanged.

    :param vec: list of dicts, each potentially carrying a 'value' field.
    :param base: the requested log base (logTransform['xBase'] / ['yBase']).
    """
    for entry in vec:
        value = entry.get('value')
        if value is None or value == "NA" or value == "None":
            continue
        numeric = float(value)
        if numeric < 0:
            entry['value'] = "NA"
        elif base == 10:
            entry['value'] = str(math.log10(numeric + 1))
        elif base == 'e':
            entry['value'] = str(math.log(numeric + 1))
        elif type(base) is int:
            # type() rather than isinstance() on purpose: it excludes bool,
            # so base=True does not attempt log base 1.
            entry['value'] = str(math.log(numeric + 1, base))
        else:
            logger.warning(
                "[WARNING] No valid log base was supplied - log transformation will not be applied!"
            )


def get_merged_feature_vectors(x_id, y_id, c_id, cohort_id_array, logTransform, study_id_array):
    """
    Fetches and merges data for two or three feature vectors (see parameter documentation below).
    The vectors have to be an array of dictionaries, with each dictionary containing a 'value' field
    (other fields are ignored):
    [
        {
            'value': 0.5
        },
        {
            'value': 1.0
        }
    ]
    The merged result (one entry per sample — see the merge below, which keys
    on 'sample_id'/'case_id' so the plot is per sample, not per patient):
    [
        {
            'sample_id': <sample ID #0>
            'x': <value for x for sample ID #0>
            'y': <value for y for sample ID #0>
            'c': <value for c for sample ID #0>
        },
        {
            'sample_id': <sample ID #1>
            'x': <value for x for sample ID #1>
            'y': <value for y for sample ID #1>
            'c': <value for c for sample ID #1>
        }
        ...
    ]

    :param x_id: Feature identifier for x-axis e.g. 'CLIN:age_at_initial_pathologic_diagnosis'
    :param y_id: Feature identifier for y-axis. If None, values for 'y' in the response will be marked as missing.
    :param c_id: Feature identifier for color-by. If None, values for 'c' in the response will be marked as missing.
    :param cohort_id_array: Cohort identifier array.
    :param logTransform: dict with 'x'/'y' flags and 'xBase'/'yBase' log bases, or None.
    :param study_id_array: Study identifier array, forwarded to the feature queries.

    :return: PlotDataResponse
    """

    async_params = [FeatureIdQueryDescription(x_id, cohort_id_array, study_id_array)]

    # Defaults used when y/c are not requested: empty categorical vectors.
    c_type, c_vec = ValueType.STRING, []
    y_type, y_vec = ValueType.STRING, []

    units = get_axis_units(x_id, y_id)

    if c_id is not None:
        async_params.append(FeatureIdQueryDescription(c_id, cohort_id_array, study_id_array))
    if y_id is not None:
        async_params.append(FeatureIdQueryDescription(y_id, cohort_id_array, study_id_array))

    async_result = get_feature_vectors_tcga_only(async_params)

    if c_id is not None:
        c_type, c_vec = async_result[c_id]['type'], async_result[c_id]['data']
    if y_id is not None:
        y_type, y_vec = async_result[y_id]['type'], async_result[y_id]['data']
        if logTransform is not None and logTransform['y'] and y_vec and is_log_transformable(y_type):
            _apply_log_transform(y_vec, logTransform['yBase'])

    x_type, x_vec = async_result[x_id]['type'], async_result[x_id]['data']

    if logTransform is not None and logTransform['x'] and x_vec and is_log_transformable(x_type):
        _apply_log_transform(x_vec, logTransform['xBase'])

    # Merge on sample_id/case_id so that it plots per sample, not per patient.
    vms = VectorMergeSupport('NA', 'sample_id', 'case_id', ['x', 'y', 'c'])
    vms.add_dict_array(x_vec, 'x', 'value')
    vms.add_dict_array(y_vec, 'y', 'value')
    vms.add_dict_array(c_vec, 'c', 'value')
    merged = get_merged_dict_timed(vms)

    # Resolve which (requested) cohorts each datapoint belongs to.
    cohort_set_dict = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)

    # Get the name and ID for every requested cohort.
    cohort_info_array = CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
    cohort_info_obj_array = [{'id': item['id'], 'name': item['name']}
                             for item in cohort_info_array]

    items = []
    for value_bundle in merged:
        sample_id = value_bundle['sample_id']

        # Attach the cohort array only when the number of containing cohorts
        # meets the configured threshold.
        # TODO FIX - this membership check shouldn't be needed
        cohort_set = cohort_set_dict.get(sample_id, [])

        if len(cohort_set) >= DATAPOINT_COHORT_THRESHOLD:
            value_bundle['cohort'] = cohort_set

        items.append(value_bundle)

    count_message = get_counts(merged)
    type_message = {'x': str(x_type), 'y': str(y_type), 'c': str(c_type)}

    # TODO assign label for y if y_id is None, as in that case the y-field will be missing from the response
    label_message = {'x': x_id, 'y': y_id, 'c': c_id}

    # TODO Refactor pairwise call to separate function
    # Include pairwise results
    input_vectors = [PairwiseInputVector(x_id, x_type, x_vec)]
    if c_id is not None:
        input_vectors.append(PairwiseInputVector(c_id, c_type, c_vec))
    if y_id is not None:
        input_vectors.append(PairwiseInputVector(y_id, y_type, y_vec))

    results = {'types':            type_message,
                'labels':           label_message,
                'items':            items,
                'cohort_set':       cohort_info_obj_array,
                'counts':           count_message,

                'xUnits':           units['x'],
                'yUnits':           units['y']}

    pairwise_result = None

    # Pairwise computation is currently disabled; the call is kept here
    # (commented out) so the wiring is easy to restore.
    if len(input_vectors) > 1:
        pass
        #pairwise_result = get_pairwise_result(input_vectors)

    if pairwise_result is None:
        logger.warning("[WARNING] Pairwise results not included in returned object")
        results['pairwise_result'] = {}
    else:
        results['pairwise_result'] = [pairwise_result]
    return results