def prepare_feature_vector(self, input_vectors):
    """Build tab-separated data rows from pre-fetched feature vectors.

    :param input_vectors: iterable of objects exposing ``feature_id``,
        ``value_type`` (a ValueType) and ``data`` (array of dicts with a
        'value' field, plus sample/case identifiers used for merging).
    :return: list of strings, one per feature, of the form
        "<type_code>:<feature_id>\\t<v0>\\t<v1>..." where type_code is
        'N' (numeric: INTEGER/FLOAT), 'C' (categorical: STRING) or
        'B' (anything else).
    """
    feature_vector_mapping = {}
    for item in input_vectors:
        # Collapse the ValueType enum into the single-letter code used
        # in the output row header.
        if item.value_type in (ValueType.INTEGER, ValueType.FLOAT):
            type_code = "N"
        elif item.value_type == ValueType.STRING:
            type_code = "C"
        else:
            type_code = "B"
        feature_vector_mapping[item.feature_id] = (type_code, item.data)

    # Merge the per-feature vectors on sample/case identity, filling
    # gaps with 'NA'.
    feature_ids = [v.feature_id for v in input_vectors]
    vms = VectorMergeSupport('NA', 'sample_id', 'case_id', row_ids=feature_ids)
    for feature_id, (_, vector) in feature_vector_mapping.items():
        vms.add_dict_array(vector, feature_id, 'value')
    merged = vms.get_merged_dict()

    rows = []
    for feature_id, (type_code, _) in feature_vector_mapping.items():
        current_row = [type_code + ":" + feature_id]
        current_row.extend(item[feature_id] for item in merged)
        rows.append("\t".join(current_row))
    return rows
def prepare_features(self, cohort_id, features):
    """Fetch feature vectors for a cohort and merge them into TSV rows.

    :param cohort_id: cohort identifier forwarded to ``get_feature_vector``.
    :param features: list of feature identifiers to fetch and merge.
    :return: list of strings, one per feature, of the form
        "<type_code>:<feature_id>\\t<v0>\\t<v1>..." where type_code is
        'N' (numeric: INTEGER/FLOAT), 'C' (categorical: STRING) or
        'B' (anything else).
    """
    feature_vector_mapping = {}
    for feature in features:
        value_type, vector = get_feature_vector(feature, cohort_id)
        # Collapse the ValueType enum into the single-letter code used
        # in the output row header.
        if value_type in (ValueType.INTEGER, ValueType.FLOAT):
            type_code = "N"
        elif value_type == ValueType.STRING:
            type_code = "C"
        else:
            type_code = "B"
        feature_vector_mapping[feature] = (type_code, vector)

    # Merge the per-feature vectors on sample identity, filling gaps
    # with 'NA'.
    vms = VectorMergeSupport('NA', 'sample_id', row_ids=features)
    for feature, (_, vector) in feature_vector_mapping.items():
        vms.add_dict_array(vector, feature, 'value')
    merged = vms.get_merged_dict()

    rows = []
    for feature, (type_code, _) in feature_vector_mapping.items():
        current_row = [type_code + ":" + feature]
        current_row.extend(item[feature] for item in merged)
        rows.append("\t".join(current_row))
    return rows
def _apply_log_transform(vec, base):
    """Log-transform the 'value' field of each dict in *vec* in place.

    Each usable value v is replaced by ``str(log(v + 1, base))``; +1 keeps
    zero in the domain. Negative values cannot be transformed and become
    "NA". Missing, None, "NA" and "None" entries are left untouched.

    :param vec: array of dicts, each possibly carrying a 'value' field.
    :param base: 10, the string 'e' (natural log) or any other int; an
        unrecognized base leaves values untouched and logs a warning.
    """
    for datum in vec:
        value = datum.get('value')
        if value is None or value == "NA" or value == "None":
            continue
        # Negative values are undefined under log; mark as missing
        # regardless of which base was requested.
        if float(value) < 0:
            datum['value'] = "NA"
        elif base == 10:
            datum['value'] = str(math.log10(float(value) + 1))
        elif base == 'e':
            datum['value'] = str(math.log(float(value) + 1))
        elif type(base) is int:
            datum['value'] = str(math.log(float(value) + 1, base))
        else:
            logger.warn(
                "[WARNING] No valid log base was supplied - log transformation will not be applied!"
            )


def get_merged_feature_vectors(x_id, y_id, c_id, cohort_id_array, logTransform, study_id_array):
    """
    Fetches and merges data for two or three feature vectors (see parameter
    documentation below). The vectors have to be an array of dictionaries,
    with each dictionary containing a 'value' field (other fields are
    ignored):

    [ { 'value': 0.5 }, { 'value': 1.0 } ]

    The merged result:

    [
        {
            'patient_id': <patient ID #0>
            'x': <value for x for patient ID #0>
            'y': <value for y for patient ID #0>
            'c': <value for c for patient ID #0>
        },
        ...
    ]

    :param x_id: Feature identifier for x-axis e.g. 'CLIN:age_at_initial_pathologic_diagnosis'
    :param y_id: Feature identifier for y-axis. If None, values for 'y' in the response will be marked as missing.
    :param c_id: Feature identifier for color-by. If None, values for 'c' in the response will be marked as missing.
    :param cohort_id_array: Cohort identifier array.
    :param logTransform: dict with 'x'/'y' flags and 'xBase'/'yBase' log
        bases (10, 'e' or an int), or None for no transformation.
    :param study_id_array: Study identifier array forwarded to the
        feature-vector queries.
    :return: PlotDataResponse
    """
    async_params = [FeatureIdQueryDescription(x_id, cohort_id_array, study_id_array)]
    # Defaults used when y/c are not requested: empty vectors typed STRING.
    c_type, c_vec = ValueType.STRING, []
    y_type, y_vec = ValueType.STRING, []
    units = get_axis_units(x_id, y_id)

    if c_id is not None:
        async_params.append(FeatureIdQueryDescription(c_id, cohort_id_array, study_id_array))
    if y_id is not None:
        async_params.append(FeatureIdQueryDescription(y_id, cohort_id_array, study_id_array))

    async_result = get_feature_vectors_tcga_only(async_params)
    if c_id is not None:
        c_type, c_vec = async_result[c_id]['type'], async_result[c_id]['data']
    if y_id is not None:
        y_type, y_vec = async_result[y_id]['type'], async_result[y_id]['data']
    x_type, x_vec = async_result[x_id]['type'], async_result[x_id]['data']

    # Apply the requested per-axis log transforms in place.
    if logTransform is not None:
        if logTransform['y'] and y_vec and is_log_transformable(y_type):
            _apply_log_transform(y_vec, logTransform['yBase'])
        if logTransform['x'] and x_vec and is_log_transformable(x_type):
            _apply_log_transform(x_vec, logTransform['xBase'])

    # Merge on sample_id so the chart plots per sample, not per patient.
    vms = VectorMergeSupport('NA', 'sample_id', 'case_id', ['x', 'y', 'c'])
    vms.add_dict_array(x_vec, 'x', 'value')
    vms.add_dict_array(y_vec, 'y', 'value')
    vms.add_dict_array(c_vec, 'c', 'value')
    merged = get_merged_dict_timed(vms)

    # Resolve which (requested) cohorts each datapoint belongs to.
    cohort_set_dict = CloudSQLCohortAccess.get_cohorts_for_datapoints(cohort_id_array)
    # Get the name and ID for every requested cohort.
    cohort_info_array = CloudSQLCohortAccess.get_cohort_info(cohort_id_array)
    cohort_info_obj_array = [{'id': item['id'], 'name': item['name']} for item in cohort_info_array]

    items = []
    for value_bundle in merged:
        sample_id = value_bundle['sample_id']
        # Attach the cohort array only if the number of containing cohorts
        # reaches the configured threshold. Samples absent from the lookup
        # simply get no 'cohort' field.
        cohort_set = cohort_set_dict.get(sample_id, [])
        if len(cohort_set) >= DATAPOINT_COHORT_THRESHOLD:
            value_bundle['cohort'] = cohort_set
        items.append(value_bundle)

    count_message = get_counts(merged)
    type_message = {'x': str(x_type), 'y': str(y_type), 'c': str(c_type)}

    # TODO assign label for y if y_id is None, as in that case the y-field will be missing from the response
    label_message = {'x': x_id, 'y': y_id, 'c': c_id}

    # TODO Refactor pairwise call to separate function
    # Include pairwise results
    input_vectors = [PairwiseInputVector(x_id, x_type, x_vec)]
    if c_id is not None:
        input_vectors.append(PairwiseInputVector(c_id, c_type, c_vec))
    if y_id is not None:
        input_vectors.append(PairwiseInputVector(y_id, y_type, y_vec))

    results = {
        'types': type_message,
        'labels': label_message,
        'items': items,
        'cohort_set': cohort_info_obj_array,
        'counts': count_message,
        'xUnits': units['x'],
        'yUnits': units['y'],
    }

    pairwise_result = None
    if len(input_vectors) > 1:
        # Pairwise computation is currently disabled.
        pass
        # pairwise_result = get_pairwise_result(input_vectors)
    if pairwise_result is None:
        logger.warn("[WARNING] Pairwise results not included in returned object")
        results['pairwise_result'] = {}
    else:
        results['pairwise_result'] = [pairwise_result]
    return results