def get_feature_vector(feature_id, cohort_id_array):
    """
    Fetch the data for one feature identifier, restricted to one or more
    stored cohorts, from BigQuery.

    The result is the intersection of the samples defined by the feature
    identifier and the stored cohort(s). Every data point is reported as a
    dict holding the case, sample and aliquot identifiers plus the value
    produced by the feature provider.

    Args:
        feature_id: Feature identifier
        cohort_id_array: Array of cohort identifiers (integers)

    Returns:
        A 2-tuple ``(value_type, items)``: the provider's value type and the
        data as an array of dicts with the keys 'case_id', 'sample_id',
        'aliquot_id' and 'value'.
    """
    barcode_keys = ['case_id', 'sample_id', 'aliquot_id']

    provider = FeatureProviderFactory.from_feature_id(feature_id)
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
    rows = provider.get_data(
        cohort_id_array,
        cohort_settings.dataset_id,
        cohort_settings.table_id,
    )

    items = []
    for row in rows:
        entry = {}
        for key in barcode_keys:
            entry[key] = row[key]

        processed = provider.process_data_point(row)
        # TODO refactor missing value logic
        entry['value'] = 'NA' if processed is None else processed
        items.append(entry)

    return provider.get_value_type(), items
def submit_jobs_with_user_data(params_array):
    """
    Submit the BigQuery jobs needed for every parameter object, covering both
    TCGA data and user-uploaded data.

    For each parameter object, ``user_feature_handler`` decides whether a TCGA
    job is submitted and whether user studies are involved. A user-data job is
    only submitted when the ``UserDataQueryHandler`` reports the cohort(s) as
    queryable.

    Args:
        params_array: Iterable of parameter objects exposing ``feature_id``
            and ``cohort_id_array`` attributes.

    Returns:
        List of job items. TCGA entries are whatever ``submit_tcga_job``
        returns; user-data entries are dicts with the keys 'feature_id',
        'provider', 'ready', 'job_reference' and 'tables_used'.
    """
    bigquery_service = get_bigquery_service()
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
    submitted = []

    # Submit jobs
    for parameter_object in params_array:
        feature_id = parameter_object.feature_id
        cohort_id_array = parameter_object.cohort_id_array

        user_data = user_feature_handler(feature_id, cohort_id_array)

        if user_data['include_tcga']:
            submitted.append(
                submit_tcga_job(parameter_object, bigquery_service, cohort_settings)
            )

        # Nothing more to do when the cohorts contain no user studies.
        if len(user_data['user_studies']) == 0:
            continue

        converted_feature_id = user_data['converted_feature_id']
        user_feature_id = user_data['user_feature_id']
        logging.debug("user_feature_id: {0}".format(user_feature_id))
        provider = UserDataQueryHandler(converted_feature_id, user_feature_id=user_feature_id)

        # The UserDataQueryHandler instance might not generate a BigQuery
        # query and job at all given the combination of cohort(s) and feature
        # identifiers. Such a provider is left out of the returned array, so
        # it never reaches any later polling of the submitted jobs.
        if not provider.is_queryable(cohort_id_array):
            logging.debug("No UserFeatureDefs for '{0}'".format(converted_feature_id))
            continue

        job_info = provider.get_data_job_reference(
            cohort_id_array,
            cohort_settings.dataset_id,
            cohort_settings.table_id,
        )

        # NOTE(review): 'jobId' is read from the same mapping that is indexed
        # with 'job_reference' and 'tables_queried' below - confirm that
        # get_data_job_reference() really exposes a top-level 'jobId' key.
        message = "Submitted USER {job_id}: {fid} - {cohorts}".format(
            job_id=job_info['jobId'],
            fid=feature_id,
            cohorts=str(cohort_id_array),
        )
        logging.info(message)

        submitted.append({
            'feature_id': feature_id,
            'provider': provider,
            'ready': False,
            'job_reference': job_info['job_reference'],
            'tables_used': job_info['tables_queried'],
        })

    return submitted
def get_feature_vectors_tcga_only(params_array, poll_retry_limit=20, skip_formatting_for_plot=False):
    """
    Submit one TCGA job per parameter object and return the collected results.

    Args:
        params_array: Iterable of parameter objects, each handed to
            ``submit_tcga_job``.
        poll_retry_limit: Passed through to ``get_submitted_job_results``.
        skip_formatting_for_plot: Passed through to
            ``get_submitted_job_results``.

    Returns:
        Whatever ``get_submitted_job_results`` returns for the submitted jobs.
    """
    bigquery_service = get_bigquery_service()
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # Submit jobs
    submitted_jobs = [
        submit_tcga_job(parameter_object, bigquery_service, cohort_settings)
        for parameter_object in params_array
    ]

    return get_submitted_job_results(
        submitted_jobs,
        settings.BQ_PROJECT_ID,
        poll_retry_limit,
        skip_formatting_for_plot,
    )
# get_feature_vector(feature_id, cohort_id_array) -> (type, items)
#
# For every cohort id, queries `cohorts_samples` for its distinct project_id
# values: a NULL project_id switches on `include_tcga`, any other value is
# collected in `user_studies`. Data points are then gathered from the
# FeatureProviderFactory provider (when `include_tcga` is set) and from
# UserFeatureProvider (when `user_studies` is non-empty). Each item is a dict
# with 'case_id', 'sample_id', 'aliquot_id' and 'value'; a None value is
# replaced by 'NA'.
#
# NOTE(review): the text following `feature_id.startswith('USER:'` is obscured
#   in this copy of the file, so the handling of USER features cannot be read
#   here - restore it from version control before changing this function.
# NOTE(review): the user-data loop calls `provider.process_data_point`, but
#   `provider` is only assigned in the `if include_tcga:` section; it looks
#   like `user_provider` may have been intended - confirm.
# NOTE(review): `db` and `cursor` are closed only inside the `except` handler,
#   and the handler reads them even if `get_sql_connection()` itself raised;
#   no close is visible on the success path - confirm.
# NOTE(review): the local name `type` shadows the builtin of the same name.
def get_feature_vector(feature_id, cohort_id_array): include_tcga = False user_studies = () for cohort_id in cohort_id_array: try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute( "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id, )) for row in cursor.fetchall(): if row['project_id'] is None: include_tcga = True else: user_studies += (row['project_id'], ) except Exception as e: if db: db.close() if cursor: cursor.close() raise e # ex: feature_id 'CLIN:Disease_Code' user_feature_id = None if feature_id.startswith('USER:'******'t include TCGA include_tcga = False items = [] type = None result = [] cohort_settings = settings.GET_BQ_COHORT_SETTINGS() if include_tcga: provider = FeatureProviderFactory.from_feature_id(feature_id) result = provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id) # ex: result[0] # {'aliquot_id': None, 'case_id': u'TCGA-BH-A0B1', 'sample_id': u'TCGA-BH-A0B1-10A', 'value': u'BRCA'} for data_point in result: data_item = { key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id'] } value = provider.process_data_point(data_point) # TODO refactor missing value logic if value is None: value = 'NA' data_item['value'] = value items.append(data_item) type = provider.get_value_type() if len(user_studies) > 0: # Query User Data user_provider = UserFeatureProvider(feature_id, user_feature_id=user_feature_id) user_result = user_provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id) result.extend(user_result) for data_point in user_result: data_item = { key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id'] } value = provider.process_data_point(data_point) # TODO refactor missing value logic if value is None: value = 'NA' data_item['value'] = value items.append(data_item) if not type: type = user_provider.get_value_type() return type, items