def print_query(feature_id, cohort_id_array, program_array, project_id_array, cohort_table_id):
    program_set = get_bq_program_set(program_array)
    logger.info("Selected programs: {}".format(program_set))

    # Verify the program set
    # ----------------------
    if len(program_set) == 0:
        logger.info("No programs set. Please include at least one program.")
        sys_exit(0)

    # Verify the cohort ID array
    # --------------------------
    if len(cohort_id_array) == 0:
        logger.info("No cohort IDs set. Please include at least one cohort ID.")
        sys_exit(0)

    # Verify the project ID array
    # ---------------------------
    logger.info("Selected projects: {}".format(project_id_array))
    if len(project_id_array) == 0:
        logger.info("No project IDs set. Using NULL.")
        project_id_array = None

    provider = FeatureProviderFactory.from_feature_id(feature_id)
    query_string = provider.build_query(program_set, cohort_table_id, cohort_id_array, project_id_array)
    logger.info("QUERY:\n\n{}".format(query_string))
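# Illustrative usage sketch (not part of the original module): how print_query might be
# called to inspect the generated BigQuery SQL without running it. The feature ID,
# program name, cohort ID, and cohort table name below are placeholder assumptions.
def _example_print_query():
    print_query(feature_id='CLIN:vital_status',
                cohort_id_array=[1],
                program_array=['TCGA'],
                project_id_array=[],
                cohort_table_id='project.dataset.cohorts')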
def get_feature_vector(feature_id, cohort_id_array, cohort_settings):
    """
    Fetches the data from BigQuery tables for a given feature identifier and one or
    more stored cohorts. Returns the intersection of the samples defined by the
    feature identifier and the stored cohorts.

    Each returned data point is represented as a dict containing patient, sample and
    aliquot barcodes, and the value as defined by the feature identifier.

    Args:
        feature_id: Feature identifier
        cohort_id_array: Array of cohort identifiers (integers)
        cohort_settings: Cohort settings object providing the BigQuery dataset and table IDs

    Returns:
        Tuple of the feature's value type and the data as an array of dicts.
    """
    provider = FeatureProviderFactory.from_feature_id(feature_id)
    result = provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id)

    items = []
    for data_point in result:
        data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']}
        value = provider.process_data_point(data_point)
        # TODO refactor missing value logic
        if value is None:
            value = 'NA'
        data_item['value'] = value
        items.append(data_item)

    return provider.get_value_type(), items
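# Illustrative helper sketch (not from the source): reshaping the data points returned by
# get_feature_vector above into a sample-barcode -> value map, e.g. for joining two
# feature vectors before plotting. The helper name is an assumption.
def _example_items_by_sample(items):
    # Each item is a dict with 'case_id', 'sample_id', 'aliquot_id' and 'value' keys.
    return {item['sample_id']: item['value'] for item in items}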
def is_valid_feature_identifier(feature_id):
    """
    Answers whether the given internal feature identifier is valid.

    Args:
        feature_id: Internal feature identifier

    Returns:
        True if the feature identifier is valid, otherwise False.
    """
    is_valid = False
    try:
        provider_class = FeatureProviderFactory.get_provider_class_from_feature_id(feature_id)
        is_valid = provider_class.is_valid_feature_id(feature_id)
    except FeatureNotFoundException as e:
        logger.exception(e)
        # FeatureProviderFactory.get_provider_class_from_feature_id raises FeatureNotFoundException
        # if the feature identifier does not look valid. Nothing needs to be done here,
        # since is_valid is already False.
        pass
    except Exception as e:
        logger.error("Unrecognized feature ID: '{}'".format(feature_id))
        logger.exception(e)
    finally:
        return is_valid
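# Illustrative sketch (assumption, not from the source): validating a caller-supplied
# feature identifier before building a query for it. The identifier is a placeholder.
def _example_reject_invalid_feature(feature_id='CLIN:vital_status'):
    if not is_valid_feature_identifier(feature_id):
        logger.info("Rejecting unrecognized feature ID: '{}'".format(feature_id))
        return False
    return True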
def submit_tcga_job(param_obj, project_id_number, bigquery_client, cohort_settings):
    query_provider = FeatureProviderFactory.from_parameters(param_obj)
    bigquery_runner = FeatureDataProvider(query_provider,
                                          bigquery_service=bigquery_client,
                                          project_id_number=project_id_number)

    feature_id = param_obj.feature_id
    cohort_id_array = param_obj.cohort_id_array
    project_id_array = param_obj.project_id_array
    logger.info("[STATUS] In submit_tcga_job, project_id_array: {}".format(str(project_id_array)))
    program_set = param_obj.program_set

    job_description = bigquery_runner.get_data_job_reference(
        program_set, cohort_settings.table_id, cohort_id_array, project_id_array)

    # Was a query job submitted at all?
    run_query = job_description['run_query']
    if run_query:
        logger.info("Submitted TCGA {job_id}: {fid} - {cohorts}".format(
            job_id=job_description['job_reference']['jobId'],
            fid=feature_id,
            cohorts=str(cohort_id_array)))

    # The job item is identical whether or not a query was submitted; 'run_query'
    # tells the caller whether there is a BigQuery job to wait on.
    job_item = {
        'run_query': run_query,
        'feature_id': feature_id,
        'provider': bigquery_runner,
        'query_support': query_provider,
        'ready': False,
        'job_reference': job_description['job_reference'],
        'tables_used': job_description['tables_used']
    }

    return job_item
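# Illustrative sketch (assumption, not from the source): how a caller might inspect the
# job_item returned by submit_tcga_job. Only the keys set above are read; no BigQuery
# calls are made here.
def _example_log_submitted_job(job_item):
    if job_item['run_query']:
        logger.info("Waiting on BigQuery job {} (tables used: {})".format(
            job_item['job_reference']['jobId'], job_item['tables_used']))
    else:
        logger.info("No query was submitted for feature {}".format(job_item['feature_id']))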
def _feature_converter(feature_id):
    """
    User data feature requests can be mapped onto parent program feature IDs
    (this is the purpose of the shared_map_id column in the
    projects_user_feature_definitions table). Currently this is only used for
    case_barcodes (which cannot be plotted?), but the capability existed in the
    V1 code, so we port it over.
    """
    if feature_id is None:
        return None

    provider_class = FeatureProviderFactory.get_provider_class_from_feature_id(feature_id)
    if provider_class.can_convert_feature_id():
        converted_user_feature = provider_class.convert_feature_id(feature_id)
        if converted_user_feature:
            # Preserve the 'v2:' prefix if the conversion dropped it
            if feature_id.startswith('v2') and not converted_user_feature.startswith('v2'):
                converted_user_feature = 'v2:{0}'.format(converted_user_feature)
            return converted_user_feature

    return feature_id
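# Illustrative sketch (assumption, not from the source): per the code above, None is
# propagated unchanged, and an identifier whose provider cannot convert it is returned as-is.
def _example_feature_converter_none():
    assert _feature_converter(None) is None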
def get_feature_vector(feature_id, cohort_id_array):
    include_tcga = False
    user_studies = ()
    db = None
    cursor = None
    for cohort_id in cohort_id_array:
        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(
                "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id",
                (cohort_id, ))
            for row in cursor.fetchall():
                if row['project_id'] is None:
                    include_tcga = True
                else:
                    user_studies += (row['project_id'], )
        except Exception as e:
            if db:
                db.close()
            if cursor:
                cursor.close()
            raise e

    # ex: feature_id 'CLIN:Disease_Code'
    user_feature_id = None
    if feature_id.startswith('USER:'):
        # This is a user data feature; remember it and try to convert it to a
        # shared parent program feature ID (see _feature_converter)
        user_feature_id = feature_id
        feature_id = _feature_converter(feature_id)
        if feature_id == user_feature_id:
            # Conversion failed, so don't include TCGA
            include_tcga = False

    items = []
    type = None
    result = []
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
    if include_tcga:
        provider = FeatureProviderFactory.from_feature_id(feature_id)
        result = provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id)

        # ex: result[0]
        # {'aliquot_id': None, 'case_id': u'TCGA-BH-A0B1', 'sample_id': u'TCGA-BH-A0B1-10A', 'value': u'BRCA'}
        for data_point in result:
            data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']}
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        type = provider.get_value_type()

    if len(user_studies) > 0:
        # Query User Data
        user_provider = UserFeatureProvider(feature_id, user_feature_id=user_feature_id)
        user_result = user_provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id)
        result.extend(user_result)

        for data_point in user_result:
            data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']}
            # Use user_provider here; 'provider' is unbound when include_tcga is False
            value = user_provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        if not type:
            type = user_provider.get_value_type()

    return type, items
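# Illustrative usage sketch (assumption, not from the source): fetching a combined
# TCGA + user-data feature vector for two stored cohorts and counting missing values.
# The feature ID and cohort IDs are placeholders.
def _example_combined_feature_vector():
    value_type, items = get_feature_vector('CLIN:Disease_Code', [1, 2])
    missing = [item for item in items if item['value'] == 'NA']
    logger.info("{} data points ({} missing), value type: {}".format(
        len(items), len(missing), value_type))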