# user_feature_handler(feature_id, cohort_id_array)
#
# Works out which data sources a feature query has to touch for the given cohorts.
#
# For every cohort ID in cohort_id_array, the cohorts_samples table is queried for
# the project_id values of that cohort (GROUP BY project_id):
#   - a NULL project_id sets include_tcga to True;
#   - every non-NULL project_id is appended to the user_studies tuple (the same
#     project_id can appear more than once when several cohorts share it).
#
# Feature IDs starting with 'USER:' get additional handling before the result is
# built; on at least one path of that handling include_tcga is reset to False.
#
# Returns a dict with the keys 'converted_feature_id', 'include_tcga',
# 'user_studies' and 'user_feature_id'.
#
# Any exception raised while querying is re-raised to the caller.
#
# NOTE(review): part of the 'USER:' branch is masked ("******") in this copy of the
# file, so how user_feature_id and the converted feature_id are derived cannot be
# confirmed from here - check against the unmasked source before editing.
# NOTE(review): in the visible code db.close()/cursor.close() are only called in the
# except handler; nothing closes them after a successful query, and sql_connection()
# is called once per cohort - looks like a connection leak, confirm.
# NOTE(review): if sql_connection() or db.cursor() raises on the first cohort, `db`
# and/or `cursor` are presumably still unbound when the handler tests them, which
# would replace the original error with a NameError - confirm.
def user_feature_handler(feature_id, cohort_id_array): include_tcga = False user_studies = () for cohort_id in cohort_id_array: try: db = sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute("SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id,)) for row in cursor.fetchall(): if row['project_id'] is None: include_tcga = True else: user_studies += (row['project_id'],) except Exception as e: if db: db.close() if cursor: cursor.close() raise e user_feature_id = None if feature_id.startswith('USER:'******'t include TCGA include_tcga = False return { 'converted_feature_id': feature_id, 'include_tcga': include_tcga, 'user_studies': user_studies, 'user_feature_id': user_feature_id }
def submit_jobs_with_user_data(params_array):
    """Submit the data jobs needed for each parameter object.

    For every entry of ``params_array`` (each exposing ``feature_id`` and
    ``cohort_id_array``), ``user_feature_handler`` decides which sources apply:

    * when TCGA data is included, the item returned by ``submit_tcga_job`` is
      appended to the result as-is;
    * when the cohorts reference user studies, a ``UserFeatureProvider`` is
      created and, only if it reports itself queryable for the cohorts, its
      job reference is requested and a dict with the keys ``'feature_id'``,
      ``'provider'``, ``'ready'`` (initially ``False``) and
      ``'job_reference'`` is appended.

    Returns:
        The list of submitted job entries, in submission order.
    """
    bigquery_service = authorize_credentials_with_Google()
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
    provider_array = []

    for params in params_array:
        feature_id = params.feature_id
        cohort_ids = params.cohort_id_array

        handled = user_feature_handler(feature_id, cohort_ids)

        if handled['include_tcga']:
            provider_array.append(
                submit_tcga_job(params, bigquery_service, cohort_settings)
            )

        # Nothing more to submit when the cohorts reference no user studies.
        if len(handled['user_studies']) == 0:
            continue

        converted_id = handled['converted_feature_id']
        user_id = handled['user_feature_id']
        logging.debug("user_feature_id: {0}".format(user_id))
        user_provider = UserFeatureProvider(converted_id, user_feature_id=user_id)

        # The UserFeatureProvider instance might not generate a BigQuery query
        # and job at all for this combination of cohort(s) and feature
        # identifiers. Such a provider is left out of the returned list so that
        # callers never wait on a job that was not submitted.
        if not user_provider.is_queryable(cohort_ids):
            logging.debug("No UserFeatureDefs for '{0}'".format(converted_id))
            continue

        job_ref = user_provider.get_data_job_reference(
            cohort_ids, cohort_settings.dataset_id, cohort_settings.table_id
        )
        submitted_message = "Submitted USER {job_id}: {fid} - {cohorts}".format(
            job_id=job_ref['jobId'],
            fid=feature_id,
            cohorts=str(cohort_ids),
        )
        logging.info(submitted_message)

        provider_array.append({
            'feature_id': feature_id,
            'provider': user_provider,
            'ready': False,
            'job_reference': job_ref,
        })

    return provider_array
# get_feature_vector(feature_id, cohort_id_array)
#
# Fetches the data points of one feature for the given cohorts and returns them
# as a (type, items) tuple.
#
# Steps visible in the code:
#   1. For every cohort ID, cohorts_samples is queried for the cohort's project_id
#      values (GROUP BY project_id). A NULL project_id sets include_tcga to True;
#      every non-NULL project_id is appended to the user_studies tuple.
#   2. Feature IDs starting with 'USER:' get additional handling; on at least one
#      path of that handling include_tcga is reset to False.
#   3. If include_tcga is set, a provider is obtained from
#      FeatureProviderFactory.from_feature_id(feature_id) and its get_data() rows
#      are turned into items: 'case_id', 'sample_id' and 'aliquot_id' are copied
#      from the row and 'value' is provider.process_data_point(row), with None
#      replaced by the string 'NA'. type is taken from provider.get_value_type().
#   4. If user_studies is non-empty, a UserFeatureProvider is queried the same way
#      and its rows are appended to items; type falls back to
#      user_provider.get_value_type() only when step 3 left it unset/falsy.
#
# Returns: (type, items) where items is a list of dicts with the keys 'case_id',
# 'sample_id', 'aliquot_id' and 'value'; type stays None when neither branch ran.
#
# Any exception raised while querying cohorts_samples is re-raised to the caller.
#
# NOTE(review): part of the 'USER:' branch is masked ("******") in this copy of the
# file, so how user_feature_id / feature_id are derived there cannot be confirmed
# from here - check against the unmasked source before editing.
# NOTE(review): the user-data loop calls provider.process_data_point(), but in the
# visible code `provider` is only assigned inside the include_tcga branch. With
# include_tcga False this presumably raises UnboundLocalError, and otherwise user
# rows are processed by the TCGA provider - looks like it should be user_provider;
# confirm.
# NOTE(review): the local name `type` shadows the builtin, and `result` is extended
# with user_result but is not part of the return value.
# NOTE(review): in the visible code db.close()/cursor.close() are only called in the
# except handler (where `db`/`cursor` may still be unbound); nothing closes them
# after a successful query - confirm.
def get_feature_vector(feature_id, cohort_id_array): include_tcga = False user_studies = () for cohort_id in cohort_id_array: try: db = sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute("SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id,)) for row in cursor.fetchall(): if row['project_id'] is None: include_tcga = True else: user_studies += (row['project_id'],) except Exception as e: if db: db.close() if cursor: cursor.close() raise e # ex: feature_id 'CLIN:Disease_Code' user_feature_id = None if feature_id.startswith('USER:'******'t include TCGA include_tcga = False items = [] type = None result = [] cohort_settings = settings.GET_BQ_COHORT_SETTINGS() if include_tcga: provider = FeatureProviderFactory.from_feature_id(feature_id) result = provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id) # ex: result[0] # {'aliquot_id': None, 'case_id': u'TCGA-BH-A0B1', 'sample_id': u'TCGA-BH-A0B1-10A', 'value': u'BRCA'} for data_point in result: data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']} value = provider.process_data_point(data_point) # TODO refactor missing value logic if value is None: value = 'NA' data_item['value'] = value items.append(data_item) type = provider.get_value_type() if len(user_studies) > 0: # Query User Data user_provider = UserFeatureProvider(feature_id, user_feature_id=user_feature_id) user_result = user_provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id) result.extend(user_result) for data_point in user_result: data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']} value = provider.process_data_point(data_point) # TODO refactor missing value logic if value is None: value = 'NA' data_item['value'] = value items.append(data_item) if not type: type = user_provider.get_value_type() return type, items