def convert_user_feature_id(cls, feature_id): bq_id = None try: db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute( """ SELECT feature_name, bq_map_id, shared_map_id FROM projects_user_feature_definitions WHERE id = %s """, (int(feature_id.split(':')[-1]), )) for row in cursor.fetchall(): bq_id = row["shared_map_id"] cursor.close() db.close() logging.debug( "UserFeatureProvider.convert_user_feature_id {0} -> {1}". format(feature_id, bq_id)) return bq_id except Exception as e: if db: db.close() if cursor: cursor.close() raise e
def get_cohorts_for_datapoints(cls, cohort_id_array): # Generate the 'IN' statement string: (%s, %s, ..., %s) cohort_id_stmt = ', '.join( ['%s' for x in xrange(len(cohort_id_array))]) query = 'SELECT sample_barcode, cohort_id FROM {cohort_samples_table} WHERE cohort_id IN ({cohort_id_stmt})'.format( cohort_samples_table=DJANGO_COHORT_SAMPLES_TABLE, cohort_id_stmt=cohort_id_stmt) values = cohort_id_array try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute(query, tuple(values)) result = cursor.fetchall() cohort_per_samples = {} for row in result: cohort_id, sample_barcode = row['cohort_id'], row[ 'sample_barcode'] if sample_barcode not in cohort_per_samples: cohort_per_samples[sample_barcode] = [] cohort_per_samples[sample_barcode].append(cohort_id) cursor.close() db.close() return cohort_per_samples except Exception as e: logger.exception(e) raise CohortException( 'get_cohorts_for_datapoints CloudSQL error, cohort IDs {cohort_ids}: {message}' .format(cohort_ids=cohort_id_array, message=str(e.message)))
def get_cohort_barcodes(cls, cohort_id_array): # Generate the 'IN' statement string: (%s, %s, ..., %s) cohort_id_stmt = ', '.join( ['%s' for x in xrange(len(cohort_id_array))]) query = 'SELECT sample_barcode AS barcode FROM {cohort_table} WHERE cohort_id IN ({cohort_id_stmt})'.format( cohort_table=DJANGO_COHORT_TABLE, cohort_id_stmt=cohort_id_stmt) values = cohort_id_array try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute(query, tuple(values)) result = cursor.fetchall() barcodes = [] for row in result: barcodes.append(row['barcode']) cursor.close() db.close() # Return only unique barcodes return list(set(barcodes)) except Exception as e: raise CohortException( 'get_cohort_barcodes CloudSQL error, cohort IDs {cohort_ids}: {message}' .format(cohort_ids=cohort_id_array, message=str(e.message))) raise CohortException('bad cohort: ' + str(cohort_id_array))
def user_feature_handler(feature_id, cohort_id_array): include_tcga = False user_studies = () for cohort_id in cohort_id_array: try: db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute("SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id,)) for row in cursor.fetchall(): if row['project_id'] is None: include_tcga = True else: user_studies += (row['project_id'],) except Exception as e: if db: db.close() if cursor: cursor.close() raise e user_feature_id = None if feature_id.startswith('USER:'******'t include TCGA include_tcga = False return { 'converted_feature_id': feature_id, 'include_tcga': include_tcga, 'user_studies': user_studies, 'user_feature_id': user_feature_id }
def field_value_search(self, keyword, field): self.validate_field_search_input(keyword, field) query = 'SELECT DISTINCT {search_field} FROM {table_name} WHERE {search_field} LIKE %s LIMIT %s'.format( table_name=self.get_table_name(), search_field=field ) # Format the keyword for MySQL string matching sql_keyword = '%' + keyword + '%' query_args = [sql_keyword, FOUND_FEATURE_LIMIT] try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute(query, tuple(query_args)) items = [] for row in cursor.fetchall(): items.append(row[field]) return items except MySQLError as mse: logger.exception(mse) raise BackendException('database error: ' + str(mse))
def get_project_ids(self, cohort_id_array): """ Returns: The user project identifiers associated with the samples in all given cohorts. """ if self._project_ids is not None: return self._project_ids project_ids = () for cohort_id in cohort_id_array: try: db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute( "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id, )) for row in cursor.fetchall(): if row['project_id'] is not None: project_ids += (row['project_id'], ) except Exception as e: if db: db.close() if cursor: cursor.close() raise e self._project_ids = project_ids return self._project_ids
def from_user_feature_id(cls, feature_id): logging.debug("UserFeatureDef.from_user_feature_id {0}".format( str([feature_id]))) # ID breakdown: project ID:Feature ID # Example ID: USER:1:6 regex = re_compile("^USER:"******"([0-9]+):" # Feature ID "([0-9]+)$") feature_fields = regex.findall(feature_id) if len(feature_fields) == 0: raise FeatureNotFoundException(feature_id) project_id, user_feature_id = feature_fields[0] bq_id = None shared_id = None is_numeric = False try: db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute( """ SELECT feature_name, bq_map_id, shared_map_id, is_numeric FROM projects_user_feature_definitions WHERE id = %s """, (user_feature_id, )) for row in cursor.fetchall(): if row['shared_map_id']: shared_id = row['shared_map_id'] bq_id = row["bq_map_id"] is_numeric = row['is_numeric'] == 1 cursor.close() db.close() except Exception as e: if db: db.close() if cursor: cursor.close() raise e if shared_id is not None: return cls.from_feature_id(bq_id, project_id) if bq_id is None: raise FeatureNotFoundException(feature_id) # Else we're querying a very specific feature from a specific project bq_table, column_name, symbol = cls.get_table_and_field(bq_id) if bq_table is None or column_name is None: raise FeatureNotFoundException(feature_id) logging.debug("{0} {1} {2}".format(bq_table, column_name, symbol)) filters = None if symbol is not None: filters = {'Symbol': symbol} return [cls(bq_table, column_name, project_id, is_numeric, filters)]
def search(self, keyword, field): self.validate_search_field(keyword, field) query = 'SELECT mirna_name, platform, value_field, internal_feature_id ' \ 'FROM {table_name} WHERE {search_field} LIKE %s LIMIT %s'.format( table_name=self.get_table_name(), search_field=field ) # Format the keyword for MySQL string matching sql_keyword = '%' + keyword + '%' query_args = [sql_keyword, FOUND_FEATURE_LIMIT] try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute(query, tuple(query_args)) items = [] for row in cursor.fetchall(): items.append(row) # Generate human readable labels for item in items: item['feature_type'] = MIRN_FEATURE_TYPE item['label'] = build_feature_label(item) return items except MySQLError: raise BackendException('database error', keyword, field)
def get_project_ids(self, cohort_id_array): """ Returns: The user project identifiers associated with the samples in all given cohorts. """ if self._project_ids is not None: return self._project_ids project_ids = () for cohort_id in cohort_id_array: try: db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute( "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id, )) for row in cursor.fetchall(): if row['project_id'] is not None: project_ids += (row['project_id'], ) except Exception as e: if db: db.close() if cursor: cursor.close() raise e # user_data_plot_support.py then tries to convert it with a shared ID to a TCGA queryable id. Do here as well?? self._project_ids = project_ids return self._project_ids
def from_feature_id(cls, feature_id, project_id=None): """ This is the method used when the user feature maps to feature in an existing project (i.e. there is a shared_id in the projects_user_feature_definitions entry). It returns a *LIST* of UserFeatureDefs """ logging.debug("UserFeatureDef.from_feature_id: {0}".format( str([feature_id, project_id]))) if feature_id is None: raise FeatureNotFoundException(feature_id) # ID breakdown: project ID:Feature ID # Example ID: USER:1:6 regex = re_compile("(?:^v2:)?USER:"******"([0-9]+):" # Feature ID "([0-9]+)$") feature_fields = regex.findall(feature_id) if len(feature_fields) == 0: raise FeatureNotFoundException(feature_id) project_id, user_feature_id = feature_fields[0] try: db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute( """ SELECT bq_map_id, project_id, is_numeric FROM projects_user_feature_definitions WHERE id = %s """, (user_feature_id, )) results = [] for row in cursor.fetchall(): bq_table, column_name, symbol = cls.get_table_and_field( row['bq_map_id']) filters = None if symbol is not None: filters = {'Symbol': symbol} results.append( cls(bq_table, column_name, row['project_id'], row['is_numeric'] == 1, filters)) cursor.close() db.close() return results except Exception as e: if db: db.close() if cursor: cursor.close() raise e
def get_confirmed_project_ids_for_cohorts(cohort_id_array): """ Returns the project ID numbers that are referred to by the samples in a list of cohorts. Returns: List of project ID numbers. """ cohort_vals = () cohort_params = "" for cohort in cohort_id_array: cohort_params += "%s," cohort_vals += (cohort, ) cohort_params = cohort_params[:-1] db = get_sql_connection() cursor = db.cursor() tcga_studies = fetch_isbcgc_project_set() query_str = "SELECT DISTINCT project_id FROM cohorts_samples WHERE cohort_id IN (" + cohort_params + ");" cursor.execute(query_str, cohort_vals) # Only samples whose source studies are TCGA studies, or extended from them, should be used confirmed_study_ids = [] unconfirmed_study_ids = [] user_only_study_ids = [] for row in cursor.fetchall(): if row[0] in tcga_studies: if row[0] not in confirmed_study_ids: confirmed_study_ids.append(row[0]) elif row[0] not in unconfirmed_study_ids: unconfirmed_study_ids.append(row[0]) if len(unconfirmed_study_ids) > 0: projects = Project.objects.filter(id__in=unconfirmed_study_ids) for project in projects: if project.get_my_root_and_depth()['root'] in tcga_studies: confirmed_study_ids.append(project.id) else: user_only_study_ids.append(project.id) logger.info( "In get_confirmed_project_ids_for_cohorts, confirmed study IDs: {}". format(str(confirmed_study_ids))) return confirmed_study_ids, user_only_study_ids
def search(self, parameters): self.validate_feature_search_input(parameters) query = 'SELECT gene_name, probe_name, platform, relation_to_gene, relation_to_island, ' \ 'value_field, genomic_build, internal_feature_id ' \ 'FROM {table_name} ' \ 'WHERE gene_name=%s ' \ 'AND probe_name LIKE %s ' \ 'AND platform LIKE %s ' \ 'AND relation_to_gene LIKE %s ' \ 'AND relation_to_island LIKE %s ' \ 'AND genomic_build=%s' \ 'LIMIT %s'.format(table_name=self.get_table_name() ) # Fills in '' for fields that were not specified in the parameters input = defaultdict(lambda: '', parameters) # Format the keyword for MySQL string matching query_args = [input['gene_name'], '%' + input['probe_name'] + '%', '%' + input['platform'] + '%', '%' + input['relation_to_gene'] + '%', '%' + input['relation_to_island'] + '%', input['genomic_build'], FOUND_FEATURE_LIMIT] try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute(query, tuple(query_args)) items = [] for row in cursor.fetchall(): items.append(row) # Generate human readable labels for item in items: item['feature_type'] = METH_FEATURE_TYPE item['label'] = self.build_feature_label(item) return items except MySQLError as mse: logger.exception(mse) raise BackendException('database error')
def get_cohort_info(cls, cohort_id_array): # Generate the 'IN' statement string: (%s, %s, ..., %s) cohort_id_stmt = ', '.join( ['%s' for x in xrange(len(cohort_id_array))]) query_template = ( "SELECT ti.id AS cohort_id, ti.name, COUNT(ts.sample_barcode) AS size " "FROM {cohort_info_table} ti " " LEFT JOIN {cohort_samples_table} ts ON ts.cohort_id = ti.id " "WHERE ti.id IN ({cohort_id_stmt}) " "GROUP BY ti.id, ti.name") query = query_template.format( cohort_info_table=DJANGO_COHORT_INFO_TABLE, cohort_samples_table=DJANGO_COHORT_SAMPLES_TABLE, cohort_id_stmt=cohort_id_stmt) try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute(query, tuple(cohort_id_array)) result = [] for row in cursor.fetchall(): result.append({ 'id': row['cohort_id'], 'name': row['name'], 'size': row['size'] }) cursor.close() db.close() return result except Exception as e: print >> sys.stdout, "[ERROR] In get_cohort_info: " print >> sys.stdout, e print >> sys.stdout, traceback.format_exc() raise CohortException( 'get_cohort_info CloudSQL error, cohort IDs {cohort_ids}: {message}' .format(cohort_ids=cohort_id_array, message=str(e.message)))
def search(self, parameters): self.validate_feature_search_input(parameters) query = 'SELECT gene_name, platform, generating_center, value_label, internal_feature_id' \ ' FROM {table_name}' \ ' WHERE gene_name=%s'\ ' AND platform LIKE %s' \ ' AND generating_center LIKE %s'\ ' LIMIT %s'.format(table_name=self.get_table_name() ) logging.debug("CLOUDSQL_QUERY_GEXP_SEARCH: {}".format(query)) # Fills in '' for fields that were not specified in the parameters input = defaultdict(lambda: '', parameters) # Format the keyword for MySQL string matching # sql_keyword = '%' + keyword + '%' query_args = [ input['gene_name'], '%' + input['platform'] + '%', '%' + input['center'] + '%', FOUND_FEATURE_LIMIT ] try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute(query, tuple(query_args)) items = [] for row in cursor.fetchall(): items.append(row) # Generate human readable labels for item in items: item['feature_type'] = GEXP_FEATURE_TYPE item['label'] = self.build_feature_label( item['gene_name'], item) return items except MySQLError as mse: raise BackendException("MySQLError: {}".format(str(mse)))
from projects.models import Public_Data_Tables, Program from google_helpers.bigquery.gcs_path_support import BigQueryGcsPathSupport, BigQuerySupport from GenespotRE import settings db = None cursor = None INDEXD_URI = settings.INDEXD_URI + "?ids=" LIMIT = settings.INDEXD_REQ_LIMIT EXECUTION_PROJECT = "isb-cgc" STORAGE_DATASET = "gcs_path_import_staging" try: db = get_sql_connection() cursor = db.cursor() program_paths = {} query_base = """ SELECT file_gdc_id, case_barcode, sample_barcode, case_gdc_id, sample_gdc_id FROM {metadata_data_table} WHERE file_gdc_id IS NOT NULL AND NOT(file_gdc_id = '') LIMIT {limit} OFFSET {offset}; """ count_query_base_cloudsql = """ SELECT COUNT(*) FROM ( SELECT file_gdc_id
def get_feature_vector(feature_id, cohort_id_array): include_tcga = False user_studies = () for cohort_id in cohort_id_array: try: db = get_sql_connection() cursor = db.cursor(DictCursor) cursor.execute( "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id, )) for row in cursor.fetchall(): if row['project_id'] is None: include_tcga = True else: user_studies += (row['project_id'], ) except Exception as e: if db: db.close() if cursor: cursor.close() raise e # ex: feature_id 'CLIN:Disease_Code' user_feature_id = None if feature_id.startswith('USER:'******'t include TCGA include_tcga = False items = [] type = None result = [] cohort_settings = settings.GET_BQ_COHORT_SETTINGS() if include_tcga: provider = FeatureProviderFactory.from_feature_id(feature_id) result = provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id) # ex: result[0] # {'aliquot_id': None, 'case_id': u'TCGA-BH-A0B1', 'sample_id': u'TCGA-BH-A0B1-10A', 'value': u'BRCA'} for data_point in result: data_item = { key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id'] } value = provider.process_data_point(data_point) # TODO refactor missing value logic if value is None: value = 'NA' data_item['value'] = value items.append(data_item) type = provider.get_value_type() if len(user_studies) > 0: # Query User Data user_provider = UserFeatureProvider(feature_id, user_feature_id=user_feature_id) user_result = user_provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id) result.extend(user_result) for data_point in user_result: data_item = { key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id'] } value = provider.process_data_point(data_point) # TODO refactor missing value logic if value is None: value = 'NA' data_item['value'] = value items.append(data_item) if not type: type = user_provider.get_value_type() return type, items
def from_user_feature_id(cls, feature_id): """ This returns a *LIST* of one UserFeatureDef, unless it maps to a standard project ID. Then the list may have multiple entries. """ logging.debug("UserFeatureDef.from_user_feature_id {0}".format( str([feature_id]))) # ID breakdown: project ID:Feature ID # Example ID: USER:1:6 regex = re_compile("(?:^v2:)?USER:"******"([0-9]+):" # Feature ID "([0-9]+)$") feature_fields = regex.findall(feature_id) if len(feature_fields) == 0: raise FeatureNotFoundException(feature_id) project_id, user_feature_id = feature_fields[0] bq_id = None shared_id = None is_numeric = False try: db = get_sql_connection() cursor = db.cursor(MySQLdb.cursors.DictCursor) cursor.execute( """ SELECT feature_name, bq_map_id, shared_map_id, is_numeric FROM projects_user_feature_definitions WHERE id = %s """, (user_feature_id, )) for row in cursor.fetchall(): if row['shared_map_id']: shared_id = row['shared_map_id'] bq_id = row["bq_map_id"] is_numeric = row['is_numeric'] == 1 cursor.close() db.close() except Exception as e: if db: db.close() if cursor: cursor.close() raise e # The feature we want is "shared", i.e. defined in a project we are extended from. So we delegate the # task to the from_feature_id() function: if shared_id is not None: return cls.from_feature_id(bq_id, project_id) if bq_id is None: raise FeatureNotFoundException(feature_id) # Else we're querying a very specific feature from a specific project bq_table, column_name, symbol = cls.get_table_and_field(bq_id) if bq_table is None or column_name is None: raise FeatureNotFoundException(feature_id) logging.debug("{0} {1} {2}".format(bq_table, column_name, symbol)) filters = None if symbol is not None: filters = {'Symbol': symbol} return [cls(bq_table, column_name, project_id, is_numeric, filters)]