Exemple #1
0
    def convert_user_feature_id(cls, feature_id):
        bq_id = None
        try:
            db = get_sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(
                """
                SELECT feature_name, bq_map_id, shared_map_id
                FROM projects_user_feature_definitions
                WHERE id = %s
            """, (int(feature_id.split(':')[-1]), ))
            for row in cursor.fetchall():
                bq_id = row["shared_map_id"]

            cursor.close()
            db.close()

            logging.debug(
                "UserFeatureProvider.convert_user_feature_id {0} -> {1}".
                format(feature_id, bq_id))
            return bq_id

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e
Exemple #2
0
    def get_cohorts_for_datapoints(cls, cohort_id_array):
        # Generate the 'IN' statement string: (%s, %s, ..., %s)
        cohort_id_stmt = ', '.join(
            ['%s' for x in xrange(len(cohort_id_array))])

        query = 'SELECT sample_barcode, cohort_id FROM {cohort_samples_table} WHERE cohort_id IN ({cohort_id_stmt})'.format(
            cohort_samples_table=DJANGO_COHORT_SAMPLES_TABLE,
            cohort_id_stmt=cohort_id_stmt)

        values = cohort_id_array
        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(values))

            result = cursor.fetchall()
            cohort_per_samples = {}

            for row in result:
                cohort_id, sample_barcode = row['cohort_id'], row[
                    'sample_barcode']
                if sample_barcode not in cohort_per_samples:
                    cohort_per_samples[sample_barcode] = []
                cohort_per_samples[sample_barcode].append(cohort_id)

            cursor.close()
            db.close()

            return cohort_per_samples

        except Exception as e:
            logger.exception(e)
            raise CohortException(
                'get_cohorts_for_datapoints CloudSQL error, cohort IDs {cohort_ids}: {message}'
                .format(cohort_ids=cohort_id_array, message=str(e.message)))
Exemple #3
0
    def get_cohort_barcodes(cls, cohort_id_array):
        # Generate the 'IN' statement string: (%s, %s, ..., %s)
        cohort_id_stmt = ', '.join(
            ['%s' for x in xrange(len(cohort_id_array))])
        query = 'SELECT sample_barcode AS barcode FROM {cohort_table} WHERE cohort_id IN ({cohort_id_stmt})'.format(
            cohort_table=DJANGO_COHORT_TABLE, cohort_id_stmt=cohort_id_stmt)
        values = cohort_id_array
        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(values))

            result = cursor.fetchall()
            barcodes = []
            for row in result:
                barcodes.append(row['barcode'])
            cursor.close()
            db.close()

            # Return only unique barcodes
            return list(set(barcodes))

        except Exception as e:
            raise CohortException(
                'get_cohort_barcodes CloudSQL error, cohort IDs {cohort_ids}: {message}'
                .format(cohort_ids=cohort_id_array, message=str(e.message)))
            raise CohortException('bad cohort: ' + str(cohort_id_array))
Exemple #4
0
def user_feature_handler(feature_id, cohort_id_array):
    include_tcga = False
    user_studies = ()
    for cohort_id in cohort_id_array:
        try:
            db = get_sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)

            cursor.execute("SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id,))
            for row in cursor.fetchall():
                if row['project_id'] is None:
                    include_tcga = True
                else:
                    user_studies += (row['project_id'],)

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

    user_feature_id = None
    if feature_id.startswith('USER:'******'t include TCGA
            include_tcga = False

    return {
        'converted_feature_id': feature_id,
        'include_tcga': include_tcga,
        'user_studies': user_studies,
        'user_feature_id': user_feature_id
    }
    def field_value_search(self, keyword, field):
        self.validate_field_search_input(keyword, field)

        query = 'SELECT DISTINCT {search_field} FROM {table_name} WHERE {search_field} LIKE %s LIMIT %s'.format(
            table_name=self.get_table_name(),
            search_field=field
        )
        # Format the keyword for MySQL string matching
        sql_keyword = '%' + keyword + '%'
        query_args = [sql_keyword, FOUND_FEATURE_LIMIT]

        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(query_args))
            items = []

            for row in cursor.fetchall():
                items.append(row[field])

            return items

        except MySQLError as mse:
            logger.exception(mse)
            raise BackendException('database error: ' + str(mse))
Exemple #6
0
    def get_project_ids(self, cohort_id_array):
        """
        Returns: The user project identifiers associated with the samples in all given cohorts.
        """
        if self._project_ids is not None:
            return self._project_ids

        project_ids = ()
        for cohort_id in cohort_id_array:
            try:
                db = get_sql_connection()
                cursor = db.cursor(MySQLdb.cursors.DictCursor)

                cursor.execute(
                    "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id",
                    (cohort_id, ))
                for row in cursor.fetchall():
                    if row['project_id'] is not None:
                        project_ids += (row['project_id'], )

            except Exception as e:
                if db: db.close()
                if cursor: cursor.close()
                raise e

        self._project_ids = project_ids
        return self._project_ids
Exemple #7
0
    def from_user_feature_id(cls, feature_id):
        logging.debug("UserFeatureDef.from_user_feature_id {0}".format(
            str([feature_id])))
        # ID breakdown: project ID:Feature ID
        # Example ID: USER:1:6
        regex = re_compile("^USER:"******"([0-9]+):"
                           # Feature ID
                           "([0-9]+)$")

        feature_fields = regex.findall(feature_id)
        if len(feature_fields) == 0:
            raise FeatureNotFoundException(feature_id)
        project_id, user_feature_id = feature_fields[0]
        bq_id = None
        shared_id = None
        is_numeric = False

        try:
            db = get_sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(
                """
                SELECT feature_name, bq_map_id, shared_map_id, is_numeric
                FROM projects_user_feature_definitions
                WHERE id = %s
            """, (user_feature_id, ))
            for row in cursor.fetchall():
                if row['shared_map_id']:
                    shared_id = row['shared_map_id']
                bq_id = row["bq_map_id"]
                is_numeric = row['is_numeric'] == 1

            cursor.close()
            db.close()

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

        if shared_id is not None:
            return cls.from_feature_id(bq_id, project_id)

        if bq_id is None:
            raise FeatureNotFoundException(feature_id)

        # Else we're querying a very specific feature from a specific project
        bq_table, column_name, symbol = cls.get_table_and_field(bq_id)
        if bq_table is None or column_name is None:
            raise FeatureNotFoundException(feature_id)

        logging.debug("{0} {1} {2}".format(bq_table, column_name, symbol))

        filters = None
        if symbol is not None:
            filters = {'Symbol': symbol}

        return [cls(bq_table, column_name, project_id, is_numeric, filters)]
Exemple #8
0
    def search(self, keyword, field):
        self.validate_search_field(keyword, field)

        query = 'SELECT mirna_name, platform, value_field, internal_feature_id ' \
                'FROM {table_name} WHERE {search_field} LIKE %s LIMIT %s'.format(
            table_name=self.get_table_name(),
            search_field=field
        )

        # Format the keyword for MySQL string matching
        sql_keyword = '%' + keyword + '%'
        query_args = [sql_keyword, FOUND_FEATURE_LIMIT]

        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(query_args))
            items = []

            for row in cursor.fetchall():
                items.append(row)

            # Generate human readable labels
            for item in items:
                item['feature_type'] = MIRN_FEATURE_TYPE
                item['label'] = build_feature_label(item)

            return items

        except MySQLError:
            raise BackendException('database error', keyword, field)
Exemple #9
0
    def get_project_ids(self, cohort_id_array):
        """
        Returns: The user project identifiers associated with the samples in all given cohorts.
        """
        if self._project_ids is not None:
            return self._project_ids

        project_ids = ()
        for cohort_id in cohort_id_array:
            try:
                db = get_sql_connection()
                cursor = db.cursor(MySQLdb.cursors.DictCursor)

                cursor.execute(
                    "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id",
                    (cohort_id, ))
                for row in cursor.fetchall():
                    if row['project_id'] is not None:
                        project_ids += (row['project_id'], )

            except Exception as e:
                if db: db.close()
                if cursor: cursor.close()
                raise e

        #  user_data_plot_support.py then tries to convert it with a shared ID to a TCGA queryable id. Do here as well??

        self._project_ids = project_ids
        return self._project_ids
Exemple #10
0
    def from_feature_id(cls, feature_id, project_id=None):
        """
           This is the method used when the user feature maps to feature in an existing project (i.e. there is a
           shared_id in the projects_user_feature_definitions entry).
           It returns a *LIST* of UserFeatureDefs
         """
        logging.debug("UserFeatureDef.from_feature_id: {0}".format(
            str([feature_id, project_id])))
        if feature_id is None:
            raise FeatureNotFoundException(feature_id)
        # ID breakdown: project ID:Feature ID
        # Example ID: USER:1:6
        regex = re_compile("(?:^v2:)?USER:"******"([0-9]+):"
                           # Feature ID
                           "([0-9]+)$")

        feature_fields = regex.findall(feature_id)
        if len(feature_fields) == 0:
            raise FeatureNotFoundException(feature_id)
        project_id, user_feature_id = feature_fields[0]

        try:
            db = get_sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(
                """
                SELECT bq_map_id, project_id, is_numeric
                FROM projects_user_feature_definitions
                WHERE id = %s
            """, (user_feature_id, ))

            results = []
            for row in cursor.fetchall():
                bq_table, column_name, symbol = cls.get_table_and_field(
                    row['bq_map_id'])
                filters = None
                if symbol is not None:
                    filters = {'Symbol': symbol}
                results.append(
                    cls(bq_table, column_name, row['project_id'],
                        row['is_numeric'] == 1, filters))

            cursor.close()
            db.close()

            return results

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e
def get_confirmed_project_ids_for_cohorts(cohort_id_array):
    """
    Returns the project ID numbers that are referred to by the samples
    in a list of cohorts.
    
    Returns:
        List of project ID numbers.
    """
    cohort_vals = ()
    cohort_params = ""

    for cohort in cohort_id_array:
        cohort_params += "%s,"
        cohort_vals += (cohort, )

    cohort_params = cohort_params[:-1]
    db = get_sql_connection()
    cursor = db.cursor()

    tcga_studies = fetch_isbcgc_project_set()

    query_str = "SELECT DISTINCT project_id FROM cohorts_samples WHERE cohort_id IN (" + cohort_params + ");"
    cursor.execute(query_str, cohort_vals)

    # Only samples whose source studies are TCGA studies, or extended from them, should be used
    confirmed_study_ids = []
    unconfirmed_study_ids = []
    user_only_study_ids = []

    for row in cursor.fetchall():
        if row[0] in tcga_studies:
            if row[0] not in confirmed_study_ids:
                confirmed_study_ids.append(row[0])
        elif row[0] not in unconfirmed_study_ids:
            unconfirmed_study_ids.append(row[0])

    if len(unconfirmed_study_ids) > 0:
        projects = Project.objects.filter(id__in=unconfirmed_study_ids)

        for project in projects:
            if project.get_my_root_and_depth()['root'] in tcga_studies:
                confirmed_study_ids.append(project.id)
            else:
                user_only_study_ids.append(project.id)

    logger.info(
        "In get_confirmed_project_ids_for_cohorts, confirmed study IDs: {}".
        format(str(confirmed_study_ids)))

    return confirmed_study_ids, user_only_study_ids
    def search(self, parameters):
        self.validate_feature_search_input(parameters)

        query = 'SELECT gene_name, probe_name, platform, relation_to_gene, relation_to_island, ' \
                       'value_field, genomic_build, internal_feature_id ' \
                'FROM {table_name} ' \
                'WHERE gene_name=%s ' \
                'AND probe_name LIKE %s ' \
                'AND platform LIKE %s ' \
                'AND relation_to_gene LIKE %s ' \
                'AND relation_to_island LIKE %s ' \
                'AND genomic_build=%s' \
                'LIMIT %s'.format(table_name=self.get_table_name()
        )

        # Fills in '' for fields that were not specified in the parameters
        input = defaultdict(lambda: '', parameters)

        # Format the keyword for MySQL string matching
        query_args = [input['gene_name'],
                      '%' + input['probe_name'] + '%',
                      '%' + input['platform'] + '%',
                      '%' + input['relation_to_gene'] + '%',
                      '%' + input['relation_to_island'] + '%',
                      input['genomic_build'],
                      FOUND_FEATURE_LIMIT]

        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(query_args))
            items = []

            for row in cursor.fetchall():
                items.append(row)

            # Generate human readable labels
            for item in items:
                item['feature_type'] = METH_FEATURE_TYPE
                item['label'] = self.build_feature_label(item)

            return items

        except MySQLError as mse:
            logger.exception(mse)
            raise BackendException('database error')
Exemple #13
0
    def get_cohort_info(cls, cohort_id_array):
        # Generate the 'IN' statement string: (%s, %s, ..., %s)
        cohort_id_stmt = ', '.join(
            ['%s' for x in xrange(len(cohort_id_array))])

        query_template = (
            "SELECT ti.id AS cohort_id, ti.name, COUNT(ts.sample_barcode) AS size "
            "FROM {cohort_info_table} ti "
            "   LEFT JOIN {cohort_samples_table} ts ON ts.cohort_id = ti.id "
            "WHERE ti.id IN ({cohort_id_stmt}) "
            "GROUP BY ti.id, ti.name")

        query = query_template.format(
            cohort_info_table=DJANGO_COHORT_INFO_TABLE,
            cohort_samples_table=DJANGO_COHORT_SAMPLES_TABLE,
            cohort_id_stmt=cohort_id_stmt)

        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(cohort_id_array))

            result = []

            for row in cursor.fetchall():
                result.append({
                    'id': row['cohort_id'],
                    'name': row['name'],
                    'size': row['size']
                })

            cursor.close()
            db.close()

            return result

        except Exception as e:
            print >> sys.stdout, "[ERROR] In get_cohort_info: "
            print >> sys.stdout, e
            print >> sys.stdout, traceback.format_exc()
            raise CohortException(
                'get_cohort_info CloudSQL error, cohort IDs {cohort_ids}: {message}'
                .format(cohort_ids=cohort_id_array, message=str(e.message)))
    def search(self, parameters):
        self.validate_feature_search_input(parameters)

        query = 'SELECT gene_name, platform, generating_center, value_label, internal_feature_id' \
                ' FROM {table_name}' \
                ' WHERE gene_name=%s'\
                ' AND platform LIKE %s' \
                ' AND generating_center LIKE %s'\
                ' LIMIT %s'.format(table_name=self.get_table_name()
        )
        logging.debug("CLOUDSQL_QUERY_GEXP_SEARCH: {}".format(query))

        # Fills in '' for fields that were not specified in the parameters
        input = defaultdict(lambda: '', parameters)

        # Format the keyword for MySQL string matching
        # sql_keyword = '%' + keyword + '%'
        query_args = [
            input['gene_name'], '%' + input['platform'] + '%',
            '%' + input['center'] + '%', FOUND_FEATURE_LIMIT
        ]

        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(query_args))
            items = []

            for row in cursor.fetchall():
                items.append(row)

            # Generate human readable labels
            for item in items:
                item['feature_type'] = GEXP_FEATURE_TYPE
                item['label'] = self.build_feature_label(
                    item['gene_name'], item)

            return items

        except MySQLError as mse:
            raise BackendException("MySQLError: {}".format(str(mse)))
Exemple #15
0
from projects.models import Public_Data_Tables, Program
from google_helpers.bigquery.gcs_path_support import BigQueryGcsPathSupport, BigQuerySupport

from GenespotRE import settings

db = None
cursor = None

INDEXD_URI = settings.INDEXD_URI + "?ids="
LIMIT = settings.INDEXD_REQ_LIMIT

EXECUTION_PROJECT = "isb-cgc"
STORAGE_DATASET = "gcs_path_import_staging"

try:
    db = get_sql_connection()
    cursor = db.cursor()

    program_paths = {}

    query_base = """
        SELECT file_gdc_id, case_barcode, sample_barcode, case_gdc_id, sample_gdc_id
        FROM {metadata_data_table}
        WHERE file_gdc_id IS NOT NULL AND NOT(file_gdc_id = '')
        LIMIT {limit} OFFSET {offset};
    """

    count_query_base_cloudsql = """
        SELECT COUNT(*)
        FROM (
          SELECT file_gdc_id
def get_feature_vector(feature_id, cohort_id_array):
    include_tcga = False
    user_studies = ()
    for cohort_id in cohort_id_array:
        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)

            cursor.execute(
                "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id",
                (cohort_id, ))
            for row in cursor.fetchall():
                if row['project_id'] is None:
                    include_tcga = True
                else:
                    user_studies += (row['project_id'], )

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

    #  ex: feature_id 'CLIN:Disease_Code'
    user_feature_id = None
    if feature_id.startswith('USER:'******'t include TCGA
            include_tcga = False

    items = []
    type = None
    result = []
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
    if include_tcga:
        provider = FeatureProviderFactory.from_feature_id(feature_id)
        result = provider.get_data(cohort_id_array, cohort_settings.dataset_id,
                                   cohort_settings.table_id)

        # ex: result[0]
        # {'aliquot_id': None, 'case_id': u'TCGA-BH-A0B1', 'sample_id': u'TCGA-BH-A0B1-10A', 'value': u'BRCA'}
        for data_point in result:
            data_item = {
                key: data_point[key]
                for key in ['case_id', 'sample_id', 'aliquot_id']
            }
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        type = provider.get_value_type()

    if len(user_studies) > 0:
        # Query User Data
        user_provider = UserFeatureProvider(feature_id,
                                            user_feature_id=user_feature_id)
        user_result = user_provider.get_data(cohort_id_array,
                                             cohort_settings.dataset_id,
                                             cohort_settings.table_id)
        result.extend(user_result)

        for data_point in user_result:
            data_item = {
                key: data_point[key]
                for key in ['case_id', 'sample_id', 'aliquot_id']
            }
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        if not type:
            type = user_provider.get_value_type()

    return type, items
Exemple #17
0
    def from_user_feature_id(cls, feature_id):
        """
           This returns a *LIST* of one UserFeatureDef, unless it maps to a standard project ID. Then
           the list may have multiple entries.
         """
        logging.debug("UserFeatureDef.from_user_feature_id {0}".format(
            str([feature_id])))
        # ID breakdown: project ID:Feature ID
        # Example ID: USER:1:6
        regex = re_compile("(?:^v2:)?USER:"******"([0-9]+):"
                           # Feature ID
                           "([0-9]+)$")

        feature_fields = regex.findall(feature_id)
        if len(feature_fields) == 0:
            raise FeatureNotFoundException(feature_id)
        project_id, user_feature_id = feature_fields[0]
        bq_id = None
        shared_id = None
        is_numeric = False

        try:
            db = get_sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(
                """
                SELECT feature_name, bq_map_id, shared_map_id, is_numeric
                FROM projects_user_feature_definitions
                WHERE id = %s
            """, (user_feature_id, ))
            for row in cursor.fetchall():
                if row['shared_map_id']:
                    shared_id = row['shared_map_id']
                bq_id = row["bq_map_id"]
                is_numeric = row['is_numeric'] == 1

            cursor.close()
            db.close()

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

        # The feature we want is "shared", i.e. defined in a project we are extended from. So we delegate the
        # task to the from_feature_id() function:
        if shared_id is not None:
            return cls.from_feature_id(bq_id, project_id)

        if bq_id is None:
            raise FeatureNotFoundException(feature_id)

        # Else we're querying a very specific feature from a specific project
        bq_table, column_name, symbol = cls.get_table_and_field(bq_id)
        if bq_table is None or column_name is None:
            raise FeatureNotFoundException(feature_id)

        logging.debug("{0} {1} {2}".format(bq_table, column_name, symbol))

        filters = None
        if symbol is not None:
            filters = {'Symbol': symbol}

        return [cls(bq_table, column_name, project_id, is_numeric, filters)]