예제 #1
0
def user_feature_handler(feature_id, cohort_id_array):
    include_tcga = False
    user_studies = ()
    for cohort_id in cohort_id_array:
        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)

            cursor.execute("SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id,))
            for row in cursor.fetchall():
                if row['project_id'] is None:
                    include_tcga = True
                else:
                    user_studies += (row['project_id'],)

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

    user_feature_id = None
    if feature_id.startswith('USER:'******'t include TCGA
            include_tcga = False

    return {
        'converted_feature_id': feature_id,
        'include_tcga': include_tcga,
        'user_studies': user_studies,
        'user_feature_id': user_feature_id
    }
예제 #2
0
def submit_jobs_with_user_data(params_array):
    bigquery_service = authorize_credentials_with_Google()
    provider_array = []

    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()

    # Submit jobs
    for parameter_object in params_array:
        feature_id = parameter_object.feature_id
        cohort_id_array = parameter_object.cohort_id_array

        user_data = user_feature_handler(feature_id, cohort_id_array)

        if user_data['include_tcga']:
            job_item = submit_tcga_job(parameter_object, bigquery_service, cohort_settings)
            provider_array.append(job_item)

        if len(user_data['user_studies']) > 0:
            converted_feature_id = user_data['converted_feature_id']
            user_feature_id = user_data['user_feature_id']
            logging.debug("user_feature_id: {0}".format(user_feature_id))
            provider = UserFeatureProvider(converted_feature_id, user_feature_id=user_feature_id)

            # The UserFeatureProvider instance might not generate a BigQuery query and job at all given the combination
            # of cohort(s) and feature identifiers. The provider is not added to the array, and therefore to the
            # polling loop below, if it would not submit a BigQuery job.
            if provider.is_queryable(cohort_id_array):
                job_reference = provider.get_data_job_reference(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id)

                logging.info("Submitted USER {job_id}: {fid} - {cohorts}".format(job_id=job_reference['jobId'], fid=feature_id,
                                                                                 cohorts=str(cohort_id_array)))
                provider_array.append({
                    'feature_id': feature_id,
                    'provider': provider,
                    'ready': False,
                    'job_reference': job_reference
                })
            else:
                logging.debug("No UserFeatureDefs for '{0}'".format(converted_feature_id))

    return provider_array
예제 #3
0
def get_feature_vector(feature_id, cohort_id_array):
    include_tcga = False
    user_studies = ()
    for cohort_id in cohort_id_array:
        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)

            cursor.execute("SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id,))
            for row in cursor.fetchall():
                if row['project_id'] is None:
                    include_tcga = True
                else:
                    user_studies += (row['project_id'],)

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

    #  ex: feature_id 'CLIN:Disease_Code'
    user_feature_id = None
    if feature_id.startswith('USER:'******'t include TCGA
            include_tcga = False

    items = []
    type = None
    result = []
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
    if include_tcga:
        provider = FeatureProviderFactory.from_feature_id(feature_id)
        result = provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id)

        # ex: result[0]
        # {'aliquot_id': None, 'case_id': u'TCGA-BH-A0B1', 'sample_id': u'TCGA-BH-A0B1-10A', 'value': u'BRCA'}
        for data_point in result:
            data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']}
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        type = provider.get_value_type()

    if len(user_studies) > 0:
        # Query User Data
        user_provider = UserFeatureProvider(feature_id, user_feature_id=user_feature_id)
        user_result = user_provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id)
        result.extend(user_result)

        for data_point in user_result:
            data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']}
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        if not type:
            type = user_provider.get_value_type()

    return type, items