def main(argv):

    searchClient = BigQuerySearchClient()

    # TCGA Query - CRDC
    crdcquery = """
     	SELECT 'case_'||associated_entities__case_gdc_id , 'crdc:'||file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""

    # COPD query - TOPMed
    bdcquery = """
  		SELECT SUBJECT_ID, 'bdc:'||read_drs_id
  		FROM `isbcgc-216220.COPDGene.phenotype_drs`
      	where Weight_KG between 92.5 and 93.0
      	LIMIT 3"""

    results = searchClient.runQuery(crdcquery)  # Send the query
    results += searchClient.runQuery(bdcquery)

    # repeat steps 2 and 3 for each row of the query
    for row in results:

        print("subject={}, drsID={}".format(row[0], row[1]))
Example #2
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # set your Seven Bridges CGC project using what you have put in FASP Settings
    sbProject = settings['SevenBridgesProject']
    sbInstance = settings['SevenBridgesInstance']

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    # Step 2 - DRS - set up a DRS Client
    drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - set up a class that runs samtools for us
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    sam2 = GCPLSsamtools(location, settings['GCPOutputBucket'])
    mysams = {'s3': samtoolsSBClient(sbInstance, sbProject), 'gs': sam2}

    query = """
     	SELECT 'case_'||associated_entities__case_gdc_id , file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""
    print(query)

    query_job = searchClient.runQuery(query)  # Send the query
    creditor.creditFromList('ISBGDCData')

    # repeat steps 2 and 3 for each row of the query

    for row in query_job:

        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        objInfo = drsClient.getObject(row[1])
        creditor.creditClass(drsClient)
        fileSize = objInfo['size']
        outfile = "{}.txt".format(row[0])
        # submit to both AWS and GCP
        for cl, mysam in mysams.items():
            url = drsClient.getAccessURL(row[1], cl)
            # Step 3 - Run a pipeline on the file at the drs url

            creditor.creditClass(mysam)
            task_id = mysam.runWorkflow(url, outfile)
            via = 'py'
            note = 'double submit'

            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            faspRunner.logRun(time, via, note, task_id, outfile, str(fileSize),
                              searchClient, drsClient, mysam)

    creditor.creditFromList('FASPScript8_sdrf', closeImage=False)
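# A hedged sketch of the imports this example appears to need; the fasp.runner,
# fasp.search, fasp.loc and fasp.workflow paths are taken from Example #18
# below, but the module providing samtoolsSBClient is an assumption.
import datetime

from fasp.runner import FASPRunner
from fasp.search import BigQuerySearchClient
from fasp.loc import crdcDRSClient
from fasp.workflow import GCPLSsamtools
from fasp.workflow import samtoolsSBClient  # assumed location, not confirmed in this listing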
Example #3
def main(argv):

    searchClient = DiscoverySearchClient(
        'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    #query = "select id, phenopacket from sample_phenopackets.ga4gh_tables.gecco_phenopackets limit 10"
    query = "select id from sample_phenopackets.ga4gh_tables.gecco_phenopackets where json_extract_scalar(phenopacket, '$.subject.sex') = 'MALE'"

    bqSearchClient = BigQuerySearchClient()
    #query = "select id, phenopacket from sample_phenopackets.ga4gh_tables.gecco_phenopackets limit 10"

    crdcquery = """
		SELECT BioSample_Accession id
		FROM `isbcgc-216220.GECCO_CRC_Susceptibility.Subject_Phenotypes` sp
		join `isbcgc-216220.GECCO_CRC_Susceptibility.Sample_MULTI` sm on sm.dbgap_subject_id = sp.dbgap_subject_id
		and sex = 'Male'
		"""

    dbList = []
    results = bqSearchClient.runQuery(crdcquery)
    print(len(results))
    for r in results:
        dbList.append(r['id'])
    ppList = []
    query_job = searchClient.runQuery(query)  # Send the query
    print(len(query_job))
    for r in query_job:
        ppList.append(r[0])

    # compare the lists
    dbList.sort()
    ppList.sort()
    if dbList == ppList:
        print("The lists dbList and ppList are the same")
    else:
        print("The lists dbList and ppList are not the same")
Example #4
def main(argv):

    searchClient = BigQuerySearchClient()
    query = """
		SELECT submitter_id, read_drs_id
		FROM `isbcgc-216220.onek_genomes.ssd_drs`
		where population = 'BEB'
		LIMIT 1"""

    res = searchClient.runQuery(query)
    print(res)
Example #5
def main(argv):

    searchClient = BigQuerySearchClient()

    query = """
     	SELECT 'case_'||associated_entities__case_gdc_id , file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""

    searchClient.runQuery(query)
Example #6
def main():


	searchClient = BigQuerySearchClient()

	query = """
		SELECT s.sample_name, drs_id, s.acc, assay_type, filename, 
		FROM `nih-sra-datastore.sra.metadata` s, unnest(attributes) att
		join `isbcgc-216220.onek_genomes.sra_drs_files` d on d.acc = s.acc
		where filetype = 'bam' and mapped = 'mapped' and sequencing_type ='exome'
		and att.k = 'population_sam' and att.v = 'JPT' 
		LIMIT 3"""

	searchClient.runQuery(query)
Example #7
def main():

	searchClient = BigQuerySearchClient()
	query = """SELECT sra.biosample, sra.acc||'.cram'
		FROM `isbcgc-216220.GECCO_CRC_Susceptibility.Subject_Phenotypes` sp
		join `isbcgc-216220.GECCO_CRC_Susceptibility.Sample_MULTI` sm on
		sm.dbgap_subject_id = sp.dbgap_subject_id
		join `nih-sra-datastore.sra.metadata` sra on sm.BioSample_Accession = sra.biosample
		where AGE between 45 and 55 and sex = 'Female' limit 3"""
	query_job = searchClient.runQuery(query)
	
	# repeat steps 2 and 3 for each row of the query
	for row in query_job:

		print("sample={}, drsID={}".format(row[0], row[1]))
Example #8
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    query = """
     	SELECT 'case_'||associated_entities__case_gdc_id , file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""

    drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')

    # Step 3 - set up a class that runs samtools for us
    sbProject = faspRunner.settings['SevenBridgesProject']
    sbInst = faspRunner.settings['SevenBridgesInstance']
    mysam = samtoolsSBClient(sbInst, sbProject)

    faspRunner.configure(searchClient, drsClient, mysam)

    faspRunner.runQuery(query, 'GDC query SB compute')
Example #9
def main(argv):

    # Step 1 - Discovery
    # query for relevant DRS objects
    discoveryClients = {
        "sb":
        DiscoverySearchClient(
            'https://ga4gh-search-adapter-presto-public.prod.dnastack.com'),
        "bdc":
        BigQuerySearchClient()
    }

    crdcquery = """SELECT sp.dbGaP_Subject_ID,  'sb:'||sb_drs_id 
	FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp 
	join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id 
	join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id 
	where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3"""

    bdcquery = """
		SELECT sp.dbGaP_Subject_ID,  'bdc:'||read_drs_id
		FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm
		join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id
		join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id
 		where gender = '2'
 		and Age_Enroll between 45 and 55
 		LIMIT 3"""

    results = discoveryClients['sb'].runQuery(crdcquery)  # Send the query
    results += discoveryClients['bdc'].runQuery(bdcquery)

    # repeat steps 2 and 3 for each row of the query
    for row in results:

        print("subject={}, drsID={}".format(row[0], row[1]))
Example #10
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings
    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()
    # 	query = """
    #      	SELECT subject_id, read_drs_id
    #      	FROM `isbcgc-216220.COPDGene.phenotype_drs`
    #      	where weight_kg between 91.8 and 93.0
    #      	LIMIT 1"""
    query = """
		SELECT submitter_id, read_drs_id
		FROM `isbcgc-216220.onek_genomes.ssd_drs`
		where population = 'BEB'
		LIMIT 1"""

    # BioDataCatalyst
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)

    faspRunner.runQuery(query, 'One k  query ')
Example #11
def main(argv):


	faspRunner = FASPRunner(pauseSecs=0)
	settings = faspRunner.settings
	# Step 1 - Discovery
	# query for relevant DRS objects
	searchClient = BigQuerySearchClient()

	query = """
		SELECT s.sample_name, drs_id, s.acc, assay_type, filename, 
		FROM `nih-sra-datastore.sra.metadata` s, unnest(attributes) att
		join `isbcgc-216220.onek_genomes.sra_drs_files` d on d.acc = s.acc
		where filetype = 'bam' and mapped = 'mapped' and sequencing_type ='exome'
		and att.k = 'population_sam' and att.v = 'JPT' 
		LIMIT 3"""

	#drsClient = DRSMetaResolver()
	drsClient = DRSClient('https://locate.ncbi.nlm.nih.gov',access_id='2', public=True)
	location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
	mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

	faspRunner.configure(searchClient, drsClient, mysam)
		
	faspRunner.runQuery(query, 'One k query SRA DRS')
Example #12
def main(argv):

	faspRunner = FASPRunner(pauseSecs=0)
	
	# Step 1 - Discovery
	# query for relevant DRS objects
	searchClient = BigQuerySearchClient()
	query = """SELECT sra.biosample, sra.acc||'.cram'
		FROM `isbcgc-216220.GECCO_CRC_Susceptibility.Subject_Phenotypes` sp
		join `isbcgc-216220.GECCO_CRC_Susceptibility.Sample_MULTI` sm on
		sm.dbgap_subject_id = sp.dbgap_subject_id
		join `nih-sra-datastore.sra.metadata` sra on sm.BioSample_Accession = sra.biosample
		where AGE between 45 and 55 and sex = 'Female' limit 3"""
	query_job = searchClient.runQuery(query)
	
	# Step 2 - DRS - set up a DRS Client
	# CRDC
	drsClient = sdlDRSClient('~/.keys/prj_14565.ngc', True)
	
	# Step 3 - set up a class that runs a compute for us
	wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')
	
	# repeat steps 2 and 3 for each row of the query
	for row in query_job:

		print("sample={}, drsID={}".format(row[0], row[1]))
		
		# Step 2 - Use DRS to get the URL
		objInfo = drsClient.getObject(row[1])
		fileSize = objInfo['size']
		print(fileSize)
		# we've predetermined we want to use the gs copy in this case
		#url = drsClient.getAccessURL(row[1], 'gs')
		res = drsClient.getAccessURL(row[1],'gs.us')
		url = res['url']
		print(url)
		# Step 3 - Run a pipeline on the file at the drs url
		outfile = "{}.txt".format(row[0])
		pipeline_id = wesClient.runWorkflow(url, outfile)
		print('submitted:{}'.format(pipeline_id))
		
		via = 'WES'
		note = 'WES MD5 on NCBI SDL'

		time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
		faspRunner.logRun(time, via, note,  pipeline_id, outfile, str(fileSize),
			searchClient, drsClient, wesClient)
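# A hedged sketch of the imports assumed by this example; FASPRunner and
# BigQuerySearchClient paths match Example #18 below, while the fasp.loc and
# fasp.workflow locations for sdlDRSClient and DNAStackWESClient are assumptions
# based on the naming pattern of the other clients.
import datetime

from fasp.runner import FASPRunner
from fasp.search import BigQuerySearchClient
from fasp.loc import sdlDRSClient             # assumed location
from fasp.workflow import DNAStackWESClient   # assumed location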
Example #13
def main(argv):

    faspRunner = FASPRunner()
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    discoveryClients = {
        "crdc":
        BigQuerySearchClient(),
        "anv":
        Gen3ManifestClient('./fasp/data/gtex/gtex-cram-manifest_wCuries.json')
    }

    # TCGA Query - CRDC
    crdcquery = """
     	SELECT 'case_'||associated_entities__case_gdc_id , 'crdc:'||file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""

    # Run both queries and aggregate the results
    results = discoveryClients['anv'].runQuery(3)  # Send the query
    results += discoveryClients['crdc'].runQuery(crdcquery)

    # Step 2 - DRS - set up DRS Clients
    # TODO Use DRSMetaResolver so we don't have to build our own resolver in this code
    drsClients = {
        "crdc":
        crdcDRSClient('~/.keys/crdc_credentials.json', 'gs'),
        "anv":
        anvilDRSClient('~/.keys/anvil_credentials.json',
                       settings['GCPProject'], 'gs')
    }

    # Step 3 - set up a class that runs samtools for us
    # providing the location for the results
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                                 settings['GCPPipelineRegion'])
    wesClient = GCPLSsamtools(location, settings['GCPOutputBucket'])

    # repeat steps 2 and 3 for each row of the query
    for row in results:

        print("subject={}, drsID={}".format(row[0], row[1]))
        resRow = [row[0], row[1]]
        # Step 2 - Use DRS to get the URL
        # This is a local solution to resolve prefixed DRS ids; DRSMetaResolver would be better
        # get the prefix

        prefix, drsid = row[1].split(":", 1)
        drsClient = drsClients[prefix]
        print('Sending id {} to {}'.format(drsid,
                                           drsClient.__class__.__name__))

        url = drsClient.getAccessURL(drsid)
        objInfo = drsClient.getObject(drsid)
        #print (objInfo)
        fileSize = objInfo['size']
        #fileSize = 0

        # Step 3 - Run a pipeline on the file at the drs url
        if url is not None:
            outfile = "{}.txt".format(row[0])
            via = 'sh'
            note = 'GTEx and TCGA'
            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            run_id = wesClient.runWorkflow(url, outfile)
            searchClient = discoveryClients[prefix]
            faspRunner.logRun(time, via, note, run_id, outfile, fileSize,
                              searchClient, drsClient, wesClient)
            resRow.append('OK')
        else:
            print('could not get DRS url')
            resRow.append('unauthorized')
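# The prefix-dispatch idiom used above can be shown standalone; a minimal
# sketch with no fasp dependencies (the helper name is ours, not part of fasp).
def resolve_prefixed_drs_id(prefixed_id, clients):
    """Split an id such as 'crdc:abc123' into its prefix and bare DRS id
    and return the matching client together with the id to send to it."""
    prefix, drs_id = prefixed_id.split(":", 1)
    return clients[prefix], drs_id

# usage: client, drs_id = resolve_prefixed_drs_id(row[1], drsClients)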
Example #14
def main(argv):

	
	faspRunner = FASPRunner(pauseSecs=0)
	creditor = faspRunner.creditor
	settings = faspRunner.settings
	
	# set your Seven Bridges CGC project using what you have put in FASP Settings
	sbProject = settings['SevenBridgesProject']
	sbInstance = settings['SevenBridgesInstance']

	# Step 1 - Discovery
	# query for relevant DRS objects
	discoveryClients = {
		"sb": DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/'),
		"bdc": BigQuerySearchClient()
	}

	crdcquery = "SELECT sp.dbGaP_Subject_ID,  'sb:'||sb_drs_id FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3"
		


	bdcquery = """
		SELECT sp.dbGaP_Subject_ID,  'bdc:'||read_drs_id
		FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm
		join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id
		join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id
 		where gender = '2'
 		and Age_Enroll between 45 and 55
 		LIMIT 3"""
		

	results = discoveryClients['sb'].runQuery(crdcquery)  # Send the query
	creditor.creditFromList('dbGapSSD')
	creditor.creditClass(discoveryClients['sb'])
	results += discoveryClients['bdc'].runQuery(bdcquery) 
	creditor.creditFromList('BDCData')
	

	# Step 2 - DRS - set up DRS Clients	
	drsClients = {
		"sb": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
		"bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
	}
	print('setting credentials ')
	creditor.creditFromList('dbGaPFence')
		
	# Step 3 - set up a class that runs samtools for us
	# providing the location for the results
	location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
	sam2 = GCPLSsamtools(location, settings['GCPOutputBucket'])
	samClients = {
		"sb": samtoolsSBClient(sbInstance, sbProject),
		"bdc": sam2
	}

	
	# repeat steps 2 and 3 for each row of the query
	for row in results:

		print("subject={}, drsID={}".format(row[0], row[1]))
		resRow = [row[0], row[1]]
		# Step 2 - Use DRS to get the URL
		# get the prefix
		prefix, drsid = row[1].split(":", 1)
		drsClient = drsClients[prefix]
		searchClient = discoveryClients[prefix]
		creditor.creditClass(drsClient)
		url = drsClient.getAccessURL(drsid)
		print(url)
		#objInfo = drsClient.getObject(drsid)
		#print (objInfo)
		#fileSize = objInfo['size']
		fileSize = 0
				
		# Step 3 - Run a pipeline on the file at the drs url
		if url is not None:
			outfile = "{}.txt".format(row[0])
			mysam = samClients[prefix]
			creditor.creditClass(mysam)
			via = 'sh'
			note = 'Two dbGaP sources'
			time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
			run_id = mysam.runWorkflow(url, outfile)
			faspRunner.logRun(time, via, note,  run_id, outfile, fileSize,
				searchClient, drsClient, mysam)
			resRow.append('OK')
		else:
			print('could not get DRS url')
			resRow.append('unauthorized')
Example #15
def main(argv):

	
	faspRunner = FASPRunner(pauseSecs=0)
	creditor = faspRunner.creditor
	settings = faspRunner.settings
	
	# Step 1 - Discovery
	# query for relevant DRS objects
	searchClient = BigQuerySearchClient()

	# TCGA Query - CRDC
	crdcquery = """
     	SELECT 'case_'||associated_entities__case_gdc_id , 'crdc:'||file_id
		FROM `isb-cgc.GDC_metadata.rel24_fileData_active` 
		where data_format = 'BAM' 
		and project_disease_type = 'Breast Invasive Carcinoma'
		limit 3"""
	
	# COPD query - TOPMed
	bdcquery = """
  		SELECT SUBJECT_ID, 'bdc:'||read_drs_id
  		FROM `isbcgc-216220.COPDGene.phenotype_drs`
      	where Weight_KG between 92.5 and 93.0
      	LIMIT 3"""
  		
	results = searchClient.runQuery(crdcquery)  # Send the query
	creditor.creditFromList('ISBGDCData')
	results += searchClient.runQuery(bdcquery)  
	creditor.creditFromList('BDCData')

	# Step 2 - DRS - set up DRS Clients	
	drsClients = {
		"crdc": crdcDRSClient('~/.keys/crdc_credentials.json', ''),
		"bdc": bdcDRSClient('~/.keys/bdc_credentials.json', '')
	}
	print('setting credentials ')
	creditor.creditFromList('dbGaPFence')
	
	# Step 3 - set up a class that runs samtools for us
	# providing the location for the results
	location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
	mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])
	
	# repeat steps 2 and 3 for each row of the query
	for row in results:

		print("subject={}, drsID={}".format(row[0], row[1]))
		
		# Step 2 - Use DRS to get the URL
		# get the prefix
		prefix, drsid = row[1].split(":", 1)
		url = drsClients[prefix].getAccessURL(drsid, 'gs')
		drsClient = drsClients[prefix]
		creditor.creditClass(drsClient)
		objInfo = drsClient.getObject(drsid)
		fileSize = objInfo['size']
				
		# Step 3 - Run a pipeline on the file at the drs url
		outfile = "{}.txt".format(row[0])
		pipeline_id = mysam.runWorkflow(url, outfile)
		creditor.creditClass(mysam)
		via = 'sh'
		note = 'Two sources'
		time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
		faspRunner.logRun(time, via, note,  pipeline_id, outfile, fileSize,
			searchClient, drsClient, mysam)
			
	creditor.creditFromList('FASPScript2_sdrf', closeImage=False)
Example #16
	# (excerpt from the FASPLogger class; the __init__ that opens self.log is not shown)
	def __del__(self):
		self.log.close()


	def logRun(self, time, via, note, pipeline_id, outfile, fileSize, 
		searcher, finder, computer):
		
		searchClass = searcher.__class__.__name__
		drsClass = finder.__class__.__name__
		computeClass = computer.__class__.__name__
		
		logline = '{}\t\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(time, via,
			self.program, note, pipeline_id, outfile, fileSize,
			searchClass, drsClass, computeClass)
		self.log.write(logline)
		self.log.write("\n")
		
	def close(self):
		self.log.close()
		

if __name__ == "__main__":
	dummy = BigQuerySearchClient()		
	fl = FASPLogger( './testlog.txt', 'me')
	via = 'py'
	note = 'testing'
	time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
	pipeline_id = '123456'
	outfile = 'text.txt'
	fileSize = 100
	fl.logRun(time, via, note, pipeline_id, outfile, fileSize, dummy, dummy, dummy)
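# A minimal note on what the test block above needs to run: datetime plus
# BigQuerySearchClient (used only as a stand-in client); the fasp.search path
# matches the other examples in this listing.
import datetime

from fasp.search import BigQuerySearchClient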
Example #17
''' Query to illustrate Anne's use case for variants related to a gene involved in a rare pediatric brain cancer'''
#  IMPORTS

from fasp.search import BigQuerySearchClient


searchClient = BigQuerySearchClient()

query = """
SELECT mut.case_barcode subject, meta.file_gdc_id as drs_id, 
meta.file_gdc_url as tumor_bam_file_path,
clin.race, clin.age_at_diagnosis, clin.ethnicity
  
FROM `isb-cgc.TCGA_hg38_data_v0.Somatic_Mutation` as mut 
join `isb-cgc.TCGA_bioclin_v0.Clinical` as clin 
on clin.case_barcode = mut.case_barcode 
join `isb-cgc.GDC_metadata.rel24_GDCfileID_to_GCSurl` as meta 
on meta.file_gdc_id = mut.tumor_bam_uuid #OR meta.file_gdc_id = mut.normal_bam_uuid 

where mut.Hugo_Symbol = "JMJD1C" 

order by meta.file_gdc_id
limit 3"""

searchClient.runQuery(query)

Example #18
''' Query to illustrate Anne's use case for variants related to a gene involved in a rare pediatric brain cancer'''
#  IMPORTS
import sys

from fasp.runner import FASPRunner

# The implementations we're using
from fasp.loc import crdcDRSClient
from fasp.workflow import GCPLSsamtools
from fasp.search import BigQuerySearchClient

faspRunner = FASPRunner(pauseSecs=0)
settings = faspRunner.settings

searchClient = BigQuerySearchClient()
drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 'gs')
location = 'projects/{}/locations/{}'.format(settings['GCPProject'],
                                             settings['GCPPipelineRegion'])
mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

faspRunner.configure(searchClient, drsClient, mysam)

query = """
SELECT mut.case_barcode subject, meta.file_gdc_id as drs_id, 
meta.file_gdc_url as tumor_bam_file_path,
clin.race, clin.age_at_diagnosis, clin.ethnicity
  
FROM `isb-cgc.TCGA_hg38_data_v0.Somatic_Mutation` as mut 
join `isb-cgc.TCGA_bioclin_v0.Clinical` as clin 
on clin.case_barcode = mut.case_barcode 
join `isb-cgc.GDC_metadata.rel24_GDCfileID_to_GCSurl` as meta