def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    # query = """
    #     SELECT subject_id, read_drs_id
    #     FROM `isbcgc-216220.COPDGene.phenotype_drs`
    #     where weight_kg between 91.8 and 93.0
    #     LIMIT 1"""

    query = """
        SELECT submitter_id, read_drs_id
        FROM `isbcgc-216220.onek_genomes.ssd_drs`
        where population = 'BEB'
        LIMIT 1"""

    # Step 2 - DRS - set up a DRS client for BioDataCatalyst
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

    # Step 3 - set up a class that runs samtools for us on Google Cloud
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'One k query')
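# Conventional entry point for running the script above directly; a minimal
# sketch that assumes `sys` is imported at the top of the module along with
# the FASP client classes used in main().
if __name__ == "__main__":
    main(sys.argv[1:])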
def __init__(self, debug=False):
    # Preconfigured DRS clients, keyed by the identifier prefix used in queries
    self.drsClients = {
        "insdc": sdlDRSClient('~/.keys/prj_11218_D17199.ngc'),
        "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', 's3'),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs'),
        "anv": anvilDRSClient('~/.keys/anvil_credentials.json', '', 'gs'),
        "sbcgc": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "sbcav": cavaticaDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "srapub": DRSClient('https://locate.ncbi.nlm.nih.gov', debug=False)
    }
    self.registeredClients = []
    self.hostNameIndex = {}
    self.debug = debug
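# A minimal usage sketch for the client registry above, assuming `resolver` is
# an instance of the class whose __init__ is shown, and that identifiers use
# the "<prefix>:<drs_id>" convention seen in the scripts later in this section.
# The identifier value is hypothetical; getObject and getAccessURL are the DRS
# client methods used elsewhere in this section.
prefix, drs_id = "bdc:some-drs-id".split(":", 1)
client = resolver.drsClients[prefix]   # pick the client registered for the prefix
obj_info = client.getObject(drs_id)    # DRS object metadata (e.g. size)
url = client.getAccessURL(drs_id)      # signed URL for the underlying file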
def DRSClientFromRegistryEntry(self, service, prefix):
    # Return a preconfigured DRS client for known prefixes;
    # otherwise build a generic client from the service registry entry.
    if prefix == "crdc":
        drsClient = crdcDRSClient('~/.keys/crdc_credentials.json', 's3')
    elif prefix == "bdc":
        drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
    elif prefix == "insdc":
        drsClient = sdlDRSClient('~/.keys/prj_11218_D17199.ngc')
    elif prefix == "sbcgc":
        drsClient = sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3')
    elif prefix == "sbcav":
        drsClient = cavaticaDRSClient('~/.keys/sevenbridges_keys.json', 's3')
    else:
        drsClient = DRSClient.fromRegistryEntry(service)
    return drsClient
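# A usage sketch for the factory above, assuming `resolver` is an instance of
# the class this method belongs to and `service` is the matching GA4GH
# service-registry entry; the identifier value is hypothetical.
prefix, drs_id = "crdc:some-drs-id".split(":", 1)
drs_client = resolver.DRSClientFromRegistryEntry(service, prefix)
url = drs_client.getAccessURL(drs_id)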
def __init__(self, debug=False, getReg=True):
    # Preconfigured DRS clients, keyed by the identifier prefix used in queries
    self.drsClients = {
        "insdc": sdlDRSClient('~/.keys/prj_11218_D17199.ngc'),
        "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', 's3'),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs'),
        "anv": anvilDRSClient('~/.keys/anvil_credentials.json', '', 'gs'),
        "sbcgc": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "sbcav": cavaticaDRSClient('~/.keys/sevenbridges_keys.json', 'gs'),
        "sbbdc": sbbdcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "sradrs": SRADRSClient('https://locate.be-md.ncbi.nlm.nih.gov')
    }
    self.registeredClients = []
    self.hostNameIndex = {}
    self.debug = debug
    if getReg:
        self.getRegisteredDRSServices()
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'ACB' limit 3"

    # Step 2 - DRS - set up a DRS client
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

    # Step 3 - set up a class that runs a compute step for us
    wesClient = DNAStackWESClient('~/.keys/dnastack_wes_credentials.json')

    faspRunner.configure(searchClient, drsClient, wesClient)
    faspRunner.runQuery(query, 'One k query using Search and WES')
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/')
    query = "select submitter_id, read_drs_id drsid from thousand_genomes.onek_genomes.ssd_drs where population = 'BEB' limit 3"

    # Step 2 - DRS - set up a DRS client
    # BioDataCatalyst
    drsClient = bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')

    # Step 3 - set up a class that runs samtools for us,
    # providing the location where we want the results to go
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    faspRunner.configure(searchClient, drsClient, mysam)
    faspRunner.runQuery(query, 'One k query using Search')
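# GCPLSsamtools submits the samtools step to Google Cloud, so the actual
# command it runs is not visible in this snippet. For illustration only, a
# rough local stand-in for processing one file might look like the sketch
# below; it assumes samtools is installed locally, can read the signed URL
# directly, and that a `stats` run is what the pipeline performs, all of
# which are assumptions rather than facts taken from this code.
import subprocess

def local_samtools_stats(url, outfile):
    # Write `samtools stats` output for the file at `url` to `outfile`.
    with open(outfile, "w") as out:
        subprocess.run(["samtools", "stats", url], stdout=out, check=True)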
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # set your Seven Bridges CGC project using what you have put in FASP Settings
    sbProject = settings['SevenBridgesProject']
    sbInstance = settings['SevenBridgesInstance']

    # Step 1 - Discovery
    # query for relevant DRS objects
    discoveryClients = {
        "sb": DiscoverySearchClient('https://ga4gh-search-adapter-presto-public.prod.dnastack.com/'),
        "bdc": BigQuerySearchClient()
    }

    crdcquery = ("SELECT sp.dbGaP_Subject_ID, 'sb:'||sb_drs_id "
                 "FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp "
                 "join dbgap_demo.scr_gecco_susceptibility.sample_multi sm "
                 "on sm.dbgap_subject_id = sp.dbgap_subject_id "
                 "join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di "
                 "on di.sample_id = sm.sample_id "
                 "where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3")

    bdcquery = """
        SELECT sp.dbGaP_Subject_ID, 'bdc:'||read_drs_id
        FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm
        join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id
        join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id
        where gender = '2' and Age_Enroll between 45 and 55
        LIMIT 3"""

    results = discoveryClients['sb'].runQuery(crdcquery)  # Send the query
    creditor.creditFromList('dbGapSSD')
    creditor.creditClass(discoveryClients['sb'])
    results += discoveryClients['bdc'].runQuery(bdcquery)
    creditor.creditFromList('BDCData')

    # Step 2 - DRS - set up DRS clients
    drsClients = {
        "sb": sbcgcDRSClient('~/.keys/sevenbridges_keys.json', 's3'),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', 'gs')
    }
    print('setting credentials ')
    creditor.creditFromList('dbGaPFence')

    # Step 3 - set up a class that runs samtools for us,
    # providing the location for the results
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    sam2 = GCPLSsamtools(location, settings['GCPOutputBucket'])
    samClients = {
        "sb": samtoolsSBClient(sbInstance, sbProject),
        "bdc": sam2
    }

    # repeat steps 2 and 3 for each row of the query
    for row in results:
        print("subject={}, drsID={}".format(row[0], row[1]))
        resRow = [row[0], row[1]]

        # Step 2 - Use DRS to get the URL
        # get the prefix
        prefix, drsid = row[1].split(":", 1)
        drsClient = drsClients[prefix]
        searchClient = discoveryClients[prefix]
        creditor.creditClass(drsClient)
        url = drsClient.getAccessURL(drsid)
        print(url)
        # objInfo = drsClient.getObject(drsid)
        # print(objInfo)
        # fileSize = objInfo['size']
        fileSize = 0

        # Step 3 - Run a pipeline on the file at the DRS url
        if url is not None:
            outfile = "{}.txt".format(row[0])
            mysam = samClients[prefix]
            creditor.creditClass(mysam)
            via = 'sh'
            note = 'Two dbGaP sources'
            time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
            run_id = mysam.runWorkflow(url, outfile)
            faspRunner.logRun(time, via, note, run_id, outfile, fileSize,
                              searchClient, drsClient, mysam)
            resRow.append('OK')
        else:
            print('could not get DRS url')
            resRow.append('unauthorized')
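# For reference, the loop above expects each result row to be a pair of
# [subject_id, "<prefix>:<drs_id>"], matching the 'sb:'|| and 'bdc:'|| columns
# built by the two queries. The values below are hypothetical examples only.
example_results = [
    ["subject_1001", "sb:0123456789abcdef"],
    ["subject_1002", "bdc:00000000-0000-0000-0000-000000000000"],
]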
def main(argv):

    faspRunner = FASPRunner(pauseSecs=0)
    creditor = faspRunner.creditor
    settings = faspRunner.settings

    # Step 1 - Discovery
    # query for relevant DRS objects
    searchClient = BigQuerySearchClient()

    # TCGA query - CRDC
    crdcquery = """
        SELECT 'case_'||associated_entities__case_gdc_id, 'crdc:'||file_id
        FROM `isb-cgc.GDC_metadata.rel24_fileData_active`
        where data_format = 'BAM'
        and project_disease_type = 'Breast Invasive Carcinoma'
        limit 3"""

    # COPD query - TOPMed
    bdcquery = """
        SELECT SUBJECT_ID, 'bdc:'||read_drs_id
        FROM `isbcgc-216220.COPDGene.phenotype_drs`
        where Weight_KG between 92.5 and 93.0
        LIMIT 3"""

    results = searchClient.runQuery(crdcquery)  # Send the query
    creditor.creditFromList('ISBGDCData')
    results += searchClient.runQuery(bdcquery)
    creditor.creditFromList('BDCData')

    # Step 2 - DRS - set up DRS clients
    drsClients = {
        "crdc": crdcDRSClient('~/.keys/crdc_credentials.json', ''),
        "bdc": bdcDRSClient('~/.keys/bdc_credentials.json', '')
    }
    print('setting credentials ')
    creditor.creditFromList('dbGaPFence')

    # Step 3 - set up a class that runs samtools for us,
    # providing the location for the results
    location = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])
    mysam = GCPLSsamtools(location, settings['GCPOutputBucket'])

    # repeat steps 2 and 3 for each row of the query
    for row in results:
        print("subject={}, drsID={}".format(row[0], row[1]))

        # Step 2 - Use DRS to get the URL
        # get the prefix
        prefix, drsid = row[1].split(":", 1)
        drsClient = drsClients[prefix]
        creditor.creditClass(drsClient)
        url = drsClient.getAccessURL(drsid, 'gs')
        objInfo = drsClient.getObject(drsid)
        fileSize = objInfo['size']

        # Step 3 - Run a pipeline on the file at the DRS url
        outfile = "{}.txt".format(row[0])
        mysam.runWorkflow(url, outfile)
        creditor.creditClass(mysam)
        via = 'sh'
        pipeline_id = 'paste here'
        note = 'Two sources'
        time = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        faspRunner.logRun(time, via, note, pipeline_id, outfile, fileSize,
                          searchClient, drsClient, mysam)

    creditor.creditFromList('FASPScript2_sdrf', closeImage=False)