def checkRunsPreviouslyLaunched(resultsFolder, projectId):
    '''Checks for currently running jobs and will exit if found.'''
    launchFilePath = resultsFolder + '/' + RUNS_LAUNCHED_FILE
    launchFids = dxencode.find_file(launchFilePath, projectId, multiple=True)
    if launchFids is None:
        print "  No prior jobs launched."
    else:
        # NOTE: Appending to the one file, but just in case handle multiple files.
        for fid in launchFids:
            with dxpy.open_dxfile(fid) as fd:
                for line in fd:
                    #print "Looking for job ["+line+"]"
                    runId = line.split(None, 1)
                    if not runId[0].startswith('analysis-'):
                        continue
                    analysis = dxpy.DXAnalysis(dxid=runId[0])
                    if analysis is None:
                        continue
                    state = analysis.describe()['state']
                    # States seen so far: in_progress, terminated, done, failed.
                    if state not in ["done", "failed", "terminated"]:
                        msg = "Exiting: Can't launch because prior run [" + runId[0] + "] "
                        if len(runId) > 1:
                            msg += "(" + runId[1] + ") "
                        msg += "has not finished (currently '" + state + "')."
                        print msg
                        sys.exit(1)
                    else:
                        msg = "  Prior run [" + runId[0] + "] "
                        if len(runId) > 1:
                            msg += "(" + runId[1] + ") "
                        msg += "is '" + state + "'."
                        print msg
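checkRunsPreviouslyLaunched only reads RUNS_LAUNCHED_FILE; the writer side is not shown in this section. A minimal sketch of what it assumes, namely one whitespace-separated "analysis-id description" line per launched run; record_run_launched is a hypothetical helper (not part of the original code) and the upload call is an assumption about how the file gets into the results folder:

def record_run_launched(resultsFolder, projectId, analysis, description=''):
    '''Hypothetical counterpart to checkRunsPreviouslyLaunched: write the
    launched analysis ID (plus an optional description) to RUNS_LAUNCHED_FILE
    so later invocations can refuse to double-launch.'''
    line = analysis.get_id()
    if description:
        line += ' ' + description
    # Assumption: a fresh file per launch; the real project may instead
    # append to an existing file, as the NOTE above suggests.
    dxpy.upload_string(line + '\n',
                       project=projectId,
                       folder=resultsFolder,
                       name=RUNS_LAUNCHED_FILE)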
def rerun_with_frip(analysis_id, dryrun, assay_type):
    logger.debug('rerun_with_frip: analysis_id %s' % (analysis_id))
    analysis = dxpy.DXAnalysis(analysis_id)
    project_id = analysis.describe()['project']
    temp = dxpy.api.workflow_new({
        'name': analysis.describe()['executableName'],
        'project': project_id,
        'initializeFrom': {'id': analysis.get_id()},
        'temporary': True})
    new_workflow = dxpy.DXWorkflow(temp['id'])
    logger.debug('rerun_with_frip: new_workflow %s %s'
                 % (new_workflow.get_id(), new_workflow.name))
    final_stage, new_input, new_applet = get_assay_specific_variables(
        analysis,
        assay_type)
    accessioning_stage = stage_named('Accession results', analysis)
    if accessioning_stage:
        new_workflow.remove_stage(accessioning_stage['id'])
    logger.debug('rerun_with_frip: new_applet %s %s'
                 % (new_applet.get_id(), new_applet.name))
    logger.debug('rerun_with_frip: new_input \n%s' % (pformat(new_input)))
    new_workflow.update_stage(
        final_stage['id'],
        executable=new_applet.get_id(),
        stage_input=new_input,
        force=True)
    m = re.match('ENCSR.{6}', analysis.name)
    accession = m.group(0)
    analysis_properties = analysis.describe()['properties']
    analysis_properties.update({
        'experiment_accession': accession,
        'original_analysis': analysis_id})
    if dryrun:
        logger.debug('rerun_with_frip: workflow created but dryrun so no analysis run')
        return new_workflow
    else:
        logger.debug('rerun_with_frip: running workflow')
        return new_workflow.run(
            {},
            project=project_id,
            name="%s frip" % (analysis.name),
            properties=analysis_properties)
def rerun_with_applet(analysis_id, stage_name, applet_name, folder=None):
    logger.debug('rerun_with_applet: analysis_id %s new_applet_name %s'
                 % (analysis_id, applet_name))
    analysis = dxpy.DXAnalysis(analysis_id)
    old_workflow_description = analysis.describe().get('workflow')
    old_workflow = dxpy.DXWorkflow(old_workflow_description['id'])
    project_id = analysis.describe()['project']
    temp = dxpy.api.workflow_new({
        'name': analysis.describe()['executableName'],
        'project': project_id,
        'initializeFrom': {'id': analysis.get_id()},
        'properties': old_workflow.get_properties(),
        'temporary': True})
    new_workflow = dxpy.DXWorkflow(temp['id'])
    logger.debug('rerun_with_applet: new_workflow %s %s'
                 % (new_workflow.get_id(), new_workflow.name))
    old_stage = stage_named(stage_name, analysis)
    accessioning_stage = stage_named('Accession results', analysis)
    if accessioning_stage:
        new_workflow.remove_stage(accessioning_stage['id'])
    new_applet = find_applet_by_name(applet_name)
    logger.debug('rerun_with_applet: new_applet %s %s'
                 % (new_applet.get_id(), new_applet.name))
    same_input = old_stage['execution']['input']
    logger.debug('rerun_with_applet: same_input \n%s' % (pformat(same_input)))
    new_workflow.update_stage(
        old_stage['id'],
        executable=new_applet.get_id(),
        stage_input=same_input,
        force=True)
    m = re.match('ENCSR.{6}', analysis.name)
    accession = m.group(0)
    analysis_properties = analysis.describe()['properties']
    analysis_properties.update({
        'experiment_accession': accession,
        'original_analysis': analysis_id})
    logger.debug('rerun_with_applet: running workflow')
    runargs = {
        # 'executable_input': {},
        'project': project_id,
        'name': "%s %s" % (analysis.name, new_applet.name),
        'properties': analysis_properties}
    if folder is not None:
        runargs.update({'folder': folder})
    logger.debug("running new_workflow with args: \n%s" % (pformat(runargs)))
    return new_workflow.run({}, **runargs)
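Both rerun helpers return the handle of the newly launched analysis (or, for rerun_with_frip with dryrun, the temporary workflow). A hypothetical invocation of rerun_with_applet; the analysis ID and applet name below are placeholders, not values from the original project:

# Hypothetical usage: swap one stage for a patched applet and rerun the
# pipeline into a scratch folder. IDs and names are illustrative only.
new_analysis = rerun_with_applet('analysis-xxxx',
                                 'Final IDR peak calls',
                                 'encode_idr_patched',
                                 folder='/reruns')
print new_analysis.get_id()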
def main():
    args = get_args()
    if args.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s',
                            level=logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    else:
        # Use the default logging level.
        logging.basicConfig(format='%(levelname)s:%(message)s')
        logger.setLevel(logging.INFO)
    if args.released:
        keypair = None
        server = PUBLIC_SERVER
    else:
        authid, authpw, server = common.processkey(args.key, args.keyfile)
        keypair = (authid, authpw)
    if args.experiments:
        ids = args.experiments
    elif args.all:
        # Get metadata for all ChIP-seq Experiments.
        base_exp_query = '/search/?type=Experiment&assay_title=ChIP-seq&award.project=ENCODE&status=released'
        extended_query = '&status=submitted&status=in+progress&status=started&status=release+ready'
        exp_query = base_exp_query if args.released else (base_exp_query + extended_query)
        all_experiments = common.encoded_get(server + exp_query,
                                             keypair)['@graph']
        # Extract Experiment accessions.
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError('Must supply experiment ids'
                         ' in arguments or --infile.')
    # Define column names for TSV.
    fieldnames = [
        'date', 'analysis', 'analysis_id', 'experiment', 'target',
        'biosample_term_name', 'biosample_type', 'replication', 'lab', 'rfa',
        'assembly', 'Nt', 'Np', 'N1', 'N2', 'rescue_ratio',
        'self_consistency_ratio', 'reproducibility_test', 'Ft', 'Fp', 'F1',
        'F2', 'state', 'release', 'total_price', 'quality_metric_of'
    ]
    if args.create_google_sheet:
        # Force creation of temporary CSV that can be loaded into a DataFrame,
        # written to Google Sheets, then deleted.
        temp_file = 'temp_idr_%s.tsv' % (args.assembly)
        args.outfile = open(temp_file, 'w')
    writer = csv.DictWriter(args.outfile,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()
    # Get metadata for all IDR output Files.
    base_idr_query = ('/search/?type=File&assembly=%s&file_format=bed'
                      '&output_type=optimal+idr+thresholded+peaks'
                      '&output_type=conservative+idr+thresholded+peaks'
                      '&output_type=pseudoreplicated+idr+thresholded+peaks'
                      '&lab.title=ENCODE+Processing+Pipeline'
                      '&lab.title=J.+Michael+Cherry,+Stanford'
                      '&status=released' % (args.assembly))
    extended_idr_query = '&status=in+progress&status=uploading&status=uploaded'
    idr_query = base_idr_query if args.released else (base_idr_query + extended_idr_query)
    all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph']
    na = 'not_available'
    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = \
            [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs):
            if not args.all:
                logger.warning("%s: Found %d IDR step runs. Skipping"
                               % (experiment_id, len(idr_step_runs)))
            continue
        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error('%s: Expected one IDR quality metric for file %s.'
                             ' Found %d.' % (experiment_id,
                                             f.get('accession'),
                                             len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error('%s: File %s has no assembly'
                             % (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error('%s: Expected one unique IDR metric, found %d.'
                         ' Skipping.' % (experiment_id, len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error('%s: Expected one unique assembly, found %d.'
                         ' Skipping.' % (experiment_id, len(assemblies)))
            continue
        # Grab unique value from set.
        idr_qc_uri = next(iter(idr_qc_uris))
        assembly = next(iter(assemblies))
        # Get analysis_id from DNAnexus, create analysis_link.
        idr_step_run_uri = next(iter(idr_step_runs))
        try:
            idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair)
        except Exception as e:
            print(experiment_id, e, 'Skipping.')
            continue
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get('dx_job_id')
        except:
            logger.warning(
                "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id")
            logger.debug(idr_step_run)
            # Could try to pull it from alias.
            dx_job_id_str = None
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        if not args.released:
            dx_job = dxpy.DXJob(dx_job_id)
            job_desc = dx_job.describe()
            analysis_id = job_desc.get('analysis')
            logger.debug('%s' % (analysis_id))
            analysis = dxpy.DXAnalysis(analysis_id)
            desc = analysis.describe()
            project = desc.get('project')
            analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
                desc.get('project').split('-')[1],
                desc.get('id').split('-')[1])
        else:
            analysis_link = na
            desc = {}
        # Get IDR object.
        idr = common.encoded_get(server + idr_qc_uri, keypair)
        # Pull metrics of interest.
        idr_status = idr.get('status', na)
        if (args.released and (idr_status == na or idr_status != 'released')):
            logger.error('%s: Expected released IDR metric. Skipping.' % idr_qc_uris)
            continue
        Np = idr.get('Np', na)
        N1 = idr.get('N1', na)
        N2 = idr.get('N2', na)
        Nt = idr.get('Nt', na)
        Fp = idr.get('Fp', na)
        F1 = idr.get('F1', na)
        F2 = idr.get('F2', na)
        Ft = idr.get('Ft', na)
        quality_metric_of = idr.get('quality_metric_of', [])
        date = idr.get('date_created', na)
        rescue_ratio = idr.get('rescue_ratio', na)
        self_consistency_ratio = idr.get('self_consistency_ratio', na)
        reproducibility_test = idr.get('reproducibility_test', na)
        # Get Experiment object.
        experiment = common.encoded_get(server + experiment_id, keypair)
        experiment_link = '%sexperiments/%s' % (server,
                                                experiment.get('accession'))
        # Get Award object.
        award = common.encoded_get(server + experiment.get('award'), keypair)
        # Grab project phase, e.g. ENCODE4.
        rfa = award.get('rfa', na)
        row = {
            'date': date,
            'analysis': analysis_link,
            'analysis_id': desc.get('id', na),
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'replication': experiment.get('replication_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': rfa,
            'assembly': assembly,
            'Nt': Nt,
            'Np': Np,
            'N1': N1,
            'N2': N2,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'Ft': Ft,
            'Fp': Fp,
            'F1': F1,
            'F2': F2,
            'state': desc.get('state', na),
            'release': experiment['status'],
            'total_price': desc.get('totalPrice', na),
            'quality_metric_of': ', '.join(quality_metric_of)
        }
        writer.writerow(row)
    if args.create_google_sheet:
        args.outfile.close()
        # Load CSV data, sort.
        idr_data = pd.read_table(temp_file)
        idr_data = idr_data.replace('not_available', '')
        idr_data.date = idr_data.date.apply(lambda x: pd.to_datetime(x))
        idr_data = idr_data.sort_values(
            by=['lab', 'biosample_term_name', 'target', 'experiment'],
            ascending=[True, True, True, True])
        idr_data.date = idr_data.date.astype('str')
        idr_data = idr_data.reset_index(drop=True)
        # Read sheet title and create unique page title.
        date = datetime.now().strftime('%m_%d_%Y')
        sheet_title = (args.sheet_title if not args.released
                       else '{} Released'.format(args.sheet_title))
        page_title = '%s_IDR_FRIP_%s' % (args.assembly, date)
        # Open/create Google Sheet.
        gc = pygsheets.authorize(args.apikey)
        try:
            sh = gc.open(sheet_title)
        except pygsheets.exceptions.SpreadsheetNotFound:
            sh = gc.create(sheet_title)
        try:
            wks = sh.add_worksheet(page_title)
        except HttpError:
            wks = sh.worksheet_by_title(page_title)
        # Clear worksheet.
        wks.clear()
        # Add data from DataFrame.
        wks.set_dataframe(idr_data, copy_head=True, fit=True, start='A1')
        # Apply formatting and conditions.
        header['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, header)
        # Format numbers.
        for col in number_format_columns:
            num = idr_data.columns.get_loc(col)
            number_format['repeatCell']['range']['startColumnIndex'] = num
            number_format['repeatCell']['range']['endColumnIndex'] = num + 1
            number_format['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, number_format)
        # Resize font.
        font_size_format['repeatCell']['range']['sheetId'] = wks.id
        wks.client.sh_batch_update(wks.spreadsheet.id, font_size_format)
        # Add conditional formatting.
        for conditional in conditions:
            num = idr_data.columns.get_loc("reproducibility_test")
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['startColumnIndex'] = num
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['endColumnIndex'] = num + 1
            conditional['addConditionalFormatRule']['rule']['ranges'][0]['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, conditional)
        # Add notes to column headers.
        for k, v in notes_dict.items():
            num = idr_data.columns.get_loc(k)
            note['repeatCell']['range']['startColumnIndex'] = num
            note['repeatCell']['range']['endColumnIndex'] = num + 1
            note['repeatCell']['cell']['note'] = v
            note['repeatCell']['range']['sheetId'] = wks.id
            wks.client.sh_batch_update(wks.spreadsheet.id, note)
        # Optional. Smaller column width to match original.
        for i in range(wks.cols):
            wks.adjust_column_width(i, pixel_size=38)
        # Resize tiny columns.
        tiny_columns = ['experiment', 'analysis']
        for i in [idr_data.columns.get_loc(x) for x in tiny_columns]:
            wks.adjust_column_width(i, pixel_size=25)
        # Resize medium columns.
        medium_columns = ['replication', 'assembly', 'rfa']
        for i in [idr_data.columns.get_loc(x) for x in medium_columns]:
            wks.adjust_column_width(i, pixel_size=65)
        # Resize wide columns.
        wide_columns = ['target', 'reproducibility_test', 'lab']
        for i in [idr_data.columns.get_loc(x) for x in wide_columns]:
            wks.adjust_column_width(i, pixel_size=85)
        # Remove temp file.
        os.remove(temp_file)
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Logging level set to DEBUG")
    else:
        logger.setLevel(logging.INFO)

    if args.analysis_ids:
        ids = [i for i in args.analysis_ids if not i.startswith('#')]
    elif args.infile:
        ids = [i for i in args.infile if not i.startswith('#')]
    else:
        # Never reached because infile defaults to stdin.
        raise InputError("Must supply analysis id's in arguments or --infile")

    if not args.name:
        if len(ids) > 1:
            job_name = "batch_%s" % (timestring)
        else:
            analysis = dxpy.DXAnalysis(ids[0])
            job_name = "Accession %s" % (analysis.name)
    else:
        job_name = args.name

    # Build the dx run command for the accessioning applet token by token.
    tokens = [
        'dx run %s' % (ACCESSION_ANALYSIS_APPLET),
        '-i "outfn=%s"' % (args.outfile),
        '--destination "%s"' % (args.destination),
        '--name "%s"' % (job_name),
        '--yes'
    ]
    if args.watch:
        tokens.append('--watch')
    if args.project is not None:
        tokens.append('-i "project=%s"' % (args.project))
    if args.pipeline is not None:
        tokens.append('-i "pipeline=%s"' % (args.pipeline))
    if args.key is not None:
        tokens.append('-i "key=%s"' % (args.key))
    # if args.keyfile is not None:
    #     tokens.append('-i "keyfile=%s"' % (args.keyfile))
    if args.debug is not None:
        tokens.append('-i "debug=%s"' % (args.debug))
    if args.dryrun is not None:
        tokens.append('-i "dryrun=%s"' % (args.dryrun))
    if args.force_patch is not None:
        tokens.append('-i "force_patch=%s"' % (args.force_patch))
    if args.force_upload is not None:
        tokens.append('-i "force_upload=%s"' % (args.force_upload))
    if args.use_content_md5sum is not None:
        tokens.append('-i "use_content_md5sum=%s"' % (args.use_content_md5sum))
    if args.fqcheck is not None:
        tokens.append('-i "fqcheck=%s"' % (args.fqcheck))
    if args.accession_raw is not None:
        tokens.append('-i "accession_raw=%s"' % (args.accession_raw))
    if args.signal_only is not None:
        tokens.append('-i "signal_only=%s"' % (args.signal_only))
    if args.skip_control is not None:
        tokens.append('-i "skip_control=%s"' % (args.skip_control))
    if args.encoded_check is not None:
        tokens.append('-i "encoded_check=%s"' % (args.encoded_check))

    # Pass every analysis ID as a separate array input.
    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith('#'):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug('%s' % (analysis_id))
        tokens.append('-i "analysis_ids=%s"' % (analysis_id))

    command_string = ' '.join(tokens)
    logger.debug(command_string)
    subprocess.check_call(shlex.split(command_string))
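For reference, a rough sketch of the command the loop above assembles; the applet path, output file name, and analysis ID are illustrative placeholders, and the exact -i inputs depend on which optional flags were supplied:

# Example of the resulting command (values are placeholders only):
#   dx run /applets/accession_analysis -i "outfn=accession_log.tsv" \
#       --destination "/accession" --name "Accession ENCSR123ABC Peaks" --yes \
#       -i "debug=False" -i "dryrun=False" \
#       -i "analysis_ids=analysis-xxxx"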
def main(token):
    # Configure dxpy authentication.
    dxpy.set_security_context({
        'auth_token_type': 'Bearer',
        'auth_token': token})

    # Resolve FACTORY_PROJECT by ID.
    proj = dxpy.DXProject(FACTORY_PROJECT)
    print 'Resolved project:', proj.describe()['name'], proj.get_id()

    # Set FACTORY_PROJECT as the workspace for subsequent operations
    # (sort of like the current working directory).
    dxpy.set_workspace_id(FACTORY_PROJECT)

    # Resolve the workflow by name. (Could also store the ID, like the project.)
    wf = list(dxpy.search.find_data_objects(classname="workflow",
                                            name="RNA-seq pipeline",
                                            return_handler=True))[0]
    print 'Resolved workflow:', wf.describe()['name'], wf.get_id()

    # TODO: Stage the inputs. Here we find them in the IN folder.
    left_reads = list(dxpy.search.find_data_objects(classname="file",
                                                    name="ENCFF001JPX.1k.fastq.gz",
                                                    folder="/IN",
                                                    return_handler=True))[0]
    print 'Resolved left reads:', left_reads.describe()['name'], left_reads.get_id()
    right_reads = list(dxpy.search.find_data_objects(classname="file",
                                                     name="ENCFF001JQB.1k.fastq.gz",
                                                     folder="/IN",
                                                     return_handler=True))[0]
    print 'Resolved right reads:', right_reads.describe()['name'], right_reads.get_id()

    # Launch the workflow.
    analysis = wf.run({
        '0.fastqs': [dxpy.dxlink(left_reads.get_id())],
        '0.fastq_pairs': [dxpy.dxlink(right_reads.get_id())]})
    print 'Launched analysis:', analysis.get_id()
    print 'Analysis state:', analysis.describe()['state']

    # TODO: Poll for (or come back when) analysis state 'done' or 'failed'.
    # Handle any failures.

    # Cooking-show-style substitution with a completed analysis.
    analysis = dxpy.DXAnalysis(COMPLETED_ANALYSIS)
    print 'Analysis state:', analysis.describe()['state']

    # Enumerate outputs.
    print 'Analysis outputs:'
    for one_output_name, one_output_link in analysis.describe()['output'].iteritems():
        one_output = dxpy.get_handler(one_output_link)  # one_output : dxpy.DXFile
        one_file_name = one_output.describe()['name']
        one_file_url, _ = one_output.get_download_url(preauthenticated=True,
                                                      filename=one_file_name)
        print one_file_name, one_file_url
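The polling TODO in main(token) is left open above. A minimal sketch, assuming blocking in-process is acceptable; wait_for_analysis is a hypothetical helper (not part of the original script), while DXAnalysis.wait_on_done and DXJobFailureError are standard dxpy calls:

def wait_for_analysis(analysis):
    '''Sketch of the polling step the TODO above leaves open: block until the
    analysis reaches a terminal state and report failures.'''
    try:
        # wait_on_done() polls the platform until the analysis is 'done' and
        # raises DXJobFailureError if it fails or is terminated.
        analysis.wait_on_done()
    except dxpy.exceptions.DXJobFailureError as e:
        print 'Analysis failed:', analysis.get_id(), str(e)
        return False
    print 'Analysis done:', analysis.get_id()
    return True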
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    elif args.created_after:
        analyses = []
        for state in args.state:
            analyses.extend(dxpy.find_analyses(name="ENCSR*",
                                               name_mode='glob',
                                               state=state,
                                               include_subjobs=True,
                                               return_handler=True,
                                               created_after="%s" % (args.created_after)))
        ids = [analysis.get_id() for analysis in analyses
               if analysis.describe()['executableName'] == 'tf_chip_seq'
               or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError("Must supply analysis id's in arguments, --infile,"
                         " or a search string in --created_after")

    fieldnames = [
        'date', 'analysis', 'experiment', 'target', 'biosample_term_name',
        'biosample_type', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2',
        'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test',
        'state', 'total price', 'notes'
    ]
    writer = csv.DictWriter(sys.stdout,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith('#'):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug('%s' % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')

        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue

        experiment = common.encoded_get(
            urlparse.urljoin(server, '/experiments/%s' % (experiment_accession)),
            keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue

        try:
            idr_stage = next(s['execution'] for s in desc['stages']
                             if s['execution']['name'] == "Final IDR peak calls")
        except:
            logging.error('Failed to find final IDR stage in %s' % (analysis_id))
        else:
            if idr_stage['state'] != 'done':
                # Final IDR peak calls stage not done, so loop through
                # intermediate IDR stages to find errors.
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                # NOTE: this list contains a mis-spelled form of "IDR Pooled
                # Pseudoreplicates" because until 11/13/15 the pipeline stage
                # name was misspelled; we need to be able to report on those runs.
                idr_stage_names = [
                    'IDR True Replicates',
                    'IDR Rep 1 Self-pseudoreplicates',
                    'IDR Rep 2 Self-pseudoreplicates',
                    'IDR Pooled Pseudoreplicates',
                    'IDR Pooled Pseudoeplicates'
                ]
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(s['execution'] for s in desc['stages']
                                         if s['execution']['name'] == stage_name)
                    except StopIteration:
                        continue
                    except:
                        raise
                    if idr_stage['state'] == 'failed':
                        try:
                            job_log = subprocess.check_output('dx watch %s' % (idr_stage['id']),
                                                              shell=True,
                                                              stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            job_log = e.output
                        else:
                            job_log = None
                        if job_log:
                            patterns = [r'Peak files must contain at least 20 peaks post-merge']
                            for p in patterns:
                                m = re.search(p, job_log)
                                if m:
                                    notes.append("%s: %s" % (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage['failureMessage'])
                try:
                    done_time = next(transition['setAt']
                                     for transition in desc['stateTransitions']
                                     if transition['newState'] == "failed")
                except StopIteration:
                    done_time = "Not done or failed"
                except:
                    raise
            else:
                Np = idr_stage['output'].get('Np')
                N1 = idr_stage['output'].get('N1')
                N2 = idr_stage['output'].get('N2')
                Nt = idr_stage['output'].get('Nt')
                rescue_ratio = idr_stage['output'].get('rescue_ratio')
                self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio')
                reproducibility_test = idr_stage['output'].get('reproducibility_test')
                notes = "IDR Complete"
                done_time = next(transition['setAt']
                                 for transition in desc['stateTransitions']
                                 if transition['newState'] == "done")

        if done_time:
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(done_time / 1000))
        else:
            date = "Running"
        analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
            desc.get('project').split('-')[1],
            desc.get('id').split('-')[1])
        experiment_link = 'https://www.encodeproject.org/experiments/%s' % (
            experiment.get('accession'))
        row = {
            'date': date,
            'analysis': analysis_link,
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': common.encoded_get(server + experiment.get('award'),
                                      keypair).get('rfa'),
            'assembly': args.assembly,  # TODO: derive this from the analysis
            'Np': Np,
            'N1': N1,
            'N2': N2,
            'Nt': Nt,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'state': desc.get('state'),
            'total price': desc.get('totalPrice')
        }
        if notes:
            row.update({'notes': '%s' % (notes)})
        else:
            row.update({'notes': '%s' % ('OK')})
        # log = subprocess.check_output('dx watch %s' % (analysis_id))
        writer.writerow(row)
def main():
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.experiments:
        ids = args.experiments
    # elif args.created_after:
    #     analyses = []
    #     for state in args.state:
    #         analyses.extend(dxpy.find_analyses(name="ENCSR*", name_mode='glob', state=state, include_subjobs=True, return_handler=True, created_after="%s" % (args.created_after)))
    #     ids = [analysis.get_id() for analysis in analyses if analysis.describe()['executableName'] == 'tf_chip_seq' or analysis.describe()['executableName'].startswith('ENCSR783QUL Peaks')]
    elif args.all:
        exp_query = \
            "/search/?type=Experiment" + \
            "&assay_title=ChIP-seq" + \
            "&award.project=ENCODE" + \
            "&status=released&status=submitted&status=in+progress&status=started&status=release+ready"
        all_experiments = common.encoded_get(server + exp_query, keypair)['@graph']
        ids = [exp.get('accession') for exp in all_experiments]
    elif args.infile:
        ids = args.infile
    else:
        # Never reached because infile defaults to stdin.
        raise InputError("Must supply experiment id's in arguments or --infile")

    fieldnames = [
        'date', 'analysis', 'analysis id', 'experiment', 'target',
        'biosample_term_name', 'biosample_type', 'lab', 'rfa', 'assembly',
        'Nt', 'Np', 'N1', 'N2', 'rescue_ratio', 'self_consistency_ratio',
        'reproducibility_test', 'state', 'release', 'total price', 'notes'
    ]
    writer = csv.DictWriter(sys.stdout,
                            fieldnames=fieldnames,
                            delimiter='\t',
                            quotechar='"')
    writer.writeheader()

    idr_query = \
        "/search/?type=File" + \
        "&file_format=bed" + \
        "&output_type=optimal+idr+thresholded+peaks" + \
        "&output_type=conservative+idr+thresholded+peaks" + \
        "&lab.title=ENCODE+Processing+Pipeline" + \
        "&lab.title=J.+Michael+Cherry,+Stanford" + \
        "&status=in+progress&status=released&status=uploading&status=uploaded"
    all_idr_files = common.encoded_get(server + idr_query, keypair)['@graph']

    for (i, experiment_id) in enumerate(ids):
        if experiment_id.startswith('#'):
            continue
        experiment_id = experiment_id.rstrip()
        experiment_uri = '/experiments/%s/' % (experiment_id)
        idr_files = \
            [f for f in all_idr_files if f['dataset'] == experiment_uri]
        idr_step_runs = set([f.get('step_run') for f in idr_files])
        if not len(idr_step_runs) == 1:
            if not args.all:
                logger.warning("%s: Expected one IDR step run. Found %d. Skipping"
                               % (experiment_id, len(idr_step_runs)))
            continue

        idr_qc_uris = []
        assemblies = []
        for f in idr_files:
            quality_metrics = f.get('quality_metrics')
            if not len(quality_metrics) == 1:
                logger.error('%s: Expected one IDR quality metric for file %s. Found %d.'
                             % (experiment_id,
                                f.get('accession'),
                                len(quality_metrics)))
            idr_qc_uris.extend(quality_metrics)
            assembly = f.get('assembly')
            if not assembly:
                logger.error('%s: File %s has no assembly'
                             % (experiment_id, f.get('accession')))
            assemblies.append(assembly)
        idr_qc_uris = set(idr_qc_uris)
        if not len(idr_qc_uris) == 1:
            logger.error('%s: Expected one unique IDR metric, found %d. Skipping.'
                         % (experiment_id, len(idr_qc_uris)))
            continue
        assemblies = set(assemblies)
        if not len(assemblies) == 1:
            logger.error('%s: Expected one unique assembly, found %d. Skipping.'
                         % (experiment_id, len(assemblies)))
            continue
        assembly = next(iter(assemblies))

        idr_step_run_uri = next(iter(idr_step_runs))
        idr_step_run = common.encoded_get(server + idr_step_run_uri, keypair)
        try:
            dx_job_id_str = idr_step_run.get('dx_applet_details')[0].get('dx_job_id')
        except:
            logger.warning(
                "Failed to get dx_job_id from step_run.dx_applet_details.dx_job_id")
            logger.debug(idr_step_run)
            dx_job_id_str = None  # could try to pull it from alias
        dx_job_id = dx_job_id_str.rpartition(':')[2]
        dx_job = dxpy.DXJob(dx_job_id)
        job_desc = dx_job.describe()
        analysis_id = job_desc.get('analysis')
        logger.debug('%s' % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')

        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue

        if args.all:
            # We've already gotten all the experiment objects.
            experiment = \
                next(e for e in all_experiments
                     if e['accession'] == experiment_accession)
        else:
            experiment = \
                common.encoded_get(urlparse.urljoin(
                    server,
                    '/experiments/%s' % (experiment_accession)), keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue

        try:
            idr_stage = next(s['execution'] for s in desc['stages']
                             if s['execution']['name'] == "Final IDR peak calls")
        except:
            logging.error('Failed to find final IDR stage in %s' % (analysis_id))
        else:
            if idr_stage['state'] != 'done':
                # Final IDR peak calls stage not done, so loop through
                # intermediate IDR stages to find errors.
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                # NOTE: this list contains a mis-spelled form of "IDR Pooled
                # Pseudoreplicates" because until 11/13/15 the pipeline stage
                # name was misspelled; we need to be able to report on those runs.
                idr_stage_names = [
                    'IDR True Replicates',
                    'IDR Rep 1 Self-pseudoreplicates',
                    'IDR Rep 2 Self-pseudoreplicates',
                    'IDR Pooled Pseudoreplicates',
                    'IDR Pooled Pseudoeplicates'
                ]
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(s['execution'] for s in desc['stages']
                                         if s['execution']['name'] == stage_name)
                    except StopIteration:
                        continue
                    except:
                        raise
                    if idr_stage['state'] == 'failed':
                        try:
                            job_log = subprocess.check_output('dx watch %s' % (idr_stage['id']),
                                                              shell=True,
                                                              stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            job_log = e.output
                        else:
                            job_log = None
                        if job_log:
                            patterns = [r'Peak files must contain at least 20 peaks post-merge']
                            for p in patterns:
                                m = re.search(p, job_log)
                                if m:
                                    notes.append("%s: %s" % (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage['failureMessage'])
                try:
                    done_time = next(transition['setAt']
                                     for transition in desc['stateTransitions']
                                     if transition['newState'] == "failed")
                except StopIteration:
                    done_time = "Not done or failed"
                except:
                    raise
            else:
                Np = idr_stage['output'].get('Np')
                N1 = idr_stage['output'].get('N1')
                N2 = idr_stage['output'].get('N2')
                Nt = idr_stage['output'].get('Nt')
                rescue_ratio = idr_stage['output'].get('rescue_ratio')
                self_consistency_ratio = idr_stage['output'].get('self_consistency_ratio')
                reproducibility_test = idr_stage['output'].get('reproducibility_test')
                notes = "IDR Complete"
                done_time = next(transition['setAt']
                                 for transition in desc['stateTransitions']
                                 if transition['newState'] == "done")

        if done_time:
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(done_time / 1000))
        else:
            date = "Running"
        analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
            desc.get('project').split('-')[1],
            desc.get('id').split('-')[1])
        experiment_link = 'https://www.encodeproject.org/experiments/%s' % (
            experiment.get('accession'))
        row = {
            'date': date,
            'analysis': analysis_link,
            'analysis id': desc.get('id'),
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': common.encoded_get(server + experiment.get('award'),
                                      keypair).get('rfa'),
            'assembly': assembly,
            'Np': Np,
            'N1': N1,
            'N2': N2,
            'Nt': Nt,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'state': desc.get('state'),
            'release': experiment['status'],
            'total price': desc.get('totalPrice')
        }
        if notes:
            row.update({'notes': '%s' % (notes)})
        else:
            row.update({'notes': '%s' % ('OK')})
        # log = subprocess.check_output('dx watch %s' % (analysis_id))
        writer.writerow(row)