def get_analysis_time(accession, repstr, project):
    """Return the wall-clock run time, in seconds, of the single 'done'
    DNAnexus analysis whose name glob-matches accession + repstr.

    Returns the sentinel -999.99 when zero or more than one matching
    analysis is found (logged as a warning to stdout).
    """
    result = list(dxpy.find_analyses(
        project=project,
        name='*' + accession + repstr + '*',
        name_mode='glob',
        describe=True,
        state='done'))
    if len(result) != 1:
        # Parenthesized print works under both Python 2 and Python 3
        # (the original used the Python-2-only print statement).
        print("WARN: No single (%s) analysis found for %s%s"
              % (len(result), accession, repstr))
        return -999.99
    anl = result[0]['describe']
    start = anl['created']
    # Completion time is the timestamp of the transition into 'done'.
    finish = [t['setAt'] for t in anl['stateTransitions']
              if t['newState'] == 'done'][0]
    return (finish - start) / 1000.0  # convert ms to secs
def main():
    """Write a TSV report of IDR QC metrics for ENCODE ChIP-seq analyses
    run on DNAnexus to stdout, one row per analysis.

    Analysis IDs come from explicit arguments, a created-after search
    (filtered by pipeline executable name), or an input file that
    defaults to stdin.
    """
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    elif args.created_after:
        analyses = []
        for state in args.state:
            analyses.extend(
                dxpy.find_analyses(
                    name="ENCSR*",
                    name_mode='glob',
                    state=state,
                    include_subjobs=True,
                    return_handler=True,
                    created_after="%s" % (args.created_after)))
        ids = [
            analysis.get_id() for analysis in analyses
            if analysis.describe()['executableName'] == 'tf_chip_seq'
            or analysis.describe()['executableName'].startswith(
                'ENCSR783QUL Peaks')]
    elif args.infile:
        ids = args.infile
    else:
        # never reached because infile defaults to stdin (typo fixed)
        raise InputError(
            "Must supply analysis id's in arguments, --infile or supply search string in --created_after")

    fieldnames = [
        'date', 'analysis', 'experiment', 'target', 'biosample_term_name',
        'biosample_type', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2',
        'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test',
        'state', 'total price', 'notes']
    writer = csv.DictWriter(
        sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith('#'):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug('%s' % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')
        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue
        experiment = common.encoded_get(
            urlparse.urljoin(
                server, '/experiments/%s' % (experiment_accession)),
            keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue
        try:
            idr_stage = next(
                s['execution'] for s in desc['stages']
                if s['execution']['name'] == "Final IDR peak calls")
        except StopIteration:
            # fix: was a bare except: that also used the root logger
            # instead of the module logger
            logger.error(
                'Failed to find final IDR stage in %s' % (analysis_id))
        else:
            done_time = None  # fix: ensure bound on every path below
            if idr_stage['state'] != 'done':
                # Final IDR peak calls stage not done, so loop through
                # the intermediate IDR stages to find errors to report.
                Np = N1 = N2 = Nt = rescue_ratio = \
                    self_consistency_ratio = reproducibility_test = None
                notes = []
                # The misspelled 'Pseudoeplicates' entry matches runs from
                # before 11/13/15 when the pipeline stage name itself was
                # misspelled - we still need to report on those runs.
                idr_stage_names = [
                    'IDR True Replicates',
                    'IDR Rep 1 Self-pseudoreplicates',
                    'IDR Rep 2 Self-pseudoreplicates',
                    'IDR Pooled Pseudoreplicates',
                    'IDR Pooled Pseudoeplicates']
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(
                            s['execution'] for s in desc['stages']
                            if s['execution']['name'] == stage_name)
                    except StopIteration:
                        continue
                    if idr_stage['state'] == 'failed':
                        try:
                            job_log = subprocess.check_output(
                                'dx watch %s' % (idr_stage['id']),
                                shell=True, stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            # 'dx watch' exits nonzero for failed jobs;
                            # the log text is still in e.output
                            job_log = e.output
                    else:
                        job_log = None
                    if job_log:
                        # Scan the job log for known failure signatures.
                        patterns = [
                            r'Peak files must contain at least 20 peaks post-merge']
                        for p in patterns:
                            m = re.search(p, job_log)
                            if m:
                                notes.append(
                                    "%s: %s" % (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage['failureMessage'])
                try:
                    done_time = next(
                        transition['setAt']
                        for transition in desc['stateTransitions']
                        if transition['newState'] == "failed")
                except StopIteration:
                    # fix: was the string "Not done or failed", which is
                    # truthy and crashed the done_time / 1000 arithmetic
                    # below; None now yields date == "Running"
                    done_time = None
            else:
                Np = idr_stage['output'].get('Np')
                N1 = idr_stage['output'].get('N1')
                N2 = idr_stage['output'].get('N2')
                Nt = idr_stage['output'].get('Nt')
                rescue_ratio = idr_stage['output'].get('rescue_ratio')
                self_consistency_ratio = idr_stage['output'].get(
                    'self_consistency_ratio')
                reproducibility_test = idr_stage['output'].get(
                    'reproducibility_test')
                notes = "IDR Complete"
                done_time = next(
                    transition['setAt']
                    for transition in desc['stateTransitions']
                    if transition['newState'] == "done")
            if done_time:
                # DNAnexus timestamps are in milliseconds since the epoch.
                date = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(done_time / 1000))
            else:
                date = "Running"
            analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
                desc.get('project').split('-')[1],
                desc.get('id').split('-')[1])
            experiment_link = 'https://www.encodeproject.org/experiments/%s' % (
                experiment.get('accession'))
            row = {
                'date': date,
                'analysis': analysis_link,
                'experiment': experiment_link,
                'target': experiment['target'].split('/')[2],
                'biosample_term_name': experiment.get('biosample_term_name'),
                'biosample_type': experiment.get('biosample_type'),
                'lab': experiment['lab'].split('/')[2],
                'rfa': common.encoded_get(
                    server + experiment.get('award'), keypair).get('rfa'),
                'assembly': args.assembly,  # TODO: derive from the analysis
                'Np': Np,
                'N1': N1,
                'N2': N2,
                'Nt': Nt,
                'rescue_ratio': rescue_ratio,
                'self_consistency_ratio': self_consistency_ratio,
                'reproducibility_test': reproducibility_test,
                'state': desc.get('state'),
                'total price': desc.get('totalPrice')}
            row['notes'] = '%s' % (notes) if notes else 'OK'
            writer.writerow(row)
def main():
    """Print a tab-separated IDR QC report, one row per DNAnexus analysis.

    IDs to report on come from the command line, a created-after search
    over the requested states, or an input file (default: stdin).
    """
    args = get_args()
    logger.setLevel(logging.DEBUG if args.debug else logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    elif args.created_after:
        handlers = []
        for state in args.state:
            handlers.extend(dxpy.find_analyses(
                name="ENCSR*", name_mode='glob', state=state,
                include_subjobs=True, return_handler=True,
                created_after="%s" % (args.created_after)))
        ids = []
        for handler in handlers:
            exe_name = handler.describe()['executableName']
            if exe_name == 'tf_chip_seq' or exe_name.startswith('ENCSR783QUL Peaks'):
                ids.append(handler.get_id())
    elif args.infile:
        ids = args.infile
    else:
        # unreachable in practice: infile defaults to stdin
        raise InputError("Must supply analysis id's in arguments, --infile or supply search string in --created_after")

    fieldnames = [
        'date', 'analysis', 'experiment', 'target', 'biosample_term_name',
        'biosample_type', 'lab', 'rfa', 'assembly',
        'Nt', 'Np', 'N1', 'N2', 'rescue_ratio', 'self_consistency_ratio',
        'reproducibility_test', 'state', 'total price', 'notes']
    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames,
                            delimiter='\t', quotechar='"')
    writer.writeheader()

    for analysis_id in ids:
        if analysis_id.startswith('#'):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug('%s' % (analysis_id))
        desc = dxpy.DXAnalysis(analysis_id).describe()
        accession_match = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks',
                                   desc['name'])
        if not accession_match:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue
        experiment_accession = accession_match.group(1)
        experiment = common.encoded_get(
            urlparse.urljoin(server,
                             '/experiments/%s' % (experiment_accession)),
            keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        if args.lab and experiment['lab'].split('/')[2] not in args.lab:
            continue
        try:
            idr_stage = next(s['execution'] for s in desc['stages']
                             if s['execution']['name'] == "Final IDR peak calls")
        except:
            # deliberately broad: any failure to locate the stage skips the row
            logging.error('Failed to find final IDR stage in %s' % (analysis_id))
            continue
        if idr_stage['state'] != 'done':
            # The final IDR stage is unfinished: walk the intermediate IDR
            # stages looking for a failure worth reporting.
            Np = N1 = N2 = Nt = None
            rescue_ratio = self_consistency_ratio = reproducibility_test = None
            notes = []
            # The misspelled 'Pseudoeplicates' entry matches runs from before
            # 11/13/15, when the pipeline stage name itself was misspelled.
            stage_names = ['IDR True Replicates',
                           'IDR Rep 1 Self-pseudoreplicates',
                           'IDR Rep 2 Self-pseudoreplicates',
                           'IDR Pooled Pseudoreplicates',
                           'IDR Pooled Pseudoeplicates']
            for stage_name in stage_names:
                try:
                    idr_stage = next(s['execution'] for s in desc['stages']
                                     if s['execution']['name'] == stage_name)
                except StopIteration:
                    continue
                except:
                    raise
                job_log = None
                if idr_stage['state'] == 'failed':
                    try:
                        job_log = subprocess.check_output(
                            'dx watch %s' % (idr_stage['id']),
                            shell=True, stderr=subprocess.STDOUT)
                    except subprocess.CalledProcessError as e:
                        job_log = e.output
                if job_log:
                    for pattern in [r'Peak files must contain at least 20 peaks post-merge']:
                        hit = re.search(pattern, job_log)
                        if hit:
                            notes.append("%s: %s" % (stage_name, hit.group(0)))
                    if not notes:
                        notes.append(idr_stage['failureMessage'])
            try:
                done_time = next(t['setAt'] for t in desc['stateTransitions']
                                 if t['newState'] == "failed")
            except StopIteration:
                done_time = "Not done or failed"
            except:
                raise
        else:
            stage_output = idr_stage['output']
            Np = stage_output.get('Np')
            N1 = stage_output.get('N1')
            N2 = stage_output.get('N2')
            Nt = stage_output.get('Nt')
            rescue_ratio = stage_output.get('rescue_ratio')
            self_consistency_ratio = stage_output.get('self_consistency_ratio')
            reproducibility_test = stage_output.get('reproducibility_test')
            notes = "IDR Complete"
            done_time = next(t['setAt'] for t in desc['stateTransitions']
                             if t['newState'] == "done")
        if done_time:
            date = time.strftime("%Y-%m-%d %H:%M:%S",
                                 time.localtime(done_time / 1000))
        else:
            date = "Running"
        analysis_link = 'https://platform.dnanexus.com/projects/%s/monitor/analysis/%s' % (
            desc.get('project').split('-')[1], desc.get('id').split('-')[1])
        experiment_link = 'https://www.encodeproject.org/experiments/%s' % (
            experiment.get('accession'))
        row = {
            'date': date,
            'analysis': analysis_link,
            'experiment': experiment_link,
            'target': experiment['target'].split('/')[2],
            'biosample_term_name': experiment.get('biosample_term_name'),
            'biosample_type': experiment.get('biosample_type'),
            'lab': experiment['lab'].split('/')[2],
            'rfa': common.encoded_get(server + experiment.get('award'),
                                      keypair).get('rfa'),
            'assembly': args.assembly,  # TODO: derive from the analysis
            'Np': Np, 'N1': N1, 'N2': N2, 'Nt': Nt,
            'rescue_ratio': rescue_ratio,
            'self_consistency_ratio': self_consistency_ratio,
            'reproducibility_test': reproducibility_test,
            'state': desc.get('state'),
            'total price': desc.get('totalPrice'),
            'notes': '%s' % (notes) if notes else '%s' % ('OK'),
        }
        writer.writerow(row)
def main():
    """Write a TSV of IDR QC metrics for 'done' tf_chip_seq analyses to
    stdout, one row per analysis.

    Analysis IDs come from explicit arguments, a created-after search of
    completed analyses, or an input file that defaults to stdin.
    """
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    elif args.created_after:
        analyses = dxpy.find_analyses(
            name="ENCSR*", name_mode='glob', state='done',
            include_subjobs=True, return_handler=True,
            created_after="%s" % (args.created_after))
        ids = [analysis.get_id() for analysis in analyses
               if analysis.describe()['executableName'] == 'tf_chip_seq']
    elif args.infile:
        ids = args.infile
    else:
        # never reached because infile defaults to stdin (typo fixed)
        raise InputError(
            "Must supply analysis id's in arguments, --infile or supply search string in --created_after")

    fieldnames = [
        'date', 'analysis', 'experiment', 'target', 'biosample_term_name',
        'biosample_type', 'lab', 'rfa', 'assembly', 'Nt', 'Np', 'N1', 'N2',
        'rescue_ratio', 'self_consistency_ratio', 'reproducibility_test',
        'state', 'total price', 'notes']
    writer = csv.DictWriter(
        sys.stdout, fieldnames=fieldnames, delimiter='\t', quotechar='"')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith('#'):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug('%s' % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get('project')
        m = re.match('^(ENCSR[0-9]{3}[A-Z]{3}) Peaks', desc['name'])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc['name']))
            continue
        experiment = common.encoded_get(
            urlparse.urljoin(
                server, '/experiments/%s' % (experiment_accession)),
            keypair)
        logger.debug('ENCODEd experiment %s' % (experiment['accession']))
        try:
            idr_stage = next(
                s['execution'] for s in desc['stages']
                if s['execution']['name'] == "Final IDR peak calls")
        except StopIteration:
            # fix: was a bare except: that also used the root logger
            logger.error('Failed to find IDR stage in %s' % (analysis_id))
        else:
            Np = idr_stage['output'].get('Np')
            N1 = idr_stage['output'].get('N1')
            N2 = idr_stage['output'].get('N2')
            Nt = idr_stage['output'].get('Nt')
            rescue_ratio = idr_stage['output'].get('rescue_ratio')
            self_consistency_ratio = idr_stage['output'].get(
                'self_consistency_ratio')
            reproducibility_test = idr_stage['output'].get(
                'reproducibility_test')
            # fix: an analysis with no 'done' transition previously raised
            # an unhandled StopIteration mid-report; fall back to "Running".
            try:
                done_time = next(
                    transition['setAt']
                    for transition in desc['stateTransitions']
                    if transition['newState'] == "done")
            except StopIteration:
                date = "Running"
            else:
                # DNAnexus timestamps are milliseconds since the epoch.
                date = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(done_time / 1000))
            row = {
                'date': date,
                'analysis': analysis.get_id(),
                'experiment': experiment.get('accession'),
                'target': experiment['target'].split('/')[2],
                'biosample_term_name': experiment.get('biosample_term_name'),
                'biosample_type': experiment.get('biosample_type'),
                'lab': experiment['lab'].split('/')[2],
                'rfa': common.encoded_get(
                    server + experiment.get('award'), keypair).get('rfa'),
                'assembly': args.assembly,  # TODO: derive from the analysis
                'Np': Np,
                'N1': N1,
                'N2': N2,
                'Nt': Nt,
                'rescue_ratio': rescue_ratio,
                'self_consistency_ratio': self_consistency_ratio,
                'reproducibility_test': reproducibility_test,
                'state': desc.get('state'),
                'total price': desc.get('totalPrice')}
            # Placeholder for future QC checks (dead commented-out
            # npeaks/bigBed checks removed); notes stays empty until a
            # check populates it, so the column reads 'OK'.
            notes = []
            row['notes'] = '%s' % (notes) if notes else 'OK'
            writer.writerow(row)
def main():
    """Write a TSV report of IDR QC metrics for ENCODE ChIP-seq analyses
    run on DNAnexus to stdout, one row per analysis.

    This variant also reports the analysis name and the experiment's
    release status. IDs come from explicit arguments, a created-after
    search, or an input file that defaults to stdin.
    """
    args = get_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)

    if args.analysis_ids:
        ids = args.analysis_ids
    elif args.created_after:
        analyses = []
        for state in args.state:
            analyses.extend(
                dxpy.find_analyses(
                    name="ENCSR*",
                    name_mode="glob",
                    state=state,
                    include_subjobs=True,
                    return_handler=True,
                    created_after="%s" % (args.created_after),
                )
            )
        ids = [
            analysis.get_id()
            for analysis in analyses
            if analysis.describe()["executableName"] == "tf_chip_seq"
            or analysis.describe()["executableName"].startswith("ENCSR783QUL Peaks")
        ]
    elif args.infile:
        ids = args.infile
    else:
        # never reached because infile defaults to stdin (typo fixed)
        raise InputError("Must supply analysis id's in arguments, --infile or supply search string in --created_after")

    fieldnames = [
        "name", "date", "analysis", "experiment", "target",
        "biosample_term_name", "biosample_type", "lab", "rfa", "assembly",
        "Nt", "Np", "N1", "N2", "rescue_ratio", "self_consistency_ratio",
        "reproducibility_test", "state", "release", "total price", "notes",
    ]
    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames, delimiter="\t", quotechar='"')
    writer.writeheader()

    for (i, analysis_id) in enumerate(ids):
        if analysis_id.startswith("#"):
            continue
        analysis_id = analysis_id.rstrip()
        logger.debug("%s" % (analysis_id))
        analysis = dxpy.DXAnalysis(analysis_id)
        desc = analysis.describe()
        project = desc.get("project")
        m = re.match("^(ENCSR[0-9]{3}[A-Z]{3}) Peaks", desc["name"])
        if m:
            experiment_accession = m.group(1)
        else:
            logger.error("No accession in %s, skipping." % (desc["name"]))
            continue
        experiment = common.encoded_get(
            urlparse.urljoin(server, "/experiments/%s" % (experiment_accession)), keypair)
        logger.debug("ENCODEd experiment %s" % (experiment["accession"]))
        if args.lab and experiment["lab"].split("/")[2] not in args.lab:
            continue
        try:
            idr_stage = next(
                s["execution"] for s in desc["stages"]
                if s["execution"]["name"] == "Final IDR peak calls")
        except StopIteration:
            # fix: was a bare except that also used the root logger
            logger.error("Failed to find final IDR stage in %s" % (analysis_id))
        else:
            done_time = None  # fix: ensure bound on every path below
            if idr_stage["state"] != "done":
                # Final IDR peak calls stage not done; scan the
                # intermediate IDR stages for failures to report.
                Np = N1 = N2 = Nt = rescue_ratio = self_consistency_ratio = reproducibility_test = None
                notes = []
                # The misspelled "Pseudoeplicates" entry matches runs from
                # before 11/13/15 when the stage name itself was misspelled.
                idr_stage_names = [
                    "IDR True Replicates",
                    "IDR Rep 1 Self-pseudoreplicates",
                    "IDR Rep 2 Self-pseudoreplicates",
                    "IDR Pooled Pseudoreplicates",
                    "IDR Pooled Pseudoeplicates",
                ]
                for stage_name in idr_stage_names:
                    try:
                        idr_stage = next(
                            s["execution"] for s in desc["stages"]
                            if s["execution"]["name"] == stage_name)
                    except StopIteration:
                        continue
                    if idr_stage["state"] == "failed":
                        try:
                            job_log = subprocess.check_output(
                                "dx watch %s" % (idr_stage["id"]),
                                shell=True, stderr=subprocess.STDOUT)
                        except subprocess.CalledProcessError as e:
                            # 'dx watch' exits nonzero for failed jobs;
                            # the log text is still in e.output
                            job_log = e.output
                    else:
                        job_log = None
                    if job_log:
                        # Scan the job log for known failure signatures.
                        patterns = [r"Peak files must contain at least 20 peaks post-merge"]
                        for p in patterns:
                            m = re.search(p, job_log)
                            if m:
                                notes.append("%s: %s" % (stage_name, m.group(0)))
                        if not notes:
                            notes.append(idr_stage["failureMessage"])
                try:
                    done_time = next(
                        transition["setAt"]
                        for transition in desc["stateTransitions"]
                        if transition["newState"] == "failed")
                except StopIteration:
                    # fix: was the string "Not done or failed", which is
                    # truthy and crashed the done_time / 1000 arithmetic
                    # below; None now yields date == "Running"
                    done_time = None
            else:
                Np = idr_stage["output"].get("Np")
                N1 = idr_stage["output"].get("N1")
                N2 = idr_stage["output"].get("N2")
                Nt = idr_stage["output"].get("Nt")
                rescue_ratio = idr_stage["output"].get("rescue_ratio")
                self_consistency_ratio = idr_stage["output"].get("self_consistency_ratio")
                reproducibility_test = idr_stage["output"].get("reproducibility_test")
                notes = "IDR Complete"
                done_time = next(
                    transition["setAt"]
                    for transition in desc["stateTransitions"]
                    if transition["newState"] == "done")
            if done_time:
                # DNAnexus timestamps are milliseconds since the epoch.
                date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(done_time / 1000))
            else:
                date = "Running"
            analysis_link = "https://platform.dnanexus.com/projects/%s/monitor/analysis/%s" % (
                desc.get("project").split("-")[1],
                desc.get("id").split("-")[1],
            )
            experiment_link = "https://www.encodeproject.org/experiments/%s" % (experiment.get("accession"))
            row = {
                "name": desc.get("name"),
                "date": date,
                "analysis": analysis_link,
                "experiment": experiment_link,
                "target": experiment["target"].split("/")[2],
                "biosample_term_name": experiment.get("biosample_term_name"),
                "biosample_type": experiment.get("biosample_type"),
                "lab": experiment["lab"].split("/")[2],
                "rfa": common.encoded_get(server + experiment.get("award"), keypair).get("rfa"),
                "assembly": args.assembly,  # TODO: derive from the analysis
                "Np": Np,
                "N1": N1,
                "N2": N2,
                "Nt": Nt,
                "rescue_ratio": rescue_ratio,
                "self_consistency_ratio": self_consistency_ratio,
                "reproducibility_test": reproducibility_test,
                "state": desc.get("state"),
                "release": experiment["status"],
                "total price": desc.get("totalPrice"),
            }
            row["notes"] = "%s" % (notes) if notes else "OK"
            writer.writerow(row)