def find_control_file(self,rep,default=None): '''Attempts to find an appropriate control file.''' # TODO Make more generic and move to dxencode.py when needed. (AUTHID,AUTHPW,SERVER) = dxencode.processkey(self.server_key) if 'controls' not in rep: return None for file_key in rep['controls']: if isinstance(file_key,list): file_key = file_key[0] file_obj = dxencode.enc_lookup_json(file_key,self.server_key,frame='embedded') rep_id = file_obj["replicate"]['@id'] rep_obj = dxencode.enc_lookup_json(rep_id,self.server_key,frame='embedded') exp_id = rep_obj['experiment']['@id'].split('/')[2] rep_tech = "rep%s_%s" % \ (rep_obj['biological_replicate_number'], rep_obj['technical_replicate_number']) control_root = self.psv['control_path'] # Cheating: if self.proj_name == "scratchPad" and self.psv['control_path'] == self.CONTROL_ROOT_FOLDER: control_root = "/lrna" path_n_glob = control_root + exp_id + '/' + rep_tech + '/' + self.CONTROL_FILE_GLOB target_folder = dxencode.find_folder(exp_id + '/' + rep_tech,self.project,control_root) #print "Target found [%s]" % target_folder if target_folder != None: path_n_glob = target_folder + '/' + self.CONTROL_FILE_GLOB fid = dxencode.find_file(path_n_glob,self.proj_id,multiple=False,recurse=False) if fid != None: return dxencode.file_path_from_fid(fid) if default != None: return default #print json.dumps(rep,indent=4) print "Unable to find control in search of %s" % rep['controls'] sys.exit(1)
def find_control_file(self,rep,default=None): '''Attempts to find an appropriate control file.''' # TODO Make more generic and move to dxencode.py when needed. (AUTHID,AUTHPW,SERVER) = dxencode.processkey(self.server_key) for file_key in rep['controls']: url = '%s%s/?format=json&frame=embedded' % (SERVER,file_key) #print '-- ' + AUTHID + " " + AUTHPW + " " + SERVER + " " + url try: response = dxencode.encoded_get(url, AUTHID, AUTHPW) file_obj = response.json() except: print "URL to control [%s] returned ?" % url print response sys.exit(1) #print json.dumps(response,indent=4) rep_id = file_obj["replicate"]['@id'] url = '%s%s/?format=json&frame=embedded' % (SERVER,rep_id) try: response = dxencode.encoded_get(url, AUTHID, AUTHPW) rep_obj = response.json() except: print "URL to replicate [%s] returned ?" % url print response sys.exit(1) exp_id = rep_obj['experiment'].split('/')[2] rep_tech = "rep%s_%s" % \ (rep_obj['biological_replicate_number'], rep_obj['technical_replicate_number']) # default by cheating if self.proj_name == dxencode.PRODUCTION_PROJECT: control_root = "/long-RNA-seq/runs/" else: control_root = self.CONTROL_ROOT_FOLDER path_n_glob = control_root + exp_id + '/' + rep_tech + '/' + self.CONTROL_FILE_GLOB target_folder = dxencode.find_folder(exp_id + '/' + rep_tech,self.project,control_root) #print "Target found [%s]" % target_folder if target_folder != None: path_n_glob = target_folder + '/' + self.CONTROL_FILE_GLOB fid = dxencode.find_file(path_n_glob,self.proj_id,multiple=False,recurse=False) if fid != None: return dxencode.file_path_from_fid(fid) if default != None: return default print "Unable to find control in search of %s" % rep['controls'] sys.exit(1)
def run(self):
    '''
    Runs recovery from start to finish using command line arguments.

    For each requested experiment: looks up its type and DX folder,
    determines the expected pipeline results, finds files that have
    already been posted to encoded, and patches their metadata
    (via file_metadata_recovery) where it disagrees with DX.
    Exits non-zero when no experiment could be processed.
    '''
    args = self.get_args()
    self.test = args.test
    self.ignore = False
    if args.ignore_properties:
        print "Ignoring DXFile properties (will post to test server)"
        self.ignore = args.ignore_properties
        self.server_key = 'test' # mandated because option is dangerous
    # NOTE(review): this unconditionally overwrites the 'test' key forced
    # just above when --ignore_properties is used — confirm intended.
    self.server_key = args.server
    self.authid, self.authpw, self.server = dxencode.processkey(self.server_key)
    if self.server_key == "www":
        self.acc_prefix = "ENCFF"
    # Project may come from the DX environment or --project; --project wins.
    self.proj_name = dxencode.env_get_current_project()
    if self.proj_name == None or args.project != None:
        self.proj_name = args.project
    if self.proj_name == None:
        print "Please enter a '--project' to run in."
        sys.exit(1)
    self.project = dxencode.get_project(self.proj_name)
    self.proj_id = self.project.get_id()
    print "== Running in project [%s] and will attempt recovery to the [%s] server ==" % \
        (self.proj_name,self.server_key)
    exp_count = 0
    halted = 0
    total_recovered = 0
    for exp_id in args.experiments:
        sys.stdout.flush() # Slow running job should flush to piped log
        self.exp_id = exp_id
        self.obj_cache["exp"] = {} # clear exp cache, which will hold exp specific wf_run and step_run objects
        # 1) Lookup experiment type from encoded, based on accession
        print "Working on %s..." % self.exp_id
        self.exp = dxencode.get_exp(self.exp_id,must_find=True,key=self.server_key)
        if self.exp == None or self.exp["status"] == "error":
            print "Unable to locate experiment %s in encoded (%s)" % (self.exp_id, self.server_key)
            continue
        self.exp_type = dxencode.get_exp_type(self.exp_id,self.exp,self.EXPERIMENT_TYPES_SUPPORTED)
        if self.exp_type == None:
            continue
        # 2) Locate the experiment accession named folder
        # NOTE: genome and annotation are not known for this exp yet, so the umbrella folder is just based on exp_type
        self.umbrella_folder = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,self.exp_type)
        self.exp_folder = dxencode.find_exp_folder(self.project,exp_id,self.umbrella_folder,warn=True)
        if self.exp_folder == None:
            continue
        print "- Examining %s:%s for '%s' results..." % \
            (self.proj_name, self.exp_folder, self.exp_type)
        # 3) Given the experiment type, determine the expected results
        self.pipeline = self.pipeline_specification(args,self.exp_type,self.exp_folder)
        self.replicates = self.find_replicate_folders(self.exp_folder, verbose=args.verbose)
        # 4) Given expected results locate any files (by glob) that should be posted for
        #    a) each single replicate (in replicate sub-folders named as reN_N/
        #    b) combined replicates in the experiment folder itself
        files_expected = self.find_expected_files(self.exp_folder, self.replicates, verbose=args.verbose)
        print "- Found %d files that are available in DX." % len(files_expected)
        if len(files_expected) == 0:
            continue
        # 5) For each file that should be posted, determine if the file needs to be posted.
        files_posted = self.find_posted_files(files_expected, test=self.test, verbose=args.verbose) #True)
        print "- Found %d files that have been posted" % len(files_posted)
        if len(files_posted) == 0:
            continue
        # 6) For each file that needs to be posted:
        exp_count += 1
        file_count = 0
        recovery_count = 0
        for (out_type,rep_tech,fid) in files_posted:
            sys.stdout.flush() # Slow running job should flush to piped log
            accession = self.found[fid]['accession']
            file_name = dxencode.file_path_from_fid(fid)
            # --start_at skips files until the named accession/file suffix is reached.
            if args.start_at != None:
                if accession != args.start_at and not file_name.endswith(args.start_at):
                    continue
                else:
                    print "- Starting at %s" % (file_name)
                    args.start_at = None
            # a) discover all necessary dx information needed for post.
            # b) gather any other information necessary from dx and encoded.
            print "- Handle file %s %s" % (accession,dxencode.file_path_from_fid(fid))
            payload = self.make_payload_obj(out_type,rep_tech,fid, verbose=args.verbose)
            file_count += 1
            # c) Update encoded database only if necessary.
            if self.file_metadata_recovery(fid,payload,args.test,verbose=args.verbose):
                recovery_count += 1
            if args.files != 0 and file_count >= args.files: # Short circuit for test
                print "- Just trying %d file(s) by request" % file_count
                break
        if not args.test:
            print "- For %s Processed %d file(s), recovered %s" % (self.exp_id, file_count, recovery_count)
        else:
            print "- For %s Processed %d file(s), would recover %s" % (self.exp_id, file_count, recovery_count)
        total_recovered += recovery_count
    if not args.test:
        print "Processed %d experiment(s), halted %d, recovered %d file(s)" % (exp_count, halted, total_recovered)
    else:
        print "Processed %d experiment(s), halted %d, would recover %d file(s)" % (exp_count, halted, total_recovered)
    # NOTE(review): 'halted' is never incremented anywhere in this method,
    # so this check only fires when exp_count == 0 — confirm intended.
    if halted == exp_count:
        sys.exit(1)
    print "(finished)"
def run(self):
    '''
    Override super.run().

    For each requested experiment: looks up its type and DX folder,
    determines the expected pipeline results, posts files that still need
    posting to the encoded server, and reports already-posted files whose
    'derived_from' disagrees with the DX lineage.
    '''
    args = self.get_args()
    self.test = args.test
    self.server_key = args.server
    if self.server_key != "test":
        self.acc_prefix = "ENCFF"
    # Project may come from the DX environment or --project; --project wins.
    self.proj_name = dxencode.env_get_current_project()
    if self.proj_name == None or args.project != None:
        self.proj_name = args.project
    if self.proj_name == None:
        print "Please enter a '--project' to run in."
        sys.exit(1)
    self.project = dxencode.get_project(self.proj_name)
    self.proj_id = self.project.get_id()
    print "== Running in project [%s] and will post to the [%s] server ==" % \
        (self.proj_name,self.server_key)
    exp_count = 0
    halted = 0
    total_posted = 0
    for exp_id in args.experiments:
        sys.stdout.flush() # Slow running job should flush to piped log
        # 1) Lookup experiment type from encoded, based on accession
        print "Working on %s..." % exp_id
        self.exp = dxencode.get_exp(exp_id, must_find=False, key=self.server_key)
        if self.exp == None or self.exp["status"] == "error":
            print "Unable to locate experiment %s in encoded" % exp_id
            continue
        self.exp_type = self.get_exp_type(exp_id)
        if self.exp_type == None:
            continue
        # 2) Locate the experiment accession named folder
        self.exp_folder = dxencode.find_exp_folder(self.project, exp_id, args.results_folder, warn=True)
        if self.exp_folder == None:
            continue
        print "- Examining %s:%s for '%s' results..." % \
            (self.proj_name, self.exp_folder, self.exp_type)
        # 3) Given the experiment type, determine the expected results
        self.pipeline = self.pipeline_specification( args, self.exp_type, self.exp_folder)
        self.replicates = self.find_replicate_folders(self.exp_folder, verbose=args.verbose)
        # 4) Given expected results locate any files (by glob) that should be posted for
        #    a) each single replicate (in replicate sub-folders named as reN_N/
        #    b) combined replicates in the experiment folder itself
        files_expected = self.find_expected_files(self.exp_folder, self.replicates, verbose=args.verbose)
        print "- Found %d files that are available to post." % len( files_expected)
        if len(files_expected) == 0:
            continue
        # 5) For each file that should be posted, determine if the file needs to be posted.
        files_to_post = { x[2]: x for x in self.find_needed_files(files_expected, verbose=args.verbose) } # index on dx file id
        print "- Found %d files that need to be posted" % len( files_to_post.keys())
        # 6) For each file that needs to be posted:
        exp_count += 1
        file_count = 0
        post_count = 0
        # All expected files are walked (not just files_to_post) so that
        # already-posted files get their 'derived_from' sanity-checked too.
        for (out_type, rep_tech, fid) in files_expected:
            sys.stdout.flush( ) # Slow running job should flush to piped log
            # a) discover all necessary dx information needed for post.
            # b) gather any other information necessary from dx and encoded.
            print " Handle file %s" % dxencode.file_path_from_fid(fid)
            job = dxencode.job_from_fid(fid)
            try:
                derived_from = self.find_derived_from( fid, job, args.verbose)
            except dxpy.exceptions.ResourceNotFound, e:
                # Missing parents are tolerated: proceed with an empty lineage.
                print "WARN: derived_from failed %s" % e
                derived_from = []
            if not files_to_post.get(fid, ()):
                # Already posted: just report whether derived_from agrees.
                f_obj = self.found.get(fid, None)
                if f_obj:
                    current_derived_from = f_obj['derived_from']
                    if derived_from and not current_derived_from:
                        print "Need to patch derived_from for %s/%s to %s (currently: %s)" % ( f_obj['accession'], fid, derived_from, current_derived_from)
                    else:
                        print "Derived from for %s good" % f_obj[ 'accession']
                else:
                    print "File %s (%s) from %s/%s not found @ DNANexus" % ( fid, out_type, exp_id, rep_tech)
            #POSTING
            else:
                payload = self.make_payload_obj(out_type, rep_tech, fid, verbose=args.verbose)
                if args.force_annotation:
                    print "WARN: forcing genome_annotation to be %s" % args.force_annotation
                    payload['genome_annotation'] = args.force_annotation
                file_count += 1
                # c) Post file and update encoded database.
                accession = self.file_post(fid, payload, args.test)
                if accession == None:
                    # NOTE(review): self.exp_id is never assigned in this
                    # method (the loop variable is exp_id) — verify it is
                    # set elsewhere before these prints rely on it.
                    print "* HALTING %s - post failure could compromise 'derived_from'" % \
                        (self.exp_id)
                    halted += 1
                    break
                # d) Update dnanexus file with file accession tag.
                if not args.test:
                    post_count += 1
                # file_mark_accession receives args.test so it can no-op in test mode.
                self.file_mark_accession(fid, accession, args.test)
        print "- For %s Processed %d file(s), posted %s" % \
            (self.exp_id, file_count, post_count)
        total_posted += post_count
def file_metadata_recovery(self,fid,payload,test=True,verbose=False):
    '''
    Compares DX and encoded metadata and updates ENCODEd as necessary.

    Examines 'derived_from', 'step_run', 'lab' and 'award' on the already
    posted encoded file (self.found[fid]) against the freshly built DX
    payload, and patches the encoded object wherever they disagree.

    fid     -- DX file id; must already be present in self.found.
    payload -- metadata dict built from DX (see make_payload_obj).
    test    -- when True, only reports what would be patched.
    verbose -- when True, dumps the compared values to stderr.
    Returns True if a patch was made (or would be made in test mode).
    Exits the program if fid is unknown or the patch request fails.
    '''
    recovered = False
    if fid not in self.found:
        print "* ERROR: Expecting to have exp_file for " + dxencode.file_path_from_fid(fid,projectToo=True)
        sys.exit(1)
    enc_file = self.found[fid]
    accession = enc_file['accession']
    update_payload = {}
    patch_required = False
    # Compare derived from and update if necessary
    # The APPEND_FLAG sentinel requests merging the DX lineage into (rather
    # than replacing) whatever encoded already has.
    append_derived_by = False
    if self.APPEND_FLAG in payload['derived_from']:
        payload['derived_from'].remove(self.APPEND_FLAG)
        append_derived_by = True
    # derived_diffs counts DX accessions not yet matched in encoded.
    derived_diffs = len(payload['derived_from'])
    if verbose:
        print >> sys.stderr, "> DX file derived_from:"
        print >> sys.stderr, json.dumps(payload['derived_from'],indent=4)
    enc_derived = enc_file.get('derived_from')
    if enc_derived != None:
        if verbose:
            print >> sys.stderr, "> Enc file derived_from:"
            print >> sys.stderr, json.dumps(enc_derived,indent=4)
        # Only bother matching when appending, or when the counts already
        # agree (a replace with differing counts is a diff by definition).
        if append_derived_by or len(enc_derived) == derived_diffs:
            for acc in payload['derived_from']:
                for inp_file in enc_derived:
                    if acc == inp_file.get('accession'):
                        derived_diffs -= 1
    else:
        if verbose:
            print >> sys.stderr, "Enc file derived_from: Not Found"
    if derived_diffs > 0:
        if append_derived_by:
            print " + Need to append to 'derived_from'."
            # Merge: keep every accession encoded already lists.
            if enc_derived != None:
                for inp_file in enc_derived:
                    acc = inp_file.get('accession')
                    if acc != None and acc not in payload['derived_from']:
                        payload['derived_from'].append(acc)
        else:
            print " + Need to update 'derived_from'."
        update_payload['derived_from'] = payload['derived_from']
        patch_required = True
    # Compare wf_run/step_run and update if necessary
    step_run_diff = True
    dx_step_run = payload.get("step_run")
    if dx_step_run != None:
        dx_step_run = dx_step_run.split('/')[-1] # Just the alias please
        if verbose:
            print >> sys.stderr, "> DX file step_run:" + dx_step_run
    else:
        print "* ERROR: payload is missing 'step_run'"
    enc_step_run = enc_file.get("step_run")
    if verbose:
        if enc_step_run != None:
            print >> sys.stderr, "> Enc file step_run:" + enc_step_run
            #print >> sys.stderr, json.dumps(enc_step_run,indent=4)
        else:
            print >> sys.stderr, "> Enc file step_run: Not Found"
    if dx_step_run == None and enc_step_run == None:
        step_run_diff = False
    elif dx_step_run != None and enc_step_run != None:
        # The DX alias must appear among the actual step_run's aliases.
        step_run = self.enc_lookup_json(enc_step_run,must_find=True)
        if verbose:
            print >> sys.stderr, "> Actual step_run:"
            print >> sys.stderr, json.dumps(step_run,indent=4)
        aliases = step_run.get('aliases')
        if aliases != None:
            if verbose:
                print >> sys.stderr, "> aliass:"
                print >> sys.stderr, json.dumps(aliases,indent=4)
            if dx_step_run in aliases:
                step_run_diff = False
    if step_run_diff:
        print " + Need to update 'step_run'."
        update_payload["step_run"] = payload["step_run"]
        patch_required = True
    # What about lab?
    if 'lab' not in enc_file:
        print " + Need to update missing 'lab' new: '"+payload['lab']+"'"
        update_payload['lab'] = payload['lab']
        patch_required = True
    else:
        lab = enc_file.get('lab')
        if '@id' not in lab or lab['@id'] != payload['lab']:
            print " + Need to update 'lab' new: '"+payload['lab']+"'"
            #print json.dumps(lab,indent=4,sort_keys=True)
            update_payload['lab'] = payload['lab']
            patch_required = True
    # award?
    if 'award' not in enc_file:
        print " + Need to update missing 'award' new: '"+payload['award']+"'"
        update_payload['award'] = payload['award']
        patch_required = True
    elif enc_file['award'] != payload['award']:
        print " + Need to update 'award' new: '"+payload['award']+"' enc: '"+enc_file['award']+"'"
        update_payload['award'] = payload['award']
        patch_required = True
    if patch_required:
        update_payload['notes'] = payload['notes'] # Only update notes if other things update, then always update notes
        if test:
            print " * Would patch file: '%s'" % dxencode.file_path_from_fid(fid)
            #print json.dumps(update_payload,indent=4,sort_keys=True)
            recovered = True
        else:
            try:
                # NOTE(review): 'ret' is unused; bare 'except:' below also
                # swallows KeyboardInterrupt — consider 'except Exception'.
                ret = dxencode.encoded_patch_obj(accession, update_payload, self.server, self.authid, self.authpw)
            except:
                print "Failed to patch file: '%s'" % dxencode.file_path_from_fid(fid)
                print json.dumps(update_payload,indent=4,sort_keys=True)
                sys.exit(1)
            print " * Patched file: '%s'" % dxencode.file_path_from_fid(fid)
            recovered = True
    else:
        print " - No need to patch file."
    return recovered