def find_control_file(self,rep,default=None):
    '''Attempts to find an appropriate control file.

    Walks rep['controls'] in order.  Each control file is looked up in encoded
    to learn its replicate and experiment, then the project is searched for a
    single file matching self.CONTROL_FILE_GLOB in the folder
    <control_root>/<exp_id>/rep<bio>_<tech>/.

    Args:
        rep: replicate dict.  Its optional 'controls' entry lists control file
             keys; an entry may itself be a list, in which case only its first
             element is used.
        default: value returned when no control file could be found.

    Returns:
        None when rep has no 'controls' entry, otherwise the path of the first
        control file found, otherwise `default` when it is not None.

    Exits the process with status 1 when nothing was found and no default
    was supplied.
    '''
    # TODO Make more generic and move to dxencode.py when needed.
    if 'controls' not in rep:
        return None

    for file_key in rep['controls']:
        if isinstance(file_key, list):
            file_key = file_key[0]
        file_obj = dxencode.enc_lookup_json(file_key, self.server_key, frame='embedded')
        rep_id = file_obj["replicate"]['@id']
        rep_obj = dxencode.enc_lookup_json(rep_id, self.server_key, frame='embedded')
        # assumes the experiment '@id' looks like '/<collection>/<accession>/' - TODO confirm
        exp_id = rep_obj['experiment']['@id'].split('/')[2]
        rep_tech = "rep%s_%s" % (rep_obj['biological_replicate_number'],
                                 rep_obj['technical_replicate_number'])

        control_root = self.psv['control_path']
        # Cheating:
        if self.proj_name == "scratchPad" and control_root == self.CONTROL_ROOT_FOLDER:
            control_root = "/lrna"

        rep_path = exp_id + '/' + rep_tech
        target_folder = dxencode.find_folder(rep_path, self.project, control_root)
        #print("Target found [%s]" % target_folder)
        if target_folder is None:
            # Fall back to the conventional location.  The root may or may not
            # end with '/', so normalise the separator before joining.
            target_folder = control_root.rstrip('/') + '/' + rep_path
        path_n_glob = target_folder + '/' + self.CONTROL_FILE_GLOB

        fid = dxencode.find_file(path_n_glob, self.proj_id, multiple=False, recurse=False)
        if fid is not None:
            return dxencode.file_path_from_fid(fid)

    if default is not None:
        return default
    #print(json.dumps(rep,indent=4))
    print("Unable to find control in search of %s" % (rep['controls'],))
    sys.exit(1)
 def find_control_file(self,rep,default=None):
     '''Attempts to find an appropriate control file.

     Walks rep['controls'] in order.  Each control file and its replicate are
     fetched from encoded, then the project is searched for a single file
     matching self.CONTROL_FILE_GLOB in <control_root>/<exp_id>/rep<bio>_<tech>/.

     Args:
         rep: replicate dict.  Its optional 'controls' entry lists control file
              keys; an entry may itself be a list, in which case only its first
              element is used.
         default: value returned when no control file could be found.

     Returns:
         None when rep has no 'controls' entry, otherwise the path of the first
         control file found, otherwise `default` when it is not None.

     Exits the process with status 1 when a lookup in encoded fails, or when
     nothing was found and no default was supplied.
     '''
     # TODO Make more generic and move to dxencode.py when needed.
     if 'controls' not in rep:
         return None

     (AUTHID,AUTHPW,SERVER) = dxencode.processkey(self.server_key)

     def fetch_json(obj_id, what):
         '''GETs obj_id from encoded and returns the decoded JSON, or exits(1).'''
         url = '%s%s/?format=json&frame=embedded' % (SERVER,obj_id)
         #print('-- ' + AUTHID + " " + AUTHPW + " " + SERVER + " " + url)
         response = None  # stays None when the request itself raises
         try:
             response = dxencode.encoded_get(url, AUTHID, AUTHPW)
             return response.json()
         except Exception:
             print("URL to %s [%s] returned ?" % (what, url))
             print(response)
             sys.exit(1)

     # default by cheating
     if self.proj_name == dxencode.PRODUCTION_PROJECT:
         control_root = "/long-RNA-seq/runs/"
     else:
         control_root = self.CONTROL_ROOT_FOLDER

     for file_key in rep['controls']:
         if isinstance(file_key, list):
             file_key = file_key[0]
         file_obj = fetch_json(file_key, "control")
         rep_obj = fetch_json(file_obj["replicate"]['@id'], "replicate")

         # 'experiment' may come back as a path string or as an embedded object.
         experiment = rep_obj['experiment']
         if isinstance(experiment, dict):
             experiment = experiment['@id']
         # assumes the path looks like '/<collection>/<accession>/' - TODO confirm
         exp_id = experiment.split('/')[2]
         rep_tech = "rep%s_%s" % (rep_obj['biological_replicate_number'],
                                  rep_obj['technical_replicate_number'])

         rep_path = exp_id + '/' + rep_tech
         target_folder = dxencode.find_folder(rep_path, self.project, control_root)
         #print("Target found [%s]" % target_folder)
         if target_folder is None:
             # Fall back to the conventional location under control_root.
             target_folder = control_root.rstrip('/') + '/' + rep_path
         path_n_glob = target_folder + '/' + self.CONTROL_FILE_GLOB

         fid = dxencode.find_file(path_n_glob, self.proj_id, multiple=False, recurse=False)
         if fid is not None:
             return dxencode.file_path_from_fid(fid)

     if default is not None:
         return default
     print("Unable to find control in search of %s" % (rep['controls'],))
     sys.exit(1)
Example #3
0
    def run(self):
        '''Runs recovery from start to finish using command line arguments.

        For every accession in args.experiments the experiment is looked up in
        encoded, its folder is located in the DX project, the expected result
        files that have already been posted are determined, and
        file_metadata_recovery() is called on each of them.  A summary line is
        printed per experiment and once at the end.

        When args.ignore_properties is set the 'test' server is always used,
        whatever args.server says, because that option is dangerous.

        Exits with status 1 when no project can be determined, or when no
        experiment made it to the file processing stage.
        '''
        args = self.get_args()
        self.test = args.test

        self.server_key = args.server
        self.ignore = False
        if args.ignore_properties:
            print("Ignoring DXFile properties (will post to test server)")
            self.ignore = args.ignore_properties
            # mandated because option is dangerous: must win over args.server
            self.server_key = 'test'
        self.authid, self.authpw, self.server = dxencode.processkey(self.server_key)

        if self.server_key == "www":
            self.acc_prefix = "ENCFF"

        # An explicit --project overrides the current DX project.
        self.proj_name = dxencode.env_get_current_project()
        if self.proj_name is None or args.project is not None:
            self.proj_name = args.project
        if self.proj_name is None:
            print("Please enter a '--project' to run in.")
            sys.exit(1)

        self.project = dxencode.get_project(self.proj_name)
        self.proj_id = self.project.get_id()
        print("== Running in project [%s] and will attempt recovery to the [%s] server ==" %
              (self.proj_name, self.server_key))

        exp_count = 0
        halted = 0
        total_recovered = 0
        for exp_id in args.experiments:
            sys.stdout.flush() # Slow running job should flush to piped log
            self.exp_id = exp_id
            # clear exp cache, which will hold exp specific wf_run and step_run objects
            self.obj_cache["exp"] = {}

            # 1) Lookup experiment type from encoded, based on accession
            print("Working on %s..." % self.exp_id)
            self.exp = dxencode.get_exp(self.exp_id, must_find=True, key=self.server_key)
            if self.exp is None or self.exp["status"] == "error":
                print("Unable to locate experiment %s in encoded (%s)" %
                      (self.exp_id, self.server_key))
                continue
            self.exp_type = dxencode.get_exp_type(self.exp_id, self.exp,
                                                  self.EXPERIMENT_TYPES_SUPPORTED)
            if self.exp_type is None:
                continue

            # 2) Locate the experiment accession named folder
            # NOTE: genome and annotation are not known for this exp yet, so the
            #       umbrella folder is just based on exp_type
            self.umbrella_folder = dxencode.umbrella_folder(args.folder, self.FOLDER_DEFAULT,
                                                            self.proj_name, self.exp_type)
            self.exp_folder = dxencode.find_exp_folder(self.project, exp_id,
                                                       self.umbrella_folder, warn=True)
            if self.exp_folder is None:
                continue
            print("- Examining %s:%s for '%s' results..." %
                  (self.proj_name, self.exp_folder, self.exp_type))

            # 3) Given the experiment type, determine the expected results
            self.pipeline   = self.pipeline_specification(args, self.exp_type, self.exp_folder)
            self.replicates = self.find_replicate_folders(self.exp_folder, verbose=args.verbose)

            # 4) Given expected results locate any files (by glob) that should be posted for
            #    a) each single replicate (in replicate sub-folders named as reN_N/
            #    b) combined replicates in the experiment folder itself
            files_expected = self.find_expected_files(self.exp_folder, self.replicates,
                                                      verbose=args.verbose)
            print("- Found %d files that are available in DX." % len(files_expected))
            if len(files_expected) == 0:
                continue

            # 5) For each file that should be posted, determine if the file needs to be posted.
            files_posted = self.find_posted_files(files_expected, test=self.test,
                                                  verbose=args.verbose)
            print("- Found %d files that have been posted" % len(files_posted))
            if len(files_posted) == 0:
                continue

            # 6) For each file that needs to be posted:
            exp_count += 1
            file_count = 0
            recovery_count = 0
            for (out_type, rep_tech, fid) in files_posted:
                sys.stdout.flush() # Slow running job should flush to piped log
                accession = self.found[fid]['accession']
                file_name = dxencode.file_path_from_fid(fid)

                # Skip files until the requested starting point (accession or
                # file name suffix) is reached; from then on handle everything.
                if args.start_at is not None:
                    if accession != args.start_at and not file_name.endswith(args.start_at):
                        continue
                    print("- Starting at %s" % (file_name))
                    args.start_at = None

                # a) discover all necessary dx information needed for post.
                # b) gather any other information necessary from dx and encoded.
                print("- Handle file %s %s" % (accession, file_name))
                payload = self.make_payload_obj(out_type, rep_tech, fid, verbose=args.verbose)

                file_count += 1
                # c) Update encoded database only if necessary.
                if self.file_metadata_recovery(fid, payload, args.test, verbose=args.verbose):
                    recovery_count += 1

                if args.files != 0 and file_count >= args.files:  # Short circuit for test
                    print("- Just trying %d file(s) by request" % file_count)
                    break

            if not args.test:
                print("- For %s Processed %d file(s), recovered %s" %
                      (self.exp_id, file_count, recovery_count))
            else:
                print("- For %s Processed %d file(s), would recover %s" %
                      (self.exp_id, file_count, recovery_count))
            total_recovered += recovery_count

        if not args.test:
            print("Processed %d experiment(s), halted %d, recovered %d file(s)" %
                  (exp_count, halted, total_recovered))
        else:
            print("Processed %d experiment(s), halted %d, would recover %d file(s)" %
                  (exp_count, halted, total_recovered))
        # 'halted' is never incremented in this method, so this only triggers
        # when no experiment reached the file processing stage.
        if halted == exp_count:
            sys.exit(1)
        print("(finished)")
Example #4
0
    def run(self):
        '''Override super.run()

        For every accession in args.experiments the experiment is looked up in
        encoded, its folder is located under args.results_folder in the DX
        project and the expected result files are determined.  Then, for each
        expected file:
          - if it does not need posting, report whether the 'derived_from' of
            the already posted file looks good or needs to be patched;
          - otherwise build the payload, post the file with file_post() and
            mark the DX file with the new accession.
        Processing of an experiment halts at the first failed post.

        Exits with status 1 when no project can be determined.
        '''
        args = self.get_args()
        self.test = args.test
        self.server_key = args.server
        if self.server_key != "test":
            self.acc_prefix = "ENCFF"

        # An explicit --project overrides the current DX project.
        self.proj_name = dxencode.env_get_current_project()
        if self.proj_name is None or args.project is not None:
            self.proj_name = args.project
        if self.proj_name is None:
            print("Please enter a '--project' to run in.")
            sys.exit(1)

        self.project = dxencode.get_project(self.proj_name)
        self.proj_id = self.project.get_id()
        print("== Running in project [%s] and will post to the [%s] server ==" %
              (self.proj_name, self.server_key))

        exp_count = 0
        halted = 0
        total_posted = 0
        for exp_id in args.experiments:
            sys.stdout.flush()  # Slow running job should flush to piped log
            # The halt and summary messages below report self.exp_id.
            self.exp_id = exp_id

            # 1) Lookup experiment type from encoded, based on accession
            print("Working on %s..." % exp_id)
            self.exp = dxencode.get_exp(exp_id, must_find=False, key=self.server_key)
            if self.exp is None or self.exp["status"] == "error":
                print("Unable to locate experiment %s in encoded" % exp_id)
                continue
            self.exp_type = self.get_exp_type(exp_id)
            if self.exp_type is None:
                continue

            # 2) Locate the experiment accession named folder
            self.exp_folder = dxencode.find_exp_folder(self.project, exp_id,
                                                       args.results_folder, warn=True)
            if self.exp_folder is None:
                continue
            print("- Examining %s:%s for '%s' results..." %
                  (self.proj_name, self.exp_folder, self.exp_type))

            # 3) Given the experiment type, determine the expected results
            self.pipeline = self.pipeline_specification(args, self.exp_type, self.exp_folder)
            self.replicates = self.find_replicate_folders(self.exp_folder,
                                                          verbose=args.verbose)

            # 4) Given expected results locate any files (by glob) that should be posted for
            #    a) each single replicate (in replicate sub-folders named as reN_N/
            #    b) combined replicates in the experiment folder itself
            files_expected = self.find_expected_files(self.exp_folder, self.replicates,
                                                      verbose=args.verbose)
            print("- Found %d files that are available to post." % len(files_expected))
            if len(files_expected) == 0:
                continue

            # 5) For each file that should be posted, determine if the file needs to be posted.
            #    Indexed on dx file id (third element of each tuple).
            files_to_post = {
                needed[2]: needed
                for needed in self.find_needed_files(files_expected, verbose=args.verbose)
            }
            print("- Found %d files that need to be posted" % len(files_to_post))

            # 6) For each expected file: check derived_from or post it.
            exp_count += 1
            file_count = 0
            post_count = 0
            for (out_type, rep_tech, fid) in files_expected:
                sys.stdout.flush()  # Slow running job should flush to piped log
                # a) discover all necessary dx information needed for post.
                # b) gather any other information necessary from dx and encoded.
                print("  Handle file %s" % dxencode.file_path_from_fid(fid))
                job = dxencode.job_from_fid(fid)

                try:
                    derived_from = self.find_derived_from(fid, job, args.verbose)
                except dxpy.exceptions.ResourceNotFound as e:
                    print("WARN: derived_from failed %s" % e)
                    derived_from = []

                if fid not in files_to_post:
                    # Nothing to post: only report on the state of derived_from.
                    f_obj = self.found.get(fid, None)
                    if not f_obj:
                        print("File %s (%s) from %s/%s not found @ DNANexus" %
                              (fid, out_type, exp_id, rep_tech))
                        continue
                    # A posted file may have no 'derived_from' at all; that is
                    # exactly the case that needs patching.
                    current_derived_from = f_obj.get('derived_from')
                    if derived_from and not current_derived_from:
                        print("Need to patch derived_from for %s/%s to %s (currently: %s)" %
                              (f_obj['accession'], fid, derived_from, current_derived_from))
                    else:
                        print("Derived from for %s good" % f_obj['accession'])
                    continue

                #POSTING
                payload = self.make_payload_obj(out_type, rep_tech, fid,
                                                verbose=args.verbose)
                if args.force_annotation:
                    print("WARN: forcing genome_annotation to be %s" % args.force_annotation)
                    payload['genome_annotation'] = args.force_annotation

                file_count += 1
                # c) Post file and update encoded database.
                accession = self.file_post(fid, payload, args.test)
                if accession is None:
                    print("* HALTING %s - post failure could compromise 'derived_from'" %
                          (self.exp_id))
                    halted += 1
                    break

                # d) Update dnanexus file with file accession tag.
                if not args.test:
                    post_count += 1
                self.file_mark_accession(fid, accession, args.test)

            # Per-experiment summary: reported and accumulated once per
            # experiment, after all of its files have been handled.
            print("- For %s Processed %d file(s), posted %s" %
                  (self.exp_id, file_count, post_count))
            total_posted += post_count
Example #5
0
    def file_metadata_recovery(self,fid,payload,test=True,verbose=False):
        '''Compares DX and encoded metadata and updates encoded if necessary.

        The payload built from DX is compared with the encoded file object held
        in self.found[fid] for 'derived_from', 'step_run', 'lab' and 'award'.
        Whatever differs is collected (together with 'notes', when present)
        into a patch for the encoded file.

        Args:
            fid: DX file id; must be a key of self.found.
            payload: dict built from DX.  Reads 'derived_from', 'lab', 'award'
                     and optionally 'step_run' and 'notes'.  NOTE: the
                     'derived_from' list is modified in place (self.APPEND_FLAG
                     is removed, encoded accessions may be appended).
            test: when True only report what would be patched.
            verbose: when True write the compared values to stderr.

        Returns:
            True when the file was patched (or would be, in test mode),
            False when no patch is needed.

        Exits the process with status 1 when fid is unknown or the patch fails.
        '''
        if fid not in self.found:
            print("* ERROR: Expecting to have exp_file for " +
                  dxencode.file_path_from_fid(fid,projectToo=True))
            sys.exit(1)

        def debug(msg):
            '''Writes one line of verbose output to stderr.'''
            sys.stderr.write(msg + "\n")

        enc_file = self.found[fid]
        accession = enc_file['accession']
        update_payload = {}
        patch_required = False

        # Compare derived from and update if necessary
        append_derived_by = False
        if self.APPEND_FLAG in payload['derived_from']:
            payload['derived_from'].remove(self.APPEND_FLAG)
            append_derived_by = True
        # Start from "everything differs" and count down for each accession
        # that encoded already knows about.
        derived_diffs = len(payload['derived_from'])
        if verbose:
            debug("> DX file derived_from:")
            debug(json.dumps(payload['derived_from'],indent=4))
        enc_derived = enc_file.get('derived_from')
        if enc_derived is not None:
            if verbose:
                debug("> Enc file derived_from:")
                debug(json.dumps(enc_derived,indent=4))
            if append_derived_by or len(enc_derived) == derived_diffs:
                for acc in payload['derived_from']:
                    for inp_file in enc_derived:
                        if acc == inp_file.get('accession'):
                            derived_diffs -= 1
        elif verbose:
            debug("Enc file derived_from: Not Found")
        if derived_diffs > 0:
            if append_derived_by:
                print("  + Need to append to 'derived_from'.")
                # Keep what encoded already has and add the DX ones to it.
                if enc_derived is not None:
                    for inp_file in enc_derived:
                        acc = inp_file.get('accession')
                        if acc is not None and acc not in payload['derived_from']:
                            payload['derived_from'].append(acc)
            else:
                print("  + Need to update 'derived_from'.")
            update_payload['derived_from'] = payload['derived_from']
            patch_required = True

        # Compare wf_run/step_run and update if necessary
        dx_step_run = payload.get("step_run")
        if dx_step_run is not None:
            dx_step_run = dx_step_run.split('/')[-1] # Just the alias please
            if verbose:
                debug("> DX file step_run:" + dx_step_run)
        else:
            print("* ERROR: payload is missing 'step_run'")
        enc_step_run = enc_file.get("step_run")
        if verbose:
            if enc_step_run is not None:
                debug("> Enc file step_run:%s" % (enc_step_run,))
            else:
                debug("> Enc file step_run: Not Found")

        # Without a step_run in the payload there is nothing to patch with.
        # Otherwise it differs unless encoded's step_run carries the DX alias.
        step_run_diff = dx_step_run is not None
        if step_run_diff and enc_step_run is not None:
            step_run = self.enc_lookup_json(enc_step_run,must_find=True)
            if verbose:
                debug("> Actual step_run:")
                debug(json.dumps(step_run,indent=4))
            aliases = step_run.get('aliases')
            if aliases is not None:
                if verbose:
                    debug("> aliass:")
                    debug(json.dumps(aliases,indent=4))
                if dx_step_run in aliases:
                    step_run_diff = False
        if step_run_diff:
            print("  + Need to update 'step_run'.")
            update_payload["step_run"] = payload["step_run"]
            patch_required = True

        # What about lab?
        if 'lab' not in enc_file:
            print("  + Need to update missing 'lab' new: '%s'" % (payload['lab'],))
            update_payload['lab'] = payload['lab']
            patch_required = True
        else:
            lab = enc_file.get('lab')
            if '@id' not in lab or lab['@id'] != payload['lab']:
                print("  + Need to update 'lab' new: '%s'" % (payload['lab'],))
                update_payload['lab'] = payload['lab']
                patch_required = True

        # award?
        if 'award' not in enc_file:
            print("  + Need to update missing 'award' new: '%s'" % (payload['award'],))
            update_payload['award'] = payload['award']
            patch_required = True
        elif enc_file['award'] != payload['award']:
            print("  + Need to update 'award' new: '%s'  enc: '%s'" %
                  (payload['award'], enc_file['award']))
            update_payload['award'] = payload['award']
            patch_required = True

        if not patch_required:
            print("  - No need to patch file.")
            return False

        # Only update notes if other things update, then always update notes
        if 'notes' in payload:
            update_payload['notes'] = payload['notes']

        if test:
            print("  * Would patch file: '%s'" % dxencode.file_path_from_fid(fid))
            return True

        try:
            dxencode.encoded_patch_obj(accession, update_payload,
                                       self.server, self.authid, self.authpw)
        except Exception:
            print("Failed to patch file: '%s'" % dxencode.file_path_from_fid(fid))
            print(json.dumps(update_payload,indent=4,sort_keys=True))
            sys.exit(1)
        print("  * Patched file: '%s'" % dxencode.file_path_from_fid(fid))
        return True