Ejemplo n.º 1
0
    def pipeline_specific_vars(self,args,verbose=False):
        '''Adds pipeline specific variables to a dict, for use building the workflow.'''
        psv = Launch.pipeline_specific_vars(self,args)
        
        # Could be multiple annotations supported per genome
        psv['annotation'] = args.annotation
        if psv['genome'] != self.GENOME_DEFAULT and psv['annotation'] == self.ANNO_DEFAULT:
            psv['annotation'] = self.ANNO_DEFAULTS[psv['genome']]
        if psv['annotation'] not in self.ANNO_ALLOWED[psv['genome']]:
            print psv['genome']+" has no "+psv['annotation']+" annotation."
            sys.exit(1)
        
        # Some specific settings
        psv['nthreads']   = 8
        psv['rnd_seed']   = 12345

        # If annotation is not default, then add it to title
        if psv['annotation'] != self.ANNO_DEFAULTS[psv['genome']]:
            psv['title'] += ', ' + psv['annotation']
            psv['name']  += '_' + psv['annotation']
            
        self.no_tophat = args.no_tophat
        if not self.no_tophat:
            self.PRUNE_STEPS = []

        # Must override results location because of annotation
        psv['resultsLoc'] = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,psv['exp_type'], \
                                                                                            psv['genome'],psv['annotation'])
        psv['resultsFolder'] = psv['resultsLoc'] + psv['experiment'] + '/'
        self.update_rep_result_folders(psv)

        if verbose:
            print "Pipeline Specific Vars:"
            print json.dumps(psv,indent=4)
        return psv
    def pipeline_specific_vars(self,args,verbose=False):
        '''Adds pipeline specific variables to a dict, for use building the workflow.'''
        psv = Launch.pipeline_specific_vars(self,args)
        
        # Could be multiple annotations supported per genome
        psv['annotation'] = args.annotation
        if psv['genome'] != self.GENOME_DEFAULT and psv['annotation'] == self.ANNO_DEFAULT:
            psv['annotation'] = self.ANNO_DEFAULTS[psv['genome']]
        if psv['annotation'] not in self.ANNO_ALLOWED[psv['genome']]:
            print psv['genome']+" has no "+psv['annotation']+" annotation."
            sys.exit(1)

        if not psv['paired_end']:
            print "Rampage is always expected to be paired-end but mapping says otherwise."
            sys.exit(1)

        # Some specific settings
        psv['nthreads']   = 8
        psv['control'] = args.control
        
        # run will either be for combined or single rep.
        if not psv['combined']:
            run = psv['reps']['a']  # If not combined then run will be for the first (only) replicate
        else:
            run = psv
            
        # workflow labeling
        psv['description'] = "The ENCODE Rampage RNA pipeline for long RNAs"
        run['name'] = "rampage_"+psv['genome']
        if psv['genome'] == 'mm10':
            run['name'] += psv['annotation']
        if psv['gender'] == 'female':
            run['name'] += "XX"
        else:
            run['name'] += "XY"
        run['title'] = "Rampage RNA " + psv['experiment'] + " - " + run['rep_tech']
        run['name'] += "_"+psv['experiment']+"_" + run['rep_tech']
        if not psv['combined']:
            run['title'] += " [library '"+run['library_id']+"']"
        run['title'] += " on " + psv['genome']+" - "+psv['gender']

        # Must override results location because of annotation
        psv['resultsLoc'] = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,psv['exp_type'], \
                                                                                            psv['genome'],psv['annotation'])
        psv['resultsFolder'] = psv['resultsLoc'] + psv['experiment'] + '/'
        psv['reps']['a']['resultsFolder'] = psv['resultsLoc'] + psv['experiment'] + '/' + \
                                                              psv['reps']['a']['rep_tech'] + '/'
        if psv['combined']:
            psv['reps']['b']['resultsFolder'] = psv['resultsLoc'] + psv['experiment'] + '/' + \
                                                                  psv['reps']['b']['rep_tech'] + '/'

        if verbose:
            print "Pipeline Specific Vars:"
            print json.dumps(psv,indent=4)
        return psv
    def pipeline_specific_vars(self,args,verbose=False):
        '''Adds pipeline specific variables to a dict, for use building the workflow.'''
        psv = Launch.pipeline_specific_vars(self,args)
        
        # Could be multiple annotations supported per genome
        psv['annotation'] = args.annotation
        if psv['genome'] != self.GENOME_DEFAULT and psv['annotation'] == self.ANNO_DEFAULT:
            psv['annotation'] = self.ANNO_DEFAULTS[psv['genome']]
        if psv['annotation'] not in self.ANNO_ALLOWED[psv['genome']]:
            print psv['genome']+" has no "+psv['annotation']+" annotation."
            sys.exit(1)

        if not psv['paired_end']:
            print "Rampage is always expected to be paired-end but mapping says otherwise."
            sys.exit(1)

        # Some specific settings
        psv['nthreads']   = 8
        psv['control'] = args.control
        
        # run will either be for combined or single rep.
        if not self.combined_reps:
            run = psv['reps']['a']  # If not combined then run will be for the first (only) replicate
        else:
            run = psv
            
        # If annotation is not default, then add it to title
        if psv['annotation'] != self.ANNO_DEFAULTS[psv['genome']]:
            psv['title'] += ', ' + psv['annotation']
            psv['name']  += '_' + psv['annotation']

        # Must override results location because of annotation
        psv['resultsLoc'] = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,psv['exp_type'], \
                                                                                            psv['genome'],psv['annotation'])
        psv['resultsFolder'] = psv['resultsLoc'] + psv['experiment'] + '/'
        self.update_rep_result_folders(psv)

        if verbose:
            print "Pipeline Specific Vars:"
            print json.dumps(psv,indent=4)
        return psv
Ejemplo n.º 4
0
    def pipeline_specific_vars(self,args,verbose=False):
        '''Adds pipeline specific variables to a dict, for use building the workflow.'''
        psv = Launch.pipeline_specific_vars(self,args)

        # Now add pipline specific variables and tests
        
        # Could be multiple annotations supported per genome
        psv['annotation'] = args.annotation
        if psv['genome'] != self.GENOME_DEFAULT and psv['annotation'] == self.ANNO_DEFAULT:
            psv['annotation'] = self.ANNO_DEFAULTS[psv['genome']]
        if psv['annotation'] not in self.ANNO_ALLOWED[psv['genome']]:
            print psv['genome']+" has no "+psv['annotation']+" annotation."
            sys.exit(1)
        
        # Paired ends?
        if psv['paired_end']:
            print "Small-RNA is always expected to be single-end but mapping says otherwise."
            #print json.dumps(psv,indent=4,sort_keys=True)
            sys.exit(1)

        # Some specific settings
        psv['nthreads']   = 8

        # If annotation is not default, then add it to title
        if psv['annotation'] != self.ANNO_DEFAULTS[psv['genome']]:
            psv['title'] += ', ' + psv['annotation']
            psv['name']  += '_' + psv['annotation']

        # Must override results location because of annotation
        psv['resultsLoc'] = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,psv['exp_type'], \
                                                                                            psv['genome'],psv['annotation'])
        psv['resultsFolder'] = psv['resultsLoc'] + psv['experiment'] + '/'
        self.update_rep_result_folders(psv)

        if verbose:
            print "Pipeline Specific Vars:"
            print json.dumps(psv,indent=4)
        return psv
Ejemplo n.º 5
0
    def run(self):
        '''Runs recovery from start to finish using command line arguments.'''
        args = self.get_args()
        self.test = args.test
        self.ignore = False
        if args.ignore_properties:
            print "Ignoring DXFile properties (will post to test server)"
            self.ignore = args.ignore_properties
            self.server_key = 'test' # mandated because option is dangerous
            
        self.server_key = args.server
        self.authid, self.authpw, self.server = dxencode.processkey(self.server_key)
        
        if self.server_key == "www":
            self.acc_prefix = "ENCFF"
        self.proj_name = dxencode.env_get_current_project()
        if self.proj_name == None or args.project != None:
            self.proj_name = args.project
        if self.proj_name == None:
            print "Please enter a '--project' to run in."
            sys.exit(1)

        self.project = dxencode.get_project(self.proj_name)
        self.proj_id = self.project.get_id()
        print "== Running in project [%s] and will attempt recovery to the [%s] server ==" % \
                                                        (self.proj_name,self.server_key)

        exp_count = 0
        halted = 0
        total_recovered = 0
        for exp_id in args.experiments:
            sys.stdout.flush() # Slow running job should flush to piped log
            self.exp_id = exp_id
            self.obj_cache["exp"] = {}  # clear exp cache, which will hold exp specific wf_run and step_run objects
            # 1) Lookup experiment type from encoded, based on accession
            print "Working on %s..." % self.exp_id
            self.exp = dxencode.get_exp(self.exp_id,must_find=True,key=self.server_key)
            if self.exp == None or self.exp["status"] == "error":
                print "Unable to locate experiment %s in encoded (%s)" % (self.exp_id, self.server_key)
                continue
            self.exp_type = dxencode.get_exp_type(self.exp_id,self.exp,self.EXPERIMENT_TYPES_SUPPORTED)
            if self.exp_type == None:
                continue

            # 2) Locate the experiment accession named folder
            # NOTE: genome and annotation are not known for this exp yet, so the umbrella folder is just based on exp_type
            self.umbrella_folder = dxencode.umbrella_folder(args.folder,self.FOLDER_DEFAULT,self.proj_name,self.exp_type)
            self.exp_folder = dxencode.find_exp_folder(self.project,exp_id,self.umbrella_folder,warn=True)
            if self.exp_folder == None:
                continue
            print "- Examining %s:%s for '%s' results..." % \
                                            (self.proj_name, self.exp_folder, self.exp_type)

            # 3) Given the experiment type, determine the expected results
            self.pipeline   = self.pipeline_specification(args,self.exp_type,self.exp_folder)
            self.replicates = self.find_replicate_folders(self.exp_folder, verbose=args.verbose)

            # 4) Given expected results locate any files (by glob) that should be posted for
            #    a) each single replicate (in replicate sub-folders named as reN_N/
            #    b) combined replicates in the experiment folder itself
            files_expected = self.find_expected_files(self.exp_folder, self.replicates, verbose=args.verbose)
            print "- Found %d files that are available in DX." % len(files_expected)
            if len(files_expected) == 0:
                continue

            # 5) For each file that should be posted, determine if the file needs to be posted.
            files_posted = self.find_posted_files(files_expected, test=self.test, verbose=args.verbose) #True)
            print "- Found %d files that have been posted" % len(files_posted)
            if len(files_posted) == 0:
                continue

            # 6) For each file that needs to be posted:
            exp_count += 1
            file_count = 0
            recovery_count = 0
            for (out_type,rep_tech,fid) in files_posted:
                sys.stdout.flush() # Slow running job should flush to piped log
                accession = self.found[fid]['accession']
                file_name = dxencode.file_path_from_fid(fid)
                if args.start_at != None:
                    if accession != args.start_at and not file_name.endswith(args.start_at):
                        continue
                    else:
                        print "- Starting at %s" % (file_name)
                        args.start_at = None
                    
                # a) discover all necessary dx information needed for post.
                # b) gather any other information necessary from dx and encoded.
                print "- Handle file %s %s" % (accession,dxencode.file_path_from_fid(fid))
                payload = self.make_payload_obj(out_type,rep_tech,fid, verbose=args.verbose)

                file_count += 1
                # c) Update encoded database only if necessary.
                if self.file_metadata_recovery(fid,payload,args.test,verbose=args.verbose):
                    recovery_count += 1

                if args.files != 0 and file_count >= args.files:  # Short circuit for test
                    print "- Just trying %d file(s) by request" % file_count
                    break

            if not args.test:
                print "- For %s Processed %d file(s), recovered %s" % (self.exp_id, file_count, recovery_count)
            else:
                print "- For %s Processed %d file(s), would recover %s" % (self.exp_id, file_count, recovery_count)
            total_recovered += recovery_count

        if not args.test:
            print "Processed %d experiment(s), halted %d, recovered %d file(s)" % (exp_count, halted, total_recovered)
        else:
            print "Processed %d experiment(s), halted %d, would recover %d file(s)" % (exp_count, halted, total_recovered)
        if halted == exp_count:
            sys.exit(1)
        print "(finished)"