def copyRawRiles(ct, dest=None):
    """Copy the raw per-sample output directories of ``ct``'s samples.

    For every distinct (fcid, flowcell_lane) pair found among the samples in
    ``ct.dct['Sample']`` a ``Bcl2fastq`` CalcTuple is built.  Each sample that
    the database reports for that CalcTuple has its output directory
    (``getSampOutdir``) copied below ``dest``, keeping the directory's path
    relative to ``<config.GUP_HOME>/RUNS``.

    Args:
        ct: calc tuple exposing ``dct`` (with a ``'Sample'`` list) and ``db``.
        dest: destination root.  When falsy, a default below
            ``/isiseqruns/GUP_Deliveries`` is built from the submission
            number and the project name of the first sample.

    Returns:
        None.  A failed copy is reported on stdout and skipped.
    """
    sampList = ct.dct['Sample']
    if not sampList:
        # Nothing to copy; also avoids indexing an empty list below.
        return

    if not dest:
        proj = ct.db.getAttrFromSamp('project_name', sampList[0])
        # The original referenced an undefined local ``subNum`` here.
        # NOTE(review): assumes the submission number is stored in
        # ct.dct['subNum'] -- confirm against the callers.
        dest = '/isiseqruns/GUP_Deliveries/Sub_0{}_RawFQ_{}'.format(
            ct.dct['subNum'], proj)

    runsRoot = os.path.join(config.GUP_HOME, 'RUNS')
    fcidLanes = set(
        (ct.db.getAttrFromSamp('fcid', s),
         ct.db.getAttrFromSamp('flowcell_lane', s))
        for s in sampList)

    for fcid, ln in fcidLanes:
        bcl_ct = calc_tuple.CalcTuple(
            db=ct.db, node='Bcl2fastq', fcid=fcid, laneNum=ln)
        # Query the lane's samples once instead of once per sample.
        laneSamples = ct.db.getAllSamples(**bcl_ct.dct)
        for samp in sampList:
            if samp not in laneSamples:
                continue
            odir = bcl_ct.getSampOutdir(samp)
            toDir = os.path.join(dest, os.path.relpath(odir, runsRoot))
            # copytree creates toDir itself, so only its parent is prepared.
            subprocess.call(
                ['mkdir', '-p', '-m', '777', os.path.split(toDir)[0]])
            try:
                shutil.copytree(odir, toDir)
            except (OSError, shutil.Error):
                # Best effort: typically the destination already exists.
                print("{} already there?".format(odir))
def buildCalcTupleForArbitraryPooling(self, **kwargs):
    """Create and register the Collate calc tuple for a caller-supplied pool.

    ``kwargs['pool']`` (required) is removed from ``kwargs`` and handed to
    ``getSubNumFromPoolDict`` to obtain the submission number; a truthy
    ``kwargs['subNumPrefix']`` is prepended to it with an underscore.  The
    remaining keyword arguments are forwarded to ``calc_tuple.CalcTuple``.

    The new calc tuple is always appended to ``self.calcTuples``.  Unless
    ``tryToLoadFinishedCalc`` reports it as already done, its calc info is
    built, its ``Sample`` metadata is set to None and its output directory
    is created with ``mkdir -p -m 777``.
    """
    poolDct = kwargs.pop('pool')
    subNumber = getSubNumFromPoolDict(poolDct)
    prefix = kwargs.get('subNumPrefix')
    if prefix:
        subNumber = prefix + '_' + subNumber

    collateCt = calc_tuple.CalcTuple(
        db=self.db,
        node='Collate',
        fcid=self.fcid,
        subNum=subNumber,
        poolId=sorted(poolDct.keys()),
        pool=poolDct,
        **kwargs)

    # TODO a) make this hack a method of collate ct, or b) singleton pools
    # get encased in list
    representative = collateCt.dct['pool'].values()[0]
    representative = (
        representative[0] if isinstance(representative, list)
        else representative)

    self.calcTuples.append(collateCt)

    refGenome = collateCt.getRefGenome()
    projectName = self.db.getAttrFromSamp('project_name', representative)
    hshPrefix = collateCt.hsh[:config.ODIR_HSH_LEN]
    targetDir = p_join(
        self.dirPref,
        "Sub_{0}_{1}_{2}__{3}".format(
            subNumber, refGenome, projectName, hshPrefix))

    if not self.tryToLoadFinishedCalc(collateCt):
        self.buildCalcInfoWithErrAndWrn(collateCt, targetDir)
        collateCt.putMetadata(Sample=None)
        mkdirCmd = ['mkdir', '-p', '-m', '777',
                    collateCt.getMetadata('outdir')]
        subprocess.call(mkdirCmd)
def getUpstream(self, node=None):
    """Return the ``Bcl2fastq`` calc tuple upstream of this one.

    Similar to FilterCalcTuple but no ReadRescue.  The new calc tuple gets
    every entry of ``self.dct`` except ``'Sample'``, with ``fcid`` and
    ``laneNum`` set from the sample's ``fcid`` and ``flowcell_lane``
    database attributes.

    Args:
        node: accepted but not used by this implementation.

    Raises:
        KeyError: if ``self.dct`` has no ``'Sample'`` entry.
    """
    sample = self.dct['Sample']
    upstreamArgs = {
        key: val for key, val in self.dct.items() if key != 'Sample'}
    upstreamArgs.update(
        fcid=self.db.getAttrFromSamp('fcid', sample),
        laneNum=self.db.getAttrFromSamp('flowcell_lane', sample))
    return calc_tuple.CalcTuple(db=self.db, node='Bcl2fastq', **upstreamArgs)
def buildCalcTupleForSampList(self, **kwargs):
    """Create and register the Collate calc tuple for an explicit sample list.

    All keyword arguments are forwarded to ``calc_tuple.CalcTuple`` together
    with ``db``, ``node='Collate'`` and ``fcid``.  The calc tuple is always
    appended to ``self.calcTuples``.  Unless ``tryToLoadFinishedCalc``
    reports it as already done, its calc info is built, its ``Sample``
    metadata is set from ``ct.dct['Sample']`` and its output directory is
    created with ``mkdir -p -m 777``.

    Args:
        **kwargs: must contain a truthy ``subNumPrefix``.

    Raises:
        AssertionError: if ``subNumPrefix`` is missing or falsy.
    """
    # Raised explicitly (not via ``assert``) so the check survives
    # ``python -O`` and a missing key reports the same message as an empty one.
    if not kwargs.get('subNumPrefix'):
        raise AssertionError("need it for sampList collate.")

    ct = calc_tuple.CalcTuple(
        db=self.db, node='Collate', fcid=self.fcid, **kwargs)
    self.calcTuples.append(ct)
    if self.tryToLoadFinishedCalc(ct):
        return

    _outdir = p_join(
        self.dirPref,
        "Sub_{0}_{1}_{2}".format(
            ct.dct['subNum'], ct.getRefGenome(),
            ct.hsh[:config.ODIR_HSH_LEN]))
    self.buildCalcInfoWithErrAndWrn(ct, _outdir)
    ct.putMetadata(Sample=ct.dct['Sample'])
    subprocess.call(['mkdir', '-p', '-m', '777', ct.getMetadata('outdir')])
def buildCalcTuples(self, **kwargs):
    """Build the Collate calc tuples and store them in ``self.calcTuples``.

    We get the submission numbers, and build a dictionary mapping each
    subNum to its sample list.  Finally we check if there are multiple
    species, making a calcTuple for each unique genome.

    Planning on three use cases at the moment:
        1) FCID - group all samples in this flowcell by their submission
           number
        2) By SubNum - Ideally this would span multiple fcids, now just a
           filter. TODO JWS
        3) Custom Pool - This generates its own specific hexadecimal subNum

    There are also three cases for algorithms, commented as Case_I, ...
        1) Sample - Each alignment entity is a single sample.
        2) DefaultPooling - Each alignment entity is a pool based on sample
           Ids.  I believe all samples in a pool have the same sample_name
           in the db.
        3) ArbPooling - Each alignment entity is an arbitrary pooling.  For
           sample_name we will again use poolId.
    This is confusing, because throughout this code Sample often means
    'alignment entity'.

    Args:
        **kwargs: recognised keys handled here:
            pool: a dict delegates to ``buildCalcTupleForArbitraryPooling``;
                ``True`` pools each sample group via
                ``getPoolFromSubSamps``; ``False`` is dropped.
            sampList: (only when ``pool`` is absent) delegates to
                ``buildCalcTupleForSampList``.
            refGenome: requires ``subNum`` or ``sampList`` to be given too.
            subNum: passed to ``getSubNumToSampsDict`` but not forwarded to
                ``CalcTuple`` (each tuple gets its own subNum).
            Everything else is forwarded to ``calc_tuple.CalcTuple``.

    Raises:
        AssertionError: for a ``pool`` value that is neither a dict nor a
            bool, or for ``refGenome`` without ``subNum``/``sampList``.

    Side effects:
        Resets ``self.calcTuples``, lower-cases the ``genome`` field of
        every entry in ``self.db.tables['Samp']`` and creates the output
        directory of every calc tuple that is not already finished.
    """
    # TODO Refactor this, by getting ct's first.
    # A little tricky to test thoroughly:
    #   poolId(?) is always used, even for pools of 1 with the same name
    #   as the sample.
    #   refGenome (matching all, none, or some of the default values)
    #   Sample Sets: SubNum 1 Fcid, SubNum multFC, custom sample set,
    #   default pool, custom pool
    #   Homogeneous/Heterogeneous submissions
    #   30 cases
    self.calcTuples = []
    doPooling = False
    if 'pool' in kwargs:
        poolArg = kwargs['pool']
        if isinstance(poolArg, dict):
            self.buildCalcTupleForArbitraryPooling(**kwargs)  # Case III
            return
        if poolArg is False:
            kwargs.pop('pool')
        elif poolArg is True:
            doPooling = True
        else:
            # Explicit raise so the check also runs under ``python -O``.
            raise AssertionError(
                "allowed vals for pool: True (False) or dct")
    elif 'sampList' in kwargs:
        self.buildCalcTupleForSampList(**kwargs)
        return

    if 'refGenome' in kwargs and not (
            'subNum' in kwargs or 'sampList' in kwargs):
        raise AssertionError("must specify single subNum with refG")

    # TODO fcidOnly or MultFC? two descriptions of same thing
    # Disabled code kept for reference:
    #     if not ('multFC' in kwargs and kwargs['multFC']):
    #         kwargs['fcid'] = self.fcid
    #         kwargs['multFC'] = False
    subNumToSamps = self.getSubNumToSampsDict(**kwargs)

    # 'subNum' and 'pool' are passed to CalcTuple explicitly below, so they
    # must not be repeated through **kwargs.  Previously pool=True was left
    # in kwargs and the CalcTuple call failed with a duplicate-keyword
    # TypeError.
    ctKwargs = dict(kwargs)
    ctKwargs.pop('subNum', None)
    ctKwargs.pop('pool', None)

    # TODO this shouldn't be here. Find where samples are first imported
    # from stemcell and lower them there.
    sampTable = self.db.tables['Samp']
    for samp in sampTable:
        sampTable[samp]['genome'] = sampTable[samp]['genome'].lower()

    for subNum in subNumToSamps:
        # Look up each sample's genome once and reuse it for the grouping.
        sampGenomes = [(el, self.db.getAttrFromSamp('genome', el))
                       for el in subNumToSamps[subNum]]
        for genome in set(g for _, g in sampGenomes):
            subSampList = sorted(
                el for el, g in sampGenomes if g == genome)
            prj = self.db.getAttrFromSamp('project_name', subSampList[0])
            pool = {}
            if doPooling:
                pool = self.getPoolFromSubSamps(subSampList)
            ct = calc_tuple.CalcTuple(
                db=self.db, node='Collate', subNum=subNum,
                Sample=subSampList, pool=pool,
                poolId=sorted(pool.keys()), fcid=self.fcid, **ctKwargs)
            self.calcTuples.append(ct)

            # Numeric submission numbers are zero-padded to four digits;
            # anything else is used verbatim.
            try:
                subNumStr = "{0:04d}".format(int(subNum))
            except (TypeError, ValueError):
                subNumStr = subNum
            _outdir = p_join(
                self.dirPref,
                "Sub_{0}_{1}_{2}__{3}".format(
                    subNumStr, prj, ct.getRefGenome(),
                    ct.hsh[:config.ODIR_HSH_LEN]))
            if self.tryToLoadFinishedCalc(ct):
                continue
            self.buildCalcInfoWithErrAndWrn(ct, _outdir)
            ct.putMetadata(Sample=subSampList)
            subprocess.call(
                ['mkdir', '-p', '-m', '777', ct.getMetadata('outdir')])