def gatherCausalStats(Ddata, scenario, selpos=500000, thinSfx='', complikeSfx=None,
                      likesTableSfx='', nonNanStats='ALL', getio=None):
    """Save, for one scenario, the complike rows found at the causal SNP position.

    The rows of the scenario's normed-local complike table whose Pos equals
    selpos are kept, a replicaNum column (the integer value of Chrom) is
    added, and the result is saved as the scenario's causalStats.tsv.

    Params:

       Ddata - root data directory
       scenario - the scenario; its scenDir() and mutPop are used to build paths
       selpos - position of the causal SNP
       thinSfx - suffix appended to the 'snpStats' and 'replicastats' directory names
       complikeSfx, likesTableSfx, nonNanStats - used as file name suffixes
       getio - if true, do no work and only return the rule description
          (depends_on, creates, mediumRuleNameSfx)
    """
    scenDir = scenario.scenDir()
    nonNanSfxs = ('nonNan',) + tuple(MakeSeq(nonNanStats))

    complikeName = AddFileSfx('complike.data/', 'normedLocal', scenario.mutPop,
                              complikeSfx, *nonNanSfxs)
    complikeFN = os.path.join(Ddata, 'snpStats' + thinSfx, scenDir, complikeName)

    causalStatsName = AddFileSfx('causalStats.tsv', complikeSfx, likesTableSfx,
                                 *nonNanSfxs)
    causalStatsFN = os.path.join(Ddata, 'replicastats' + thinSfx, scenDir,
                                 causalStatsName)

    if getio:
        return {'depends_on': complikeFN,
                'creates': causalStatsFN,
                'mediumRuleNameSfx': (scenDir, complikeSfx, likesTableSfx)}

    def chromAsReplicaNum(r):
        # The Chrom column holds the replica number.
        return int(r.Chrom)

    complikeTable = IDotData(complikeFN)
    causalRows = complikeTable[complikeTable.Pos == selpos]
    withReplicaNum = causalRows.addComputedCols(newColNames='replicaNum',
                                                newColFn=chromAsReplicaNum)
    withReplicaNum.save(causalStatsFN)
def normalizeColumnsWithinGroups(inFN, cols, groupCols, outFN,
                                 groupsAreContiguous=True, getio=None):
    """Write a copy of a table in which chosen columns are normalized per group.

    Params:

       inFN - the input table
       cols - the column(s) to normalize
       groupCols - the column(s) defining the groups: rows sharing the same
          combination of values in these columns belong to the same group
       outFN - the output table
       groupsAreContiguous - if True, rows of one group must be adjacent in
          the table; if False, no such assumption is made
       getio - if true, do no work and only return the rule description

    The per-group statistics are read from the first output file of
    computeMeanStdWithinGroups for the same arguments.
    """
    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))

    meanStdArgs = Dict('inFN cols groupCols groupsAreContiguous')
    meansFN = GetCreates(computeMeanStdWithinGroups, **meanStdArgs)[0]

    if getio:
        return {'depends_on': (inFN, meansFN),
                'creates': outFN,
                'splitByCols': {inFN: {'keyCols': ()}}}

    table = IDotData(inFN)
    means = IDotData(meansFN)
    normalizeArgs = Dict('cols groupCols groupsAreContiguous means')
    normedTable = table.normalizeColumnsWithinGroups_using_means(**normalizeArgs)
    normedTable.save(outFN)
def checkTableKey(inFN, cols, comparison='lt', writeCheckedFile=True, tsvOpts={},
                  lineFilter=None, lineFilterCols=(), getio=None):
    """Check that in the given table, record identifiers increase uniformly.

    Params:

       inFN - the table to check
       cols - the columns whose tuple should uniformly increase
       comparison - this comparison must be true between each record and the
          next.  It is the name of a routine in the operator module.
       writeCheckedFile - if True, a marker file is written once the whole
          table has passed the check; if False, no file is created.
       tsvOpts - extra keyword arguments passed when opening the table
          (only read here, never modified)
       lineFilter - optional predicate; rows for which it returns a false
          value are skipped and do not take part in the comparison
       lineFilterCols - extra columns to load so that lineFilter can use them
       getio - if true, do no work and only return the rule description

    Raises:

       AssertionError - if two consecutive checked records violate the comparison
    """
    cols = tuple(MakeSeq(cols))
    lineFilterCols = tuple(MakeSeq(lineFilterCols))
    checkedFN = Str('$inFN.checked_${comparison}') + Sfx(*cols)

    if getio:
        return dict(depends_on=inFN,
                    creates=checkedFN if writeCheckedFile else (),
                    attrs=dict(piperun_short=True))

    comparisonFunc = getattr(operator, comparison)
    prevRec = None
    loadCols = cols + lineFilterCols
    nskipped = 0
    nchecked = 0
    for i, r in enumerate(IDotData(inFN, ToLoad=loadCols, **tsvOpts)):
        if lineFilter and not lineFilter(r):
            nskipped += 1
            continue
        thisRec = r[cols] if IsSeq(r) else (r, )
        # Compare only against a record that was actually kept: when the first
        # rows are filtered out there is no previous record yet, even if i > 0.
        if prevRec is not None and not comparisonFunc(prevRec, thisRec):
            msg = Str('at line $i of $inFN, looking at $cols: $prevRec is not $comparison $thisRec')
            logging.error(msg)
            # Raised explicitly (rather than `assert False`) so that the check
            # still fires when Python runs with -O.
            raise AssertionError(msg)
        nchecked += 1
        prevRec = thisRec

    dbg('nchecked nskipped')
    # Only create the marker file when it was declared as an output above.
    if writeCheckedFile:
        DumpFile(checkedFN, 'checked ok.')
def computeMeanFstAndFreqDiffScores(pops, chrom, selPop, sweepDir, pop2ancFreqFN,
                                    pop2sampleSizeFN, outMeanFstFN, outFreqDiffFN,
                                    getio=None):
    """Compute meanFst and freqDiff scores.

    Params:

       pops - the population(s) involved; selPop is added if missing
       chrom - the chromosome (recorded in the rule attributes)
       selPop - the population in which selection is being tested
       sweepDir - not used by this routine
       pop2ancFreqFN - table indexed by 'pos' with one column per population
          (read as ancestral allele frequencies)
       pop2sampleSizeFN - table indexed by 'pop' with a sampleSize column
       outMeanFstFN - output: per-position mean, over the comparison
          populations, of the Fst between selPop and each of them
       outFreqDiffFN - output: per-position (1 - freq in selPop) minus
          (1 - mean freq over the comparison populations)
       getio - if true, do no work and only return the rule description
    """
    # Normalize before testing membership: with a single population name given
    # as a string, `selPop not in pops` would be a substring test and the
    # loops below would iterate over characters.
    popSeq = tuple(MakeSeq(pops))
    if selPop not in popSeq:
        popSeq += (selPop, )
        pops = popSeq
    cmpPops = [pop for pop in popSeq if pop != selPop]

    if getio:
        return dict(depends_on=(pop2ancFreqFN, pop2sampleSizeFN),
                    creates=(outMeanFstFN, outFreqDiffFN),
                    attrs=Dict('chrom', pop=pops))

    pop2ancFreq = pd.read_table(pop2ancFreqFN, index_col='pos')
    pop2sampleSize = pd.read_table(pop2sampleSizeFN, index_col='pop').sampleSize
    dbg('pop2sampleSize')

    # freqDiff
    derFreq = 1.0 - pop2ancFreq[selPop]
    cmpAncFreqs = pop2ancFreq[cmpPops]
    meanAnc = cmpAncFreqs.mean(axis=1)
    freqDiff = derFreq - (1.0 - meanAnc)
    freqDiff.name = 'freqDiff'
    freqDiff.to_csv(outFreqDiffFN, sep='\t', header=True)

    # meanFst: one Fst column per comparison population, then the row mean
    d = {pop: fst_onePopPair(ancFreqs=np.array((pop2ancFreq[selPop],
                                                pop2ancFreq[pop])),
                             sampleSizes=(pop2sampleSize[selPop],
                                          pop2sampleSize[pop]))
         for pop in cmpPops}
    fstVals = pd.DataFrame(data=d, index=pop2ancFreq.index)
    # Missing Fst values count as 0 in the mean.
    fstVals.fillna(value=0.0, inplace=True)
    fstMean = fstVals.mean(axis=1)
    dbg('fstVals fstMean')
    fstMean.name = 'meanFst'
    fstMean.to_csv(outMeanFstFN, sep='\t', header=True, na_rep='NaN')
def DefineRulesTo_normalizeColumnsWithinGroups(pr, inFN, cols, groupCols,
                                               groupsAreContiguous=True,
                                               nameSfx='', outFN=None):
    """Add the rules that produce a table with given columns normalized within groups.

    Two things are registered on pr: the rules defined by
    DefineRulesTo_meanStdWithinGroups for the same table, and an invoke rule
    that runs normalizeColumnsWithinGroups.

    Params:

       pr - the object to which the rules are added
       inFN - the input table
       cols - the column(s) to normalize
       groupCols - the column(s) defining the groups
       groupsAreContiguous - passed through to both sets of rules
       nameSfx - suffix used for the rule names
       outFN - the output table, passed to normalizeColumnsWithinGroups
    """
    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))

    meanStdRuleArgs = Dict('pr inFN cols groupCols groupsAreContiguous nameSfx')
    DefineRulesTo_meanStdWithinGroups(**meanStdRuleArgs)

    normalizeArgs = Dict('inFN cols groupCols outFN groupsAreContiguous')
    ruleName = 'normalizeColumnsWithinGroups' + Sfx(nameSfx)
    pr.addInvokeRule(invokeFn=normalizeColumnsWithinGroups,
                     invokeArgs=normalizeArgs,
                     name=ruleName)
def DefineRulesTo_computeMeanStd(pr, inFNs, colNum, outFN, addRuleArgs={}):
    """Add a rule that pipes one column of the given tsv files into tblstats.

    The rule's shell pipeline drops the first line of every input file, cuts
    out column colNum, removes lines containing 'nan' (case-insensitive) and
    feeds the rest to the tblstats program; its output is saved to outFN.

    Params:

       pr - the object to which the rule is added
       inFNs - the input tsv file(s)
       colNum - number of the column to extract, as understood by `cut -f`
       outFN - file receiving the output of the pipeline
       addRuleArgs - extra keyword arguments for pr.addRule
    """
    stripHeaders = 'tail -q -n +2 ' + ' '.join(MakeSeq(inFNs))
    pickColumn = 'cut -f %d' % colNum
    dropNaNs = 'grep -iv nan'
    summarize = '../Operations/Ilya_Operations/tblstats'

    pipeline = ' | '.join([stripHeaders, pickColumn, dropNaNs, summarize])
    pr.addRule(commands=pipeline,
               depends_on=inFNs,
               saveOutputTo=outFN,
               **addRuleArgs)
def sortTableOn(inFN, outFN, keyCols, reverse=False, getio=None):
    """Write to outFN a copy of the table inFN sorted on the given column(s).

    Params:

       inFN - the input table
       outFN - the sorted output table
       keyCols - the column(s) to sort on
       reverse - if true, the sorted rows are written in reverse order
       getio - if true, do no work and only return the rule description
    """
    if getio:
        return {'depends_on': inFN, 'creates': outFN}

    table = IDotData(inFN).sortedOn(*MakeSeq(keyCols))
    if reverse:
        # Reverse by indexing the materialized table from last row to first.
        rows = table.toDotData()
        lastIdx = len(rows) - 1
        table = rows[range(lastIdx, -1, -1)]
    table.save(outFN)
def plotHistograms(inFN, cols, outFNs=None, getio=None, **kwargs):
    """Graph the histograms of the given columns of a table.

    The histogram files are the ones created by computeHistograms for the
    same inFN / cols / outFNs.  The plot file name is derived from inFN by
    replacing its extension with '.svg' and adding the 'hist' suffix.

    Params:

       inFN - the table whose columns were histogrammed
       cols - the column(s); also used as the plot labels
       outFNs - passed to computeHistograms to determine the histogram files
       getio - if true, do no work and only return the rule description
       kwargs - extra keyword arguments passed to GraphHistograms
    """
    histFNs = GetCreates(computeHistograms, **Dict('inFN cols outFNs'))
    svgFN = ReplaceFileExt(inFN, '.svg')
    outFN = AddFileSfx(svgFN, 'hist')

    if getio:
        return {'depends_on': histFNs, 'creates': outFN}

    labels = tuple(MakeSeq(cols))
    GraphHistograms(histFiles=histFNs, outFile=outFN, labels=labels, **kwargs)
def gatherCausalRanks(Ddata=None, scenario=None, selpos=500000, thinSfx='',
                      complikeSfx=None, likesTableSfx='', nonNanStats='ALL',
                      cmsFileFN=None, causalRankFN=None, getio=None):
    """For each replica in one scenario, get the rank of the causal SNP by CMS score,
    and save as a replica statistic.

    The rank of the causal SNP is the number of SNPs in the same replica that
    have a strictly higher complike score and finite values for all of the
    nonNanStats columns.

    Params:

       Ddata, scenario - used to derive default file names; both are required
          unless cmsFileFN is given.  Ddata is also needed whenever
          causalRankFN is not given.
       selpos - position of the causal SNP
       thinSfx - suffix of the 'snpStats' directory name
       complikeSfx, likesTableSfx - file name suffixes
       nonNanStats - name(s) of the columns that must be finite for a SNP to
          be counted; 'ALL' (any case) means all columns of the CMS table
       cmsFileFN - the table of CMS scores; derived from Ddata/scenario if omitted
       causalRankFN - the output file; derived from Ddata/scenario if omitted
       getio - if true, do no work and only return the rule description
    """
    assert cmsFileFN or (Ddata and scenario)
    scenDir = scenario.scenDir() if scenario else 'unknown_scenDir'

    if not cmsFileFN:
        snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenDir)
        cmsFileFN = os.path.join(
            snpStatsDir,
            AddFileSfx('complike.data/', scenario.mutPop, complikeSfx,
                       likesTableSfx))

    if not causalRankFN:
        # NOTE(review): unlike gatherCausalStats, this path does not append
        # thinSfx to 'replicastats' -- confirm that this is intended.
        causalRankFN = os.path.join(
            Ddata, 'replicastats', scenDir,
            AddFileSfx('causalRank.tsv', complikeSfx, 'nonNan',
                       *MakeSeq(nonNanStats)))

    if getio:
        return dict(depends_on=cmsFileFN, creates=causalRankFN,
                    mediumRuleNameSfx=(scenDir, complikeSfx))

    cmsScores = IDotData(cmsFileFN)

    # Accept either one stat name or a sequence of them.  Calling .upper()
    # directly on the argument failed for sequences, and a single name other
    # than 'ALL' was iterated character by character below.
    nonNanStats = tuple(MakeSeq(nonNanStats))
    if len(nonNanStats) == 1 and str(nonNanStats[0]).upper() == 'ALL':
        nonNanStats = cmsScores.headings

    with IDotData.openForWrite(
            causalRankFN,
            headings='replicaNum causalRank causalScore') as causalRankFile:
        # Two independent passes over each replica: one to find the causal
        # SNP, one to count the SNPs scoring above it.
        for replicaNum, cmsScores1, cmsScores2 in cmsScores.groupby(
                'Chrom', multiPass=2):
            for r1 in cmsScores1:
                if r1.Pos == selpos:
                    causalScore = r1.complike
                    numHigher = 0
                    for r2 in cmsScores2:
                        if r2.complike > causalScore and all(
                                np.isfinite(r2[c]) for c in nonNanStats):
                            numHigher += 1
                    causalRankFile.writeRecord(int(replicaNum), numHigher,
                                               causalScore)
                # Stop scanning this replica once the causal position has
                # been reached or passed.
                if r1.Pos >= selpos:
                    break
def computeHistograms(inFN, cols, binSizes=None, outFNs=None, getio=None):
    """Compute a histogram of each of the specified columns of the input table.

    Params:

       inFN - the input table
       cols - the column(s) to histogram
       binSizes - bin size for each column; .001 for every column if omitted
       outFNs - output file for each column; if omitted, names are derived
          from inFN with the 'hist' and column-name suffixes under a 'stats'
          subdirectory
       getio - if true, do no work and only return the rule description
    """
    cols = tuple(MakeSeq(cols))

    if binSizes is None:
        binSizesHere = (.001, ) * len(cols)
    else:
        binSizesHere = tuple(MakeSeq(binSizes))

    if outFNs is None:
        outFNsHere = [AddFileSubdir('stats', AddFileSfx(inFN, 'hist', col))
                      for col in cols]
    else:
        outFNsHere = outFNs

    # one bin size and one output file per column
    assert len(cols) == len(binSizesHere) == len(outFNsHere)

    if getio:
        return {'depends_on': inFN, 'creates': outFNsHere}

    # add histogram combiner
    histogrammers = [Histogrammer(binSize=size) for size in binSizesHere]
    table = IDotData(inFN)
    for histogrammer, col, histFN in zip(histogrammers, cols, outFNsHere):
        histogrammer.addVals(table[col])
        histogrammer.save(histFN)
def CreateSimsParams_neutral(Ddata, suffix, inputParamsFiles, getio=None):
    """Write the neutral parameter file.

    The contents of the input parameter files are concatenated, in order,
    into Ddata + '/params_neutral' + suffix.

    Params:

       Ddata - root data directory
       suffix - suffix of the output file name
       inputParamsFiles - the parameter file(s) to concatenate
       getio - if true, do no work and only return the rule description
    """
    inputParamsFiles = MakeSeq(inputParamsFiles)
    neutralParamsFile = ''.join((Ddata, '/params_neutral', suffix))

    if getio:
        return {'depends_on': inputParamsFiles, 'creates': neutralParamsFile}

    pieces = [SlurpFile(paramsFile) for paramsFile in inputParamsFiles]
    neutralParams = reduce(concat, pieces)
    DumpFile(neutralParamsFile, neutralParams)
def computeSumsWithinGroups(inFN, cols, groupCols, groupsAreContiguous=True,
                            outFN=None, getio=None):
    """Summarize the given columns of a tsv file within the groups defined by groupCols:
    for each group, output the count, sum, sum of squares and number of NaNs
    of each column.

    Params:

       inFN - the input table
       cols - the column(s) to summarize
       groupCols - the column(s) defining the groups
       groupsAreContiguous - passed to the summarizing routine
       outFN - the output table; if omitted, the name is derived from inFN
          with the 'sums', cols and groupCols suffixes under a 'stats' subdirectory
       getio - if true, do no work and only return the rule description,
          which includes a combiner that merges partial summaries

    >>> z = IDotData( names = ( 'a', 'b' ), Records = ( ( 1, 2 ), ( 1, 3 ), ( 2, 4 ), ( 2, 5 ) ) )
    >>> computeSumsWithinGroups( inFN = z, cols = 'b', groupCols = 'a', outFN = sys.stdout )
    ... # doctest: +NORMALIZE_WHITESPACE
    a b_count b_sum b_sumSq b_numNaN
    1 2 5.0 13.0 0
    2 2 9.0 41.0 0
    """
    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))

    if outFN is None:
        sumsFN = AddFileSfx(inFN, 'sums', *(cols + groupCols))
        outFN = AddFileSubdir('stats', sumsFN)

    if getio:

        def combiner(inFNs, outFN):
            # Merge summaries computed separately on pieces of the input.
            merged = IDotData.mergeColumnSummaries(iDotDatas=inFNs,
                                                   cols=cols,
                                                   groupCols=groupCols)
            merged.save(outFN)

        return {'depends_on': inFN,
                'creates': outFN,
                'splitByCols': {inFN: {'keyCols': ()}},
                'combiner': {outFN: combiner}}

    summaryArgs = Dict('cols groupCols groupsAreContiguous')
    summary = IDotData(inFN).summarizeColumnsWithinGroups(**summaryArgs)
    summary.save(outFN)
def computeMeanStd_binned_tsvs(inFNs, valCol, binCol, binMin, binMax, binStep,
                               outFN, getio=None):
    """Compute binned statistics over a set of tables and save them as a tsv.

    Only the valCol and binCol columns of each input are loaded, and rows
    with missing values are dropped, before the tables are handed to
    computeMeanStd_binned.  The result is written tab-separated with the
    index labeled 'binId' and NaNs written as 'NaN'.

    Params:

       inFNs - the input table(s)
       valCol - the column with the values
       binCol - the column used for binning
       binMin, binMax, binStep - passed to computeMeanStd_binned
       outFN - the output table
       getio - if true, do no work and only return the rule description
    """
    if getio:
        return {'depends_on': inFNs,
                'creates': outFN,
                'uses': computeMeanStd_binned}

    # Tables are loaded lazily, one at a time, as they are consumed.
    tables = (pd.read_table(fn, usecols=(valCol, binCol)).dropna()
              for fn in MakeSeq(inFNs))
    binArgs = Dict('valCol binCol binMin binMax binStep')
    binnedStats = computeMeanStd_binned(inDatas=tables, **binArgs)
    binnedStats.to_csv(outFN, sep='\t', index_label='binId', na_rep='NaN')
def joinStats(snpInfoFN, statLikesFNs, likesRatioFN, outFN, getio=None):
    """Join stats into one file.

    Params:

       snpInfoFN - SNP info table, indexed by its 'SNP pos (bases)' column
          (the index is renamed to 'pos')
       statLikesFNs - one or more per-stat tables, each indexed by 'pos'
       likesRatioFN - likelihood-ratio table, indexed by 'pos'
       outFN - output: the outer join of all the tables, tab-separated,
          with missing values written as 'NaN'
       getio - if true, do no work and only return the rule description
    """
    # Normalize once, so that a single file name is handled the same way in
    # the declared dependencies and in the loading loop below.
    statLikesFNs = tuple(MakeSeq(statLikesFNs))

    if getio:
        return dict(depends_on=(snpInfoFN, likesRatioFN) + statLikesFNs,
                    creates=outFN)

    snpInfo = pd.read_table(snpInfoFN, index_col='SNP pos (bases)')
    snpInfo.index.rename('pos', inplace=True)
    statLikes = [pd.read_table(statLikeFN, index_col='pos')
                 for statLikeFN in statLikesFNs]
    likesRatio = pd.read_table(likesRatioFN, index_col='pos')

    result = snpInfo.join(statLikes + [likesRatio], how='outer')
    result.info()
    dbg('result.describe()')
    result.to_csv(outFN, sep='\t', na_rep='NaN', header=True)
def DefineRulesTo_runSims(pr, mutAges, mutPops, mutFreqs, nreplicas, allPops=None,
                          Ddata='../Data/Ilya_Data/sim/sfs/working/pardis2',
                          simsOut='simsOut', suffix='', shortSimTime=True,
                          DdataSeeds='', useGenMap=None, includeNeutral=True,
                          withGeneConvBug=False, withNewCosi=False, withCosi=None,
                          DdataMimic=None):
    """Instantiate, for each combination of ( mutAge, mutPop, mutFreq ), the script that
    creates simulation parameters for simulations with that selected-mutation-age.

    One 'RunOneSim' rule, invoking runOneSim.pl, is added per scenario and replica.

    Params:

       pr - the object to which the rules are added
       mutAges, mutPops, mutFreqs - define the scenarios
       nreplicas - number of replicas per scenario
       allPops - populations for which hap/pos files are created;
          defaults to mutPops
       Ddata - root data directory
       simsOut, suffix - the simulations go under Ddata/<simsOut><suffix>
       shortSimTime - if true, mark the rules with the piperun_short attribute
       DdataSeeds - if given, seeds and mutation rate of each replica are taken
          from replicastats/<scenDir>/simSeeds.tsv under this directory;
          cannot be combined with DdataMimic
       useGenMap - if given, the genetic map file to use
       includeNeutral - whether to include the neutral scenario
       withGeneConvBug, withNewCosi, withCosi - passed to runOneSim.pl as flags
       DdataMimic - if given, each replica reuses the .model and .mut files
          of the matching replica under this directory
    """
    assert not (DdataSeeds and DdataMimic)

    mutPops = MakeSeq(mutPops)
    mutAges = MakeSeq(mutAges)
    mutFreqs = MakeSeq(mutFreqs)
    if allPops is None:
        allPops = mutPops

    Dsims = Ddata + '/' + simsOut + suffix
    recombDir = '../Data/Ilya_Data/sim/sfs/working/pardis2'

    for scen in GetScenarios(
            **Dict('mutAges mutPops mutFreqs includeNeutral')):
        scenDir = scen.scenDir()
        scenName = scen.scenName()
        # The same accessor is used throughout; the original mixed
        # is_neutral() and isNeutral().
        neutral = scen.isNeutral()

        if DdataSeeds:
            seeds = IDotData(
                os.path.join(DdataSeeds, 'replicastats', scenDir,
                             'simSeeds.tsv'))

        for replicaNum, seedsLine in zip(
                range(nreplicas),
                seeds if DdataSeeds else itertools.repeat(None, nreplicas)):
            assert not DdataSeeds or seedsLine.replicaNum == replicaNum

            replicaName = '%d_%s' % (replicaNum, scenName)
            pfx = os.path.join(Dsims, scenDir, replicaName)

            attrs = Dict('replicaNum', scenDir=scenDir)
            if not neutral:
                attrs.update(mutAge=scen.mutAge, mutPop=scen.mutPop,
                             mutFreq=scen.mutFreq)
            else:
                attrs.update(mutAge=0, mutPop=0, mutFreq=0)
            if shortSimTime:
                attrs['piperun_short'] = True

            mutAge = '%dky' % (0 if neutral else scen.mutAge)

            if DdataMimic:
                mimicDir = os.path.join(DdataMimic, 'simsOut', scenDir)
                useGenMapFile = os.path.join(mimicDir, replicaName + '.model')
                useMutRateFile = os.path.join(mimicDir, replicaName + '.mut')
            else:
                useGenMapFile = ''
                useMutRateFile = ''

            # --- targets: files named <pfx><ext> ...
            targetExts = ['.model', '.mut', '.cosiParams']
            if not useGenMap:
                targetExts.append('.recombParams')
            targetExts += ['.%s-%d' % (hapOrPos, pop)
                           for hapOrPos in ('hap', 'pos') for pop in allPops]
            targets = [pfx + ext for ext in targetExts]

            # ... plus, unless cosi is overridden, the treeinfo files.  These
            # are already complete paths, so pfx must not be prepended to
            # them (the original did, giving malformed target names).
            if not (withNewCosi or withCosi):
                treeinfoKinds = ('regions.tsv', 'mutlist.tsv', 'nodes.dat')
                if not neutral:
                    treeinfoKinds += ('sweepinfo.tsv', )
                targets += [os.path.join(Dsims, scenDir, 'treeinfo',
                                         '%d_%s.%s' % (replicaNum, scenName,
                                                       which))
                            for which in treeinfoKinds]

            # --- sources
            if neutral:
                paramsFN = Ddata + '/' + 'params_neutral' + suffix
            else:
                paramsFN = Ddata + '/' + 'params%s/%s/params_%s' % (
                    suffix, mutAge, scenName)
            sources = [paramsFN]
            if useGenMap:
                sources.append(useGenMap)
            else:
                sources += [recombDir + '/recParams_bestfit_generic',
                            recombDir + '/autosomes_decode.distr']
            if DdataMimic:
                sources += [useGenMapFile, useMutRateFile]

            # --- command line: script and options form the first word group,
            # followed by the positional arguments
            simCmd = ('perl ../Operations/Ilya_Operations/sim/sfs/working/pardis2/'
                      'runOneSim.pl')
            if DdataSeeds:
                simCmd += ' --coalSeed %ld --recombSeed %ld --useMutRate %s' % (
                    int(seedsLine.coalescentSeed), int(seedsLine.recombSeed),
                    seedsLine.GetStrItem('mutRate'))
            if useGenMap:
                simCmd += ' --useGenMap ' + useGenMap
            if withGeneConvBug:
                simCmd += ' --withGeneConversionBug'
            if withNewCosi:
                simCmd += ' --withNewCosi'
            if withCosi:
                simCmd += ' --withCosi ' + withCosi
            if DdataMimic:
                simCmd += (' --useGenMap ' + useGenMapFile +
                           ' --useMutRateFile ' + useMutRateFile)
            commands = ' '.join((simCmd, scenName, mutAge, str(replicaNum),
                                 Ddata, Dsims, suffix))

            pr.addRule(targets=targets,
                       sources=sources,
                       commands=commands,
                       name='RunOneSim',
                       attrs=attrs,
                       comment='Adding simulation',
                       mediumRuleNameSfx=(scenName, mutAge, replicaNum))
def DefineRulesTo_fastCMS(pr, pops, chroms, selPop, sweepDir, cmsDir,
                          genomeBuild='hg19'):
    """Define rules to do fast CMS computation.

    Params:

       pr - the PipeRun object to which to add rules
       pops - comparing selPop to which pops?  (selPop is added if missing)
       chroms - the chromosomes to process
       selPop - testing selection in which pop?
       sweepDir - the sweep directory
       cmsDir - the directory under which CMS stats go
       genomeBuild - key into genomeBuild2genMapSfx, selecting the suffix of
          the genetic map files

    NOTE: several helpers used here (pr.settingAttrs, Dict, dbg, `% locals()`)
    are given names of local variables as strings, so the local names
    mentioned in those strings must not be renamed.
    """
    pops = list(MakeSeq(pops))
    if selPop not in pops:
        pops.append(selPop)
    # pops now always includes selPop, so no further check is needed here.
    allPops = tuple(pops)

    rawScoresFN = {}
    genMapSfx = genomeBuild2genMapSfx[genomeBuild]

    # Per-population, per-chromosome extraction of SNP info.
    for pop in allPops:
        for chrom in chroms:
            with pr.settingAttrs('pop chrom'):
                snpInfoFN = os.path.join(
                    sweepDir,
                    'analysis/chr%(chrom)s/snps_%(pop)s.tsv' % locals())
                projDir = os.path.join(sweepDir,
                                       'data/chr%(chrom)s' % locals())
                ancestralImportedFN = os.path.join(projDir,
                                                   'ancestral.tsv.imported')
                genotypesImportedFN = os.path.join(
                    projDir,
                    'genotypes_chr%(chrom)s_%(pop)s_r21_nr_fwd_phased_all.imported'
                    % locals())
                genMapImportedFN = os.path.join(
                    projDir,
                    'genetic_map_chr%(chrom)s_%(genMapSfx)s.txt.imported'
                    % locals())

                pr.addRule(
                    name='extractSnpInfo',
                    commands=(
                        'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep.jar '
                        'edu.mit.broad.sweep.Main ExtractAlleleFreqs '
                        '%(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s'
                    ) % locals(),
                    commandsOld=(
                        'java -classpath ../Other/Ilya_Other/sweep/sweepsrc/sweep/target/'
                        'sweep-1.0-SNAPSHOT-jar-with-dependencies.jar '
                        'edu.mit.broad.sweep.Main ExtractAlleleFreqs '
                        '%(projDir)s/project %(snpInfoFN)s %(pop)s %(chrom)s'
                    ) % locals(),
                    depends_on=(ancestralImportedFN, genotypesImportedFN,
                                genMapImportedFN),
                    creates=snpInfoFN)

    # Per-chromosome raw scores of each stat.
    chr2dihhFN = {}
    for chrom in chroms:
        with pr.settingAttrs('chrom'):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)

            xpopScoresFN = os.path.join(
                chromDir, AddFileSfx('max_xpop.tsv', chrom_s, selPop, pops))
            pr.addInvokeRule(invokeFn=gatherXPOPscores,
                             invokeArgs=Dict('pops chrom selPop sweepDir',
                                             outFN=xpopScoresFN),
                             attrs=dict(pop=allPops,
                                        stat='max_xpop',
                                        piperun_short=True))

            ihsFN = getFN_ihs_signif(**Dict('sweepDir chrom', pop=selPop))
            ihsScoresFN = os.path.join(
                chromDir, AddFileSfx('iHS.tsv', chrom_s, selPop, pops))
            dihhScoresFN = os.path.join(
                chromDir, AddFileSfx('dihh.tsv', chrom_s, selPop, pops))
            chr2dihhFN[chrom] = dihhScoresFN

            pop2ancFreqFN = os.path.join(
                cmsDir, chrom_s, AddFileSfx('pop2ancFreq.tsv', chrom_s, pops))
            pop2sampleSizeFN = os.path.join(
                cmsDir, chrom_s,
                AddFileSfx('pop2sampleSize.tsv', chrom_s, pops))
            # Explicit formatting instead of `% locals()`: inside a
            # comprehension, locals() only sees the enclosing function's
            # variables under Python 2 list-comprehension scoping.
            pop2snpInfoFN = {
                pop: os.path.join(sweepDir, 'analysis', chrom_s,
                                  'snps_%s.tsv' % (pop, ))
                for pop in pops
            }

            pr.addInvokeRule(
                invokeFn=gather_snp_info,
                invokeArgs=Dict(
                    'pops pop2snpInfoFN pop2ancFreqFN pop2sampleSizeFN'))

            pr.addInvokeRule(
                invokeFn=gather_iHS_scores,
                invokeArgs=Dict('chrom selPop ihsFN pop2ancFreqFN',
                                ihsOutFN=ihsScoresFN,
                                dihhOutFN=dihhScoresFN),
                attrs=dict(pop=selPop,
                           stat=('iHS', 'StdDiff'),
                           piperun_short=True))

            freqDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('freqDiff.tsv', chrom_s, selPop, pops))
            meanFstScoresFN = os.path.join(
                chromDir, AddFileSfx('meanFst.tsv', chrom_s, selPop, pops))
            pr.addInvokeRule(
                invokeFn=computeMeanFstAndFreqDiffScores,
                invokeArgs=Dict(
                    'chrom selPop sweepDir pops pop2ancFreqFN pop2sampleSizeFN',
                    outMeanFstFN=meanFstScoresFN,
                    outFreqDiffFN=freqDiffScoresFN),
                attrs=dict(pop=allPops,
                           stat=('freqDiff', 'meanFst'),
                           piperun_short=True))

            StdDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('StdDiff.tsv', chrom_s, selPop, pops))

            rawScoresFN[chrom] = dict(iHS=ihsScoresFN,
                                      StdDiff=StdDiffScoresFN,
                                      meanFst=meanFstScoresFN,
                                      freqDiff=freqDiffScoresFN,
                                      max_xpop=xpopScoresFN)

    # Genome-wide statistics of the iHH difference, used to normalize it
    # into StdDiff.
    dihhGlobalStdFN = os.path.join(
        cmsDir, AddFileSfx('dihh_global_std.tsv', selPop, pops))
    dihhBinMeansFN = os.path.join(
        cmsDir, AddFileSfx('dihh_bin_means.tsv', selPop, pops))

    pr.addInvokeRule(invokeFn=normalizeByFreq_getMeanStd_tsv,
                     invokeArgs=dict(
                         iHHDiffFNs=[chr2dihhFN[k] for k in chroms],
                         globalStatFN=dihhGlobalStdFN,
                         binsStatFN=dihhBinMeansFN),
                     name='compute_dihh_meanstd')

    for chrom in chroms:
        with pr.settingAttrs('chrom'):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)
            StdDiffScoresFN = os.path.join(
                chromDir, AddFileSfx('StdDiff.tsv', chrom_s, selPop, pops))
            dbg('chrom chr2dihhFN[chrom]')
            pr.addInvokeRule(invokeFn=normalizeByFreq_compute_normed_tsv,
                             invokeArgs=dict(iHHDiffFN=chr2dihhFN[chrom],
                                             globalStatFN=dihhGlobalStdFN,
                                             binsStatFN=dihhBinMeansFN,
                                             StdDiffFN=StdDiffScoresFN))

    # Per-stat normalization (where applicable) and likelihood ratios.
    statLikesRatioFNs = {}
    for stat in CMSBins.CMSstats:
        with pr.settingAttrs(
                stat=stat,
                pop=(selPop, ) if stat in ('iHS', 'StdDiff') else allPops,
                piperun_short=True):

            if stat not in CMSBins.nonNormedStats:
                rawFNs = [rawScoresFN[chrom][stat] for chrom in chroms]
                meanStdFN = os.path.join(
                    cmsDir, AddFileSfx('meanStd.tsv', stat, selPop, pops))
                pr.addInvokeRule(invokeFn=computeMeanStd,
                                 invokeArgs=dict(inFNs=rawFNs,
                                                 colName=stat,
                                                 outFN=meanStdFN))

            for chrom in chroms:
                with pr.settingAttrs('chrom'):
                    statFN = rawScoresFN[chrom][stat]

                    if stat not in CMSBins.nonNormedStats:
                        normedFN = AddFileSfx(statFN, 'normed')
                        DefineRulesTo_normalizeOneColumn(
                            pr,
                            inFN=statFN,
                            meanStdFN=meanStdFN,
                            colName=stat,
                            outFN=normedFN,
                            addRuleArgs=dict(attrs=Dict('chrom')))
                        # The likelihood ratio is computed from the normed values.
                        statFN = normedFN

                    bins_beg = CMSBins.stat_start[stat]
                    bins_end = CMSBins.stat_end[stat]
                    bins_n = CMSBins.stat_nbin[stat]

                    statLikesRatioFN = AddFileSfx(rawScoresFN[chrom][stat],
                                                  'likesRatio')
                    statLikesRatioFNs[(chrom, stat)] = statLikesRatioFN

                    pr.addInvokeRule(
                        invokeFn=computeLikeRatioForStat,
                        invokeArgs=dict(
                            stat=stat,
                            statValsFN=statFN,
                            hitLikesFN=
                            '../Data/Common_Data/sim/likes/hitsLikes_toneutFixed_1.tsv',
                            missLikesFN=
                            '../Data/Common_Data/sim/likes/missLikes_toneutFixed_1.tsv',
                            stat_start=bins_beg,
                            stat_end=bins_end,
                            stat_nbin=bins_n,
                            statLikesRatioFN=statLikesRatioFN))

    # Per-chromosome combination of the per-stat likelihood ratios.
    for chrom in chroms:
        with pr.settingAttrs(chrom=chrom, stat=CMSBins.CMSstats):
            chrom_s = 'chr' + str(chrom)
            chromDir = os.path.join(cmsDir, chrom_s)
            likesRatioFN = os.path.join(
                chromDir,
                AddFileSfx('likesRatio.tsv', CMSBins.CMSstats, selPop, pops))
            pr.addInvokeRule(invokeFn=addLikesRatios,
                             invokeArgs=dict(
                                 inFNs=[
                                     statLikesRatioFNs[(chrom, stat)]
                                     for stat in CMSBins.CMSstats
                                 ],
                                 colNames=[
                                     colName + 'likeRatio'
                                     for colName in CMSBins.CMSstats
                                 ],
                                 outFN=likesRatioFN))
def runCmdParallelized(commands, depends_on, creates, comment, splitFunc, joinFunc, saveOutputTo=None, splitFN=None, joinFN=None, name=None, mediumRuleName=None, getio=None):
    """Run the specified command, using parallelization.

    One input file is split into chunks by a sub-pipeline, the command(s) are
    run on every chunk by a second sub-pipeline, and the per-chunk outputs
    are passed to joinFunc.

    Params:

       commands - the command(s) to run; the name of the file being split
          must appear in at least one of them
       depends_on - the input file(s) of the commands
       creates - the output file(s) of the commands; may be None
       comment, name, mediumRuleName - only included in the rule description
       splitFunc - the routine used to split the input file (passed to doSplit)
       joinFunc - called as joinFunc( inFNs = <chunk outputs>, outFN = ... )
          to combine the per-chunk outputs
       saveOutputTo - if given, each chunk's command output is saved to that
          chunk's output file, and joinFunc is called with outFN = None
       splitFN - the file to split; defaults to the first of depends_on
       joinFN - the output file to assemble; defaults to the first of creates
       getio - if true, do no work and only return the rule description
    """
    # Imported here rather than at the top of the file.
    from Operations.Ilya_Operations.PipeRun.python.PipeRun import PipeRun
    dbg('"IN_RUNCMDPAR_EEEEEE" depends_on creates saveOutputTo')
    if creates is None:
        creates = ()
    commands = MakeSeq(commands)
    depends_on = MakeSeq(depends_on)
    creates = MakeSeq(creates)
    # gio is only used for the debug output on the next line.
    gio = Dict('depends_on creates comment name mediumRuleName')
    dbg('gio')
    if getio:
        return Dict(
            'depends_on creates comment name mediumRuleName saveOutputTo',
            uses=(splitFunc, joinFunc))
    splitFN = splitFN or list(MakeSeq(depends_on))[0]
    # When the output is captured via saveOutputTo, joinFN is a random string;
    # presumably a placeholder that matches nothing in the commands or in
    # creates, so the replacements below leave them unchanged -- TODO confirm.
    joinFN = RandomString(12) if saveOutputTo else (
        joinFN or list(MakeSeq(creates))[0])
    # The file names are substituted textually into the commands, so they
    # must actually appear there.
    assert any([splitFN in command for command in commands])
    assert saveOutputTo or any([joinFN in command for command in commands])
    logging.info('calling ' + str(splitFunc) + ' to split ' + splitFN)
    # The leading character of the absolute path is dropped so that
    # os.path.join keeps the preceding components.
    outDir = os.path.join('/broad/hptmp', getpass.getuser(), 'par',
                          os.path.abspath(splitFN)[1:])
    # First sub-pipeline: split the input file.
    pr = PipeRun(name='splitting', descr='splitting')
    r = pr.addInvokeRule(invokeFn=doSplit,
                         invokeArgs=Dict('splitFunc splitFN outDir'))
    pr.runSubPipeline()
    # The first file created by the split rule is read as the list of chunk
    # file names, one per line.
    chunkFNs = SlurpFileLines(r.creates[0])
    logging.info('finished running ' + str(splitFunc) + ' to split ' + splitFN)
    dbg('"CHUNKS_ARE" chunkFNs')
    # Second sub-pipeline: one rule per chunk, with the chunk's file names
    # substituted for splitFN and joinFN.
    pr = PipeRun(name='parallelizing', descr='parallelizing')
    chunkOutFNs = []
    for chunkFN in chunkFNs:
        chunkOutFN = AddFileSfx(chunkFN, 'out')
        chunkOutFNs.append(chunkOutFN)
        for command in commands:
            dbg('splitFN chunkFN chunkOutFN command command.replace(splitFN,chunkFN)'
                )
        pr.addRule(
            commands=[
                command.replace(splitFN, chunkFN).replace(joinFN, chunkOutFN)
                for command in commands
            ],
            depends_on=[f if f != splitFN else chunkFN for f in depends_on],
            creates=[f if f != joinFN else chunkOutFN for f in creates],
            saveOutputTo=None if saveOutputTo is None else chunkOutFN)
    pr.runSubPipeline()
    # Combine the per-chunk outputs.
    joinFunc(inFNs=chunkOutFNs, outFN=None if saveOutputTo else joinFN)