Exemple #1
0
def normalizeColumnsWithinGroups( inFN, cols, groupCols, outFN, groupsAreContiguous = True, getio = None ):
    """Write to outFN a version of table inFN with the given columns normalized within groups.

    The normalization is delegated to IDotData.normalizeColumnsWithinGroups_using_means(),
    which is handed the table named by the first entry that GetCreates() reports
    for computeMeanStdWithinGroups (called with the same inFN/cols/groupCols/groupsAreContiguous).

    Params:

       inFN - the input table
       cols - the column(s) to be normalized; a single name or a sequence of names
       groupCols - the column(s) that define the groups: rows that share the same combination
          of values in these columns belong to the same group.
       outFN - the output table
       groupsAreContiguous - if True, rows belonging to the same group must be contiguous in the table;
          if False, no such assumption is made.
       getio - if true, do no work and only return the description of this step's
          inputs and outputs.
    """

    # NOTE: Dict() is given *names* of local variables, so the locals listed in
    # its argument strings must keep exactly these names.
    groupCols = tuple( MakeSeq( groupCols ) )
    cols = tuple( MakeSeq( cols ) )

    meanStdArgs = Dict( 'inFN cols groupCols groupsAreContiguous' )
    meansFN = GetCreates( computeMeanStdWithinGroups, **meanStdArgs )[0]

    if getio:
        return dict( depends_on = ( inFN, meansFN ),
                     creates = outFN,
                     splitByCols = { inFN: dict( keyCols = () ) } )

    inFile = IDotData( inFN )
    means = IDotData( meansFN )

    normalizeArgs = Dict( 'cols groupCols groupsAreContiguous means' )
    normalizedTable = inFile.normalizeColumnsWithinGroups_using_means( **normalizeArgs )
    normalizedTable.save( outFN )
Exemple #2
0
def normalizeColumnsWithinGroups(inFN,
                                 cols,
                                 groupCols,
                                 outFN,
                                 groupsAreContiguous=True,
                                 getio=None):
    """Normalize selected columns of a table separately within each group of rows.

    Params:

       inFN - the input table
       cols - name(s) of the columns to normalize
       groupCols - name(s) of the columns whose combined values identify a group;
          rows with equal values in all of them are in the same group.
       outFN - where the normalized table is saved
       groupsAreContiguous - if True, rows belonging to the same group must be contiguous in the table;
          if False, no such assumption is made.
       getio - if true, return only the dict describing what this step reads
          (inFN and the per-group mean/std table) and creates (outFN).
    """

    # The strings passed to Dict() name local variables of this function;
    # do not rename inFN, cols, groupCols, groupsAreContiguous or means.
    cols, groupCols = tuple(MakeSeq(cols)), tuple(MakeSeq(groupCols))

    # File that computeMeanStdWithinGroups reports it creates for these arguments.
    meansFN = GetCreates(
        computeMeanStdWithinGroups,
        **Dict('inFN cols groupCols groupsAreContiguous'))[0]

    if getio:
        ioSpec = dict(depends_on=(inFN, meansFN), creates=outFN)
        ioSpec['splitByCols'] = {inFN: dict(keyCols=())}
        return ioSpec

    inFile = IDotData(inFN)
    means = IDotData(meansFN)

    normalizedTable = inFile.normalizeColumnsWithinGroups_using_means(
        **Dict('cols groupCols groupsAreContiguous means'))
    normalizedTable.save(outFN)
Exemple #3
0
def evalSpatialLoc( Ddata, thinSfx, scenario, putativeMutPop, nreplicas, complikeSfx = '',
                    likesTableSfx = '', selpos = 500000, whichSpatialLoc = 'Spline',
                    getio = None ):
    """Evaluate spatial localization for every replica of a non-neutral scenario.

    Reads the list of localized intervals and the table of causal-SNP genetic
    positions, and writes one record per replica with:

       replicaNum, numIntervals - the replica and how many intervals it has
       totLenBp, totLenGd - summed interval lengths, in basepairs and in genetic distance
       causalIncluded - 1 if some interval contains the causal SNP, else 0
       distanceToIntervalBoundaryBp, distanceToIntervalBoundaryGd - the smallest
          distance from the causal SNP to any interval endpoint

    Params:

       Ddata - root data directory
       thinSfx - suffix appended to the 'replicastats' directory name
       scenario - the scenario; must not be neutral
       putativeMutPop - population used in the file suffixes; None means scenario.mutPop
       nreplicas - not used by this function
       complikeSfx, likesTableSfx - further file-name suffixes
       selpos - basepair position of the causal SNP
       whichSpatialLoc - name of the localization method; part of the input and
          output file names and of the rule name
       getio - if true, only return the description of inputs and outputs
    """

    assert not scenario.isNeutral()

    replicaStatsDir = os.path.join( Ddata, 'replicastats'+ thinSfx, scenario.scenDir() )
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = ( putativeMutPop, complikeSfx, likesTableSfx )

    intervalsListFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervals%sList.tsv' % whichSpatialLoc, *sfxs ) )
    causalGdPosFN = os.path.join( replicaStatsDir, 'causalGdPos.tsv' )
    spatialLocEvalFN = os.path.join( replicaStatsDir, AddFileSfx( 'spatialLocEval%s.tsv' % whichSpatialLoc, *sfxs ) )

    if getio: return dict( depends_on = ( intervalsListFN, causalGdPosFN ),
                           creates = spatialLocEvalFN,
                           mediumRuleNameSfx = scenario.scenDir(),
                           name = 'evalSpatialLoc_' + whichSpatialLoc )

    with IDotData.openForWrite( spatialLocEvalFN,
                                'replicaNum numIntervals totLenBp totLenGd causalIncluded '
                                'distanceToIntervalBoundaryBp distanceToIntervalBoundaryGd' ) as spatialLocEvalFile:

        # Walk, in lockstep, the per-replica groups of intervals and the causal
        # positions restricted (via the merge) to replicas present in the intervals list.
        for ( replicaNum2, replicaIntervals ), ( replicaNum3, causalGdPos, replicaNum4 ) in \
                itertools.izip( IDotData( intervalsListFN ).groupby( 'replicaNum' ),
                                IDotData.merge( iDotDatas = ( IDotData( causalGdPosFN ),
                                                              IDotData( intervalsListFN ).replicaNum.removeDups() ),
                                                cols = ( 'replicaNum', 'replicaNum' ) ) ):

            replicaNum2, replicaNum3, replicaNum4  = map( int, ( replicaNum2, replicaNum3, replicaNum4 ) )
            if not replicaNum2 == replicaNum3 == replicaNum4:
                # Only names that exist in this function may be listed here: the
                # original also listed complikeFN, which is never defined in this
                # function and would make the diagnostic itself fail.
                dbg( 'replicaNum2 replicaNum3 replicaNum4 intervalsListFN causalGdPosFN spatialLocEvalFN' )
            assert replicaNum2 == replicaNum3 == replicaNum4

            causalIncluded = False
            totLenBp = 0
            totLenGd = 0.0
            for replicaInterval in replicaIntervals:
                dbg( 'replicaInterval causalGdPos' )
                # The genetic-distance and basepair coordinates must agree on
                # whether this interval contains the causal SNP.
                assert ( replicaInterval.gdFrom <= causalGdPos <= replicaInterval.gdTo ) == ( replicaInterval.bpFrom <= selpos <= replicaInterval.bpTo )
                if replicaInterval.gdFrom <= causalGdPos <= replicaInterval.gdTo:
                    causalIncluded = True

                totLenGd += ( replicaInterval.gdTo - replicaInterval.gdFrom )
                totLenBp += ( replicaInterval.bpTo - replicaInterval.bpFrom )

            spatialLocEvalFile.writeRecord( replicaNum2, len( replicaIntervals ),
                                            totLenBp, totLenGd, int( causalIncluded ),
                                            np.min( ( np.min( np.abs( replicaIntervals.bpFrom - selpos ) ),
                                                      np.min( np.abs( replicaIntervals.bpTo - selpos ) ) ) ),
                                            np.min( ( np.min( np.abs( replicaIntervals.gdFrom - causalGdPos ) ),
                                                      np.min( np.abs( replicaIntervals.gdTo - causalGdPos ) ) ) ) )
Exemple #4
0
def sortTableOn(inFN, outFN, keyCols, reverse=False, getio=None):
    """Save to outFN the table inFN sorted on the given key column(s).

    Params:

       inFN - the input table
       outFN - the output table
       keyCols - the column name, or sequence of column names, to sort on
       reverse - if true, the rows of the sorted table are written in reverse order
       getio - if true, only return the description of inputs and outputs
    """

    if getio:
        return dict(depends_on=inFN, creates=outFN)

    sortedTable = IDotData(inFN).sortedOn(*MakeSeq(keyCols))
    if not reverse:
        sortedTable.save(outFN)
        return

    # Reverse by indexing the materialized table with row numbers last..first.
    materialized = sortedTable.toDotData()
    lastRow = len(materialized) - 1
    materialized[range(lastRow, -1, -1)].save(outFN)
Exemple #5
0
def sortTableOn( inFN, outFN, keyCols, reverse = False, getio = None ):
  """Sort table inFN on the column(s) keyCols and save the result as outFN.

  If reverse is true, the sorted rows are saved in reverse order.
  If getio is true, nothing is computed and only the description of the
  input and output files is returned.
  """

  if getio:
    return dict( depends_on = inFN, creates = outFN )

  keys = MakeSeq( keyCols )
  result = IDotData( inFN ).sortedOn( *keys )
  if reverse:
    asDotData = result.toDotData()
    # row indices from the last row down to row 0
    backwards = range( len( asDotData )-1, -1, -1 )
    result = asDotData[ backwards ]
  result.save( outFN )
Exemple #6
0
def gatherCausalRanks(Ddata=None,
                      scenario=None,
                      selpos=500000,
                      thinSfx='',
                      complikeSfx=None,
                      likesTableSfx='',
                      nonNanStats='ALL',
                      cmsFileFN=None,
                      causalRankFN=None,
                      getio=None):
    """For each replica in one scenario, get the rank of the causal SNP by CMS score, and save as a replica statistic.

    The rank of the causal SNP (the SNP at position selpos) is the number of
    SNPs in the same replica whose 'complike' value is strictly higher and
    whose nonNanStats columns are all finite; rank 0 means no such SNP.
    One record (replicaNum, causalRank, causalScore) is written per replica
    that has a SNP at selpos.

    Params:

       Ddata, scenario - used to derive default file names; both are required
          unless cmsFileFN is given.
       selpos - position of the causal SNP
       thinSfx, complikeSfx, likesTableSfx - file-name suffixes used in the defaults
       nonNanStats - 'ALL' (any case) to require every column of the scores
          table to be finite, or a column name, or a sequence of column names
       cmsFileFN - the table of CMS scores; derived from Ddata/scenario if not given
       causalRankFN - the output table; derived from Ddata if not given
          (NOTE(review): that default needs Ddata even when cmsFileFN is given)
       getio - if true, only return the description of inputs and outputs
    """

    assert cmsFileFN or (Ddata and scenario)
    scenDir = scenario.scenDir() if scenario else 'unknown_scenDir'
    if not cmsFileFN:
        snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenDir)
        cmsFileFN = os.path.join(
            snpStatsDir,
            AddFileSfx('complike.data/', scenario.mutPop, complikeSfx,
                       likesTableSfx))

    if not causalRankFN:
        causalRankFN = os.path.join(
            Ddata, 'replicastats', scenDir,
            AddFileSfx('causalRank.tsv', complikeSfx, 'nonNan',
                       *MakeSeq(nonNanStats)))

    if getio:
        return dict(depends_on=cmsFileFN,
                    creates=causalRankFN,
                    mediumRuleNameSfx=(scenDir, complikeSfx))

    cmsScores = IDotData(cmsFileFN)

    # nonNanStats may be a string or a sequence of names (it is passed through
    # MakeSeq above).  Only a string has .upper(); calling it unconditionally
    # crashed for sequences, and a single column name other than 'ALL' was
    # iterated character by character below.
    if hasattr(nonNanStats, 'upper'):
        if nonNanStats.upper() == 'ALL':
            nonNanStats = cmsScores.headings
        else:
            nonNanStats = (nonNanStats, )

    with IDotData.openForWrite(
            causalRankFN,
            headings='replicaNum causalRank causalScore') as causalRankFile:
        # Two independent passes over each replica's rows: the first to locate
        # the causal SNP, the second to count the SNPs that outscore it.
        for replicaNum, cmsScores1, cmsScores2 in cmsScores.groupby(
                'Chrom', multiPass=2):
            for r1 in cmsScores1:
                if r1.Pos == selpos:
                    causalScore = r1.complike
                    numHigher = 0
                    for r2 in cmsScores2:
                        if r2.complike > causalScore and all(
                            [np.isfinite(r2[c]) for c in nonNanStats]):
                            numHigher += 1
                    causalRankFile.writeRecord(int(replicaNum), numHigher,
                                               causalScore)

                # Stop scanning once selpos is reached or passed.
                if r1.Pos >= selpos: break
Exemple #7
0
def mergePosFilesOneSim( scenario, Ddata, replicaNum, putativeMutPop = None, simsOut = 'simsOut', thinExt = '', thinSfx = '',
                         statsSfx = '', ihsSfx = '', pop2name = pop2name,
                         getio = None ):
    """Merge per-SNP position files for one sim.

    Reads one position file per population for the given replica, merges them
    on CHROM_POS, keeps the position column plus each population's FREQ1
    column, prepends a constant replicaNum column and saves the result.

    Params:

       scenario - the scenario; supplies scenName(), scenDir() and mutPop
       Ddata - root data directory
       replicaNum - number of the replica whose files are merged
       putativeMutPop - used in the output file name; None means scenario.mutPop
       simsOut, thinSfx - together name the simulations directory under Ddata
       thinExt - extension appended to each position file name
       statsSfx, ihsSfx - further suffixes of the output file name
       pop2name - map whose keys are the population numbers
       getio - if true, only return the description of inputs and outputs
    """

    if not Ddata.endswith('/'): Ddata += '/'

    popNums = sorted( pop2name.keys() )
    minPopNum = popNums[ 0 ]

    SimDir = os.path.join( Ddata, simsOut + thinSfx )

    scenName = scenario.scenName()
    scenDir = scenario.scenDir()

    if putativeMutPop is None: putativeMutPop = scenario.mutPop

    # One position file per population, in the order of popNums.
    posFiles = [ os.path.join( SimDir, scenDir, '%d_%s.pos-%d%s' % ( replicaNum, scenName, pop, thinExt ) )
                 for pop in popNums ]

    snpStatsDir = os.path.join( Ddata, 'snpStats' + thinSfx, scenario.scenDir() )
    mergedPosFN = os.path.join( snpStatsDir, 'mergedPos',
                                AddFileSfx( 'mergedPos.tsv', statsSfx, putativeMutPop, ihsSfx, replicaNum ) )

    if getio: return dict( depends_on = posFiles, creates = mergedPosFN,
                           splitByCols = dict([ ( posFile, dict( keyCols = 'CHROM_POS',
                                                                 tableReadOpts = dict( headingSep = 'whitespace' ) ) )
                                                for posFile in posFiles ]),
                           mediumRuleNameSfx = scenario.scenDir() )

    # After the merge each column name carries ' <popNum>' as a suffix; keep the
    # position as reported for the lowest-numbered population, and FREQ1 for all.
    posCols = ['CHROM_POS %d' % minPopNum ] + ['FREQ1 %d' % popNum for popNum in popNums]

    # Load in pos files for this replica.
    # They give, for each SNP in the replica, its physical (basepair) position within the replica,
    # and the frequency of the derived and the ancestral alleles.
    # The files read are exactly those declared in depends_on above (the paths
    # used to be rebuilt here with a second, separately-maintained expression).
    posIndiv = [ IDotData( posFile, skipFirstLines = 1,
                           headings = ('SNP','CHROM', 'CHROM_POS', 'ALLELE1',
                                       'FREQ1', 'ALLELE2', 'FREQ2' ))
                 for posFile in posFiles ]

    posAll = IDotData.merge( iDotDatas = posIndiv,
                             cols = ('CHROM_POS',) * len( posIndiv ),
                             blanks = (np.nan,) * len( posIndiv ),
                             suffixes = [ ' %s' % pop for pop in popNums ] )

    posAll = posAll[posCols]

    IDotData.repeat( heading = 'replicaNum', value = replicaNum ).hstack( posAll ).save( mergedPosFN )
Exemple #8
0
def gatherCausalStats(Ddata,
                      scenario,
                      selpos=500000,
                      thinSfx='',
                      complikeSfx=None,
                      likesTableSfx='',
                      nonNanStats='ALL',
                      getio=None):
    """Save, as a replica statistic, the rows of the scenario's complike table located at the causal SNP.

    The rows with Pos == selpos are selected, a 'replicaNum' column holding
    int(Chrom) is added, and the result is saved.  If getio is true, only the
    description of the input and output files is returned.
    """

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir())
    replicaStatsDir = os.path.join(Ddata, 'replicastats' + thinSfx,
                                   scenario.scenDir())

    # input: the per-SNP table
    complikeName = AddFileSfx('complike.data/', 'normedLocal', scenario.mutPop,
                              complikeSfx, 'nonNan', *MakeSeq(nonNanStats))
    complikeFN = os.path.join(snpStatsDir, complikeName)

    # output: the per-replica table
    causalStatsName = AddFileSfx('causalStats.tsv', complikeSfx, likesTableSfx,
                                 'nonNan', *MakeSeq(nonNanStats))
    causalStatsFN = os.path.join(replicaStatsDir, causalStatsName)

    if getio:
        ruleNameSfx = (scenario.scenDir(), complikeSfx, likesTableSfx)
        return dict(depends_on=complikeFN,
                    creates=causalStatsFN,
                    mediumRuleNameSfx=ruleNameSfx)

    complikeFile = IDotData(complikeFN)
    causalRows = complikeFile[complikeFile.Pos == selpos]
    causalRowsWithReplica = causalRows.addComputedCols(
        newColNames='replicaNum', newColFn=lambda r: int(r.Chrom))
    causalRowsWithReplica.save(causalStatsFN)
def gatherCausalSnpGdPos(Ddata, thinSfx, scenario, selpos=500000, getio=None):
    """Save, per replica of a non-neutral scenario, the genetic map position of the causal SNP.

    Every row of the scenario's gdMap table whose Pos equals selpos yields one
    output record (Chrom, gdPos), written under the headings
    'replicaNum causalGdPos'.  If getio is true, only the description of the
    input and output files is returned.
    """

    assert not scenario.isNeutral()

    gdMapFN = os.path.join(
        os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir()),
        'gdMap.tsv')
    causalGdPosFN = os.path.join(
        os.path.join(Ddata, 'replicastats' + thinSfx, scenario.scenDir()),
        'causalGdPos.tsv')

    if getio:
        return dict(depends_on=gdMapFN,
                    creates=causalGdPosFN,
                    mediumRuleNameSfx=scenario.scenDir())

    outHeadings = 'replicaNum causalGdPos'
    with IDotData.openForWrite(causalGdPosFN, outHeadings) as outFile:
        gdMap = IDotData(gdMapFN)
        rowsAtCausalSnp = gdMap[gdMap.Pos == selpos]
        for row in rowsAtCausalSnp:
            outFile.writeRecord(row.Chrom, row.gdPos)
Exemple #10
0
def checkTableKey(inFN,
                  cols,
                  comparison='lt',
                  writeCheckedFile=True,
                  tsvOpts={},
                  lineFilter=None,
                  lineFilterCols=(),
                  getio=None):
    """Check that in the given table, record identifiers increase uniformly.

    Each checked record's key is compared with the key of the previous checked
    record; the first failing pair is logged and AssertionError is raised.

    Params:

       inFN - the table to check
       cols - the columns whose tuple should uniformly increase
       comparison - this comparison must be true between each record and the next.
         the comparison is the name of a routine in the operator module.
       writeCheckedFile - if true, a marker file is written after a successful check
       tsvOpts - extra keyword options passed to IDotData when reading inFN
         (only read here, never modified)
       lineFilter - if given, records for which it returns false are skipped
       lineFilterCols - extra columns to load so that lineFilter can use them
       getio - if true, only return the description of inputs and outputs

    Raises:

       AssertionError - if some record and the next checked record do not
         satisfy the comparison.
    """

    cols = tuple(MakeSeq(cols))
    lineFilterCols = tuple(MakeSeq(lineFilterCols))
    checkedFN = Str('$inFN.checked_${comparison}') + Sfx(*cols)
    if getio:
        return dict(depends_on=inFN,
                    creates=checkedFN if writeCheckedFile else (),
                    attrs=dict(piperun_short=True))

    comparisonFunc = getattr(operator, comparison)
    prevRec = None
    loadCols = cols + lineFilterCols

    nskipped = 0
    nchecked = 0
    # NOTE: the names i, inFN, cols, prevRec, comparison and thisRec are
    # referenced from the message template below; keep them as they are.
    for i, r in enumerate(IDotData(inFN, ToLoad=loadCols, **tsvOpts)):
        if lineFilter and not lineFilter(r):
            nskipped += 1
            continue

        thisRec = r[cols] if IsSeq(r) else (r, )
        # Compare only when a previous *checked* record exists.  Testing i > 0
        # instead would compare against None whenever the leading records were
        # all skipped by lineFilter.
        if prevRec is not None and not comparisonFunc(prevRec, thisRec):
            msg = Str(
                'at line $i of $inFN, looking at $cols: $prevRec is not $comparison $thisRec'
            )
            logging.error(msg)
            # Raised explicitly so the check is not stripped under python -O.
            raise AssertionError(msg)
        nchecked += 1
        prevRec = thisRec

    dbg('nchecked nskipped')
    # Honor writeCheckedFile: with it off, no output is declared in the getio
    # description above, so none must be written.
    if writeCheckedFile:
        DumpFile(checkedFN, 'checked ok.')
Exemple #11
0
def gatherCausalSnpGdPos( Ddata, thinSfx, scenario, selpos = 500000, getio = None ):
    """For each replica in the (non-neutral) scenario, record where the causal SNP lies on the genetic map.

    Rows of the scenario's gdMap table with Pos == selpos are written out as
    records (Chrom, gdPos) under the headings 'replicaNum causalGdPos'.
    If getio is true, only the description of inputs and outputs is returned.
    """

    assert not scenario.isNeutral()

    snpStatsDir = os.path.join( Ddata, 'snpStats'+ thinSfx, scenario.scenDir() )
    gdMapFN = os.path.join( snpStatsDir, 'gdMap.tsv' )

    replicaStatsDir = os.path.join( Ddata, 'replicastats'+ thinSfx, scenario.scenDir() )
    causalGdPosFN = os.path.join( replicaStatsDir, 'causalGdPos.tsv' )

    if getio:
        ioSpec = dict( depends_on = gdMapFN, creates = causalGdPosFN )
        ioSpec[ 'mediumRuleNameSfx' ] = scenario.scenDir()
        return ioSpec

    with IDotData.openForWrite( causalGdPosFN, 'replicaNum causalGdPos' ) as causalGdPosFile:
        gdMap = IDotData( gdMapFN )
        atCausalSnp = gdMap.Pos == selpos
        for gdMapRow in gdMap[ atCausalSnp ]:
            causalGdPosFile.writeRecord( gdMapRow.Chrom, gdMapRow.gdPos )
Exemple #12
0
def computeMeanStdWithinGroups(inFN,
                               cols,
                               groupCols,
                               groupsAreContiguous=True,
                               outFN=None,
                               getio=None):
    """Add columns representing mean and std within each group.

    Works from the per-group sums table that computeSumsWithinGroups reports
    it creates for the same arguments.

    Params:

       inFN - the input table
       cols - the column(s) for which mean and std are computed
       groupCols - the column(s) that define the groups
       groupsAreContiguous - passed on to computeSumsWithinGroups
       outFN - the output table; if None, a name is derived from inFN, cols
         and groupCols, in a 'stats' subdirectory
       getio - if true, only return the description of inputs and outputs

    Returns:

       whatever save() of the resulting table returns (or the I/O description
       when getio is true).
    """

    # Same normalization that computeSumsWithinGroups and
    # normalizeColumnsWithinGroups apply.  Without it, `cols + groupCols` below
    # concatenated two strings into one (which was then unpacked character by
    # character) or raised TypeError for a tuple/string mix.
    cols = tuple(MakeSeq(cols))
    groupCols = tuple(MakeSeq(groupCols))

    # NOTE: Dict() is given names of local variables; keep these names.
    sumsFN = GetCreates(computeSumsWithinGroups,
                        **Dict('inFN cols groupCols groupsAreContiguous'))[0]
    if outFN is None:
        outFN = AddFileSubdir('stats',
                              AddFileSfx(inFN, 'meanStd', *(cols + groupCols)))
    if getio:
        return dict(depends_on=sumsFN,
                    creates=outFN,
                    attrs=dict(piperun_short=True))

    return IDotData(sumsFN).addMeanStdCols(cols=cols).save(outFN)
Exemple #13
0
def computeHistograms(inFN, cols, binSizes=None, outFNs=None, getio=None):
    """Compute a histogram of each of the specified columns of the input table.

    Params:

       inFN - the input table
       cols - the column(s) to histogram
       binSizes - the bin size for each column; defaults to .001 for every column
       outFNs - the output file for each column; by default each is derived
         from inFN and the column name, in a 'stats' subdirectory
       getio - if true, only return the description of inputs and outputs

    cols, the bin sizes and the output files must all have the same length.
    """

    cols = tuple(MakeSeq(cols))

    if binSizes is None:
        binSizesHere = (.001, ) * len(cols)
    else:
        binSizesHere = tuple(MakeSeq(binSizes))

    if outFNs is None:
        outFNsHere = [
            AddFileSubdir('stats', AddFileSfx(inFN, 'hist', col))
            for col in cols
        ]
    else:
        outFNsHere = outFNs

    assert len(cols) == len(binSizesHere) == len(outFNsHere)
    if getio:
        return dict(depends_on=inFN, creates=outFNsHere)
    # add histogram combiner

    # All histogrammers are created up front, before anything is read or saved.
    histogrammers = [Histogrammer(binSize=binSize) for binSize in binSizesHere]
    table = IDotData(inFN)
    for histogrammer, col, histFN in zip(histogrammers, cols, outFNsHere):
        histogrammer.addVals(table[col])
        histogrammer.save(histFN)
Exemple #14
0
def computeSumsWithinGroups(inFN,
                            cols,
                            groupCols,
                            groupsAreContiguous=True,
                            outFN=None,
                            getio=None):
    """Summarize the given columns of a tsv file within the groups defined by groupCols,
  as shown in the example below.

  >>> z = IDotData( names = ( 'a', 'b' ), Records = ( ( 1, 2 ), ( 1, 3 ), ( 2, 4 ), ( 2, 5 ) ) )
  >>> computeSumsWithinGroups( inFN = z, cols = 'b', groupCols = 'a', outFN = sys.stdout )
  ... # doctest: +NORMALIZE_WHITESPACE
  a	b_count	b_sum	b_sumSq	b_numNaN
  1	2	5.0	13.0	0
  2	2	9.0	41.0	0

  Params:

     inFN - the input table
     cols - the column(s) to summarize
     groupCols - the column(s) that define the groups
     groupsAreContiguous - passed on to summarizeColumnsWithinGroups
     outFN - the output table; if None, a name is derived from inFN, cols and
       groupCols, in a 'stats' subdirectory
     getio - if true, only return the description of inputs and outputs,
       including a combiner for per-piece outputs
  """

    # NOTE: Dict() below is given names of local variables; keep the names
    # cols, groupCols and groupsAreContiguous.
    groupCols = tuple(MakeSeq(groupCols))
    cols = tuple(MakeSeq(cols))
    if outFN is None:
        sfxParts = cols + groupCols
        outFN = AddFileSubdir('stats', AddFileSfx(inFN, 'sums', *sfxParts))

    def combiner(inFNs, outFN):
        # Merge the summaries in inFNs into one table and save it as outFN.
        mergedSummaries = IDotData.mergeColumnSummaries(iDotDatas=inFNs,
                                                        cols=cols,
                                                        groupCols=groupCols)
        mergedSummaries.save(outFN)

    if getio:
        return {
            'depends_on': inFN,
            'creates': outFN,
            'splitByCols': {inFN: {'keyCols': ()}},
            'combiner': {outFN: combiner},
        }

    summaryArgs = Dict('cols groupCols groupsAreContiguous')
    IDotData(inFN).summarizeColumnsWithinGroups(**summaryArgs).save(outFN)
Exemple #15
0
def DotData2TSV_lowmem(inFN, outFN, readOpts={}, getio=None):
    """Convert the DotData at inFN to a TSV file at outFN.

    readOpts are extra keyword options passed to IDotData when opening inFN
    (they are only read here).  If getio is true, only the description of the
    input and output files is returned.
    """
    if getio:
        return {'depends_on': inFN, 'creates': outFN}

    table = IDotData(Path=inFN, **readOpts)
    table.saveToSV(outFN)
Exemple #16
0
 def combiner(inFNs, outFN):
     """Add up the first record of each table in inFNs and save the total via SaveCount."""
     firstRecords = [next(iter(IDotData(countFN))) for countFN in inFNs]
     total = np.sum(firstRecords)
     SaveCount(total, outFN)
Exemple #17
0
 def SaveCount(count, outFN):
     """Save count to outFN as a table with the single column 'recordCount'."""
     countTable = IDotData(names=('recordCount', ), Records=(count, ))
     countTable.save(outFN)
Exemple #18
0
def localizeSpatiallyByWindows(Ddata, scenario, nreplicas, thinSfx = '', putativeMutPop = None, complikeSfx = '',
                               likesTableSfx = '',
                               threshold = .5, numSNP = 1,
                               minGdInEachDir = .05,
                               fromReplica = None, toReplica = None, getio = None):
    """
    Spatially localize the selected variant for all replicas within a given scenario.
    The approach is to start with the highest-scoring SNP, and move left and right from it in fixed-size windows
    for as long as the windows contain at least 'numSNP' snps with score at least 'threshold'.

    Adapted from Operations.Shari_Operations.localize.hapmap_regions_0615.plotRegions() .

    Scores are the complikeExp values of the replica divided by the mean of its
    five highest complikeExp values.  Windows are centered at offsets
    0, .01, ..., .99 (genetic distance) from the top SNP in each direction and
    include SNPs within .02 of the window center.

    Params:

       Ddata - root data directory
       scenario - the scenario; supplies scenDir() and mutPop
       nreplicas - not used by this function
       thinSfx, complikeSfx, likesTableSfx - file-name suffixes
       putativeMutPop - used in file names; None means scenario.mutPop
       threshold - minimum scaled score for a SNP to count as high-scoring
       numSNP - a window with at most this many high-scoring SNPs may end the walk
       minGdInEachDir - the walk is not ended by high-scoring SNPs that are all
         within this genetic distance of the top SNP
       fromReplica, toReplica - if given, replicas before fromReplica are skipped
         and processing stops at the first replica after toReplica
       getio - if true, only return the description of inputs and outputs
    """

    snpStatsDir = os.path.join( Ddata, 'snpStats'+ thinSfx, scenario.scenDir() )
    replicaStatsDir = os.path.join( Ddata, 'replicastats'+ thinSfx, scenario.scenDir() )
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = ( putativeMutPop, complikeSfx, likesTableSfx )
    complikeFN = os.path.join( snpStatsDir, AddFileSfx( 'complike.data/', *sfxs ) )

    intervalsListFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsWindowsList.tsv', *sfxs ) )

    if getio:
        return dict( depends_on = complikeFN,
                     creates = intervalsListFN,
                     mediumRuleNameSfx = ( scenario.scenDir(), ) + sfxs,
                     fileDescrs =
                     { intervalsListFN:
                           'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                       ' For each replica this table has one or more lines, giving intervals in that replica.' } )

    complike = IDotData( complikeFN )

    # Offsets of the window centers from the top SNP; the same for every replica.
    bins = np.arange(0,1,.01)

    with IDotData.openForWrite( intervalsListFN,
                                'replicaNum gdFrom gdTo gdSize bpFrom bpTo bpSize numPositiveBins '
                                'numSnpsOver_0_2 maxSNP_Pos maxSNP_lik' ) as intervalsListFile:

        for replicaNum, complikeForReplica in complike.groupby( 'Chrom' ):

            if fromReplica is not None and replicaNum < fromReplica: continue
            if toReplica is not None and replicaNum > toReplica: break

            X = complikeForReplica.toDotData()

            # Sort by score; the last rows of Y are the highest-scoring SNPs.
            ind = X.complikeExp.argsort()
            lik = X.complikeExp[ind]
            maxlik = np.mean(lik[-5:])
            like = X.complikeExp / maxlik

            Y = X[ind]
            maxSNP = Y.Pos[-1]

            maxScore = Y.complikeExp[-1]
            relPos = X.gdPos - Y.gdPos[-1]
            X = X.hstack(DotData(Columns = [like,relPos],names=['scaled_like','relPos']))

            # The localized region starts as just the top SNP and is grown below.
            topGdPos = Y.gdPos[-1]
            minPos = Y.Pos[-1]
            maxPos = Y.Pos[-1]
            minGdPos = topGdPos
            maxGdPos = topGdPos
            numPositiveBins = 0

            # NOTE: dbg() is given names/expressions over local variables
            # (including dir, bin, Z and top); keep those names.
            dbg( 'replicaNum minPos maxPos minGdPos maxGdPos' )

            for dir in -1, +1:
                for bin in bins:
                    Z = X[np.abs(X.relPos - dir*bin) <= .02]

                    dbg( 'dir bin len(Z)' )

                    if len(Z) == 0: continue
                    top = Z[Z.scaled_like > threshold]

                    dbg( 'len(top)' )

                    # The distance test yields one boolean per SNP in top.  Used
                    # bare as a condition it was well-defined only for zero or
                    # one SNP, i.e. it could fail once numSNP > 1.  np.any()
                    # gives the same answer for zero or one SNP and, for more,
                    # stops if any of them is beyond minGdInEachDir.
                    # NOTE(review): confirm any-vs-all is the intended rule for numSNP > 1.
                    if len(top) <= numSNP and np.any( np.abs( topGdPos - top.gdPos ) > minGdInEachDir ): break
                    if len( top ) == 0: top = Z
                    if dir == -1:
                        minPos = np.min(top.Pos)
                        minGdPos = np.min(top.gdPos)
                    else:
                        maxPos = np.max(top.Pos)
                        maxGdPos = np.max(top.gdPos)
                    numPositiveBins += 1

            # SNPs strictly inside the localized region
            ind = np.all([X.Pos > minPos, X.Pos < maxPos],axis=0)
            peak = X[ind]

            intervalsListFile.writeRecord( replicaNum, minGdPos, maxGdPos, maxGdPos - minGdPos,
                                           minPos, maxPos, maxPos - minPos,
                                           numPositiveBins, sum(peak.scaled_like > .2),
                                           maxSNP, maxScore )
Exemple #19
0
def localizeSpatiallyBySplineFitting( Ddata, scenario, nreplicas, thinSfx = '',
                                      putativeMutPop = None, complikeSfx = '', likesTableSfx = '',
                                      confidence = .9, minBins = 20, nbins = 200, smoothing = 0.0,
                                      getio = None ):
    """For each replica within a given scenario,
    localize the selected SNP spatially, by fitting a spline to a (smoothed version of) the CMS scores, dividing the
    region into bins, and finding the set of bins that cover 90% (or specified fraction of) area under the spline.

    Four tables are written: the sampled spline, per-bin information, the list
    of localized intervals (runs of adjacent included bins), and per-replica
    interval statistics.

    Params:

       Ddata - root data directory
       scenario - the scenario; supplies scenDir() and mutPop
       nreplicas - not used by this function
       thinSfx, complikeSfx, likesTableSfx - file-name suffixes
       putativeMutPop - used in file names; None means scenario.mutPop

       confidence - the spatially localized region will (hopefully) have this probability of containing the causal SNP;
         here, specifically, this means we'll include bins in the region that collectively cover this fraction of area
         under the posterior density curve.

       minBins - the region will include at least this many of the highest-average bins.

       nbins - the genetic-distance range of each replica is divided into (about) this many bins
       smoothing - passed to the spline fit as its smoothing parameter
       getio - if true, only return the description of inputs and outputs

    """

    snpStatsDir = os.path.join( Ddata, 'snpStats'+ thinSfx, scenario.scenDir() )
    replicaStatsDir = os.path.join( Ddata, 'replicastats'+ thinSfx, scenario.scenDir() )
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = ( putativeMutPop, complikeSfx, likesTableSfx )
    complikeFN = os.path.join( snpStatsDir, AddFileSfx( 'complike.data/', *sfxs ) )

    intervalsListFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsSplineList.tsv', *sfxs ) )
    intervalsStatsFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsSplineStats.tsv', *sfxs ) )

    posteriorSplineFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsSplineSpline.tsv', *sfxs ) )
    binInfoFN = os.path.join( replicaStatsDir, AddFileSfx( 'intervalsSplineBinInfo.tsv', *sfxs ) )

    if getio:
        return dict( depends_on = complikeFN,
                     creates = ( intervalsListFN, intervalsStatsFN, posteriorSplineFN, binInfoFN ),
                     mediumRuleNameSfx = ( scenario.scenDir(), ) + sfxs,
                     fileDescrs = { intervalsListFN:
                                        'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                                    ' For each replica this table has one or more lines, giving intervals in that replica.',
                                    intervalsStatsFN: 'Per-replica statistic about confidence intervals in that replica',
                                    posteriorSplineFN: 'The results of interpolating posterior density as a spline',
                                    binInfoFN: 'Information about individual bins under the spline' } )

    # Only SNPs for which these three statistics are all finite are used.
    complike = IDotData( complikeFN ).filter( lambda r: all( np.isfinite( ( r.iHS, r.meanFst, r.max_xpop ) ) ) )

    binInfoHeadings = 'replicaNum binNum binStart binEnd included binCenters binAvgCMS binMaxCMS binIntegral '\
        'binIntegralNormed binRank'

    # nbins is re-assigned per replica below; remember the requested value.
    nbins_orig = nbins

    with contextlib.nested( IDotData.openForWrite( posteriorSplineFN, 'replicaNum gdPos complikeExp' ),
                            IDotData.openForWrite( binInfoFN, binInfoHeadings ),
                            IDotData.openForWrite( intervalsListFN, 'replicaNum gdFrom gdTo gdSize bpFrom bpTo '
                                                   'bpSize binsInInterval binsArea binsMax binsMaxFrac binsAvg '
                                                   'binsMinRank binsMaxRank' ),
                            IDotData.openForWrite( intervalsStatsFN, 'replicaNum numSegments gdTotLen bpTotLen' ) ) \
                            as ( posteriorSplineFile, binInfoFile, intervalsListFile, intervalsStatsFile ):

        for replicaNum, complikeForReplica in complike.groupby( 'Chrom' ):

            gdMin = np.min( complikeForReplica.gdPos )
            gdMax = np.max( complikeForReplica.gdPos )
            binSize = ( gdMax - gdMin ) / nbins_orig
            binLefts = np.arange( gdMin, gdMax, binSize )
            binCenters = binLefts + binSize / 2
            binRights = binLefts + binSize
            # NOTE: dbg() is given names/expressions over local variables; keep those names.
            dbg( 'len(complikeForReplica) replicaNum len(binLefts) len(binRights) len(binCenters)' )
            nbins = len( binLefts )

            #
            # Compute the mean and max CMS score in each gdPos bin
            #
            binAvgCMS = np.zeros( nbins )
            binMaxCMS = np.zeros( nbins )

            # Each SNP's bin number is its offset from gdMin in units of binSize,
            # capped at the last bin.
            for bin, valsInBin in \
                    complikeForReplica.addCol( 'bin',
                                               map( functools.partial( min, nbins-1 ),
                                                    map( int,
                                                         ( complikeForReplica.gdPos - gdMin ) / binSize ))).groupby('bin'):

                if valsInBin:
                    binAvgCMS[ bin ] = np.mean( valsInBin.complikeExp )
                    binMaxCMS[ bin ] = max( valsInBin.complikeExp )
                else:
                    binAvgCMS[ bin ] = binAvgCMS[ bin-1 ]
                    binMaxCMS[ bin ] = binMaxCMS[ bin-1 ]

            # fit a spline to the function ( binCenter, binAvgCMS ), approximating the posterior probability of a SNP being
            # causal.
            posteriorDensitySpline = interpolate.splrep( binCenters, binAvgCMS, s = smoothing )

            splineX = np.arange( gdMin, gdMax, binSize / 2 )
            for x, y in zip( splineX, interpolate.splev( splineX, posteriorDensitySpline ) ):
                posteriorSplineFile.writeRecord( replicaNum, x, y )

            # compute the integral under the interpolated function, for each bin.
            # Bins are indexed by position.  This used to go through a
            # zero-initialised bin-number array that was filled in only for bins
            # seen in the loop above, so the integral of every other bin was
            # stored into slot 0, overwriting bin 0's own integral.
            binIntegral = np.zeros( nbins )
            for binNum, ( binLeft, binRight ) in enumerate( zip( binLefts, binRights ) ):
                binIntegral[ binNum ] = interpolate.splint( binLeft, binRight, posteriorDensitySpline )

            # normalize the integral value above each bin so that the total posterior density of being causal
            # integrates to 1.0
            binIntegralNormed = binIntegral / sum( binIntegral )

            # bins from largest to smallest integral; binRank[ b ] is the place of bin b in that order
            binsByIntegralSize = binIntegral.argsort()[::-1]

            binRank = np.zeros( nbins )
            for i in range( nbins ):
                binRank[ binsByIntegralSize[ i ] ] = i

            # Include bins, largest integral first, until both the requested
            # coverage and the minimum number of bins have been reached.
            binsToUse = np.zeros( nbins, dtype = bool )
            binsIncluded = 0
            fractionCovered = 0.0
            for bin in binsByIntegralSize:
                if fractionCovered >= confidence and binsIncluded >= minBins: break
                binsToUse[ bin ] = True
                binsIncluded += 1
                fractionCovered += binIntegralNormed[ bin ]

            # list the confidence intervals for this replica,
            # merging adjacent intervals.

            # maps a genetic-map position to a basepair position by interpolation
            gd2bp = interpolate.interp1d( complikeForReplica.gdPos, complikeForReplica.Pos )
            gdMax = max( complikeForReplica.gdPos )

            binInfo = IDotData( names = binInfoHeadings,
                                Columns = ( itertools.repeat( replicaNum, nbins ),
                                            range( nbins ),
                                            binLefts, binRights, binsToUse,
                                            binCenters, binAvgCMS, binMaxCMS, binIntegral, binIntegralNormed, binRank
                                            ) )

            binInfoFile.writeRecords( binInfo )

            binsOverallMaxCMS = max( binInfo.binMaxCMS )

            gdTotLen = 0.0
            bpTotLen = 0
            numSegments = 0
            for included, bins in binInfo.groupby( 'included' ):
                if included:
                    gdFrom = min( bins.binStart )
                    # clamp to gdMax so that gd2bp is not asked about a position past the data
                    gdTo = min( max( bins.binEnd ), gdMax )
                    bpFrom = int( gd2bp( gdFrom ) )
                    bpTo = int( gd2bp( gdTo ) )

                    gdTotLen += ( gdTo - gdFrom )
                    bpTotLen += ( bpTo - bpFrom )
                    numSegments += 1

                    intervalsListFile.writeRecord( replicaNum, gdFrom, gdTo, gdTo - gdFrom, bpFrom, bpTo, bpTo - bpFrom,
                                                   len( bins ), sum( bins.binIntegralNormed ),
                                                   max( bins.binMaxCMS ), max( bins.binMaxCMS ) / binsOverallMaxCMS,
                                                   np.mean( bins.binAvgCMS ),
                                                   min( bins.binRank ), max( bins.binRank ) )

            assert numSegments > 0 and bpTotLen > 0 and gdTotLen > 0.0
            intervalsStatsFile.writeRecord( replicaNum, numSegments, gdTotLen, bpTotLen )
            dbg( 'replicaNum numSegments gdTotLen bpTotLen' )
def localizeSpatiallyByWindows(Ddata,
                               scenario,
                               nreplicas,
                               thinSfx='',
                               putativeMutPop=None,
                               complikeSfx='',
                               likesTableSfx='',
                               threshold=.5,
                               numSNP=1,
                               minGdInEachDir=.05,
                               fromReplica=None,
                               toReplica=None,
                               getio=None):
    """Spatially localize the selected variant for all replicas within a given scenario.

    The approach is to start with the highest-scoring SNP, and move left and right from it in fixed-size windows
    for as long as the windows contain at least 'numSNP' snps with score at least 'threshold'.

    Adapted from Operations.Shari_Operations.localize.hapmap_regions_0615.plotRegions() .

    Params:

       Ddata - root directory; input is read from 'snpStats<thinSfx>/<scenDir>' and output is written to
          'replicastats<thinSfx>/<scenDir>' under it.
       scenario - the scenario; its scenDir() and mutPop are used.
       nreplicas - not used in the body of this function.
       thinSfx - suffix appended to the 'snpStats' and 'replicastats' directory names.
       putativeMutPop - the putatively selected population; defaults to scenario.mutPop when None.
       complikeSfx, likesTableSfx - suffixes that (with putativeMutPop) are added to the input/output file names.
       threshold - cutoff on the scaled score, i.e. complikeExp divided by the mean of the five highest
          complikeExp values in the replica.
       numSNP, minGdInEachDir - extension in a direction stops at a window that has at most numSNP SNPs above
          the threshold and lies farther than minGdInEachDir (genetic distance) from the top SNP.
       fromReplica, toReplica - if given, replicas numbered below fromReplica are skipped and processing stops
          at the first replica numbered above toReplica.
       getio - if true, only return the description of inputs and outputs without doing any work.

    Output: one record per processed replica in the intervalsWindowsList table.
    """

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir())
    replicaStatsDir = os.path.join(Ddata, 'replicastats' + thinSfx,
                                   scenario.scenDir())
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = (putativeMutPop, complikeSfx, likesTableSfx)
    complikeFN = os.path.join(snpStatsDir, AddFileSfx('complike.data/', *sfxs))

    intervalsListFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsWindowsList.tsv', *sfxs))

    if getio:
        return dict(
            depends_on=complikeFN,
            creates=intervalsListFN,
            mediumRuleNameSfx=(scenario.scenDir(), ) + sfxs,
            fileDescrs={
                intervalsListFN:
                'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                ' For each replica this table has one or more lines, giving intervals in that replica.'
            })

    complike = IDotData(complikeFN)

    # Offsets (in genetic distance) of the window centers relative to the top SNP;
    # the same offsets are used for every replica and in both directions.
    bins = np.arange(0, 1, .01)

    with IDotData.openForWrite(
            intervalsListFN,
            'replicaNum gdFrom gdTo gdSize bpFrom bpTo bpSize numPositiveBins '
            'numSnpsOver_0_2 maxSNP_Pos maxSNP_lik') as intervalsListFile:

        for replicaNum, complikeForReplica in complike.groupby('Chrom'):

            if fromReplica is not None and replicaNum < fromReplica: continue
            if toReplica is not None and replicaNum > toReplica: break

            X = complikeForReplica.toDotData()

            # Scale the scores by the mean of the five highest scores in this replica.
            ind = X.complikeExp.argsort()
            lik = X.complikeExp[ind]
            maxlik = np.mean(lik[-5:])
            like = X.complikeExp / maxlik

            # Y is X sorted by ascending score, so its last row is the top-scoring SNP.
            Y = X[ind]
            maxSNP = Y.Pos[-1]
            maxScore = Y.complikeExp[-1]
            topGdPos = Y.gdPos[-1]

            relPos = X.gdPos - topGdPos
            X = X.hstack(
                DotData(Columns=[like, relPos],
                        names=['scaled_like', 'relPos']))

            # The interval starts as the single top SNP and is grown window by window.
            minPos = Y.Pos[-1]
            maxPos = Y.Pos[-1]
            minGdPos = topGdPos
            maxGdPos = topGdPos
            numPositiveBins = 0

            dbg('replicaNum minPos maxPos minGdPos maxGdPos')

            for dir in -1, +1:
                for bin in bins:
                    # SNPs within .02 (genetic distance) of the current window center
                    Z = X[np.abs(X.relPos - dir * bin) <= .02]

                    dbg('dir bin len(Z)')

                    if len(Z) == 0: continue
                    top = Z[Z.scaled_like > threshold]

                    dbg('len(top)')

                    # NOTE(review): top.gdPos is a column, so the distance test below
                    # looks like it yields an array rather than a single boolean; with
                    # zero or several rows in `top` its truth value may be
                    # empty/ambiguous.  Confirm the intended reduction before changing.
                    if len(top) <= numSNP and np.abs(
                            topGdPos - top.gdPos) > minGdInEachDir:
                        break
                    # No SNP over the threshold here: extend using all SNPs in the window.
                    if len(top) == 0: top = Z
                    if dir == -1:
                        minPos = np.min(top.Pos)
                        minGdPos = np.min(top.gdPos)
                    else:
                        maxPos = np.max(top.Pos)
                        maxGdPos = np.max(top.gdPos)
                    numPositiveBins += 1

            # SNPs strictly inside the final interval
            ind = np.all([X.Pos > minPos, X.Pos < maxPos], axis=0)
            peak = X[ind]

            intervalsListFile.writeRecord(replicaNum, minGdPos, maxGdPos,
                                          maxGdPos - minGdPos, minPos, maxPos,
                                          maxPos - minPos, numPositiveBins,
                                          sum(peak.scaled_like > .2), maxSNP,
                                          maxScore)
def evalSpatialLoc(Ddata,
                   thinSfx,
                   scenario,
                   putativeMutPop,
                   nreplicas,
                   complikeSfx='',
                   likesTableSfx='',
                   selpos=500000,
                   whichSpatialLoc='Spline',
                   getio=None):
    """Evaluate spatial localization.  Compute relevant replica statistic.  For each replica,
    compute: whether the localized intervals include the causal SNP; statistics about the
    localized intervals; the position of the causal SNP relative to the localized intervals.

    Params:

       Ddata - root directory; all files are under 'replicastats<thinSfx>/<scenDir>' below it.
       thinSfx - suffix appended to the 'replicastats' directory name.
       scenario - the scenario; must not be neutral.
       putativeMutPop - the putatively selected population; defaults to scenario.mutPop when None.
       nreplicas - not used in the body of this function.
       complikeSfx, likesTableSfx - suffixes that (with putativeMutPop) are part of the file names.
       selpos - base-pair position compared against each interval's bpFrom/bpTo as the causal SNP position.
       whichSpatialLoc - selects which localization to evaluate: reads 'intervals<whichSpatialLoc>List.tsv'
          and writes 'spatialLocEval<whichSpatialLoc>.tsv'.
       getio - if true, only return the description of inputs and outputs without doing any work.

    Output: one record per replica with the number of intervals, their total length in bp and in genetic
    distance, whether the causal SNP is included, and the smallest distance (bp and genetic) from the causal
    SNP to any interval boundary.
    """

    assert not scenario.isNeutral()

    replicaStatsDir = os.path.join(Ddata, 'replicastats' + thinSfx,
                                   scenario.scenDir())
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = (putativeMutPop, complikeSfx, likesTableSfx)

    intervalsListFN = os.path.join(
        replicaStatsDir,
        AddFileSfx('intervals%sList.tsv' % whichSpatialLoc, *sfxs))
    causalGdPosFN = os.path.join(replicaStatsDir, 'causalGdPos.tsv')
    spatialLocEvalFN = os.path.join(
        replicaStatsDir,
        AddFileSfx('spatialLocEval%s.tsv' % whichSpatialLoc, *sfxs))

    if getio:
        return dict(depends_on=(intervalsListFN, causalGdPosFN),
                    creates=spatialLocEvalFN,
                    mediumRuleNameSfx=scenario.scenDir(),
                    name='evalSpatialLoc_' + whichSpatialLoc)

    with IDotData.openForWrite(
            spatialLocEvalFN,
            'replicaNum numIntervals totLenBp totLenGd causalIncluded '
            'distanceToIntervalBoundaryBp distanceToIntervalBoundaryGd'
    ) as spatialLocEvalFile:

        # Walk, in lockstep, the intervals grouped by replica and the causal positions
        # merged with the list of replicas that have intervals; the replica numbers
        # coming from the different sources are cross-checked below.
        intervalsByReplica = IDotData(intervalsListFN).groupby('replicaNum')
        causalPositions = IDotData.merge(
            iDotDatas=(IDotData(causalGdPosFN),
                       IDotData(intervalsListFN).replicaNum.removeDups()),
            cols=('replicaNum', 'replicaNum'))

        for (replicaNum2, replicaIntervals), (replicaNum3, causalGdPos, replicaNum4) in \
                zip(intervalsByReplica, causalPositions):

            replicaNum2, replicaNum3, replicaNum4 = list(
                map(int, (replicaNum2, replicaNum3, replicaNum4)))
            if not replicaNum2 == replicaNum3 == replicaNum4:
                # Only names that exist in this function are listed here (the
                # original also named complikeFN, which is not defined here).
                dbg('replicaNum2 replicaNum3 replicaNum4 intervalsListFN causalGdPosFN spatialLocEvalFN'
                    )
            assert replicaNum2 == replicaNum3 == replicaNum4

            causalIncluded = False
            totLenBp = 0
            totLenGd = 0.0
            for replicaInterval in replicaIntervals:
                dbg('replicaInterval causalGdPos')
                # Containment must agree between genetic-distance and base-pair coordinates.
                assert (replicaInterval.gdFrom <= causalGdPos <=
                        replicaInterval.gdTo) == (replicaInterval.bpFrom <=
                                                  selpos <=
                                                  replicaInterval.bpTo)
                if replicaInterval.gdFrom <= causalGdPos <= replicaInterval.gdTo:
                    causalIncluded = True

                totLenGd += (replicaInterval.gdTo - replicaInterval.gdFrom)
                totLenBp += (replicaInterval.bpTo - replicaInterval.bpFrom)

            spatialLocEvalFile.writeRecord(
                replicaNum2, len(replicaIntervals), totLenBp, totLenGd,
                int(causalIncluded),
                np.min((np.min(np.abs(replicaIntervals.bpFrom - selpos)),
                        np.min(np.abs(replicaIntervals.bpTo - selpos)))),
                np.min((np.min(np.abs(replicaIntervals.gdFrom - causalGdPos)),
                        np.min(np.abs(replicaIntervals.gdTo - causalGdPos)))))
def localizeSpatiallyBySplineFitting(Ddata,
                                     scenario,
                                     nreplicas,
                                     thinSfx='',
                                     putativeMutPop=None,
                                     complikeSfx='',
                                     likesTableSfx='',
                                     confidence=.9,
                                     minBins=20,
                                     nbins=200,
                                     smoothing=0.0,
                                     getio=None):
    """For each replica within a given scenario,
    localize the selected SNP spatially, by fitting a spline to a (smoothed version of) the CMS scores, dividing the
    region into bins, and finding the set of bins that cover 90% (or specified fraction of) area under the spline.

    Params:

       Ddata - root directory; input is read from 'snpStats<thinSfx>/<scenDir>' and outputs are written to
         'replicastats<thinSfx>/<scenDir>' under it.

       scenario - the scenario; its scenDir() and mutPop are used.

       nreplicas - not used in the body of this function.

       thinSfx - suffix appended to the 'snpStats' and 'replicastats' directory names.

       putativeMutPop - the putatively selected population; defaults to scenario.mutPop when None.

       complikeSfx, likesTableSfx - suffixes that (with putativeMutPop) are added to the file names.

       confidence - the spatially localized region will (hopefully) have this probability of containing the causal SNP;
         here, specifically, this means we'll include bins in the region that collectively cover this fraction of area
         under the posterior density curve.

       minBins - the region will include at least this many of the highest-average bins.

       nbins - the number of equal-width genetic-distance bins the replica's range is divided into
         (the actual count is taken from the generated bin edges).

       smoothing - passed as the 's' argument to scipy.interpolate.splrep.

       getio - if true, only return the description of inputs and outputs without doing any work.

    Outputs (four tables): the sampled spline, per-bin information, the list of intervals, and
    per-replica interval statistics.
    """

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir())
    replicaStatsDir = os.path.join(Ddata, 'replicastats' + thinSfx,
                                   scenario.scenDir())
    if putativeMutPop is None: putativeMutPop = scenario.mutPop
    sfxs = (putativeMutPop, complikeSfx, likesTableSfx)
    complikeFN = os.path.join(snpStatsDir, AddFileSfx('complike.data/', *sfxs))

    intervalsListFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsSplineList.tsv', *sfxs))
    intervalsStatsFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsSplineStats.tsv', *sfxs))

    posteriorSplineFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsSplineSpline.tsv', *sfxs))
    binInfoFN = os.path.join(replicaStatsDir,
                             AddFileSfx('intervalsSplineBinInfo.tsv', *sfxs))

    if getio:
        return dict(
            depends_on=complikeFN,
            creates=(intervalsListFN, intervalsStatsFN, posteriorSplineFN,
                     binInfoFN),
            mediumRuleNameSfx=(scenario.scenDir(), ) + sfxs,
            fileDescrs={
                intervalsListFN:
                'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                ' For each replica this table has one or more lines, giving intervals in that replica.',
                intervalsStatsFN:
                'Per-replica statistic about confidence intervals in that replica',
                posteriorSplineFN:
                'The results of interpolating posterior density as a spline',
                binInfoFN:
                'Information about individual bins under the spline'
            })

    # Only SNPs whose component scores are all finite are used.
    complike = IDotData(complikeFN).filter(
        lambda r: all(np.isfinite((r.iHS, r.meanFst, r.max_xpop))))

    binInfoHeadings = 'replicaNum binNum binStart binEnd included binCenters binAvgCMS binMaxCMS binIntegral '\
        'binIntegralNormed binRank'
    intervalsListHeadings = 'replicaNum gdFrom gdTo gdSize bpFrom bpTo '\
        'bpSize binsInInterval binsArea binsMax binsMaxFrac binsAvg '\
        'binsMinRank binsMaxRank'

    # The name nbins is rebound per replica below; keep the requested number of bins.
    nbins_orig = nbins

    # A multi-item `with` replaces contextlib.nested, which does not exist in Python 3.
    with IDotData.openForWrite(posteriorSplineFN, 'replicaNum gdPos complikeExp') as posteriorSplineFile, \
            IDotData.openForWrite(binInfoFN, binInfoHeadings) as binInfoFile, \
            IDotData.openForWrite(intervalsListFN, intervalsListHeadings) as intervalsListFile, \
            IDotData.openForWrite(intervalsStatsFN, 'replicaNum numSegments gdTotLen bpTotLen') as intervalsStatsFile:

        for replicaNum, complikeForReplica in complike.groupby('Chrom'):

            gdMin = np.min(complikeForReplica.gdPos)
            gdMax = np.max(complikeForReplica.gdPos)
            binSize = (gdMax - gdMin) / nbins_orig
            binLefts = np.arange(gdMin, gdMax, binSize)
            binCenters = binLefts + binSize / 2
            binRights = binLefts + binSize
            dbg('len(complikeForReplica) replicaNum len(binLefts) len(binRights) len(binCenters)'
                )
            nbins = len(binLefts)

            #
            # Compute the mean and max CMS score in each gdPos bin
            #
            binAvgCMS = np.zeros(nbins)
            binMaxCMS = np.zeros(nbins)

            # Bin index of each SNP; the SNP at gdMax is clamped into the last bin.
            snpBins = [min(nbins - 1, int(v))
                       for v in (complikeForReplica.gdPos - gdMin) / binSize]

            # NOTE(review): if groupby only yields bins that contain SNPs, the else
            # branch never runs and bins with no SNPs keep an average/max of 0 -- confirm.
            for binNum, valsInBin in \
                    complikeForReplica.addCol('bin', snpBins).groupby('bin'):

                if valsInBin:
                    binAvgCMS[binNum] = np.mean(valsInBin.complikeExp)
                    binMaxCMS[binNum] = max(valsInBin.complikeExp)
                else:
                    binAvgCMS[binNum] = binAvgCMS[binNum - 1]
                    binMaxCMS[binNum] = binMaxCMS[binNum - 1]

            # fit a spline to the function ( binCenter, binAvgCMS ), approximating the posterior probability of a SNP being
            # causal.
            posteriorDensitySpline = interpolate.splrep(binCenters,
                                                        binAvgCMS,
                                                        s=smoothing)

            # Record the fitted spline, sampled at half-bin steps.
            splineX = np.arange(gdMin, gdMax, binSize / 2)
            for x, y in zip(splineX,
                            interpolate.splev(splineX,
                                              posteriorDensitySpline)):
                posteriorSplineFile.writeRecord(replicaNum, x, y)

            # compute the integral under the interpolated function, for each bin.
            # Bins are indexed by position: the earlier lookup through a per-bin
            # number array stored the integral of any bin not seen in the groupby
            # above into bin 0.
            binIntegral = np.zeros(nbins)
            for binNum, (binLeft, binRight) in enumerate(zip(binLefts, binRights)):
                binIntegral[binNum] = interpolate.splint(
                    binLeft, binRight, posteriorDensitySpline)

            # normalize the integral value above each bin so that the total posterior density of being causal
            # integrates to 1.0
            binIntegralNormed = binIntegral / sum(binIntegral)

            # bins ordered from largest to smallest integral; binRank[b] is the
            # position of bin b in that order.
            binsByIntegralSize = binIntegral.argsort()[::-1]

            binRank = np.zeros(nbins)
            for rank, binNum in enumerate(binsByIntegralSize):
                binRank[binNum] = rank

            # Take bins in decreasing order of integral until both the requested
            # coverage and the minimum number of bins are reached.
            binsToUse = np.zeros(nbins, dtype=bool)
            binsIncluded = 0
            fractionCovered = 0.0
            for binNum in binsByIntegralSize:
                if fractionCovered >= confidence and binsIncluded >= minBins:
                    break
                binsToUse[binNum] = True
                binsIncluded += 1
                fractionCovered += binIntegralNormed[binNum]

            # list the confidence intervals for this replica,
            # merging adjacent intervals.

            # maps genetic-distance position to base-pair position by linear interpolation
            gd2bp = interpolate.interp1d(complikeForReplica.gdPos,
                                         complikeForReplica.Pos)

            binInfo = IDotData(names=binInfoHeadings,
                               Columns=(itertools.repeat(replicaNum, nbins),
                                        list(range(nbins)), binLefts,
                                        binRights, binsToUse, binCenters,
                                        binAvgCMS, binMaxCMS, binIntegral,
                                        binIntegralNormed, binRank))

            binInfoFile.writeRecords(binInfo)

            binsOverallMaxCMS = max(binInfo.binMaxCMS)

            gdTotLen = 0.0
            bpTotLen = 0
            numSegments = 0
            for included, bins in binInfo.groupby('included'):
                if included:
                    gdFrom = min(bins.binStart)
                    # the last bin's right edge may overshoot the data range
                    gdTo = min(max(bins.binEnd), gdMax)
                    bpFrom = int(gd2bp(gdFrom))
                    bpTo = int(gd2bp(gdTo))

                    gdTotLen += (gdTo - gdFrom)
                    bpTotLen += (bpTo - bpFrom)
                    numSegments += 1

                    intervalsListFile.writeRecord(
                        replicaNum, gdFrom, gdTo, gdTo - gdFrom, bpFrom, bpTo,
                        bpTo - bpFrom, len(bins), sum(bins.binIntegralNormed),
                        max(bins.binMaxCMS),
                        max(bins.binMaxCMS) / binsOverallMaxCMS,
                        np.mean(bins.binAvgCMS), min(bins.binRank),
                        max(bins.binRank))

            assert numSegments > 0 and bpTotLen > 0 and gdTotLen > 0.0
            intervalsStatsFile.writeRecord(replicaNum, numSegments, gdTotLen,
                                           bpTotLen)
            dbg('replicaNum numSegments gdTotLen bpTotLen')
Exemple #23
0
def VStackDotDataFiles(inFiles, outFile, getio=None):
    """Stack the given DotData or tsv files vertically and save the result to outFile.

    When getio is true, only the description of inputs and outputs is returned.
    """

    if getio:
        return {'depends_on': inFiles, 'creates': outFile}

    tables = (IDotData(inFile) for inFile in inFiles)
    stacked = IDotData.vstackFromIterable(tables)
    stacked.save(outFile)
Exemple #24
0
def DefineRulesTo_runSims(pr,
                          mutAges,
                          mutPops,
                          mutFreqs,
                          nreplicas,
                          allPops=None,
                          Ddata='../Data/Ilya_Data/sim/sfs/working/pardis2',
                          simsOut='simsOut',
                          suffix='',
                          shortSimTime=True,
                          DdataSeeds='',
                          useGenMap=None,
                          includeNeutral=True,
                          withGeneConvBug=False,
                          withNewCosi=False,
                          withCosi=None,
                          DdataMimic=None):
    """Instantiate, for each combination of ( mutAge, mutPop, mutFreq ), the script that creates simulation parameters
    for simulations with that selected-mutation-age.

    One rule (named 'RunOneSim', running runOneSim.pl) is added to `pr` for every replica of every scenario.

    Params:

       pr - the object to which rules are added through its addRule() method.
       mutAges, mutPops, mutFreqs - scenario parameters (single values or sequences), passed to GetScenarios.
       nreplicas - number of replicas for each scenario.
       allPops - populations for which '.hap-N' and '.pos-N' targets are declared; defaults to mutPops.
       Ddata - root data directory; simulation output goes under Ddata/<simsOut><suffix>.
       simsOut - name of the simulations output directory under Ddata.
       suffix - suffix for the output directory and the parameter file/directory names.
       shortSimTime - if true, the rule gets the attribute piperun_short = True.
       DdataSeeds - if non-empty, per-replica seeds and mutation rate are read from
          <DdataSeeds>/replicastats/<scenDir>/simSeeds.tsv and passed on the command line.
          Cannot be combined with DdataMimic.
       useGenMap - if given, passed as --useGenMap and declared as a source in place of the default
          recombination parameter files.
       includeNeutral - passed to GetScenarios.
       withGeneConvBug, withNewCosi, withCosi - add the corresponding command-line options.
       DdataMimic - if given, the .model and .mut files of the matching replica under <DdataMimic>/simsOut
          are declared as sources and passed as --useGenMap / --useMutRateFile.
    """

    assert not (DdataSeeds and DdataMimic)

    mutPops = MakeSeq(mutPops)
    mutAges = MakeSeq(mutAges)
    mutFreqs = MakeSeq(mutFreqs)

    if allPops is None: allPops = mutPops

    Dsims = Ddata + '/' + simsOut + suffix
    recombDir = '../Data/Ilya_Data/sim/sfs/working/pardis2'

    for scen in GetScenarios(
            **Dict('mutAges mutPops mutFreqs includeNeutral')):
        if DdataSeeds:
            seeds = IDotData(
                os.path.join(DdataSeeds, 'replicastats', scen.scenDir(),
                             'simSeeds.tsv'))

        for replicaNum, seedsLine in zip(
                range(nreplicas),
                seeds if DdataSeeds else itertools.repeat(None, nreplicas)):

            assert not DdataSeeds or seedsLine.replicaNum == replicaNum

            neutral = scen.isNeutral()
            replicaBase = '%d_%s' % (replicaNum, scen.scenName())
            pfx = os.path.join(Dsims, scen.scenDir(), replicaBase)

            attrs = Dict('replicaNum', scenDir=scen.scenDir())
            if not neutral:
                attrs.update(mutAge=scen.mutAge,
                             mutPop=scen.mutPop,
                             mutFreq=scen.mutFreq)
            else:
                attrs.update(mutAge=0, mutPop=0, mutFreq=0)
            if shortSimTime: attrs['piperun_short'] = True

            mutAge = '%dky' % (0 if neutral else scen.mutAge)

            useGenMapFile = os.path.join(
                DdataMimic, 'simsOut', scen.scenDir(),
                replicaBase + '.model') if DdataMimic else ''
            useMutRateFile = os.path.join(
                DdataMimic, 'simsOut', scen.scenDir(),
                replicaBase + '.mut') if DdataMimic else ''

            #
            # Targets: each entry below is appended to pfx.
            #
            exts = ['.model', '.mut', '.cosiParams']
            if not useGenMap:
                exts.append('.recombParams')
            exts += ['.%s-%d' % (hapOrPos, pop)
                     for hapOrPos in ('hap', 'pos') for pop in allPops]
            if not (withNewCosi or withCosi):
                treeinfoFiles = ('regions.tsv', 'mutlist.tsv', 'nodes.dat') \
                    + (() if neutral else ('sweepinfo.tsv', ))
                # NOTE(review): these are already full paths under Dsims, yet they
                # also get pfx prepended below like the plain extensions; this is
                # kept as in the original -- confirm it is intended.
                exts += [os.path.join(Dsims, scen.scenDir(), 'treeinfo',
                                      '%s.%s' % (replicaBase, which))
                         for which in treeinfoFiles]
            targets = [pfx + ext for ext in exts]

            #
            # Sources
            #
            if neutral:
                paramsFN = 'params_neutral' + suffix
            else:
                paramsFN = 'params%s/%s/params_%s' % (suffix, mutAge,
                                                     scen.scenName())
            sources = [Ddata + '/' + paramsFN]
            if useGenMap:
                sources.append(useGenMap)
            else:
                sources += [recombDir + '/recParams_bestfit_generic',
                            recombDir + '/autosomes_decode.distr']
            if DdataMimic:
                sources += [useGenMapFile, useMutRateFile]

            #
            # Command line
            #
            script = 'perl ../Operations/Ilya_Operations/sim/sfs/working/pardis2/' \
                'runOneSim.pl'
            if DdataSeeds:
                # int() rather than long(): long() does not exist in Python 3, and
                # int() handles arbitrarily large values.
                script += ' --coalSeed %ld --recombSeed %ld --useMutRate %s' % (
                    int(seedsLine.coalescentSeed),
                    int(seedsLine.recombSeed),
                    seedsLine.GetStrItem('mutRate'))
            if useGenMap:
                script += ' --useGenMap ' + useGenMap
            if withGeneConvBug:
                script += ' --withGeneConversionBug'
            if withNewCosi:
                script += ' --withNewCosi'
            if withCosi:
                script += ' --withCosi ' + withCosi
            if DdataMimic:
                script += ' --useGenMap ' + useGenMapFile \
                    + ' --useMutRateFile ' + useMutRateFile

            commands = ' '.join((script, scen.scenName(), mutAge,
                                 str(replicaNum), Ddata, Dsims, suffix))

            pr.addRule(targets=targets,
                       sources=sources,
                       commands=commands,
                       name='RunOneSim',
                       attrs=attrs,
                       comment='Adding simulation',
                       mediumRuleNameSfx=(scen.scenName(), mutAge, replicaNum))
Exemple #25
0
 def combiner(inFNs, outFN):
     """Merge the column summaries of inFNs with IDotData.mergeColumnSummaries and save them to outFN."""
     mergeArgs = dict(iDotDatas=inFNs, cols=cols, groupCols=groupCols)
     summaries = IDotData.mergeColumnSummaries(**mergeArgs)
     summaries.save(outFN)
Exemple #26
0
def mergeSims(scenario, Ddata, posFileFN=None, simsOut='simsOut', nreplicas=100, thinExt='', thinSfx='',
              putativeMutPop=None, outFile=None,
              pop2name=pop2name, statsSfx='', ihsSfx='',
              limitToPop=None,
              getio=None):
    """Gathers per-SNP information, for all replicas of a given scenario, and outputs it in a single table where each line
    gives info for one SNP.

    Specifically, reads simulation and Sweep output, collects columns needed for composite likehood test (chrom, base pair position, genetic
    distance, anc frequencies for 3 populations, xpop for each pair, and ihs, iHH_A and iHH_D for selected population)

    Input params:

       scenario - an object of class Scenario, indicating the simulation scenario (either neutral or a selection scenario)
           from which all replicas were simulated.
       nreplicas - the number of replicas simulated under this scenario.
          Each replica represents a chromosome region, with a set of SNPs on it.
          Rows of the iHS and XP-EHH tables are read only while their Chrom value is below nreplicas.

       Ddata - the directory under which the simulations and the Sweep analysis results live.
         Under this directory we expect to find:
             iHS analysis results, under power_ihs/
             XP-EHH analysis results, under power_xpop
             simulation output giving SNP positions

       posFileFN - the table of SNP positions and frequencies; defaults to the 'mergedPosStacked.tsv' file
          (with suffixes) in the scenario's snpStats directory.

       simsOut - accepted for interface compatibility; not used in the body of this function.

       thinExt - the extension appended to simulation files that describe the SNPs in the simulated replica.
          Sometimes we create simulations and then thin them under different thinning models (to simulate SNP ascertainment
          by the various stages of HapMap; these differently thinned versions of the same simulations might be stored in
          simulation files with different extensions.  (Not used in the body of this function.)

       thinSfx - the suffix appended to the power_ihs and power_xpop directory names, telling where to find iHS and XP-EHH
          analyses of the simulations.   When we analyze the same simulations after applying different thinning scenarios,
          the iHS and XP-EHH analyses for each thinning scenario go into a separate set of directories.

       putativeMutPop - the population in which, we think, selection is occurring, aka "putatively selected population".
          In practice, when localizing a given region, we will usually suspect that selection has occurred in a particular
          population.   When doing a genome-wide scan, we can do several scans assuming each population in turn to be
          the selected population, and find regions selected in that population.
          Defaults to scenario.mutPop when None.

       outFile - where to write the merged table; defaults to the 'merged.tsv' file (with suffixes) in the
          scenario's snpStats directory.

       pop2name - map from population number to population name.

       statsSfx, ihsSfx - suffixes that are part of the input/output file names.

       limitToPop - if given, only population pairs that include this population name are used.

       getio - if true, only return the description of inputs and outputs without doing any work.

    Output:

        The merged table (see outFile), where each line gives info for one SNP.  Columns of the merged data
        (type of data is float unless stated otherwise); note that only the columns listed in `useCols`
        in the code are saved:

            CHROM_POS 1 - physical (basepair) position of the SNP within its replica.
               Note that one merged file contains SNPs for a set of replicas (all for the same scenario),
               so there could be multiple SNPs with the same position.  The replica number
               is given in the Chrom column.
            FREQ1 1 - derived allele frequency in pop 1 ( European )
            FREQ1 4 - derived allele frequency in pop 4 ( EastAsian )
            FREQ1 5 - derived allele frequency in pop 5 ( WestAfrican )

            R AllEHH logratio Deviation European_WestAfrican - XP-EHH score to the right of the SNP,
               between European and WestAfrican pops, normalized to the neutral background.
               Analogously for the next five columns:
            L AllEHH logratio Deviation European_WestAfrican
            R AllEHH logratio Deviation EastAsian_European
            L AllEHH logratio Deviation EastAsian_European
            R AllEHH logratio Deviation EastAsian_WestAfrican
            L AllEHH logratio Deviation EastAsian_WestAfrican

            SNP pos (cM) European_WestAfrican - genetic map position of this SNP, within its replica.
               (the European_WestAfrican suffix is irrelevant).
            SNP pos (bases) European_WestAfrican - physical (basepair) position of this SNP within its replica.
               (the European_WestAfrican suffix is irrelevant).
            Chrom European_WestAfrican - the replica from which this SNP comes; can be nan.
               (the European_WestAfrican suffix is irrelevant)
            Chrom - the replica from which this SNP comes; can be nan
            SNP pos (bases) - physical (basepair) position of this SNP within its replica.
            SNP pos (cM) - genetic map position of this SNP within its replica
            Both iHH_A - sum of iHH_A for both directions from this SNP
            Both iHH_D - sum of iHH_D for both directions from this SNP
            Both iHS - the value in 'Both Unstandardised iHS' (below), but binned by derived allele frequency
               and normalized within the bin.
            Left iHH_D - iHH_D to the left of the SNP (the raw integral value).  analogously for the next three.
            Right iHH_D
            Left iHH_A
            Right iHH_A
            Both Unstandardised iHS - log( (iHH_A_left + iHH_A_right) / ( iHH_D_left + iHH_D_right ) )
               ( see also 'Both iHS' column for the standardized iHS score )

    """

    if not Ddata.endswith('/'): Ddata += '/'

    assert nreplicas > 0
    dbg('pop2name')

    scenDir = scenario.scenDir()

    if putativeMutPop is None: putativeMutPop = scenario.mutPop

    ihsSignifFN = os.path.join(Ddata, 'power_ihs' + thinSfx, scenDir,
                               'ihs_sig_' + pop2name[putativeMutPop] + ihsSfx + '.tsv')

    popNames = sorted(pop2name.values())
    popNums = sorted(pop2name.keys())
    minPopNum = popNums[0]

    # Key columns on which the position table and the iHS / XP-EHH tables are merged.
    posFileKeyCols = ('replicaNum', 'CHROM_POS %d' % minPopNum)
    xpopIhsKeyCols = ('Chrom', 'SNP pos (bases)')

    # All unordered pairs of population names (in sorted order), optionally
    # restricted to the pairs that involve limitToPop.
    popPairs = ['%s_%s' % (popNames[pop1idx], popNames[pop2idx])
                for pop1idx in range(len(popNames)) for pop2idx in range(pop1idx + 1, len(popNames))
                if limitToPop is None or limitToPop in (popNames[pop1idx], popNames[pop2idx])]

    xpopSignifFNs = [os.path.join(Ddata, 'power_xpop' + thinSfx, scenDir, 'xpop_significance_' + popPair + '.tsv')
                     for popPair in popPairs]

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir())

    mergedData = outFile if outFile else os.path.join(snpStatsDir,
                                                      AddFileSfx('merged.tsv', statsSfx, putativeMutPop, ihsSfx))

    fileDescrs = {
        mergedData:
        ('Various per-snp statistics for SNPs in scenario $scenario, replicas 0-$nreplicas, '
         'assuming selection in ' + pop2name[putativeMutPop],
         (('CHROM_POS 1', 'physical (basepair) position of the SNP within its replica. '
           'Note that one merged file contains SNPs for a set of replicas (all for the same scenario), '
           'so there could be multiple SNPs with the same position.  The replica number '
           'is given in the Chrom column. '),
          ('FREQ1 1', 'derived allele frequency in pop 1 ( European )'),
          ('R AllEHH logratio Deviation European_WestAfrican', 'XP-EHH score to the R of the SNP, '
           'between European and WestAfrican pops, normalized to the neutral background.'),
          ('SNP pos (cM) European_WestAfrican', 'genetic map SNP position'),
          ('SNP pos (bases) European_WestAfrican', 'physical SNP position'),
          ('Chrom European_WestAfrican', 'chromosome (or replica number)'),
          ('Chrom', 'chromosome (or replica number)'),
          ('SNP pos (bases)', 'physical SNP position'),
          ('SNP pos (cM)', 'genetic map SNP position'),
          ('Both iHH_A', 'sum of iHH_A scores for both sides'),
          ('Both iHH_D', 'sum of iHH_D scores for both sides'),
          ('Both iHS', 'sum of iHS scores for both sides'),
          # column name without the stray leading space the original key had
          ('Left iHH_D', 'iHH_D score to the left of the SNP'),
          ('Right iHH_D', 'iHH_D score to the right of the SNP'),
          ('Left iHH_A', 'iHH_A score to the left of the SNP'),
          ('Right iHH_A', 'iHH_A score to the right of the SNP'),
          ('Both Unstandardised iHS', 'sum of unstandardized iHS scores for both sides')))
    }

    if posFileFN is None:
        posFileFN = os.path.join(snpStatsDir,
                                 AddFileSfx('mergedPosStacked.tsv', statsSfx, putativeMutPop, ihsSfx))

    if getio:
        return dict(depends_on=[posFileFN, ihsSignifFN] + xpopSignifFNs, creates=mergedData,
                    splitByCols=dict([(posFileFN, dict(keyCols=posFileKeyCols))]
                                     + [(signifFN, dict(keyCols=xpopIhsKeyCols))
                                        for signifFN in [ihsSignifFN] + xpopSignifFNs]),
                    mediumRuleNameSfx=(scenario.scenDir(), putativeMutPop),
                    fileDescrs=fileDescrs,
                    attrs=Dict('putativeMutPop nreplicas pop2name'))

    def dashFixer(v):
        """Replace the '-' placeholder with nan."""
        return v if v != '-' else np.nan

    def chkReplica(r, n=nreplicas):
        """True while the row belongs to one of the first n replicas."""
        return r.Chrom < n

    ihsAll = IDotData(ihsSignifFN, valueFixer=dashFixer)
    ihsAll = ihsAll[('Chrom', 'SNP pos (bases)', 'SNP pos (cM)', 'Both iHH_A', 'Both iHH_D', 'Both iHS',
                     'Left iHH_D', 'Right iHH_D', 'Left iHH_A', 'Right iHH_A', 'Both Unstandardised iHS')]
    ihsAll = ihsAll.takewhile(chkReplica)

    xpopCols = ('Chrom', 'SNP pos (bases)', 'SNP pos (cM)', 'L AllEHH logratio Deviation',
                'R AllEHH logratio Deviation')

    xpopSignif = tuple([IDotData(xpopSignifFN, valueFixer=dashFixer)[xpopCols].takewhile(chkReplica)
                        for xpopSignifFN in xpopSignifFNs])

    posCols = ['CHROM_POS %d' % minPopNum] + ['FREQ1 %d' % popNum for popNum in popNums]

    # Merge the position table with one XP-EHH table per population pair and the iHS table.
    result = IDotData.merge(iDotDatas=(IDotData(posFileFN), ) + xpopSignif + (ihsAll, ),
                            cols=(posFileKeyCols, ) +
                            (xpopIhsKeyCols, ) * (len(popPairs) + 1),
                            blanks=(None, ) + (np.nan, ) * (len(popPairs) + 1),
                            suffixes=['pos'] + [' %s' % popPair for popPair in popPairs] + [''])

    aPopPair = 'European_WestAfrican' if 'European_WestAfrican' in popPairs else popPairs[0]
    useCols = ['replicaNum'] + posCols + \
        ['%s AllEHH logratio Deviation %s' % (side, popPair) for popPair in popPairs for side in ('L', 'R')] + \
        ['SNP pos (cM) ' + aPopPair,
         'SNP pos (bases) ' + aPopPair,
         'Chrom ' + aPopPair,
         'Chrom',
         'SNP pos (bases)',
         'SNP pos (cM)',
         'Both iHH_A',
         'Both iHH_D',
         'Both iHS']

    # With a single population pair these two columns are renamed to carry the
    # pair suffix that useCols expects.
    if len(popPairs) == 1:
        result = result.renameCols({'L AllEHH logratio Deviation': 'L AllEHH logratio Deviation ' + aPopPair,
                                    'R AllEHH logratio Deviation': 'R AllEHH logratio Deviation ' + aPopPair})

    result[useCols].save(mergedData)

    logging.info('Finished mergeSims()')
Exemple #27
0
def VStackDotDataFiles( inFiles, outFile, getio = None ):
    """Vertically stack the specified DotData or tsv files.

    Params:

       inFiles - the files to stack
       outFile - the file to which the stacked table is saved
       getio - if true, only return the description of inputs and outputs without doing any work.
    """

    if getio: return dict( depends_on = inFiles, creates = outFile )

    # A generator expression opens the inputs lazily on both Python 2 and Python 3
    # (itertools.imap does not exist in Python 3).
    IDotData.vstackFromIterable( IDotData( inFile ) for inFile in inFiles ).save( outFile )
Exemple #28
0
 def combiner( inFNs, outFN ):
     """Combine the column summaries of inFNs using IDotData.mergeColumnSummaries; save the result to outFN."""
     merged = IDotData.mergeColumnSummaries( iDotDatas = inFNs, cols = cols, groupCols = groupCols )
     merged.save( outFN )
 
 if getio: return dict( depends_on = inFN, creates = outFN,