Example #1
def xpopMerge(xpop_EastAsian_WestAfrican,xpop_European_EastAsian,xpop_European_WestAfrican):

	"""Reads Sweep xpop output for each pop pair and merges into a DotData with AllEHH logratio Deviation
	for both sides for each pop pair.
	"""

	dashFixer = lambda v: v if v != '-' else nan

	EA_WA = DotData(SVPath=xpop_EastAsian_WestAfrican,ToLoad=['Chrom','SNP pos (bases)','SNP pos (cM)','L AllEHH logratio Deviation','R AllEHH logratio Deviation'], SVValueFixer = dashFixer)
	EU_EA = DotData(SVPath=xpop_European_EastAsian,ToLoad=['Chrom','SNP pos (bases)','SNP pos (cM)','L AllEHH logratio Deviation','R AllEHH logratio Deviation'], SVValueFixer = dashFixer)
	EU_WA = DotData(SVPath=xpop_European_WestAfrican,ToLoad=['Chrom','SNP pos (bases)','SNP pos (cM)','L AllEHH logratio Deviation','R AllEHH logratio Deviation'], SVValueFixer = dashFixer)

	keyCol1 = array(EA_WA['SNP pos (bases)']) + array(EA_WA['Chrom'])*1000000
	keyCol2 = array(EU_EA['SNP pos (bases)']) + array(EU_EA['Chrom'])*1000000
	keyCol3 = array(EU_WA['SNP pos (bases)']) + array(EU_WA['Chrom'])*1000000

	keyData1 = DotData(Columns = [keyCol1,],names = ['key',])
	keyData2 = DotData(Columns = [keyCol2,],names = ['key',])
	keyData3 = DotData(Columns = [keyCol3,],names = ['key',])

	EA_WA = EA_WA.hstack(keyData1)
	EU_EA = EU_EA.hstack(keyData2)
	EU_WA = EU_WA.hstack(keyData3)

	blank = (nan,)*len(EA_WA.dtype.names)

	return DotData.mergeOnKeyCols((EA_WA,EU_EA,EU_WA),('key',)*3,(blank,)*3,
				      (' EastAsian_WestAfrican',' EastAsian_European',' European_WestAfrican'))
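A note on the 'key' column built above: it packs the replica number ('Chrom') and the basepair position into a single integer, which stays unique only while every position is below 1,000,000. A minimal sketch of the same idea on plain numpy arrays (the values and variable names here are hypothetical):

import numpy as np

# Hypothetical stand-ins for the 'Chrom' and 'SNP pos (bases)' columns.
chrom = np.array([0, 0, 1, 2])
pos_bases = np.array([12345, 987654, 12345, 500000])

# Composite key as in xpopMerge: unique only while pos_bases < 1,000,000.
key = pos_bases + chrom * 1000000
print(key)  # [  12345  987654 1012345 2500000]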
Example #2
def tsv2npz(inFN, outFN=None, arrayName=None, dotDataArgs={}, getio=None):
    """Convert a TSV file to an .npy file."""
    if outFN is None: outFN = ReplaceFileExt(inFN, '.npz')
    if getio:
        return dict(depends_on=inFN,
                    creates=outFN,
                    attrs=dict(piperun_short=True))
    z = DotData(SVPath=inFN, **dotDataArgs)
    if arrayName is None:
        np.savez_compressed(outFN, z)
    else:
        np.savez_compressed(outFN, **{arrayName: z})
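A brief usage note: np.savez_compressed stores a positional (unnamed) array under the key 'arr_0', so how you read the file back depends on whether arrayName was given. A minimal sketch, assuming a file produced by this routine (the path is hypothetical):

import numpy as np

with np.load('snps.npz') as f:          # hypothetical output of tsv2npz
    z = f['arr_0']                      # default key when arrayName is None
    # z = f['mySnps']                   # key to use if arrayName='mySnps' was passed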
Example #3
def clean_hapmap2_region(Z,
                         f=None,
                         cleanIHS=True,
                         cleanDer=True,
                         cleanAnc=True,
                         keepCausal=False,
                         returnInd=False):

    if f: f.write(str(len(Z)) + '\t')
    if cleanIHS:
        ind1 = invert(isnan(Z.StdDiff))
        print('SNPs with iHS scores: ', sum(ind1))
    else:
        ind1 = ones(len(Z))
    print('ind1: ', sum(ind1))
    if f: f.write(str(sum(ind1)) + '\t')
    if cleanDer:
        ind2 = Z.derFreq > .2
    else:
        ind2 = ones(len(Z))
    print('Derived > .2: ', sum(ind2))
    if f: f.write(str(sum(ind2)) + '\t')
    if cleanAnc:
        ind3 = Z.meanAnc > .4
    else:
        ind3 = ones(len(Z))
    print('Ancestral > .4: ', sum(ind3))
    if f: f.write(str(sum(ind3)) + '\t')
    ind4 = zeros(len(Z))
    if keepCausal:
        ind4 = Z.Pos == 500000
    ind = all([ind1, ind2, ind3], axis=0)
    ind = any([ind, ind4], axis=0)
    print('SNPs to keep: ', sum(ind))
    if f: f.write(str(sum(ind)) + '\t')

    if returnInd: return ind

    fZ = Z[ind]

    normlike = normalize(fZ.complike)[0]
    lik = exp(fZ.complike)
    Z = fZ.hstack(DotData(Columns=[normlike, lik], names=['normLike', 'lik']))
    #Z=Z[['complike','derFreq','gdPos','iHS','max_xpop','meanAnc','meanFst','StdDiff','freqDiff','Chrom','Pos','normLike','lik']]

    return Z
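The filtering above is just an AND of the quality masks followed by an OR with the keepCausal override. A toy sketch of that mask algebra (all values are hypothetical):

import numpy as np

has_ihs   = np.array([True,  True,  False, True])   # iHS score present
der_freq  = np.array([0.50,  0.10,  0.90,  0.30]) > 0.2
mean_anc  = np.array([0.80,  0.60,  0.50,  0.10]) > 0.4
is_causal = np.array([False, True,  False, False])  # keepCausal override

keep = np.all([has_ihs, der_freq, mean_anc], axis=0)   # must pass every filter...
keep = np.any([keep, is_causal], axis=0)               # ...unless it is the causal SNP
print(keep)  # [ True  True False False]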
Example #4
def mergeSims(scenario,
              Ddata='../Data/Shari_Data/sim/',
              simsOut='simsOut3',
              nreplicas=5,
              thinExt='.thin',
              thinSfx='',
              selpop=None,
              getio=None):
    """Gathers per-SNP information, for all replicas of a given scenario, and outputs it in a single DotData where each line
	gives info for one SNP.

	Specifically, reads simulation and Sweep output, collects columns needed for composite likelihood test (chrom, base pair position, genetic
	distance, anc frequencies for 3 populations, xpop for each pair, and ihs, iHH_A and iHH_D for selected population)

	Input params:

	   scenario - an object of class Scenario, indicating the simulation scenario (either neutral or a selection scenario)
	       from which all replicas were simulated.
	   nreplicas - the number of replicas simulated under this scenario.
	      Each replica represents a chromosome region, with a set of SNPs on it.
	   
	   Ddata - the directory under which the simulations and the Sweep analysis results live.
	     Under this directory we expect to find:
	         iHS analysis results, under power_ihs/
		 XP-EHH analysis results, under power_xpop
		 simulation output giving SNP positions

	   thinExt - the extension appended to simulation files that describe the SNPs in the simulated replica.
	      Sometimes we create simulations and then thin them under different thinning models (to simulate SNP ascertainment
	      by the various stages of HapMap; these differently thinned versions of the same simulations might be stored in
	      simulation files with different extensions).

	   thinSfx - the suffix appended to the power_ihs and power_xpop directory names, telling where to find iHS and XP-EHH
	      analyses of the simulations.   When we analyze the same simulations after applying different thinning scenarios,
	      the iHS and XP-EHH analyses for each thinning scenario go into a separate set of directories.

        Output params:

	    Ddata - under Ddata writes a DotData named merged_scenName.data, where each line gives info
	        for one SNP, with the following columns (type of data is float unless stated otherwise):

	        CHROM_POS 1 - physical (basepair) position of the SNP within its replica.
	           Note that one merged file contains SNPs for a set of replicas (all for the same scenario),
		   so there could be multiple SNPs with the same position.  The replica number
		   is given in the Chrom column.
		FREQ1 1 - derived allele frequency in pop 1 ( European )
		FREQ1 4 - derived allele frequency in pop 4 ( EastAsian )
		FREQ1 5 - derived allele frequency in pop 5 ( WestAfrican )

		R AllEHH logratio Deviation European_WestAfrican - XP-EHH score to the right of the SNP,
		   between European and WestAfrican pops, normalized to the neutral background.
		   Analogously for the next five columns:
		L AllEHH logratio Deviation European_WestAfrican
		R AllEHH logratio Deviation EastAsian_European
		L AllEHH logratio Deviation EastAsian_European
		R AllEHH logratio Deviation EastAsian_WestAfrican
		L AllEHH logratio Deviation EastAsian_WestAfrican

		SNP pos (cM) European_WestAfrican - genetic map position of this SNP, within its replica.
		   (the European_WestAfrican suffix is irrelevant).
		SNP pos (bases) European_WestAfrican - physical (basepair) position of this SNP within its replica.
		   (the European_WestAfrican suffix is irrelevant).
		Chrom European_WestAfrican - the replica from which this SNP comes; can be nan.
		   (the European_WestAfrican suffix is irrelevant)
		Chrom - the replica from which this SNP comes; can be nan
		SNP pos (bases) - physical (basepair) position of this SNP within its replica.
		SNP pos (cM) - genetic map position of this SNP within its replica
		Both iHH_A - sum of iHH_A for both directions from this SNP
		Both iHH_D - sum of iHH_D for both directions from this SNP
		Both iHS - the value in 'Both Unstandardised iHS' (below), but binned by derived allele frequency
		   and normalized within the bin.
		Left iHH_D - iHH_D to the left of the SNP (the raw integral value).  analogously for the next three.
		Right iHH_D
		Left iHH_A
		Right iHH_A
		Both Unstandardised iHS - log( (iHH_A_left + iHH_A_right) / ( iHH_D_left + iHH_D_right ) )
		   ( see also 'Both iHS' column for the standardized iHS score )
	
	"""

    assert selpop == None or scenario.is_neutral()

    DataDir = Ddata + '/'
    SimDir = DataDir + simsOut + thinSfx + '/'

    if not scenario.is_neutral():
        scenName = 'sel%d_%d' % (scenario.mutFreq, scenario.mutPop)
        scenDir = str(scenario.mutAge) + 'ky/' + scenName
    else:
        scenName = 'neutral'
        scenDir = 'neutral'

    popName = {1: 'European', 4: 'EastAsian', 5: 'WestAfrican'}

    ihsSignifTsv = DataDir + 'power_ihs' + thinSfx + '/' + scenDir + '/ihs_sig_' + \
        popName[ scenario.mutPop if not scenario.is_neutral() else ( selpop if selpop != None else 1 ) ] + '.tsv'
    xpopSignifTsv = [
        DataDir + 'power_xpop' + thinSfx + '/' + scenDir +
        '/xpop_significance_' + popPair + '.tsv'
        for popPair in ('EastAsian_WestAfrican', 'EastAsian_European',
                        'European_WestAfrican')
    ]
    posFiles = [
        SimDir + scenDir + '/' + str(ichrom) + '_' + scenName + '.pos-%d%s' %
        (pop, thinExt) for ichrom in range(nreplicas) for pop in (1, 4, 5)
    ]

    ageSfx = '%dky' % (scenario.mutAge if not scenario.is_neutral() else 10)
    mergedDotData = AddFileSfx(Ddata + 'merged.data/', ageSfx,
                               scenario.scenName(), selpop, thinSfx)

    fileDescrs = \
    { mergedDotData :
       ( 'Various per-snp statistics for SNPs in scenario $scenario, replicas 0-$nreplicas.',
         ( ( 'CHROM_POS 1', 'physical (basepair) position of the SNP within its replica. '
      'Note that one merged file contains SNPs for a set of replicas (all for the same scenario), '
      'so there could be multiple SNPs with the same position.  The replica number '
      'is given in the Chrom column. ' ),
           ( 'FREQ1 1', 'derived allele frequency in pop 1 ( European )' ),
           ( 'R AllEHH logratio Deviation European_WestAfrican', 'XP-EHH score to the R of the SNP, '
      'between European and WestAfrican pops, normalized to the neutral background.' ),
           ( 'SNP pos (cM) European_WestAfrican', 'genetic map SNP position' ),
           ( 'SNP pos (bases) European_WestAfrican', 'physical SNP position' ),
           ( 'Chrom European_WestAfrican', 'chromosome (or replica number)' ),
           ( 'Chrom', 'chromosome (or replica number)' ),
           ( 'SNP pos (bases)', 'physical SNP position' ),
           ( 'SNP pos (cM)', 'genetic map SNP position' ),
           ( 'Both iHH_A', 'sum of iHH_A scores for both sides' ),
           ( 'Both iHH_D', 'sum of iHH_D scores for both sides' ),
           ( 'Both iHS', 'sum of iHS scores for both sides' ),
           ( 'Left iHH_D', 'iHH_D score to the left of the SNP' ),
           ( 'Right iHH_D', 'iHH_D score to the right of the SNP' ),
           ( 'Left iHH_A', 'iHH_A score to the left of the SNP' ),
           ( 'Right iHH_A', 'iHH_A score to the right of the SNP' ),
           ( 'Both Unstandardised iHS', 'sum of unstandardized iHS scores for both sides' ) ) ) }

    if getio:
        return dict(depends_on=posFiles + [ihsSignifTsv] + xpopSignifTsv,
                    creates=mergedDotData,
                    mediumRuleNameSfx=scenario.scenDir(),
                    fileDescrs=fileDescrs)

    ncausal = 0

    dashFixer = lambda v: v if v != '-' else numpy.nan

    # Load iHS of selected pop
    ihsAll = DotData(SVPath=ihsSignifTsv,
                     ToLoad=[
                         'Chrom', 'SNP pos (bases)', 'SNP pos (cM)',
                         'Both iHH_A', 'Both iHH_D', 'Both iHS', 'Left iHH_D',
                         'Right iHH_D', 'Left iHH_A', 'Right iHH_A',
                         'Both Unstandardised iHS'
                     ],
                     SVValueFixer=dashFixer)
    ihsAllChrom = ihsAll.Chrom

    # Load xpop values
    xpopAll = xpopMerge(*xpopSignifTsv)
    logging.info('done with xpopMerge')

    xpopAll = xpopAll[[
        'R AllEHH logratio Deviation European_WestAfrican',
        'L AllEHH logratio Deviation European_WestAfrican',
        'R AllEHH logratio Deviation EastAsian_European',
        'L AllEHH logratio Deviation EastAsian_European',
        'R AllEHH logratio Deviation EastAsian_WestAfrican',
        'L AllEHH logratio Deviation EastAsian_WestAfrican',
        'SNP pos (cM) European_WestAfrican',
        'SNP pos (bases) European_WestAfrican', 'Chrom European_WestAfrican'
    ]]
    xpopAllChrom = xpopAll['Chrom European_WestAfrican']

    replicates = []

    xpopIdx = 0
    ihsIdx = 0

    for ichrom in range(nreplicas):

        progress('Merging replicas', ichrom, nreplicas, freq=1)

        logging.info('looking at replica %d of %d' % (ichrom, nreplicas))
        # Load in pos files for this replica.
        # They give, for each SNP in the replica, its physical (basepair) position within the replica,
        # and the frequency of the derived and the ancestral alleles.
        pos1, pos4, pos5 = [
            DotData(SVPath=SimDir + scenDir + '/' + str(ichrom) + '_' +
                    scenName + '.pos-%d%s' % (pop, thinExt),
                    SVSkipFirstLines=1,
                    SVHeader=False,
                    names=[
                        'SNP', 'CHROM', 'CHROM_POS', 'ALLELE1', 'FREQ1',
                        'ALLELE2', 'FREQ2'
                    ]) for pop in (1, 4, 5)
        ]
        assert pos1.numCols() == pos4.numCols() == pos5.numCols()
        posBlank = ((numpy.nan, ) * pos1.numCols(), ) * 3
        logging.info('Loaded pos files for chrom ' + str(ichrom) + ': ' +
                     str(len(pos1)) + 'snps')

        assert set(pos1.CHROM_POS) == set(pos4.CHROM_POS) == set(
            pos5.CHROM_POS)

        logging.info('pos file sizes are: %d, %d, %d' %
                     (len(pos1), len(pos4), len(pos5)))
        logging.info('Merging on position...')
        posAll = DotData.mergeOnKeyCols((pos1, pos4, pos5),
                                        ('CHROM_POS', ) * 3,
                                        posBlank,
                                        suffixes=(' 1', ' 4', ' 5'))

        logging.info('Done merging.')
        logging.info('type(posAll) is ' + str(type(posAll)))
        print(len(posAll))
        chrom = numpy.ones(len(posAll)) * ichrom
        newChrom = DotData(Columns=[
            chrom,
        ], names=[
            'newChrom',
        ])
        print(newChrom)
        posAll = posAll[['CHROM_POS 1', 'FREQ1 1', 'FREQ1 4', 'FREQ1 5']]
        posAll.hstack(newChrom)

        logging.info('added replica number column')

        print(posAll)
        posAllBlank = (numpy.nan, ) * posAll.numCols()

        # 10-16-08 ADDED CHROM TO MERGED OUTPUT  ( not now used -- can be removed? )

        #
        # From the xpop and ihs significance results, get just the rows for SNPs in the
        # current replica
        #

        #while xpopIdx < len( xpopAllChrom ) and xpopAllChrom[ xpopIdx ] == ichrom: xpopIdx += 1
        #xpop = xpopAll[ :xpopIdx ]
        xpop = xpopAll[xpopAllChrom == ichrom]
        logging.info('selected xpop for replica %d' % ichrom)
        xpopBlank = (numpy.nan, ) * xpop.numCols()

        #while ihsIdx < len( ihsAllChrom ) and ihsAllChrom[ ihsIdx ] == ichrom: ihsIdx += 1
        #ihs = ihsAll[ :ihsIdx ]
        ihs = ihsAll[ihsAllChrom == ichrom]
        logging.info('selected ihs for replica %d' % ichrom)
        ihsBlank = (numpy.nan, ) * ihs.numCols()

        #		if not set( ihs[  'SNP pos (bases)' ] ).issubset( set( posAll['CHROM_POS 1'] ) ):
        #			print 'bad positions: ', set( posAll['CHROM_POS 1'] ) - set( ihs[  'SNP pos (bases)' ] )
        #		assert set( ihs[  'SNP pos (bases)' ] ).issubset( set( posAll['CHROM_POS 1'] ) ), "bad iHS file " + ihsSignifTsv

        logging.info('merging replica %d' % ichrom)
        Data = DotData.mergeOnKeyCols(
            (posAll, xpop, ihs),
            ('CHROM_POS 1', 'SNP pos (bases) European_WestAfrican',
             'SNP pos (bases)'),
            blanks=(posAllBlank, xpopBlank, ihsBlank),
            suffixes=('pos', ' xpop', ' ihs'),
            verbose=True)
        logging.info('done merging replica %d; now have %d records' %
                     (ichrom, len(Data)))

        Data = Data[numpy.invert(numpy.isnan(Data['CHROM_POS 1']))]
        logging.info(
            'done removing snp info for SNPs not in all .pos files for replica %d; now have %d records'
            % (ichrom, len(Data)))

        replicates.append(Data)

        logging.info('now have ' + str(len(replicates)) + ' replicates.')

    # endloop: for each replica

    logging.info('Stacking replicates...')
    allData = reduce(lambda x, y: x.vstack(y), replicates)
    logging.info('Saving merged SNP info to ' + mergedDotData)
    allData.save(mergedDotData)

    logging.info('Finished mergeSims()')
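The 'Both Unstandardised iHS' column described in the docstring is just a log ratio of the summed integrated haplotype homozygosities. A worked sketch with hypothetical iHH values:

import numpy as np

# Hypothetical integrated EHH values for one SNP: left/right of the core,
# ancestral (A) and derived (D) alleles.
iHH_A_left, iHH_A_right = 0.31, 0.27
iHH_D_left, iHH_D_right = 0.95, 0.88

# 'Both Unstandardised iHS' as defined in the mergeSims docstring.
unstd_ihs = np.log((iHH_A_left + iHH_A_right) / (iHH_D_left + iHH_D_right))
print(unstd_ihs)  # negative here: derived haplotypes are longer than ancestral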
Example #5
def computeMeanStd_binned_old(inDatas, valCol, binCol, binMin, binMax,
                              binCount):
    """Compute binned stats for a set of tables"""

    sums = np.zeros(binCount)
    sumsSq = np.zeros_like(sums)
    counts = np.zeros_like(sums)
    bins = np.linspace(binMin, binMax, binCount + 1)
    binSize = (binMax - binMin) / binCount
    for d_idx, d in enumerate(inDatas):
        dbg('d_idx d binSize')
        dbg('d[binCol]')

        for i in range(binCount):
            binBot = bins[i]
            binTop = bins[i + 1]
            dbg('binBot binTop')
            #        theIdx = ( (binTop - d[ binCol ]) < binSize ) & ( ( binTop - d[ binCol ] ) > 0 )
            theIdx = (binBot < d[binCol].values) & (d[binCol].values <= binTop)
            dbg('binBot binTop')
            DotData(names=('rows', ),
                    Columns=theIdx.nonzero()).saveToSV('nz%02d.tsv' % i)
            #rowsStr = ','.join(map(str,list(theIdx.nonzero())))
            #print 'binnedRows=', rowsStr
            hereVals = d[theIdx][valCol]
            DotData(names=('temp', ),
                    Columns=(hereVals, )).saveToSV('temp2%2d.tsv' % i)

            dbg('"BEF" theIdx.sum() i bins[i] bins[i+1] len(hereVals)')
            counts[i] += len(hereVals)
            sums[i] += np.sum(hereVals)
            sumsSq[i] += np.sum(hereVals * hereVals)
            dbg('"AFT" i bins[i] bins[i+1] len(hereVals)')

        if False:
            # fast version
            binsHere = np.digitize(d[binCol], bins) - 1
            dbg('len(binsHere) binsHere')
            np.clip(binsHere, 0, binCount - 1, out=binsHere)
            dbg('binsHere')

            counts += np.bincount(binsHere, minlength=binCount)
            sums += np.bincount(binsHere,
                                weights=d[valCol],
                                minlength=binCount)
            sumsSq += np.bincount(binsHere,
                                  weights=d[valCol] * d[valCol],
                                  minlength=binCount)

    countsOrig = counts.astype(int)
    counts[counts == 0] = np.nan
    means = sums / counts
    stds = sumsSq / counts - means * means

    return pd.DataFrame(
        dict(binBeg=bins[:-1],
             binEnd=bins[1:],
             counts=countsOrig,
             sums=sums,
             sumsSq=sumsSq,
             means=means,
             stds=stds))
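The per-bin loop above has a vectorized counterpart sketched in the 'if False:' branch (digitize plus bincount). Pulled out on its own it looks roughly like this; a sketch assuming plain numpy arrays for the value and bin columns, and note that the original keeps the raw variance in its 'stds' column whereas this version takes the square root:

import numpy as np

def binned_mean_std(vals, bin_vals, bin_min, bin_max, bin_count):
    """Per-bin mean and std via digitize + bincount (the 'fast version' above)."""
    bins = np.linspace(bin_min, bin_max, bin_count + 1)
    idx = np.clip(np.digitize(bin_vals, bins) - 1, 0, bin_count - 1)
    counts = np.bincount(idx, minlength=bin_count).astype(float)
    sums = np.bincount(idx, weights=vals, minlength=bin_count)
    sums_sq = np.bincount(idx, weights=vals * vals, minlength=bin_count)
    counts[counts == 0] = np.nan                 # empty bins yield NaN, not 0/0
    means = sums / counts
    stds = np.sqrt(sums_sq / counts - means * means)
    return means, stds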
Example #6
def tsv2npy(inFN, outFN=None, getio=None):
    """Convert a TSV file to an .npy file."""
    if outFN is None: outFN = ReplaceFileExt(inFN, '.npy')
    if getio: return dict(depends_on=inFN, creates=outFN)
    z = DotData(SVPath=inFN)
    np.save(outFN, z)
Example #7
def TSV2DotData(inFN, outFN, getio=None):
    """Convert a TSV file to a DotData dir."""
    if getio: return dict(depends_on=inFN, creates=outFN)
    DotData(SVPath=inFN).save(outFN)
Example #8
def DotData2TSV(inFN, outFN, readOpts={}, getio=None):
    """Convert DotData to TSV"""
    if getio: return dict(depends_on=inFN, creates=outFN)
    DotData(Path=inFN, **readOpts).saveToSV(outFN)
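All of these conversion helpers follow the same getio protocol: called with getio=True they report what they read and write instead of doing the work. A hedged sketch of how a pipeline runner might use that (the concrete paths are hypothetical):

# Ask the routine for its dependency graph without running the conversion.
io = DotData2TSV('stats.data/', 'stats.tsv', getio=True)
print(io['depends_on'])   # 'stats.data/'
print(io['creates'])      # 'stats.tsv'

# A scheduler can use this to decide whether 'stats.tsv' is stale before
# invoking DotData2TSV('stats.data/', 'stats.tsv') for real.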
Example #9
def xpopMerge(xpop_EastAsian_WestAfrican, xpop_European_EastAsian,
              xpop_European_WestAfrican):
    """Reads Sweep xpop output for each pop pair and merges into a DotData with AllEHH logratio Deviation
	for both sides for each pop pair.
	"""

    dashFixer = lambda v: v if v != '-' else nan

    EA_WA = DotData(SVPath=xpop_EastAsian_WestAfrican,
                    ToLoad=[
                        'Chrom', 'SNP pos (bases)', 'SNP pos (cM)',
                        'L AllEHH logratio Deviation',
                        'R AllEHH logratio Deviation'
                    ],
                    SVValueFixer=dashFixer)
    EU_EA = DotData(SVPath=xpop_European_EastAsian,
                    ToLoad=[
                        'Chrom', 'SNP pos (bases)', 'SNP pos (cM)',
                        'L AllEHH logratio Deviation',
                        'R AllEHH logratio Deviation'
                    ],
                    SVValueFixer=dashFixer)
    EU_WA = DotData(SVPath=xpop_European_WestAfrican,
                    ToLoad=[
                        'Chrom', 'SNP pos (bases)', 'SNP pos (cM)',
                        'L AllEHH logratio Deviation',
                        'R AllEHH logratio Deviation'
                    ],
                    SVValueFixer=dashFixer)

    keyCol1 = array(EA_WA['SNP pos (bases)']) + array(EA_WA['Chrom']) * 1000000
    keyCol2 = array(EU_EA['SNP pos (bases)']) + array(EU_EA['Chrom']) * 1000000
    keyCol3 = array(EU_WA['SNP pos (bases)']) + array(EU_WA['Chrom']) * 1000000

    keyData1 = DotData(Columns=[
        keyCol1,
    ], names=[
        'key',
    ])
    keyData2 = DotData(Columns=[
        keyCol2,
    ], names=[
        'key',
    ])
    keyData3 = DotData(Columns=[
        keyCol3,
    ], names=[
        'key',
    ])

    EA_WA = EA_WA.hstack(keyData1)
    EU_EA = EU_EA.hstack(keyData2)
    EU_WA = EU_WA.hstack(keyData3)

    blank = (nan, ) * len(EA_WA.dtype.names)

    return DotData.mergeOnKeyCols(
        (EA_WA, EU_EA, EU_WA), ('key', ) * 3, (blank, ) * 3,
        (' EastAsian_WestAfrican', ' EastAsian_European',
         ' European_WestAfrican'))
Example #10
def localizeSpatiallyByWindows(Ddata,
                               scenario,
                               nreplicas,
                               thinSfx='',
                               putativeMutPop=None,
                               complikeSfx='',
                               likesTableSfx='',
                               threshold=.5,
                               numSNP=1,
                               minGdInEachDir=.05,
                               fromReplica=None,
                               toReplica=None,
                               getio=None):
    """
    Spatially localize the selected variant for all replicas within a given scenario.
    The approach is to start with the highest-scoring SNP, and move left and right from it in fixed-size windows
    for as long as the windows contain at least 'numSNP' snps with score at least 'threshold'.

    Adapted from Operations.Shari_Operations.localize.hapmap_regions_0615.plotRegions() .
    """

    snpStatsDir = os.path.join(Ddata, 'snpStats' + thinSfx, scenario.scenDir())
    replicaStatsDir = os.path.join(Ddata, 'replicastats' + thinSfx,
                                   scenario.scenDir())
    if putativeMutPop == None: putativeMutPop = scenario.mutPop
    sfxs = (putativeMutPop, complikeSfx, likesTableSfx)
    complikeFN = os.path.join(snpStatsDir, AddFileSfx('complike.data/', *sfxs))

    intervalsListFN = os.path.join(
        replicaStatsDir, AddFileSfx('intervalsWindowsList.tsv', *sfxs))

    if getio:
        return dict(
            depends_on=complikeFN,
            creates=intervalsListFN,
            mediumRuleNameSfx=(scenario.scenDir(), ) + sfxs,
            fileDescrs={
                intervalsListFN:
                'List of intervals in the region, one of which (hopefully) contains the causal SNP.'
                ' For each replica this table has one or more lines, giving intervals in that replica.'
            })

    #complike = IDotData( complikeFN ).filter( lambda r: all( np.isfinite( ( r.iHS, r.meanFst, r.max_xpop ) ) ) )
    complike = IDotData(complikeFN)

    with IDotData.openForWrite(
            intervalsListFN,
            'replicaNum gdFrom gdTo gdSize bpFrom bpTo bpSize numPositiveBins '
            'numSnpsOver_0_2 maxSNP_Pos maxSNP_lik') as intervalsListFile:

        for replicaNum, complikeForReplica in complike.groupby('Chrom'):

            if fromReplica is not None and replicaNum < fromReplica: continue
            if toReplica is not None and replicaNum > toReplica: break

            X = complikeForReplica.toDotData()

            minPos = np.min(X.gdPos)
            maxPos = np.max(X.gdPos)
            bins = np.arange(0, 1, .01)

            ind = X.complikeExp.argsort()
            lik = X.complikeExp[ind]
            maxlik = np.mean(lik[-5:])
            #maxlik = mean(lik[-5:])
            like = X.complikeExp / maxlik

            Y = X[ind]
            maxSNP = Y.Pos[-1]

            maxScore = Y.complikeExp[-1]
            relPos = X.gdPos - Y.gdPos[-1]
            X = X.hstack(
                DotData(Columns=[like, relPos],
                        names=['scaled_like', 'relPos']))

            topGdPos = Y.gdPos[-1]
            minPos = Y.Pos[-1]
            maxPos = Y.Pos[-1]
            minGdPos = topGdPos
            maxGdPos = topGdPos
            numPositiveBins = 0

            dbg('replicaNum minPos maxPos minGdPos maxGdPos')

            for dir in -1, +1:
                for bin in bins:
                    Z = X[np.abs(X.relPos - dir * bin) <= .02]

                    dbg('dir bin len(Z)')

                    if len(Z) == 0: continue
                    top = Z[Z.scaled_like > threshold]

                    dbg('len(top)')

                    if len(top) <= numSNP and np.abs(
                            topGdPos - top.gdPos) > minGdInEachDir:
                        break
                    if len(top) == 0: top = Z
                    if dir == -1:
                        minPos = np.min(top.Pos)
                        minGdPos = np.min(top.gdPos)
                    else:
                        maxPos = np.max(top.Pos)
                        maxGdPos = np.max(top.gdPos)
                    numPositiveBins += 1

            ind = np.all([X.Pos > minPos, X.Pos < maxPos], axis=0)
            peak = X[ind]

            intervalsListFile.writeRecord(replicaNum, minGdPos, maxGdPos,
                                          maxGdPos - minGdPos, minPos, maxPos,
                                          maxPos - minPos, numPositiveBins,
                                          sum(peak.scaled_like > .2), maxSNP,
                                          maxScore)
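The score scaling used above divides every SNP's composite-likelihood value by the mean of the top five, so the 'threshold' parameter is relative to the local peak. A minimal sketch with hypothetical scores:

import numpy as np

complike_exp = np.array([0.1, 0.4, 2.0, 8.0, 9.0, 9.5, 10.0, 0.2])  # hypothetical

top5 = np.sort(complike_exp)[-5:]           # five highest-scoring SNPs
scaled_like = complike_exp / top5.mean()    # same normalization as in the loop above
print(scaled_like > 0.5)                    # SNPs that would count toward a window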
Example #11
def mergeSims( scenario, Ddata = '../Data/Shari_Data/sim/', simsOut = 'simsOut3', nreplicas = 5,
	       thinExt = '.thin', thinSfx = '',
	       selpop = None, getio = None ):
	"""Gathers per-SNP information, for all replicas of a given scenario, and outputs it in a single DotData where each line
	gives info for one SNP.

	Specifically, reads simulation and Sweep output, collects columns needed for composite likelihood test (chrom, base pair position, genetic
	distance, anc frequencies for 3 populations, xpop for each pair, and ihs, iHH_A and iHH_D for selected population)

	Input params:

	   scenario - an object of class Scenario, indicating the simulation scenario (either neutral or a selection scenario)
	       from which all replicas were simulated.
	   nreplicas - the number of replicas simulated under this scenario.
	      Each replica represents a chromosome region, with a set of SNPs on it.
	   
	   Ddata - the directory under which the simulations and the Sweep analysis results live.
	     Under this directory we expect to find:
	         iHS analysis results, under power_ihs/
		 XP-EHH analysis results, under power_xpop
		 simulation output giving SNP positions

	   thinExt - the extension appended to simulation files that describe the SNPs in the simulated replica.
	      Sometimes we create simulations and then thin them under different thinning models (to simulate SNP ascertainment
	      by the various stages of HapMap; these differently thinned versions of the same simulations might be stored in
	      simulation files with different extensions).

	   thinSfx - the suffix appended to the power_ihs and power_xpop directory names, telling where to find iHS and XP-EHH
	      analyses of the simulations.   When we analyze the same simulations after applying different thinning scenarios,
	      the iHS and XP-EHH analyses for each thinning scenario go into a separate set of directories.

        Output params:

	    Ddata - under Ddata writes a DotData named merged_scenName.data, where each line gives info
	        for one SNP, with the following columns (type of data is float unless stated otherwise):

	        CHROM_POS 1 - physical (basepair) position of the SNP within its replica.
	           Note that one merged file contains SNPs for a set of replicas (all for the same scenario),
		   so there could be multiple SNPs with the same position.  The replica number
		   is given in the Chrom column.
		FREQ1 1 - derived allele frequency in pop 1 ( European )
		FREQ1 4 - derived allele frequency in pop 4 ( EastAsian )
		FREQ1 5 - derived allele frequency in pop 5 ( WestAfrican )

		R AllEHH logratio Deviation European_WestAfrican - XP-EHH score to the right of the SNP,
		   between European and WestAfrican pops, normalized to the neutral background.
		   Analogously for the next five columns:
		L AllEHH logratio Deviation European_WestAfrican
		R AllEHH logratio Deviation EastAsian_European
		L AllEHH logratio Deviation EastAsian_European
		R AllEHH logratio Deviation EastAsian_WestAfrican
		L AllEHH logratio Deviation EastAsian_WestAfrican

		SNP pos (cM) European_WestAfrican - genetic map position of this SNP, within its replica.
		   (the European_WestAfrican suffix is irrelevant).
		SNP pos (bases) European_WestAfrican - physical (basepair) position of this SNP within its replica.
		   (the European_WestAfrican suffix is irrelevant).
		Chrom European_WestAfrican - the replica from which this SNP comes; can be nan.
		   (the European_WestAfrican suffix is irrelevant)
		Chrom - the replica from which this SNP comes; can be nan
		SNP pos (bases) - physical (basepair) position of this SNP within its replica.
		SNP pos (cM) - genetic map position of this SNP within its replica
		Both iHH_A - sum of iHH_A for both directions from this SNP
		Both iHH_D - sum of iHH_D for both directions from this SNP
		Both iHS - the value in 'Both Unstandardised iHS' (below), but binned by derived allele frequency
		   and normalized within the bin.
		Left iHH_D - iHH_D to the left of the SNP (the raw integral value).  analogously for the next three.
		Right iHH_D
		Left iHH_A
		Right iHH_A
		Both Unstandardised iHS - log( (iHH_A_left + iHH_A_right) / ( iHH_D_left + iHH_D_right ) )
		   ( see also 'Both iHS' column for the standardized iHS score )
	
	"""

	assert selpop == None  or  scenario.is_neutral()

	DataDir = Ddata + '/'
	SimDir = DataDir + simsOut + thinSfx + '/'

	if not scenario.is_neutral():
		scenName = 'sel%d_%d' % ( scenario.mutFreq, scenario.mutPop )
		scenDir  = str( scenario.mutAge ) + 'ky/' + scenName
	else:
		scenName = 'neutral'
		scenDir = 'neutral'
	
	popName = {1:'European',4:'EastAsian',5:'WestAfrican'}
	
	ihsSignifTsv = DataDir + 'power_ihs' + thinSfx + '/' + scenDir + '/ihs_sig_' + \
	    popName[ scenario.mutPop if not scenario.is_neutral() else ( selpop if selpop != None else 1 ) ] + '.tsv'
	xpopSignifTsv = [ DataDir + 'power_xpop' + thinSfx + '/' + scenDir + '/xpop_significance_' + popPair + '.tsv'
			  for popPair in ( 'EastAsian_WestAfrican', 'EastAsian_European', 'European_WestAfrican' ) ]
	posFiles = [ SimDir + scenDir + '/' + str(ichrom) + '_' + scenName + '.pos-%d%s' % ( pop, thinExt )
		     for ichrom in range( nreplicas ) for pop in ( 1, 4, 5 ) ]
	
	ageSfx = '%dky' % ( scenario.mutAge if not scenario.is_neutral() else 10 )
	mergedDotData  = AddFileSfx( Ddata + 'merged.data/', ageSfx, scenario.scenName(), selpop, thinSfx )

	fileDescrs = \
	{ mergedDotData :
		  ( 'Various per-snp statistics for SNPs in scenario $scenario, replicas 0-$nreplicas.',
		    ( ( 'CHROM_POS 1', 'physical (basepair) position of the SNP within its replica. '
			'Note that one merged file contains SNPs for a set of replicas (all for the same scenario), '
			'so there could be multiple SNPs with the same position.  The replica number '
			'is given in the Chrom column. ' ), 
		      ( 'FREQ1 1', 'derived allele frequency in pop 1 ( European )' ),
		      ( 'R AllEHH logratio Deviation European_WestAfrican', 'XP-EHH score to the R of the SNP, '
			'between European and WestAfrican pops, normalized to the neutral background.' ),
		      ( 'SNP pos (cM) European_WestAfrican', 'genetic map SNP position' ),
		      ( 'SNP pos (bases) European_WestAfrican', 'physical SNP position' ),
		      ( 'Chrom European_WestAfrican', 'chromosome (or replica number)' ),
		      ( 'Chrom', 'chromosome (or replica number)' ),
		      ( 'SNP pos (bases)', 'physical SNP position' ),
		      ( 'SNP pos (cM)', 'genetic map SNP position' ),
		      ( 'Both iHH_A', 'sum of iHH_A scores for both sides' ),
		      ( 'Both iHH_D', 'sum of iHH_D scores for both sides' ),
		      ( 'Both iHS', 'sum of iHS scores for both sides' ),
		      ( 'Left iHH_D', 'iHH_D score to the left of the SNP' ),
		      ( 'Right iHH_D', 'iHH_D score to the right of the SNP' ),
		      ( 'Left iHH_A', 'iHH_A score to the left of the SNP' ),
		      ( 'Right iHH_A', 'iHH_A score to the right of the SNP' ), 
		      ( 'Both Unstandardised iHS', 'sum of unstandardized iHS scores for both sides' ) ) ) }

	if getio: return dict( depends_on = posFiles + [ ihsSignifTsv ] + xpopSignifTsv, creates = mergedDotData,
			       mediumRuleNameSfx = scenario.scenDir(),
			       fileDescrs = fileDescrs )

	ncausal = 0

	dashFixer = lambda v: v if v != '-' else numpy.nan

	# Load iHS of selected pop
	ihsAll = DotData(SVPath = ihsSignifTsv,ToLoad=['Chrom','SNP pos (bases)','SNP pos (cM)','Both iHH_A','Both iHH_D','Both iHS','Left iHH_D','Right iHH_D','Left iHH_A','Right iHH_A','Both Unstandardised iHS'], SVValueFixer = dashFixer)
	ihsAllChrom = ihsAll.Chrom
	
	# Load xpop values
	xpopAll = xpopMerge( *xpopSignifTsv )
	logging.info( 'done with xpopMerge' )
	
	xpopAll = xpopAll[['R AllEHH logratio Deviation European_WestAfrican','L AllEHH logratio Deviation European_WestAfrican','R AllEHH logratio Deviation EastAsian_European','L AllEHH logratio Deviation EastAsian_European','R AllEHH logratio Deviation EastAsian_WestAfrican',
			   'L AllEHH logratio Deviation EastAsian_WestAfrican','SNP pos (cM) European_WestAfrican','SNP pos (bases) European_WestAfrican','Chrom European_WestAfrican']]
	xpopAllChrom = xpopAll['Chrom European_WestAfrican']
	
	replicates = []

	xpopIdx = 0
	ihsIdx = 0
	
	for ichrom in range(nreplicas):

		progress( 'Merging replicas', ichrom, nreplicas, freq = 1 )
	
		logging.info( 'looking at replica %d of %d' % ( ichrom, nreplicas ) )
		# Load in pos files for this replica.
		# They give, for each SNP in the replica, its physical (basepair) position within the replica,
		# and the frequency of the derived and the ancestral alleles.
		pos1, pos4, pos5 = [ DotData(SVPath=SimDir + scenDir + '/' + str(ichrom) + '_' + scenName + '.pos-%d%s' % ( pop, thinExt),
					     SVSkipFirstLines = 1, SVHeader = False,
					     names = ['SNP','CHROM', 'CHROM_POS', 'ALLELE1', 'FREQ1', 'ALLELE2', 'FREQ2' ]) for pop in ( 1, 4, 5 ) ]
		assert pos1.numCols() == pos4.numCols() == pos5.numCols()
		posBlank = ((numpy.nan,)*pos1.numCols(),)*3
		logging.info( 'Loaded pos files for chrom ' + str( ichrom ) +  ': ' + str( len(pos1) ) + 'snps' )

		assert set(pos1.CHROM_POS) == set(pos4.CHROM_POS) == set(pos5.CHROM_POS)

		logging.info( 'pos file sizes are: %d, %d, %d' % ( len( pos1 ), len( pos4 ), len( pos5 ) ) )
		logging.info( 'Merging on position...' )
		posAll = DotData.mergeOnKeyCols((pos1,pos4,pos5),('CHROM_POS',)*3,posBlank, suffixes = (' 1',' 4',' 5'))

		logging.info( 'Done merging.' )
		logging.info( 'type(posAll) is ' + str( type( posAll ) ) )
		print(len(posAll))
		chrom = numpy.ones(len(posAll))*ichrom
		newChrom = DotData(Columns = [chrom,],names=['newChrom',])
		print(newChrom)
		posAll = posAll[['CHROM_POS 1','FREQ1 1','FREQ1 4','FREQ1 5']]
		posAll.hstack(newChrom)

		logging.info( 'added replica number column' )
		
		print(posAll)
		posAllBlank = (numpy.nan,)*posAll.numCols()
		
		# 10-16-08 ADDED CHROM TO MERGED OUTPUT  ( not now used -- can be removed? )

		#
		# From the xpop and ihs significance results, get just the rows for SNPs in the 
		# current replica
		#

		#while xpopIdx < len( xpopAllChrom ) and xpopAllChrom[ xpopIdx ] == ichrom: xpopIdx += 1
		#xpop = xpopAll[ :xpopIdx ]
		xpop = xpopAll[ xpopAllChrom == ichrom ]
		logging.info( 'selected xpop for replica %d' % ichrom )
		xpopBlank = (numpy.nan,)*xpop.numCols()

		#while ihsIdx < len( ihsAllChrom ) and ihsAllChrom[ ihsIdx ] == ichrom: ihsIdx += 1
		#ihs = ihsAll[ :ihsIdx ]
		ihs = ihsAll[ ihsAllChrom == ichrom ]
		logging.info( 'selected ihs for replica %d' % ichrom )
		ihsBlank = (numpy.nan,)*ihs.numCols()

#		if not set( ihs[  'SNP pos (bases)' ] ).issubset( set( posAll['CHROM_POS 1'] ) ):
#			print 'bad positions: ', set( posAll['CHROM_POS 1'] ) - set( ihs[  'SNP pos (bases)' ] )
#		assert set( ihs[  'SNP pos (bases)' ] ).issubset( set( posAll['CHROM_POS 1'] ) ), "bad iHS file " + ihsSignifTsv

		logging.info( 'merging replica %d' % ichrom )
		Data = DotData.mergeOnKeyCols((posAll,xpop,ihs),('CHROM_POS 1','SNP pos (bases) European_WestAfrican','SNP pos (bases)'),
					      blanks = (posAllBlank,xpopBlank,ihsBlank), suffixes = ('pos',' xpop',' ihs'),
					      verbose = True )
		logging.info( 'done merging replica %d; now have %d records' % ( ichrom, len( Data ) ) )
		
		Data = Data[ numpy.invert( numpy.isnan( Data[ 'CHROM_POS 1' ] ) ) ]
		logging.info( 'done removing snp info for SNPs not in all .pos files for replica %d; now have %d records'
			      % ( ichrom, len( Data ) ) )
		
		replicates.append(Data)

		logging.info( 'now have ' + str( len( replicates ) ) + ' replicates.' )

	# endloop: for each replica

	logging.info( 'Stacking replicates...' )
	allData = reduce( lambda x, y: x.vstack(y), replicates)
	logging.info( 'Saving merged SNP info to ' + mergedDotData )
	allData.save( mergedDotData )
	
	logging.info( 'Finished mergeSims()' )
Example #12
                    default=1000,
                    help='number of bootstrap iters')
parser.add_argument('--cosi-binary',
                    default='./coalescent',
                    help='cosi binary to run')

print('calling parser')
args = parser.parse_args()
print('parser done')
# do a reference run

print('generating reference')
SystemSucceed(' '.join(
    map(str, (args.cosi_binary, '-p', '1_simple.cosiParams', '-n', 100,
              '-m'))) + ' | sample_stats_extra > ref.tsv')
refData = DotData(SVPath='ref.tsv')
min_p = np.ones(len(refData.dtype.names))
max_D = np.repeat(-np.inf, len(refData.dtype.names))
for i in range(10):
    dbg('i')
    refFN = 'reftest%d.tsv' % i
    SystemSucceed(' '.join(
        map(str, (args.cosi_binary, '-p', '0_simple.cosiParams', '-n', 100,
                  '-m'))) + ' | sample_stats_extra > ' + refFN)
    z = DotData(SVPath=refFN)
    for colNum, col in enumerate(z.dtype.names):
        ks_D, ks_p = stats.ks_2samp(refData[col], z[col])
        min_p[colNum] = np.min((min_p[colNum], ks_p))
        max_D[colNum] = np.max((max_D[colNum], ks_D))
    dbg('i min_p max_D')
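The loop above keeps, for every output column, the smallest KS p-value and the largest KS statistic seen across the repeated runs. A self-contained sketch of that bookkeeping on synthetic data (the column names are hypothetical):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
ref = {'pi': rng.normal(size=100), 'tajd': rng.normal(size=100)}   # reference run

min_p = {c: 1.0 for c in ref}
max_D = {c: -np.inf for c in ref}
for _ in range(10):                          # repeated comparison runs
    test = {c: rng.normal(size=100) for c in ref}
    for c in ref:
        ks_D, ks_p = stats.ks_2samp(ref[c], test[c])
        min_p[c] = min(min_p[c], ks_p)
        max_D[c] = max(max_D[c], ks_D)
print(min_p, max_D)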