Example #1
0
    def getChrLen(cls, genome, chr):
        """Return the length of chromosome `chr` in `genome`.

        Lookup order:
          1. Hard-coded lengths for 'chr21' and 'chrM' when the genome name
             equals 'testgenome' (case-insensitive); used by the unit tests.
          2. The class-level cache `cls._chrLengths[genome][chr]`.
          3. `cls.getNumElementsInFastaFile` applied to the path built by
             `createOrigPath` for '<chr>.fa' under the genome's sequence
             track. The result is stored in the cache before returning.

        Raises ArgumentValueError if step 3 raises IOError. Both arguments
        are asserted to be non-None.
        """
        assert genome is not None
        assert chr is not None

        # Fixed lengths so that the unit tests work without sequence files.
        isTestGenome = genome.lower() == 'testgenome'
        if isTestGenome and chr == 'chr21':
            return 46944323
        if isTestGenome and chr == 'chrM':
            return 16571

        # Serve from the cache when this (genome, chr) pair is already known.
        knownLengths = cls._chrLengths
        if genome in knownLengths and chr in knownLengths[genome]:
            return knownLengths[genome][chr]

        try:
            from gold.util.CommonFunctions import createOrigPath
            fastaPath = createOrigPath(
                genome, cls.getSequenceTrackName(genome), chr + '.fa')
            chrLength = cls.getNumElementsInFastaFile(fastaPath)
        except IOError:
            raise ArgumentValueError(
                "Error: chromosome '%s' is not part of genome '%s'." %
                (chr, genome))

        # Create the per-genome dict on first use, then remember the length.
        cls._chrLengths.setdefault(genome, {})[chr] = chrLength
        return chrLength
Example #2
0
def computePurePseudoPvalue(observation, mcSamples, tail):
    """Return a Monte Carlo p-value for `observation` against `mcSamples`.

    The value is tailFactor * (m + 1) / (n + 1), capped at 1.0, where n is
    len(mcSamples), m is the count returned by computeNumMoreExtreme and
    tailFactor is 2.0 for 'two-tail' and 1.0 for 'right-tail'/'left-tail'.

    Raises ArgumentValueError for any other value of `tail`.
    """
    sampleCount = len(mcSamples)

    if tail == 'two-tail':
        multiplier = 2.0
    elif tail in ('left-tail', 'right-tail'):
        multiplier = 1.0
    else:
        raise ArgumentValueError('Invalid value for tails argument:', tail)

    extremeCount = computeNumMoreExtreme(observation, mcSamples, tail)
    # multiplier is a float, so this is true division also under Python 2.
    return min(1.0, multiplier * (extremeCount + 1) / (sampleCount + 1))
Example #3
0
def computeNumMoreExtreme(observation, mcSamples, tails):
    """Count the samples that are at least as extreme as `observation`.

    'right-tail' counts samples >= observation, 'left-tail' counts samples
    <= observation, and 'two-tail' returns the smaller of those two counts.

    Raises ArgumentValueError for any other value of `tails`.
    """
    # Both counts are always computed, before `tails` is inspected.
    rightCount = len([sample for sample in mcSamples
                      if sample >= observation])
    leftCount = len([sample for sample in mcSamples
                     if sample <= observation])

    if tails == 'two-tail':
        return min(leftCount, rightCount)
    if tails == 'left-tail':
        return leftCount
    if tails == 'right-tail':
        return rightCount

    raise ArgumentValueError('Invalid value for tails argument:', tails)
Example #4
0
 def _init(self, kernelType=None, kernelStdev=None, minimumOffsetValue=1,
           **kwArgs):
     """Store the kernel configuration on the instance.

     kernelType 'gaussian' requires kernelStdev, stored as a float in
     self._kernelStdev. kernelType 'divideByOffset' (weigh by 1/x, where x
     is the offset from the center) requires minimumOffsetValue, stored as
     a float in self._minimumOffsetValue. The chosen type is stored in
     self._kernelType. Extra keyword arguments are accepted and ignored.

     Raises ArgumentValueError for any other kernelType.
     """
     if kernelType not in ('gaussian', 'divideByOffset'):
         raise ArgumentValueError('Invalid kernelType')

     if kernelType == 'gaussian':
         assert kernelStdev is not None
         self._kernelStdev = float(kernelStdev)
     else:
         assert minimumOffsetValue is not None
         self._minimumOffsetValue = float(minimumOffsetValue)

     self._kernelType = kernelType
    def customHeaders(self, customHeaders):
        """Replace the stored custom headers with those in `customHeaders`.

        self._customHeaders is reset to an empty OrderedDict, after which
        every (key, value) pair whose value is not None is handed to
        self.setCustomHeader.

        Raises InvalidFormatError if a value is the empty string, and
        ArgumentValueError if the lowercased key is already present in
        self._customHeaders.
        """
        self._customHeaders = OrderedDict()

        for headerName, headerValue in customHeaders.iteritems():
            # None marks a header that should simply be left out.
            if headerValue is None:
                continue

            if headerValue == '':
                raise InvalidFormatError(
                    'Empty header values not allowed. '
                    'Please use ".", the period character, to '
                    'indicate missing values')

            # presumably setCustomHeader stores keys lowercased, which is
            # what makes this duplicate check work -- verify against it.
            alreadyPresent = headerName.lower() in self._customHeaders
            if alreadyPresent:
                raise ArgumentValueError(
                    'Custom header "{}" appears multiple times in the '
                    'header list. Note that custom headers are case '
                    'insensitive (e.g., "ABC" and "abc" is the same '
                    'header).'.format(headerName))

            self.setCustomHeader(headerName, headerValue)
    def attributes(self, attributes):
        """Replace the stored attributes with those in `attributes`.

        self._attributes is reset to an empty OrderedDict, after which every
        (key, value) pair whose value is not None is handed to
        self.setAttribute. When self._doUnquote is set, the value is first
        passed through urlDecodePhrase.

        Raises InvalidFormatError if a value is the empty string, and
        ArgumentValueError if the lowercased key is already present in
        self._attributes.
        """
        self._attributes = OrderedDict()

        for attrName, attrValue in attributes.iteritems():
            # None marks an attribute that should simply be left out.
            if attrValue is None:
                continue

            if attrValue == '':
                raise InvalidFormatError(
                    'Empty attribute contents not allowed. '
                    'Please use ".", the period character, to '
                    'indicate missing values')

            # Decoding happens after the emptiness check, on the raw value.
            if self._doUnquote:
                attrValue = urlDecodePhrase(attrValue)

            # presumably setAttribute stores keys lowercased, which is what
            # makes this duplicate check work -- verify against it.
            alreadyPresent = attrName.lower() in self._attributes
            if alreadyPresent:
                raise ArgumentValueError(
                    'Attribute "{}" appears multiple times in the '
                    'attribute list. Note that attributes are case '
                    'insensitive (e.g., "ABC" and "abc" is the same '
                    'attribute).'.format(attrName))

            self.setAttribute(attrName, attrValue)
    def _compute(self):
        """Compute a Monte Carlo p-value plus summary stats of the null distribution.

        Randomized results are collected in self._randResults until there
        are self._numResamplings of them; results already present are
        reused. The observation (self._observation, or the result of
        self._realChild if that is None) is then compared against them
        according to self._tails ('right-tail', 'left-tail' or 'two-tail').

        Returns:
            None if the real result / observation is None or contains NaN;
            otherwise an OrderedDict with the p-value (self.PVAL_KEY), the
            observed statistic, mean/median/sd of the non-NaN null
            distribution, the difference between observation and mean, the
            number of samples (total and non-NaN) and the number of more
            extreme samples (self.M_KEY). 'NumPointsTr1'/'NumPointsTr2' are
            added when the instance has '_pointCount1'/'_pointCount2', and
            'fullNullDistribution' when requested through
            kwArgs['includeFullNullDistribution'] == 'yes'.

        Raises:
            ArgumentValueError: if self._tails is not a recognised value.
        """
        # '== True' (rather than 'is True' or plain truthiness) is kept on
        # purpose, so exactly the values that compared equal to True before
        # are still treated as a minimal run.
        isMinimal = self._kwArgs.get('minimal') == True

        # For non-minimal runs, bail out before doing any resampling if the
        # real result is unusable.
        if not isMinimal:
            realResult = self._realChild.getResult()
            if realResult is None or anyIsNan(realResult):
                return None

        # NOTE: resamplings are always computed serially here; an earlier
        # job-handler based parallel path is disabled.
        # Only the resamplings that are still missing are computed.
        for i in xrange(len(self._randResults), self._numResamplings):
            randChild = self._createRandomizedStat(i)
            self._randResults.append(randChild.getResult())

        numpyRandResults = array(self._randResults)
        if self._observation is None:
            self._observation = self._realChild.getResult()

        # For minimal runs the observation is only validated after the
        # resampling loop has run.
        if isMinimal and (self._observation is None
                          or anyIsNan(self._observation)):
            return None

        nonNanNumpyRandResults = numpyRandResults[~isnan(numpyRandResults)]
        assert len(numpyRandResults) == self._numResamplings
        numberOfNonNanRandResults = len(nonNanNumpyRandResults)

        # Summary statistics are based on the non-NaN samples only.
        meanOfNullDistr = nonNanNumpyRandResults.mean(dtype='float64')
        medianOfNullDistr = median(nonNanNumpyRandResults)
        sdOfNullDistr = nonNanNumpyRandResults.std(dtype='float64')
        diffObsMean = (self._observation - meanOfNullDistr)

        # The counts run over all samples; comparisons against NaN are
        # False, so NaN samples never count as more extreme.
        numMoreExtremeRight = sum(1 for res in self._randResults
                                  if res >= self._observation)
        numMoreExtremeLeft = sum(1 for res in self._randResults
                                 if res <= self._observation)
        if self._tails == 'right-tail':
            numMoreExtreme = numMoreExtremeRight
            tailFactor = 1.0
        elif self._tails == 'left-tail':
            numMoreExtreme = numMoreExtremeLeft
            tailFactor = 1.0
        elif self._tails == 'two-tail':
            numMoreExtreme = min(numMoreExtremeLeft, numMoreExtremeRight)
            tailFactor = 2.0
        else:
            raise ArgumentValueError('Invalid value for tails argument:',
                                     self._tails)

        # For more info on the formula for calculating p-values:
        # "Permutation P-values should never be zero: calculating exact P-values
        #  when permutations are randomly drawn" (http://www.ncbi.nlm.nih.gov/pubmed/21044043)
        # tailFactor is a float, so this is true division also in Python 2.
        pval = tailFactor * (numMoreExtreme + 1) / (self._numResamplings + 1)
        pval = min(1.0, pval)

        resDict = OrderedDict([(self.PVAL_KEY, pval),
                               ('TSMC_' + self.getRawStatisticMainClassName(),
                                self._observation),
                               ('MeanOfNullDistr', meanOfNullDistr),
                               ('MedianOfNullDistr', medianOfNullDistr),
                               ('SdNullDistr', sdOfNullDistr),
                               ('DiffFromMean', diffObsMean),
                               (self.NUM_SAMPLES_KEY, self._numResamplings),
                               ('NumSamplesNotNan', numberOfNonNanRandResults),
                               (self.M_KEY, numMoreExtreme)])

        # The p-value is blanked out when a track has fewer than one point.
        if hasattr(self, '_pointCount1'):
            numElTr1 = self._pointCount1.getResult()
            if numElTr1 < 1:
                resDict[self.PVAL_KEY] = None
            resDict['NumPointsTr1'] = numElTr1
        if hasattr(self, '_pointCount2'):
            numElTr2 = self._pointCount2.getResult()
            if numElTr2 < 1:
                # Use PVAL_KEY (not a hard-coded 'P-value') so the entry
                # created above is the one that gets blanked out.
                resDict[self.PVAL_KEY] = None
            resDict['NumPointsTr2'] = numElTr2

        if self._kwArgs.get('includeFullNullDistribution') == 'yes':
            resDict['fullNullDistribution'] = ','.join(
                [str(x) for x in nonNanNumpyRandResults])
        assert len(self._randResults) == self._numResamplings
        return resDict