Example #1
def pca(data, algorithm='eig'):
    """pca(data) -> mean, pcs, norm_pcs, variances, positions, norm_positions
	Perform Principal Components Analysis on a set of n data points in k
	dimensions. The data array must be of shape (n, k).
	
	This function returns the transformed position of each data point along
	with the fraction of the total variance captured by each principal
	component.
	
	The optional algorithm parameter can be either 'svd' to perform PCA with 
	the singular value decomposition, or 'eig' to use a symmetric eigenvalue
	decomposition. Empirically, eig is faster on the datasets I have tested.
	"""

    if False:  #scaling is disabled
        for i in xrange(0, len(data[0])):
            freqs = []
            for j in xrange(0, len(data)):
                freqs.append(data[j][i])
            meanFreq = float(sum(freqs)) / len(freqs)
            sdFreq = stdDev(freqs, meanFreq)

            for j in xrange(0, len(data)):
                data[j][i] -= meanFreq
                data[j][i] /= sdFreq

    data = numpy.asarray(data)
    mean = data.mean(axis=0)
    centered = data - mean

    if algorithm == 'eig':
        pcs, variances, stds, positions, norm_positions = _pca_eig(centered)
    elif algorithm == 'svd':
        pcs, variances, stds, positions, norm_positions = _pca_svd(centered)
    else:
        raise RuntimeError('Algorithm %s not known.' % algorithm)

    sumVariances = sum(variances)
    for i in xrange(0, len(variances)):
        variances[i] /= sumVariances

    return positions, variances
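The helpers _pca_eig and _pca_svd are not shown in this example. Below is a minimal sketch of what the eigenvalue-based helper could look like, assuming it diagonalises the covariance matrix of the centred data with numpy.linalg.eigh; the return tuple mirrors the call site above, but the body itself is an assumption rather than the project's actual code.

import numpy

def _pca_eig(centered):
    # Sketch only: sample covariance matrix of the centred (n, k) data.
    n = centered.shape[0]
    cov = numpy.dot(centered.T, centered) / (n - 1)
    # Symmetric eigendecomposition; eigenvalues come back in ascending order.
    variances, pcs = numpy.linalg.eigh(cov)
    order = numpy.argsort(variances)[::-1]
    variances = variances[order]
    pcs = pcs[:, order].T                       # one principal component per row
    stds = numpy.sqrt(variances)                # spread of the data along each component
    positions = numpy.dot(centered, pcs.T)      # project each point onto the components
    norm_positions = positions / stds           # positions in units of standard deviations
    return pcs, variances, stds, positions, norm_positions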
Example #2
def pca(data, algorithm='eig'):
	"""pca(data) -> mean, pcs, norm_pcs, variances, positions, norm_positions
	Perform Principal Components Analysis on a set of n data points in k
	dimensions. The data array must be of shape (n, k).
	
	This function returns the transformed position of each data point along
	with the fraction of the total variance captured by each principal
	component.
	
	The optional algorithm parameter can be either 'svd' to perform PCA with 
	the singular value decomposition, or 'eig' to use a symmetric eigenvalue
	decomposition. Empirically, eig is faster on the datasets I have tested.
	"""
	
	if False:	#scaling is disabled
		for i in xrange(0, len(data[0])):
			freqs = []
			for j in xrange(0, len(data)):
				freqs.append(data[j][i])
			meanFreq = float(sum(freqs)) / len(freqs)
			sdFreq = stdDev(freqs, meanFreq)
			
			for j in xrange(0, len(data)):
				data[j][i] -= meanFreq
				data[j][i] /= sdFreq
		
	data = numpy.asarray(data)
	mean = data.mean(axis = 0)
	centered = data - mean
	
	if algorithm=='eig':
		pcs, variances, stds, positions, norm_positions = _pca_eig(centered)
	elif algorithm=='svd':
		pcs, variances, stds, positions, norm_positions = _pca_svd(centered)
	else:
		raise RuntimeError('Algorithm %s not known.'%algorithm)
	
	sumVariances = sum(variances)
	for i in xrange(0, len(variances)):
		variances[i] /= sumVariances
	
	return positions, variances 
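A hypothetical usage of the function above; the data matrix here is random and purely illustrative.

import numpy

data = numpy.random.rand(100, 5)     # 100 points in 5 dimensions, shape (n, k)
positions, variances = pca(data)     # defaults to the 'eig' algorithm
print(sum(variances))                # variances are normalised to sum to 1.0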
Example #3
    def run(self, statTest, effectSizeMeasure, profile, progress=None):
        self.results.data = []
        self.results.test = statTest.name
        self.results.profile = profile

        if progress == 'Verbose':
            print '  Processing feature:'

        index = 0
        for feature in profile.getFeatures():
            if progress == 'Verbose':
                print '    ' + feature
            elif progress != None:
                if progress.wasCanceled():
                    self.results.data = []
                    return

                index += 1
                progress.setValue(index)

            seqCount = profile.getActiveFeatureCounts(feature)
            parentCount = profile.getActiveParentCounts(feature)
            data = profile.getActiveFeatureProportions(feature)
            pValue, note = statTest.hypothesisTest(data)
            effectSize = effectSizeMeasure.run(data)

            row = [feature, float(pValue), float(pValue), effectSize, note]

            for i in xrange(0, len(seqCount)):
                propGroup = []
                for j in xrange(0, len(seqCount[i])):
                    sc = seqCount[i][j]
                    pc = parentCount[i][j]
                    if pc > 0:
                        propGroup.append(sc * 100.0 / pc)
                    else:
                        propGroup.append(0.0)

                meanGroup = mean(propGroup)
                row.append(meanGroup)
                row.append(stdDev(propGroup, meanGroup))

            for i in xrange(0, len(seqCount)):
                for j in xrange(0, len(seqCount[i])):
                    sc = seqCount[i][j]
                    pc = parentCount[i][j]
                    row.append(sc)
                    row.append(pc)
                    if pc > 0:
                        row.append(sc * 100.0 / pc)
                    else:
                        row.append(0.0)

            self.results.data.append(row)

        headingsSampleStats = []
        for i in xrange(0, len(profile.activeSamplesInGroups)):
            for sampleName in profile.activeSamplesInGroups[i]:
                headingsSampleStats.append(sampleName)
                headingsSampleStats.append(sampleName + ': parent seq. count')
                headingsSampleStats.append(sampleName + ': rel. freq. (%)')

        self.results.createTableHeadings(profile.activeGroupNames,
                                         headingsSampleStats)

        if len(self.results.data) >= 1:
            # sort results according to p-values
            self.results.data = TableHelper.SortTable(
                self.results.data, [self.results.dataHeadings['pValues']])

        if progress != None and progress != 'Verbose':
            index += 1
            progress.setValue(index)
Example #4
    def run(self, test, signLevel, statsResults, trials, bootstrapRep,
            progress):

        tableData = []
        index = 0
        for row in statsResults:
            feature = row[0]
            seq1 = row[1]
            seq2 = row[2]
            parentSeq1 = row[3]
            parentSeq2 = row[4]

            p1 = float(seq1) / parentSeq1
            p2 = float(seq2) / parentSeq2

            powerList = []
            powerListLess5 = []
            powerListGreater5 = []
            for trial in xrange(0, trials):
                if progress != '':
                    index += 1
                    progress.setValue(index)
                    progress.setLabelText(feature + ' - Trial = ' + str(trial))

                power = 0
                processedReplicates = 0
                for dummy in xrange(0, bootstrapRep):
                    c1 = 0
                    c2 = 0
                    for dummy in xrange(0, parentSeq1):
                        rnd = random.random()
                        if rnd <= p1:
                            c1 += 1

                    for dummy in xrange(0, parentSeq2):
                        rnd = random.random()
                        if rnd <= p2:
                            c2 += 1

                    if c1 == 0 and c2 == 0:
                        # This is a special case that many hypothesis tests will not handle correctly
                        # so we just ignore it. This will have little effect on the calculated power
                        # of a test.
                        continue

                    processedReplicates += 1

                    pValueOneSided, pValueTwoSided = test.hypothesisTest(
                        c1, c2, parentSeq1, parentSeq2)
                    if pValueTwoSided < signLevel:
                        power += 1

                if processedReplicates > 0:
                    if min([seq1, seq2]) <= 5:
                        powerListLess5.append(
                            float(power) / processedReplicates)
                    else:
                        powerListGreater5.append(
                            float(power) / processedReplicates)

                    powerList.append(float(power) / processedReplicates)

            row = []
            row.append(feature)
            row.append(seq1)
            row.append(seq2)
            row.append(parentSeq1)
            row.append(parentSeq2)
            row.append(float(seq1) / parentSeq1)
            row.append(float(seq2) / parentSeq2)
            row.append(mean(powerList))
            row.append(stdDev(powerList))

            if math.isnan(mean(powerListLess5)):
                row.append('')
            else:
                row.append(mean(powerListLess5))

            if math.isnan(stdDev(powerListLess5)):
                row.append('')
            else:
                row.append(stdDev(powerListLess5))

            if math.isnan(mean(powerListGreater5)):
                row.append('')
            else:
                row.append(mean(powerListGreater5))

            if math.isnan(stdDev(powerListGreater5)):
                row.append('')
            else:
                row.append(stdDev(powerListGreater5))

            tableData.append(row)

        return tableData
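The two innermost loops above draw each simulated count one sequence at a time with random.random(). The same replicate counts could be drawn with a single binomial sample per group, which is the approach Example #9 below takes; a sketch, assuming NumPy is available:

from numpy.random import binomial

# Equivalent to the per-sequence Bernoulli loops above, one draw per group.
c1 = binomial(parentSeq1, p1)
c2 = binomial(parentSeq2, p2)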
Example #5
	def run(self, statTest, effectSizeMeasure, profile, progress = None):
		self.results.data = []
		self.results.test = statTest.name
		self.results.profile = profile
		
		if progress == 'Verbose':
			print '  Processing feature:'
		 
		index = 0
		for feature in profile.getFeatures():
			if progress == 'Verbose':
				print '    ' + feature
			elif progress != None:
				if progress.wasCanceled():
					self.results.data = []
					return

				index += 1
				progress.setValue(index)
								
			seqCount = profile.getActiveFeatureCounts(feature)
			parentCount = profile.getActiveParentCounts(feature)
			data = profile.getActiveFeatureProportions(feature)
			pValue, note = statTest.hypothesisTest(data)
			effectSize = effectSizeMeasure.run(data)
 
			row = [feature, float(pValue), float(pValue), effectSize, note]
			
			for i in xrange(0, len(seqCount)):
				propGroup = []
				for j in xrange(0, len(seqCount[i])):
					if parentCount[i][j] > 0:
						propGroup.append(seqCount[i][j] * 100.0 / parentCount[i][j])
					else:
						propGroup.append(0.0)
					
				meanGroup = mean(propGroup)
				row.append(meanGroup)
				row.append(stdDev(propGroup, meanGroup))
			
			for i in xrange(0, len(seqCount)):
				for j in xrange(0, len(seqCount[i])):
					row.append(seqCount[i][j])
					row.append(parentCount[i][j])
					if parentCount[i][j] > 0:
						row.append(seqCount[i][j] * 100.0 / parentCount[i][j])
					else:
						row.append(0.0)
					
			self.results.data.append(row)

		headingsSampleStats = []
		for i in xrange(0, len(profile.activeSamplesInGroups)):
			for sampleName in profile.activeSamplesInGroups[i]:
				headingsSampleStats.append(sampleName)
				headingsSampleStats.append(sampleName + ': parent seq. count')
				headingsSampleStats.append(sampleName + ': rel. freq. (%)')
			
		self.results.createTableHeadings(profile.activeGroupNames, headingsSampleStats)
			
		if len(self.results.data) >= 1:
			# sort results according to p-values
			self.results.data = TableHelper.SortTable(self.results.data, [self.results.dataHeadings['pValues']])
			
		if progress != None and progress != 'Verbose':
			index += 1
			progress.setValue(index)
Example #6
    def plot(self, profile, statsResults):
        if len(profile.profileDict) <= 0:
            self.emptyAxis()
            return

        if len(profile.profileDict) > 10000:
            QtGui.QApplication.instance().setOverrideCursor(
                QtGui.QCursor(QtCore.Qt.ArrowCursor))
            reply = QtGui.QMessageBox.question(
                self, 'Continue?', 'Profile contains ' +
                str(len(profile.profileDict)) + ' features. ' +
                'It may take several seconds to generate this plot. Exploring the data at a higher hierarchy level is recommended. '
                + 'Do you wish to continue?', QtGui.QMessageBox.Yes,
                QtGui.QMessageBox.No)
            QtGui.QApplication.instance().restoreOverrideCursor()
            if reply == QtGui.QMessageBox.No:
                self.emptyAxis()
                return

        # *** Colour of plot elements
        axesColour = str(self.preferences['Axes colour'].name())
        group1Colour = str(
            self.preferences['Group colours'][profile.groupName1].name())
        group2Colour = str(
            self.preferences['Group colours'][profile.groupName2].name())

        # *** Set sample names
        self.groupName1 = profile.groupName1
        self.groupName2 = profile.groupName2

        # *** Create lists for each quantity of interest and calculate spread of data
        groupData1, groupData2 = profile.getFeatureProportionsAll()
        features = profile.getFeatures()

        field1 = []
        field2 = []
        xSpread = []
        ySpread = []
        for i in xrange(0, len(groupData1)):
            mean1 = mean(groupData1[i])
            mean2 = mean(groupData2[i])

            field1.append(mean1)
            field2.append(mean2)

            if self.spreadMethod == 'standard deviation':
                xSpread.append([
                    max(mean1 - stdDev(groupData1[i], mean1), 0),
                    min(mean1 + stdDev(groupData1[i], mean1), 100)
                ])
                ySpread.append([
                    max(mean2 - stdDev(groupData2[i], mean2), 0),
                    min(mean2 + stdDev(groupData2[i], mean2), 100)
                ])
            elif self.spreadMethod == '2 * standard deviation':
                xSpread.append([
                    max(mean1 - 2 * stdDev(groupData1[i], mean1), 0),
                    min(mean1 + 2 * stdDev(groupData1[i], mean1), 100)
                ])
                ySpread.append([
                    max(mean2 - 2 * stdDev(groupData2[i], mean2), 0),
                    min(mean2 + 2 * stdDev(groupData2[i], mean2), 100)
                ])
            elif self.spreadMethod == '25th and 75th percentile':
                spread1 = mquantiles(groupData1[i], prob=[0.25, 0.75])
                spread2 = mquantiles(groupData2[i], prob=[0.25, 0.75])
                xSpread.append([max(spread1[0], 0), min(spread1[1], 100)])
                ySpread.append([max(spread2[0], 0), min(spread2[1], 100)])
            elif self.spreadMethod == '9th and 91st percentile':
                spread1 = mquantiles(groupData1[i], prob=[0.09, 0.91])
                spread2 = mquantiles(groupData2[i], prob=[0.09, 0.91])
                xSpread.append([max(spread1[0], 0), min(spread1[1], 100)])
                ySpread.append([max(spread2[0], 0), min(spread2[1], 100)])
            elif self.spreadMethod == '2nd and 98th percentile':
                spread1 = mquantiles(groupData1[i], prob=[0.02, 0.98])
                spread2 = mquantiles(groupData2[i], prob=[0.02, 0.98])
                xSpread.append([max(spread1[0], 0), min(spread1[1], 100)])
                ySpread.append([max(spread2[0], 0), min(spread2[1], 100)])
            elif self.spreadMethod == 'minimum and maximum':
                xSpread.append([min(groupData1[i]), max(groupData1[i])])
                ySpread.append([min(groupData2[i]), max(groupData2[i])])

        # *** Set figure size
        self.fig.clear()
        self.fig.set_size_inches(self.figWidth, self.figHeight)

        if self.bShowHistograms:
            histogramSizeX = self.histogramSize / self.figWidth
            histogramSizeY = self.histogramSize / self.figHeight
        else:
            histogramSizeX = 0.0
            histogramSizeY = 0.0

        padding = 0.1  # inches
        xOffsetFigSpace = (0.4 + padding) / self.figWidth
        yOffsetFigSpace = (0.3 + padding) / self.figHeight
        axesScatter = self.fig.add_axes([
            xOffsetFigSpace, yOffsetFigSpace, 1.0 - xOffsetFigSpace -
            histogramSizeX - (2 * padding) / self.figWidth, 1.0 -
            yOffsetFigSpace - histogramSizeY - (2 * padding) / self.figHeight
        ])

        if self.bShowHistograms:
            axesTopHistogram = self.fig.add_axes([
                xOffsetFigSpace,
                1.0 - histogramSizeY - padding / self.figHeight,
                1.0 - xOffsetFigSpace - histogramSizeX -
                (2 * padding) / self.figWidth, histogramSizeY
            ])

            axesRightHistogram = self.fig.add_axes([
                1.0 - histogramSizeX - padding / self.figWidth,
                yOffsetFigSpace, histogramSizeX, 1.0 - yOffsetFigSpace -
                histogramSizeY - (2 * padding) / self.figHeight
            ])

        # *** Handle mouse events
        tooltips = []
        for i in xrange(0, len(field1)):
            tooltip = features[i] + '\n\n'
            tooltip += (self.groupName1 +
                        ' mean proportion: %.3f' % field1[i]) + '\n'
            tooltip += (self.groupName2 +
                        ' mean proportion: %.3f' % field2[i]) + '\n\n'
            tooltip += 'Difference between mean proportions (%): ' + (
                '%.3f' % (field1[i] - field2[i])) + '\n'

            if field2[i] != 0:
                tooltip += 'Ratio of mean proportions: %.3f' % (field1[i] /
                                                                field2[i])
            else:
                tooltip += 'Ratio of mean proportions: undefined'

            if statsResults.profile != None:
                pValue = statsResults.getFeatureStatisticAsStr(
                    features[i], 'pValues')
                pValueCorrected = statsResults.getFeatureStatisticAsStr(
                    features[i], 'pValuesCorrected')
                tooltip += '\n\n'
                tooltip += 'p-value: ' + pValue + '\n'
                tooltip += 'Corrected p-value: ' + pValueCorrected

            tooltips.append(tooltip)

        self.plotEventHandler = PlotEventHandler(field1, field2, tooltips)

        self.mouseEventCallback(self.plotEventHandler)

        # *** Calculate R^2 value
        slope, intercept, r_value, p_value, std_err = linregress(
            field1, field2)

        # *** Plot data

        # set visual properties of all points
        colours = []
        highlightedField1 = []
        highlightedField2 = []
        highlightedColours = []
        for i in xrange(0, len(field1)):
            if field1[i] > field2[i]:
                colours.append(group1Colour)
            else:
                colours.append(group2Colour)

            if features[i] in self.preferences['Highlighted group features']:
                highlightedField1.append(field1[i])
                highlightedField2.append(field2[i])
                highlightedColours.append(colours[i])

        # scatter plot
        axesScatter.scatter(field1,
                            field2,
                            c=colours,
                            s=self.markerSize,
                            zorder=5)
        if len(highlightedField1) > 0:
            axesScatter.scatter(highlightedField1,
                                highlightedField2,
                                c=highlightedColours,
                                s=self.markerSize,
                                edgecolors='red',
                                linewidth=2,
                                zorder=10)

        # plot CIs
        if self.spreadMethod != 'None':
            xlist = []
            ylist = []
            for i in xrange(0, len(field1)):
                # horizontal CIs
                xlist.append(xSpread[i][0])
                xlist.append(xSpread[i][1])
                xlist.append(None)
                ylist.append(field2[i])
                ylist.append(field2[i])
                ylist.append(None)

                # vertical CIs
                xlist.append(field1[i])
                xlist.append(field1[i])
                xlist.append(None)
                ylist.append(ySpread[i][0])
                ylist.append(ySpread[i][1])
                ylist.append(None)

            axesScatter.plot(xlist,
                             ylist,
                             '-',
                             color='gray',
                             antialiased=False)

        # plot y=x line
        maxProportion = max(max(field1), max(field2)) * 1.05
        axesScatter.plot([0, maxProportion], [0, maxProportion],
                         color=axesColour,
                         linestyle='dashed',
                         marker='',
                         zorder=1)

        axesScatter.set_xlabel(self.groupName1 + ' (%)')
        axesScatter.set_ylabel(self.groupName2 + ' (%)')

        if self.bShowR2:
            axesScatter.text(0.02,
                             0.98,
                             r'R$^2$ = ' + ('%0.3f' % r_value**2),
                             horizontalalignment='left',
                             verticalalignment='top',
                             transform=axesScatter.transAxes)

        axesScatter.set_xlim(0, maxProportion)
        axesScatter.set_ylim(0, maxProportion)

        # *** Prettify scatter plot
        for line in axesScatter.yaxis.get_ticklines():
            line.set_color(axesColour)

        for line in axesScatter.xaxis.get_ticklines():
            line.set_color(axesColour)

        for loc, spine in axesScatter.spines.iteritems():
            spine.set_color(axesColour)

        # plot histograms
        if not self.bShowHistograms:
            for a in axesScatter.yaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for a in axesScatter.xaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for line in axesScatter.yaxis.get_ticklines():
                line.set_color(axesColour)

            for line in axesScatter.xaxis.get_ticklines():
                line.set_color(axesColour)

            for loc, spine in axesScatter.spines.iteritems():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(axesColour)

        else:  # show histograms
            # plot top histogram
            axesTopHistogram.xaxis.set_major_formatter(NullFormatter())
            pdf, bins, patches = axesTopHistogram.hist(field1,
                                                       bins=self.numBins,
                                                       facecolor=group1Colour)
            axesTopHistogram.set_xlim(axesScatter.get_xlim())
            axesTopHistogram.set_yticks([0, max(pdf)])
            axesTopHistogram.set_ylim([0, max(pdf) * 1.05])

            # plot right histogram
            axesRightHistogram.yaxis.set_major_formatter(NullFormatter())
            pdf, bins, patches = axesRightHistogram.hist(
                field2,
                bins=self.numBins,
                orientation='horizontal',
                facecolor=group2Colour)
            axesRightHistogram.set_ylim(axesScatter.get_ylim())
            axesRightHistogram.set_xticks([0, max(pdf)])
            axesRightHistogram.set_xlim([0, max(pdf) * 1.05])

            # *** Prettify histogram plot
            for a in axesTopHistogram.yaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for a in axesTopHistogram.xaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for line in axesTopHistogram.yaxis.get_ticklines():
                line.set_color(axesColour)

            for line in axesTopHistogram.xaxis.get_ticklines():
                line.set_color(axesColour)

            for loc, spine in axesTopHistogram.spines.iteritems():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(axesColour)

            for a in axesRightHistogram.yaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for a in axesRightHistogram.xaxis.majorTicks:
                a.tick1On = True
                a.tick2On = False

            for line in axesRightHistogram.yaxis.get_ticklines():
                line.set_color(axesColour)

            for line in axesRightHistogram.xaxis.get_ticklines():
                line.set_color(axesColour)

            for loc, spine in axesRightHistogram.spines.iteritems():
                if loc in ['right', 'top']:
                    spine.set_color('none')
                else:
                    spine.set_color(axesColour)

        self.updateGeometry()
        self.draw()
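The percentile spreads above rely on mquantiles and the R^2 value on linregress; both come from SciPy (scipy.stats.mstats.mquantiles and scipy.stats.linregress), presumably imported at module level. A small standalone check of the quartile call used above:

from scipy.stats.mstats import mquantiles

values = [1.0, 2.0, 3.0, 4.0, 5.0]
q25, q75 = mquantiles(values, prob=[0.25, 0.75])   # 25th and 75th percentiles
print(q25, q75)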
Example #7
results = [
    coverageListDP, coverageListDPCC, coverageListNW, coverageListWoolf,
    coverageListGart, coverageListRP
]
lengths = [
    ciLengthDP, ciLengthDPCC, ciLengthNW, ciLengthWoolf, ciLengthGart,
    ciLengthRP
]
methodNames = [
    'DP: Asymptotic', 'DP: Asymptotic-CC', 'Newcombe-Wilson', 'Woolf', 'Gart',
    'RP: Asymptotic'
]

for i in xrange(0, len(results)):
    coverageMeanStr = '%.2f' % mean(results[i])
    coverageSdStr = '%.2f' % stdDev(results[i])
    coverageMinStr = '%.2f' % min(results[i])
    coverageMaxStr = '%.2f' % max(results[i])

    lengthMeanStr = '%.2f' % mean(lengths[i])
    lengthSdStr = '%.2f' % stdDev(lengths[i])

    fout.write(methodNames[i] + '\n')
    fout.write(coverageMeanStr + '+/-' + coverageSdStr + '[' + coverageMinStr +
               ';' + coverageMaxStr + ']\n')
    fout.write(lengthMeanStr + '+/-' + lengthSdStr + '\n')
    fout.write('\n')

fout.close()
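Several of these examples call mean and stdDev helpers that are not shown, and Examples #4 and #9 test their results with math.isnan, which suggests both return NaN when a list is too short. A minimal sketch of what the helpers might look like; the NaN behaviour is inferred from the call sites, and the sample (n - 1) divisor is an assumption.

import math

def mean(values):
    # Arithmetic mean; NaN for an empty list so callers can test with math.isnan.
    if len(values) == 0:
        return float('nan')
    return float(sum(values)) / len(values)

def stdDev(values, meanValue=None):
    # Standard deviation; some call sites pass a precomputed mean as the second argument.
    if len(values) < 2:
        return float('nan')
    if meanValue is None:
        meanValue = mean(values)
    sumsq = sum((v - meanValue) ** 2 for v in values)
    return math.sqrt(sumsq / (len(values) - 1))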
Example #8
	def run(self, statTest, testType, confIntervMethod, coverage, profile, progress = None):
		self.results.test = statTest.name
		self.results.testType = testType
		self.results.alpha = 1.0 - coverage
		self.results.confIntervMethod = confIntervMethod
		self.results.profile = profile
		
		if progress == 'Verbose':
			print '  Processing feature:'
		 
		self.results.data = []
		index = 0
		
		# calculate statistics
		seqsGroup1 = []
		seqsGroup2 = []
		parentSeqsGroup1 = []
		parentSeqsGroup2 = []
		pValues = []
		lowerCIs = []
		upperCIs = []
		effectSizes = []
		notes = []
		if statTest.bSingleFeatureInterface:
			# process features one at a time
			for feature in profile.getFeatures():
				if progress == 'Verbose':
					print '    ' + feature
				elif progress != None:
					if progress.wasCanceled():
						self.results.data = []
						return

					index += 1
					progress.setValue(index)
															
				# get statistics
				seqGroup1, seqGroup2 = profile.getFeatureCounts(feature)
				parentSeqGroup1, parentSeqGroup2= profile.getParentFeatureCounts(feature)
				results = statTest.run(seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2, confIntervMethod, coverage)
				pValueOneSided, pValueTwoSided, lowerCI, upperCI, effectSize, note = results
				
				if testType == 'One-sided':
					pValue = pValueOneSided
				elif testType == 'Two-sided':
					pValue = pValueTwoSided
				else:
					print 'Error: Unknown test type.'
	 
				# record results
				seqsGroup1.append(seqGroup1)
				seqsGroup2.append(seqGroup2)
				parentSeqsGroup1.append(parentSeqGroup1)
				parentSeqsGroup2.append(parentSeqGroup2)
				pValues.append(pValue)
				lowerCIs.append(lowerCI)
				upperCIs.append(upperCI)
				effectSizes.append(effectSize)
				notes.append(note)
				
			if progress != None and progress != 'Verbose':
				index += 1
				progress.setValue(index)
		else:
			# process all features at once
			seqsGroup1, seqsGroup2 = profile.getFeatureCountsAll()
			parentSeqsGroup1, parentSeqsGroup2= profile.getParentFeatureCountsAll()
			pValuesOneSided, pValuesTwoSided, lowerCIs, upperCIs, effectSizes, notes = statTest.runAll(seqsGroup1, seqsGroup2, parentSeqsGroup1, parentSeqsGroup2, confIntervMethod, coverage, progress)
			if progress == 'Verbose':
				print '    Processing all features...'
			elif progress != None and progress.wasCanceled():
				self.results.data = []
				return

			if testType == 'One-sided':
				pValues = pValuesOneSided
			elif testType == 'Two-sided':
				pValues = pValuesTwoSided
			else:
				print 'Error: Unknown test type.'
				
		# record statistics
		features = profile.getFeatures()
		for i in xrange(0, len(features)):
			propGroup1 = []
			for j in xrange(0, len(seqsGroup1[i])):
				sg1 = seqsGroup1[i][j]
				psg1 = parentSeqsGroup1[i][j]
				
				if psg1 > 0:
					propGroup1.append( sg1 * 100.0 / psg1 )
				else:
					propGroup1.append( 0.0 )
			
			propGroup2 = []
			for j in xrange(0, len(seqsGroup2[i])):
				sg2 = seqsGroup2[i][j]
				psg2 = parentSeqsGroup2[i][j]
				
				if psg2 > 0:
					propGroup2.append( sg2 * 100.0 / psg2 )
				else:
					propGroup2.append( 0.0 )
			
			meanGroup1 = mean(propGroup1)
			meanGroup2 = mean(propGroup2)
			row = [features[i], meanGroup1, stdDev(propGroup1, meanGroup1), meanGroup2, stdDev(propGroup2, meanGroup2),
							float(pValues[i]),float(pValues[i]),float(effectSizes[i]),
							float(lowerCIs[i]),float(upperCIs[i]), notes[i]]
							
			for j in xrange(0, len(seqsGroup1[i])):
				row.append(seqsGroup1[i][j])
				row.append(parentSeqsGroup1[i][j])
				if parentSeqsGroup1[i][j] > 0:
					row.append(seqsGroup1[i][j] * 100.0 / parentSeqsGroup1[i][j])
				else:
					row.append(0.0)
				
			for j in xrange(0, len(seqsGroup2[i])):
				row.append(seqsGroup2[i][j])
				row.append(parentSeqsGroup2[i][j])
				if parentSeqsGroup2[i][j] > 0:
					row.append(seqsGroup2[i][j] * 100.0 / parentSeqsGroup2[i][j])
				else:
					row.append(0.0)
																
			self.results.data.append(row)
																
		headingsSampleStats = []
		for sampleName in (profile.samplesInGroup1 + profile.samplesInGroup2):
			headingsSampleStats.append(sampleName)
			headingsSampleStats.append(sampleName + ': parent seq. count')
			headingsSampleStats.append(sampleName + ': rel. freq. (%)')
			
		self.results.createTableHeadings(profile.groupName1, profile.groupName2, headingsSampleStats)
		
		# sort results according to p-values
		if len(self.results.data) >= 1:
			self.results.data = TableHelper.SortTable(self.results.data, [self.results.dataHeadings['pValues']])
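Example #8 drives the statistical test through a small object interface: a name attribute, a bSingleFeatureInterface flag, a per-feature run method and an all-features runAll method. A hypothetical skeleton of that interface, inferred purely from the call sites above (the class name and docstring are assumptions):

class TwoGroupStatTest(object):
    """Hypothetical skeleton of the test interface expected by the run() method above."""
    name = 'Example test'
    bSingleFeatureInterface = True  # True: run() is invoked once per feature

    def run(self, seqGroup1, seqGroup2, parentSeqGroup1, parentSeqGroup2,
            confIntervMethod, coverage):
        # Must return: one-sided p-value, two-sided p-value, lower CI, upper CI,
        # effect size, and a free-text note.
        raise NotImplementedError

    def runAll(self, seqsGroup1, seqsGroup2, parentSeqsGroup1, parentSeqsGroup2,
               confIntervMethod, coverage, progress):
        # Must return six parallel lists, one entry per feature.
        raise NotImplementedError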
Example #9
    def run(self, confIntervMethod, coverage, tables, trials, bootstrapRep,
            progress):

        tableData = []
        index = 0
        for row in tables:
            feature = row[0]
            seq1 = row[1]
            seq2 = row[2]
            parentSeq1 = row[3]
            parentSeq2 = row[4]

            lowerCI, upperCI, obsEffectSize = confIntervMethod.run(
                seq1, seq2, parentSeq1, parentSeq2, coverage)

            p1 = float(seq1) / parentSeq1
            p2 = float(seq2) / parentSeq2

            coverageList = []
            coverageListLess5 = []
            coverageListGreater5 = []
            for trial in xrange(0, trials):
                if progress != '':
                    index += 1
                    progress.setValue(index)
                    progress.setLabelText(feature + ' - Trial = ' + str(trial))

                containedRep = 0
                for dummy in xrange(0, bootstrapRep):
                    c1 = binomial(parentSeq1, p1)
                    c2 = binomial(parentSeq2, p2)

                    lowerCI, upperCI, effectSize = confIntervMethod.run(
                        c1, c2, parentSeq1, parentSeq2, coverage)
                    if obsEffectSize >= lowerCI and obsEffectSize <= upperCI:
                        containedRep += 1

                if min([seq1, seq2]) <= 5:
                    coverageListLess5.append(
                        float(containedRep) / bootstrapRep)
                else:
                    coverageListGreater5.append(
                        float(containedRep) / bootstrapRep)

                coverageList.append(float(containedRep) / bootstrapRep)

            row = []
            row.append(feature)
            row.append(seq1)
            row.append(seq2)
            row.append(parentSeq1)
            row.append(parentSeq2)
            row.append(float(seq1) / parentSeq1)
            row.append(float(seq2) / parentSeq2)
            row.append(mean(coverageList))
            row.append(stdDev(coverageList))

            if math.isnan(mean(coverageListLess5)):
                row.append('')
            else:
                row.append(mean(coverageListLess5))

            if math.isnan(stdDev(coverageListLess5)):
                row.append('')
            else:
                row.append(stdDev(coverageListLess5))

            if math.isnan(mean(coverageListGreater5)):
                row.append('')
            else:
                row.append(mean(coverageListGreater5))

            if math.isnan(stdDev(coverageListGreater5)):
                row.append('')
            else:
                row.append(stdDev(coverageListGreater5))

            tableData.append(row)

        return tableData
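Example #9 above assumes a confidence-interval method object exposing a single run method that returns the lower bound, upper bound and effect size for two proportions. A hypothetical skeleton, again inferred only from the call sites (the class name is an assumption):

class ConfIntervMethod(object):
    """Hypothetical skeleton of the CI-method interface used by Example #9."""

    def run(self, seq1, seq2, parentSeq1, parentSeq2, coverage):
        # Must return (lowerCI, upperCI, effectSize) for the proportions
        # seq1/parentSeq1 and seq2/parentSeq2 at the requested coverage.
        raise NotImplementedError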
Example #10
  def run(self, test, signLevel, statsResults, trials, bootstrapRep, progress):
     
    tableData = []
    index = 0
    for row in statsResults:                    
      feature = row[0]
      seq1 = row[1]
      seq2 = row[2]
      parentSeq1 = row[3]
      parentSeq2 = row[4]

      p1 = float(seq1) / parentSeq1
      p2 = float(seq2) / parentSeq2
    
      powerList = []  
      powerListLess5 = []  
      powerListGreater5 = []  
      for trial in xrange(0, trials): 
        if progress != '':
          index += 1
          progress.setValue(index)
          progress.setLabelText(feature + ' - Trial = ' + str(trial))   
          
        power = 0
        processedReplicates = 0
        for dummy in xrange(0, bootstrapRep):
          c1 = 0
          c2 = 0
          for dummy in xrange(0, parentSeq1):
            rnd = random.random()
            if rnd <= p1:
              c1 += 1
              
          for dummy in xrange(0, parentSeq2):
            rnd = random.random()
            if rnd <= p2:
              c2 += 1
      
          if c1 == 0 and c2 == 0:
            # This is a special case that many hypothesis tests will not handle correctly
            # so we just ignore it. This will have little effect on the calculated power
            # of a test.
            continue
          
          processedReplicates += 1
          
          pValueOneSided, pValueTwoSided = test.hypothesisTest(c1, c2, parentSeq1, parentSeq2)
          if pValueTwoSided < signLevel:
            power += 1      
               
        if processedReplicates > 0:
          if min([seq1,seq2]) <= 5:
            powerListLess5.append(float(power) / processedReplicates)
          else:
            powerListGreater5.append(float(power) / processedReplicates)
            
          powerList.append(float(power) / processedReplicates)
  
      row = []
      row.append(feature)
      row.append(seq1)
      row.append(seq2)
      row.append(parentSeq1)
      row.append(parentSeq2)
      row.append(float(seq1) / parentSeq1)
      row.append(float(seq2) / parentSeq2)
      row.append(mean(powerList))
      row.append(stdDev(powerList))
      
      if math.isnan(mean(powerListLess5)):
        row.append('')
      else:
        row.append(mean(powerListLess5))
        
      if math.isnan(stdDev(powerListLess5)):
        row.append('')
      else:
        row.append(stdDev(powerListLess5))
        
      if math.isnan(mean(powerListGreater5)):
        row.append('')
      else:
        row.append(mean(powerListGreater5))
        
      if math.isnan(stdDev(powerListGreater5)):
        row.append('')
      else:
        row.append(stdDev(powerListGreater5))

      tableData.append(row)
      
    return tableData
Example #11
      ciLengthRP.append(upperCI - lowerCI)
            
    coverageListDP.append(float(containedRepDP) / replicates)
    coverageListDPCC.append(float(containedRepDPCC) / replicates)
    coverageListNW.append(float(containedRepNW) / replicates)
    coverageListWoolf.append(float(containedRepWoolf) / replicates)
    coverageListGart.append(float(containedRepGart) / replicates)
    coverageListRP.append(float(containedRepRP) / replicates)
     
results = [coverageListDP, coverageListDPCC, coverageListNW, coverageListWoolf, coverageListGart, coverageListRP]  
lengths = [ciLengthDP,ciLengthDPCC,ciLengthNW,ciLengthWoolf,ciLengthGart,ciLengthRP]   
methodNames = ['DP: Asymptotic', 'DP: Asymptotic-CC', 'Newcombe-Wilson', 'Woolf', 'Gart', 'RP: Asymptotic']

for i in xrange(0, len(results)):
  coverageMeanStr = '%.2f' % mean(results[i])
  coverageSdStr = '%.2f' % stdDev(results[i])
  coverageMinStr = '%.2f' % min(results[i])
  coverageMaxStr = '%.2f' % max(results[i])
  
  lengthMeanStr = '%.2f' % mean(lengths[i])
  lengthSdStr = '%.2f' % stdDev(lengths[i])
     
  fout.write(methodNames[i] + '\n')
  fout.write(coverageMeanStr + '+/-' + coverageSdStr + '[' + coverageMinStr + ';' + coverageMaxStr + ']\n')
  fout.write(lengthMeanStr + '+/-' + lengthSdStr+ '\n')
  fout.write('\n')
  
fout.close()
Example #12
    def run(self,
            statTest,
            testType,
            confIntervMethod,
            coverage,
            profile,
            progress=None):
        self.results.test = statTest.name
        self.results.testType = testType
        self.results.alpha = 1.0 - coverage
        self.results.confIntervMethod = confIntervMethod
        self.results.profile = profile

        if progress == 'Verbose':
            print '  Processing feature:'

        self.results.data = []
        index = 0

        # calculate statistics
        seqsGroup1 = []
        seqsGroup2 = []
        parentSeqsGroup1 = []
        parentSeqsGroup2 = []
        pValues = []
        lowerCIs = []
        upperCIs = []
        effectSizes = []
        notes = []
        if statTest.bSingleFeatureInterface:
            # process features one at a time
            for feature in profile.getFeatures():
                if progress == 'Verbose':
                    print '    ' + feature
                elif progress != None:
                    if progress.wasCanceled():
                        self.results.data = []
                        return

                    index += 1
                    progress.setValue(index)

                # get statistics
                seqGroup1, seqGroup2 = profile.getFeatureCounts(feature)
                parentSeqGroup1, parentSeqGroup2 = profile.getParentFeatureCounts(
                    feature)
                results = statTest.run(seqGroup1, seqGroup2, parentSeqGroup1,
                                       parentSeqGroup2, confIntervMethod,
                                       coverage)
                pValueOneSided, pValueTwoSided, lowerCI, upperCI, effectSize, note = results

                if testType == 'One-sided':
                    pValue = pValueOneSided
                elif testType == 'Two-sided':
                    pValue = pValueTwoSided
                else:
                    print 'Error: Unknown test type.'

                # record results
                seqsGroup1.append(seqGroup1)
                seqsGroup2.append(seqGroup2)
                parentSeqsGroup1.append(parentSeqGroup1)
                parentSeqsGroup2.append(parentSeqGroup2)
                pValues.append(pValue)
                lowerCIs.append(lowerCI)
                upperCIs.append(upperCI)
                effectSizes.append(effectSize)
                notes.append(note)

            if progress != None and progress != 'Verbose':
                index += 1
                progress.setValue(index)
        else:
            # process all features at once
            seqsGroup1, seqsGroup2 = profile.getFeatureCountsAll()
            parentSeqsGroup1, parentSeqsGroup2 = profile.getParentFeatureCountsAll()
            pValuesOneSided, pValuesTwoSided, lowerCIs, upperCIs, effectSizes, notes = statTest.runAll(
                seqsGroup1, seqsGroup2, parentSeqsGroup1, parentSeqsGroup2,
                confIntervMethod, coverage, progress)
            if progress == 'Verbose':
                print '    Processing all features...'
            elif progress != None and progress.wasCanceled():
                self.results.data = []
                return

            if testType == 'One-sided':
                pValues = pValuesOneSided
            elif testType == 'Two-sided':
                pValues = pValuesTwoSided
            else:
                print 'Error: Unknown test type.'

        # record statistics
        features = profile.getFeatures()
        for i in xrange(0, len(features)):
            propGroup1 = []
            for j in xrange(0, len(seqsGroup1[i])):
                sg1 = seqsGroup1[i][j]
                psg1 = parentSeqsGroup1[i][j]

                if psg1 > 0:
                    propGroup1.append(sg1 * 100.0 / psg1)
                else:
                    propGroup1.append(0.0)

            propGroup2 = []
            for j in xrange(0, len(seqsGroup2[i])):
                sg2 = seqsGroup2[i][j]
                psg2 = parentSeqsGroup2[i][j]

                if psg2 > 0:
                    propGroup2.append(sg2 * 100.0 / psg2)
                else:
                    propGroup2.append(0.0)

            meanGroup1 = mean(propGroup1)
            meanGroup2 = mean(propGroup2)
            row = [
                features[i], meanGroup1,
                stdDev(propGroup1, meanGroup1), meanGroup2,
                stdDev(propGroup2, meanGroup2),
                float(pValues[i]),
                float(pValues[i]),
                float(effectSizes[i]),
                float(lowerCIs[i]),
                float(upperCIs[i]), notes[i]
            ]

            for j in xrange(0, len(seqsGroup1[i])):
                row.append(seqsGroup1[i][j])
                row.append(parentSeqsGroup1[i][j])
                if parentSeqsGroup1[i][j] > 0:
                    row.append(seqsGroup1[i][j] * 100.0 /
                               parentSeqsGroup1[i][j])
                else:
                    row.append(0.0)

            for j in xrange(0, len(seqsGroup2[i])):
                row.append(seqsGroup2[i][j])
                row.append(parentSeqsGroup2[i][j])
                if parentSeqsGroup2[i][j] > 0:
                    row.append(seqsGroup2[i][j] * 100.0 /
                               parentSeqsGroup2[i][j])
                else:
                    row.append(0.0)

            self.results.data.append(row)

        headingsSampleStats = []
        for sampleName in (profile.samplesInGroup1 + profile.samplesInGroup2):
            headingsSampleStats.append(sampleName)
            headingsSampleStats.append(sampleName + ': parent seq. count')
            headingsSampleStats.append(sampleName + ': rel. freq. (%)')

        self.results.createTableHeadings(profile.groupName1,
                                         profile.groupName2,
                                         headingsSampleStats)

        # sort results according to p-values
        if len(self.results.data) >= 1:
            self.results.data = TableHelper.SortTable(
                self.results.data, [self.results.dataHeadings['pValues']])
Example #13
  def run(self, confIntervMethod, coverage, tables, trials, bootstrapRep, progress):
  
    tableData = []
    index = 0
    for row in tables:                    
      feature = row[0]
      seq1 = row[1]
      seq2 = row[2]
      parentSeq1 = row[3]
      parentSeq2 = row[4]
    
      lowerCI, upperCI, obsEffectSize = confIntervMethod.run(seq1, seq2, parentSeq1, parentSeq2, coverage) 
    
      p1 = float(seq1) / parentSeq1
      p2 = float(seq2) / parentSeq2
    
      coverageList = []  
      coverageListLess5 = []  
      coverageListGreater5 = []  
      for trial in xrange(0, trials): 
        if progress != '':
          index += 1
          progress.setValue(index)
          progress.setLabelText(feature + ' - Trial = ' + str(trial))  
          
        containedRep = 0
        for dummy in xrange(0, bootstrapRep):
          c1 = binomial(parentSeq1, p1)
          c2 = binomial(parentSeq2, p2)
      
          lowerCI, upperCI, effectSize = confIntervMethod.run(c1, c2, parentSeq1, parentSeq2, coverage)
          if obsEffectSize >= lowerCI and obsEffectSize <= upperCI:
            containedRep += 1        
               
        if min([seq1,seq2]) <= 5:
          coverageListLess5.append(float(containedRep) / bootstrapRep)
        else:
          coverageListGreater5.append(float(containedRep) / bootstrapRep)
          
        coverageList.append(float(containedRep) / bootstrapRep)
  
      row = []
      row.append(feature)
      row.append(seq1)
      row.append(seq2)
      row.append(parentSeq1)
      row.append(parentSeq2)
      row.append(float(seq1) / parentSeq1)
      row.append(float(seq2) / parentSeq2)
      row.append(mean(coverageList))
      row.append(stdDev(coverageList))
      
      if math.isnan(mean(coverageListLess5)):
        row.append('')
      else:
        row.append(mean(coverageListLess5))
        
      if math.isnan(stdDev(coverageListLess5)):
        row.append('')
      else:
        row.append(stdDev(coverageListLess5))
        
      if math.isnan(mean(coverageListGreater5)):
        row.append('')
      else:
        row.append(mean(coverageListGreater5))
        
      if math.isnan(stdDev(coverageListGreater5)):
        row.append('')
      else:
        row.append(stdDev(coverageListGreater5))

      tableData.append(row)
      
    return tableData