def compoChiSquaredTest(self, verbose=1, skipColumnZeros=0, useConstantSites=1, skipTaxNums=None, getRows=0):
    """A chi square composition test for each data partition.

    So you could do, for example::

        read('myData.nex')

        # Calling Data() with no args tells it to make a Data object
        # using all the alignments in var.alignments
        d = Data()

        # Do the test.  By default it is verbose, and prints results.
        # Additionally, a list of lists is returned
        ret = d.compoChiSquaredTest()

        # With verbose on, it might print something like ---
        # Part 0: Chi-square = 145.435278, (dof=170) P = 0.913995

        print(ret)
        # The list of lists that it returns might be something like ---
        # [[145.43527849758556, 170, 0.91399521077908041]]
        # which has the same numbers as above, with one
        # inner list for each data partition.

    If your data has more than one partition::

        read('first.nex')
        read('second.nex')
        d = Data()
        d.compoChiSquaredTest()

        # Output something like ---
        # Part 0: Chi-square = 200.870463, (dof=48) P = 0.000000
        # Part 1: Chi-square = 57.794704, (dof=80) P = 0.971059
        # [[200.87046313430443, 48, 0.0], [57.794704451018163, 80, 0.97105866938683427]]

    where the last line is returned.  With *verbose* turned off,
    the ``Part N`` lines are not printed.

    This method returns a list of lists, one for each data
    partition.  If *getRows* is off, the default, then it is a list
    of 3-item lists, and if *getRows* is turned on then it is a list
    of 4-item lists.  In each inner list, the first is the X-squared
    statistic, the second is the degrees of freedom, and the third
    is the probability from chi-squared.  (The expected comes from
    the data.)  If *getRows* is turned on, the 4th item is a list of
    X-sq contributions from individual rows (ie individual taxa),
    that together sum to the X-sq for the whole partition as found
    in the first item.  This latter way is the way that Tree-Puzzle
    does it.  Skipped taxa get a 0.0 placeholder in that 4th item,
    so it is always nTax long.

    Note that this ostensibly tests whether the data are homogeneous
    in composition, but it does not work on sequences that are
    related.  That is, testing whether the X^2 stat is significant
    using the chi^2 curve has a high probability of type II error
    for phylogenetic sequences.

    However, the X-squared stat can be used in valid ways.  You can
    simulate data under the tree and model, and so generate a valid
    null distribution of X^2 values from the simulations, by which
    to assess the significance of the original X^2.  You can use
    this method to generate X^2 values.

    A problem arises when a composition of a character is zero.  If
    that happens, we can't calculate X-squared because there will be
    a division by zero.  If *skipColumnZeros* is set to 1, then
    those columns are simply skipped.  They are silently skipped
    unless verbose is turned on.

    So let's say that your original data have all characters, but
    one of them has a very low value.  That is reflected in the
    model, and when you do simulations based on the model you
    occasionally get zeros for that character.  Here it is up to
    you: you could say that the data containing the zeros are
    validly part of the possibilities and so should be included, or
    you could say that the data containing the zeros are not valid
    and should be excluded.  You choose between these by setting
    *skipColumnZeros*.  Note that if you do not set
    *skipColumnZeros*, and then you analyse a partition that has
    column zeros, the result is None for that partition.

    Another problem occurs when a partition is completely missing a
    sequence.  Of course that sequence does not contribute to the
    stat.  However, in any simulations that you might do, that
    sequence *will* be there, and *will* contribute to the stat.  So
    you will want to skip that sequence when you do your calcs from
    the simulation.  You can do that with the *skipTaxNums* arg,
    which is a list of lists.  The outer list is nParts long, and
    each inner list is a list of taxNums to exclude.

    If *useConstantSites* is turned off, the test is done on a copy
    of the data from which the constant sites have been removed.

    A P4Error is raised if *skipTaxNums* is malformed, if a sequence
    that is not listed in *skipTaxNums* is blank, or if
    *skipTaxNums* excludes every sequence of a partition.
    """

    if not useConstantSites:
        # Redo the test on a copy of the data that has only the variable sites.
        newData = Data([])
        aligs = []
        for a in self.alignments:
            aligs.append(a.subsetUsingMask(a.constantMask(), theMaskChar="1", inverse=1))
        newData._fill(aligs)
        return newData.compoChiSquaredTest(
            verbose=verbose,
            skipColumnZeros=skipColumnZeros,
            useConstantSites=1,
            skipTaxNums=skipTaxNums,
            getRows=getRows,
        )

    gm = ["Data.compoChiSquaredTest()"]
    nColumnZeros = 0
    results = []

    # check skipTaxNums
    if skipTaxNums is not None:
        if not isinstance(skipTaxNums, list):
            gm.append("skipTaxNums should be a list of lists.")
            raise P4Error(gm)
        if len(skipTaxNums) != self.nParts:
            gm.append("skipTaxNums should be a list of lists, nParts long.")
            raise P4Error(gm)
        for s in skipTaxNums:
            if not isinstance(s, list):
                gm.append("skipTaxNums should be a list of lists.")
                raise P4Error(gm)
            for i in s:
                if not isinstance(i, int):
                    gm.append("skipTaxNums inner list items should be tax numbers.")
                    gm.append("Got %s" % i)
                    raise P4Error(gm)

    # One set of taxNums to skip per partition, so that the many
    # membership tests below do not each scan a list.
    if skipTaxNums:
        skipSets = [set(s) for s in skipTaxNums]
    else:
        skipSets = [set() for dummy in range(self.nParts)]

    # Check for blank sequences.  Its a pain to force the user to do this.
    blankSeqNums = []
    for partNum in range(self.nParts):
        p = self.parts[partNum]
        partBlankSeqNums = []
        for taxNum in range(self.nTax):
            if taxNum in skipSets[partNum]:
                continue
            # no gaps, no missings
            if not pf.partSequenceSitesCount(p.cPart, taxNum):
                partBlankSeqNums.append(taxNum)
        blankSeqNums.append(partBlankSeqNums)
    if any(blankSeqNums):
        gm.append("These sequence numbers were found to be blank. They should be excluded.")
        gm.append("%s" % blankSeqNums)
        gm.append("Set the arg skipTaxNums to this list.")
        raise P4Error(gm)

    for partNum in range(self.nParts):
        gm = ["Data.compoChiSquaredTest() Part %i" % partNum]
        p = self.parts[partNum]
        skipped = skipSets[partNum]

        # Observed counts, one row for each taxon that is not skipped.
        comps = []
        for taxNum in range(self.nTax):
            if taxNum in skipped:
                continue
            oneComp = p.composition([taxNum])
            nSites = pf.partSequenceSitesCount(p.cPart, taxNum)  # no gaps, no missings
            if not nSites:
                # The check above should already have caught this.
                gm.append("(Zero-based) sequence %i is blank, and should be excluded." % taxNum)
                gm.append("You need to add the number %i to the arg skipTaxNums list of lists." % taxNum)
                gm.append("(I could do that automatically, but it is best if *you* do it, explicitly.)")
                gm.append("You can use the Alignment method checkForBlankSequences(listSeqNumsOfBlanks=True)")
                gm.append("to help you get those inner lists.")
                raise P4Error(gm)
            # composition() gives frequencies; turn them into counts, in place.
            for k in range(len(oneComp)):
                oneComp[k] = oneComp[k] * nSites
            comps.append(oneComp)

        if not comps:
            # Without this, comps[0] below would die with a bare IndexError.
            gm.append("There are no sequences left to test; skipTaxNums excludes them all.")
            raise P4Error(gm)

        # Here we calculate the X^2 stat.  But we want to check
        # for columns summing to zero.  So we can't use
        # func.xSquared()
        nCols = len(comps[0])
        theSumOfRows = func._sumOfRows(comps)  # I could have just kept nSites, above
        theSumOfCols = func._sumOfColumns(comps)

        for rowSum in theSumOfRows:
            if rowSum == 0.0:
                gm.append("Zero in a row sum. Programming error.")
                raise P4Error(gm)

        isOk = True
        columnZeros = []
        for j in range(len(theSumOfCols)):
            if theSumOfCols[j] != 0.0:
                continue
            nColumnZeros += 1
            if skipColumnZeros:
                columnZeros.append(j)
            else:
                if verbose:
                    print(gm[0])
                    print(" Zero in a column sum.")
                    print(" And skipColumnZeros is not set, so I am refusing to do it at all.")
                isOk = False

        if not isOk:
            # ie there is a zero in a column sum
            # Maybe a bad idea.  Maybe it should just die, above.
            results.append(None)
            continue

        theExpected = func._expected(theSumOfRows, theSumOfCols)

        if columnZeros and verbose:
            print(gm[0])
            print(" Skipping (zero-based) column number(s) %s, which sum to zero." % columnZeros)
        keptCols = [j for j in range(nCols) if j not in columnZeros]

        xSq = 0.0
        xSq_rows = []
        k = 0  # k is the counter for comps and theExpected, taxNum without the skips
        for taxNum in range(self.nTax):
            if taxNum in skipped:
                # this taxon is not in comps.  Add a placeholder
                xSq_rows.append(0.0)
                continue
            xSq_row = 0.0
            for j in keptCols:
                theDiff = comps[k][j] - theExpected[k][j]
                xSq_row += (theDiff * theDiff) / theExpected[k][j]
            xSq += xSq_row
            xSq_rows.append(xSq_row)
            k += 1

        dof = (p.dim - len(columnZeros) - 1) * (len(comps) - 1)
        prob = pf.chiSquaredProb(xSq, dof)
        if verbose:
            print("Part %i: Chi-square = %f, (dof=%i) P = %f" % (partNum, xSq, dof, prob))
            if getRows:
                print("%20s %7s %s" % ("taxName", "xSq_row", "P (like puzzle)"))
                for tNum in range(self.nTax):
                    if tNum not in skipped:
                        thisProb = pf.chiSquaredProb(xSq_rows[tNum], p.dim - 1)
                        print("%20s %7.5f %7.5f" % (self.taxNames[tNum], xSq_rows[tNum], thisProb))
                    else:
                        print("%20s --- ---" % self.taxNames[tNum])
        if getRows:
            results.append([xSq, dof, prob, xSq_rows])
        else:
            results.append([xSq, dof, prob])

    if nColumnZeros and verbose:
        print("There were %i column zeros." % nColumnZeros)
    return results
def modelFitTests(self, fName='model_fit_tests_out', writeRawStats=0):
    """Do model fit tests on the data.

    The two tests are the Goldman-Cox test, and the tree- and model-
    based composition fit test.  Both require simulations with
    optimizations in order to get a null distribution, and those
    simulations need to be done before this method.  The simulations
    should be done with the simsForModelFitTests() method.

    Self should have a data and a model attached, and be optimized.

    The Goldman-Cox test (Goldman 1993.  Statistical tests of models
    of DNA substitution.  J Mol Evol 36: 182-198.) is a test for
    overall fit of the model to the data.  It does not work if the
    data have gaps or ambiguities, and in that case it is skipped.

    The tree- and model-based composition test asks the question:
    'Does the composition implied by the model fit the data?'  If
    the model is homogeneous and empirical comp is used, then this
    is the same as the chi-square test except that the null
    distribution comes from simulations, not from the chi-square
    distribution.  In that case only the question is, additionally,
    'Are the data homogeneous in composition?', ie the same question
    asked by the chi-square test.  However, the data might be
    heterogeneous, and the model might be heterogeneous over the
    tree; the tree- and model-based composition fit test can ask
    whether the heterogeneous model fits the heterogeneous data.
    The composition is tested in each data partition, separately.
    The test is done both overall, ie for all the sequences
    together, and for individual sequences.

    If you just want a compo homogeneity test with empirical
    homogeneous comp, try the compoTestUsingSimulations() method--
    it's way faster, because there are no optimizations in the sims
    part.

    Output is verbose, to a file.  The report goes to the file
    *fName*.  If *writeRawStats* is set, the null distributions from
    the simulations are also written to the file ``<fName>_raw.py``.
    The simulation results are read from the files matching
    ``sims_GoldmanStats_*`` and ``sims_CompStats_*`` in the current
    directory.

    The return value is the tail-area probability of the overall
    tree- and model-based composition fit test for the last data
    partition, or None if there are no data partitions.

    A Glitch is raised if a simulation stats file is empty or
    malformed.
    """

    import glob

    gm = ['Tree.modelFitTests()']
    self.calcLogLike(verbose=0)

    # Usually True.  Set to False for debugging, experimentation,
    # getting individual stats, etc
    doOut = True

    data = self.data
    nParts = data.nParts
    nTax = data.nTax

    def _bail(*messages):
        # Add the messages to gm and die.
        gm.extend(messages)
        raise Glitch(gm)

    def _writeVerdict(theProb):
        # Write the 'Fits.' or "Doesn't fit." verdict, and the P value, to flob.
        if theProb <= 0.05:
            flob.write('%13s' % "Doesn't fit.")
        else:
            flob.write('%13s' % 'Fits.')
        flob.write(' P = %5.3f\n' % theProb)

    # We can't do the Goldman-Cox test if there are any gaps or
    # ambiguities.
    doGoldmanCox = True
    for a in data.alignments:
        if a.hasGapsOrAmbiguities():
            doGoldmanCox = False
            break

    rawFName = '%s_raw.py' % fName
    flob = None
    fRaw = None
    theRet = None

    # The try/finally makes sure that the output files get closed,
    # even if we die with a Glitch part way through.
    try:
        if doOut:
            flob = open(fName, 'w')
        if writeRawStats:
            fRaw = open(rawFName, 'w')

        #######################
        # Goldman-Cox stats
        #######################

        # For a two-part data analysis, the first few lines of the
        # sims_GoldmanStats_* file will be like the following.  Its in
        # groups of 3-- the first one for all parts together (part number
        # -1), and the next lines for separate parts.

        ## # part unconstr L    log like      Goldman-Cox stat
        ## -1    -921.888705   -1085.696919   163.808215
        ## 0     -357.089057   -430.941958    73.852901
        ## 1     -564.799648   -654.754962    89.955314
        ## -1    -952.063037   -1130.195799   178.132761
        ## 0     -362.164119   -439.709824    77.545705
        ## ... and so on.

        # For a one-part analysis, it will be the same except that one sim
        # gets only one line, starting with zero.

        if doGoldmanCox:
            goldmanOverallSimStats = []
            if nParts > 1:
                goldmanIndividualSimStats = []
                for partNum in range(nParts):
                    goldmanIndividualSimStats.append([])

            for fName1 in glob.glob('sims_GoldmanStats_*'):
                with open(fName1) as f2:
                    aLine = f2.readline()
                    if not aLine:
                        _bail("Empty file %s" % fName1)
                    if aLine[0] != '#':
                        _bail("Expecting a '#' as the first character in file %s" % fName1)
                    aLine = f2.readline()
                    while aLine:
                        if nParts > 1:
                            # The first line of each group is for all parts together.
                            splitLine = aLine.split()
                            if len(splitLine) != 4:
                                _bail("Bad line in Goldman stats file %s" % fName1,
                                      "'%s'" % aLine)
                            if int(splitLine[0]) != -1:
                                _bail("Bad line in Goldman stats file %s" % fName1,
                                      "First item should be -1",
                                      "'%s'" % aLine)
                            goldmanOverallSimStats.append(float(splitLine[-1]))
                            aLine = f2.readline()
                            if not aLine:
                                _bail("Premature end to file %s" % fName1)
                        for partNum in range(nParts):
                            splitLine = aLine.split()
                            if len(splitLine) != 4:
                                _bail("Bad line in Goldman stats file %s" % fName1,
                                      "'%s'" % aLine)
                            try:
                                splitLine[0] = int(splitLine[0])
                            except ValueError:
                                _bail("Bad line in Goldman stats file %s" % fName1,
                                      "First item should be the partNum %i" % partNum,
                                      "'%s'" % aLine)
                            if splitLine[0] != partNum:
                                _bail("Bad line in Goldman stats file %s" % fName1,
                                      "First item should be the partNum %i" % partNum,
                                      "'%s'" % aLine)
                            if nParts == 1:
                                goldmanOverallSimStats.append(float(splitLine[-1]))
                            else:
                                goldmanIndividualSimStats[partNum].append(float(splitLine[-1]))
                            aLine = f2.readline()

        if doOut:
            flob.write('Model fit tests\n===============\n\n')
            flob.write('The data that we are testing have %i taxa,\n' % nTax)
            if len(data.alignments) == 1:
                flob.write('1 alignment, ')
            else:
                flob.write('%i alignments, ' % len(data.alignments))
            if nParts == 1:
                flob.write('and 1 data partition.\n')
            else:
                flob.write('and %i data partitions.\n' % nParts)
            flob.write('The lengths of those partitions are as follows:\n')
            flob.write(' partNum nChar \n')
            for i in range(nParts):
                flob.write(' %3i %5i\n' % (i, data.parts[i].nChar))

        # NOTE(review): the code as received had lost its indentation, and
        # showed no guard here.  The null distributions used below exist
        # only if doGoldmanCox is set, so the whole section is guarded.
        if doGoldmanCox:
            data.calcUnconstrainedLogLikelihood2()
            if doOut:
                flob.write("\nThe unconstrained likelihood is %f\n" % data.unconstrainedLogLikelihood)
                flob.write('(This is the partition-by-partition unconstrained log likelihood, \n')
                flob.write('ie the sum of the unconstrained log likes from each partition separately, \n')
                flob.write('and so will not be the same as that given by PAUP, if the data are partitioned.)\n')
                flob.write('\n\nGoldman-Cox test for overall model fit\n')
                flob.write('======================================\n')
                flob.write('The log likelihood for these data for this tree is %f\n' % self.logLike)
                flob.write('The unconstrained log likelihood for these data is %f\n' % data.unconstrainedLogLikelihood)
            originalGoldmanCoxStat = data.unconstrainedLogLikelihood - self.logLike
            if doOut:
                flob.write('The Goldman-Cox statistic for the original data is the difference, %f\n' % originalGoldmanCoxStat)
                if nParts > 1:
                    flob.write('(The unconstrained log likelihood for these data is calculated partition by partition.)\n')
                flob.write('\n')

            if nParts > 1:
                originalGoldmanCoxStatsByPart = []
                if doOut:
                    flob.write('Stats by partition.\n')
                    flob.write('part\t unconstrLogL\t log like \tGoldman-Cox stat\n')
                    flob.write('----\t ----------\t -------- \t----------------\n')
                for partNum in range(nParts):
                    unc = pf.getUnconstrainedLogLike(data.parts[partNum].cPart)
                    like = pf.p4_partLogLike(self.cTree, data.parts[partNum].cPart, partNum, 0)
                    diff = unc - like
                    if doOut:
                        flob.write(' %i\t%f\t%f\t %f\n' % (partNum, unc, like, diff))
                    originalGoldmanCoxStatsByPart.append(diff)

            # Do the overall stat
            nSims = len(goldmanOverallSimStats)
            if doOut:
                flob.write('\nThere were %i simulations.\n\n' % nSims)
            if writeRawStats:
                fRaw.write('# Goldman-Cox null distributions.\n')
                if nParts > 1:
                    fRaw.write('# Simulation stats for overall data, ie for all data partitions combined.\n')
                else:
                    fRaw.write('# Simulation stats.\n')
                fRaw.write('goldman_cox_overall = %s\n' % goldmanOverallSimStats)
                if nParts > 1:
                    for partNum in range(nParts):
                        fRaw.write('# Simulation stats for data partition %i\n' % partNum)
                        fRaw.write('goldman_cox_part%i = %s\n' % (partNum, goldmanIndividualSimStats[partNum]))

            prob = func.tailAreaProbability(originalGoldmanCoxStat, goldmanOverallSimStats, verbose=0)
            if doOut:
                flob.write('\n Overall Goldman-Cox test: ')
                _writeVerdict(prob)

            if nParts > 1:
                if doOut:
                    flob.write(' Tests for individual data partitions:\n')
                for partNum in range(nParts):
                    prob = func.tailAreaProbability(originalGoldmanCoxStatsByPart[partNum],
                                                    goldmanIndividualSimStats[partNum], verbose=0)
                    if doOut:
                        flob.write(' Part %-2i: ' % partNum)
                        _writeVerdict(prob)

        #########################
        # COMPOSITION
        #########################

        statsHashList = []
        for pNum in range(nParts):
            part = data.parts[pNum]
            h = {}
            statsHashList.append(h)
            h['individualNSites'] = []
            h['observedIndividualCounts'] = []
            for j in range(nTax):
                h['individualNSites'].append(pf.partSequenceSitesCount(part.cPart, j))  # no gaps or qmarks
                # The line below is temporarily composition, not counts
                h['observedIndividualCounts'].append(part.composition([j]))
            # pf.expectedCompositionCounts returns a tuple of tuples
            # representing the counts of the nodes in proper alignment order.
            h['expectedIndividualCounts'] = list(pf.p4_expectedCompositionCounts(self.cTree, pNum))  # alignment order

            # At the moment, h['observedIndividualCounts'] has composition,
            # not counts.  So multiply by h['individualNSites']
            for i in range(nTax):
                for j in range(part.dim):
                    h['observedIndividualCounts'][i][j] *= h['individualNSites'][i]

        # We will want to skip any sequences composed of all gaps
        skipTaxNums = []
        for pNum in range(nParts):
            nSitesList = statsHashList[pNum]['individualNSites']
            skipTaxNums.append([tNum for tNum in range(nTax) if not nSitesList[tNum]])

        # Do the boring old compo chi square test.
        if doOut:
            flob.write(longMessage1)  # explanation ...
        for pNum in range(nParts):
            observed = statsHashList[pNum]['observedIndividualCounts']
            # Can't use func.xSquared(), because there might be column
            # zeros.
            nRows = len(observed)
            nCols = len(observed[0])
            theSumOfRows = func._sumOfRows(observed)  # I could have just used nSites, above
            theSumOfCols = func._sumOfColumns(observed)
            columnZeros = [j for j in range(len(theSumOfCols)) if theSumOfCols[j] <= 0.0]
            theExpected = func._expected(theSumOfRows, theSumOfCols)
            xSq = 0.0
            for rowNum in range(nRows):
                if rowNum in skipTaxNums[pNum]:
                    continue
                xSq_row = 0.0
                for colNum in range(nCols):
                    if colNum in columnZeros:
                        continue
                    theDiff = observed[rowNum][colNum] - theExpected[rowNum][colNum]
                    xSq_row += (theDiff * theDiff) / theExpected[rowNum][colNum]
                xSq += xSq_row
            dof = (nCols - len(columnZeros) - 1) * (nRows - len(skipTaxNums[pNum]) - 1)
            prob = func.chiSquaredProb(xSq, dof)
            if doOut:
                flob.write(' Part %i: Chi-square = %f, (dof=%i) P = %f\n' % (pNum, xSq, dof, prob))

        # The tree- and model-based stats for the original data.
        for pNum in range(nParts):
            h = statsHashList[pNum]
            h['overallStat'] = 0.0
            h['individualStats'] = [0.0] * nTax
            for i in range(nTax):
                if i in skipTaxNums[pNum]:
                    continue  # h['individualStats'] stays at zeros
                for j in range(data.parts[pNum].dim):
                    # Avoid dividing by Zero.
                    if h['expectedIndividualCounts'][i][j]:
                        dif = h['observedIndividualCounts'][i][j] - h['expectedIndividualCounts'][i][j]
                        h['individualStats'][i] += ((dif * dif) / h['expectedIndividualCounts'][i][j])
                h['overallStat'] += h['individualStats'][i]
            h['overallSimStats'] = []
            h['individualSimStats'] = []
            for i in range(nTax):
                h['individualSimStats'].append([])

        # Read in the null distributions; one line for each partition of each sim.
        for fName1 in glob.glob('sims_CompStats_*'):
            with open(fName1) as f2:
                aLine = f2.readline()
                if not aLine:
                    _bail("Empty file %s" % fName1)
                while aLine:
                    for partNum in range(nParts):
                        h = statsHashList[partNum]
                        splitLine = aLine.split()
                        if len(splitLine) != (nTax + 2):
                            _bail("Bad line in composition stats file %s" % fName1,
                                  "'%s'" % aLine)
                        if int(splitLine[0]) != partNum:
                            _bail("Bad line in composition stats file %s" % fName1,
                                  "First item should be the partNum %i" % partNum,
                                  "'%s'" % aLine)
                        h['overallSimStats'].append(float(splitLine[-1]))
                        for i in range(nTax):
                            h['individualSimStats'][i].append(float(splitLine[i + 1]))
                        aLine = f2.readline()
                        if not aLine:
                            break

        nSims = len(statsHashList[0]['overallSimStats'])
        if doOut:
            flob.write(longMessage2)  # Explain tree- and model-based compo fit stat, X^2_m
            flob.write(' %i simulation reps were used.\n\n' % nSims)

        spacer1 = ' ' * 10
        for partNum in range(nParts):
            h = statsHashList[partNum]
            if doOut:
                flob.write('Part %-2i:\n-------\n\n' % partNum)
                flob.write('Statistics from the original data\n')
                flob.write('%s%30s: %f\n' % (spacer1, 'Overall observed stat', h['overallStat']))
                flob.write('%s%30s:\n' % (spacer1, 'Stats for individual taxa'))
                for taxNum in range(nTax):
                    if taxNum not in skipTaxNums[partNum]:
                        flob.write('%s%30s: %f\n' % (spacer1, data.taxNames[taxNum], h['individualStats'][taxNum]))
                    else:
                        flob.write('%s%30s: skipped\n' % (spacer1, data.taxNames[taxNum]))

                flob.write('\nAssessment of fit from null distribution from %i simulations\n' % nSims)
                flob.write('%s%30s: ' % (spacer1, 'Overall'))
            prob = func.tailAreaProbability(h['overallStat'], h['overallSimStats'], verbose=0)
            if doOut:
                _writeVerdict(prob)

            # The overall prob is what gets returned.  It is overwritten
            # by each part in turn, so the last part wins.
            theRet = prob

            for taxNum in range(nTax):
                if doOut:
                    flob.write('%s%30s: ' % (spacer1, data.taxNames[taxNum]))
                if taxNum in skipTaxNums[partNum]:
                    if doOut:
                        flob.write('%13s\n' % 'skipped.')
                else:
                    prob = func.tailAreaProbability(h['individualStats'][taxNum],
                                                    h['individualSimStats'][taxNum], verbose=0)
                    if doOut:
                        _writeVerdict(prob)

            if writeRawStats:
                fRaw.write('#\n# Tree and model based composition fit test\n')
                fRaw.write('# =========================================\n')
                fRaw.write('# Simulation statistics, ie the null distributions\n\n')
                fRaw.write('# Part %i:\n' % partNum)
                fRaw.write('part%i_overall_compo_null = %s\n' % (partNum, h['overallSimStats']))
                for taxNum in range(nTax):
                    fRaw.write('part%i_%s_compo_null = %s\n' % (
                        partNum, _fixFileName(data.taxNames[taxNum]), h['individualSimStats'][taxNum]))
    finally:
        if flob and flob != sys.stdout:  # Yes, it is possible to close sys.stdout
            flob.close()
        if fRaw and fRaw != sys.stdout:
            fRaw.close()
    return theRet
def compoChiSquaredTest(self, verbose=1, skipColumnZeros=0, useConstantSites=1, skipTaxNums=None, getRows=0):
    """A chi square composition test for each data partition.

    It returns a list of lists, one for each data partition.  If
    getRows is off, the default, then it is a list of 3-item lists,
    and if 'getRows' is turned on then it is a list of 4-item lists.
    In each inner list, the first is the X-squared statistic, the
    second is the degrees of freedom, and the third is the
    probability from chi-squared.  (The expected comes from the
    data.)  If 'getRows' is turned on, the 4th item is a list of
    X-sq contributions from individual rows (ie individual taxa),
    that together sum to the X-sq for the whole partition as found
    in the first item.  Skipped taxa get a 0.0 placeholder in that
    4th item, so it is always nTax long.

    Note that this ostensibly tests whether the data are homogeneous
    in composition, but it does not work on sequences that are
    related.  That is, testing whether the X^2 stat is significant
    using the chi^2 curve has a high probability of type II error
    for phylogenetic sequences.

    However, the X-squared stat can be used in valid ways.  You can
    simulate data under the tree and model, and so generate a valid
    null distribution of X^2 values from the simulations, by which
    to assess the significance of the original X^2.  You can use
    this method to generate X^2 values.

    A problem arises when a composition of a character is zero.  If
    that happens, we can't calculate X-squared because there will be
    a division by zero.  If skipColumnZeros is set to 1, then those
    columns are simply skipped.  They are silently skipped unless
    verbose is turned on.

    So let's say that your original data have all characters, but
    one of them has a very low value.  That is reflected in the
    model, and when you do simulations based on the model you
    occasionally get zeros for that character.  Here it is up to
    you: you could say that the data containing the zeros are
    validly part of the possibilities and so should be included, or
    you could say that the data containing the zeros are not valid
    and should be excluded.  You choose between these by setting
    skipColumnZeros.  Note that if you do not set skipColumnZeros,
    and then you analyse a partition that has column zeros, the
    result is None for that partition.

    Another problem occurs when a partition is completely missing a
    sequence.  Of course that sequence does not contribute to the
    stat.  However, in any simulations that you might do, that
    sequence *will* be there, and *will* contribute to the stat.  So
    you will want to skip that sequence when you do your calcs from
    the simulation.  You can do that with the 'skipTaxNums' arg,
    which is a list of lists.  The outer list is nParts long, and
    each inner list is a list of taxNums to exclude.

    If useConstantSites is turned off, the test is done on a copy of
    the data from which the constant sites have been removed.

    A Glitch is raised if 'skipTaxNums' is malformed, if a sequence
    that is not listed in 'skipTaxNums' is blank, or if
    'skipTaxNums' excludes every sequence of a partition.
    """

    if not useConstantSites:
        # Do the test again, on a Data object made from only the
        # non-constant sites of each alignment.
        variableAligs = [
            a.subsetUsingMask(a.constantMask(), theMaskChar='1', inverse=1)
            for a in self.alignments
        ]
        newData = Data([])
        newData._fill(variableAligs)
        return newData.compoChiSquaredTest(
            verbose=verbose,
            skipColumnZeros=skipColumnZeros,
            useConstantSites=1,
            skipTaxNums=skipTaxNums,
            getRows=getRows)

    gm = ['Data.compoChiSquaredTest()']
    nColumnZeros = 0
    results = []

    # check skipTaxNums
    if skipTaxNums is not None:
        if not isinstance(skipTaxNums, list):
            gm.append("skipTaxNums should be a list of lists.")
            raise Glitch(gm)
        if len(skipTaxNums) != self.nParts:
            gm.append("skipTaxNums should be a list of lists, nParts long.")
            raise Glitch(gm)
        for inner in skipTaxNums:
            if not isinstance(inner, list):
                gm.append("skipTaxNums should be a list of lists.")
                raise Glitch(gm)
            for item in inner:
                if not isinstance(item, int):
                    gm.append("skipTaxNums inner list items should be tax numbers.")
                    gm.append("Got %s" % item)
                    raise Glitch(gm)

    # The taxNums to skip, as one set per partition, for quick lookups.
    if skipTaxNums:
        skipSets = [set(inner) for inner in skipTaxNums]
    else:
        skipSets = [set() for dummy in range(self.nParts)]

    # Check for blank sequences.  Its a pain to force the user to do this.
    blankSeqNums = []
    for partNum in range(self.nParts):
        p = self.parts[partNum]
        partBlankSeqNums = []
        for taxNum in range(self.nTax):
            if taxNum not in skipSets[partNum]:
                nSites = pf.partSequenceSitesCount(p.cPart, taxNum)  # no gaps, no missings
                if not nSites:
                    partBlankSeqNums.append(taxNum)
        blankSeqNums.append(partBlankSeqNums)
    if any(blankSeqNums):
        gm.append("These sequence numbers were found to be blank. They should be excluded.")
        gm.append("%s" % blankSeqNums)
        gm.append("Set the arg skipTaxNums to this list.")
        raise Glitch(gm)

    for partNum in range(self.nParts):
        gm = ['Data.compoChiSquaredTest() Part %i' % partNum]
        p = self.parts[partNum]
        skipped = skipSets[partNum]

        # Get the observed counts, a row for each taxon that is not skipped.
        comps = []
        for taxNum in range(self.nTax):
            if taxNum in skipped:
                continue
            oneComp = p.composition([taxNum])
            nSites = pf.partSequenceSitesCount(p.cPart, taxNum)  # no gaps, no missings
            if not nSites:
                # Should not happen, given the blank check above.
                gm.append("(Zero-based) sequence %i is blank, and should be excluded." % taxNum)
                gm.append("You need to add the number %i to the arg skipTaxNums list of lists." % taxNum)
                gm.append("(I could do that automatically, but it is best if *you* do it, explicitly.)")
                gm.append("You can use the Alignment method checkForBlankSequences(listSeqNumsOfBlanks=True)")
                gm.append("to help you get those inner lists.")
                raise Glitch(gm)
            # Convert the composition to counts, in place.
            for k in range(len(oneComp)):
                oneComp[k] = oneComp[k] * nSites
            comps.append(oneComp)

        if not comps:
            # Otherwise comps[0], below, would give an unhelpful IndexError.
            gm.append("There are no sequences left to test; skipTaxNums excludes them all.")
            raise Glitch(gm)

        # Here we calculate the X^2 stat.  But we want to check
        # for columns summing to zero.  So we can't use
        # func.xSquared()
        nCols = len(comps[0])
        theSumOfRows = func._sumOfRows(comps)  # I could have just kept nSites, above
        theSumOfCols = func._sumOfColumns(comps)

        for rowSum in theSumOfRows:
            if rowSum == 0.0:
                gm.append("Zero in a row sum. Programming error.")
                raise Glitch(gm)

        isOk = True
        columnZeros = []
        for j in range(len(theSumOfCols)):
            if theSumOfCols[j] == 0.0:
                nColumnZeros += 1
                if skipColumnZeros:
                    columnZeros.append(j)
                else:
                    if verbose:
                        print(gm[0])
                        print(" Zero in a column sum.")
                        print(" And skipColumnZeros is not set, so I am refusing to do it at all.")
                    isOk = False

        if not isOk:
            # ie there is a zero in a column sum
            # Maybe a bad idea.  Maybe it should just die, above.
            results.append(None)
            continue

        theExpected = func._expected(theSumOfRows, theSumOfCols)

        if verbose and columnZeros:
            print(gm[0])
            print(" Skipping (zero-based) column number(s) %s, which sum to zero." % columnZeros)
        usableCols = [j for j in range(nCols) if j not in columnZeros]

        xSq = 0.0
        xSq_rows = []
        k = 0  # k is the counter for comps and theExpected, taxNum without the skips
        for taxNum in range(self.nTax):
            if taxNum in skipped:
                xSq_rows.append(0.0)  # this taxon is not in comps.  Add a placeholder
            else:
                xSq_row = 0.0
                for j in usableCols:
                    theDiff = comps[k][j] - theExpected[k][j]
                    xSq_row += (theDiff * theDiff) / theExpected[k][j]
                xSq += xSq_row
                xSq_rows.append(xSq_row)
                k += 1

        dof = (p.dim - len(columnZeros) - 1) * (len(comps) - 1)
        prob = pf.chiSquaredProb(xSq, dof)
        if verbose:
            print("Part %i: Chi-square = %f, (dof=%i) P = %f" % (partNum, xSq, dof, prob))
            if getRows:
                print("%20s %7s %s" % ('taxName', 'xSq_row', 'P (like puzzle)'))
                for tNum in range(self.nTax):
                    if tNum in skipped:
                        print("%20s --- ---" % self.taxNames[tNum])
                    else:
                        thisProb = pf.chiSquaredProb(xSq_rows[tNum], p.dim - 1)
                        print("%20s %7.5f %7.5f" % (self.taxNames[tNum], xSq_rows[tNum], thisProb))
        if getRows:
            results.append([xSq, dof, prob, xSq_rows])
        else:
            results.append([xSq, dof, prob])

    if nColumnZeros and verbose:
        print("There were %i column zeros." % nColumnZeros)
    return results