def isLinearGrowth(fld, minRange):
    """
    This function looks at how linearly fld grows.

    The samples are sorted by fld and a regression line is fitted through the
    sorted values, using each value's position in that ordering (0, 1, 2, ...)
    as the other axis. The closer the values come to that straight line, the
    higher the confidence.

    fld      -- name of the sample field to examine
    minRange -- smallest difference between the largest and the smallest value
                of fld for which the check is considered meaningful

    Returns a SimResult. As a side effect samples.sampleList is left sorted
    by fld.
    """
    claim = "even distribution of sample property '" + fld + "'"

    samples.sampleList.sort(key=lambda x: samples.extractField(x, fld))
    if not samples.sampleList:
        # The range check below indexes the first and last sample, which
        # would raise IndexError on an empty set. Report it the same way as
        # the "fewer than 2 samples" case further down instead.
        return SimResult(confidence.Confidence(confidence.Applic.df,
                                               confidence.Validity.prob),
                         claim, 'fewer than 2 samples', '')

    plot = __getPlot('id', fld)

    # The list is sorted, so last minus first is the full range of fld.
    if (samples.sampleList[-1][fld] - samples.sampleList[0][fld]) < minRange:
        return SimResult(confidence.Confidence(confidence.Applic.cf,
                                               confidence.Validity.sound),
                         claim, 'insufficient distribution of samples', plot)

    fldList = samples.getAllFlds(fld)
    if len(fldList) < 3:
        # Can't check for *even* distribution with so few values, but having
        # passed the range check they are not right next to each other.
        if len(fldList) == 2:
            # app is the negation of "the two values are nearly equal".
            app = -observations.neareq(fldList[0], fldList[1])
            # The previous "app.isTrue() and '' or 'not '" always produced
            # 'not ' because '' is falsy. app being true means the values
            # are NOT nearly equal, so that is when 'not ' belongs here.
            notText = 'not ' if app.isTrue() else ''
            return SimResult(confidence.Confidence(app,
                                                   confidence.Validity.plaus),
                             claim, '2 samples ' + notText + 'about equal',
                             plot)
        return SimResult(confidence.Confidence(confidence.Applic.df,
                                               confidence.Validity.prob),
                         claim, 'fewer than 2 samples', '')

    # line[0] is slope, line[1] is intercept, line[2] measures how well the
    # line fits and line[3] is the statistical significance of the fit.
    line = stats.linregress(range(len(fldList)), fldList)
    slope = line[0]
    intercept = line[1]
    fit = line[2]
    significance = line[3]

    qual = __getQuality(significance)
    if len(fldList) < 5:
        # Fewer than five points: lower the quality by one step.
        qual -= 1
    conf = __getConfidence((.8, .85, .9, .95, .99), fit, qual)

    plot.plotLine(slope, intercept)

    verdict = ("'" + fld + "' is " + ('not ' if fit < .9 else '') +
               "evenly distributed among all samples")
    return SimResult(conf, claim, verdict, plot)
def distantFromOthers(sample, field, spread):
    """
    Discover how different (based on spread) this sample is from the sample
    nearest to it.

    sample -- the sample being examined
    field  -- name of the field whose value is compared
    spread -- name of the field holding the spread used to scale distances

    If other samples lie both above and below this sample's value, the result
    states that the sample sits between the others. Otherwise the smallest
    distance to any other sample, divided by the two samples' summed spreads
    and then doubled, is turned into a confidence.

    Returns a SimResult. As a side effect samples.sampleList is left sorted
    by field.
    """
    ownValue = sample[field]
    ownSpread = sample[spread]
    # Starting value; only ever lowered by the loop below.
    nearest = 50

    samples.sampleList.sort(key=lambda x: samples.extractField(x, field))
    plot = __getPlot('id', field)

    claim = str(sample) + " has a different " + field + " from any other sample"

    hasHigher = any(other[field] > ownValue for other in samples.sampleList)
    if hasHigher and any(other[field] < ownValue
                         for other in samples.sampleList):
        surrounded = confidence.Confidence(confidence.Applic.cf,
                                           confidence.Validity.sound)
        return SimResult(surrounded, claim,
                         str(sample) + "'s value for " + field +
                         ' is between that of other samples',
                         plot)

    for other in samples.sampleList:
        if other == sample:
            continue
        gap = abs(ownValue - other[field])
        combinedSpread = ownSpread + other[spread]
        if combinedSpread == 0:
            # No spread to scale by; this pair is left out.
            continue
        nearest = min(nearest, gap / float(combinedSpread))

    # More samples under consideration give a stronger validity.
    sampleCount = len(samples.sampleList)
    if sampleCount >= 6:
        qual = confidence.Validity.sound
    elif sampleCount >= 3:
        qual = confidence.Validity.prob
    else:
        qual = confidence.Validity.plaus

    nearest *= 2
    conf = __getConfidence((1, 2, 3, 4, 6), nearest, qual)

    # Mark the sample's value and its value minus/plus its spread.
    for level in (ownValue, ownValue - ownSpread, ownValue + ownSpread):
        plot.plotLine(0, level)

    return SimResult(conf, claim,
                     str(sample) + "'s value for " + field + ' is ' +
                     str(nearest) + ' times ' + spread +
                     ' from any other sample',
                     plot)
def skewsField(sample, field):
    """
    Checks whether the value of field in the passed in sample is
    significantly different from the value of field for the rest of the
    samples under consideration.

    sample -- the sample being examined; it must be in samples.sampleList
              (list.remove raises ValueError otherwise)
    field  -- name of the field to compare

    The sample is taken out of samples.sampleList while the mean and standard
    deviation of the remaining samples are computed, then the list is put
    back. The number of standard deviations between the sample's value and
    that mean is turned into a confidence.

    Returns a SimResult. As a side effect samples.sampleList is left sorted
    by field.
    """
    savedSamples = samples.sampleList[:]
    samples.sampleList.remove(sample)
    try:
        flds = samples.getAllFlds(field)
        mean = stats.mean(flds)
        stddev = stats.std(flds)
        val = sample[field]
        # With no variation among the others, report zero deviations rather
        # than dividing by zero.
        devs = 0 if stddev == 0 else abs(val - mean) / stddev
    finally:
        # Put the sample list back even if something above raised. Restore
        # in place: rebinding samples.sampleList to the copy would leave any
        # other reference to the original list without this sample.
        samples.sampleList[:] = savedSamples

    # More samples under consideration give a stronger validity.
    sampleCount = len(samples.sampleList)
    if sampleCount < 3:
        qual = confidence.Validity.plaus
    elif sampleCount < 6:
        qual = confidence.Validity.prob
    else:
        qual = confidence.Validity.sound

    conf = __getConfidence((.5, 1, 2, 3, 5), devs, qual)

    samples.sampleList.sort(key=lambda x: samples.extractField(x, field))
    plot = __getPlot('id', field)
    # Mark the mean, one standard deviation either side of it, and the
    # sample's own value.
    for level in (mean, mean - stddev, mean + stddev, val):
        plot.plotLine(0, level)

    return SimResult(conf,
                     str(sample) + " has a different " + field +
                     " from other samples",
                     str(sample) + "'s value for " + field + ' is ' +
                     str(devs) + ' standard deviations from the mean',
                     plot)
def checkOverlap(anchor, spread):
    """
    Checks that every sample overlaps with every other sample at at least
    one point in anchor +/- spread (or anchor +/- 2 * spread).

    anchor -- name of the field holding each sample's central value
    spread -- name of the field holding each sample's spread

    The intervals of all samples are intersected, once using one spread and
    once using two. The width of the intersection (or of the gap, when there
    is none) relative to the sum of its two bounds selects the validity.

    Returns a SimResult whose confidence is negated when the samples do not
    overlap even within two spreads. As a side effect samples.sampleList is
    left sorted by anchor.

    NOTE(review): the lower bounds start at 0, so the intersection never
    extends below zero -- presumably the values are non-negative; confirm.
    NOTE(review): ZeroDivisionError is raised if the two bounds of the
    interval being measured sum to zero.
    """
    claim = ('sample ' + anchor + ' plus or minus ' + spread +
             ' overlaps for all samples')

    if not samples.sampleList:
        return SimResult(confidence.Confidence(confidence.Applic.cf,
                                               confidence.Validity.sound),
                         claim, 'No samples in set', "")

    samples.sampleList.sort(key=lambda x: samples.extractField(x, anchor))
    plot = __getPlot('id', anchor)

    def relativeWidth(bounds):
        # Distance between the two bounds as a fraction of their sum.
        return abs(bounds[1] - bounds[0]) / float(bounds[0] + bounds[1])

    # [lower, upper] of the running intersection, at one and at two spreads.
    # (Named so as not to shadow the builtin range.)
    first = samples.sampleList[0]
    oneSigma = [0, first[anchor] + first[spread]]
    twoSigma = [0, first[anchor] + 2 * first[spread]]

    for sample in samples.sampleList:
        sAnch = sample[anchor]
        sSpre = sample[spread]
        oneSigma[0] = max(oneSigma[0], sAnch - sSpre)
        oneSigma[1] = min(oneSigma[1], sAnch + sSpre)
        twoSigma[0] = max(twoSigma[0], sAnch - 2 * sSpre)
        twoSigma[1] = min(twoSigma[1], sAnch + 2 * sSpre)

    if oneSigma[1] > oneSigma[0]:
        dif = relativeWidth(oneSigma)
        # Conditional expressions replace the "cond and a or b" idiom, which
        # silently picks b whenever a happens to be falsy.
        qual = (confidence.Validity.accept if dif >= .05
                else confidence.Validity.sound)
        desc = 'Samples overlap within 1 sigma'
        plot.plotLine(0, oneSigma[0])
        plot.plotLine(0, oneSigma[1])
        overlaps = True
    elif twoSigma[1] > twoSigma[0]:
        dif = relativeWidth(twoSigma)
        qual = (confidence.Validity.sound if dif >= .5
                else confidence.Validity.prob)
        desc = 'Samples overlap within 2 sigma'
        plot.plotLine(0, twoSigma[0])
        plot.plotLine(0, twoSigma[1])
        overlaps = True
    else:
        # No overlap at all: dif now measures the gap, and a larger gap
        # gives a stronger validity for the (negated) result.
        dif = relativeWidth(twoSigma)
        desc = 'Samples do not overlap within 2 sigma'
        if dif > .2:
            qual = confidence.Validity.accept
        elif dif > .1:
            qual = confidence.Validity.sound
        elif dif > .02:
            qual = confidence.Validity.prob
        else:
            qual = confidence.Validity.plaus
        overlaps = False

    confid = confidence.Confidence(confidence.Applic.ft, qual)
    if not overlaps:
        confid = -confid
    return SimResult(confid, claim, desc, plot)