Exemple #1
0
def findRange(fld):
    """
    Return the spread of the values of fld over the samples.

    Every value of fld is collected through samples.getAllFlds; the result is the largest
    value minus the smallest value.
    """
    allValues = samples.getAllFlds(fld)
    highest = max(allValues)
    lowest = min(allValues)
    return highest - lowest
Exemple #2
0
def isLinearGrowth(fld, minRange):
    """
    Rate how linearly the values of fld grow across the samples.

    The sample list is sorted by fld (in place, on samples.sampleList) and the values are
    regressed against their position 0, 1, 2, ... (i.e. assuming even growth along the other
    axis). The closer a straight line comes to going through all the values of fld, the higher
    the confidence.

    fld      -- name of the sample field to examine
    minRange -- minimum difference between the last and first value (after sorting) for the
                check to be meaningful

    Returns a SimResult holding the confidence, the claim being tested, a description of the
    evidence and the plot ('' is passed as the plot when there are fewer than 2 samples).
    """
    claim = "even distribution of sample property '" + fld + "'"

    if not samples.sampleList:
        # With no samples the range check below would raise IndexError on sampleList[-1]
        # before the 'fewer than 2 samples' branch could ever be reached.
        return SimResult(confidence.Confidence(confidence.Applic.df, confidence.Validity.prob),
                         claim, 'fewer than 2 samples', '')

    samples.sampleList.sort(key=lambda x: samples.extractField(x, fld))
    plot = __getPlot('id', fld)

    if (samples.sampleList[-1][fld] - samples.sampleList[0][fld]) < minRange:
        return SimResult(confidence.Confidence(confidence.Applic.cf, confidence.Validity.sound),
                         claim, 'insufficient distribution of samples', plot)

    fldList = samples.getAllFlds(fld)
    if len(fldList) < 3:
        # Can't check for *even* distribution with so few values, but having passed the range
        # check they are presumably not right next to each other.
        if len(fldList) == 2:
            app = -observations.neareq(fldList[0], fldList[1])
            # The previous `app.isTrue() and '' or 'not '` always produced 'not ' because ''
            # is falsy, so the message never varied.
            # NOTE(review): assumes the unary minus negates the near-equality observation, so
            # a true app means the two values are NOT about equal -- confirm against
            # observations.neareq.
            negation = 'not ' if app.isTrue() else ''
            return SimResult(confidence.Confidence(app, confidence.Validity.plaus),
                             claim, '2 samples ' + negation + 'about equal', plot)
        return SimResult(confidence.Confidence(confidence.Applic.df, confidence.Validity.prob),
                         claim, 'fewer than 2 samples', '')

    # Regress the values against their rank in the sorted list.
    line = stats.linregress(range(len(fldList)), fldList)
    slope = line[0]
    intercept = line[1]
    fit = line[2]           # how well the line fits the values
    significance = line[3]  # statistical significance of the fit

    qual = __getQuality(significance)
    if len(fldList) < 5:
        # Few data points: lower the quality by one step.
        qual -= 1

    conf = __getConfidence((.8, .85, .9, .95, .99), fit, qual)

    plot.plotLine(slope, intercept)

    return SimResult(conf, claim,
                     "'" + fld + "' is " + ('not ' if fit < .9 else '') +
                     "evenly distributed among all samples", plot)
Exemple #3
0
def correlated(fldA, fldB, dir):
    """
    Test whether fldA and fldB trend together in the requested direction.

    fldA, fldB -- names of the two sample fields to compare
    dir        -- positive to look for a direct correlation, negative for an inverse one

    Returns a SimResult holding the confidence, the claim being tested, a description of the
    correlation that was actually found and a plot of fldA against fldB.
    """
    valuesA = samples.getAllFlds(fldA)
    valuesB = samples.getAllFlds(fldB)
    result = stats.pearsonr(valuesA, valuesB)
    coefficient = result[0]
    significance = result[1]

    # Multiplying by dir flips the coefficient so that a match with the requested direction
    # always counts as positive; the quality is judged on half the reported significance.
    quality = __getQuality(significance / 2)
    conf = __getConfidence((-.1, .2, .5, .7, .85), coefficient * dir, quality)

    plot = __getPlot(fldA, fldB)

    if dir > 0:
        wanted = "positive"
    else:
        wanted = "negative"
    claim = wanted + " correlation between " + fldA + " and " + fldB

    if abs(coefficient) < .5:
        observed = 'minimal'
    elif coefficient > 0:
        observed = "positive"
    else:
        observed = "negative"
    evidence = (observed + " correlation between " + fldA + " and " + fldB +
                '; significance: ' + str(significance))

    return SimResult(conf, claim, evidence, plot)
Exemple #4
0
def skewsField(sample, field):
    """
    Checks whether the value of field in the passed in sample is significantly different from
    the value of field for the rest of the samples under consideration.

    sample -- the sample to test; it must be an element of samples.sampleList, otherwise
              list.remove raises ValueError
    field  -- name of the field to compare

    The mean and standard deviation of field are computed over all samples except `sample`,
    and the confidence grows with the number of standard deviations separating the sample's
    value from that mean. As a side effect samples.sampleList is sorted by field.

    Returns a SimResult holding the confidence, the claim being tested, a description of the
    deviation and a plot with lines at the mean, one standard deviation either side of it and
    the sample's own value.
    """
    allSamples = samples.sampleList

    # Drop the sample from a *copy*. The previous version removed it from the live list and
    # then rebound samples.sampleList to a saved copy, so any other holder of the original
    # list object was left permanently without `sample`.
    others = allSamples[:]
    others.remove(sample)

    # samples.getAllFlds presumably reads samples.sampleList, so swap the reduced list in
    # while the statistics are computed.
    samples.sampleList = others
    try:
        flds = samples.getAllFlds(field)

        mean = stats.mean(flds)
        stddev = stats.std(flds)
        val = sample[field]

        if stddev == 0:
            devs = 0
        else:
            devs = abs(val - mean) / stddev

    finally:
        # Put the full sample list back even if the statistics blow up.
        samples.sampleList = allSamples

    # The fewer samples there are, the less the result can be trusted.
    sampleCount = len(samples.sampleList)
    if sampleCount < 3:
        qual = confidence.Validity.plaus
    elif sampleCount < 6:
        qual = confidence.Validity.prob
    else:
        qual = confidence.Validity.sound

    conf = __getConfidence((.5, 1, 2, 3, 5), devs, qual)

    samples.sampleList.sort(key=lambda x: samples.extractField(x, field))

    plot = __getPlot('id', field)
    plot.plotLine(0, mean)
    plot.plotLine(0, mean - stddev)
    plot.plotLine(0, mean + stddev)
    plot.plotLine(0, val)

    return SimResult(conf, str(sample) + " has a different " + field + " from other samples",
                     str(sample) + "'s value for " + field + ' is ' + str(devs) +
                     ' standard deviations from the mean', plot)