Code Example #1
File: linear_regression.py  Project: mpasalic/verifyy
    def regress(self, x, y):
        self.points = len(x)
        if len(x) < 5:
            # Too few points to fit a meaningful regression
            return
        
        self.xmin = min(x)
        self.xmax = max(x)
        
        # Shift the x values so the regression starts at 0
        for i in range(0, self.points):
            x[i] = x[i] - self.xmin
        
        #1. Compute the regression coefficients
        n = self.points
        x_mean = 0.0
        y_mean = 0.0
        for i in range(0, self.points):
            y_mean += y[i]
            x_mean += x[i]
        y_mean = y_mean / float(len(y))
        x_mean = x_mean / float(len(x))

        sum_xy_err = 0.0
        sum_x_2_err = 0.0
        
        for i in range(0, self.points):
            sum_xy_err += (x[i] - x_mean) * (y[i] - y_mean)
            sum_x_2_err += (x[i] - x_mean) * (x[i] - x_mean)
        
        if sum_x_2_err == 0:
            # All the points are located at the same x value, so the slope is undefined
            return
        
        self.b_1 = sum_xy_err / sum_x_2_err
        self.b_0 = y_mean - self.b_1 * x_mean
        self.setRegressionFormula()
        
        # Now, compute SSR and SSE for the F-test (ANOVA)
        SSE = 0.0
        SSR = 0.0
        n   = len(x) # number of observations
        p   = 1      # number of predictor variables
        
        for i in range(0, len(x)):
            y_predict = self.b_1 * x[i] + self.b_0
            SSE += (y_predict - y[i])*(y_predict - y[i])
            SSR += (y_predict - y_mean)*(y_predict - y_mean)
        
        # Now, r squared = SSR / (SSR + SSE)
        if SSE < POSITIVIE_ZERO:
            # Perfect fit: r_2 is off the charts
            self.r_2 = 1.0
            self.f_value = 0.1 + f_table_value(1, n - 2) * 10
        else:
            self.r_2 = SSR / (SSR + SSE)
            MSM = SSR / p        # mean square for the model (p = 1 predictor)
            MSE = SSE / (n - 2)  # mean square error
            self.f_value = MSM / MSE
            
        #Perform an f-test
        self.f_goal  = f_table_value(1, n - 2)
        
        if self.f_value > 10 * self.f_goal:
            self.veryStrongSignificance()
        elif self.f_value > self.f_goal:
            self.strongSignificance()
        else:
            self.weakSignificance()
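
For reference, here is a minimal standalone sketch (plain Python, toy data invented for illustration, not part of the verifyy project) of the same closed-form least-squares fit and F-statistic that regress computes, without the class plumbing or the f_table_value lookup:

# Toy data; in the project these would be the x/y observation lists passed to regress()
x = [0.0, 1.0, 2.0, 3.0, 4.0]
y = [1.1, 2.9, 5.2, 6.8, 9.1]

n = len(x)
x_mean = sum(x) / n
y_mean = sum(y) / n

# Slope b_1 and intercept b_0 from the accumulated deviation sums
sum_xy_err = sum((xi - x_mean) * (yi - y_mean) for xi, yi in zip(x, y))
sum_x_2_err = sum((xi - x_mean) ** 2 for xi in x)
b_1 = sum_xy_err / sum_x_2_err
b_0 = y_mean - b_1 * x_mean

# SSR/SSE decomposition, r squared, and the F-statistic with (1, n - 2) degrees of freedom
SSE = sum((b_1 * xi + b_0 - yi) ** 2 for xi, yi in zip(x, y))
SSR = sum((b_1 * xi + b_0 - y_mean) ** 2 for xi in x)
r_2 = SSR / (SSR + SSE)
f_value = (SSR / 1) / (SSE / (n - 2))
print(b_1, b_0, r_2, f_value)
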
Code Example #2
File: one_factor.py  Project: mpasalic/verifyy
 def analyse(self, data):
     #TODO: complain if data set is empty!
     if len(data) < 5:
         return
     
     # add the real value to the correct bin
     for datum in data:
         self.levels[datum.x].append(datum.y)
         self.ymax = max(self.ymax, datum.y)
         self.ymin = min(self.ymin, datum.y)
         
     # Prune away empty levels (materialize the filter first so we can
     # safely delete from self.levels while iterating in Python 3)
     levelsToPrune = [lvl for lvl in self.levels if len(self.levels[lvl]) == 0]
     for lvlDel in levelsToPrune:
         del self.levels[lvlDel]
     
     # Find the grand mean. Since the bins are probably unbalanced
     # (we're crowdsourcing, after all), weight each bin by its size.
     # While doing that, find the column (per-level) means as well.
     means = {}
     weights = {}
     mu = 0
     total = 0
     for level in self.levels:
         means[level] = sum(self.levels[level]) / float(len(self.levels[level]))
         weights[level] = len(self.levels[level])
         total += weights[level]
     for level in self.levels:
         weights[level] = float(weights[level]) / total
         mu += weights[level] * means[level]
     
     self.means = means
     self.find1StdDevIntervals()
     
     # We need more than one level (class) in the data
     # before the F-test below makes sense
     if len(self.levels) > 1:
         #Ok, we have column and grand means.
         # find bin alphas
         self.means = means
         alphas = {}
         for level in self.levels:
             alphas[level] = means[level] - mu
         
         # Next, find SSA, SSE
         SSA = 0
         SSE = 0
         
         for j in self.levels:
             r_j = len(self.levels[j])
             SSA += r_j*alphas[j]*alphas[j]
             for y_ij in self.levels[j]:
                 e_ij = y_ij - mu - alphas[j]
                 SSE += e_ij * e_ij
         # Now that we have SSA and SSE, compute MSA and MSE.
         # Note that the r computation is a bit sketchy: since we
         #   don't have the same number of observations for each
         #   label, we compute it as a weighted average over all
         #   classes, then round it to an integer so it can be used
         #   for the f-table lookup
         
         a = len(alphas)
         r = sum( map(lambda i: weights[i]*len(self.levels[i]), self.levels) )
         r = int(round(r))
         MSA = SSA / (a - 1)
         MSE = SSE / (a * (r - 1))
         self.f_value = MSA / MSE
         self.f_goal = f_table_value(a-1, a*(r-1))
         self.alphas = alphas
         
         self.x_mean_effect = [[li, self.means[li], self.alphas[li]] for li in self.levels]
         
         if self.f_value > 10 * self.f_goal:
             self.veryStrongSignificance()
         elif self.f_value > self.f_goal:
             self.strongSignificance()
         else:
             self.weakSignificance()
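
For reference, a minimal standalone sketch (plain Python, toy data invented for illustration, not part of the verifyy project) of the same one-factor decomposition that analyse performs, including the weighted-average replication count r used above in place of a balanced design; the f_table_value lookup is omitted:

# Toy levels; in the project these are the bins accumulated in self.levels
levels = {
    "A": [1.0, 1.2, 0.9],
    "B": [2.1, 2.3, 1.9, 2.0],
    "C": [3.0, 2.8, 3.2],
}

total = sum(len(ys) for ys in levels.values())
means = {lvl: sum(ys) / len(ys) for lvl, ys in levels.items()}
mu = sum(len(ys) * means[lvl] for lvl, ys in levels.items()) / total  # size-weighted grand mean
alphas = {lvl: means[lvl] - mu for lvl in levels}                     # per-level effects

SSA = sum(len(levels[lvl]) * alphas[lvl] ** 2 for lvl in levels)
SSE = sum((y - mu - alphas[lvl]) ** 2 for lvl in levels for y in levels[lvl])

a = len(levels)
# Weighted-average level size, rounded, mirroring the approximation used in analyse
r = int(round(sum(len(ys) ** 2 for ys in levels.values()) / total))
MSA = SSA / (a - 1)
MSE = SSE / (a * (r - 1))
f_value = MSA / MSE
print(mu, alphas, f_value)
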