def linear_regression(data, ind, dep):
    """Fit a two-variable linear regression of ``dep`` on the ``ind`` columns.

    The model is ``y = A b`` where ``A`` holds the independent columns
    followed by a column of ones (the intercept term); ``b`` is the
    least-squares solution.

    Args:
        data: object exposing ``getNumCol(headers)``. It is assumed to return
            a 2-D numeric matrix with one column per requested header (the
            indexing below requires 2-D results) -- TODO confirm.
        ind: list of independent-variable headers. The return value unpacks
            ``b[0]``, ``b[1]`` and ``b[2]``, so exactly two headers are
            expected.
        dep: header of the dependent variable.

    Returns:
        Tuple ``(m0, m1, b, sse, r2, t, p)`` where ``m0`` and ``m1`` are the
        first two fitted coefficients, ``b`` is the third (the coefficient of
        the ones column), ``sse`` is the 1x1 value ``error.T * error / df_e``,
        ``r2`` is ``1 - var(error) / var(y)``, ``t`` holds the t-statistic of
        each coefficient and ``p`` the matching two-sided p-values.
    """
    y = data.getNumCol([dep])
    A = data.getNumCol(ind)
    # Append a column of ones so the last coefficient acts as the intercept.
    A = np.hstack((A, np.ones((A.shape[0], 1))))

    # inv(A.T A), scaled by the residual variance below, gives the covariance
    # of the fitted coefficients.
    AAinv = np.linalg.inv(np.dot(A.T, A))

    # rcond=None selects the machine-precision based cutoff explicitly and
    # avoids the FutureWarning older NumPy emits when rcond is omitted.
    b = np.linalg.lstsq(A, y, rcond=None)[0]

    N = y.shape[0]   # number of data points
    C = b.shape[0]   # number of coefficients
    df_e = N - C     # degrees of freedom of the error

    # Residuals of the model prediction.
    error = y - np.dot(A, b)

    # Squared error normalised by the error degrees of freedom (1x1).
    sse = np.dot(error.T, error) / df_e

    # Standard error of each coefficient (length C).
    stderr = np.sqrt(np.diagonal(sse[0, 0] * AAinv))

    # t-statistic of each coefficient.
    t = b.T / stderr

    # Two-sided probability that a coefficient reflects a random relationship
    # (slope = 0). The survival function is used instead of ``1 - cdf`` so
    # very small p-values are not rounded to zero.
    p = 2 * stats.t.sf(abs(t), df_e)

    # r^2 coefficient indicating the quality of the fit.
    r2 = 1 - error.var() / y.var()

    return (b[0, 0], b[1, 0], b[2, 0], sse, r2, t, p)
def linear_regression_extension(data, ind, dep):
    """Fit a two-variable linear regression and report the data extents.

    Args:
        data: object exposing ``getNumCol(headers)``. It is assumed to return
            a 2-D numeric matrix with one column per requested header --
            TODO confirm.
        ind: list of the two independent-variable headers (columns 0 and 1
            are read explicitly).
        dep: header of the dependent variable.

    Returns:
        Tuple ``(m0, m1, b, minind1, minind2, mindep, maxind1, maxind2,
        maxdep, r2)``: the two slopes, the intercept, the minimum of each
        independent column and of the dependent column, the corresponding
        maxima, and ``r2 = 1 - var(error) / var(y)``.
    """
    A = data.getNumCol(ind)
    y = data.getNumCol([dep])

    # Extents of the raw data, taken before the ones column is appended.
    minind1, maxind1 = A[:, 0].min(), A[:, 0].max()
    minind2, maxind2 = A[:, 1].min(), A[:, 1].max()
    mindep, maxdep = np.min(y), np.max(y)

    # Append a column of ones so the last coefficient acts as the intercept.
    A = np.hstack((A, np.ones((A.shape[0], 1))))

    # Only the coefficients and r^2 are returned, so the standard-error,
    # t-statistic and p-value computations (and the matrix inverse they
    # needed, which fails on a singular design) are not performed here.
    b = np.linalg.lstsq(A, y, rcond=None)[0]

    error = y - np.dot(A, b)
    r2 = 1 - error.var() / y.var()

    # maxind1 is reported in the seventh slot; previously maxind2 was
    # returned twice and maxind1 was never used.
    return (b[0, 0], b[1, 0], b[2, 0],
            minind1, minind2, mindep,
            maxind1, maxind2, maxdep,
            r2)
def normalize_columns_together(headers, data):
    """Min-max normalize the selected columns using one shared range.

    Every value is mapped with ``(value - global_min) / (global_max -
    global_min)``, where the minimum and maximum are taken over all selected
    columns together.

    Args:
        headers: list of column headers to select.
        data: object exposing ``getNumCol(headers)`` and
            ``get_num_headers()``.

    Returns:
        The normalized values, same shape as ``data.getNumCol(headers)``.
        If every selected value is identical the result is all zeros instead
        of NaN.
    """
    # Return value is unused; the call is kept in case it has side effects
    # on ``data`` -- TODO confirm and drop if it is a pure getter.
    data.get_num_headers()

    selected = data.getNumCol(headers)
    lowest = selected.min()
    extent = selected.max() - lowest
    if extent == 0:
        # Degenerate range: avoid 0/0 so constant data normalizes to 0.
        extent = 1
    return (selected - lowest) / extent
def stdev(headers, data):
    """Return a list with the standard deviation of each selected column.

    Uses ``np.std`` with its default settings on each column of
    ``data.getNumCol(headers)``.
    """
    columns = data.getNumCol(headers)
    return [np.std(columns[:, idx]) for idx in range(columns.shape[1])]
def normalize_columns_separately(headers, data):
    """Min-max normalize each selected column independently.

    Each column is mapped with ``(value - col_min) / (col_max - col_min)``.

    Args:
        headers: list of column headers to select.
        data: object exposing ``getNumCol(headers)``.

    Returns:
        The normalized values, same shape as ``data.getNumCol(headers)``.
        A column whose values are all equal normalizes to zeros instead of
        NaN.
    """
    selected = data.getNumCol(headers)
    minval = np.min(selected, axis=0)
    extent = np.max(selected, axis=0) - minval
    # A constant column has zero extent; divide by 1 there to avoid 0/0.
    extent = np.where(extent == 0, 1, extent)
    return (selected - minval) / extent
def data_range(headers, data):
    """Return the ``[min, max]`` pair of every selected column.

    Args:
        headers: list of column headers to select (any number of columns).
        data: object exposing ``getNumCol(headers)``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(headers), 2)``; row ``i`` is
        ``[min, max]`` of the ``i``-th selected column. For two headers this
        is the same 2x2 layout as before.
    """
    selected = data.getNumCol(headers)
    # asarray + ravel flattens the column-wise reductions to 1-D whether
    # getNumCol returned an ndarray or an np.matrix.
    mins = np.asarray(selected.min(0)).ravel()
    maxs = np.asarray(selected.max(0)).ravel()
    return np.column_stack((mins, maxs))
def single_linear_regression(data, ind_var, dep_var):
    """Run a simple linear regression of ``dep_var`` on ``ind_var``.

    Args:
        data: object exposing ``getNumCol(headers)``.
        ind_var: header of the independent variable.
        dep_var: header of the dependent variable.

    Returns:
        Tuple ``(slope, intercept, r_value, p_value, std_err, minind, mindep,
        maxind, maxdep)``: the five values produced by
        ``scipy.stats.linregress`` followed by the minimum and maximum of the
        independent and dependent columns.
    """
    selected = np.asarray(data.getNumCol([ind_var, dep_var]))
    x = selected[:, 0]
    y = selected[:, 1]

    # Pass x and y separately: handing linregress one stacked 2xN argument is
    # deprecated in recent SciPy releases and rejected by the newest ones.
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

    return (slope, intercept, r_value, p_value, std_err,
            x.min(), y.min(), x.max(), y.max())
def pca_svd(data, headers, normalize=True):
    """Run a singular value decomposition on the mean-centered columns.

    Args:
        data: object exposing ``getNumCol(headers)``.
        headers: list of column headers to analyse.
        normalize: when true, the columns are first passed through
            ``normalize_columns_separately``; otherwise the raw values from
            ``data.getNumCol(headers)`` are used.

    Returns:
        Tuple ``(U, S, V)`` as produced by
        ``np.linalg.svd(D, full_matrices=False)``, where ``D`` is the selected
        data minus its column means. Note that NumPy returns the right
        singular vectors as the rows of ``V``.

        NOTE(review): the previous version had no return statement (and could
        not run because of undefined names); returning the factors is an
        assumption -- confirm the intended result with the callers.
    """
    # Assign to A the desired data, normalized per column when requested.
    if normalize:
        A = normalize_columns_separately(headers, data)
    else:
        A = data.getNumCol(headers)

    # Mean value of each column of A.
    m = A.mean(axis=0)

    # Difference matrix A - m; broadcasting subtracts the column means from
    # every row and yields a float result even for integer input.
    D = A - m

    U, S, V = np.linalg.svd(D, full_matrices=False)
    return U, S, V
def perc25(headers, data):
    """Return the 25th percentile of each selected column.

    Computed with ``np.percentile`` along axis 0 of
    ``data.getNumCol(headers)``.
    """
    columns = data.getNumCol(headers)
    return np.percentile(columns, 25, axis=0)
def sumTotal(headers, data):
    """Return the sum of every value in the selected columns."""
    return np.sum(data.getNumCol(headers))
def sumCol(headers, data):
    """Return the per-column sums (axis 0) of the selected columns."""
    return np.sum(data.getNumCol(headers), axis=0)
def mean(headers, data):
    """Return the per-column mean (axis 0) of the selected columns."""
    selected = data.getNumCol(headers)
    return selected.mean(axis=0)