def main():
    """Exercise the Data accessors and the analysis helpers on 'cars.csv'.

    Every section prints a label, then the value under test, then a spacer
    string.  All output goes through single-argument ``print(...)`` calls,
    which behave identically as a Python 2 print statement and as the
    Python 3 print function.
    """
    d = Data('cars.csv')

    # --- raw (unconverted) accessors -------------------------------------
    print("Raw Headers")
    print(d.get_raw_headers())
    print("\n\n")

    print("Raw number of columns")
    print(d.get_raw_num_columns())
    print("\n\n")

    print("Raw number of rows")
    print(d.get_raw_num_rows())
    print("\n\n")

    print("13th row")
    print(d.get_raw_row(13))
    print("\n\n")

    print("Value at row 6, header 'Car'")
    print(d.get_raw_value(6, 'Car'))
    print("\n\n")

    # --- matrix accessors -------------------------------------------------
    print("Matrix data")
    print(d.matrix_data)
    print("\n\n")

    print("Headers")
    print(d.get_headers())
    print("\n\n")

    print("Number of cols")
    print(d.get_num_columns())
    print("\n\n")

    print("5th row")
    print(d.get_row(5))
    print("\n\n")

    print("Get value")
    print(d.get_value(5, 'Horsepower'))
    print("\n\n")

    print("get_data function")
    print(d.get_data(['Origin', 'Horsepower']))
    print("\n\n")

    # --- analysis helpers (data object first, header list second) --------
    print("data range")
    print(analysis.data_range(d, ['Origin', 'Horsepower']))
    print("\n\n")

    print("mean of horsepower and origin")
    print(analysis.mean(d, ['Horsepower', 'Origin']))
    print("\n\n")

    print("standard deviation for horsepower and origin")
    print(analysis.stdev(d, ['Horsepower', 'Origin']))
    # This section has always used a shorter spacer than the others.
    print("\n")

    print("normalized columns origin and horsepower")
    print(analysis.normalize_columns_separately(d, ['Origin', 'Horsepower']))
    print("\n\n")

    print("normalized together origin and horsepower")
    print(analysis.normalize_columns_together(d, ['Origin', 'Horsepower']))
    print("\n\n")

    print("median of columns origin, horspower and weight")
    print(analysis.median(d, ['Origin', 'Horsepower', 'Weight']))

    print(d.get_data(['Origin', 'Horsepower']).shape)
def main(argv):
    """Run every analysis function on two columns of the CSV in argv[1].

    argv: command-line style list; argv[0] is the program name and argv[1]
        the CSV filename.  With fewer than two entries a usage line is
        printed and the interpreter exits with status 0.
    """
    if len(argv) < 2:
        print('Usage: python %s <csv filename>' % (argv[0]))
        exit(0)

    # The Data constructor is what reads the file.
    dobj = Data(argv[1])
    headers = dobj.get_headers()

    def chosen():
        # A new list per call, matching the inline literals this replaces.
        return [headers[0], headers[2]]

    # The five core analysis functions.
    print(chosen())
    print("Data range by column:", analysis.data_range(chosen(), dobj))
    print("Mean:", analysis.mean(chosen(), dobj))
    print("Standard deviation:", analysis.stdev(chosen(), dobj))
    print("Normalize columns separately:",
          analysis.normalize_columns_separately(chosen(), dobj))
    print("Normalize columns together:",
          analysis.normalize_columns_together(chosen(), dobj))

    # Extension 1
    print("Median:", analysis.median(chosen(), dobj))

    # Extension 2
    print("Median Separately:", analysis.median_separately(chosen(), dobj))

    # Extension 3
    print("just few rows:", dobj.limit_rows())

    # Extension 4
    print("just a few columns. I changed the limit to 2 for demonstration purposes:",
          dobj.limit_columns())

    # Extension 5
    print("Data range overall:", analysis.data_range(chosen(), dobj, True))

    # Extension 6: show the last row before and after appending a point.
    print("The next two print statements get the last row of data. "
          "I add a row of data in between,so they are different.")
    print(dobj.get_row(-1))
    dobj.add_point([1, 2, 3])
    print(dobj.get_row(-1))
def main(argv):
    """Dump a CSV's metadata and contents, then run the analysis helpers.

    argv: command-line style list; argv[1] is the CSV filename.  With fewer
        than two entries a usage line is printed and the interpreter exits
        with status 0.
    """
    if len(argv) < 2:
        print('Usage: python %s <csv filename>' % (argv[0]))
        exit(0)

    # Constructing the Data object reads the file.
    dobj = data.Data(argv[1])

    # Basic dimensions.
    print('Number of rows: ', dobj.get_num_points())
    print('Number of columns: ', dobj.get_num_dimensions())

    # Comma-separated header line.
    print("\nHeaders:")
    headers = dobj.get_headers()
    print(headers[0] + "".join(", " + name for name in headers[1:]))

    # Comma-separated type line.
    print("\nTypes")
    types = dobj.get_types()
    print(types[0] + "".join(", " + kind for kind in types[1:]))

    # One row on its own.
    print("\nPrinting row index 2")
    print(dobj.get_row(2))

    # Every row: first column as-is, the rest right-aligned in 10 columns
    # and truncated to 3 characters.
    print("\nData")
    headers = dobj.get_headers()
    print("headers:", headers)
    for row_index in range(dobj.get_num_points()):
        line = str(dobj.get_value(headers[0], row_index))
        line += "".join(
            "%10.3s" % (dobj.get_value(name, row_index))
            for name in headers[1:]
        )
        print(line)

    print("\n\n\n\nselect_columns")
    # The full matrix is fetched but not printed.
    d = dobj.get_data()
    wanted = ['thing1', 'thing3']
    subset = dobj.select_columns(list(wanted))
    print("Selected columns:", subset)

    print("Data range:", analysis.data_range(list(wanted), dobj))
    print("Mean:", analysis.mean(list(wanted), dobj))
    print("Standard deviation:", analysis.stdev(list(wanted), dobj))
    print("Normalize columns separately:",
          analysis.normalize_columns_separately(list(wanted), dobj))
    print("Normalize columns together:",
          analysis.normalize_columns_together(list(wanted), dobj))
def buildLinearRegression(self,headers):
    """Plot two columns as points and overlay their least-squares line.

    headers: column names passed to the analysis helpers; the first column
        is used as x and the second as y.

    Side effects: stores the padded point matrix in self.points, draws one
    canvas item per data row (recorded in self.dataObjects and
    self.objects), stores the line endpoints in self.regressionMatrix,
    appends the line to self.regressionLines and rewrites self.label's text.
    """
    normalized = analysis.normalize_columns_separately( headers, self.data )

    # Pad every normalized row with z = 0 and a homogeneous 1 so the rows
    # can be multiplied by the view transform below.
    self.points = np.matrix([row + [0, 1] for row in normalized.tolist()])

    vtm = self.view.build()
    pts = (vtm * self.points.T).T

    # The option widgets do not change while we draw, so read them once.
    shape = self.shapeOption.get()
    color = self.colorOption.get()
    if shape == "Dot":
        draw = self.canvas.create_oval
    elif shape == "Square":
        draw = self.canvas.create_rectangle
    else:
        # Any other shape setting draws no points, as before.
        draw = None

    if draw is not None:
        radius = 3
        # Convert once; the previous code rebuilt the whole list per point.
        for row in pts.tolist():
            pt = draw( row[0]-radius, row[1]-radius,
                       row[0]+radius, row[1]+radius,
                       fill=color, outline='', tags="data" )
            self.dataObjects.append(pt)
            self.objects.append(pt)

    # Fit on the raw (un-normalized) data so slope/intercept are in data units.
    unnormalized = self.data.get_data(headers).T.tolist()
    regress_output = scipy.stats.linregress(unnormalized[0],unnormalized[1])
    slope = regress_output[0]
    intercept = regress_output[1]

    # Rounded copies are for display only.
    m = round(slope, 3)
    b = round(intercept, 3)
    r_squared = round(regress_output[2]*regress_output[2], 3)

    ranges = analysis.data_range(headers,self.data)
    xmin = ranges[0][0]
    xmax = ranges[0][1]
    ymin = ranges[1][0]
    ymax = ranges[1][1]

    # Endpoints in normalized space: x spans 0..1, y is the fitted value
    # rescaled by the y range.  The unrounded fit is used so a small slope
    # is not flattened by the 3-decimal display rounding.
    pt1 = [0.0, ((xmin * slope + intercept) - ymin)/(ymax - ymin),0,1 ]
    pt2 = [1.0, ((xmax * slope + intercept) - ymin)/(ymax - ymin),0,1 ]
    print("point1")
    print(pt1)
    print("point2")
    print(pt2)

    self.regressionMatrix = np.matrix([pt1,pt2])
    pts = (vtm * self.regressionMatrix.T).T
    print(pts)

    best_fit = self.canvas.create_line(pts[0,0],pts[0,1],pts[1,0],pts[1,1],
                                       width=3, fill='gold',tags="data")
    self.regressionLines.append(best_fit)
    self.label['text'] = ("The best fit line equation:\n y = " + str(m)
                          + "x + " + str(b)
                          + "\n\nR^2 value: " + str(r_squared))
def test(filename):
    """Smoke-test Data.addColumn and the analysis functions.

    filename: path handed to the Data constructor and, as the second
        argument, to each analysis function.
    """
    data = Data(filename)

    # Fifteen values per added column.
    enum_values = ['a'] * 9 + ['aa', 'aaa', 'a', 'a', 'a', 'aa']
    numeric_values = [1] * 5 + [2] * 5 + [4, 3, 3, 4, 5]
    data.addColumn('enumstuff3', 'enum', enum_values)
    data.addColumn('numberstuff3', 'numeric', numeric_values)

    print(data.get_data())
    # Called for its side effect; the return value is discarded.
    data.__str__()

    # The attribute is looked up just before each call, so a missing
    # function fails at the same point in the output as before.
    for function_name in (
        'data_range',
        'mean',
        'stdev',
        'normalize_columns_seperately',
        'normalize_columns_together',
    ):
        analyse = getattr(an, function_name)
        print(analyse([data.get_headers()[0], data.get_headers()[1]],
                      filename))
def buildLinearRegression(self, independent, dependent):
    """Plot one independent/dependent column pair with its regression line.

    independent: header of the column plotted on x.
    dependent: header of the column plotted on y.

    Side effects: stores the 4-column point matrix in self.data2matrix,
    draws one oval per row (appended to self.objects), stores the line
    endpoints in self.LRendpoints, draws the line (self.regLine, also
    appended to self.LRobjects) and places a Tk label at its second endpoint.
    """
    dx = 5
    dy = 5
    num_rows = self.data.get_num_rows()

    # Normalize each column on its own, then pad with a zero z column and a
    # homogeneous column of ones.
    norm_x = analysis.normalize_columns_separately([independent], self.data)
    norm_y = analysis.normalize_columns_separately([dependent], self.data)
    self.data2matrix = np.hstack((
        norm_x,
        norm_y,
        np.zeros((num_rows, 1)),
        np.ones((num_rows, 1)),
    ))

    # Transform the points into view coordinates and draw them.
    vtm = self.view.build()
    tp = (vtm*self.data2matrix.T).T
    for i in range(tp.shape[0]):
        tx = tp[i, 0]
        ty = tp[i, 1]
        pt = self.canvas.create_oval(tx - dx, ty - dy, tx + dx, ty + dy,
                                     fill="black", outline='')
        self.objects.append(pt)

    # Fit on the raw data.  x and y are passed separately: the one-argument
    # form guesses the orientation from the shape and would treat the two
    # rows as the variables when there are exactly two data points.
    xy = np.asarray(self.data.get_data([independent, dependent]))
    slope, intercept, r_value, p_value, std_err = sc.linregress(xy[:, 0],
                                                                xy[:, 1])
    # The fifth value returned by linregress is the standard error of the
    # slope, not r squared, so r squared is derived from r_value.
    r2 = r_value ** 2
    print("%s %s %s" % (slope, intercept, r2))

    x_bounds = analysis.data_range([independent], self.data)
    y_bounds = analysis.data_range([dependent], self.data)

    # Fitted y at each end of the x range, rescaled by the y range.
    y_span = y_bounds[0][1] - y_bounds[0][0]
    value1 = ((x_bounds[0][0] * slope + intercept) - y_bounds[0][0]) / y_span
    value2 = ((x_bounds[0][1] * slope + intercept) - y_bounds[0][0]) / y_span
    print("hi")
    self.LRendpoints = np.matrix([
        [0, value1, 0, 1],
        [1, value2, 0, 1],
    ])

    # Transform the endpoints and draw the line.
    points = (vtm * self.LRendpoints.T).T
    self.regLine = self.canvas.create_line(points[0,0], points[0,1],
                                           points[1,0], points[1,1],
                                           fill= "Red", width = 3)
    self.LRobjects.append(self.regLine)

    # Label the line at its second endpoint.
    self.lineLabel = tk.Label(self.canvas,
                              text = "Linear Regression:" + str(slope))
    self.lineLabel.place(x=points[1,0], y=points[1,1])
def buildLinearRegression(self,headers):
    """Plot headers[0] against headers[1] and overlay the regression line.

    headers: sequence whose first two entries name the x and y columns.

    Side effects: stores the point matrix in self.dataPointMatrix, draws one
    oval per row (appended to self.objects), stores the 4x2 endpoint matrix
    in self.linRegEndpoints, appends the drawn line to self.linRegLines and
    replaces the text in self.statslabel.

    NOTE(review): the original indentation was lost; this assumes that
    everything after the None check belonged to its body, so nothing is
    drawn when either header is None - confirm against the original file.
    """
    norm = an.normalize_columns_separately(self.data, headers)
    zeromatrix = np.zeros(norm.shape[0])
    onesmatrix = np.ones(norm.shape[0])

    # x and y are taken from the first two entries.
    xdatahead = headers[0]
    ydatahead = headers[1]
    if xdatahead is None or ydatahead is None:
        return

    # Append a zero column and a ones column to the normalized data.
    dmatrix = np.matrix(norm)
    nmatrix = np.matrix((zeromatrix, onesmatrix)).T
    self.dataPointMatrix = np.hstack((dmatrix, nmatrix))

    vtm = self.v.build()
    pts = (vtm * self.dataPointMatrix.T).T
    dx = 5
    for i in range(pts.shape[0]):
        x = pts[i, 0]
        y = pts[i, 1]
        pt = self.canvas.create_oval(x - dx, y - dx, x + dx, y + dx,
                                     fill='blue', outline='')
        self.objects.append(pt)

    # Fit on the raw columns, flattened to 1-D arrays.
    xdata = np.array(self.data.get_data([xdatahead]).T)[0]
    ydata = np.array(self.data.get_data([ydatahead]).T)[0]
    slope, intercept, r_value, p_value, slope_std_error = st.linregress(xdata, ydata)
    r2_value = r_value*r_value

    rangex = an.data_range(self.data,[xdatahead])
    rangey = an.data_range(self.data,[ydatahead])
    # Fitted y at both ends of the x range, rescaled by the y range.
    yspan = rangey[0,1]-rangey[0,0]
    yend0 = ((rangex[0,0]*slope+intercept)-rangey[0,0])/yspan
    yend1 = ((rangex[0,1]*slope+intercept)-rangey[0,0])/yspan

    # "%s" formatting gives the same text as the former two-item print
    # statements and also works as a Python 3 print call.
    print("minx %s" % (rangex[0,0],))
    print("maxx %s" % (rangex[0,1],))
    print("miny %s" % (rangey[0,0],))
    print("maxy %s" % (rangey[0,1],))

    # One endpoint per column: (x, y, z, 1).
    self.linRegEndpoints = np.matrix([
        [0.0, 1.0],
        [yend0, yend1],
        [0.0, 0.0],
        [1.0, 1.0],
    ])
    print("vtm %s" % (vtm,))
    print("linRegEndpoints %s" % (self.linRegEndpoints,))
    le = vtm*self.linRegEndpoints
    print("le %s" % (le,))

    self.linRegLines.append(
        self.canvas.create_line(le[0, 0], le[1, 0], le[0, 1], le[1, 1],
                                fill="red", tags="X"))
    self.statslabel.delete('1.0', tk.END)
    self.statslabel.insert(
        tk.END,
        "Slope: "+str(slope) + " " + "Intercept: " + str(intercept)
        + " " + "r^2 value: "+ str(r2_value))
# Finish the comma-separated line whose first entry is already in `s`.
s = s + "".join(", " + name for name in nheaders[1:])
print(s)

print("\nNumeric Matrix:")
print(dobj.numeric_matrix)

# Comma-separated list of column types.
print("\nTypes:")
types = dobj.get_types()
s = types[0] + "".join(", " + kind for kind in types[1:])
print(s)

# Summary statistics over `headers`.
r = analysis.data_range(headers, dobj)
print("Data Range:\n ", r)

mean = analysis.mean(headers, dobj)
print("Mean: \n", mean)

std = analysis.stdev(headers, dobj)
print("Standard Deviation: \n", std)

# Both normalization variants.
nor_m1 = analysis.normalize_columns_separately(headers, dobj)
print("Normalized Columns Separately: \n", nor_m1)

nor_m2 = analysis.normalize_columns_together(headers, dobj)
print("Normalized Columns Together: \n", nor_m2)
def build_3d_linear_regression(self, independent_variables, dependent_variable):
    """Plot two independent columns and one dependent column with a fit line.

    independent_variables: sequence whose first two entries are the headers
        of the independent columns.
    dependent_variable: header of the dependent column.

    Side effects: stores the normalized points (with a column of ones) in
    self.plot, resets self.size to one entry per point, draws one oval per
    point (appended to self.points), stores the fit endpoints in
    self.regression_endpoints, draws the line (self.line_of_fit, appended
    to self.regression_lines), places self.fit_label, then calls
    updateAxes, updateFits and updatePoints.
    """
    self.plot = analysis.normalize_columns_separately([
        independent_variables[0], independent_variables[1],
        dependent_variable
    ], self.data)
    # Append the homogeneous coordinate.
    self.plot = np.hstack((self.plot, np.ones((self.plot.shape[0], 1))))

    # Build the view matrix and transform the points.
    vtm = self.view.build()
    pts = self.plot * vtm  # the original noted (vtm * self.plot.T).T as an alternative

    # One size entry per point so the movement functions have data to use.
    self.size = [1] * len(pts)

    # Draw a small oval for every data point.
    for i in range(len(pts)):
        x = pts[i, 0]
        y = pts[i, 1]
        pt = self.canvas.create_oval(int(x - 1), int(y - 1),
                                     int(x + 1), int(y + 1),
                                     fill="black", outline='')
        self.points.append(pt)

    linres = analysis.linear_regression(self.data, independent_variables,
                                        dependent_variable)
    slope0 = linres[0]
    slope1 = linres[1]
    intercept = linres[2]
    rvalue = linres[4]

    def bounds(header):
        # One data_range call per column; its first row is read as (min, max).
        span = analysis.data_range([header], self.data)[0]
        return span[0], span[1]

    xmin, xmax = bounds(independent_variables[0])
    ymin, ymax = bounds(independent_variables[1])
    zmin, zmax = bounds(dependent_variable)

    # Endpoint coordinates exactly as the original computed them: the fit
    # is evaluated at xmin/xmax and rescaled by the y and z ranges.
    yends = [
        ((xmin * slope0[0, 0] + intercept[0, 0]) - ymin) / (ymax - ymin),
        ((xmax * slope0[0, 0] + intercept[0, 0]) - ymin) / (ymax - ymin)
    ]
    zends = [
        ((xmin * slope1[0, 0] + intercept[0, 0]) - zmin) / (zmax - zmin),
        ((xmax * slope1[0, 0] + intercept[0, 0]) - zmin) / (zmax - zmin)
    ]

    # One endpoint per column: (x, y, z, 1).
    self.regression_endpoints = np.matrix([[0.0, 1.0],
                                           [yends[0], yends[1]],
                                           [zends[0], zends[1]],
                                           [1, 1]])
    print("self.regression_endpoints", self.regression_endpoints)

    # Drawn from the untransformed endpoints; updateFits() is called below.
    self.line_of_fit = (self.canvas.create_line(
        self.regression_endpoints[0, 0], self.regression_endpoints[1, 0],
        self.regression_endpoints[0, 1], self.regression_endpoints[1, 1],
        fill="red"))
    self.regression_lines.append(self.line_of_fit)

    self.fit_label = tk.Label(self.canvas,
                              text="slope0: " + str(slope0[0, 0]) +
                              "\nslope1: " + str(slope1[0, 0]) +
                              "\nIntercept: " + str(intercept[0, 0]) +
                              "\nR-value: " + str(rvalue))
    self.fit_label.place(x=self.regression_endpoints[0, 1],
                         y=self.regression_endpoints[1, 1])

    self.updateAxes()
    self.updateFits()
    self.updatePoints()
print("String Matrix Representation")
# Called for its side effect; the return value is discarded.
data.__str__(False)
print()

# Column-only, row-only and combined subsets.
print("Subsets of Matrix\n")
col = [1]
row = [2, 4]

print("All rows with Column Subset")
print(data.subset(col), "\n")

print("All columns with Row Subset")
print(data.subset(rows=row), "\n")

print("Subset Rows and Columns")
print(data.subset(col, row), "\n")

# Each statistic is printed as a title line followed by the result of the
# named analysis function applied to all headers.
for title, function_name in (
    ("Range of Numeric Data", "data_range"),
    ("IQR of the Numeric Columns", "data_iqr"),
    ("Mean of the Numeric Columns", "data_mean"),
    ("Median of the Numeric Columns", "data_median"),
    ("StDev of the Numeric Columns", "data_stdev"),
    ("Variance of the Numeric Columns", "data_variance"),
):
    print(title)
    print(getattr(analysis, function_name)(data, data.get_headers()), "\n")
def buildLinearRegression(self):
    """Fit and draw a regression for the columns in self.dataheaders.

    With exactly two headers a simple regression line is drawn; otherwise a
    multiple regression of the third header on the first two is drawn as
    four coloured edges.

    Side effects: removes any previous single regression line and its text,
    sets self.rows, self.dataMatrix, self.intercept, self.r_squared,
    self.p_value, self.std_err and self.endpoints, appends the new slope(s)
    to self.slope, calls self.buildAxes(), and creates the canvas items
    (self.gRegressLine or self.gRegressLines, plus self.glinText).
    """
    # Remove the previous single-line fit and its caption, if any.
    if self.gRegressLine is not None:
        self.canvas.delete(self.gRegressLine)
        self.canvas.delete(self.glinText)
        self.gRegressLine = None

    temp_matrix = analysis.normalize_columns_separately(self.data, self.dataheaders)
    self.rows = len(temp_matrix)
    # Two-column data gets a zero z column; a column of ones is always added.
    if len(self.dataheaders) == 2:
        temp_matrix = np.hstack((temp_matrix, np.zeros(shape=(self.rows,1))))
    self.dataMatrix = np.hstack((temp_matrix, np.ones(shape=(self.rows,1))))
    self.buildAxes()

    if len(self.dataheaders) == 2:
        # x and y are passed separately: the one-argument form guesses the
        # orientation from the shape and would treat the rows as variables
        # when there are exactly two data points.
        xy = np.asarray(self.data.get_data(self.dataheaders))
        slope, self.intercept, r_value, self.p_value, self.std_err = (
            scipy.stats.linregress(xy[:, 0], xy[:, 1]))
        self.slope.append(slope)
        self.r_squared = r_value**2

        bounds = analysis.data_range(self.data, self.dataheaders)

        def rescale(value):
            # Position of a fitted value within the second column's range.
            return (value - bounds[1][1])/(bounds[1][0] - bounds[1][1])

        # Use the slope just fitted; self.slope[0] would be a stale value
        # if the list already held entries from an earlier fit.
        high = rescale(bounds[0][0]*slope + self.intercept)
        low = rescale(bounds[0][1]*slope + self.intercept)
        self.endpoints = np.matrix([[0, low, 0, 1],
                                    [1, high, 0, 1]])

        vtm = self.view.build()
        pts = (vtm * self.endpoints.T).T
        self.gRegressLine = self.canvas.create_line(pts[0,0], pts[0,1],
                                                    pts[1,0], pts[1,1],
                                                    fill = "red")
        linText = ("Slope: %.3f, Intercept: %.3f, R Squared: %.3f"%(slope, self.intercept, r_value**2))
        self.glinText = self.canvas.create_text(pts[1,0], pts[1,1], text = linText)
    else:
        regressstuffs = analysis.linear_regression(self.data, self.dataheaders[:2], [self.dataheaders[2],])
        self.intercept = regressstuffs[0][0]
        slope_x0 = regressstuffs[0][1]
        slope_x1 = regressstuffs[0][2]
        self.slope.append(slope_x0)
        self.slope.append(slope_x1)
        self.std_err = regressstuffs[1]
        self.r_squared = regressstuffs[2]
        self.p_value = regressstuffs[4]

        bounds = analysis.data_range(self.data, self.dataheaders)

        def rescale(value):
            # Position of a fitted value within the third column's range.
            return (value - bounds[2][1])/(bounds[2][0] - bounds[2][1])

        # As above, the freshly fitted slopes are used instead of
        # self.slope[0] / self.slope[1].
        highx0 = rescale(bounds[0][0]*slope_x0 + self.intercept)
        lowx0 = rescale(bounds[0][1]*slope_x0 + self.intercept)
        highx1 = rescale(bounds[1][0]*slope_x1 + self.intercept)
        lowx1 = rescale(bounds[1][1]*slope_x1 + self.intercept)

        # First independent variable runs along x, the second along y, and
        # the dependent variable along z.
        self.endpoints = np.matrix([[0, 0, lowx0, 1],
                                    [1, 0, highx0, 1],
                                    [0, 0, lowx1, 1],
                                    [0, 1, highx1, 1]])
        vtm = self.view.build()
        pts = (vtm * self.endpoints.T).T

        # Each edge gets its own colour so they can be told apart on screen.
        # The original author was unsure this is a correct 3D rendering of
        # the regression.
        self.gRegressLines = []
        for start, end, colour in ((0, 1, "red"),
                                   (2, 3, "green"),
                                   (0, 2, "blue"),
                                   (1, 3, "black")):
            self.gRegressLines.append(
                self.canvas.create_line(pts[start,0], pts[start,1],
                                        pts[end,0], pts[end,1],
                                        fill = colour))

        linText = ("X0 Slope: %.3f, X1 Slope: %.3f, Intercept: %.3f, R Squared: %.3f"%(slope_x0, slope_x1, self.intercept, self.r_squared))
        self.glinText = self.canvas.create_text(pts[1,0], pts[1,1], text = linText)
def buildAxes(self):
    """Redraw the axes, axis labels, tick marks and data points.

    Reads self.axes, self.xticks/yticks/zticks, self.dataMatrix,
    self.dataheaders, self.data, self.PCA, self.rows and the colour/size
    settings.  Calls self.resetData() first and self.resetAxes() last,
    appends every created canvas item to the matching self.g* list (points
    go to self.objects) and updates self.scaleText.

    NOTE(review): the original indentation was lost; the nesting below was
    reconstructed from which names each statement needs - confirm against
    the original file.
    """
    self.resetData()
    vtm = self.view.build()
    pts = (vtm * self.axes.T).T

    # Tick mark endpoints in view coordinates.
    xpts = (vtm*self.xticks.T).T
    ypts = (vtm*self.yticks.T).T
    zpts = (vtm*self.zticks.T).T
    if self.dataMatrix is not None:
        datapts = (vtm*self.dataMatrix.T).T

    # Offset applied to each axis label, indexed by axis number.
    number2axes = {0: [0, 10], 1: [10, 0], 2: [5,5]}

    for i in range(3):
        axis = self.canvas.create_line(pts[2*i,0], pts[2*i,1],
                                       pts[2*i+1,0], pts[2*i+1,1])
        self.gAxes.append(axis)

    for i in range(len(self.dataheaders)):
        text = self.canvas.create_text(
            pts[2*i+1,0]+number2axes[i][0]*1/self.view.extent[0,0],
            pts[2*i+1,1]+number2axes[i][1]*1/self.view.extent[0,1],
            text = self.dataheaders[i])
        self.gLabels.append(text)

    def range_for(header):
        # A header may belong to the raw data or to the PCA data.
        if header in self.data.get_headers():
            return analysis.data_range(self.data, (header,))
        return analysis.data_range(self.PCA, (header,))

    def tick_labels(drange):
        # Four labels: three quarter steps from drange[0][1], then
        # drange[0][0] itself.
        span = drange[0][0] - drange[0][1]
        return ["%.2f"%(drange[0][1]+span/4.0),
                "%.2f"%(drange[0][1]+2*span/4.0),
                "%.2f"%(drange[0][1]+3*span/4.0),
                "%.2f"%(drange[0][0])]

    if self.dataMatrix is not None:
        has_z = len(self.dataheaders) >2
        number2xlabel = tick_labels(range_for(self.dataheaders[0]))
        number2ylabel = tick_labels(range_for(self.dataheaders[1]))
        if has_z:
            number2zlabel = tick_labels(range_for(self.dataheaders[2]))

        for i in range(4):
            tick = self.canvas.create_line(xpts[2*i,0], xpts[2*i,1],
                                           xpts[2*i+1,0], xpts[2*i+1,1])
            self.gXticks.append(tick)
            text = self.canvas.create_text(xpts[2*i+1,0], xpts[2*i+1,1],
                                           text= number2xlabel[i])
            self.gXlabels.append(text)

            tick = self.canvas.create_line(ypts[2*i,0], ypts[2*i,1],
                                           ypts[2*i+1,0], ypts[2*i+1,1])
            self.gYticks.append(tick)
            text = self.canvas.create_text(ypts[2*i+1,0], ypts[2*i+1,1],
                                           text= number2ylabel[i])
            self.gYlabels.append(text)

            if has_z:
                tick = self.canvas.create_line(zpts[2*i,0], zpts[2*i,1],
                                               zpts[2*i+1,0], zpts[2*i+1,1])
                self.gZticks.append(tick)
                text = self.canvas.create_text(zpts[2*i+1,0], zpts[2*i+1,1],
                                               text= number2zlabel[i])
                self.gZlabels.append(text)

    if self.dataMatrix is not None:
        rows = self.rows
        lenx = 3*1.0/self.view.extent[0,0]
        leny = 3*1.0/self.view.extent[0,0]

        # Mid-grey when no colour column is selected.  Components are
        # converted to int because "%x" rejects floats on Python 3.
        default_color = "#%02x%02x%02x" % (int(0.5*255), int(0.5*255), int(0.5*255))

        # The colour mode does not change while drawing, so read it once.
        discrete = (self.colorResult is not None
                    and self.colorVar.get() == 1)

        # Warn once about an unknown colour column.  Previously this was
        # checked per point and then crashed on an undefined name, and the
        # range it looked up was never used.
        if (rows > 0 and self.colorResult is not None and not discrete
                and self.colorResult not in self.data.get_headers()
                and self.colorResult not in self.PCA.get_headers()):
            print("something is wrong, your color does not exist")

        for i in range(rows):
            dx = 1
            color = default_color
            if self.colorResult is not None:
                if discrete:
                    color = self.colors[int(self.colorMatrix[i,0])]
                else:
                    # Logistic curve centred on 0.5 blends the two ends
                    # of the gradient.
                    alpha = 1.0/(1.0+math.e**(-10*(self.colorMatrix[i,0]-0.5)))
                    r = (1.0- alpha)*255
                    g = (1.0 - alpha) *255
                    b = alpha*255
                    color = "#%02x%02x%02x" % (int(r), int(g), int(b))
            if self.sizeResult is not None:
                dx = self.sizeMatrix[i,0]*5
            self.objects.append(self.canvas.create_oval(
                datapts[i,0]-lenx*dx, datapts[i,1]-leny*dx,
                datapts[i,0] + lenx*dx, datapts[i,1]+leny*dx,
                fill = color, outline ='',))

    self.scaleText.set("%.2f, %.2f"%(self.view.extent[0,0], self.view.extent[0,1]))
    self.resetAxes()
def buildLinearRegression(self, indx, indz, dep, export, filename):
    """Plot a single- or two-variable linear regression and optionally save it.

    indx: header of the independent variable plotted on x.
    indz: header of a second independent variable, or '' for a
        single-variable regression.
    dep: header of the dependent variable plotted on y.
    export: when equal to 1 the regression summary is written to a file.
    filename: output path without extension; ".txt" is appended.

    Side effects: sets self.dataMatrix, self.sizes, self.colors and
    self.regressionMatrix, draws the points (appended to self.objects) and
    the fit line (appended to self.regressionObjects), relabels the first
    three entries of self.labels and appends the equation label to it.

    NOTE(review): the original indentation was lost; the zero column is
    assumed to be added only in the two-column case, which keeps the point
    matrix four columns wide - confirm against the original file.
    """
    num_rows = self.data.get_raw_num_rows()
    single = (indz == '')

    if not single:
        matrix = analysis.normalize_columns_separately(self.data, [indx, dep, indz])
    else:
        matrix = analysis.normalize_columns_separately(self.data, [indx, dep])
        # Without a z variable, pad with a column of zeros.
        matrix = np.hstack( (matrix, np.matrix(np.zeros(num_rows)).T) )
    # Homogeneous coordinate.
    self.dataMatrix = np.hstack( (matrix, np.matrix(np.ones(num_rows)).T) )

    # Transform into view coordinates.
    vtm = self.view.build()
    pts = (vtm * self.dataMatrix.T).T

    # Default point size and colour.
    self.sizes = [2]*num_rows
    self.colors = ['#000000']*num_rows
    for i in range(len(pts)):
        pt = self.canvas.create_oval(pts[i, 0]-self.sizes[i], pts[i, 1]-self.sizes[i],
                                     pts[i, 0]+self.sizes[i], pts[i, 1]+self.sizes[i],
                                     fill=self.colors[i], outline='')
        self.objects.append(pt)

    if single:
        # x and y are passed separately: the one-argument form guesses the
        # orientation from the shape and would treat the rows as variables
        # when there are exactly two data points.
        xy = np.asarray(self.data.get_data([indx, dep]))
        slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
            xy[:, 0], xy[:, 1])
        ranges = analysis.data_range(self.data, [indx, dep])
        # Fitted y at each end of the x range, rescaled by the y range.
        end1y = ((ranges[0][0]*slope+intercept)-ranges[1][0])/(ranges[1][1]-ranges[1][0])
        end2y = ((ranges[0][1]*slope+intercept)-ranges[1][0])/(ranges[1][1]-ranges[1][0])
        self.regressionMatrix = np.matrix([ [0.0, end1y, 0.0, 1.0],
                                            [1.0, end2y, 0.0, 1.0] ])
        eqn = "y = %.3fx + %.3f \nR = %.3f" % (slope, intercept, r_value)
        data = "p = %.3f \nStandard error = %.3f" % (p_value, std_err)
        out = eqn + "\n" + data
    else:
        b, sse, r2, t, p = analysis.linear_regression(self.data, [indx, indz], dep)
        ranges = analysis.data_range(self.data, [indx, indz, dep])
        # Fitted value at the first and second entry of both independent
        # ranges, rescaled by the dependent range.
        end1y = ranges[0][0]*b[0] + ranges[1][0]*b[1] + b[2]
        end1y = (end1y - ranges[2][0])/(ranges[2][1] - ranges[2][0])
        end2y = ranges[0][1]*b[0] + ranges[1][1]*b[1] + b[2]
        end2y = (end2y - ranges[2][0])/(ranges[2][1] - ranges[2][0])
        self.regressionMatrix = np.matrix([ [0.0, end1y, 0.0, 1.0],
                                            [1.0, end2y, 1.0, 1.0] ])
        eqn = "y = %.3fx + %.3fz + %.3f \nR^2 = %.3f" % (b[0], b[1], b[2], r2)
        sse_data = "Sum-squared error = %.3f" % (sse)
        p_data = "p = [%.3f, %.3f, %.3f]" % (p[0, 0], p[0, 1], p[0, 2])
        t_data = "t-statistic = [%.3f, %.3f, %.3f]" % (t[0, 0], t[0, 1], t[0, 2])
        out = eqn + "\n" + sse_data + "\n" + p_data + "\n" + t_data

    # Show the regression on screen.
    self.canvas.itemconfig(self.labels[0], text="x")
    self.canvas.itemconfig(self.labels[1], text="y")
    self.canvas.itemconfig(self.labels[2], text="z")
    endpts = (vtm * self.regressionMatrix.T).T
    line = self.canvas.create_line(endpts[0, 0], endpts[0, 1],
                                   endpts[1, 0], endpts[1, 1], fill="red")
    self.regressionObjects.append(line)
    regLabel = self.canvas.create_text(endpts[1, 0]+120, endpts[1, 1]+20, text=eqn)
    self.labels.append(regLabel)

    title = "Linear regression for " + str(self.fn)

    # Write the summary to disk; the with-block closes the file even if the
    # write fails.
    if (export == 1):
        with open(filename + ".txt", 'w') as out_file:
            out_file.write(title + "\n" + out)
def main():
    """Print a summary of the CSV named in sys.argv[1], add a column, export it.

    Prints the filename, dimensions, headers, numeric headers, types and the
    analysis results, then adds a numeric column 'new col', prints the
    resulting matrix and writes headers, types and matrix rows to 'foo.csv'
    in the current directory.

    With no filename argument a usage line is printed and the process exits
    with status 0.
    """
    numpy.set_printoptions(suppress=True)
    print("\n----- Database Info -----")
    if len(sys.argv) < 2:
        print('Usage: python %s <csv filename>' % (sys.argv[0]))
        sys.exit(0)

    # Constructing the Data object reads the file.
    dobj = data.Data(sys.argv[1])
    print("\nName: ", dobj.get_filename())

    # Dimensions.
    print('Number of rows: ', dobj.get_num_points())
    print('Number of numeric columns: ', dobj.get_num_dimensions())

    # str.join also copes with an empty list, which the previous manual
    # concatenation (starting from element 0) did not.
    print("\nHeaders:")
    headers = dobj.get_headers()
    print(", ".join(headers))

    print("\nNumeric Headers:")
    nheaders = dobj.get_numericheaders()
    print(", ".join(nheaders))

    print("\nTypes:")
    types = dobj.get_types()
    print(", ".join(types))

    r = analysis.data_range(headers, dobj)
    print("Data Range:\n ", r)
    mean = analysis.mean(headers, dobj)
    print("Mean: \n", mean)
    std = analysis.stdev(headers, dobj)
    print("Standard Deviation: \n", std)

    # Normalization is only run when every header is numeric.
    if headers == nheaders:
        nor_m1 = analysis.normalize_columns_separately(headers, dobj)
        print("Normalized Columns Separately: \n", nor_m1)
        nor_m2 = analysis.normalize_columns_together(headers, dobj)
        print("Normalized Columns Together: \n", nor_m2)

    s = analysis.sumup(headers, dobj)
    print("Sum:\n", s)
    print("Variance:\n", analysis.variance(headers, dobj))

    # EXTENSION5 ADD COLUMN
    # NOTE(review): 'add_colummn' is spelled with three m's - confirm it
    # matches the method name on the Data class.
    dobj.add_colummn('new col', 'numeric', list(range(1, 15)))
    print(
        "\nAdd new column: 'new col','numeric', [1,2,3,4,5,6,7,8,9,10,11,12,13,14]"
    )
    print("----- New Matrix: -----")
    m = dobj.get_whole_matrix()
    print(m)
    print('Number of rows: ', dobj.get_num_points())
    print('Number of numeric columns: ', dobj.get_num_dimensions())
    print("---------------------------------")

    # EXTENSION6 WRITE TO A CSV file
    a = numpy.asarray(m)
    # newline='' lets the csv module control line endings; without it each
    # row is followed by a blank line on platforms that translate "\n".
    with open('foo.csv', 'w', newline='') as outputfile:
        wr = csv.writer(outputfile, delimiter=',')
        wr.writerow(dobj.get_headers())
        wr.writerow(dobj.get_types())
        wr.writerows(a)