def getDistrib(data, nbins=0, stride=0, bins=[], norm=False):
    from scipy.stats import histogram, histogram2
    if nbins > 0:
        stride = (max(data) - min(data)) / nbins
        bins = np.arange(min(data) - stride, max(data) + stride, stride)
        dist = histogram2(data, bins)
        if norm:
            dist = map(float, dist)
            dist = [dist[i] / sum(dist) for i in range(len(dist))]
        return dist, bins, stride
    elif stride > 0:
        bins = np.arange(min(data) - stride, max(data) + stride, stride)
        dist = histogram2(data, bins)
        if norm:
            dist = map(float, dist)
            dist = [dist[i] / sum(dist) for i in range(len(dist))]
        return dist, bins
    elif len(bins) > 0:
        dist = histogram2(data, bins)
        if norm:
            dist = map(float, dist)
            dist = [dist[i] / sum(dist) for i in range(len(dist))]
        return dist
    else:
        nbins = 10
        stride = (max(data) - min(data)) / nbins
        bins = np.arange(min(data) - stride, max(data) + stride, stride)
        dist = histogram2(data, bins)
        if norm:
            dist = map(float, dist)
            dist = [dist[i] / sum(dist) for i in range(len(dist))]
        return dist, bins
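# Minimal usage sketch for getDistrib above. The data is synthetic, and the sketch
# assumes the Python 2-era environment the function itself expects: numpy imported
# as np at module level and an older SciPy release that still ships
# scipy.stats.histogram2.
import numpy as np

_demo_samples = list(np.random.normal(0.0, 1.0, 1000))
# nbins > 0 branch: returns the (normalized) per-bin counts, the bin edges and the bin width.
_demo_dist, _demo_bins, _demo_stride = getDistrib(_demo_samples, nbins=20, norm=True)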
def analyzeList(self, mylist, myrange=(0, 1, 1), filename=None):
    """
    histogram2(a, bins) -- Compute histogram of a using divisions in bins

    Description:
        Count the number of times values from array a fall into
        numerical ranges defined by bins.  Range x is given by
        bins[x] <= range_x < bins[x+1] where x = 0,N and N is the
        length of the bins array.  The last range is given by
        bins[N] <= range_N < infinity.  Values less than bins[0] are
        not included in the histogram.
    Arguments:
        a -- 1D array.  The array of values to be divided into bins
        bins -- 1D array.  Defines the ranges of values to use during
                histogramming.
    Returns:
        1D array.  Each value represents the occurrences for a given
        bin (range) of values.
    """
    #hist,bmin,minw,err = stats.histogram(mynumpy, numbins=36)
    #print hist,bmin,minw,err,"\n"

    if len(mylist) < 2:
        apDisplay.printWarning("Did not write file, not enough rows (" + str(filename) + ")")
        return

    if myrange[0] is None:
        mymin = float(math.floor(ndimage.minimum(mylist)))
    else:
        mymin = float(myrange[0])

    if myrange[1] is None:
        mymax = float(math.ceil(ndimage.maximum(mylist)))
    else:
        mymax = float(myrange[1])

    mystep = float(myrange[2])

    mynumpy = numpy.asarray(mylist, dtype=numpy.float32)
    print "range=", round(ndimage.minimum(mynumpy), 2), " <> ", round(ndimage.maximum(mynumpy), 2)
    print " mean=", round(ndimage.mean(mynumpy), 2), " +- ", round(ndimage.standard_deviation(mynumpy), 2)

    #histogram
    bins = []
    mybin = mymin
    while mybin <= mymax:
        bins.append(mybin)
        mybin += mystep
    bins = numpy.asarray(bins, dtype=numpy.float32)
    apDisplay.printMsg("Creating histogram with " + str(len(bins)) + " bins")
    hist = stats.histogram2(mynumpy, bins=bins)
    #print bins
    #print hist

    if filename is not None:
        f = open(filename, "w")
        for i in range(len(bins)):
            out = ("%3.4f %d\n" % (bins[i] + mystep / 2.0, hist[i]))
            f.write(out)
        f.write("&\n")
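# Toy check of the binning rule spelled out in the docstring above
# (bins[x] <= value < bins[x+1], last bin open-ended, values below bins[0]
# dropped). It assumes an older SciPy that still ships scipy.stats.histogram2;
# the values and edges are made up.
import numpy
from scipy import stats

_demo_vals = numpy.array([0.5, 1.2, 1.9, 2.5, 7.0])
_demo_bins = numpy.array([1.0, 2.0, 3.0])
# Expected counts: [2, 1, 1] -- 0.5 falls below bins[0] and is not counted.
_demo_hist = stats.histogram2(_demo_vals, _demo_bins)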
def PdfFromTrace(trace, intBounds):
    """
    Returns the empirical density function of a trace.

    Parameters
    ----------
    trace : vector of doubles
        The trace data
    intBounds : vector of doubles
        The array of interval boundaries. The pdf is the number of samples
        falling into an interval divided by the interval length.

    Returns
    -------
    x : vector of doubles
        The center of the intervals (the points where the empirical pdf is
        calculated)
    y : vector of doubles
        The values of the empirical pdf at the given points
    """
    hist = stats.histogram2(trace, intBounds)
    intlens = intBounds[1:] - intBounds[0:-1]
    y = hist[0:-1] / intlens / len(trace)
    x = (intBounds[1:] + intBounds[0:-1]) / 2.0
    return (x, y)
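# Hedged usage sketch for PdfFromTrace (synthetic exponential trace; it assumes the
# surrounding module's `stats` is the old scipy.stats with histogram2 and that numpy
# is available as np):
import numpy as np

_demo_trace = np.random.exponential(scale=1.0, size=10000)
_demo_edges = np.linspace(0.0, 8.0, 41)    # 40 equal-width intervals on [0, 8]
_demo_x, _demo_y = PdfFromTrace(_demo_trace, _demo_edges)
# _demo_y should roughly follow the exp(-x) density at the interval centers _demo_x.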
def pdf_velocity(particle):
    nbins = 10
    uX = np.zeros((np.size(particle[:, 0]), np.size(particle[0, :])))
    uY = np.zeros((np.size(particle[:, 0]), np.size(particle[0, :])))
    uZ = np.zeros((np.size(particle[:, 0]), np.size(particle[0, :])))
    Tmax = np.size(particle[:, 0])
    NParticles = np.size(particle[0, :])
    NParticles = 3
    print "T,N", Tmax, NParticles
    jrange = range(Tmax)
    for j in jrange:
        for i in range(NParticles):
            uX[j, i] = particle[j][i][3]
            uY[j, i] = particle[j][i][4]
            uZ[j, i] = particle[j][i][5]
    #print uX[:,1].sort()
    # in case the hist must be stored
    h_tmp = np.zeros(nbins)
    h_uX = np.zeros(nbins)
    binX = pl.linspace(uX[:, NParticles - 1].min(), uX[:, NParticles - 1].max(), nbins)
    print binX
    pl.figure(1)
    pl.subplot(121)
    for npart in range(NParticles):
        h_tmp = stats.histogram2(uX[:, npart], binX)
        pl.plot(binX, h_tmp)
        h_uX += h_tmp
    h_uX /= NParticles
    return (binX, h_uX)
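# Hypothetical call for pdf_velocity. The layout of `particle` is only inferred from
# the indexing above (particle[j][i] -> (x, y, z, ux, uy, uz) per time step j and
# particle i), and `pl` is assumed to be pylab, as in the function body:
#
#     binX, h_uX = pdf_velocity(particle)
#     pl.plot(binX, h_uX)
#     pl.show()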
def getSpectrum(self, eVChannel=1.0, limits=None):
    if limits is not None:
        startEnergy_eV = limits[0]
        endEnergy_eV = limits[1]
    else:
        startEnergy_eV = self.header["startEv"]
        endEnergy_eV = self.header["endEv"]

    energies_eV = numpy.arange(startEnergy_eV, endEnergy_eV + eVChannel, eVChannel)
    data = [energy_eV for energy_eV, dummy_time in self.cspData]
    intensities = stats.histogram2(data, energies_eV)
    assert len(energies_eV) == len(intensities)
    #print len(self.cspData), sum(intensities)
    return energies_eV[:-1], intensities[:-1]
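# Hypothetical call for getSpectrum. The object name `reader` is made up, and
# self.cspData is assumed to be a sequence of (energy_eV, time) pairs, as the list
# comprehension above implies:
#
#     energies_eV, counts = reader.getSpectrum(eVChannel=2.0, limits=(100.0, 1000.0))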
def generate_voi_histogram(self, poi, width):
    print 'poi', poi, 'width', width
    # indices of points in volume of interest (poi)
    pts_indices = self.get_voi_pts_indices(poi, width)
    self.voi_pts_indices = pts_indices
    pts = np.asarray(self.processor.pts3d_bound)
    pts = pts[:, pts_indices]  #truncate points to volume of interest
    self.voi_pts = pts
    #mlab.points3d(pts[0,:],pts[1,:],pts[2,:], mode='point')
    #mlab.show()

    #go from 0 to 2m, create histogram with 80 bins = bin of 2.5cm (=height-slice)
    min = 0.
    max = 2.
    self.voi_bincount = 80
    self.voi_interval_size = max - min
    bins = np.asarray(range(self.voi_bincount)) * self.voi_interval_size / float(self.voi_bincount)
    #print 'bins',bins
    hist = stats.histogram2(pts[2], bins) / float(len(pts[2]))
    #print 'zhist',hist
    #print zip(bins, hist)
    self.z_hist = hist
    self.z_hist_bins = bins
    slices = self.get_voi_slice_indices()
    self.z_hist_slices_indices = slices

    #precalculate spread values:
    self.z_hist_spread = []
    for indices in self.z_hist_slices_indices:
        a = self.processor.pts3d_bound[:, indices]
        # ev12 gives an indication about how far points are spread out in a specific height-slice
        u, ev12 = gaussian_curvature.spread(a)
        self.z_hist_spread += [(ev12[0], ev12[1])]

    #create h,s,i histograms for each slice:
    pts_h = []
    pts_s = []
    #print self.processor.pts3d_bound
    #TODO: does this use the volume of interest? should it???
    n, m = np.shape(np.asarray(self.processor.pts3d_bound))
    #print 'm',m,'len(self.processor.pts3d_bound[2,:].A1)',len(self.processor.pts3d_bound[2,:].A1)
    for index in range(m):
        pts_h.append(float(self.imNP_h[self.processor.map2d[1, index], self.processor.map2d[0, index]]))
    for index in range(m):
        pts_s.append(float(self.imNP_s[self.processor.map2d[1, index], self.processor.map2d[0, index]]))

    pts_i = np.asarray(self.processor.intensities_bound)
    #print 'ptsi',pts_i
    if np.max(pts_i) > 0:
        self.intensity_normalization_factor = 1.0 / float(np.max(pts_i)) * 255
    else:
        self.intensity_normalization_factor = 1.
    #print 'self.intensity_normalization_factor', self.intensity_normalization_factor
    #print pts_i
    pts_i *= self.intensity_normalization_factor
    pts_h = np.asarray(pts_h)
    pts_s = np.asarray(pts_s)

    self.z_hist_h_hists = []
    self.z_hist_s_hists = []
    self.z_hist_i_hists = []

    #normalize by maximum slice:
    max_count = 0
    max_count_index = 0
    for count_idx, indices in enumerate(slices):
        n = np.shape(indices)
        if n[0] > max_count:
            max_count = n[0]
            max_count_index = count_idx
    slize_height = (self.voi_interval_size / float(self.voi_bincount))
    self.z_hist_height_max = slize_height * (max_count_index + 0.5)
    #print 'max_count', max_count,'index',max_count_index, 'height in max bin', self.z_hist_height_max

    for indices in slices:
        pts_h_slice = pts_h[indices]
        pts_s_slice = pts_s[indices]
        pts_i_slice = pts_i[indices]

        self.hsi_hist_bincount = 5
        bins = np.asarray(range(0, self.hsi_hist_bincount)) * float(255.0 / float(self.hsi_hist_bincount))
        #print bins
        #todo: smooth with kernel fct
        count = float(len(pts_h_slice))
        if count == 0:
            count = 1
        hist_h = stats.histogram2(pts_h_slice, bins) / count
        self.z_hist_h_hists.append(hist_h)
        hist_s = stats.histogram2(pts_s_slice, bins) / count
        self.z_hist_s_hists.append(hist_s)
        hist_i = stats.histogram2(pts_i_slice, bins) / count
        #print 'hist_i', hist_i, pts_i_slice, bins, pts_i
        self.z_hist_i_hists.append(hist_i)
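# Standalone illustration of the height-slice histogram built above: 0-2 m split
# into 80 bins of 2.5 cm and normalized by the number of points. The z values are
# synthetic, and an older SciPy with stats.histogram2 is assumed.
import numpy as np
from scipy import stats

_demo_z = np.random.uniform(0.0, 2.0, size=5000)
_demo_bins = np.asarray(range(80)) * 2.0 / float(80)
_demo_z_hist = stats.histogram2(_demo_z, _demo_bins) / float(len(_demo_z))
# _demo_z_hist[k] is the fraction of points whose height falls into slice k.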
def get_featurevector(self, index, count, pts=None):
    if pts == None:
        pts = self.processor.pts3d_bound

    #print 'i',index,'c', count
    fv = []

    indices = np.asarray(self.kdtree_queried_indices[count])
    invalid_value = np.shape(pts)[1]
    #print indices
    #print 'iv',invalid_value
    indices = indices[indices != invalid_value]

    #print ut.getTime(), indices
    #print ut.getTime(), 'number of pts', len(indices)
    a = pts[:, indices]
    view = processor.rotate_to_plane(self.processor.scan_dataset.ground_plane_normal, np.matrix([-1, 0, 0.]).T)
    normal, eigenvalues = gaussian_curvature.gaussian_curvature(a, view)
    #eigenvalues = eigenvalues / np.square(r)
    #fv += [normal[0,0],0,normal[2,0]]
    #fv += normal.T.A[0].tolist()
    #fv += eigenvalues.tolist()

    #print np.asarray(pts[:,index].T[0])[0]
    # print 'pt',np.asarray(pts[:,index].T[0])
    point = pts[:, index]

    ev1, ev2 = self.get_voi_histogram_spread(point)
    #z_max_height_diff = pts[2,index] - self.get_voi_maxcount_height()
    #fv += [self.get_voi_histogram_value(point),z_max_height_diff,normal[0,0],normal[1,0],normal[2,0], ev1, ev2]
    fv += [self.get_voi_histogram_value(point), normal[0, 0], normal[1, 0], normal[2, 0], ev1, ev2]

    h = self.imNP_h[self.processor.map2d[1, index], self.processor.map2d[0, index]]
    s = self.imNP_s[self.processor.map2d[1, index], self.processor.map2d[0, index]]
    i = self.processor.intensities_bound[index]
    hsi = self.get_voi_hsi_histogram_values(point, h, s, i)
    fv += [hsi[0], hsi[1], hsi[2]]

    #print np.shape(self.imNP_tex1)
    #print np.shape(self.map2d)
    tex1 = self.imNP_tex1[self.processor.map2d[1, index], self.processor.map2d[0, index]]
    tex2 = self.imNP_tex2[self.processor.map2d[1, index], self.processor.map2d[0, index]]
    fv += [tex1, tex2]
    #print tex1, tex2

    #color histograms:
    colors_h = []
    colors_s = []
    colors_v = []
    for idx in indices:
        colors_h.append(float(self.imNP_h[self.processor.map2d[1, idx], self.processor.map2d[0, idx]]))
        colors_s.append(float(self.imNP_s[self.processor.map2d[1, idx], self.processor.map2d[0, idx]]))
        colors_v.append(float(self.imNP_v[self.processor.map2d[1, idx], self.processor.map2d[0, idx]]))

    color_hist = stats.histogram2(np.array(colors_h), [0, 51, 102, 153, 204])
    color_hist = color_hist / float(np.sum(color_hist))
    color_hist = list(color_hist)
    fv += color_hist
    color_hist = stats.histogram2(np.array(colors_s), [0, 51, 102, 153, 204])
    color_hist = color_hist / float(np.sum(color_hist))
    color_hist = list(color_hist)
    fv += color_hist
    color_hist = stats.histogram2(np.array(colors_v), [0, 51, 102, 153, 204])
    color_hist = color_hist / float(np.sum(color_hist))
    color_hist = list(color_hist)
    fv += color_hist

    #intensities
    intensities = self.processor.intensities_bound[indices]
    intensities = np.asarray(intensities)
    #map to 0-255-range: TODO: perhaps do some nonlinear transformation here?
    intensities = intensities / 10000 * 255
    intensity_hist = stats.histogram2(intensities, [0, 51, 102, 153, 204])
    intensity_hist = intensity_hist / float(np.sum(intensity_hist))
    intensity_hist = list(intensity_hist)
    fv += intensity_hist

    #current colors:
    fv += [float(self.imNP_h[self.processor.map2d[1, index], self.processor.map2d[0, index]]) / 255.0]
    fv += [float(self.imNP_s[self.processor.map2d[1, index], self.processor.map2d[0, index]]) / 255.0]
    fv += [float(self.imNP_v[self.processor.map2d[1, index], self.processor.map2d[0, index]]) / 255.0]

    #current intensity value (scaled)
    intensity = self.processor.intensities_bound[index]
    #scale:
    intensity = intensity / 15000.0
    intensity = [intensity]
    fv += intensity

    if self.debug_before_first_featurevector == True:
        self.debug_before_first_featurevector = False
        print ut.getTime(), 'get_featurevector: Choosing not to print Feature Vector Sample'
        #print ut.getTime(), 'feature vector sample(gaussian histograms):', fv
    return fv
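# Minimal sketch of the normalized 5-bin histogram pattern repeated above for the
# H, S, V and intensity channels. The helper name is made up; the hard-coded
# [0, 51, 102, 153, 204] edges come from the code itself, and np / stats are assumed
# imported at module scope, as the method already requires.
def _normalized_channel_hist(channel_values):
    hist = stats.histogram2(np.array(channel_values), [0, 51, 102, 153, 204])
    return list(hist / float(np.sum(hist)))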
def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--infile", required=True, help="Tabular file.") parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.") parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi") parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi") parser.add_argument( "--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;", ) parser.add_argument("--test_id", help="statistical test method") parser.add_argument( "--mwu_use_continuity", action="store_true", default=False, help= "Whether a continuity correction (1/2.) should be taken into account.", ) parser.add_argument( "--equal_var", action="store_true", default=False, help= "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.", ) parser.add_argument( "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values.", ) parser.add_argument( "--fisher", action="store_true", default=False, help="if true then Fisher definition is used", ) parser.add_argument( "--bias", action="store_true", default=False, help= "if false,then the calculations are corrected for statistical bias", ) parser.add_argument( "--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored", ) parser.add_argument( "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored", ) parser.add_argument( "--inclusive", action="store_true", default=False, help="if false,limit will be ignored", ) parser.add_argument( "--printextras", action="store_true", default=False, help= "If True, if there are extra points a warning is raised saying how many of those points there are", ) parser.add_argument( "--initial_lexsort", action="store_true", default="False", help= "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.", ) parser.add_argument( "--correction", action="store_true", default=False, help="continuity correction ", ) parser.add_argument( "--axis", type=int, default=0, help= "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)", ) parser.add_argument( "--n", type=int, default=0, help= "the number of trials. This is ignored if x gives both the number of successes and failures", ) parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram") parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.") parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction") parser.add_argument( "--score", type=int, default=0, help="Score that is compared to the elements in a.", ) parser.add_argument("--m", type=float, default=0.0, help="limits") parser.add_argument("--mf", type=float, default=2.0, help="lower limit") parser.add_argument("--nf", type=float, default=99.9, help="higher_limit") parser.add_argument( "--p", type=float, default=0.5, help= "The hypothesized probability of success. 0 <= p <= 1. 
The default value is p = 0.5", ) parser.add_argument("--alpha", type=float, default=0.9, help="probability") parser.add_argument( "--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds", ) parser.add_argument( "--proportiontocut", type=float, default=0.0, help="Proportion (in range 0-1) of total data set to trim of each end.", ) parser.add_argument( "--lambda_", type=float, default=1.0, help= "lambda_ gives the power in the Cressie-Read power divergence statistic", ) parser.add_argument( "--imbda", type=float, default=0, help= "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.", ) parser.add_argument( "--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e", ) parser.add_argument("--dtype", help="dtype") parser.add_argument("--med", help="med") parser.add_argument("--cdf", help="cdf") parser.add_argument("--zero_method", help="zero_method options") parser.add_argument("--dist", help="dist options") parser.add_argument("--ties", help="ties options") parser.add_argument("--alternative", help="alternative options") parser.add_argument("--mode", help="mode options") parser.add_argument("--method", help="method options") parser.add_argument("--md", help="md options") parser.add_argument("--center", help="center options") parser.add_argument("--kind", help="kind options") parser.add_argument("--tail", help="tail options") parser.add_argument("--interpolation", help="interpolation options") parser.add_argument("--statistic", help="statistic options") args = parser.parse_args() infile = args.infile outfile = open(args.outfile, "w+") test_id = args.test_id nf = args.nf mf = args.mf imbda = args.imbda inclusive1 = args.inclusive1 inclusive2 = args.inclusive2 sample0 = 0 sample1 = 0 sample2 = 0 if args.sample_cols is not None: sample0 = 1 barlett_samples = [] for sample in args.sample_cols.split(";"): barlett_samples.append(map(int, sample.split(","))) if args.sample_one_cols is not None: sample1 = 1 sample_one_cols = args.sample_one_cols.split(",") if args.sample_two_cols is not None: sample_two_cols = args.sample_two_cols.split(",") sample2 = 1 for line in open(infile): sample_one = [] sample_two = [] cols = line.strip().split("\t") if sample0 == 1: b_samples = columns_to_values(barlett_samples, line) if sample1 == 1: for index in sample_one_cols: sample_one.append(cols[int(index) - 1]) if sample2 == 1: for index in sample_two_cols: sample_two.append(cols[int(index) - 1]) if test_id.strip() == "describe": size, min_max, mean, uv, bs, bk = stats.describe( map(float, sample_one)) cols.append(size) cols.append(min_max) cols.append(mean) cols.append(uv) cols.append(bs) cols.append(bk) elif test_id.strip() == "mode": vals, counts = stats.mode(map(float, sample_one)) cols.append(vals) cols.append(counts) elif test_id.strip() == "nanmean": m = stats.nanmean(map(float, sample_one)) cols.append(m) elif test_id.strip() == "nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "kurtosistest": z_value, p_value = stats.kurtosistest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "itemfreq": freq = stats.itemfreq(map(float, sample_one)) for list in freq: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == 
"nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "boxcox_llf": IIf = stats.boxcox_llf(imbda, map(float, sample_one)) cols.append(IIf) elif test_id.strip() == "tiecorrect": fa = stats.tiecorrect(map(float, sample_one)) cols.append(fa) elif test_id.strip() == "rankdata": r = stats.rankdata(map(float, sample_one), method=args.md) cols.append(r) elif test_id.strip() == "nanstd": s = stats.nanstd(map(float, sample_one), bias=args.bias) cols.append(s) elif test_id.strip() == "anderson": A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist) cols.append(A2) for list in critical: cols.append(list) cols.append(",") for list in sig: cols.append(list) elif test_id.strip() == "binom_test": p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p) cols.append(p_value) elif test_id.strip() == "gmean": gm = stats.gmean(map(float, sample_one), dtype=args.dtype) cols.append(gm) elif test_id.strip() == "hmean": hm = stats.hmean(map(float, sample_one), dtype=args.dtype) cols.append(hm) elif test_id.strip() == "kurtosis": k = stats.kurtosis( map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias, ) cols.append(k) elif test_id.strip() == "moment": n_moment = stats.moment(map(float, sample_one), n=args.n) cols.append(n_moment) elif test_id.strip() == "normaltest": k2, p_value = stats.normaltest(map(float, sample_one)) cols.append(k2) cols.append(p_value) elif test_id.strip() == "skew": skewness = stats.skew(map(float, sample_one), bias=args.bias) cols.append(skewness) elif test_id.strip() == "skewtest": z_value, p_value = stats.skewtest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "sem": s = stats.sem(map(float, sample_one), ddof=args.ddof) cols.append(s) elif test_id.strip() == "zscore": z = stats.zscore(map(float, sample_one), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "signaltonoise": s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof) cols.append(s2n) elif test_id.strip() == "percentileofscore": p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind) cols.append(p) elif test_id.strip() == "bayes_mvs": c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha) cols.append(c_mean) cols.append(c_var) cols.append(c_std) elif test_id.strip() == "sigmaclip": c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n) cols.append(c) cols.append(c_low) cols.append(c_up) elif test_id.strip() == "kstest": d, p_value = stats.kstest( map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode, ) cols.append(d) cols.append(p_value) elif test_id.strip() == "chi2_contingency": chi2, p, dof, ex = stats.chi2_contingency( map(float, sample_one), correction=args.correction, lambda_=args.lambda_) cols.append(chi2) cols.append(p) cols.append(dof) cols.append(ex) elif test_id.strip() == "tmean": if nf == 0 and mf == 0: mean = stats.tmean(map(float, sample_one)) else: mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(mean) elif test_id.strip() == "tmin": if mf == 0: min = stats.tmin(map(float, sample_one)) else: min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive) cols.append(min) elif test_id.strip() == "tmax": if nf == 0: max = stats.tmax(map(float, sample_one)) else: max 
= stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive) cols.append(max) elif test_id.strip() == "tvar": if nf == 0 and mf == 0: var = stats.tvar(map(float, sample_one)) else: var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(var) elif test_id.strip() == "tstd": if nf == 0 and mf == 0: std = stats.tstd(map(float, sample_one)) else: std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(std) elif test_id.strip() == "tsem": if nf == 0 and mf == 0: s = stats.tsem(map(float, sample_one)) else: s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(s) elif test_id.strip() == "scoreatpercentile": if nf == 0 and mf == 0: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation, ) else: s = stats.scoreatpercentile( map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation, ) for list in s: cols.append(list) elif test_id.strip() == "relfreq": if nf == 0 and mf == 0: rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b) else: rel, low_range, binsize, ex = stats.relfreq( map(float, sample_one), args.b, (mf, nf)) for list in rel: cols.append(list) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "binned_statistic": if nf == 0 and mf == 0: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, ) else: st, b_edge, b_n = stats.binned_statistic( map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b, range=(mf, nf), ) cols.append(st) cols.append(b_edge) cols.append(b_n) elif test_id.strip() == "threshold": if nf == 0 and mf == 0: o = stats.threshold(map(float, sample_one), newval=args.new) else: o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new) for list in o: cols.append(list) elif test_id.strip() == "trimboth": o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut) for list in o: cols.append(list) elif test_id.strip() == "trim1": t1 = stats.trim1( map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail, ) for list in t1: cols.append(list) elif test_id.strip() == "histogram": if nf == 0 and mf == 0: hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b) else: hi, low_range, binsize, ex = stats.histogram( map(float, sample_one), args.b, (mf, nf)) cols.append(hi) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "cumfreq": if nf == 0 and mf == 0: cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b) else: cum, low_range, binsize, ex = stats.cumfreq( map(float, sample_one), args.b, (mf, nf)) cols.append(cum) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "boxcox_normmax": if nf == 0 and mf == 0: ma = stats.boxcox_normmax(map(float, sample_one)) else: ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method) cols.append(ma) elif test_id.strip() == "boxcox": if imbda == 0: box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha) cols.append(box) cols.append(ma) cols.append(ci) else: box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha) cols.append(box) elif test_id.strip() == "histogram2": h2 = stats.histogram2(map(float, sample_one), map(float, sample_two)) for list in h2: cols.append(list) elif 
test_id.strip() == "ranksums": z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two)) cols.append(z_statistic) cols.append(p_value) elif test_id.strip() == "ttest_1samp": t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two)) for list in t: cols.append(list) for list in prob: cols.append(list) elif test_id.strip() == "ansari": AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two)) cols.append(AB) cols.append(p_value) elif test_id.strip() == "linregress": slope, intercept, r_value, p_value, stderr = stats.linregress( map(float, sample_one), map(float, sample_two)) cols.append(slope) cols.append(intercept) cols.append(r_value) cols.append(p_value) cols.append(stderr) elif test_id.strip() == "pearsonr": cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two)) cols.append(cor) cols.append(p_value) elif test_id.strip() == "pointbiserialr": r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two)) cols.append(r) cols.append(p_value) elif test_id.strip() == "ks_2samp": d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two)) cols.append(d) cols.append(p_value) elif test_id.strip() == "mannwhitneyu": mw_stats_u, p_value = stats.mannwhitneyu( map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity, ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "zmap": z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "ttest_ind": mw_stats_u, p_value = stats.ttest_ind(map(float, sample_one), map(float, sample_two), equal_var=args.equal_var) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "ttest_rel": t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(t) cols.append(prob) elif test_id.strip() == "mood": z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis) cols.append(z) cols.append(p_value) elif test_id.strip() == "shapiro": W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta) cols.append(W) cols.append(p_value) for list in a: cols.append(list) elif test_id.strip() == "kendalltau": k, p_value = stats.kendalltau( map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort, ) cols.append(k) cols.append(p_value) elif test_id.strip() == "entropy": s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base) cols.append(s) elif test_id.strip() == "spearmanr": if sample2 == 1: rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two)) else: rho, p_value = stats.spearmanr(map(float, sample_one)) cols.append(rho) cols.append(p_value) elif test_id.strip() == "wilcoxon": if sample2 == 1: T, p_value = stats.wilcoxon( map(float, sample_one), map(float, sample_two), zero_method=args.zero_method, correction=args.correction, ) else: T, p_value = stats.wilcoxon( map(float, sample_one), zero_method=args.zero_method, correction=args.correction, ) cols.append(T) cols.append(p_value) elif test_id.strip() == "chisquare": if sample2 == 1: rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof) else: rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof) cols.append(rho) cols.append(p_value) elif test_id.strip() == "power_divergence": if sample2 == 1: stat, p_value = stats.power_divergence( map(float, 
sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_, ) else: stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_) cols.append(stat) cols.append(p_value) elif test_id.strip() == "theilslopes": if sample2 == 1: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha) else: mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha) cols.append(mpe) cols.append(met) cols.append(lo) cols.append(up) elif test_id.strip() == "combine_pvalues": if sample2 == 1: stat, p_value = stats.combine_pvalues( map(float, sample_one), method=args.med, weights=map(float, sample_two), ) else: stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med) cols.append(stat) cols.append(p_value) elif test_id.strip() == "obrientransform": ob = stats.obrientransform(*b_samples) for list in ob: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == "f_oneway": f_value, p_value = stats.f_oneway(*b_samples) cols.append(f_value) cols.append(p_value) elif test_id.strip() == "kruskal": h, p_value = stats.kruskal(*b_samples) cols.append(h) cols.append(p_value) elif test_id.strip() == "friedmanchisquare": fr, p_value = stats.friedmanchisquare(*b_samples) cols.append(fr) cols.append(p_value) elif test_id.strip() == "fligner": xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(xsq) cols.append(p_value) elif test_id.strip() == "bartlett": T, p_value = stats.bartlett(*b_samples) cols.append(T) cols.append(p_value) elif test_id.strip() == "levene": w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples) cols.append(w) cols.append(p_value) elif test_id.strip() == "median_test": stat, p_value, m, table = stats.median_test( ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples) cols.append(stat) cols.append(p_value) cols.append(m) cols.append(table) for list in table: elements = ",".join(map(str, list)) cols.append(elements) outfile.write("%s\n" % "\t".join(map(str, cols))) outfile.close()
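# Hypothetical invocation of the command-line tool above. The script name and the
# column indices are made up; the flags and the "pearsonr" test id match the
# argparse definitions and test branches in the code:
#
#     python stats_tool.py -i input.tsv -o output.tsv --test_id pearsonr \
#         --sample_one_cols 1,2,3 --sample_two_cols 4,5,6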
def histogram(self, totals=None):
    """histogram of days to guesses"""
    if totals is None:
        totals = self.totals
    range = self.range_of_ints(totals)
    return stats.histogram2(totals, range)
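# Hypothetical usage; `tracker` stands in for an instance of the surrounding class,
# and self.totals / range_of_ints() are assumed to exist on it:
#
#     counts = tracker.histogram()                 # histogram over the stored totals
#     counts = tracker.histogram([1, 2, 2, 5, 9])  # or over an explicit list of day counts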
import sys
import os
import string

import numpy
import scipy
import scipy.stats
from scipy.stats import histogram, histogram2

debug = os.getenv("DEBUG")
rsptime_fn = sys.argv[1]
f = open(rsptime_fn, "r")
records = f.readlines()

times = numpy.array([float(r.strip().split(',')[1]) for r in records])
maxtime = max(times)
(time_histo, time_low_range, time_binsize, time_extrapoints) = histogram(
    times, defaultlimits=(0.0, maxtime))
assert(time_low_range == 0.0)
assert(time_extrapoints == 0)
if debug:
    print(time_histo, ' shape ', time_histo.shape,
          ' low_range ', time_low_range,
          ' binsize ', time_binsize,
          ' extrapoints ', time_extrapoints)
print('time histogram: %s' % string.join([str(v) for v in time_histo.tolist()], ','))

rsptimes = numpy.array([float(r.strip().split(',')[2]) for r in records])
rsptime_histo = histogram2(
    rsptimes,
    [0.0001, 0.00032, 0.001, 0.0032, 0.01, 0.032, 0.1, 0.32, 1, 3.2, 10, 32, 100])
if debug:
    print(rsptime_histo, rsptime_histo.shape)
print('response time histogram: %s' % string.join([str(v) for v in rsptime_histo.tolist()], ','))
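# Input format inferred from the parsing above: one record per line with at least
# three comma-separated fields, where field 1 is the sample time and field 2 is the
# response time in seconds, e.g.
#
#     op-000001,0.004,0.00087
#     op-000002,0.009,0.00112
#
# Run as (script name hypothetical):  DEBUG=1 python rsptime_histogram.py rsptimes.csv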