def all_means(cp):
    """Accepts a cp object and returns the mean and standard deviation of
    every parameter across all fingerprints, as a list of
    (feature, mu, std) tuples, one per feature."""
    data = []
    for parameter_pair in zip(*[finger.items() for finger in cp.fingerprints()]):
        parameter, values = zip(*parameter_pair)
        print parameter, values
        print st.nanmean(values), st.nanstd(values)
        data.append([parameter[0], st.nanmean(values), st.nanstd(values)])
    return data
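# The `st` module above is assumed to be scipy.stats, whose nanmean/nanstd
# were deprecated and later removed (SciPy 1.0); numpy's np.nanmean and
# np.nanstd are the usual replacements. A minimal, self-contained sketch of
# the same transpose-and-aggregate pattern; the fingerprint dicts are made up:
import numpy as np

fingerprints = [
    {"area": 1.0, "perimeter": 4.0},
    {"area": 2.0, "perimeter": float("nan")},
    {"area": 3.0, "perimeter": 5.0},
]

means = []
for pairs in zip(*[sorted(fp.items()) for fp in fingerprints]):
    names, values = zip(*pairs)
    values = np.asarray(values, dtype=float)
    # NaNs are ignored, so the perimeter mean is computed over two values
    means.append((names[0], np.nanmean(values), np.nanstd(values)))
print(means)  # sorting the items makes the feature alignment explicit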
def cinfo(CL, param):
    """Returns information on the parameter in the cloud (all values given
    in the units of the parameter). Note that the parameter is averaged over
    the entire cloud period at the required altitude (bottom, top or
    in-cloud), unlike vpinfo(CL, param).
        CloudObj.cinfo["bottom"]: param at the cloud base
        CloudObj.cinfo["top"]: param at the cloud top
        CloudObj.cinfo["mean"]: mean param through the cloud (in cloud)
        CloudObj.cinfo["median"]: median param through the cloud (in cloud)
        CloudObj.cinfo["stdev"]: standard deviation of the param through the cloud (in cloud)
        CloudObj.cinfo["delta"]: difference of param between the bottom and the top
        CloudObj.cinfo["slope"]: delta divided by the mean thickness
    The property can be accessed as e.g. CloudObj.cinfo["bottom"] or
    CloudObj.cinfo (dictionary)."""
    H = dict()
    for key in ["bottom", "top", "mean", "median", "stdev", "delta", "slope", "units"]:
        H[key] = list()
    alt = [i for i, x in enumerate(CL.dttl) if x == 'altitude'][0]
    T = [i for i, x in enumerate(CL.dttl) if x == param][0]
    try:
        for i in range(len(CL.props["height"])):
            hgt = CL.props["height"][i]  # 4-point height profile of scan i
            # in-cloud points: between the top of the base zone and the
            # bottom of the top zone
            ix = nonzero((CL.data[alt] >= hgt[1]) * (CL.data[alt] <= hgt[2]))
            H["bottom"].append(float(st.nanmedian(CL.data[T][nonzero((CL.data[alt] >= hgt[0]) * (CL.data[alt] <= hgt[1]))])))
            H["top"].append(float(st.nanmedian(CL.data[T][nonzero((CL.data[alt] >= hgt[2]) * (CL.data[alt] <= hgt[3]))])))
            H["mean"].append(float(st.nanmean(CL.data[T][ix])))
            H["median"].append(float(st.nanmedian(CL.data[T][ix])))
            H["stdev"].append(float(st.nanstd(CL.data[T][ix])))
            H["delta"].append(H["bottom"][i] - H["top"][i])
            H["slope"].append(H["delta"][i] / (np.mean([hgt[2], hgt[3]]) - np.mean([hgt[0], hgt[1]])))  # units/meter
            H["units"].append(CL.dunit[T])
            del ix
    except:
        print("[cinfo] Height properties must be defined first using the defheight method.")
    return H
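# A worked sketch of the "slope" computed above, under the (inferred)
# assumption that CL.props["height"][i] holds four altitudes
# [base_min, base_max, top_min, top_max] in metres; all numbers are made up.
import numpy as np

height = [800.0, 900.0, 1500.0, 1600.0]  # hypothetical 4-point profile
bottom, top = 10.0, 4.0                  # hypothetical medians at base and top
delta = bottom - top                                   # 6.0 units
thickness = np.mean(height[2:]) - np.mean(height[:2])  # 1550 - 850 = 700 m
slope = delta / thickness                              # units per metre
print(delta, thickness, slope)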
def collapse(self, keys, vName):
    """
    desc:
        Collapse the data by a (list of) keys and get statistics on a
        dependent variable.

    arguments:
        keys:
            desc: A key or list of keys to collapse the data on.
            type: [list, str, unicode]
        vName:
            desc: The dependent variable to collapse. Alternatively, you can
                  specify a function, in which case the error will be 0.
            type: [str, unicode, function]

    returns:
        desc: A DataMatrix with the collapsed data, with descriptive
              statistics on `vName`.
        type: DataMatrix
    """
    if isinstance(keys, basestring):
        keys = [keys]
    m = [keys + ['mean', 'median', 'std', 'se', '95ci', 'count']]
    for g in self.group(keys):
        l = []
        for key in keys:
            l.append(g[key][0])
        if type(vName) == types.FunctionType:
            l.append(vName(g))
            l += [np.nan, np.nan, np.nan, np.nan]
            l.append(len(g))
        else:
            a = g[vName]
            l.append(nanmean(a))
            l.append(nanmedian(a))
            l.append(nanstd(a))
            l.append(nanstd(a) / np.sqrt(a.size))         # standard error
            l.append(1.96 * nanstd(a) / np.sqrt(a.size))  # 95% CI half-width
            l.append(a.size)
        m.append(l)
    return DataMatrix(m)
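# Illustrative usage only; the DataMatrix `dm` and the column names are
# assumptions, not part of the snippet above:
#
#     cm = dm.collapse(['condition'], 'rt')
#
# `cm` then holds one row per condition with the columns
# condition, mean, median, std, se, 95ci, count, where se = std/sqrt(n) and
# 95ci = 1.96*se (a normal-approximation confidence half-width).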
def produce_stats(data):
    m = {}
    for (server, size, conns, it, time, rate) in data:
        map_add(m, (server, size, conns), rate)
    data = []
    for k, v in m.items():
        mean = stats.nanmean(v)
        stddev = stats.nanstd(v)
        data += [k + (mean, stddev)]
    return data
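# `map_add` is not defined in this snippet; presumably it accumulates each
# rate into a list keyed by (server, size, conns). A minimal stand-in:
def map_add(m, key, value):
    m.setdefault(key, []).append(value)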
def aggregate_ftr_matrix(self, ftr_matrix):
    sig = []
    for ftr in ftr_matrix:
        median = stats.nanmedian(ftr)
        mean = stats.nanmean(ftr)
        std = stats.nanstd(ftr)
        # An "invalid value in double_scalars" warning appears here
        skew = stats.skew(ftr) if any(ftr) else 0.0
        kurtosis = stats.kurtosis(ftr)
        sig.extend([median, mean, std, skew, kurtosis])
    return sig
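# A self-contained sketch of one loop iteration using the maintained APIs
# (scipy's nan-aware median/mean/std now live in numpy, and skew/kurtosis
# accept nan_policy='omit'); the input row is made up:
import numpy as np
from scipy import stats

ftr = np.array([1.0, 2.0, np.nan, 4.0])
sig_row = [
    np.nanmedian(ftr),
    np.nanmean(ftr),
    np.nanstd(ftr),
    stats.skew(ftr, nan_policy='omit') if np.any(ftr) else 0.0,
    stats.kurtosis(ftr, nan_policy='omit'),
]
print(sig_row)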
def dmso_means(cp):
    """Accepts a cp object and returns the mean and standard deviation of
    every parameter across the dmso fingerprints, as a list of
    (feature, mu, std) tuples, one per feature."""
    dmsos = [dmso.items() for dmso in get_dmsos(cp)]
    data = []
    for parameter_pair in zip(*dmsos):
        parameter, values = zip(*parameter_pair)
        data.append((parameter[0], st.nanmean(values), st.nanstd(values)))
    return data
def run_stats(x, n):
    """run_stats(x, n). Calculates and returns the running mean, median,
    standard deviation, and median absolute deviation (MAD). This function
    handles NaNs and masked values (masked arrays) by ignoring them.
    x is the 1-D array on which the running statistics are calculated;
    n is the number of points in the running-statistics window."""
    x = copy.deepcopy(x)
    try:
        x.mask
    except AttributeError:
        x = np.ma.array(x, mask=False)
    if len(np.shape(x)) > 2:
        raise ValueError("The array provided has more than 2 dimensions, at most 1 or 2 dimensions can be handled.")
    try:
        [ro, co] = np.shape(x)
    except ValueError:
        ro = np.shape(x)[0]
        co = 1
    if ro == 1 or co == 1:
        ro = max(ro, co)
        x = x.reshape(ro,)
    else:
        raise ValueError("The array must be a vector (one column or row)")
    # initializing the window matrix: row i holds the n points centred on x[i]
    M = ones([ro, n]) * NaN
    M = ma.asanyarray(M)
    # building the matrix (integer division keeps the indices integral)
    if n % 2 == 1:  # n is odd
        for j in range(n // 2, 0, -1):
            posi = n // 2 - j  # current position
            M[0:ro - j, posi] = x[j:]
        for j in range(1, 2 + n // 2):
            posi = n // 2 + j - 1
            M[j - 1:, posi] = x[0:(ro + 1) - j]
    elif n % 2 == 0:  # n is even
        for j in range(n // 2, 0, -1):
            posi = n // 2 - j
            M[0:ro - j, posi] = x[j:]
        for j in range(1, n // 2 + 1):
            posi = n // 2 + j - 1
            M[j - 1:, posi] = x[0:(ro + 1) - j]
    else:
        print("Well, that's pretty weird. Are you sure n is an integer?")
    M.data[M.mask] = nan
    ave = st.nanmean(M, axis=1)
    med = st.nanmedian(M, axis=1)
    stde = st.nanstd(M, axis=1)
    mad = medabsdev(M, axis=1)
    return [ave, med, stde, mad]
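# Minimal usage sketch, assuming the pylab-style names used above
# (np, ma, ones, NaN, nan, st, medabsdev) are in scope:
#
#     x = np.array([1.0, 2.0, np.nan, 4.0, 5.0, 6.0])
#     ave, med, stde, mad = run_stats(x, 3)
#
# Each output has one entry per input point; NaNs are ignored inside each
# 3-point window rather than propagating. pandas provides the same centred
# running mean via Series.rolling(3, center=True, min_periods=1).mean().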
def compute_average(nh=10, lr_num=10, lr_denum=1000, prefix='rbm', smoothing=True):
    cmd = "grep -rl --include='orig.conf' 'lr_num = %i$' . |" % lr_num +\
          "xargs grep 'lr_denum = %i$' " % lr_denum
    print cmd
    p = os.popen(cmd)
    numseeds = len([pi for pi in enumerate(p)])
    p = os.popen(cmd)
    x = numpy.ones((numseeds, 20)) * numpy.nan
    y = numpy.ones((numseeds, 20)) * numpy.nan
    for i, match in enumerate(p):
        jid = match.split('/')[1]
        rfname = '%s/%s_train_callback.hdf5' % (jid, prefix)
        if not os.path.exists(rfname):
            continue
        fp = tables.openFile(rfname)
        _x = fp.root.train_ll.col('n')
        _y = fp.root.train_ll.col('train_ll')
        _vlogz = fp.root.var_logz.col('var_logz')
        fp.close()
        if smoothing:
            # keep only the epochs where the log-partition variance is small
            idx = numpy.where(_vlogz < 50.)[0]
            x[i, idx] = _x[idx]
            y[i, idx] = _y[idx]
        else:
            x[i, :len(_x)] = _x
            y[i, :len(_y)] = _y
    print '**** prefix=%s nh=%i lr_num=%s lr_denum=%s ******' % (prefix, nh, lr_num, lr_denum)
    print nanmean(y, axis=0)
    # aggregate across seeds (rows), ignoring missing runs
    xmean = nanmean(x, axis=0)
    ymean = nanmean(y, axis=0)
    ystd = nanstd(y, axis=0)
    ystd[numpy.isnan(ystd)] = 0.
    idx = ~numpy.isnan(xmean)
    return [xmean[idx], ymean[idx], ystd[idx]]
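# Note: tables.openFile is the PyTables 2.x spelling; PyTables 3 renamed it,
# so on current installs the equivalent call (same arguments) would be:
#
#     fp = tables.open_file(rfname)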
def runstats(x, n):
    # Stephanie Gagne, UHel, 2010
    # converted to Python, Dal, 2012
    # x is a 1-dimensional array.
    # n is the number of points taken in the running statistic.
    """Takes data and the number of points for the running mean/standard
    deviation, and returns the running mean and running standard deviation."""
    try:
        x.mask
    except AttributeError:
        x = ma.asanyarray(x)
        x.mask = ones(np.shape(x)) * False
    try:
        [ro, co] = np.shape(x)
    except ValueError:
        ro = np.shape(x)[0]
        co = 1
    if ro == 1 or co == 1:
        x = x.reshape(max(ro, co),)
    else:
        print("The array must be a vector (one column or row)")
    # initializing the window matrix
    ro = max(ro, co)
    M = ones([ro, n]) * NaN
    M = ma.asanyarray(M)
    # building the matrix (integer division keeps the indices integral)
    if n % 2 == 1:  # n is odd
        for j in range(n // 2, 0, -1):
            posi = n // 2 - j  # current position
            M[0:ro - j, posi] = x[j:]
        for j in range(1, 2 + n // 2):
            posi = n // 2 + j - 1
            M[j - 1:, posi] = x[0:(ro + 1) - j]
    elif n % 2 == 0:  # n is even
        for j in range(n // 2, 0, -1):
            posi = n // 2 - j
            M[0:ro - j, posi] = x[j:]
        for j in range(1, n // 2 + 1):
            posi = n // 2 + j - 1
            M[j - 1:, posi] = x[0:(ro + 1) - j]
    else:
        print("Well, that's pretty weird. Are you sure n is an integer?")
    M.data[M.mask] = NaN
    ave = st.nanmean(M, axis=1)
    stde = st.nanstd(M, axis=1)
    return [ave, stde]
def addDescriptives(self):
    """Adds averages and errors to the PivotMatrix"""
    # Determine the row averages and std
    self.rowMeans = []
    self.rowStds = []
    for rowIndex in range(self.nRows):
        row = self.m[self.rowHeaders + rowIndex][self.colHeaders:-2]
        self.rowMeans.append(nanmean(row, axis=None))
        self.rowStds.append(nanstd(row, axis=None))
        self.m[self.rowHeaders + rowIndex][-2] = nanmean(row, axis=None)
        self.m[self.rowHeaders + rowIndex][-1] = nanstd(row, axis=None)
    # Determine the column averages and std
    _m = self.m.swapaxes(0, 1)
    self.colMeans = []
    self.colErrs = []
    for colIndex in range(self.nCols):
        col = _m[self.colHeaders + colIndex][self.rowHeaders:-2]
        _m[self.colHeaders + colIndex][-2] = nanmean(col, axis=None)
        if self.err == '95ci':
            e = nanstd(col, axis=None) / np.sqrt(col.size) * 1.96
        elif self.err == 'se':
            e = nanstd(col, axis=None) / np.sqrt(col.size)
        elif self.err == 'std':
            e = nanstd(col, axis=None)
        else:
            raise Exception('Err keyword must be "95ci", "se", or "std"')
        _m[self.colHeaders + colIndex][-1] = e
        self.colMeans.append(nanmean(col, axis=None))
        self.colErrs.append(e)
    # Determine the grand average and std
    self.m[-2, -2] = nanmean(self.m[self.rowHeaders:-2, self.colHeaders:-2], axis=None)
    self.m[-1, -1] = nanstd(self.m[self.rowHeaders:-2, self.colHeaders:-2], axis=None)
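# The three error options above, worked on a toy column (np.nanstd stands in
# for the older scipy nanstd; note their ddof defaults differ). Also note
# that the divisor is col.size, which still counts NaN entries even though
# nanstd ignores them:
import numpy as np

col = np.array([4.0, 5.0, np.nan, 7.0])
std = np.nanstd(col)            # std over the 3 valid points
se = std / np.sqrt(col.size)    # divides by 4, matching the code above
ci95 = 1.96 * se                # normal-approximation 95% half-width
print(std, se, ci95)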
csv_writer = csv.writer(csvfile, delimiter=',', quotechar='|')
csv_writer.writerow(hot[beginning_time:end_time])
with open('volume all 394 gp data.csv', 'ab') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',', quotechar='|')
    csv_writer.writerow(gp[beginning_time:end_time])

# Append data
hot_total_volume.append(hot[beginning_time:end_time])
gp_total_volume.append(gp[beginning_time:end_time])
date_1 += timedelta(days=1)

# Average volume at each time across all days
# (nanstd gives a standard deviation, despite the "variance_" names)
average_hot_volume = sci.nanmean(hot_total_volume, axis=0)
variance_hot_volume = sci.nanstd(hot_total_volume, axis=0)
average_gp_volume = sci.nanmean(gp_total_volume, axis=0)
variance_gp_volume = sci.nanstd(gp_total_volume, axis=0)

# Group into 3-minute increments (6 x 30 seconds)
k = 0
resolution = 6
while k < len(average_hot_volume):
    average_hot_volume[k:k + resolution] = sum(average_hot_volume[k:k + resolution]) / resolution
    variance_hot_volume[k:k + resolution] = sum(variance_hot_volume[k:k + resolution]) / resolution
    average_gp_volume[k:k + resolution] = sum(average_gp_volume[k:k + resolution]) / resolution
    variance_gp_volume[k:k + resolution] = sum(variance_gp_volume[k:k + resolution]) / resolution
    k += resolution

# Write out averaged 3-minute data
with open('volume 394.csv', 'ab') as csvfile:
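# A vectorised alternative to the while loop above (a sketch, not the
# original code): block-average a 1-D series into 3-minute bins with
# reshape, assuming the length is a multiple of `resolution`:
import numpy as np

resolution = 6                          # 6 x 30 s = 3 minutes
v = np.arange(24, dtype=float)          # stand-in for average_hot_volume
usable = len(v) // resolution * resolution
binned = v[:usable].reshape(-1, resolution).mean(axis=1)
# one value per 3-minute window, instead of writing the window mean back
# into every 30-second slot
print(binned)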
def stddev(x):
    """ std\{%s\} := Standard deviation of %s """
    from scipy.stats.stats import nanstd
    x = notnone(x)
    return nanstd(x)
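# `notnone` is not defined in this snippet; presumably it drops None
# entries. A minimal stand-in, paired with the numpy replacement for the
# removed scipy.stats.stats.nanstd:
import numpy as np

def notnone(x):
    return [v for v in x if v is not None]

print(np.nanstd(np.asarray(notnone([1.0, None, 2.0, np.nan]), dtype=float)))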
def vpinfo(CL, param, base='bg'):
    """Returns information on the chosen parameter from CloudObj.dttl in the
    cloud, for all vertical scans. The parameter is averaged over the
    particular column of each vertical scan.
    Options:
        param: string containing the title of the parameter as found in
               CloudObj.dttl or CloudObj.extrattl.
        base: method used to find the cloud base and top. Default is the
              best-guess method (defBGheight) base='bg'; use base='4point'
              for the 4-point method (defheight).
    Returns H:
        H["bottom"]: parameter at the cloud base
        H["top"]: parameter at the cloud top
        H["mean"]: mean parameter through the cloud
        H["median"]: median parameter through the cloud
        H["minimum"]: minimum parameter through the cloud
        H["maximum"]: maximum parameter through the cloud
        H["stdev"]: standard deviation of the parameter through the cloud
        H["delta"]: difference of parameter between the bottom and the top
        H["slope"]: delta divided by the mean thickness
        H["units"]: units of the parameter
    """
    if not isinstance(param, str):
        param = str(param)
    H = dict()
    altp = [i for i, x in enumerate(CL.dttl) if x == 'altitude'][0]
    tim = [i for i, x in enumerate(CL.dttl) if x == 'time'][0]
    T = [i for i, x in enumerate(CL.dttl) if x.lower() == param.lower()]
    if len(T) == 1:
        T = T[0]
        Td = CL.data[T]
        Tunits = CL.dunit[T]
        alt = CL.data[altp]
        ta = CL.data[tim]
    elif len(T) > 1:
        print("[vpinfo] Parameter %s was found multiple times in the basic data." % (param))
        return dict()
    else:
        posx = []
        for i, ttl in enumerate(CL.extrattl):  # for all extra datasets available
            # all titles matching the requested parameter
            posx = posx + [[i, j] for j, x in enumerate(ttl) if x.lower() == param.lower()]
        if len(posx) == 1:
            Td = CL.extradata[posx[0][0]][posx[0][1]]  # loading the data
            Tunits = CL.extraunit[posx[0][0]][posx[0][1]]
            j = [j for j, x in enumerate(CL.extrattl[posx[0][0]]) if x.lower() == 'time'][0]
            Tt = CL.extradata[posx[0][0]][j]  # loading the associated time stamp
            # adapting for data too short to interpolate
            if len(Tt) < 2:
                Td = np.ones((2,)) * NaN
                Tt = np.array([CL.times["cloud"][0][0], CL.times["cloud"][0][1]])
            # adapting the time vector to a common time vector
            ta1 = np.max([CL.data[tim][0], Tt[0]])
            ta2 = np.min([CL.data[tim][-1], Tt[-1]])
            ta = CL.data[tim][nonzero((CL.data[tim] >= ta1) * (CL.data[tim] <= ta2))[0]]
            alt = CL.data[altp][nonzero((CL.data[tim] >= ta1) * (CL.data[tim] <= ta2))[0]]
            fT = interpolate.interp1d(Tt, Td, kind='linear')
            Td = fT(ta)
        else:
            print("[vpinfo] No or multiple %s found in the basic or the extra data." % (param))
            return dict()
    for key in ["bottom", "top", "mean", "median", "stdev", "delta", "slope",
                "units", "minimum", "maximum"]:
        H[key] = list()
    try:
        for i in range(len(CL.times["verticloud"])):
            t1 = CL.times["verticloud"][i][0]
            t2 = CL.times["verticloud"][i][1]
            if base == '4point':
                cb = CL.props["height"][i][1]
                ct = CL.props["height"][i][2]
            else:
                cb = CL.props["BGheight"][i][0]
                ct = CL.props["BGheight"][i][1]
            ix = nonzero((alt >= cb) * (alt <= ct) * (ta >= t1) * (ta <= t2))[0]
            if len(ix) == 0:
                for key in ["mean", "median", "stdev", "minimum", "maximum",
                            "top", "bottom", "delta", "slope", "units"]:
                    H[key].append(nan)
            else:
                H["mean"].append(float(st.nanmean(Td[ix])))
                H["median"].append(float(st.nanmedian(Td[ix])))
                H["stdev"].append(float(st.nanstd(Td[ix])))
                H["minimum"].append(float(np.nanmin(Td[ix])))
                H["maximum"].append(float(np.nanmax(Td[ix])))
                if base == '4point':
                    topix = nonzero((alt >= ct) * (alt <= CL.props["height"][i][3]) * (ta >= t1) * (ta <= t2))[0]
                    H["top"].append(nan if len(topix) == 0 else float(st.nanmedian(Td[topix])))
                    botix = nonzero((alt >= CL.props["height"][i][0]) * (alt <= cb) * (ta >= t1) * (ta <= t2))[0]
                    H["bottom"].append(nan if len(botix) == 0 else float(st.nanmedian(Td[botix])))
                    H["delta"].append(H["bottom"][i] - H["top"][i])
                    H["slope"].append(H["delta"][i] / (np.mean([ct, CL.props["height"][i][3]]) - np.mean([CL.props["height"][i][0], cb])))
                else:
                    R = 10  # plus/minus R meters around the cloud top/base
                    topix = nonzero((alt >= ct - R) * (alt <= ct + R) * (ta >= t1) * (ta <= t2))[0]
                    H["top"].append(nan if len(topix) == 0 else float(st.nanmedian(Td[topix])))
                    botix = nonzero((alt >= cb - R) * (alt <= cb + R) * (ta >= t1) * (ta <= t2))[0]
                    H["bottom"].append(nan if len(botix) == 0 else float(st.nanmedian(Td[botix])))
                    H["delta"].append(H["bottom"][i] - H["top"][i])
                    H["slope"].append(float(H["delta"][i] / (ct - cb)))
                H["units"].append(Tunits)
            del ix
    except:
        if base == '4point':
            print("[vpinfo] Height properties must be defined first using the defheight method.")
        else:
            print("[vpinfo] Height properties must be defined first using the defBGheight method.")
    return H