import numpy as np

def Bdist(*args):
    '''binary distance matrix:
    Bdist(X)      - return matrix of size (len(X), len(X)), all True
    Bdist(X1, X2) - return matrix of size (len(X1), len(X2)) whose (i, j) entry combines
                    x1_i and x2_j (elementwise AND, as in the original code; the original
                    docstring described it as (xi == xj))'''
    if len(args) == 1:
        X = args[0]
        # single-argument form: every pair counts as a match, i.e. all True
        return np.ones((len(X), len(X)), dtype=bool)
    X = args[0]
    Y = args[1]
    # np.repmat does not exist; np.tile is NumPy's equivalent of MATLAB's repmat
    A = np.tile(np.asarray(X).reshape(-1, 1), (1, len(Y)))
    B = np.tile(np.asarray(Y).reshape(1, -1), (len(X), 1))
    return A & B
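# A minimal sketch (not from the original source) of the repmat issue that recurs in
# these snippets: plain NumPy has no top-level repmat, so calls like N.repmat/np.repmat
# fail. The usual substitutes for MATLAB's repmat are np.tile (repeat an array along
# axes) and np.full (fill a new array with one value), as used in Bdist above.
import numpy as np

x = np.array([1, 0, 1], dtype=bool)
y = np.array([1, 1, 0, 1], dtype=bool)

A = np.tile(x.reshape(-1, 1), (1, y.size))   # like repmat(x, 1, len(y)) on a column
B = np.tile(y.reshape(1, -1), (x.size, 1))   # like repmat(y', len(x), 1)
print((A & B).shape)                         # (3, 4), the shape Bdist(x, y) would return
print(np.full((2, 3), -np.inf))              # like repmat(-Inf, 2, 3)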
import numpy as np

def SIMPLEmultinomLnL(c, alpha, presTime, recTime, J, k):
    '''
    ARGS
     c:        a parameter of SIMPLE
     alpha:    a parameter of SIMPLE
     presTime: time separating onset of words during encoding
     recTime:  time of separation during retrieval
     J:        length of the list (i.e., how many words?)
     k:        a matrix where each row i corresponds to an output position, and element j
               in the row gives the number of times item j was recalled at position i
    RETURNS
     lnL: the multinomial log-likelihood of the recall counts at each output position
    '''
    lnL = np.zeros(J)
    # np.repmat does not exist; np.full builds the constant vectors MATLAB's repmat produced
    Ti = np.cumsum(np.full(J, presTime))            # encoding times of the J items
    Tr = Ti[-1] + np.cumsum(np.full(J, recTime))    # retrieval times of the J output positions
    for i in range(J):  # i indexes output + probe position
        M = np.log(Tr[i] - Ti)                      # log temporal distances at retrieval
        eta = np.exp(-c * np.abs(M[i] - M) ** alpha)  # similarity of each item to the probe
        pall = eta / np.sum(eta)                    # predicted recall probabilities
        lnL[i] = np.sum(k[i, :] * np.log(pall))
    return lnL
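# Hypothetical usage sketch for SIMPLEmultinomLnL as corrected above. The parameter
# values and the recall-count matrix k below are invented for illustration (they are
# not from the original source): a 5-item list, 1 s presentation rate, 2 s recall rate.
import numpy as np

J = 5
k = np.eye(J) * 10 + 1            # mostly-correct recall counts; row = output position
lnL = SIMPLEmultinomLnL(c=10.0, alpha=1.0, presTime=1.0, recTime=2.0, J=J, k=k)
print(lnL)                        # per-position log-likelihood terms
print(np.sum(lnL))                # total log-likelihood of the counts in k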
import numpy as np

def MeshGrid(xs, ys):
    '''imitate MATLAB's meshgrid to generate a 2d mesh'''
    # the original used local helpers arr/col/repmat; plain NumPy equivalents are used here
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    X = np.tile(xs, (ys.size, 1))                  # each row is a copy of xs
    Y = np.tile(ys.reshape(-1, 1), (1, xs.size))   # each column is a copy of ys
    return (X, Y)
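# Illustrative check (not from the original source) that MeshGrid above matches NumPy's
# own np.meshgrid with the default 'xy' indexing.
import numpy as np

X, Y = MeshGrid([0, 1, 2], [10, 20])
Xr, Yr = np.meshgrid([0, 1, 2], [10, 20])
print(np.array_equal(X, Xr), np.array_equal(Y, Yr))   # True True
print(X.shape)                                        # (2, 3): len(ys) rows, len(xs) cols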
import copy
import numpy as np

def initializeBPmessage(BPalg, nodePot):
    G = BPalg['G']
    if BPalg['options']['eqnStates']:
        # When all nodes have the same number of states ('maxState') both approaches are
        # equivalent, but for speed we use this uniform initialization.
        unif_msg = np.ones((G['maxState'], 2 * G['nEdges'])) / G['maxState']
        BPalg['init_message'] = copy.copy(unif_msg)  # copy, so later updates do not alias
        if BPalg['options']['verbose']:
            print('Initialized messages from all nodes to their respective neighbors.')
    else:
        # Variable number of states, or different initial messages for each node.
        BPalg['init_message'] = np.zeros((G['maxState'], 2 * G['nEdges']))
        nis = G['Edges'][:, 0]
        njs = G['Edges'][:, 1]
        eij = np.arange(G['nEdges'])        # columns for messages from node i to j
        eji = eij + G['nEdges']             # columns for messages from node j to i
        # MATLAB original:
        # Mij = arrayfun(@(s) [repmat(1/s,s,1) ; zeros(G.maxState-s,1)],G.nStates(njs),'UniformOutput',false)
        # Mji = arrayfun(@(s) [repmat(1/s,s,1) ; zeros(G.maxState-s,1)],G.nStates(nis),'UniformOutput',false)
        # i.e. each column is uniform over the first nState entries and zero up to maxState.
        for e in range(G['nEdges']):
            sj = G['nState'][njs[e]]
            si = G['nState'][nis[e]]
            BPalg['init_message'][:sj, eij[e]] = 1.0 / sj   # from node i to j
            BPalg['init_message'][:si, eji[e]] = 1.0 / si   # from node j to i
        if BPalg['options']['verbose']:
            print('Initialized messages from all nodes to their respective neighbors.')
    # use b = copy.copy(a) instead of 'b = a' so the message buffers do not share memory
    BPalg['new_message'] = copy.copy(BPalg['init_message'])
    BPalg['old_message'] = copy.copy(BPalg['init_message'])
    # When initialized this is set to zero so the iteration runs more than once.
    BPalg['convergence'] = 0
    return BPalg
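# A minimal, made-up graph structure sufficient to exercise the uniform branch of
# initializeBPmessage above. The dictionary layout mirrors the fields the function
# reads; the field names and values here are assumptions for illustration, not an
# official API.
import numpy as np

G = {'maxState': 3, 'nEdges': 2,
     'Edges': np.array([[0, 1], [1, 2]]),
     'nState': np.array([3, 3, 3])}
BPalg = {'G': G, 'options': {'eqnStates': True, 'verbose': True}}
BPalg = initializeBPmessage(BPalg, nodePot=None)
print(BPalg['init_message'].shape)   # (3, 4): maxState x 2*nEdges, each column uniform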
def logprob(self, mc, S):
    # assumes numpy is imported at module level as np
    S = np.asarray(S)
    if S.size == 0:
        return []
    if np.any(S < 0) or np.any(S != np.round(S)):
        # non-integer or negative state labels have zero probability under every chain
        return np.full(len(mc), -np.inf)
    S = S.astype(int)
    lP = np.zeros(len(mc))
    fromS = S[:-1]
    toS = S[1:]
    for i in range(len(mc)):
        if S[0] >= len(mc[i].initialProb):
            lP[i] = -np.inf
            continue
        lP[i] = np.log(mc[i].initialProb[S[0]])
        if fromS.size:
            if np.max(fromS) >= mc[i].getNStates() or S[-1] >= mc[i].transitionProb.shape[1]:
                lP[i] = -np.inf
            else:
                # MATLAB used sub2ind; NumPy fancy indexing picks the same transition entries
                lP[i] = lP[i] + np.sum(np.log(mc[i].transitionProb[fromS, toS]))
    return lP
def __init__(self, pMass):
    # assumes numpy is imported at module level as np
    self.pD = dict()
    if isinstance(pMass, DiscreteD):
        self.pD['probMass'] = pMass
    elif pMass.ndim == 1 or pMass.shape[0] == 1 or pMass.shape[1] == 1:
        # a single probability-mass vector
        self.pD['probMass'] = pMass
    else:
        # one distribution per row; MATLAB's repmat of a struct array becomes a list of dicts here
        self.pD = [{'probMass': pMass[i, :].T} for i in range(pMass.shape[0])]
import numpy as np
# external dependency: mkcovs (project module providing mkcov_ASDfactored)

def compLSsuffstats_fourier(x, y, dims, minlens, nxcirc=None, condthresh=1e8):
    # Compute least-squares regression sufficient statistics in DFT basis
    # Python version of this NOT complete 9/15/17!
    #
    # [dd,wwnrm,Bfft] = compLSsuffstats_fourier(x,y,dims,minlens,nxcirc,condthresh)
    #
    # INPUT:
    # -----
    #          x [n x p] - stimulus, where each row vector is the spatial stim at a single time
    #       dims [m x 1] - number of coefficients along each stimulus dimension
    #    minlens [m x 1] - minimum length scale for each dimension (can be scalar)
    #     nxcirc [m x 1] - circular boundary in each stimulus dimension (minimum is dims) OPTIONAL
    # condthresh [1 x 1] - condition number for thresholding small eigenvalues OPTIONAL
    #                      (default 1e8; condition number on the prior covariance)
    #
    # OUTPUT:
    # ------
    #    dd (struct)   - carries sufficient statistics for linear regression
    #  wwnrm [nf x 1]  - squared "effective frequencies" in vector form for each dim
    #   Bfft {1 x p}   - cell array with DFT bases for each dimension

    dims = np.array(np.reshape(dims, (1, -1)))
    minlens = np.array(np.reshape(minlens, (1, -1)))

    # Set circular boundary (for n-point fft) to avoid edge effects, if needed
    if nxcirc is None:
        # MATLAB: nxcirc = ceil(max([dims(:)'+minlens(:)'*4; dims(:)'*1.25]))'
        nxcirc = np.ceil(np.max(np.concatenate((dims + minlens * 4, dims * 1.25), axis=0), axis=0))

    nd = np.size(dims)          # number of filter dimensions
    if np.size(minlens) == 1:   # make vector out of minlens, if necessary
        minlens = np.tile(minlens, (nd, 1))

    # Determine number of freqs and make Fourier basis for each dimension
    cdiagvecs = [None for _ in range(nd)]  # eigenvalues for each dimension
    Bfft = [None for _ in range(nd)]       # Fourier basis matrix for each filter dimension
    wvecs = [None for _ in range(nd)]      # Fourier frequencies for each filter dimension
    ncoeff = np.zeros([nd, 1])
    # fprintf('\ncompLSsuffstats_fourier:\n # filter freqs per stimulus dim:');

    # Loop through dimensions
    for jj in np.arange(nd):
        # careful here: the mkcov_ASDfactored function uses minlens and 1 as the lensc and rho params
        prs = [[minlens[0][jj], 1], dims[0][jj]]
        cdiagvecs[jj], Bfft[jj], wvecs[jj] = mkcovs.mkcov_ASDfactored(
            prs, nxcirc[jj], condthresh, compfftbasis=1)
        ncoeff[jj] = len(cdiagvecs[jj])    # number of coeffs
import numpy as np

def rquad(N, k):
    '''Recurrence-based quadrature nodes and weights on [0, 1] (translated from MATLAB).'''
    k1 = k + 1
    k2 = k + 2
    n = np.arange(1, N + 1)
    nnk = 2 * n + k
    A = np.full(N, float(k) ** 2) / (nnk * (nnk + 2))
    A = np.append([float(k) / k2], A)
    n = np.arange(2, N + 1)
    nnk = nnk[1:]
    B1 = 4.0 * k1 / (k2 * k2 * (k + 3))
    nk = n + k
    nnk2 = nnk * nnk
    B = 4.0 * (n * nk) ** 2 / (nnk2 * nnk2 - nnk2)
    ab = np.append([2.0 ** k1 / k1, B1], B)
    s = np.sqrt(ab[1:N])
    # The Jacobi matrix is symmetric; eigh returns real eigenvalues (nodes) in ascending
    # order and orthonormal eigenvectors whose first components give the weights.
    d, v = np.linalg.eigh(np.diag(A[0:N]) + np.diag(s, -1) + np.diag(s, +1))
    x = (d + 1) / 2                          # map nodes from [-1, 1] to [0, 1]
    w = (0.5 ** k1) * ab[0] * v[0, :] ** 2
    return x, w
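# Illustrative check of rquad above (assuming, as the construction suggests, that it
# returns Gauss-type nodes/weights for the weight function x**k on [0, 1]): the weights
# should then sum to integral(x**k, 0, 1) = 1/(k+1), and an N-point rule should integrate
# low-order polynomials against that weight exactly.
import numpy as np

N, k = 4, 2
x, w = rquad(N, k)
print(np.isclose(np.sum(w), 1.0 / (k + 1)))           # True: integral of x**2 over [0, 1]
print(np.isclose(np.sum(w * x ** 3), 1.0 / (k + 4)))  # True: integral of x**2 * x**3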
import os

import numpy as np
import netCDF4 as nc   # assumed alias; the original only shows nc.Dataset
# project-local helpers assumed importable: load_grid, utils

def extract_from_file(file='', varname='zeta', extraction_type='full', **kwargs):
    # Check the netCDF file for existence and that the variable is in it
    if not os.path.exists(file):
        raise IOError('File %s could not be located on the filesystem' % file)
    ncf = nc.Dataset(file, mode='r')
    if varname not in ncf.variables:
        raise IOError('File %s does not have a variable named %s' % (file, varname))

    # start getting data
    ncvar = ncf.variables[varname]
    dims = ncvar.dimensions
    ndims = len(dims)
    shape = ncvar.shape
    # print('var: %s, dims: %s, shape: %s' % (varname, str(dims), str(shape)))
    if not ndims == 3 and not ndims == 4:
        raise TypeError('ndims is neither 3 nor 4')
    if not dims[0] == 'ocean_time':
        raise TypeError('first dimension is not ocean_time')
    if not shape[0] == 1:
        raise TypeError('first dimension is not of length one')

    grid = load_grid.load_grid(file)
    coords = {}

    if ndims == 3:
        if dims[1] == 'eta_rho' and dims[2] == 'xi_rho':
            # print('G.lat %s' % str(grid['lat']))
            # print('G.lon %s' % str(grid['lon']))
            y2 = grid['lat'][:]
            x2 = grid['lon'][:]
            mask2 = grid['mask'][:]
        elif dims[1] == 'eta_u' and dims[2] == 'xi_u':
            y2 = grid['latu'][:]
            x2 = grid['lonu'][:]
            mask2 = grid['masku'][:]
        elif dims[1] == 'eta_v' and dims[2] == 'xi_v':
            y2 = grid['latv'][:]
            x2 = grid['lonv'][:]
            mask2 = grid['maskv'][:]
        else:
            raise TypeError('Unable to determine which grid to use')

        if extraction_type == 'full' or extraction_type == 'surface':
            # data = np.squeeze(ncvar[:])
            # data[mask2==0] = np.NaN
            data = np.ma.array(np.squeeze(ncvar[:]), mask=(mask2 == 0))
            coords['ym'] = y2
            coords['xm'] = x2

        if (extraction_type in ('profile', 'profiles', 'point', 'points')) and \
                ('y' in kwargs and 'x' in kwargs):
            print('In profiles')
            xm = np.array(kwargs['x'])
            ym = np.array(kwargs['y'])
            if not (xm.ndim == ym.ndim) or not (xm.shape == ym.shape):
                # broadcast a scalar coordinate to the other array's shape
                # (np.repmat does not exist; np.tile is the NumPy equivalent)
                if xm.size == 1:
                    xm = np.tile(xm, ym.shape)
                elif ym.size == 1:
                    ym = np.tile(ym, xm.shape)
                # else:
                #     raise RuntimeError('The x and y chosen to extract a point or profile on this 2D variable are incompatible.')
            data2 = np.ma.array(np.squeeze(ncvar[:]), mask=(mask2 == 0))
            # data = utils.interp_2d(lat=y2,lon=x2,data=data2,lati=ym,loni=xm)
            mask = utils.interp_2d_xy(y=y2, x=x2, data=mask2, yi=ym, xi=xm)
            data = utils.interp_2d_xy(y=y2, x=x2, data=data2, yi=ym, xi=xm)
            data = np.ma.array(data, mask=(mask < 1))
            coords['ym'] = ym
            coords['xm'] = xm

    if ndims == 4:
        xm = []
        ym = []
        zm = []
        K = shape[1]
        J = shape[2]
        I = shape[3]
        lat = grid['lat'][:]
        lon = grid['lon'][:]
        if dims[2] == 'eta_rho' and dims[3] == 'xi_rho':
            y2 = lat
            x2 = lon
            mask2 = grid['mask'][:]
        elif dims[2] == 'eta_u' and dims[3] == 'xi_u':
            y2 = grid['latu'][:]
            x2 = grid['lonu'][:]
            mask2 = grid['masku'][:]
        elif dims[2] == 'eta_v' and dims[3] == 'xi_v':
            y2 = grid['latv'][:]
            x2 = grid['lonv'][:]
            mask2 = grid['maskv'][:]
        else:
            raise TypeError('Unable to determine which grid to use. dims[2] = %s, dims[3] = %s'
                            % (dims[2], dims[3]))

        if dims[1] == 's_rho':
            cs = grid['cs'][:]
        elif dims[1] == 's_w':
            cs = grid['csw'][:]
        elif K == 1:
            cs = 0
        else:
            raise TypeError('Unable to determine which cs to use. dims[1] = %s' % dims[1])

        data = []
        if extraction_type == 'full':
            # get zeta
            try:
                if dims[2] == 'eta_rho' and dims[3] == 'xi_rho':
                    zeta2 = ncf.variables['zeta'][:]
                else:
                    zeta2 = utils.interp_2d_xy(y=lat, x=lon, data=ncf.variables['zeta'][:], yi=y2, xi=x2)
            except Exception as e:
                print(e)
                zeta2 = np.zeros((len(y2), len(x2)))
            zeta2[zeta2 > 1000] = 0

            # get H
            if dims[2] == 'eta_rho' and dims[3] == 'xi_rho':
                H2 = grid['H'][:]
            else:
                H2 = utils.interp_2d_xy(y=lat, x=lon, data=grid['H'][:], yi=y2, xi=x2)

            x3 = np.tile(x2.reshape(1, J, I), (K, 1, 1))
            y3 = np.tile(y2.reshape(1, J, I), (K, 1, 1))
            mask3 = np.tile(mask2.reshape(1, J, I), (K, 1, 1))
            zeta3 = np.tile(zeta2.reshape(1, J, I), (K, 1, 1))
            H3 = np.tile(H2.reshape(1, J, I), (K, 1, 1))
            cs3 = np.tile(cs.reshape(K, 1, 1), (1, J, I))
            z3 = zeta3 + cs3 * (zeta3 + H3)
            zm = z3
            ym = y3
            xm = x3
            data = np.ma.array(np.squeeze(ncvar[:]), mask=(mask3 == 0))
        elif extraction_type == 'surface':
            # get zeta
            try:
                if dims[2] == 'eta_rho' and dims[3] == 'xi_rho':
                    zeta2 = ncf.variables['zeta'][:]
                else:
                    zeta2 = utils.interp_2d_xy(y=lat, x=lon, data=ncf.variables['zeta'][:], yi=y2, xi=x2)
            except Exception as e:
                print(e)
                zeta2 = np.zeros((len(y2), len(x2)))
            zeta2[zeta2 > 1000] = 0
            data = np.ma.array(np.squeeze(ncvar[0, K - 1, :, :]), mask=(mask2 == 0))
            zm = zeta2
            ym = y2
            xm = x2
import numpy as np
# project-local helpers assumed importable: rffb (comp_wvec, realfftbasis), kron_ops (kronmult)

def conv_fourier(x, dims, minlens, nxcirc=None, condthresh=1e8):
    # Version of this NOT complete for higher dimensions 9/15/17!
    #
    # INPUT:
    # -----
    #          x [D x n x p] - stimulus, where each row vector is the spatial stim at a single time,
    #                          D is the number of batches
    #       dims [m x 1] - number of coefficients along each stimulus dimension
    #    minlens [m x 1] - minimum length scale for each dimension (can be scalar)
    #     nxcirc [m x 1] - circular boundary in each stimulus dimension (minimum is dims) OPTIONAL
    # condthresh [1 x 1] - condition number for thresholding small eigenvalues OPTIONAL
    #                      (default 1e8; condition number on the prior covariance)
    #
    # OUTPUT:
    # ------
    #    Bx              - output data, x, in the Fourier domain
    #  wwnrm [nf x 1]    - squared "effective frequencies" in vector form for each dim (normalized)
    #   Bfft {1 x p}     - DFT bases for each dimension (list of numpy arrays)

    dims = np.array(np.reshape(dims, (1, -1)))
    minlens = np.array(np.reshape(minlens, (1, -1)))

    # Set circular boundary (for n-point fft) to avoid edge effects, if needed
    if nxcirc is None:
        # MATLAB: nxcirc = ceil(max([dims(:)'+minlens(:)'*4; dims(:)'*1.25]))'
        nxcirc = np.ceil(np.max(np.concatenate((dims + minlens * 4, dims), axis=0), axis=0))

    nd = np.size(dims)                       # number of filter dimensions
    if np.size(minlens) == 1 and nd != 1:    # make vector out of minlens, if necessary
        minlens = np.tile(minlens, (nd, 1))

    # None of these quantities depend on the data directly
    wvecs = [rffb.comp_wvec(nxcirc[jj], minlens[0][jj], condthresh) for jj in np.arange(nd)]
    # cdiagvecs = [mkcovs.mkcovdiag_ASD(minlens[jj], 1, nxcirc[jj], np.square(wvecs[jj])) for jj in np.arange(nd)]
    Bffts = [rffb.realfftbasis(dims[0][jj], nxcirc[jj], wvecs[jj])[0] for jj in np.arange(nd)]
    # fprintf('\n Total # Fourier coeffs represented: %d\n\n', prod(ncoeff));

    def f(switcher):
        # switch based on stimulus dimension; only the 1-D case is implemented so far
        return {
            1:  # 1-dimensional stimulus
                [np.square(2 * np.pi / nxcirc[0]) * np.square(wvecs[0]),   # normalized wvec
                 np.ones([np.size(wvecs[0]), 1]) == 1],                    # indices to keep
            # 2: 2-dimensional stimulus (MATLAB original, not yet ported):
            #    Cdiag = kron(cdiagvecs{2},cdiagvecs{1});
            #    ii = (Cdiag/max(Cdiag)) > 1/condthresh;  % indices to keep
            #    [ww1,ww2] = ndgrid(wvecs{1},wvecs{2});   % normalized frequencies squared
            #    wwnrm = [(ww1(ii)*(2*pi/nxcirc(1))).^2  (ww2(ii)*(2*pi/nxcirc(2))).^2];
            # 3: 3-dimensional stimulus (MATLAB original, not yet ported):
            #    Cdiag = kron(cdiagvecs{3},kron(cdiagvecs{2},cdiagvecs{1}));
            #    ii = (Cdiag/max(Cdiag)) > 1/condthresh;
            #    [ww1,ww2,ww3] = ndgrid(wvecs{1},wvecs{2},wvecs{3});
            #    wwnrm = [(ww1(ii)*(2*pi/nxcirc(1))).^2, (ww2(ii)*(2*pi/nxcirc(2))).^2, (ww3(ii)*(2*pi/nxcirc(3))).^2];
            # otherwise: error('compLSsuffstats_fourier.m : doesn''t yet handle %d dimensional filters\n', nd);
        }[switcher]

    try:
        wwnrm, ii = f(nd)
    except KeyError:
        print('\n\n Does not handle values of dimension', nd, 'yet')

    # Calculate stimulus sufficient stats in Fourier domain.
    # The single-batch branch of the original is kept as a comment; the batched path below
    # handles x of shape [D x n x p] (originally this used the transpose operation kronmulttrp,
    # so there may be a transpositional issue).
    # if x.shape[0] == 1:
    #     Bx = kron_ops.kronmult(Bffts, np.transpose(x))   # convert to Fourier domain
    #     Bx = Bx[ii]                                       # prune unneeded freqs
    Bx = [kron_ops.kronmult(Bffts, np.transpose(batch)) for batch in x]   # convert to Fourier domain
    Bx = [prune[ii] for prune in Bx]                                      # prune unneeded freqs
    return Bx[0], wwnrm, Bffts[0], nxcirc
import datetime
import os
import random

import numpy as np
from scipy import stats
# project-local helpers assumed importable: family_of_curves, scale_data, scramble_dependent_variable

def preprocessing_setup(data, analysis_settings):
    """ Performs sanity checks on the input data and the algorithm parameter struct.
    Massages the data (i.e. drop outliers, zscore data, etc.).

    **Arguments**:
    - data: Input data matrix (total number of trials x 6 columns)
    - analysis_settings: Struct with algorithm parameters

    **Returns**:
    - data: Input data matrix (if applicable, outlier free, zscored, category specific data only, etc.)
    - analysis_settings: Struct with algorithm parameters; some additional parameters are added to this struct as well
    """
    print('********** START OF MESSAGES **********')

    # Checks if the data matrix has 6 columns
    number_of_columns = np.shape(data)[1]
    if number_of_columns != 6:
        raise ValueError('Incorrect number of columns ({}) in the input matrix!'.format(number_of_columns))

    # Registering which column in the data matrix is carrying which piece of information
    if (not ('data_matrix_columns' in analysis_settings)) or (not analysis_settings['data_matrix_columns']):
        # Setting it to the default
        analysis_settings['data_matrix_columns'] = {}
        analysis_settings['data_matrix_columns']['subject_id'] = 0
        analysis_settings['data_matrix_columns']['trials'] = 1
        analysis_settings['data_matrix_columns']['category'] = 2
        analysis_settings['data_matrix_columns']['predictor_var'] = 3
        analysis_settings['data_matrix_columns']['dependent_var'] = 4
        analysis_settings['data_matrix_columns']['net_effect_clusters'] = 5
    subject_id_column = analysis_settings['data_matrix_columns']['subject_id']
    trials_column = analysis_settings['data_matrix_columns']['trials']
    category_column = analysis_settings['data_matrix_columns']['category']
    predictor_var_column = analysis_settings['data_matrix_columns']['predictor_var']
    dependent_var_column = analysis_settings['data_matrix_columns']['dependent_var']
    net_effect_clusters_column = analysis_settings['data_matrix_columns']['net_effect_clusters']

    # Checks if the number of EM iterations is specified; if not specified then it is set to a default of 20
    if (not ('em_iterations' in analysis_settings)) or (analysis_settings['em_iterations'] <= 0):
        analysis_settings['em_iterations'] = 20
        print('Missing number of iterations! It is set to a default of {}'.format(analysis_settings['em_iterations']))

    # Checks if the no. of particles is specified; if not specified then it is set to a default of 100,000
    if (not ('particles' in analysis_settings)) or (analysis_settings['particles'] <= 0):
        analysis_settings['particles'] = 100000
        print('Missing number of particles! It is set to a default of {}'.format(analysis_settings['particles']))

    # Checks if the family of curves is specified; if not then set to 'horz_indpnt' (refer to family of curves)
    if (not ('curve_type' in analysis_settings)) or (not analysis_settings['curve_type']):
        analysis_settings['curve_type'] = 'horz_indpnt'
        print('Missing family of curves! It is set to a default of {}'.format(analysis_settings['curve_type']))

    # Checks if the family of curves exists by fetching the number of curve parameters. This is just a sanity check
    if not isinstance(family_of_curves(analysis_settings['curve_type'], 'get_nParams'), int):
        raise ValueError('{} - Does not exist! Check family_of_curves.m script'.format(analysis_settings['curve_type']))

    # Checks if the distribution is specified;
    # if not specified and the dependent variable is binary it's set to 'bernoulli'; otherwise set to 'normal'
    if (not ('distribution' in analysis_settings)) or (not analysis_settings['distribution']):
        if len(np.unique(data[:, dependent_var_column])) == 2:
            analysis_settings['distribution'] = 'bernoulli'
        else:
            analysis_settings['distribution'] = 'normal'
        print('Missing distribution! Based on the dependent variable it is set to {}'.format(analysis_settings['distribution']))

    # Checks if the distribution specific parameters exist
    if (not ('dist_specific_params' in analysis_settings)) or (not analysis_settings['dist_specific_params']):
        if analysis_settings['distribution'] == 'bernoulli':
            # For a Bernoulli dist there are no parameters so it is empty. We still need the struct to exist
            analysis_settings['dist_specific_params'] = {}
        elif analysis_settings['distribution'] == 'normal':
            # For the normal distribution the additional parameter is sigma. We pass in sigma here.
            analysis_settings['dist_specific_params'] = {}
            analysis_settings['dist_specific_params']['sigma'] = 1  # Default is 1
            print('Missing sigma for normal distribution! It is set to {}'.format(analysis_settings['dist_specific_params']['sigma']))

    # Checks if the normal distribution specific parameter is valid, i.e. sigma > 0
    if (analysis_settings['distribution'] == 'normal') and (analysis_settings['dist_specific_params']['sigma'] <= 0):
        raise ValueError('Normal distribution sigma will need to be > 0! sigma = {}'.format(analysis_settings['dist_specific_params']['sigma']))

    # Checks if beta_0 is specified; if not specified then it is set to a default of 0
    if not ('beta_0' in analysis_settings):
        analysis_settings['beta_0'] = 0
        print('Missing initial setting for beta_0! It is set to a default of {}'.format(analysis_settings['beta_0']))

    # Checks if beta_1 is specified; if not specified then it is set to a default of 1
    if not ('beta_1' in analysis_settings):
        analysis_settings['beta_1'] = 1
        print('Missing initial setting for beta_1! It is set to a default of {}'.format(analysis_settings['beta_1']))

    # Checks if tau is specified; if not specified then it is set to a default of 0.05
    if not ('tau' in analysis_settings):
        analysis_settings['tau'] = 0.05
        print('Missing initial setting for tau! It is set to a default of {}'.format(analysis_settings['tau']))

    # Checks if this is a bootstrap run; if not specified then it is set to a default of False
    if not ('bootstrap' in analysis_settings):
        analysis_settings['bootstrap'] = False
        print('Missing initial setting for bootstrap! It is set to a default of {}'.format(analysis_settings['bootstrap']))

    # Checks if the bootstrap flag is boolean
    if not (type(analysis_settings['bootstrap']) == bool):
        raise ValueError('analysis_settings.bootstrap field will need to be boolean!')

    # Checks if this is a scramble run; if not specified then it is set to a default of False
    if not ('scramble' in analysis_settings):
        analysis_settings['scramble'] = False

    # Checks if the scramble flag is boolean
    if not (type(analysis_settings['scramble']) == bool):
        raise ValueError('analysis_settings.scramble field will need to be boolean!')

    # Errors if both bootstrap and scramble flags are set
    if analysis_settings['scramble'] and analysis_settings['bootstrap']:
        raise ValueError('Cannot run both scramble AND bootstrap analyses at the same time! Set any one flag to be false')

    # Builds a bootstrap data matrix from the original data matrix
    if analysis_settings['bootstrap'] and not (analysis_settings['scramble']):
        # We need a bootstrap sample number
        if (not ('bootstrap_run' in analysis_settings)) or (not analysis_settings['bootstrap_run']):
            raise ValueError('Missing bootstrap sample number! Set analysis_settings.bootstrap_run to a valid sample number')

        bootstrap_data = []
        new_cluster_count = 1
        new_subject_count = 1
        # Get the number of subjects from the data matrix
        number_of_subjects = len(np.unique(data[:, subject_id_column]))
        # Randomly sample, with replacement, the number of subjects thus generating our bootstrap sample
        subj_num_with_replacement = random.choices(np.arange(number_of_subjects), k=number_of_subjects)

        # For each subject in our bootstrap sample gather all relevant information
        for i in range(len(subj_num_with_replacement)):
            subj_idx = np.where(data[:, subject_id_column] == subj_num_with_replacement[i])[0]

            # Recreate a new net effect cluster since this will need to be unique in the data matrix
            # (by repeatedly sampling subjects we could be repeating the net effect clusters)
            cluster_vector = data[subj_idx, net_effect_clusters_column]
            cluster_numbers = np.unique(cluster_vector)
            for j in range(len(cluster_numbers)):
                target_idx = np.where(data[subj_idx, net_effect_clusters_column] == cluster_numbers[j])[0]
                cluster_vector[target_idx] = new_cluster_count
                new_cluster_count += 1

            # Recreate a new subject id
            # (by repeatedly sampling subjects we could be repeating the subject id's)
            # Gather all information into a bootstrap_data matrix; the slice is made inclusive of the
            # dependent-var column so the rebuilt rows keep all 6 columns
            bootstrap_data.append(np.concatenate((
                np.full((len(subj_idx), 1), new_subject_count),
                data[subj_idx, trials_column:dependent_var_column + 1],
                cluster_vector.reshape(-1, 1)), axis=1))
            new_subject_count += 1
        # Stack the per-subject blocks into one matrix so the checks below can index columns
        bootstrap_data = np.vstack(bootstrap_data)

        # Perform some sanity checks to ensure that the bootstrap_data matrix is similar to the actual data matrix
        if not np.all(np.shape(bootstrap_data) == np.shape(data)):
            raise ValueError('Size of bootstrap dataset NOT the same as original data!')
        if not (len(np.unique(data[:, net_effect_clusters_column])) == len(np.unique(bootstrap_data[:, net_effect_clusters_column]))):
            raise ValueError('The number of clusters are not the same in the original and bootstrap sample!')
        if not np.array_equal(data[:, subject_id_column], bootstrap_data[:, subject_id_column]):
            raise ValueError('The ordering of subjects are not the same in the original and bootstrap sample!')

        # Store away the bootstrap sample subject information for future reference
        analysis_settings['bootstrap_run_subj_id'] = subj_num_with_replacement
        data = bootstrap_data

    # Checks if analysis will be performed for a specific category; if not then set to [] i.e. NOT category specific
    if not ('category' in analysis_settings):
        analysis_settings['category'] = []
        print('Missing category specific analyses information! We are going to ignore the category dimension i.e. all '
              'trials from all categories will be analysed')

    # If this analysis is to be performed for a specific category then filter out data from other irrelevant categories
    if len(analysis_settings['category']) > 0:
        target_cat_idx = np.array([], dtype=int)
        data_cat = np.unique(data[:, category_column])
        for c in range(len(analysis_settings['category'])):
            cat_exist = np.where(data_cat == analysis_settings['category'][c])[0]
            if cat_exist.size == 0:
                raise ValueError('Category does not exist! You have set analysis_settings.category[{}]={}'.format(
                    c, analysis_settings['category'][c]))
            target_cat_idx = np.concatenate((target_cat_idx, np.where(data[:, category_column] == analysis_settings['category'][c])[0]))
        data = data[target_cat_idx, :]

    # Checks if outliers (i.e. data trials) need to be dropped; if not specified it defaults to dropping
    # trials more than 3 standard deviations from the group mean
    if not ('drop_outliers' in analysis_settings):
        analysis_settings['drop_outliers'] = 3
        print('Missing drop_outliers specific information! We are dropping outliers that are {} standard deviations away from the group mean'.format(
            analysis_settings['drop_outliers']))

    # If this analysis requires outliers dropped, then drop the data trials beyond that many std devs from the GROUP MEAN
    if analysis_settings['drop_outliers'] > 0:
        # NaN's do not qualify as outliers so we filter them out and add them back at the end of this step
        nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))
        # NaN free data
        nan_free_data = data[nan_free_idx, :]
        std_dev_predictor_var = np.std(nan_free_data[:, predictor_var_column], ddof=1) * analysis_settings['drop_outliers']
        mean_predictor_var = np.mean(nan_free_data[:, predictor_var_column])
        predictor_var_idx = (nan_free_data[:, predictor_var_column] > (mean_predictor_var - std_dev_predictor_var)) & \
                            (nan_free_data[:, predictor_var_column] < (mean_predictor_var + std_dev_predictor_var))
        print('{} trials are dropped since they are regarded as outliers'.format(
            np.shape(nan_free_data)[0] - np.sum(predictor_var_idx)))
        nan_free_data_outlier_dropped = nan_free_data[predictor_var_idx, :]
        # NaN trials
        nan_data = data[np.logical_not(nan_free_idx), :]
        # Combine the NaN data with the outlier-free data
        data = np.concatenate((nan_free_data_outlier_dropped, nan_data)) if np.shape(nan_data)[0] > 0 else nan_free_data_outlier_dropped

    # Following 'filter by category' and 'drop outliers', if applicable, we check if the data matrix is empty
    number_of_trials = np.shape(data)[0]
    if number_of_trials <= 0:
        raise ValueError('No input data!')

    # Checks if we need to zscore the predictor var within subjects; if not specified it is set to a default of False
    if not ('zscore_within_subjects' in analysis_settings):
        analysis_settings['zscore_within_subjects'] = False
        print('Missing zscore_within_subjects information! We are NOT zscoring within subjects')

    # Verifies that zscore_within_subjects is boolean
    if not (type(analysis_settings['zscore_within_subjects']) == bool):
        raise ValueError('zscore_within_subjects field will need to be boolean!')

    # Zscore the predictor variable within each subject
    if analysis_settings['zscore_within_subjects']:
        # NaN's do not qualify to be zscored
        nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))
        # NaN free data
        nan_free_data = data[nan_free_idx, :]
        # Get the list of subject ids (used when zscoring the data within each subject, if applicable)
        subject_id_list = np.unique(nan_free_data[:, subject_id_column])
        # We get the number of subjects
        number_of_subjects = len(subject_id_list)
        if number_of_subjects <= 0:
            raise ValueError('Not valid number of subjects!')
        for s in range(number_of_subjects):
            subject_idx = np.where(nan_free_data[:, subject_id_column] == subject_id_list[s])[0]
            nan_free_data[subject_idx, predictor_var_column] = stats.zscore(
                nan_free_data[subject_idx, predictor_var_column], ddof=1)
        print('Predictor variables within each subject are zscored!')
        # NaN trials
        nan_data = data[np.logical_not(nan_free_idx), :]
        # Combine the NaN data with the zscored data
        data = np.concatenate((nan_free_data, nan_data)) if np.shape(nan_data)[0] > 0 else nan_free_data

    # Checks if resolution is specified; if not specified then set to a default of 4. This translates to 1e-4 = 0.0001
    if (not ('resolution' in analysis_settings)) or (analysis_settings['resolution'] <= 0):
        analysis_settings['resolution'] = 4
        print('Missing resolution! It is set to a default of {}'.format(analysis_settings['resolution']))

    # If we have normally distributed data, we want to z-score the dependent variable
    if analysis_settings['distribution'] == 'normal':
        data[:, dependent_var_column] = stats.zscore(data[:, dependent_var_column], ddof=1)

    # We scale the predictor var to be between 0 and 1 and round it to 4 digits
    nan_free_idx = np.logical_not(np.isnan(data[:, predictor_var_column]))
    nan_free_data = data[nan_free_idx, :]
    nan_free_data[:, predictor_var_column] = np.round(
        scale_data(nan_free_data[:, predictor_var_column], 0, 1), analysis_settings['resolution'])
    nan_data = data[np.logical_not(nan_free_idx), :]
    data = np.concatenate((nan_free_data, nan_data)) if np.shape(nan_data)[0] > 0 else nan_free_data

    # Scrambling the data matrix
    if analysis_settings['scramble']:
        if (not ('scramble_run' in analysis_settings)) or (not analysis_settings['scramble_run']):
            raise ValueError('Missing scramble sample number! Set analysis_settings.scramble_run to a valid sample number')
        if (not ('scramble_style' in analysis_settings)) or (not analysis_settings['scramble_style']):
            # most conservative of all scramble techniques
            analysis_settings['scramble_style'] = 'within_subjects_within_categories'
            print('Missing scramble style! It is set to a default of {}'.format(analysis_settings['scramble_style']))

        # We get the list of subject ids
        subject_id_list = np.unique(data[:, subject_id_column])
        # We get the number of subjects in this analysis
        number_of_subjects = len(subject_id_list)
        if number_of_subjects <= 0:
            raise ValueError('Not valid number of subjects!')

        if analysis_settings['scramble_style'] == 'within_subjects_within_categories':
            # Scramble all DVs WHILE respecting the net effect boundaries, subject groupings and category groupings
            categories = np.unique(data[:, category_column])
            for s in range(number_of_subjects):
                for c in range(len(categories)):
                    subject_category_idx = np.where(
                        (data[:, subject_id_column] == subject_id_list[s]) &
                        (data[:, category_column] == categories[c]))[0]
                    if len(subject_category_idx) > 1:
                        data[subject_category_idx, dependent_var_column] = scramble_dependent_variable(
                            data[subject_category_idx, dependent_var_column],
                            data[subject_category_idx, net_effect_clusters_column])
        elif analysis_settings['scramble_style'] == 'within_subjects_across_categories':
            # Scramble all dependent variables WHILE respecting the net effect boundaries and subject groupings
            for s in range(number_of_subjects):
                subject_idx = np.where(data[:, subject_id_column] == subject_id_list[s])[0]
                if len(subject_idx) > 1:
                    data[subject_idx, dependent_var_column] = scramble_dependent_variable(
                        data[subject_idx, dependent_var_column],
                        data[subject_idx, net_effect_clusters_column])
        elif analysis_settings['scramble_style'] == 'across_subjects_across_categories':
            # Scramble all dependent variables WHILE respecting the net effect boundaries
            all_idx = np.arange(np.shape(data)[0])
            if len(all_idx) > 1:
                data[all_idx, dependent_var_column] = scramble_dependent_variable(
                    data[all_idx, dependent_var_column],
                    data[all_idx, net_effect_clusters_column])
        else:
            raise ValueError('Invalid analysis_settings.scramble_style={}'.format(analysis_settings['scramble_style']))

    # Our data matrix looks like data = [subject id, item, category, predictor var, dependent var, net effect cluster]
    # We verify that the subject id and dependent var columns are unique for the net effect clusters.
    # Below is an example of a valid data matrix (note the dependent variable is unique within net effect cluster 111):
    # data(1, :) = [24, 1, 1, 0.3333, 0, 111]
    # data(2, :) = [24, 2, 2, 0.2222, 0, 111]
    # data(3, :) = [24, 3, 1, 0.4444, 0, 111]
    # Below is an example of an invalid data matrix (the dependent variable is not unique within net effect cluster 111):
    # data(1, :) = [24, 1, 1, 0.3333, 0, 111]
    # data(2, :) = [24, 2, 2, 0.2222, 1, 111]
    # data(3, :) = [24, 3, 1, 0.4444, 0, 111]

    # Fetching the net effect clusters
    net_effect_clusters = np.unique(data[:, net_effect_clusters_column])
    analysis_settings['net_effect_clusters'] = net_effect_clusters

    # If net effect clusters exist, verify that the subject id and dependent variable are unique for those clusters
    if len(net_effect_clusters) != np.shape(data)[0]:
        for i in range(len(net_effect_clusters)):
            cluster_idx = np.where(data[:, net_effect_clusters_column] == net_effect_clusters[i])[0]
            # each cluster must collapse to exactly one (subject id, dependent var) pair
            if np.unique(data[np.ix_(cluster_idx, [subject_id_column, dependent_var_column])], axis=0).shape[0] != 1:
                raise ValueError('Subject Id and/or dependent variable not unique for net effect cluster {}! '
                                 'Check the data matrix'.format(net_effect_clusters[i]))
    else:
        # If net effect clusters DO NOT exist then we treat each row as a net effect cluster by itself
        print('Each row will be treated separately. We will NOT be computing the net effect of any rows')

    # We create an analysis id unique to this analysis
    if (not ('analysis_id' in analysis_settings)) or (not analysis_settings['analysis_id']):
        time = datetime.datetime.now()
        analysis_settings['analysis_id'] = '{}-{}-{}-{}-{}'.format(time.month, time.day, time.hour, time.minute, time.second)

    # We create a results directory if no specific target directory is mentioned
    if (not ('target_dir' in analysis_settings)) or (not analysis_settings['target_dir']):
        results_dir = os.path.join(os.getcwd(), 'results')
        if not os.path.isdir(results_dir):
            os.mkdir(results_dir)
        analysis_settings['target_dir'] = results_dir

    # target_directory = 'results/analysis_id'
    analysis_settings['target_dir'] = os.path.join(analysis_settings['target_dir'], analysis_settings['analysis_id'])
    if not os.path.isdir(analysis_settings['target_dir']):
        os.mkdir(analysis_settings['target_dir'])

    # Due to memory constraints we perform two chunking tricks.
    # Chunking trick I
    # In the curve fitting algorithm we need to compute p(current iteration curves | previous iteration curves).
    # This matrix is huge when the number of particles (curves) is large, say 100,000. Even with 8 GB of RAM
    # dedicated to Matlab, we still get out-of-memory errors. To avoid this problem we chunk the matrix into
    # smaller, more manageable matrices. Setting the chunk size to particles x 0.05 -> 100,000 x 0.05 = 5000
    # translates to p(current iteration curves (5000 curves at a time) | previous iteration curves).
    analysis_settings['wgt_chunks'] = analysis_settings['particles'] * 0.05
    # If the chunk size is less than 5000 we set it to be the number of particles itself
    if analysis_settings['wgt_chunks'] < 5000:
        analysis_settings['wgt_chunks'] = analysis_settings['particles']

    # Chunking trick II
    if not ('particle_chunks' in analysis_settings):
        analysis_settings['particle_chunks'] = 2
        print('Missing particle chunks! It is set to a default of {}'.format(analysis_settings['particle_chunks']))

    # Depending on the number of particle chunks we get start and end points and the number of particles per chunk.
    # For instance, 1000 particles divided into 4 chunks will look like:
    # | 0   | 250  | 250
    # | 250 | 500  | 250
    # | 500 | 750  | 250
    # | 750 | 1000 | 250
    dummy = np.arange(0, analysis_settings['particles'],
                      analysis_settings['particles'] / analysis_settings['particle_chunks'])
    analysis_settings['ptl_chunk_idx'] = np.stack(
        (dummy,
         dummy + analysis_settings['particles'] / analysis_settings['particle_chunks'],
         np.full(np.shape(dummy), analysis_settings['particles'] / analysis_settings['particle_chunks'])),
        axis=1)

    # Storing analysis relevant information into the analysis_settings struct
    # We get the list of subject ids
    subject_id_list = np.unique(data[:, subject_id_column])
    # We get the number of subjects in this analysis
    analysis_settings['nSubjs'] = len(subject_id_list)
    if analysis_settings['nSubjs'] <= 0:
        raise ValueError('Not valid number of subjects!')

    print('********** END OF MESSAGES **********')
    return data, analysis_settings
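# Hypothetical call sketch for preprocessing_setup above. The tiny data matrix follows
# the column layout documented in the function (subject id, trial, category, predictor,
# dependent, net effect cluster); the values are invented for illustration, and the
# sketch assumes the project-local helpers it calls (family_of_curves, scale_data,
# scramble_dependent_variable) are importable from the same package.
import numpy as np

data = np.array([
    [0, 1, 1, 0.30, 0, 1],
    [0, 2, 1, 0.70, 1, 2],
    [1, 1, 1, 0.20, 0, 3],
    [1, 2, 1, 0.90, 1, 4],
])
analysis_settings = {'em_iterations': 5, 'particles': 1000, 'curve_type': 'horz_indpnt',
                     'distribution': 'bernoulli', 'dist_specific_params': {}}
data, analysis_settings = preprocessing_setup(data, analysis_settings)
print(analysis_settings['nSubjs'], analysis_settings['ptl_chunk_idx'].shape)   # 2 (2, 3)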