Example #1
File: var.py Project: cbrnr/scot
    def fit(self, data):
        """Fit VAR model to data.
        
        Parameters
        ----------
        data : array, shape (trials, channels, samples) or (channels, samples)
            Epoched or continuous data set.
            
        Returns
        -------
        self : :class:`VAR`
            The :class:`VAR` object to facilitate method chaining (see usage
            example).
        """
        data = atleast_3d(data)

        if self.delta == 0 or self.delta is None:
            # ordinary least squares
            x, y = self._construct_eqns(data)
        else:
            # regularized least squares (ridge regression)
            x, y = self._construct_eqns_rls(data)

        b, res, rank, s = sp.linalg.lstsq(x, y)

        self.coef = b.transpose()

        self.residuals = data - self.predict(data)
        self.rescov = sp.cov(cat_trials(self.residuals[:, :, self.p:]))

        return self
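A minimal usage sketch for the fit method above, assuming the VAR class from the scot project takes the model order as its first constructor argument (the import path and data shapes are illustrative, not confirmed):

import numpy as np
from scot.var import VAR  # assumed import path based on the file header above

# stand-in data: 10 trials, 3 channels, 500 samples of white noise
data = np.random.randn(10, 3, 500)

var = VAR(2)      # model order p = 2 (constructor signature assumed)
var.fit(data)     # fit() returns self, so calls can be chained, e.g. VAR(2).fit(data).rescov
print(var.coef.shape, var.rescov.shape)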
Example #2
def pca(data, dim):
    """ Return the first dim principal components as colums of a matrix.

    Every row of the matrix resembles a point in the data space.
    """

    assert dim <= data.shape[1], \
        "dim must be less than or equal to the original dimension"

    # We have to make a copy of the original data and subtract the mean
    # of every entry
    data = makeCentered(data)
    cm = cov(data.T)

    # OPT only calculate the dim first eigenvectors here
    # The following calculation may seem a bit "weird", but it is correct.
    # The eigenvectors with the dim highest eigenvalues have to be selected
    # We keep track of the indexes via enumerate to restore the right ordering
    # later.
    eigval, eigvec = eig(cm)
    eigval = [(val, ind) for ind, val  in enumerate(eigval)]
    eigval.sort()
    eigval[:-dim] = []  # remove all but the highest dim elements

    # now we have to bring them back in the right order
    eig_indexes = [(ind, val) for val, ind in eigval]
    eig_indexes.sort(reverse=True)
    eig_indexes = [ind for ind, val in eig_indexes]

    return eigvec.take(eig_indexes, 1).T
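A short usage sketch for the pca function above. makeCentered is not shown in the snippet, so a hypothetical stand-in that subtracts the column means is defined here; the cov and eig imports mirror what the function appears to expect:

import numpy as np
from numpy import cov
from numpy.linalg import eig

def makeCentered(data):
    # assumed behaviour: subtract the per-column mean from every entry
    return data - data.mean(axis=0)

rows = np.random.randn(200, 5)     # 200 points in a 5-dimensional data space
components = pca(rows, dim=2)      # rows of the result span the two leading principal directions
print(components.shape)            # (2, 5)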
Example #3
	def _initParams_fast(self):
		""" 
		initialize the gp parameters
			1) project Y on the known factor X0 -> Y0
				average variance of Y0 is used to initialize the variance explained by X0
			2) considers the residual Y1 = Y-Y0 (this is equivalent to regressing out X0)
			3) perform PCA on cov(Y1) and considers the first k PC for initializing X
			4) the variance of all other PCs is used to initialize the noise
			5) the variance explained by interaction is set to a small random number 
		"""
		Xd = LA.pinv(self.X0)
		Y0 = self.X0.dot(Xd.dot(self.Y))
		Y1 = self.Y-Y0
		YY = SP.cov(Y1)
		S,U = LA.eigh(YY)
		X = U[:,-self.k:]*SP.sqrt(S[-self.k:])
		a = SP.array([SP.sqrt(Y0.var(0).mean())])
		b = 1e-3*SP.randn(1)
		c = SP.array([SP.sqrt((YY-SP.dot(X,X.T)).diagonal().mean())])
		# gp hyper params
		params = limix.CGPHyperParams()
		if self.interaction:
			params['covar'] = SP.concatenate([a,X.reshape(self.N*self.k,order='F'),SP.ones(1),b])
		else:
			params['covar'] = SP.concatenate([a,X.reshape(self.N*self.k,order='F')])
		params['lik'] = c
		return params
Example #4
 def __init__(self, Y=None, Xr=None, F=None, Rr=None, factr=1e7, debug=False):
     """
     Args:
         Y:          [N, P] phenotype matrix
         Xr:         [N, S] genotype data of the set component
         Rr:         [N, N] covariance of the set component (defaults to Xr Xr^T when not low rank)
         factr:      parameter that determines the accuracy of the solution
                     (see scipy.optimize.fmin_l_bfgs_b for more details)
     """
     # avoid SVD failure by adding some jitter 
     Xr+= 2e-6*(sp.rand(*Xr.shape)-0.5)
     # make sure it is normalised 
     Xr-= Xr.mean(0)
     Xr/= Xr.std(0)
     Xr/= sp.sqrt(Xr.shape[1])
     self.Y = Y
     self.F = F
     self.Xr = Xr
     self.covY = sp.cov(Y.T)
     self.factr = factr 
     self.debug = debug
     self.gp = {}
     self.info = {}
     self.lowrank = Xr.shape[1]<Xr.shape[0]
     if Rr is not None:
         self.Rr = Rr
     else:
         if self.lowrank:        self.Rr = None
         else:                   self.Rr = sp.dot(Xr, Xr.T)
Example #5
    def fit(self, data):
        """ Fit VAR model to data.
        
        Parameters
        ----------
        data : array-like, shape = [n_samples, n_channels, n_trials] or [n_samples, n_channels]
            Continuous or segmented data set.
            
        Returns
        -------
        self : :class:`VAR`
            The :class:`VAR` object to facilitate method chaining (see usage example)
        """
        data = sp.atleast_3d(data)

        if self.delta == 0 or self.delta is None:
            # ordinary least squares
            (x, y) = self._construct_eqns(data)
        else:
            # regularized least squares (ridge regression)
            (x, y) = self._construct_eqns_rls(data)

        (b, res, rank, s) = sp.linalg.lstsq(x, y)

        self.coef = b.transpose()

        self.residuals = data - self.predict(data)
        self.rescov = sp.cov(cat_trials(self.residuals), rowvar=False)

        return self
Example #6
 def learn_gmm(self,x,y,tau=None):
     '''
     Function that learns the GMM from training samples
         It is possible to add a regularizer term Sigma = Sigma + tau*I 
     Input:
         x : the training samples
         y :  the labels
         tau : the value of the regularizer, if tau = None (default) no regularization
     Output:
         the mean, covariance and proportion of each class
     '''
     ## Get information from the data
     C = int(y.max(0))   # Number of classes
     n = x.shape[0]  # Number of samples
     d = x.shape[1]  # Number of variables
     
     ## Initialization
     self.ni = sp.empty((C,1))    # Vector of number of samples for each class
     self.prop = sp.empty((C,1))  # Vector of proportion
     self.mean = sp.empty((C,d))  # Vector of means
     self.cov = sp.empty((C,d,d)) # Matrix of covariance
     
     ## Learn the parameter of the model for each class
     for i in range(C):
         j = sp.where(y==(i+1))[0]
         self.ni[i] = float(j.size)    
         self.prop[i] = self.ni[i]/n
         self.mean[i,:] = sp.mean(x[j,:],axis=0)
         self.cov[i,:,:] = sp.cov(x[j,:],bias=1,rowvar=0)  # Normalize by ni to be consistent with the update formulae
     if tau is not None:
         self.tau = tau*sp.eye(d)
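A standalone sketch of the same per-class estimation done by learn_gmm, written with plain numpy so it runs outside the class; variable names are illustrative:

import numpy as np

x = np.random.randn(300, 4)                # 300 training samples with 4 variables
y = np.random.randint(1, 4, size=300)      # class labels 1..3, matching the y == (i + 1) indexing

C, n, d = int(y.max()), x.shape[0], x.shape[1]
prop = np.empty(C)
mean = np.empty((C, d))
covs = np.empty((C, d, d))
for i in range(C):
    j = np.where(y == (i + 1))[0]
    prop[i] = j.size / float(n)
    mean[i] = x[j].mean(axis=0)
    covs[i] = np.cov(x[j], bias=True, rowvar=False)   # normalized by n_i, as in learn_gmm
tau = 0.1
covs += tau * np.eye(d)                               # the optional ridge term Sigma + tau*I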
Example #7
    def _init_params(self, X):
        init = self.init
        n_samples, n_features = X.shape
        n_components = self.n_components

        if (init == 'kmeans'):
            km = Kmeans(n_components)
            clusters, mean, cov = km.cluster(X)
            coef = sp.array([c.shape[0] / n_samples for c in clusters])
            comps = [multivariate_normal(mean[i], cov[i], allow_singular=True)
                     for i in range(n_components)]
        elif (init == 'rand'):
            coef = sp.absolute(sprand.randn(n_components))
            coef = coef / coef.sum()
            means = X[sprand.permutation(n_samples)[0: n_components]]
            clusters = [[] for i in range(n_components)]
            for x in X:
                idx = sp.argmin([spla.norm(x - mean) for mean in means])
                clusters[idx].append(x)

            comps = []
            for k in range(n_components):
                mean = means[k]
                cov = sp.cov(clusters[k], rowvar=0, ddof=0)
                comps.append(multivariate_normal(mean, cov, allow_singular=True))

        self.coef = coef
        self.comps = comps
Example #8
 def __init__(self, Y=None, Xr=None, Rg=None, Ug=None, Sg=None, factr=1e7, debug=False):
     """
     Args:
         Y:          [N, P] phenotype matrix
         Xr:         [N, S] genotype data of the set component
         Rg:         [N, N] covariance of the background component (or pass its eigendecomposition Ug, Sg)
         factr:      parameter that determines the accuracy of the solution
                     (see scipy.optimize.fmin_l_bfgs_b for more details)
     """
     # assert Xr
     Xr-= Xr.mean(0)
     Xr/= Xr.std(0)
     Xr/= sp.sqrt(Xr.shape[1])
     self.Y = Y
     self.Xr = Xr
     if Sg is None or Ug is None:
         Sg, Ug = la.eigh(Rg)
     self.Rg = Rg
     self.Ug = Ug
     self.Sg = Sg
     self.covY = sp.cov(Y.T)
     self.factr = factr 
     self.debug = debug
     self.gp = {}
     self.info = {}
     #_trRr = sp.diagonal(sp.dot(self.Ug, sp.dot(sp.diag(self.Sg), self.Ug.T))).sum()
     self.trRg = ((self.Ug*self.Sg**0.5)**2).sum()
Example #9
    def _maximum_likelihood(self, X):
        n_samples, n_features = X.shape if X.ndim > 1 else (1, X.shape[0])
        n_components = self.n_components

        # Predict mean
        mu = X.mean(axis=0)

        # Predict covariance
        cov = sp.cov(X, rowvar=0)
        eigvals, eigvecs = self._eig_decomposition(cov)
        sigma2 = ((sp.sum(cov.diagonal()) - sp.sum(eigvals.sum())) /
                  (n_features - n_components))  # FIXME: M < D?

        weight = sp.dot(eigvecs, sp.diag(sp.sqrt(eigvals - sigma2)))
        M = sp.dot(weight.T, weight) + sigma2 * sp.eye(n_components)
        inv_M = spla.inv(M)

        self.eigvals = eigvals
        self.eigvecs = eigvecs
        self.predict_mean = mu
        self.predict_cov = sp.dot(weight, weight.T) + sigma2 * sp.eye(n_features)
        self.latent_mean = sp.transpose(sp.dot(inv_M, sp.dot(weight.T, X.T - mu[:, sp.newaxis])))
        self.latent_cov = sigma2 * inv_M
        self.sigma2 = sigma2    # FIXME!
        self.weight = weight
        self.inv_M = inv_M

        return self.latent_mean
def plot_covariance(history, dist_X):
   
    for dist_name in list(history.keys()):
        nTypes = len(history[dist_name].keys())
        errors = sp.zeros((2,nTypes))
        fig = plt.figure()
        fig.set_size_inches(6*nTypes,5)       
        plt.subplot(1,nTypes+1,1)
        plt.imshow(dist_X.corr_matrix,cmap=plt.cm.gray,interpolation='none')

        counter = 0
        for samp_name in list(history[dist_name].keys()):
            counter += 1
            hist_single = history[dist_name][samp_name]
            nsteps = len(hist_single)
            nbatch = hist_single[-1]['X'].shape[1]
            N = hist_single[0]['X'].shape[0]

            X = sp.zeros((N,nbatch,nsteps))
            P = sp.zeros((N,nbatch,nsteps))
            for tt in range(nsteps):
                X[:,:,tt] = hist_single[tt]['X']
                P[:,:,tt] = hist_single[tt]['P']
                
            ax = plt.subplot(1,nTypes+1,counter+1)
            inv_var_diags = sp.diag(10.**sp.linspace(-dist_X.log_conditioning, 0, N))**.5
            corr_matrix_calc = sp.dot(sp.dot(inv_var_diags**.5,sp.cov(X.reshape(N,nbatch*nsteps),rowvar = 1)),inv_var_diags**.5)
            plt.imshow(corr_matrix_calc,cmap=plt.cm.gray,interpolation='none')
           
            print (corr_matrix_calc)
        plt.show()
Example #11
    def cluster(self, X):
        self.fit(X)

        cluster = [X[sp.argmax(self.responsibility, axis=1) == k] for k in range(self.n_classes)]
        mean = self.center
        cov = [sp.cov(c, rowvar=0, ddof=0) for c in cluster]

        return cluster, mean, cov
Example #12
 def getEmpTraitCovar(self):
     """
     Returns the empirical trait covariance matrix
     """
     if self.P==1:
         out=self.Y[self.Iok].var()
     else:
         out=SP.cov(self.Y[self.Iok].T)
     return out
Example #13
    def fit(self, X):
        cov = sp.cov(X, rowvar=0)
        eigvals, eigvecs = self._eig_decomposition(cov)

        self.eigvals = eigvals
        self.eigvecs = eigvecs
        self.mean = X.mean(axis=0)

        return sp.dot(X, eigvecs)
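A brief usage sketch for this fit method; the PCA class name, its constructor, and the behaviour of _eig_decomposition are assumptions based on the surrounding examples:

import numpy as np

X = np.random.randn(100, 6)
model = PCA(n_components=2)     # hypothetical constructor; n_components is consumed by _eig_decomposition
projected = model.fit(X)        # fit returns the data projected onto the leading eigenvectors
print(projected.shape)          # (100, 2), assuming _eig_decomposition keeps n_components vectors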
Example #14
 def _initParams(self,init_method=None):
     """ this function initializes the paramenter and Ifilter """
     if self.P==1:
         if self.bgRE:
             params0 = {'Cg':SP.sqrt(0.5)*SP.ones(1),'Cn':SP.sqrt(0.5)*SP.ones(1)}
             Ifilter = None
         else:
             params0 = {'Cr':1e-9*SP.ones(1),'Cn':SP.ones(1)}
             Ifilter = {'Cr':SP.zeros(1,dtype=bool),'Cn':SP.ones(1,dtype=bool)}
     else:
         if self.bgRE:
             if self.colCovarType=='freeform':
                 if init_method=='pairwise':
                     _RV = fitPairwiseModel(self.Y,XX=self.XX,S_XX=self.S_XX,U_XX=self.U_XX,verbose=False)
                     params0 = {'Cg':_RV['params0_Cg'],'Cn':_RV['params0_Cn']}
                 elif init_method=='random':
                     params0 = {'Cg':SP.randn(self.Cg.getNumberParams()),'Cn':SP.randn(self.Cn.getNumberParams())}
                 else:
                     cov = 0.5*SP.cov(self.Y.T)+1e-4*SP.eye(self.P)
                     chol = LA.cholesky(cov,lower=True)
                     params = chol[SP.tril_indices(self.P)]
                     params0 = {'Cg':params.copy(),'Cn':params.copy()}
             Ifilter = None
         else:
             if self.colCovarType=='freeform':
                 cov = SP.cov(self.Y.T)+1e-4*SP.eye(self.P)
                 chol = LA.cholesky(cov,lower=True)
                 params = chol[SP.tril_indices(self.P)]
             #else:
             #    S,U=LA.eigh(cov)
             #    a = SP.sqrt(S[-self.rank_r:])[:,SP.newaxis]*U[:,-self.rank_r:]
             #    if self.colCovarType=='lowrank_id':
             #        c = SP.sqrt(S[:-self.rank_r].mean())*SP.ones(1)
             #    else:
             #        c = SP.sqrt(S[:-self.rank_r].mean())*SP.ones(self.P)
             #    params0_Cn = SP.concatenate([a.T.ravel(),c])
             params0 = {'Cr':1e-9*SP.ones(self.P),'Cn':params}
             Ifilter = {'Cr':SP.zeros(self.P,dtype=bool),
                         'Cn':SP.ones(params.shape[0],dtype=bool)}
     if self.mean.F is not None and self.bgRE:
         params0['mean'] = 1e-6*SP.randn(self.mean.getParams().shape[0])
         if Ifilter is not None:
             Ifilter['mean'] = SP.ones(self.mean.getParams().shape[0],dtype=bool)
     return params0,Ifilter
Example #15
 def infer_full_post(self,X_i,D_i):
     class MJMError(Exception):
         pass
     [m,V] = self.infer_full(X_i,D_i)
     ns=X_i.shape[0]
     cv = sp.zeros([ns,ns])
     for i in xrange(self.size):
         cv+=V[ns*i:ns*(i+1),:]
     cv= cv/self.size + sp.cov(m,rowvar=0,bias=1)
     return [sp.mean(m,axis=0).reshape([1,ns]),cv]
Example #16
  def randomized(cls, degree, dim, scale):
    mixcoeffs = scipy.random.random(degree)
    mixcoeffs /= mixcoeffs.sum()

    means = scipy.random.standard_normal((degree, dim)) * scale

    # Generate random covariances by generating random data.
    randomdata = (scipy.random.standard_normal((dim, 10)) * scale
                     for _ in xrange(degree))
    covs = [scipy.cov(i) for i in randomdata]
    return cls(mixcoeffs, means, covs)
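The covariance-of-random-data trick above is a compact way to obtain symmetric positive semidefinite matrices; a minimal sketch of just that step:

import numpy as np

dim, scale = 3, 2.0
randomdata = np.random.standard_normal((dim, 10)) * scale   # dim variables observed 10 times
C = np.cov(randomdata)                                       # (dim, dim), symmetric and positive semidefinite
print(np.allclose(C, C.T), np.linalg.eigvalsh(C).min() >= -1e-10)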
Example #17
    def setUp(self):
        np.random.seed(1)

        # define phenotype
        N = 200
        P = 2
        Y = sp.randn(N,P)
        # define row covariance
        f = 10
        G = 1.*(sp.rand(N, f)<0.2)
        X = 1.*(sp.rand(N, f)<0.2)
        R = covar_rescale(sp.dot(X,X.T))
        R+= 1e-4 * sp.eye(N)
        # define col covariances
        Cg = FreeFormCov(P)
        self._Cg = Cg
        Cn = FreeFormCov(P)
        Cg.setCovariance(0.5 * sp.cov(Y.T))
        Cn.setCovariance(0.5 * sp.cov(Y.T))
        # define gp
        self.gp = GP3KronSumLR(Y = Y, Cg = Cg, Cn = Cn, R = R, G = G, rank = 1)
Example #18
 def stats(self, startdate, enddate, mktbasket, avdate, output=False, mappingoverride=None):
     """
     Calculates statistics for a fund over a period.
     
     Parameters
     ----------
     startdate : datetime
         beginning of statistic period
     enddate : datetime
         end of statistic period
     mktbasket : dict
         dictionary of market streams
     output : bool
         if True, output results to db
     mappingoverride : None or mapping dictionary
     	whether to override the db mapping
     
     Returns
     -------
     stats : dict
         dictionary of statistics
     """
     actualstream, projstream = self.project(mktbasket, mappingoverride)
     if actualstream[startdate:enddate] is None: return None
     if projstream[startdate:enddate] is None: return None 
     actual = actualstream[startdate:enddate].returns
     projected = projstream[startdate:enddate].returns
     diff = actual - projected
     outdata = {
              'TE'     : scipy.std(diff) * 100.0 * 100.0,
              'BETA'   : scipy.cov(projected, actual, bias=1)[1, 0] / scipy.var(projected),
              'ALPHA'  : (scipy.product(diff + 1.0)) ** (1.0 / diff.size) - 1.0,
              'VOL'    : scipy.std(actual) * scipy.sqrt(252.0),
              'PROJ'   : scipy.product(1.0 + projected) - 1.0,
              'ACT'    : scipy.product(1.0 + actual) - 1.0,
              'R2'     : 0.0 if scipy.all(actual == 0.0) else scipy.corrcoef(projected, actual)[1, 0] ** 2.0,
              'AV'     : self.av(avdate),
              'DELTA'  : self.deltaestimate(avdate)
             }
     outdata['DIFF'] = outdata['ACT'] - outdata['PROJ']
     outdata['PL'] = outdata['DELTA'] * outdata['DIFF'] * 100.0 
     if output:
         cnxn = pyodbc.connect(ORACLESTRING)
         cursor = cnxn.cursor()
         sql = 'INSERT INTO FUNDOUTPUT VALUES ({0!s},{1!s},{2!s},{3!s},{4!s},{5!s},{6},{7},{8!s},{9!s},{10!s},{11!s},{12!s},{13!s});'
         sql = sql.format(self.fundcode, outdata['PROJ'], outdata['ACT'], outdata['DIFF'],
                    outdata['DELTA'], outdata['PL'], oracledatebuilder(startdate),
                    oracledatebuilder(enddate), outdata['TE'], outdata['R2'], outdata['BETA'],
                    outdata['ALPHA'], outdata['VOL'], outdata['AV'])
         cursor.execute(sql)
         cnxn.commit()
         cnxn.close()
     return outdata
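The statistics above follow standard definitions: BETA is cov(projected, actual) / var(projected) and TE is the standard deviation of the return differences, scaled to basis points. A minimal sketch with synthetic return series (plain numpy, no database access):

import numpy as np

actual = np.random.normal(0.0005, 0.01, size=252)               # one year of daily fund returns (synthetic)
projected = actual + np.random.normal(0.0, 0.002, size=252)     # projected returns from a market basket
diff = actual - projected

te = np.std(diff) * 100.0 * 100.0                               # tracking error in basis points
beta = np.cov(projected, actual, bias=True)[1, 0] / np.var(projected)
r2 = 0.0 if np.all(actual == 0.0) else np.corrcoef(projected, actual)[1, 0] ** 2.0
print(te, beta, r2)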
Example #19
            self.mapping[indexes[i]] = finalbeta[i]
        return self.mapping

    def stats(self, startdate, enddate, mktbasket, output = False):
        """
        Calculates statistics for a fund over a period.
        
        Parameters
        ----------
        startdate : datetime
            beginning of statistic period
        enddate : datetime
            end of statistic period
        mktbasket : dict
            dictionary of market streams
        output : bool
            if True, output results to db
        
        Returns
        -------
        stats : dict
            dictionary of statistics
        """
        inputmatrix, fundreturns, indexes, daterange = self.align(startdate, enddate, mktbasket)
        if self.mapping and not(inputmatrix is None):
            weights = scipy.array([self.mapping[mykey] if mykey in self.mapping else 0.0 for mykey in mktbasket.keys()])
            projected = scipy.dot(inputmatrix,weights.reshape(len(indexes),1)).flatten()
            actual = fundreturns.flatten()
            diff = actual-projected
            outdata = {
                     'TE'     : scipy.std(diff)*100.0*100.0,
                     'BETA'   : scipy.cov(projected,actual)[1,0]/scipy.var(projected),
                     'ALPHA'  : (scipy.product(diff+1.0))**(1.0/diff.size)-1.0,
                     'VOL'    : scipy.std(actual)*scipy.sqrt(252.0),
                     'PROJ'   : scipy.product(1.0+projected)-1.0,
                     'ACT'    : scipy.product(1.0+actual)-1.0,
                     'R2'     : 0.0 if scipy.all(actual==0.0) else scipy.corrcoef(projected,actual)[1,0]**2.0,
                     'AV'     : self.av(startdate),
                     'DELTA'  : self.deltaestimate(startdate)
                    }
            outdata['DIFF'] = outdata['ACT']-outdata['PROJ']
            outdata['PL'] = outdata['DELTA']*outdata['DIFF']*100.0 
            if output:
                cnxn = pyodbc.connect(ORACLESTRING)
                cursor = cnxn.cursor()
                sql = 'INSERT INTO FUNDOUTPUT VALUES ({0!s},{1!s},{2!s},{3!s},{4!s},{5!s},{6},{7},{8!s},{9!s},{10!s},{11!s},{12!s},{13!s});'
                sql = sql.format(self.fundcode,outdata['PROJ'],outdata['ACT'],outdata['DIFF'],
                           outdata['DELTA'],outdata['PL'],oracledatebuilder(startdate),
                           oracledatebuilder(enddate),outdata['TE'],outdata['R2'],outdata['BETA'],
                           outdata['ALPHA'],outdata['VOL'],outdata['AV'])
                cursor.execute(sql)
                cnxn.commit()            
                cnxn.close()
def ex15(exclude=sc.array([1,2,3,4]),plotfilename='ex15.png',
		 bovyprintargs={}):
    """ex15: solve exercise 15
    Input:
       exclude        - ID numbers to exclude from the analysis
       plotfilename   - filename for the output plot
    Output:
       plot
    History:
       2010-05-07 - Written - Bovy (NYU)
    """
    #Read the data
    data= read_data('data_allerr.dat',allerr=True)
    ndata= len(data)
    nsample= ndata- len(exclude)
    #Put the data in the appropriate arrays and matrices
    Y= sc.zeros(nsample)
    X= sc.zeros(nsample)
    Z= sc.zeros((nsample,2))
    jj= 0
    for ii in range(ndata):
        if sc.any(exclude == data[ii][0]):
            pass
        else:
            Y[jj]= data[ii][1][1]
            X[jj]= data[ii][1][0]
            Z[jj,0]= X[jj]
            Z[jj,1]= Y[jj]
            jj= jj+1
    #Now compute the PCA solution
    Zm= sc.mean(Z,axis=0)
    Q= sc.cov(Z.T)
    eigs= linalg.eig(Q)
    maxindx= sc.argmax(eigs[0])
    V= eigs[1][maxindx]
    V= V/linalg.norm(V)

    m= sc.sqrt(1/V[0]**2.-1)
    bestfit= sc.array([-m*Zm[0]+Zm[1],m])

    #Plot result
    plot.bovy_print(**bovyprintargs)
    xrange=[0,300]
    yrange=[0,700]
    plot.bovy_plot(sc.array(xrange),bestfit[1]*sc.array(xrange)+bestfit[0],
                   'k--',xrange=xrange,yrange=yrange,
                   xlabel=r'$x$',ylabel=r'$y$',zorder=2)
    plot.bovy_plot(X,Y,marker='o',color='k',linestyle='None',
                   zorder=0,overplot=True)
 
    plot.bovy_text(r'$y = %4.2f \,x %4.0f' % (bestfit[1], bestfit[0])+r'$',
                   bottom_right=True)
    plot.bovy_end_print(plotfilename)
Example #21
    def simulate(self, l, noisefunc=None, random_state=None):
        """Simulate vector autoregressive (VAR) model.

        This function generates data from the VAR model.

        Parameters
        ----------
        l : int or [int, int]
            Number of samples to generate. Can be a tuple or list, where l[0]
            is the number of samples and l[1] is the number of trials.
        noisefunc : func, optional
            This function is used to create the generating noise process. If
            set to None, Gaussian white noise with zero mean and unit variance
            is used.

        Returns
        -------
        data : array, shape (n_trials, n_samples, n_channels)
            Generated data.
        """
        m, n = np.shape(self.coef)
        p = n // m

        try:
            l, t = l
        except TypeError:
            t = 1

        if noisefunc is None:
            rng = check_random_state(random_state)
            noisefunc = lambda: rng.normal(size=(1, m))

        n = l + 10 * p

        y = np.zeros((n, m, t))
        res = np.zeros((n, m, t))

        for s in range(t):
            for i in range(p):
                e = noisefunc()
                res[i, :, s] = e
                y[i, :, s] = e
            for i in range(p, n):
                e = noisefunc()
                res[i, :, s] = e
                y[i, :, s] = e
                for k in range(1, p + 1):
                    y[i, :, s] += self.coef[:, (k - 1)::p].dot(y[i - k, :, s])

        self.residuals = res[10 * p:, :, :].T
        self.rescov = sp.cov(cat_trials(self.residuals).T, rowvar=False)

        return y[10 * p:, :, :].transpose([2, 1, 0])
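A sketch of how simulate and fit are typically combined, assuming the scot VAR class from the earlier examples; the coefficient layout follows the coef[:, (k - 1)::p] slicing used in simulate, and the exact constructor signature is an assumption:

import numpy as np
from scot.var import VAR   # assumed import path

var = VAR(2)                                     # model order p = 2
var.coef = np.array([[0.9, -0.5, 0.0, 0.0],
                     [0.0, 0.0, 0.7, -0.2]])     # (channels, channels * p); lag-k terms sit in columns k-1, k-1+p, ...
data = var.simulate([500, 8])                    # 500 samples, 8 trials
var.fit(data)                                    # the fitted var.coef should be close to the true values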
Example #22
    def simulate(self, l, noisefunc=None):
        """ Simulate vector autoregressive (VAR) model

            This function generates data from the VAR model.

            Parameters
            ----------
            l : {int, [int, int]}
                Specify number of samples to generate. Can be a tuple or list
                where l[0] is the number of samples and l[1] is the number of
                trials.
            noisefunc : func, optional
                This function is used to create the generating noise process.
                If set to None Gaussian white noise with zero mean and unit
                variance is used.

            Returns
            -------
            data : array, shape = [n_samples, n_channels, n_trials]
        """
        (m, n) = sp.shape(self.coef)
        p = n // m

        try:
            (l, t) = l
        except TypeError:
            t = 1

        if noisefunc is None:
            noisefunc = lambda: sp.random.normal(size=(1, m))

        n = l + 10 * p

        y = sp.zeros((n, m, t))
        res = sp.zeros((n, m, t))

        for s in range(t):
            for i in range(p):
                e = noisefunc()
                res[i, :, s] = e
                y[i, :, s] = e
            for i in range(p, n):
                e = noisefunc()
                res[i, :, s] = e
                y[i, :, s] = e
                for k in range(1, p + 1):
                    y[i, :, s] += self.coef[:, (k - 1)::p].dot(y[i - k, :, s])

        self.residuals = res[10 * p:, :, :]
        self.rescov = sp.cov(cat_trials(self.residuals), rowvar=False)

        return y[10 * p:, :, :]
Example #23
 def _initParams(self, init_method=None):
     """ this function initializes the paramenter and Ifilter """
     if self.bgRE:
         if init_method=='random':
             params0 = {'covar': sp.randn(self._gpNull.covar.getNumberParams())}
         else:
             if self.P==1:
                 params0 = {'covar':sp.sqrt(0.5) * sp.ones(2)}
             else:
                 cov = 0.5*sp.cov(self.Y.T) + 1e-4*sp.eye(self.P)
                 chol = la.cholesky(cov, lower=True)
                 params = chol[sp.tril_indices(self.P)]
                 params0 = {'covar': sp.concatenate([params, params])}
     else:
         if self.P==1: 
             params_cn = sp.array([1.])
         else:
             cov = sp.cov(self.Y.T) + 1e-4*sp.eye(self.P)
             chol = la.cholesky(cov, lower=True)
             params_cn = chol[sp.tril_indices(self.P)]
         params0 = {'covar': params_cn}
     return params0
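The non-random branch above parameterizes a free-form covariance by the lower-triangular entries of a Cholesky factor of the (jittered) empirical covariance. A minimal sketch of that step on its own:

import numpy as np
from scipy.linalg import cholesky

Y = np.random.randn(100, 3)                      # N samples of P = 3 traits
P = Y.shape[1]
C = 0.5 * np.cov(Y.T) + 1e-4 * np.eye(P)         # jitter keeps the matrix well conditioned
chol = cholesky(C, lower=True)
params = chol[np.tril_indices(P)]                # P*(P+1)/2 free parameters of the free-form covariance
print(params.shape)                              # (6,)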
Example #24
    def setUp(self):
        np.random.seed(1)

        # define phenotype
        N = 200
        P = 2
        Y = sp.randn(N,P)
        # define fixed effects
        F = []; A = []
        F.append(1.*(sp.rand(N,2)<0.5))
        A.append(sp.eye(P))
        # define row covariance
        f = 10
        G = 1.*(sp.rand(N, f)<0.2)
        # define col covariances
        Cr = FreeFormCov(P)
        self._Cr = Cr
        Cn = FreeFormCov(P)
        Cr.setCovariance(0.5 * sp.cov(Y.T))
        Cn.setCovariance(0.5 * sp.cov(Y.T))
        # define gp
        self.gp = GP2KronSumLR(Y = Y, F = F, A = A, Cn = Cn, G = G)
Example #25
 def __call__(self, gradient, error=None):
     # Append a copy to make sure this one is not changed after by the
     # client.
     self.samples.append(array(gradient))
     # Return None if no new estimate is being given.
     if len(self.samples) < self.samplesize:
         return None
     # After all the samples have been put into a single array, we can
     # delete them.
     gradientarray = array(self.samples).T
     inv_covar = inv(cov(gradientarray))
     self.values += dot(inv_covar, gradientarray.sum(axis=1))
     return self.values
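A standalone sketch of the update rule in this callable: once enough gradient samples are collected, the summed gradient is preconditioned with the inverse sample covariance. The dimensions and sample count below are illustrative:

import numpy as np

samples = [np.random.randn(4) for _ in range(50)]     # 50 collected gradient samples of dimension 4
gradientarray = np.array(samples).T                   # shape (dimensions, samplesize)
inv_covar = np.linalg.inv(np.cov(gradientarray))      # inverse sample covariance of the gradients
step = inv_covar.dot(gradientarray.sum(axis=1))       # preconditioned step that would be added to self.values
print(step.shape)                                     # (4,)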
Example #26
 def _init_params_default(self):
     """
     Internal method for default parameter initialization
     """
     # if there are some nan -> mean impute
     Yimp = self.Y.copy()
     Inan = sp.isnan(Yimp)
     Yimp[Inan] = Yimp[~Inan].mean()
     if self.P==1:   C = sp.array([[Yimp.var()]])
     else:           C = sp.cov(Yimp.T)
     C /= float(self.n_randEffs)
     for ti in range(self.n_randEffs):
         self.getTraitCovarFun(ti).setCovariance(C)
Example #27
def correlationMatrix(mdata,linit,lend,nstep):
    lstep=(lend-linit)/nstep
    corr=np.zeros((mdata.shape[0],mdata.shape[0]))
    liter= [linit+(i*lstep) for i in range(nstep)]
    print liter, len(liter),lend
    zz = 0
    for length in liter:
        corrs = cov(mdata[:,length:length+lstep])
        corr += corrs
        zz += 1
        print length, length+lstep,
    print zz
    corr /= nstep
    return corr
def calc_covariance_errors(history, dist_X):
    
    print ('Calculating covariance errors...')
    for dist_name in list(history.keys()):
        nTypes = len(history[dist_name].keys())
        hist_single = history[dist_name][list(history[dist_name].keys())[0]]
        nsteps = len(hist_single)
        samp_names = []
        errors = sp.zeros((nsteps,nTypes,2))

        counter = 0
        for samp_name in list(history[dist_name].keys()): 
            samp_names.append(samp_name)
            hist_single = history[dist_name][samp_name]
            nsteps = len(hist_single)
            nbatch = hist_single[-1]['X'].shape[1]
            N = hist_single[0]['X'].shape[0]
            errors_tmp = sp.zeros((nsteps,nTypes))

            X = sp.zeros((N,nbatch,nsteps))
            P = sp.zeros((N,nbatch,nsteps))
            for tt in range(nsteps):
                X[:,:,tt] = hist_single[tt]['X']
                P[:,:,tt] = hist_single[tt]['P']


            inv_var_diags = 10.**sp.linspace(-dist_X.log_conditioning, 0, N)
            corr_matrix_calc = sp.zeros((N,N,nsteps))
            cov_matrix_calc = sp.zeros((N,N,nsteps))

            for iN in sp.arange(1,nsteps):

              if (iN % (nsteps/10) == 0):
                print ("%s: %s errors calculated..." %(samp_name, iN))
              
              cov_matrix_calc[:,:,iN] = sp.cov(X[:,:,:iN].reshape(N,nbatch*iN),rowvar=1)
              corr_matrix_calc[:,:,iN] = sp.dot(sp.dot(sp.diag(inv_var_diags**.5),cov_matrix_calc[:,:,iN]),sp.diag(inv_var_diags**.5))
              errors_tmp[iN,0] = sp.sum((sp.diag(sp.diag(corr_matrix_calc[:,:,iN]))-sp.diag(sp.diag(dist_X.corr_matrix)))**2.0)/N
              errors_tmp[iN,1] = sp.sum((corr_matrix_calc[:,:,iN] - sp.diag(sp.diag(corr_matrix_calc[:,:,iN]))-dist_X.corr_matrix +sp.diag(sp.diag(dist_X.corr_matrix)))**2.0)/(N*(N-1))

            print (corr_matrix_calc[:5,:5,-1])
            print (dist_X.corr_matrix[:5,:5])
               
            errors[:,counter,0] = errors_tmp[:,0]            
            errors[:,counter,1] = errors_tmp[:,1]            

            counter += 1

    return errors, samp_names
Example #29
    def setUp(self):
        np.random.seed(1)

        # define phenotype
        N = 200
        P = 2
        self.Y = sp.randn(N, P)
        # define fixed effects
        self.F = []; self.A = []
        self.F.append(1.*(sp.rand(N,2)<0.5))
        self.A.append(sp.eye(P))
        # define row covariance
        f = 10
        X = 1.*(sp.rand(N, f)<0.2)
        self.R  = covar_rescale(sp.dot(X,X.T))
        self.R += 1e-4 * sp.eye(N)
        # define col covariances
        self.Cg = FreeFormCov(P)
        self.Cn = FreeFormCov(P)
        self.Cg.setCovariance(0.5 * sp.cov(self.Y.T))
        self.Cn.setCovariance(0.5 * sp.cov(self.Y.T))
        # define gp
        self.gp = GP2KronSum(Y=self.Y, F=self.F, A=self.A, Cg=self.Cg,
                             Cn=self.Cn, R=self.R)
Example #30
 def __init__ (self,dataTraining, classID, proportions = None): 
     self.dataTraining = dataTraining
     #get the number of labels (since numbering goes from 0 to K-1, set class ID equal to K)
     nClasses= int(classID.max() + 1)
     #get the stats for each label
     self.means = []
     self.invVarCovarMatrix = []
     self.constant = [] #last 3 terms in equation
     for  i in range(nClasses):
         id = classID == i #array of bools
         proportions = id.mean() #ratio of trues:falses (sum of ones/# of entries)
         self.means.append(dataTraining[id, :].mean(axis= 0))
         varCovarMatrix = scipy.cov(dataTraining[id,:],rowvar=0)
         self.invVarCovarMatrix.append(inv(varCovarMatrix))
         self.constant.append(-0.5*scipy.dot(scipy.dot(self.means[-1],self.invVarCovarMatrix[-1]),scipy.transpose(self.means[-1]))
                              +math.log(proportions) - 0.5*math.log(scipy.linalg.det(varCovarMatrix)))
def PCA_EigenVectors_Values(fullhits):
    '''
    Input expects hits as a list of lists
    Utility function:
    from utilities package but here only requests the full set
    of eigenvectors in order to transform the data.
    '''
    X1 = array([row[:3] for row in fullhits])  # voxel data only
    # takes data as numpy array
    data_array = transpose(X1)
    # Get eigenvalues and eigenvectors
    eigenval = []
    etranspose = []

    if (len(data_array) > 0):
        eigenval, eigenvec = linalg.eig(cov(data_array))
        # Transpose eigenvec to return to dataset
        etranspose = transpose(eigenvec)

    return eigenval, etranspose
Example #32
def PC_varExplained(Y, standardize=True):
    """Run PCA and calculate the cumulative fraction of variance

	Args:
	    Y (dbl): phenotype values
	    standardize (logical): if True, phenotypes are standardized

	Returns:
        var (dbl): cumulative distribution of variance explained
	"""
    # figuring out the number of latent factors
    if standardize:
        Y -= Y.mean(0)
        Y /= Y.std(0)
    covY = SP.cov(Y)
    S, U = linalg.eigh(covY + 1e-6 * SP.eye(covY.shape[0]))
    S = S[::-1]
    rv = np.array([S[0:i].sum() for i in range(1, S.shape[0])])
    rv /= S.sum()
    return rv
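A short usage sketch, assuming the module-level SP, linalg and np imports of the original file are available:

import numpy as np

Y = np.random.randn(50, 10)                  # phenotype values
rv = PC_varExplained(Y, standardize=True)
print(rv)                                    # increases monotonically towards 1.0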
Example #33
    def train(self, X):
        # Center the data
        self.X_mean = X.mean(0)
        X_centered = X - self.X_mean

        # Build the variance-covariance matrix
        V = sp.cov(X_centered.T)

        # Compute the eigenvalues of V
        self.eigvals, self.eigvecs = linalg.eig(V)

        # Take the n_components largest eigenvalues and stack the corresponding eigenvectors to form the basis
        eigvals_idx = sp.argsort(self.eigvals)
        eigvals_idx = eigvals_idx[len(eigvals_idx)::-1]
        self.U = self.eigvecs[eigvals_idx[:self.n_components]]

        # Project the points onto the basis vectors
        X_pca = sp.dot(self.U, X_centered.T)
        X_pca = X_pca.T

        return X_pca, self.U
Example #34
def roll_true():
    data = pd.read_csv('000032.csv', index_col=0, parse_dates=True)
    data = data[::-1]
    # print(data.index[-1:][0])
    enddate = data.index[-1:][0]
    begdate = enddate - relativedelta(months=2)
    print(begdate)
    print(enddate)
    month_data = data[data.index >= begdate]
    month_data = month_data[month_data.index <= enddate]
    print(month_data)
    month_data_close = month_data['close'].values

    d = np.diff(month_data_close)
    print(d)
    cov_ = sc.cov(d[:-1], d[1:])
    print(cov_)
    if cov_[0, 1] < 0:
        print('roll spread for negative', round(2 * sc.sqrt(-cov_[0, 1]), 3))
    else:
        print('roll spread for positive', round(cov_[0, 1]))
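The Roll measure used above estimates the effective bid-ask spread as 2 * sqrt(-cov(d_t, d_{t-1})) when the first-order serial covariance of price changes is negative. A minimal sketch with synthetic prices instead of the CSV data:

import numpy as np

prices = 100 + np.cumsum(np.random.normal(0, 0.1, size=60))   # stand-in for one month of closing prices
d = np.diff(prices)
cov_ = np.cov(d[:-1], d[1:])                                   # first-order serial covariance of price changes
if cov_[0, 1] < 0:
    spread = 2 * np.sqrt(-cov_[0, 1])                          # Roll's effective spread estimate
    print('roll spread', round(spread, 3))
else:
    print('serial covariance positive, Roll estimator undefined')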
Example #35
def init_GPkronprod(Y, X_r, n_c):
    """
    init parameters for kron(C + sigma I,R) + sigma*I
    """
    # build linear kernel with the features
    covar0_r = SP.array([0])
    covar_r = linear.LinearCF(n_dimensions=X_r.shape[1])
    covar_r.X = X_r
    R = covar_r.K(covar0_r)
    var_R = utils.getVariance(R)
    cov = SP.cov(Y)

    # split into likelihood and noise terms
    ratio = SP.random.rand(3)
    ratio /= ratio.sum()
    lik0 = ratio[0] * SP.diag(cov).min()
    covar0_c = ratio[1] * SP.diag(cov).min()

    # remaining variance is assigned to latent factors
    if n_c > 1:
        X0_c = SP.zeros((Y.shape[0], n_c))
        ratio = SP.random.rand(n_c)
        ratio /= ratio.sum()
        for i in range(n_c):
            # split further up
            X0_c[:, i] = SP.sign(SP.random.rand) * SP.sqrt(
                ratio[i] * (SP.diag(cov) - lik0 - covar0_c))
    else:
        X0_c = SP.sign(
            SP.random.rand) * SP.sqrt(SP.diag(cov) - lik0 - covar0_c)
    X0_c = SP.reshape(X0_c, (X0_c.shape[0], n_c))

    # check if variance of initial values match observed variance
    assert SP.allclose(SP.diag(cov), (X0_c**2).sum(1) + lik0 +
                       covar0_c), 'ouch, something is wrong'

    # bring in correct format and transform as necessary
    covar0_c = 0.5 * SP.log(SP.array([1. / var_R, covar0_c]))
    lik0 = 0.5 * SP.log(SP.array([lik0]))
    return X0_c, covar0_c, lik0, covar0_r
Example #36
def column_covariances(X, uniformity_thresh):
    Xvert = high_frequency_vert(X, sigma=4.0)
    Xvertp = high_frequency_vert(X, sigma=3.0)
    models = []
    use_C = []
    for i in range(X.shape[2]):
        xsub = Xvert[:, :, i]
        xsubp = Xvertp[:, :, i]
        mu = xsub.mean(axis=0)
        dists = s.sqrt(pow((xsub - mu), 2).sum(axis=1))
        distsp = s.sqrt(pow((xsubp - mu), 2).sum(axis=1))
        thresh = percentile(dists, 95.0)
        uthresh = dists * uniformity_thresh
        #use       = s.logical_and(dists<thresh, abs(dists-distsp) < uthresh)
        use = dists < thresh
        C = s.cov(xsub[use, :], rowvar=False)
        [U, V, D] = svd(C)
        V[V < 1e-8] = 1e-8
        C = U.dot(s.diagflat(V)).dot(D)
        models.append(C)
        use_C.append(use)
    return s.array(models), Xvert, Xvertp, s.array(use_C).T
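Flooring the spectrum as above keeps each per-column covariance model invertible; a compact sketch of that regularization step in isolation:

import numpy as np
from scipy.linalg import svd

C = np.cov(np.random.randn(20, 5), rowvar=False)   # a possibly ill-conditioned sample covariance
U, V, D = svd(C)                                    # V holds the singular values
V[V < 1e-8] = 1e-8                                  # floor tiny values so the reconstruction stays invertible
C = U.dot(np.diagflat(V)).dot(D)
print(np.linalg.cond(C))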
Example #37
 def __init__(self,
              Y=None,
              Xr=None,
              F=None,
              factr=1e7,
              Ie=None,
              debug=False):
     """
     Args:
         Y:          [N, 1] phenotype matrix
         Xr:         [N, S] genotype data of the set component
         R:          [N, S] genotype data of the set component
          factr:      parameter that determines the accuracy of the solution
                     (see scipy.optimize.fmin_l_bfgs_b for more details)
     """
     if F is None:
          F = sp.ones((Y.shape[0], 1))
     # kroneckerize F
     W = sp.zeros((Y.shape[0], 2 * F.shape[1]))
     W[:, :F.shape[1]] = Ie[:, sp.newaxis] * F
     W[:, F.shape[1]:] = (~Ie[:, sp.newaxis]) * F
     from limix_core.mean import MeanBase
     self.mean = MeanBase(Y, W)
      # avoid SVD failures by adding some jitter
     Xr += 2e-6 * (sp.rand(*Xr.shape) - 0.5)
     # store stuff
     Xr -= Xr.mean(0)
     Xr /= Xr.std(0)
     Xr /= sp.sqrt(Xr.shape[1])
     self.Y = Y
     self.F = F
     self.Xr = Xr
     self.Ie = Ie
     self.covY = sp.cov(Y.T)
     self.factr = factr
     self.debug = debug
     self.gp = {}
     self.info = {}
Example #38
    def fit(self, data):
        """ Fit VAR model to data.
        
        Parameters
        ----------
        data : array-like, shape = [n_samples, n_channels, n_trials] or [n_samples, n_channels]
            Continuous or segmented data set.
            
        Returns
        -------
        self : :class:`VAR`
            The :class:`VAR` object.
        """
        data = sp.atleast_3d(data)
        (x, y) = self._construct_eqns(data)
        self.fitting_model.fit(x, y)

        self.coef = self.fitting_model.coef_

        self.residuals = data - self.predict(data)
        self.rescov = sp.cov(datatools.cat_trials(self.residuals[self.p:, :, :]), rowvar=False)

        return self
Example #39
        def fit(self, data):
            """Fit VAR model to data.

            Parameters
            ----------
            data : array, shape (trials, channels, samples)
                Continuous or segmented data set. If the data is continuous, a
                2D array of shape (channels, samples) can be provided.

            Returns
            -------
            self : :class:`VAR`
                The :class:`VAR` object.
            """
            data = atleast_3d(data)
            (x, y) = self._construct_eqns(data)
            self.fitting_model.fit(x, y)

            self.coef = self.fitting_model.coef_

            self.residuals = data - self.predict(data)
            self.rescov = sp.cov(cat_trials(self.residuals[:, :, self.p:]))

            return self
Example #40
def ex15(
        exclude=sc.array([1, 2, 3, 4]), plotfilename='ex15.png',
        bovyprintargs={}):
    """ex15: solve exercise 15
    Input:
       exclude        - ID numbers to exclude from the analysis
       plotfilename   - filename for the output plot
    Output:
       plot
    History:
       2010-05-07 - Written - Bovy (NYU)
    """
    #Read the data
    data = read_data('data_allerr.dat', allerr=True)
    ndata = len(data)
    nsample = ndata - len(exclude)
    #Put the data in the appropriate arrays and matrices
    Y = sc.zeros(nsample)
    X = sc.zeros(nsample)
    Z = sc.zeros((nsample, 2))
    jj = 0
    for ii in range(ndata):
        if sc.any(exclude == data[ii][0]):
            pass
        else:
            Y[jj] = data[ii][1][1]
            X[jj] = data[ii][1][0]
            Z[jj, 0] = X[jj]
            Z[jj, 1] = Y[jj]
            jj = jj + 1
    #Now compute the PCA solution
    Zm = sc.mean(Z, axis=0)
    Q = sc.cov(Z.T)
    eigs = linalg.eig(Q)
    maxindx = sc.argmax(eigs[0])
    V = eigs[1][maxindx]
    V = V / linalg.norm(V)

    m = sc.sqrt(1 / V[0]**2. - 1)
    bestfit = sc.array([-m * Zm[0] + Zm[1], m])

    #Plot result
    plot.bovy_print(**bovyprintargs)
    xrange = [0, 300]
    yrange = [0, 700]
    plot.bovy_plot(sc.array(xrange),
                   bestfit[1] * sc.array(xrange) + bestfit[0],
                   'k--',
                   xrange=xrange,
                   yrange=yrange,
                   xlabel=r'$x$',
                   ylabel=r'$y$',
                   zorder=2)
    plot.bovy_plot(X,
                   Y,
                   marker='o',
                   color='k',
                   linestyle='None',
                   zorder=0,
                   overplot=True)

    plot.bovy_text(r'$y = %4.2f \,x %4.0f' % (bestfit[1], bestfit[0]) + r'$',
                   bottom_right=True)
    plot.bovy_end_print(plotfilename)
def overlap_fp_fn(spikes, means=None, covariances=None):
    """ Return dicts of tuples (False positive rate, false negative rate)
    indexed by unit. This function needs :mod:`sklearn` if
    ``covariances`` is not set to ``'white'``.

    This function estimates the pairwise and total false positive and false
    negative rates for a number of waveform clusters. The results can be
    interpreted as follows: False positives are the fraction of spikes in a
    cluster that is estimated to belong to a different cluster (a specific
    cluster for pairwise results or any other cluster for total results).
    False negatives are the number of spikes from other clusters that are
    estimated to belong to a given cluster (also expressed as fraction, this
    number can be larger than 1 in extreme cases).

    Details for the calculation can be found in
    (Hill et al. The Journal of Neuroscience. 2011).
    The calculation for total false positive and false negative rates does
    not follow Hill et al., who propose a simple addition of pairwise
    probabilities. Instead, the total error probabilities are estimated
    using all clusters at once.

    :param dict spikes: Dictionary, indexed by unit, of lists of
        spike waveforms as :class:`neo.core.Spike` objects or numpy arrays.
        If the waveforms have multiple channels, they will be flattened
        automatically. All waveforms need to have the same number of samples.
    :param dict means: Dictionary, indexed by unit, of lists of
        spike waveforms as :class:`neo.core.Spike` objects or numpy arrays.
        Means for units that are not in this dictionary will be estimated
        using the spikes. Note that if you pass ``'white'`` for
        ``covariances`` and you want to provide means, they have to be
        whitened in the same way as the spikes.
        Default: None, means will be estimated from data.
    :param covariances: Dictionary, indexed by unit, of lists of
        covariance matrices. Covariances  for units that are not in this
        dictionary will be estimated using the spikes. It is useful to give
        a covariance matrix if few spikes are present - consider using the
        noise covariance. If you use prewhitened spikes (i.e. all clusters
        are normal distributed, so their covariance matrix is the identity),
        you can pass ``'white'`` here. The calculation will be much faster in
        this case and the sklearn package is not required.
        Default: None, covariances will be estimated from data.
    :type covariances: dict or str
    :returns: Two values:

        * A dictionary (indexed by unit) of total
          (false positive rate, false negative rate) tuples.
        * A dictionary of dictionaries, both indexed by units,
          of pairwise (false positive rate, false negative rate) tuples.
    :rtype: dict, dict
    """
    units = spikes.keys()

    total_spikes = 0
    for spks in spikes.itervalues():
        total_spikes += len(spks)
    if total_spikes < 1:
        return {u: (0.0, 0.0) for u in units}, {}

    if means is None:
        means = {}
    white = False
    if covariances is None:
        covariances = {}
    elif covariances == 'white':
        white = True
        covariances = {}

    # Convert Spike objects to arrays
    dimensionality = None
    spike_arrays = {}
    for u, spks in spikes.iteritems():
        spikelist = []
        if not spks or (len(spks) < 2 and u not in covariances):
            units.remove(u)
            continue
        for s in spks:
            if isinstance(s, neo.Spike):
                spikelist.append(
                    sp.asarray(s.waveform.rescale(pq.uV)).T.flatten())
            else:
                spikelist.append(s)
        spike_arrays[u] = sp.array(spikelist).T
        if dimensionality is None:
            dimensionality = spike_arrays[u].shape[0]
        elif dimensionality != spike_arrays[u].shape[0]:
            raise SpykeException('All spikes need to have the same number '
                                 'of samples!')

    if not units:
        return {}, {}
    if len(units) == 1:
        return {units[0]: (0.0, 0.0)}, {}

    # Convert or calculate means and covariances
    shaped_means = {}
    covs = {}
    if white:
        cov = sp.eye(dimensionality)
        covariances = {u: cov for u in units}

    for u in units:
        if u in means and _object_has_size(means[u], dimensionality):
            mean = means[u]
            if isinstance(mean, neo.Spike):
                shaped_means[u] = sp.asarray(mean.waveform.rescale(
                    pq.uV)).T.flatten()
            else:
                shaped_means[u] = means[u].T.flatten()
        else:
            shaped_means[u] = spike_arrays[u].mean(axis=1)

    if white:
        return _fast_overlap_whitened(spike_arrays, shaped_means)

    for u in units:
        if u not in covariances:
            covs[u] = sp.cov(spike_arrays[u])
        else:
            covs[u] = covariances[u]

    # Calculate pairwise false positives/negatives
    singles = {u: {} for u in units}
    for i, u1 in enumerate(units):
        u1 = units[i]
        for u2 in units[i + 1:]:
            error_rates = _pair_overlap(spike_arrays[u1], spike_arrays[u2],
                                        shaped_means[u1], shaped_means[u2],
                                        covs[u1], covs[u2])
            singles[u1][u2] = error_rates[0:2]
            singles[u2][u1] = error_rates[2:4]

    # Calculate complete false positives/negatives
    import sklearn
    mix = sklearn.mixture.GMM(n_components=2, covariance_type='full')
    mix_means = []
    mix_covars = []
    mix_weights = []
    for u in units:
        mix_means.append(shaped_means[u])
        mix_covars.append([covs[u]])
        mix_weights.append(spike_arrays[u].shape[1])
    mix.means_ = sp.vstack(mix_means)
    mix.covars_ = sp.vstack(mix_covars)
    mix_weights = sp.array(mix_weights, dtype=float)
    mix_weights /= mix_weights.sum()
    mix.weights_ = mix_weights

    # P(spikes of unit[i] in correct cluster)
    post_mean = sp.zeros(len(units))

    # sum(P(spikes of unit[i] in cluster[j])
    post_sum = sp.zeros((len(units), len(units)))

    for i, u in enumerate(units):
        posterior = mix.predict_proba(spike_arrays[u].T)
        post_mean[i] = posterior[:, i].mean()
        post_sum[i, :] = posterior.sum(axis=0)

    totals = {}
    for i, u in enumerate(units):
        fp = 1.0 - post_mean[i]
        ind = range(len(units))
        ind.remove(i)
        fn = post_sum[ind, i].sum() / float(spike_arrays[u].shape[1])
        totals[u] = (fp, fn)

    return totals, singles
Example #42
    def _propose(self, step, po=None):
        """
        Generates proposals.
        returns two lists

        :Parameters:
            - `step`: Position in the markov chain history.
            - `po`: Process pool for parallel proposal generation

        :Returns:
            - `theta`: List of proposed points in the self.dimensions-dimensional parameter space
            - `prop`: List of self.nchains proposed phis.
        """
        po = None
        thetalist = []
        proplist = []
        initcov = identity(self.dimensions)
        if self.meld.initheta and step <= 1:
            # start from user-defined point in parameter space.
            for i in range(self.nchains):
                thetalist.append(self.meld.initheta)
            self.lastcv = initcov  # assume no covariance at the beginning
        else:
            for c in range(self.nchains):
                off = 0
                if step <= 1 or self.seqhist[c] == []:
                    # sample from the priors
                    while off < 50:
                        theta = [
                            self.parpriors[par].rvs() for par in self.parnames
                        ]
                        if not self.check_constraints(theta):
                            continue
                        if sum([
                                int(t >= self.parlimits[i][0]
                                    and t <= self.parlimits[i][1])
                                for i, t in enumerate(theta)
                        ]) == self.dimensions:
                            break
                        off += 1
                    if off == 50:  # try a compromising proposal
                        theta = self.seqhist[c][
                            -1]  # last accepted proposal for this chain
                        #                print "off:" , off
                    self.lastcv = initcov  # assume no covariance at the beginning
                else:
                    # use gaussian proposal
                    if step % 10 == 0 and len(
                            self.seqhist[c]
                    ) >= 10:  # recalculate covariance matrix only every ten steps
                        cv = self.scaling_factor * cov(
                            array(self.seqhist[c][-10:]), rowvar=0
                        ) + self.scaling_factor * self.e * identity(
                            self.dimensions)
                        self.lastcv = cv
                    else:
                        cv = self.lastcv
                    # print self.parlimits
                    while off < 50:
                        theta = multivariate_normal(self.seqhist[c][-1],
                                                    cv,
                                                    size=1).tolist()[0]
                        if sum([
                                int(t >= self.parlimits[i][0]
                                    and t <= self.parlimits[i][1])
                                for i, t in enumerate(theta)
                        ]) == self.dimensions:
                            break
                        off += 1
                    if off == 50:  # try a compromising proposal
                        theta = self.seqhist[c][
                            -1]  # last accepted proposal for this chain
                        # print "off:" , off
                thetalist.append(theta)
        if po:
            proplis = [
                po.apply_async(model_as_ra,
                               (t, self.meld.model, self.meld.phi.dtype.names))
                for t in thetalist
            ]
            proplist = [job.get() for job in proplis]
        else:
            proplist = [
                model_as_ra(t, self.meld.model, self.meld.phi.dtype.names)
                for t in thetalist
            ]
        propl = [p[:self.t] for p in proplist]
        return thetalist, propl
Example #43
    N = 1000
    P = 4
    K = 2
    S = 500
    Y, F, G, B0, Cg0, Cn0 = generate_data(N, P, K, S)

    # compute eigenvalue decomp of RRM
    R = sp.dot(G, G.T)
    R /= R.diagonal().mean()
    R += 1e-4 * sp.eye(R.shape[0])
    Sr, Ur = la.eigh(R)

    # fit null model
    Cg = FreeFormCov(Y.shape[1])
    Cn = FreeFormCov(Y.shape[1])
    gp = GP2KronSum(Y=Y, S_R=Sr, U_R=Ur, Cg=Cg, Cn=Cn, F=F, A=sp.eye(P))
    gp.covar.Cg.setCovariance(0.5 * sp.cov(Y.T))
    gp.covar.Cn.setCovariance(0.5 * sp.cov(Y.T))
    gp.optimize(factr=10)

    import pdb

    pdb.set_trace()

    # run MTLMM
    from limix_lmm.lmm_core import MTLMM

    mtlmm = MTLMM(Y, F=F, A=sp.eye(P), Asnp=sp.eye(P), covar=gp.covar)
    pv, B = mtlmm.process(G)
Example #44
        result.filters = filters
        result.args = args

        trace = result.trace
        result.trace = None
        save_results(result, rname)
        result.trace = trace

        # --- Plotting
        _ = display(result,
                    savedir=args.plot_dir,
                    show=args.display,
                    root=pname)
        normchain = (result.chain -
                     result.chain.mean(axis=0)) / result.chain.std(axis=0)
        corr = cov(normchain.T)

    # --- hemcee ---
    if args.backend == "hemcee":

        result = backends.run_hemcee(p0,
                                     scene,
                                     plans,
                                     scales=scales,
                                     nwarm=args.nwarm,
                                     niter=args.niter)

        result.labels = scene.parameter_names
        result.sourcepars = srcpars
        result.stamps = stamps
        result.filters = filters
Example #45
    def _propose(self, step, po=None):
        """
        Generates proposals.
        returns two lists

        :Parameters:
            - `step`: Position in the markov chain history.
            - `po`: Process pool for parallel proposal generation

        :Returns:
            - `theta`: List of proposed points in the self.dimensions-dimensional parameter space
            - `prop`: List of self.nchains proposed phis.
        """
        thetalist = []
        proplist = []
        initcov = np.identity(self.dimensions)
        for c in range(self.nchains):
            if step <= 1 or self.seqhist[c] == []:
                # sample from the priors
                while 1:
                    theta = [self.parpriors[dist]() for dist in self.parnames]
                    if not self.check_constraints(theta):
                        continue
                    if sum([
                            int(
                                greater(t, self.parlimits[i][0])
                                and less(t, self.parlimits[i][1]))
                            for i, t in enumerate(theta)
                    ]) == self.dimensions:
                        break
                self.lastcv = initcov  # assume no covariance at the beginning
            else:
                # use gaussian proposal
                if step % 10 == 0 and len(
                        self.seqhist[c]
                ) >= 10:  # recalculate covariance matrix only every ten steps
                    cv = self.scaling_factor * cov(array(self.seqhist[c][
                        -10:])) + self.scaling_factor * self.e * identity(
                            self.dimensions)
                    self.lastcv = cv
                else:
                    cv = self.lastcv
                while 1:
                    theta = multivariate_normal(self.seqhist[c][-1],
                                                cv,
                                                size=1).tolist()[0]
                    if sum([
                            int(
                                greater(t, self.parlimits[i][0])
                                and less(t, self.parlimits[i][1]))
                            for i, t in enumerate(theta)
                    ]) == self.dimensions:
                        break
            thetalist.append(theta)
        if po:
            proplis = [
                po.apply_async(model_as_ra,
                               (t, self.meld.model, self.meld.phi.dtype.names))
                for t in thetalist
            ]
            proplist = [job.get() for job in proplis]
        else:
            proplist = [
                model_as_ra(t, self.meld.model, self.meld.phi.dtype.names)
                for t in thetalist
            ]
        propl = [p[:self.t] for p in proplist]
        return thetalist, propl
# Covariance
cov = sum((x - mu_x) * (y - mu_y)) / (N - 1)
cov

# 4 Variance-covariance matrix --------------------------------------------------------------------

# Check the original data
cov_data

# Extract the series
x = cov_data["x"]
y = cov_data["y"]

# Compute the variance-covariance matrix
# --- denominator N (population)
sp.cov(x, y, ddof=0)

# Compute the variance-covariance matrix
# --- denominator N - 1 (sample)
sp.cov(x, y, ddof=1)

# 5 Pearson product-moment correlation coefficient ----------------------------------------------------------------

# <Key points>
# - The correlation coefficient only captures linear relationships
#   --- note that nonlinear relationships such as the one shown on p. 128 are not evaluated properly

# Check the original data
cov_data

# Extract the series
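The excerpt breaks off here; a minimal hypothetical sketch of how it might continue, computing the Pearson correlation with the same scipy conventions as above (not part of the original snippet):

# Hypothetical continuation, for illustration only
x = cov_data["x"]
y = cov_data["y"]

# Pearson correlation = covariance / (std_x * std_y)
cov_xy = sp.cov(x, y, ddof=1)[0, 1]
rho = cov_xy / (sp.std(x, ddof=1) * sp.std(y, ddof=1))
rho

# Equivalent result straight from the correlation matrix
sp.corrcoef(x, y)[0, 1]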
Beispiel #47
0
 def stats(self,
           startdate,
           enddate,
           mktbasket,
           avdate,
           output=False,
           mappingoverride=None):
     """
     Calculates statistics for a fund over a period.
     
     Parameters
     ----------
     startdate : datetime
         beginning of statistic period
     enddate : datetime
         end of statistic period
     mktbasket : dict
         dictionary of market streams
     output : bool
         if True, output results to db
     mappingoverride : None or mapping dictionary
         whether to override the db mapping
     
     Returns
     -------
     stats : dict
         dictionary of statistics
     """
     actualstream, projstream = self.project(mktbasket, mappingoverride)
     if actualstream[startdate:enddate] is None: return None
     if projstream[startdate:enddate] is None: return None
     actual = actualstream[startdate:enddate].returns
     projected = projstream[startdate:enddate].returns
     diff = actual - projected
     outdata = {
         'TE':
         scipy.std(diff) * 100.0 * 100.0,
         'BETA':
         scipy.cov(projected, actual, bias=1)[1, 0] / scipy.var(projected),
         'ALPHA': (scipy.product(diff + 1.0))**(1.0 / diff.size) - 1.0,
         'VOL':
         scipy.std(actual) * scipy.sqrt(252.0),
         'PROJ':
         scipy.product(1.0 + projected) - 1.0,
         'ACT':
         scipy.product(1.0 + actual) - 1.0,
         'R2':
         0.0 if scipy.all(
             actual == 0.0) else scipy.corrcoef(projected, actual)[1,
                                                                   0]**2.0,
         'AV':
         self.av(avdate),
         'DELTA':
         self.deltaestimate(avdate)
     }
     outdata['DIFF'] = outdata['ACT'] - outdata['PROJ']
     outdata['PL'] = outdata['DELTA'] * outdata['DIFF'] * 100.0
     if output:
         cnxn = pyodbc.connect(ORACLESTRING)
         cursor = cnxn.cursor()
         sql = 'INSERT INTO FUNDOUTPUT VALUES ({0!s},{1!s},{2!s},{3!s},{4!s},{5!s},{6},{7},{8!s},{9!s},{10!s},{11!s},{12!s},{13!s});'
         sql = sql.format(self.fundcode, outdata['PROJ'], outdata['ACT'],
                          outdata['DIFF'], outdata['DELTA'], outdata['PL'],
                          oracledatebuilder(startdate),
                          oracledatebuilder(enddate), outdata['TE'],
                          outdata['R2'], outdata['BETA'], outdata['ALPHA'],
                          outdata['VOL'], outdata['AV'])
         cursor.execute(sql)
         cnxn.commit()
         cnxn.close()
     return outdata
Beispiel #48
0
def correlated_noise(bias_files,
                     target=0,
                     make_plots=False,
                     plot_corr=True,
                     figsize=(8, 8),
                     title=''):
    """
    Compute the correlated noise statistics for the overscan regions
    of the list of files, optionally making plots of the distributions.

    Parameters
    ----------
    bias_files: list
        List of bias files to analyze.  This list must have at least as many
        files as the target file index + 1.
    target: int
        Bias frame to compare to the mean biases constructed from the
        remaining files.
    make_plots: bool [False]
        Flag to determine if the png plots will be generated.
    plot_corr: bool [True]
        Flag to plot the histograms of correlation-corrected pixel
        values.  If False, then plot histograms of the uncorrected pixel
        values.
    figsize: tuple [(8, 8)]
        Figure size (in inches) of 4x4 grid of correlation plots.
    title: str ['']
        Title of 4x4 grid.

    Returns
    -------
    (dict, figure, figure):  tuple of results and matplotlib
        figures.  The first item is a dict of BiasStats objects,

        BiasStats = namedtuple('BiasStats', \
                    'noise_orig noise_corr corr_factor bias_oscan'.split())

        that contain the results for each amplifier.
    """
    # Extract the target filename and omit it from the list of bias files.
    target_file = bias_files.pop(target)

    # Get the target frame overscans.
    bias_oscans = get_overscans(target_file)
    oscan_shape = bias_oscans[1].shape

    # Construct the mean bias overscans from the remaining files.
    mean_oscans = get_mean_overscans(bias_files)

    # Loop over amps in target frame and compute statistics.
    bias_stats = dict()
    correlation_data = dict()
    for amp in bias_oscans:
        # Loop over other amps and construct the mean image of the
        # bias-subtracted overscans.  Require included amps to have
        # (unsubtracted) overscans with 4 < stdev < 25 rms ADU.
        reduced_mean_oscan = np.zeros(oscan_shape)
        num_oscan = 0
        for oamp, oscan in bias_oscans.items():
            if oamp == amp:
                continue
            reduced_mean_oscan += (oscan - mean_oscans[oamp])
            num_oscan += 1
        reduced_mean_oscan -= np.mean(reduced_mean_oscan)
        reduced_mean_oscan /= num_oscan

        fdata1 = bias_oscans[amp] - mean_oscans[amp]
        fmean1 = np.mean(fdata1)
        fdata1 -= fmean1
        dmat = np.vstack((reduced_mean_oscan.ravel(), fdata1.ravel()))
        covmat = scipy.cov(dmat, rowvar=True)
        corr_factor = covmat[0, 1] / covmat[0, 0]
        fdiff = fdata1 - corr_factor * reduced_mean_oscan
        bias_stats[amp] = BiasStats(np.sqrt(covmat[1, 1]), np.std(fdiff),
                                    corr_factor, np.mean(bias_oscans[amp]))
        correlation_data[amp] = reduced_mean_oscan, fdata1, fdiff

    f1 = None
    f2 = None
    if make_plots:
        f1, f2 = plot_correlated_noise(correlation_data,
                                       bias_stats,
                                       plot_corr=plot_corr,
                                       title=title,
                                       figsize=figsize)

    return (correlation_data, bias_stats), f1, f2
Beispiel #49
0
        ax1.set_title('log10 scaled counts (mean)')
        sns.distplot(mean2, ax=ax2, bins=20, kde=False)
        sns.despine()
        ax2.set_title('log10 scaled gene counts + gaussianized (mean)')
        # tweak the title
        ttl1 = ax1.title
        ttl1.set_weight('bold')
        ttl2 = ax2.title
        ttl2.set_weight('bold')
        PL.figtext(0.01, 0.01, date.today().isoformat())
        PL.tight_layout()
        PL.savefig(plotFile)
        PL.close()

        # Produce a PCA plot of the samples
        covY = SP.cov(Y3_gene)
        eigenvals, eigenvecs = linalg.eigh(covY + 1e-6 * SP.eye(covY.shape[0]))
        eigenvals = eigenvals[::-1]
        eigenvecs = eigenvecs[::-1]
        df_pcs = pd.DataFrame({
            'PC1': eigenvecs[0],
            'PC2': eigenvecs[1],
            'PC3': eigenvecs[2],
            'PC4': eigenvecs[3],
            'PC5': eigenvecs[4]
        })
        ## PC pairs plot coloured by assay time
        ## just use date and time for assaytime
        print "... producing PC pairs plot coloured by assay time"
        assaytime = [
            str(dt).split(" ")[0][:-3] for dt in sampleInfo['assaytime_rnaseq']
Beispiel #50
0
def estimate(file, detailed):

    # load in the strata distribution
    dist_line = ["0.0"]
    dist_line += file.readline().split()
    dist = array(dist_line, float)

    p = dist  # probability of a program being in each stratum
    I = len(dist)  # number of strata, including passive
    A = I - 1  # active strata

    Y = [[] for i in range(I)]  # empty collection of samples divided up by stratum
    Y[0] = [0]
    s = ones((I))  # estimated standard deviations for each stage & strata

    # read in log file results
    num_samples = 0
    for result in file:
        stamp, stratum, perf1, perf2 = result.split()
        z = int(stratum)
        if True:  #z > 10:
            Y[int(stratum)].append((float(perf1), float(perf2)))
            num_samples += 2

    # compute empirical standard deviations for each stratum
    for i in range(1, I):
        if p[i] > 0.0 and len(Y[i]) > 2:

            YA = array(Y[i])
            sample1 = YA[:, 0]  # positive antithetic runs
            sample2 = YA[:, 1]  # negative antithetic runs

            s1 = sample1.std(ddof=1)  # 1 degree of freedom
            s2 = sample2.std(ddof=1)  # 1 degree of freedom
            covariance = cov(sample1, sample2)[0, 1]  # default is 1 df

            var = 0.25 * (s1 * s1 + s2 * s2 + 2.0 * covariance)
            s[i] = sqrt(var)
        else:
            s[i] = 1.0

    # report current estimates by strata
    if detailed:
        for i in range(1, I):
            stratum_samples = len(Y[i]) * 2.0
            print " % 3d % 5d" % (i, stratum_samples),

            if stratum_samples == 0:
                # no samples, so skip mean and half CI
                print
            elif stratum_samples < 4:
                # don't report half CI with less than 4 samples
                print " % 6.1f" % (array(Y[i]).mean())
            else:
                # do a full report
                print " % 6.1f +/- % 5.1f" \
                   % (array(Y[i]).mean(), 1.96*s[i]/sqrt(stratum_samples) )

        print

    # compute the current estimate and 95% confidence interval
    est = 0.0
    for i in range(1, I):
        stratum_samples = len(Y[i]) * 2.0
        if p[i] > 0.0 and stratum_samples > 2:
            est += p[i] / stratum_samples * array(Y[i]).sum()

    delta = 1.96 * sum(p * s) / sqrt(num_samples)

    print "%6i  % 5.1f +/- % 5.1f" % (num_samples, est, delta),

    return
Beispiel #51
0
        ### mean imputation for remaining nans
        print('Imputing mean')
        for i in range(psi.shape[0]):
            n_idx = sp.where(sp.isnan(psi[i, :]))[0]
            if n_idx.shape[0] == 0:
                continue
            psi[i, n_idx] = spst.nanmean(psi[i, :])

        ### center the data - I might not need to do this for the covariance
        psi -= sp.mean(psi, axis=1)[:, sp.newaxis]
        #psi -= sp.mean(psi, axis=0)

        ### compute kernel
        print('Computing covariances')
        K = sp.cov([psi[i, :] for i in range(psi.shape[0])])

        ### PCA
        print('Compute PCA ...')
        w_g, Vt_g = eigh(K)
        V_g = Vt_g.T
        w_g = w_g[::-1]
        V_g = V_g[::-1, :]
        print('... done')

        pickle.dump((w_g, V_g, ctypes, tn_labels, psi), open(picklefile, 'w'),
                    -1)
    else:
        print('Loading data from pickle: %s' % picklefile)
        (w_g, V_g, ctypes, tn_labels, psi) = pickle.load(open(picklefile, 'r'))
plt.imshow(ims, cmap="gray")
plt.colorbar()

# Visualization of band 2
plt.figure()
ims = skip_extrem(im[:, :, b2])
plt.imshow(ims, cmap="gray")
plt.colorbar()

# Median relative difference between bands (in percent)
print "Median value of differences between band {} and {} is {}".format(
    b1, b2, 100. * sp.median(
        (im[:, :, b2].astype(float) - im[:, :, b1]) / im[:, :, b1]))

# Computation of the correlation
im.shape = (h * w, b)
cov = sp.cov(im[::4, :], bias=1, rowvar=0)
dcov = sp.sqrt(sp.diag(cov))
cor = cov / dcov[:, sp.newaxis]
cor /= dcov[sp.newaxis, :]
plt.figure()
plt.imshow(cor, interpolation='nearest')
plt.colorbar()

# Compute condition number
s = linalg.svd(cov, compute_uv=False)
print("Condition number is {}".format(s[0] / s[-1]))

# Plot all the figures
plt.show()
Beispiel #53
0
def correlated_noise(bias_files, target=0, make_plots=False, plot_corr=True,
                     figsize=(8, 8), title=''):
    """
    Compute the correlated noise statistics for the overscan regions
    of the list of files, optionally making plots of the distributions.

    Parameters
    ----------
    bias_files: list
        List of bias files to analyze.  This list must have at least as many
        files as the target file index + 1.
    target: int
        Bias frame to compare to the mean biases constructed from the
        remaining files.
    make_plots: bool [False]
        Flag to determine if the png plots will be generated.
    plot_corr: bool [True]
        Flag to plot the histograms of correlation-corrected pixel
        values.  If False, then plot histograms of the uncorrected pixel
        values.
    figsize: tuple [(8, 8)]
        Figure size (in inches) of 4x4 grid of correlation plots.
    title: str ['']
        Title of 4x4 grid.

    Returns
    -------
    (dict, figure, figure):  tuple of results and matplotlib
        figures.  The first item is a dict of BiasStats objects,

        BiasStats = namedtuple('BiasStats', \
                    'noise_orig noise_corr corr_factor bias_oscan'.split())

        that contain the results for each amplifier.
    """
    f1, f2 = None, None
    if make_plots:
        f1, ax1 = plt.subplots(4, 4, figsize=figsize)
        ax1 = {amp: subplot for amp, subplot in zip(imutils.allAmps(),
                                                    ax1.flatten())}
        f2, ax2 = plt.subplots(4, 4, figsize=figsize)
        ax2 = {amp: subplot for amp, subplot in zip(imutils.allAmps(),
                                                    ax2.flatten())}

    # Extract the target filename and omit it from the list of bias files.
    target_file = bias_files.pop(target)

    # Get the target frame overscans.
    bias_oscans = get_overscans(target_file)
    oscan_shape = bias_oscans[1].shape

    # Construct the mean bias overscans from the remaining files.
    mean_oscans = get_mean_overscans(bias_files)

    # Compute the mean values of the mean bias overscans.
    mean_oscan_values \
        = {amp: np.mean(oscan) for amp, oscan in mean_oscans.items()}

    # Loop over amps in target frame and compute statistics.
    bias_stats = dict()
    for amp in bias_oscans:
        # Loop over other amps and construct the mean image of the
        # bias-subtracted overscans.  Require included amps to have
        # (unsubtracted) overscans with 4 < stdev < 25 rms ADU.
        reduced_mean_oscan = np.zeros(oscan_shape)
        num_oscan = 0
        for oamp, oscan in bias_oscans.items():
            if oamp == amp:
                continue
            reduced_mean_oscan += (oscan - mean_oscans[oamp])
            num_oscan += 1
        reduced_mean_oscan -= np.mean(reduced_mean_oscan)
        reduced_mean_oscan /= num_oscan

        fdata1 = bias_oscans[amp] - mean_oscans[amp]
        fmean1 = np.mean(fdata1)
        fdata1 -= fmean1
        dmat = np.vstack((reduced_mean_oscan.flatten(), fdata1.flatten()))
        covmat = scipy.cov(dmat, rowvar=True)
        corr_factor = covmat[0, 1]/covmat[0, 0]
        fdiff = fdata1 - corr_factor*reduced_mean_oscan
        bias_stats[amp] = BiasStats(np.sqrt(covmat[1, 1]), np.std(fdiff),
                                    corr_factor,
                                    np.mean(bias_oscans[amp]))
                                    #fmean1)

        if make_plots:
            f1.suptitle(title)
            f2.suptitle(title)
            ax1[amp].hist2d(reduced_mean_oscan.flatten(), fdata1.flatten(),
                            bins=(100, 100), range=((-50, 50), (-50, 50)))
            label = 'amp %i, cov/var = %.2f' \
                    % (amp, bias_stats[amp].corr_factor)
            ax1[amp].text(-40, 40, label, fontsize=6, color='w',
                          fontweight='bold')
            if plot_corr:
                ax2[amp].hist(fdiff.flatten(), bins=100, range=(-50, 50),
                              histtype='step')
            else:
                ax2[amp].hist(fdata1.flatten(), bins=100, range=(-50, 50),
                              histtype='step')

    return bias_stats, f1, f2
Beispiel #54
0
def bces(x1, x2, x1err=[], x2err=[], cerr=[], logify=True, model='yx', \
         bootstrap=5000, verbose='normal', full_output=True):
    """
    Bivariate, Correlated Errors and intrinsic Scatter (BCES)
    translated from the FORTRAN code by Christina Bird and Matthew Bershady
    (Akritas & Bershady, 1996)

    Linear regression in the presence of heteroscedastic errors on both
    variables and intrinsic scatter

    Parameters
    ----------
      x1        : array of floats
                  Independent variable, or observable
      x2        : array of floats
                  Dependent variable
      x1err     : array of floats (optional)
                  Uncertainties on the independent variable
      x2err     : array of floats (optional)
                  Uncertainties on the dependent variable
      cerr      : array of floats (optional)
                  Covariances of the uncertainties in the dependent and
                  independent variables
      logify    : bool (default True)
                  Whether to take the log of the measurements in order to
                  estimate the best-fit power law instead of linear relation
      model     : {'yx', 'xy', 'bi', 'orth'}
                  BCES model with which to calculate regression. See Notes
                  below for details.
      bootstrap : False or int (default 5000)
                  get the errors from bootstrap resampling instead of the
                  analytical prescription? if bootstrap is an int, it is the
                  number of bootstrap resamplings
      verbose   : str (default 'normal')
                  Verbose level. Options are {'quiet', 'normal', 'debug'}
      full_output : bool (default True)
                  If True, return also the covariance between the
                  normalization and slope of the regression.

    Returns
    -------
      a         : tuple of length 2
                  Best-fit normalization and its uncertainty (a, da)
      b         : tuple of length 2
                  Best-fit slope and its uncertainty (b, db)

    Optional outputs
    ----------------
      cov_ab    : 2x2 array of floats
                  covariance between a and b. Returned if full_output is set to
                  True.

    Notes
    -----
      If verbose is normal or debug, the results from all the BCES models will
      be printed (still, only the one selected in *model* will be returned).

      the *model* parameter:
        -'yx' stands for BCES(Y|X)
        -'xy' stands for BCES(X|Y)
        -'bi' stands for BCES Bisector
        -'orth' stands for BCES Orthogonal

    """
    def _bess_bootstrap(npts, x1, x2, x1err, x2err, cerr, nsim):
        ##added by Gerrit, July 2014
        ##Unfortunately I needed a copy of the _bess function for bootstrapping.
        #Would be nicer if those two could be combined
        """
        Do the entire regression calculation for 4 slopes:
        OLS(Y|X), OLS(X|Y), bisector, orthogonal
        """
        #calculate sigma's for datapoints using length of confidence intervals
        sig11var = np.sum(x1err**2, axis=1, keepdims=True) / npts
        sig22var = np.sum(x2err**2, axis=1, keepdims=True) / npts
        sig12var = np.sum(cerr, axis=1, keepdims=True) / npts

        # calculate means and variances
        x1av = np.mean(x1, axis=1, keepdims=True)
        x1var = x1.var(axis=1, keepdims=True)
        x2av = np.mean(x2, axis=1, keepdims=True)
        x2var = x2.var(axis=1, keepdims=True)
        covar_x1x2 = np.mean((x1-np.mean(x1,axis=1,keepdims=True)) * \
                             (x2-np.mean(x2,axis=1,keepdims=True)),
                             axis=1,keepdims=True)

        # compute the regression slopes for OLS(X2|X1), OLS(X1|X2),
        # bisector and orthogonal
        if model == 'yx':
            modelint = 1
        else:
            modelint = 4
        b = np.zeros((modelint, nsim))
        b[0] = ((covar_x1x2 - sig12var) / (x1var - sig11var)).flatten()
        if model != 'yx':
            b[1] = ((x2var - sig22var) / (covar_x1x2 - sig12var)).flatten()
            b[2] = ((b[0] * b[1] - 1 + np.sqrt((1 + b[0] ** 2) * \
                   (1 + b[1] ** 2))) / (b[0] + b[1])).flatten()
            b[3] = 0.5 * ((b[1] - 1 / b[0]) + np.sign(covar_x1x2).flatten()* \
                   np.sqrt(4 + (b[1] - 1 / b[0]) ** 2))

        # compute intercepts for above 4 cases:
        a = x2av.flatten() - b * x1av.flatten()

        # set up variables to calculate standard deviations of slope and
        # intercept
        xi = []
        xi.append(((x1 - x1av) * (x2 - b[0].reshape(nsim,1) * x1 - \
                                  a[0].reshape(nsim,1)) + \
                   b[0].reshape(nsim,1) * x1err ** 2) / \
                  (x1var - sig11var))
        if model != 'yx':
            xi.append(((x2 - x2av) * (x2 - b[1].reshape(nsim,1) * x1 - \
                                      a[1].reshape(nsim,1)) + x2err ** 2) / \
                      covar_x1x2)
            xi.append((xi[0] * (1 + b[1].reshape(nsim,1) ** 2) + \
                       xi[1] * (1 + b[0].reshape(nsim,1) ** 2)) / \
                      ((b[0].reshape(nsim,1) + \
                       b[1].reshape(nsim,1)) * \
                       np.sqrt((1 + b[0].reshape(nsim,1) ** 2) * \
                               (1 + b[1].reshape(nsim,1) ** 2))))
            xi.append((xi[0] / b[0].reshape(nsim,1) ** 2 + xi[1]) * \
                      b[3].reshape(nsim,1) / \
                      np.sqrt(4 + (b[1].reshape(nsim,1) - \
                              1 / b[0].reshape(nsim,1)) ** 2))
        zeta = []
        for i in xrange(modelint):
            zeta.append(x2 - b[i].reshape(nsim, 1) * x1 - x1av * xi[i])

        # calculate  variance for all a and b
        bvar = np.zeros((4, nsim))
        avar = np.zeros((4, nsim))
        for i in xrange(modelint):
            bvar[i] = xi[i].var(axis=1, keepdims=False) / npts
            avar[i] = zeta[i].var(axis=1, keepdims=False) / npts
        return a, b, avar, bvar, xi, zeta

    def _bess(npts, x1, x2, x1err, x2err, cerr):
        """
        Do the entire regression calculation for 4 slopes:
          OLS(Y|X), OLS(X|Y), bisector, orthogonal
        """
        # calculate sigma's for datapoints using length of confidence
        # intervals
        sig11var = sum(x1err**2) / npts
        sig22var = sum(x2err**2) / npts
        sig12var = sum(cerr) / npts
        # calculate means and variances
        x1av = scipy.average(x1)
        x1var = scipy.std(x1)**2
        x2av = scipy.average(x2)
        x2var = scipy.std(x2)**2
        covar_x1x2 = sum((x1 - x1av) * (x2 - x2av)) / npts
        # compute the regression slopes for OLS(X2|X1), OLS(X1|X2),
        # bisector and orthogonal
        b = scipy.zeros(4)
        b[0] = (covar_x1x2 - sig12var) / (x1var - sig11var)
        b[1] = (x2var - sig22var) / (covar_x1x2 - sig12var)
        b[2] = (b[0] * b[1] - 1 + scipy.sqrt((1 + b[0] ** 2) * \
               (1 + b[1] ** 2))) / (b[0] + b[1])
        b[3] = 0.5 * ((b[1] - 1 / b[0]) + scipy.sign(covar_x1x2) * \
               scipy.sqrt(4 + (b[1] - 1 / b[0]) ** 2))
        # compute intercepts for above 4 cases:
        a = x2av - b * x1av
        # set up variables to calculate standard deviations of slope
        # and intercept
        xi = []
        xi.append(((x1 - x1av) * \
                   (x2 - b[0] * x1 - a[0]) + b[0] * x1err ** 2) / \
                  (x1var - sig11var))
        xi.append(((x2 - x2av) * (x2 - b[1] * x1 - a[1]) + x2err ** 2) / \
                  covar_x1x2)
        xi.append((xi[0] * (1 + b[1] ** 2) + xi[1] * (1 + b[0] ** 2)) / \
                  ((b[0] + b[1]) * \
                   scipy.sqrt((1 + b[0] ** 2) * (1 + b[1] ** 2))))
        xi.append((xi[0] / b[0] ** 2 + xi[1]) * b[3] / \
                  scipy.sqrt(4 + (b[1] - 1 / b[0]) ** 2))
        zeta = []
        for i in xrange(4):
            zeta.append(x2 - b[i] * x1 - x1av * xi[i])
        # calculate  variance for all a and b
        bvar = scipy.zeros(4)
        avar = scipy.zeros(4)
        for i in xrange(4):
            bvar[i] = scipy.std(xi[i])**2 / npts
            avar[i] = scipy.std(zeta[i])**2 / npts
        return a, b, avar, bvar, xi, zeta

    def _bootspbec(npts, x, y, xerr, yerr, cerr):
        """
        Bootstrap samples
        """
        j = scipy.random.randint(npts, size=npts)
        xboot = x[j]
        xerrboot = xerr[j]
        yboot = y[j]
        yerrboot = yerr[j]
        cerrboot = cerr[j]
        return xboot, yboot, xerrboot, yerrboot, cerrboot

    # ----  Main routine starts here  ---- #
    # convert to scipy arrays just in case
    x1 = scipy.array(x1)
    x2 = scipy.array(x2)
    x1err = scipy.array(x1err)
    x2err = scipy.array(x2err)
    cerr = scipy.array(cerr)
    models = [['yx', 'xy', 'bi', 'orth'],
              ['BCES(Y|X)', 'BCES(X|Y)', 'BCES Bisector', 'BCES Orthogonal']]
    # which to return?
    j = models[0].index(model)
    npts = len(x1)
    # are the errors defined?
    if len(x1err) == 0:
        x1err = scipy.zeros(npts)
    if len(x2err) == 0:
        x2err = scipy.zeros(npts)
    if len(cerr) == 0:
        cerr = scipy.zeros(npts)
    if verbose == 'debug':
        print 'x1 =', x1
        print 'x1err =', x1err
        print 'x2 =', x2
        print 'x2err =', x2err
        print 'cerr =', cerr
        print '\n ** Returning values for', models[1][j], '**'
        if bootstrap is not False:
            print '    with errors from %d bootstrap resamplings' % bootstrap
        print ''

    # calculate nominal fits
    bessresults = _bess(npts, x1, x2, x1err, x2err, cerr)
    (a, b, avar, bvar, xi, zeta) = bessresults
    # covariance between normalization and slope
    if full_output:
        covar_ab = scipy.cov(xi[j], zeta[j])

    if bootstrap is not False:
        # make bootstrap simulated datasets, and compute averages and
        # standard deviations of regression coefficients
        asum = scipy.zeros(4)
        assum = scipy.zeros(4)
        bsum = scipy.zeros(4)
        bssum = scipy.zeros(4)
        sda = scipy.zeros(4)
        sdb = scipy.zeros(4)
        for i in xrange(bootstrap):
            samples = _bootspbec(npts, x1, x2, x1err, x2err, cerr)
            (x1sim, x2sim, x1errsim, x2errsim, cerrsim) = samples
            besssim = _bess(npts, x1sim, x2sim, x1errsim, x2errsim, cerrsim)
            (asim, bsim, avarsim, bvarsim, xi, zeta) = besssim
            asum += asim
            assum += asim**2
            bsum += bsim
            bssum += bsim**2

        aavg = asum / bootstrap
        bavg = bsum / bootstrap
        for i in range(4):
            sdtest = assum[i] - bootstrap * aavg[i]**2
            if sdtest > 0:
                sda[i] = scipy.sqrt(sdtest / (bootstrap - 1))
            sdtest = bssum[i] - bootstrap * bavg[i]**2
            if sdtest > 0:
                sdb[i] = scipy.sqrt(sdtest / (bootstrap - 1))

    if verbose in ('normal', 'debug'):
        print '%s   B          err(B)' % ('Fit'.ljust(19)),
        print '         A          err(A)'
        for i in range(4):
            print '%s  %9.2e +/- %8.2e    %10.3e +/- %9.3e' \
                  %(models[1][i].ljust(16), b[i],
                    scipy.sqrt(bvar[i]), a[i], scipy.sqrt(avar[i]))
            if bootstrap is not False:
                print '%s  %9.2e +/- %8.2e    %10.3e +/- %9.3e' \
                      %('bootstrap'.ljust(16), bavg[i],
                        sdb[i], aavg[i], sda[i])
            print ''
        if verbose == 'debug':
            print 'cov[%s] =' % models[1][j]
            print covar_ab

    if bootstrap is not False:
        if full_output:
            return (a[j], sda[j]), (b[j], sdb[j]), covar_ab
        else:
            return (a[j], sda[j]), (b[j], sdb[j])
    if full_output:
        out = ((a[j], scipy.sqrt(avar[j])), (b[j], scipy.sqrt(bvar[j])),
               covar_ab)
    else:
        out = ((a[j], scipy.sqrt(avar[j])), (b[j], scipy.sqrt(bvar[j])))
    return out
Beispiel #55
0
    # define mean term
    mean = LinearMean(Y)
    print mean.Y

    # add first fixed effect
    F = 1. * (SP.rand(N, 2) < 0.2)
    A = SP.eye(P)
    mean.addFixedEffect(F=F, A=A)
    # add first fixed effect
    F = 1. * (SP.rand(N, 3) < 0.2)
    A = SP.ones((1, P))
    mean.addFixedEffect(F=F, A=A)

    # rotate stuff by row and cols
    C = SP.cov(Y.T)
    Sc, Uc = LA.eigh(C)
    Sr, Ur = LA.eigh(XX)
    d = SP.kron(Sc, Sr)
    mean.d = d
    mean.Lc = Uc.T
    mean.Lr = Ur.T
    mean.LRLdiag = Sr
    mean.LCL = C**2

    if 1:
        # calculate stuff to see if it goes through
        print mean.Ystar()
        print mean.Yhat()
        print mean.Xstar()
        print mean.Xhat()
Beispiel #56
0
from numpy import array, mat, shape, transpose
from scipy import cov, linalg
from pylab import load, arange

data2 = mat(
    array(
        load('raw3.dat', delimiter='\t', usecols=arange(0, 13, 1),
             unpack=True)))
time_series = mat(cov(data2, rowvar=1))
print 'covariance matrix : ', shape(time_series)
eval, evec = linalg.eig(mat(time_series))
print shape(eval), shape(evec)
print abs(evec)
print abs(eval)
Beispiel #57
0
def bces(x1, x2, x1err=None, x2err=None, cerr=None, nsim=5000, model='yx', \
         bootstrap=5000, verbose='normal', full_output=True):
    """
  Bivariate, Correlated Errors and intrinsic Scatter (BCES)
    translated from the FORTRAN code by Christina Bird and Matthew Bershady
    (Akritas & Bershady, 1996)

  Linear regression in the presence of heteroscedastic errors on both
  variables and intrinsic scatter

  Parameters
  ----------
    x1        : array of floats
                Independent variable, or observable
    x2        : array of floats
                Dependent variable
    x1err     : array of floats (optional)
                Uncertainties on the independent variable
    x2err     : array of floats (optional)
                Uncertainties on the dependent variable
    cerr      : array of floats (optional)
                Covariances of the uncertainties in the dependent and
                independent variables
    nsim      : int (default 5000)
                Number of bootstrap samples for uncertainties on best-fit
                parameters
    model     : {'yx', 'xy', 'bi', 'orth'}
                BCES model with which to calculate regression. See Notes
                below for details.
    bootstrap : False or int (default 5000)
                get the errors from bootstrap resampling instead of the
                analytical prescription? if bootstrap is an int, it is the
                number of bootstrap resamplings
    verbose   : str (default 'normal')
                Verbose level. Options are {'quiet', 'normal', 'debug'}
    full_output : bool (default True)
                If True, return also the covariance between the normalization
                and slope of the regression.

  Returns
  -------
    a         : tuple of length 2
                Best-fit normalization and its uncertainty (a, da)
    b         : tuple of length 2
                Best-fit slope and its uncertainty (b, db)

  Optional outputs
  ----------------
    cov       : 2x2 array of floats
                covariance between a and b. Returned if full_output is set to
                True.

  Notes
  -----
    If verbose is normal or debug, the results from all the BCES models will
    be printed (still, only the one selected in *model* will be returned).

    the *model* parameter:
      -'yx' stands for BCES(Y|X)
      -'xy' stands for BCES(X|Y)
      -'bi' stands for BCES Bisector
      -'orth' stands for BCES Orthogonal

  """
    def _bess(npts, x1, x2, x1err, x2err, cerr):
        """
    Do the entire regression calculation for 4 slopes:
      OLS(Y|X), OLS(X|Y), bisector, orthogonal
    """

        # calculate sigma's for datapoints using length of confidence intervals
        sig11var = sum(x1err**2) / npts
        sig22var = sum(x2err**2) / npts
        sig12var = sum(cerr) / npts

        # calculate means and variances
        x1av = scipy.average(x1)
        x1var = scipy.std(x1)**2
        x2av = scipy.average(x2)
        x2var = scipy.std(x2)**2
        covar_x1x2 = sum((x1 - x1av) * (x2 - x2av)) / npts

        # compute the regression slopes for OLS(X2|X1), OLS(X1|X2),
        # bisector and orthogonal
        b = scipy.zeros(4)
        b[0] = (covar_x1x2 - sig12var) / (x1var - sig11var)
        b[1] = (x2var - sig22var) / (covar_x1x2 - sig12var)
        b[2] = (b[0] * b[1] - 1 + scipy.sqrt((1 + b[0] ** 2) * \
               (1 + b[1] ** 2))) / (b[0] + b[1])
        b[3] = 0.5 * ((b[1] - 1 / b[0]) + scipy.sign(covar_x1x2) * \
               scipy.sqrt(4 + (b[1] - 1 / b[0]) ** 2))

        # compute intercepts for above 4 cases:
        a = x2av - b * x1av

        # set up variables to calculate standard deviations of slope and intercept
        xi = []
        xi.append(((x1 - x1av) * (x2 - b[0] * x1 - a[0]) + b[0] * x1err ** 2) / \
                  (x1var - sig11var))
        xi.append(((x2 - x2av) * (x2 - b[1] * x1 - a[1]) + x2err ** 2) / \
                  covar_x1x2)
        xi.append((xi[0] * (1 + b[1] ** 2) + xi[1] * (1 + b[0] ** 2)) / \
                  ((b[0] + b[1]) * scipy.sqrt((1 + b[0] ** 2) * (1 + b[1] ** 2))))
        xi.append((xi[0] / b[0] ** 2 + xi[1]) * b[3] / \
                  scipy.sqrt(4 + (b[1] - 1 / b[0]) ** 2))
        zeta = []
        for i in range(4):
            zeta.append(x2 - b[i] * x1 - x1av * xi[i])

        # calculate  variance for all a and b
        bvar = scipy.zeros(4)
        avar = scipy.zeros(4)
        for i in range(4):
            bvar[i] = scipy.std(xi[i])**2 / npts
            avar[i] = scipy.std(zeta[i])**2 / npts

        return a, b, avar, bvar, xi, zeta

    def _bootspbec(npts, x, y, xerr, yerr, cerr):
        """
    Bootstrap samples
    """
        j = scipy.random.randint(npts, size=npts)
        xboot = x[j]
        xerrboot = xerr[j]
        yboot = y[j]
        yerrboot = yerr[j]
        cerrboot = cerr[j]
        return xboot, yboot, xerrboot, yerrboot, cerrboot

    # ----  Main routine starts here  ---- #

    models = [['yx', 'xy', 'bi', 'orth'],
              ['BCES(Y|X)', 'BCES(X|Y)', 'BCES Bisector', 'BCES Orthogonal']]
    # which to return?
    j = models[0].index(model)

    npts = len(x1)
    # are the errors defined?
    if x1err is None:
        x1err = scipy.zeros(npts)
    if x2err is None:
        x2err = scipy.zeros(npts)
    if cerr is None:
        from scipy import random
        cerr = scipy.zeros(npts)
        #cerr = scipy.cov(x1err, x2err)[1][0] * scipy.ones(npts)

    if verbose == 'debug':
        print('x1 =', x1)
        print('x1err =', x1err)
        print('x2 =', x2)
        print('x2err =', x2err)
        print('cerr =', cerr)
        print('\n ** Returning values for', models[1][j], '**')
        if bootstrap is not False:
            print('    with errors from %d bootstrap resamplings' % bootstrap)
        print('')

    # calculate nominal fits
    bessresults = _bess(npts, x1, x2, x1err, x2err, cerr)
    (a, b, avar, bvar, xi, zeta) = bessresults
    # covariance between normalization and slope
    if full_output:
        covar_ab = scipy.cov(xi[j], zeta[j])

    if bootstrap is not False:
        # make bootstrap simulated datasets, and compute averages and
        # standard deviations of regression coefficients
        asum = scipy.zeros(4)
        assum = scipy.zeros(4)
        bsum = scipy.zeros(4)
        bssum = scipy.zeros(4)
        sda = scipy.zeros(4)
        sdb = scipy.zeros(4)
        for i in range(nsim):
            samples = _bootspbec(npts, x1, x2, x1err, x2err, cerr)
            (x1sim, x2sim, x1errsim, x2errsim, cerrsim) = samples
            besssim = _bess(npts, x1sim, x2sim, x1errsim, x2errsim, cerrsim)
            (asim, bsim, avarsim, bvarsim, xi, zeta) = besssim
            asum += asim
            assum += asim**2
            bsum += bsim
            bssum += bsim**2

        aavg = asum / nsim
        bavg = bsum / nsim
        for i in range(4):
            sdtest = assum[i] - nsim * aavg[i]**2
            if sdtest > 0:
                sda[i] = scipy.sqrt(sdtest / (nsim - 1))
            sdtest = bssum[i] - nsim * bavg[i]**2
            if sdtest > 0:
                sdb[i] = scipy.sqrt(sdtest / (nsim - 1))

    if verbose in ('normal', 'debug'):
        print('%s   B          err(B)' % ('Fit'.ljust(19)), end=' ')
        print('         A          err(A)')
        for i in range(4):
            print('%s  %9.2e +/- %8.2e    %10.3e +/- %9.3e' \
                  %(models[1][i].ljust(16), b[i],
                    scipy.sqrt(bvar[i]), a[i], scipy.sqrt(avar[i])))
            if bootstrap is not False:
                print('%s  %9.2e +/- %8.2e    %10.3e +/- %9.3e' \
                      %('bootstrap'.ljust(16), bavg[i], sdb[i], aavg[i], sda[i]))
            print('')
        if verbose == 'debug':
            print('cov[%s] =' % models[1][j])
            print(covar_ab)

    if bootstrap is not False:
        if full_output:
            return (a[j], sda[j]), (b[j], sdb[j]), covar_ab
        else:
            return (a[j], sda[j]), (b[j], sdb[j])

    if full_output:
        return (a[j], scipy.sqrt(avar[j])), (b[j],
                                             scipy.sqrt(bvar[j])), covar_ab
    else:
        return (a[j], scipy.sqrt(avar[j])), (b[j], scipy.sqrt(bvar[j]))
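A hedged usage sketch of the bces function above, run on synthetic, made-up data (the arrays, noise level, and sample counts are purely illustrative, and the legacy scipy aliases the example relies on are assumed to be available):

import numpy as np

# Fabricated toy data: x2 = 2*x1 + 1 plus noise (illustration only)
rng = np.random.RandomState(0)
x1 = np.linspace(1.0, 10.0, 50)
x2 = 2.0 * x1 + 1.0 + rng.normal(scale=0.5, size=x1.size)
x1err = np.full(x1.size, 0.1)   # assumed measurement errors
x2err = np.full(x2.size, 0.5)

# BCES(Y|X) fit with bootstrap errors; returns (a, da), (b, db) and cov(a, b)
(a, da), (b, db), cov_ab = bces(x1, x2, x1err=x1err, x2err=x2err,
                                nsim=500, model='yx', bootstrap=500,
                                verbose='quiet')
print('intercept = %.3f +/- %.3f, slope = %.3f +/- %.3f' % (a, da, b, db))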
Beispiel #58
0
def main():
    a = [[1, 2, 3], [4, 5, 6]]

    print(median(a))
    print(corrcoef(a))
    print(cov(a))
Beispiel #59
0
"""
  Name     : c8_19_Roll_spread.py
  Book     : Python for Finance (2nd ed.)
  Publisher: Packt Publishing Ltd. 
  Author   : Yuxing Yan
  Date     : 6/6/2017
  email    : [email protected]
             [email protected]
"""

from matplotlib.finance import quotes_historical_yahoo_ochl as getData
import scipy as sp 
ticker='IBM' 
begdate=(2013,9,1) 
enddate=(2013,11,11) 
data= getData(ticker, begdate, enddate,asobject=True, adjusted=True) 
p=data.aclose 
d=sp.diff(p)
cov_=sp.cov(d[:-1],d[1:]) 
if cov_[0,1]<0: 
    print("Roll spread for ", ticker, 'is', round(2*sp.sqrt(-cov_[0,1]),3)) 
else: 
    print("Cov is positive for ",ticker, 'positive', round(cov_[0,1],3)) 

Beispiel #60
0
'''

#lets test the roll spread for IBM
#lets import the modules well be using
import yfinance as yf
import scipy as sp

#download data
data = yf.download('IBM', start='2013-9-1', end='2013-11-11')
'''
Key note: the Roll spread is appropriate for high-frequency data.
However, for purposes of demonstration we'll use historical data for IBM.

'''
#determine change in prices
returns = sp.diff(data['Adj Close'])
#find covariance matrix
covariance = sp.cov(returns[:-1], returns[1:])

if covariance[0, 1] < 0:  # covariance[0, 1] is the off-diagonal element (row 0, column 1)
    print("Roll spread for IBM is", round(2 * sp.sqrt(-covariance[0, 1]), 3))
else:
    print("Cov is positive for IBM ", round(covariance[0, 1], 3))
'''
When the estimated covariance is positive, Roll's model fails, which happens
in many real-world cases. Practitioners usually adopt one of two approaches:
either ignore those cases (or estimate the spread with another method), or
add a negative sign in front of the positive covariance, as sketched below.
'''
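A minimal sketch of that second approach (the sign flip), assuming the covariance matrix computed in the snippet above; this is illustrative and not part of the original example:

# Hypothetical fallback when the serial covariance comes out positive
serial_cov = covariance[0, 1]
if serial_cov < 0:
    spread = 2 * sp.sqrt(-serial_cov)   # standard Roll estimator
else:
    spread = 2 * sp.sqrt(serial_cov)    # sign-flipped covariance
print("Roll spread estimate for IBM:", round(spread, 3))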