Example #1
    def decompose(self, verbose=False):
        '''Perform the Singular Value Decomposition and identify the rank of the embedding subspace
        Characteristic of projection: the proportion of variance captured in the subspace'''
        X = self.X_com
        self.S = X * X.T
        self.U, self.s, self.V = linalg.svd(self.S)
        self.U, self.s, self.V = m(self.U), np.sqrt(self.s), m(self.V)
        self.d = np.linalg.matrix_rank(X)
        Vs, Xs, Ys, Zs = {}, {}, {}, {}
        for i in range(self.d):
            Zs[i] = self.s[i] * self.V[:, i]
            Vs[i] = X.T * (self.U[:, i] / self.s[i])
            Ys[i] = self.s[i] * self.U[:, i]
            Xs[i] = Ys[i] * (m(Vs[i]).T)
        self.Vs, self.Xs = Vs, Xs
        self.s_contributions = self.get_contributions(X, self.s, False)
        self.r = len(self.s_contributions[self.s_contributions > 0])
        self.r_characteristic = round(
            (self.s[:self.r]**2).sum() / (self.s**2).sum(), 4)
        self.orthonormal_base = {i: self.U[:, i] for i in range(self.r)}

        if verbose:
            msg1 = 'Rank of trajectory\t\t: {}\nDimension of projection space\t: {}'
            msg1 = msg1.format(self.d, self.r)
            msg2 = 'Characteristic of projection\t: {}'.format(
                self.r_characteristic)
            self._printer('DECOMPOSITION SUMMARY', msg1, msg2)
Example #2
def get_cos_sim(vector, matrix):
    """ This function returns the cosine similarity between a vector and all
        the vectors in a matrix.
        Arguments:
            - (ndarray (n,)) vector: the vector (unoriented)
            - (np matrix (m, n)) matrix: a matrix, each row should be a vector
        Returns:
            - (np matrix (m, 1)): cosine similarities between the input vector
                and each vector in the matrix
    """
    from numpy import matrix as m
    from numpy.linalg import norm
    # we want the cosine similarity between each row and the input
    # vector; that is, (u.v)/(|u|*|v|) for all u in matrix,
    # with v the input vector
    # therefore, we need:
    #     - the dot product of all rows with the input vector which should
    #         give us an m by 1 column vector with all the dot products,
    #         which is (X.v^t) with X the matrix, v the input vector,
    #         and ^t denoting transposition
    #     - the product of the norms of all the vectors, for which we will
    #       use np.linalg.norm, specifying the axis that yields an m by 1
    #       vector in the case of `matrix`
    # for numpy vector representation reasons, we have to transpose one
    # side of the division
    return matrix.dot(m(vector).T) / (norm(vector) * m(norm(matrix, axis=1))).T
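A minimal usage sketch (the data is made up for illustration; rows 0 and 2 of M point at least partly along v, row 1 is orthogonal to it):

import numpy as np

v = np.array([1.0, 0.0])
M = np.matrix([[1.0, 0.0],
               [0.0, 1.0],
               [1.0, 1.0]])
print(get_cos_sim(v, M))  # approximately [[1.0], [0.0], [0.7071]]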
Example #3
 def embed(self, embedding_dimension=None, suspected_frequency=None, verbose=False, return_df=False):
     '''Embed the time series with embedding_dimension window size.
     Optional: suspected_frequency changes embedding_dimension such that it is divisible by suspected frequency'''
     if not embedding_dimension:
         self.embedding_dimension = self.ts_N//2
     else:
         self.embedding_dimension = embedding_dimension
     if suspected_frequency:
         self.suspected_frequency = suspected_frequency
         self.embedding_dimension = (self.embedding_dimension//self.suspected_frequency)*self.suspected_frequency
 
     self.K = self.ts_N-self.embedding_dimension+1
     self.X = m(linalg.hankel(self.ts, np.zeros(self.embedding_dimension))).T[:,:self.K]
     self.X_df = df(self.X)
     self.X_complete = self.X_df.dropna(axis=1)
     self.X_com = m(self.X_complete.values)
     self.X_missing = self.X_df.drop(self.X_complete.columns, axis=1)
     self.X_miss = m(self.X_missing.values)
     self.trajectory_dimentions = self.X_df.shape
     self.complete_dimensions = self.X_complete.shape
     self.missing_dimensions = self.X_missing.shape
     self.no_missing = self.missing_dimensions[1]==0
         
     if verbose:
         msg1 = 'Embedding dimension\t:  {}\nTrajectory dimensions\t: {}'
         msg2 = 'Complete dimension\t: {}\nMissing dimension     \t: {}'
         msg1 = msg1.format(self.embedding_dimension, self.trajectory_dimentions)
         msg2 = msg2.format(self.complete_dimensions, self.missing_dimensions)
         self._printer('EMBEDDING SUMMARY', msg1, msg2)
     
     if return_df:
         return self.X_df
Example #4
 def forecast_recurrent(self, steps_ahead=12, singular_values=None, plot=False, return_df=False, **plotargs):
     '''Forecast from last point of original time series up to steps_ahead using recurrent methodology
     This method also fills any missing data from the original time series.'''
     try:
         self.X_com_hat
     except(AttributeError):
         self._forecast_prep(singular_values)
     self.ts_forecast = np.array(self.ts_v[0])
     for i in range(1, self.ts_N+steps_ahead):
         try:
             if np.isnan(self.ts_v[i]):
                 x = self.R.T*m(self.ts_forecast[max(0,i-self.R.shape[0]): i]).T
                 self.ts_forecast = np.append(self.ts_forecast,x[0])
             else:
                 self.ts_forecast = np.append(self.ts_forecast,self.ts_v[i])
         except(IndexError):
             x = self.R.T*m(self.ts_forecast[i-self.R.shape[0]: i]).T
             self.ts_forecast = np.append(self.ts_forecast, x[0])
     self.forecast_N = i+1
     new_index = pd.date_range(start=self.ts.index.min(),periods=self.forecast_N, freq=self.freq)
     forecast_df = df(self.ts_forecast, columns=['Forecast'], index=new_index)
     forecast_df['Original'] = np.append(self.ts_v, [np.nan]*steps_ahead)
     if plot:
         forecast_df.plot(title='Forecasted vs. original time series', **plotargs)
     if return_df:
         return forecast_df
Example #5
    def embed(
        self,
        embedding_dimension=None,
        suspected_frequency=None,
        verbose=False,
        return_df=False,
    ):
        """Embed the time series with embedding_dimension window size.
        Optional: suspected_frequency changes embedding_dimension such that it is divisible by suspected frequency"""
        if not embedding_dimension:
            self.embedding_dimension = self.ts_N // 2
        else:
            self.embedding_dimension = embedding_dimension
        if suspected_frequency:
            self.suspected_frequency = suspected_frequency
            self.embedding_dimension = (
                self.embedding_dimension // self.suspected_frequency
            ) * self.suspected_frequency

        self.K = self.ts_N - self.embedding_dimension + 1
        self.X = m(linalg.hankel(self.ts, np.zeros(self.embedding_dimension))).T[
            :, : self.K
        ]
        self.X_df = pd.DataFrame(self.X)
        self.X_complete = self.X_df.dropna(axis=1)
        self.X_com = m(self.X_complete.values)
        self.X_missing = self.X_df.drop(self.X_complete.columns, axis=1)
        self.X_miss = m(self.X_missing.values)
        self.trajectory_dimentions = self.X_df.shape
        self.complete_dimensions = self.X_complete.shape
        self.missing_dimensions = self.X_missing.shape
        self.no_missing = self.missing_dimensions[1] == 0

        if return_df:
            return self.X_df
Example #6
def get_docs_in_topic_space(model, extra_doc=None):
    """ Computes and returns the document vectors expressed as a function
        of the topics.
        Arguments:
            - (gensim.models.doc2vec.Doc2Vec) model: A doc2vec model
            - (str) extra_doc: optional. If not None, will place the extra
                document in the topic space and return it
        Returns:
            - (np.matrix) docs: Matrix with all the document vectors
                expressed as a function of the topics.
            - (np.ndarray) extra_vec: the vector for the `extra_doc` in the
                topic space. If no `extra_doc` is given, will be None.
    """
    import math
    import numpy as np

    topics = get_topic_vecs(model)
    # modifying math.exp so that it can be applied over an array
    exp = np.vectorize(math.exp)
    # shortening function name
    m = np.matrix
    ndocs = len(model.docvecs)
    # projecting documents onto topics
    doc_topic_proj = model.docvecs.vectors_docs.dot(topics.T)

    # this is a vectorized version of equation 3 in Hashimoto et al.'s
    # "Topic detection using paragraph vectors to support
    # active learning in systematic reviews", June 2016
    # instead of computing each item independently, we compute
    # the entire matrix at once
    docs_as_topics = (
        np.apply_along_axis(func1d=exp,
                            axis=0,
                            arr=doc_topic_proj) /
        m(np.ones(ndocs)).T.dot(m(exp(sum(doc_topic_proj))))
    )

    # This chunk of code is only for the case that we want to place an extra
    # document in the topic space
    new_vec_proj = None
    if extra_doc is not None:
        new_vector = model.infer_vector(extra_doc)
        # placing extra document in topic space
        new_vec_proj = (exp(new_vector.dot(topics.T)) /
                        (sum(exp(new_vector.dot(topics.T))) *
                         np.ones(len(topics))
                         )
                        )

    # here is a version that is vectorized to a lesser
    # degree (still looping on columns)
    # [exp(dv.dot(topics.T)) /
    #  (sum(exp(dv.dot(topics.T))) *
    #   np.ones(len(topics)))  # multiplying ones vector by a scalar returns a
    #                          # vector with many times the same value
    #  for dv in model.docvecs]

    return docs_as_topics, new_vec_proj
Example #7
def get_mfcc(name, path):
    b, _ = librosa.core.load(path + name, sr=SAMPLE_RATE)
    assert _ == SAMPLE_RATE
    try:
        ft1 = librosa.feature.mfcc(b, sr=SAMPLE_RATE, n_mfcc=20)
        ft2 = librosa.feature.zero_crossing_rate(b)[0]
        ft3 = librosa.feature.spectral_rolloff(b)[0]
        ft4 = librosa.feature.spectral_centroid(b)[0]
        ft5 = librosa.feature.spectral_contrast(b)[0]
        ft6 = librosa.feature.spectral_bandwidth(b)[0]
        ft1_trunc = np.hstack(
            (np.mean(ft1, axis=1), np.std(ft1, axis=1), skew(ft1, axis=1),
             np.max(ft1, axis=1), np.min(ft1, axis=1)))
        ft2_trunc = np.hstack(
            (np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.min(ft2)))
        ft3_trunc = np.hstack(
            (np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.min(ft3)))
        ft4_trunc = np.hstack(
            (np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.min(ft4)))
        ft5_trunc = np.hstack(
            (np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.min(ft5)))
        ft6_trunc = np.hstack(
            (np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.min(ft6)))
        return pd.Series(
            np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc,
                       ft6_trunc)))
    except Exception:
        print('bad file')
        return pd.Series([0] * 125)
Example #8
 def _forecast_prep(self, singular_values=None):
     self.X_com_hat = np.zeros(self.complete_dimensions)
     self.verticality_coefficient = 0
     self.forecast_orthonormal_base = {}
     if singular_values:
         try:
             for i in singular_values:
                 self.forecast_orthonormal_base[i] = self.orthonormal_base[i]
         except TypeError:
             if singular_values == 0:
                 self.forecast_orthonormal_base[0] = self.orthonormal_base[0]
             else:
                 raise ValueError(
                     "Please pass in a list/array of singular value indices to use for forecast"
                 )
     else:
         self.forecast_orthonormal_base = self.orthonormal_base
     self.R = np.zeros(self.forecast_orthonormal_base[0].shape)[:-1]
     for Pi in self.forecast_orthonormal_base.values():
         self.X_com_hat += Pi * Pi.T * self.X_com
         pi = np.ravel(Pi)[-1]
         self.verticality_coefficient += pi ** 2
         self.R += pi * Pi[:-1]
     self.R = m(self.R / (1 - self.verticality_coefficient))
     self.X_com_tilde = self.diagonal_averaging(self.X_com_hat)
Example #9
 def decompose(self, verbose=False):
     """Perform the Singular Value Decomposition and identify the rank of the embedding subspace
     Characteristic of projection: the proportion of variance captured in the subspace"""
     X = self.X_com
     self.S = X * X.T
     self.U, self.s, self.V = linalg.svd(self.S)
     self.U, self.s, self.V = m(self.U), np.sqrt(self.s), m(self.V)
     self.d = np.linalg.matrix_rank(X)
     Vs, Xs, Ys, Zs = {}, {}, {}, {}
     for i in range(self.d):
         Zs[i] = self.s[i] * self.V[:, i]
         Vs[i] = X.T * (self.U[:, i] / self.s[i])
         Ys[i] = self.s[i] * self.U[:, i]
         Xs[i] = Ys[i] * (m(Vs[i]).T)
     self.Vs, self.Xs = Vs, Xs
     self.s_contributions = self.get_contributions(X, self.s, False)
     self.r = len(self.s_contributions[self.s_contributions > 0])
     self.r_characteristic = round(
         (self.s[:self.r]**2).sum() / (self.s**2).sum(), 4)
     self.orthonormal_base = {i: self.U[:, i] for i in range(self.r)}
Example #10
    @staticmethod
    def diagonal_averaging(hankel_matrix):
        """Performs anti-diagonal averaging on the given Hankel matrix.
        Returns: pandas DataFrame containing the reconstructed series"""
        mat = m(hankel_matrix)
        L, K = mat.shape
        L_star, K_star = min(L, K), max(L, K)
        # new = np.zeros((L, K))
        if L > K:
            mat = mat.T
        ret = []

        # Diagonal Averaging
        for k in range(1 - K_star, L_star):
            mask = np.eye(K_star, k=k, dtype="bool")[::-1][:L_star, :]
            mask_n = mask.sum()
            ma = np.ma.masked_array(mat.A, mask=~mask)
            ret += [ma.sum() / mask_n]

        return pd.DataFrame(ret).rename(columns={0: "Reconstruction"})
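A small check of the averaging, using the same Hankel construction as the embed examples (the series [1, 2, 3, 4] and window size 2 are made up, and the call is written as if the function were available at module level; inside the class it is invoked as self.diagonal_averaging(...), as in Example #8):

import numpy as np
import pandas as pd
from numpy import matrix as m
from scipy import linalg

# trajectory matrix of the series [1, 2, 3, 4] with window size 2:
# [[1, 2, 3],
#  [2, 3, 4]]
H = m(linalg.hankel([1, 2, 3, 4], np.zeros(2))).T[:, :3]
print(diagonal_averaging(H))  # Reconstruction column: 1.0, 2.0, 3.0, 4.0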
Example #11
                          usecols = [0, 1], 
                          dtype = {0: 'S30', 1: 'int'}, 
                          names = ['word', document], 
                          header = None)
        
        # merge with previous ones
        y = pd.merge(y, y_i, on = 'word', how = 'outer')

        # kill NaNs
        y = y.fillna(0)

# choose prior
print()
priorChoice = int(input('Uninformative (1) or informative (2) prior? '))
if priorChoice == 1:
    alpha_i = m.transpose(m([0.01] * len(y)))
elif priorChoice == 2:
    priors = pd.read_csv(rpath + 'corpus.csv', # load global frequencies
                         usecols = [0, 1], 
                         names = ['word', 'gfreq'], 
                         header = None)
    y = pd.merge(y, priors, on = 'word', how = 'left') # merge w/ y
    y = y.fillna(y['gfreq'].min()) # replace missing by argmin(alphas)
    alpha_i = m.transpose(m(y.gfreq)) # extract alphas
    del y['gfreq'] # clean up y
else:
    sys.exit('Invalid choice')

# estimate p_i
yword = m.transpose(m(np.hstack((['word'], np.array(y.word))))) # word list
y_i = m(y.iloc[:, 1:])
Example #12
#                 load(m)
#             pos = queue.pop()

#         while pos:
#             solution.appendleft(pos)
#             pos = trail[pos.canonical()]

#         return list(solution)

#####

from numpy import matrix as m

grid = m([[5, 3, 0, 0, 7, 0, 0, 0, 0], [6, 0, 0, 1, 9, 5, 0, 0, 0],
          [0, 9, 8, 0, 0, 0, 0, 6, 0], [8, 0, 0, 0, 6, 0, 0, 0, 3],
          [4, 0, 0, 8, 0, 3, 0, 0, 1], [7, 0, 0, 0, 2, 0, 0, 0, 6],
          [0, 6, 0, 0, 0, 0, 2, 8, 0], [0, 0, 0, 4, 1, 9, 0, 0, 5],
          [0, 0, 0, 0, 8, 0, 0, 7, 9]])


def possible(row, col, n):
    for i in range(9):
        if grid[row, i] == n or grid[i, col] == n:
            return False
    row0 = row // 3 * 3
    col0 = col // 3 * 3
    for i in range(3):
        for j in range(3):
            if grid[row0 + i, col0 + j] == n:
                return False
    return True
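A quick sanity check against the grid above:

print(possible(0, 2, 4))  # True: no 4 in row 0, in column 2, or in the top-left box
print(possible(0, 2, 6))  # False: the top-left box already contains a 6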
Example #13
from numpy import *            # assumed import: the snippet uses numpy's namespace
from numpy import matrix as m  # (set_printoptions, r_, random, ...) and m directly

set_printoptions(precision=4, threshold=None, edgeitems=None, linewidth=100, suppress=1, nanstr=None, infstr=None, formatter=None)

# if the modulo size is, say, 1.5, values between -1.5 and 1.5 are left
# unchanged, while 1.6 wraps around to -1.4
def mod(num,modulo_size):
#    return multiply(m(sign(num)),m(num)%modulo_size)
    return m((m(num)+modulo_size)%(2*modulo_size)-modulo_size)
def quantizer(left,right,options):
    delta=1.0*(right-left)/options
    return r_[left+delta/2:right:delta]
def quantizise(numbers,quants):
    return m([min(quants, key=lambda x:abs(x-number)) for number in numbers.A1]).reshape(numbers.shape)

mod_size=1.5

y=random.uniform(-1.5,1.5,9).tolist()
x=m([i+0.1+random.normal(0,0.1) for i in y]).tolist()

nx=mod(x,mod_size)
ny=mod(y,mod_size)


if 0:
    q = quantizer(-mod_size, mod_size, 70)
    nx = quantizise(nx, q)
    ny = quantizise(ny, q)

#A=m([[1,-1],[-2,1]])
#c=concatenate((nx,ny))
#d=c.T*A
#print A
#print d*A.I
Example #14
def mod(num,modulo_size):
#    return multiply(m(sign(num)),m(num)%modulo_size)
    return m((m(num)+modulo_size)%(2*modulo_size)-modulo_size)
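A short demonstration of the wrap-around described in Example #13 (assuming m is numpy.matrix, as there):

from numpy import matrix as m

print(mod([1.4, 1.6, -1.6], 1.5))
# [[ 1.4 -1.4  1.4]] -- values inside (-1.5, 1.5) pass through unchanged,
# values outside wrap around to the other end of the range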
Example #15
def quantizise(numbers,quants):
    return m([min(quants, key=lambda x:abs(x-number)) for number in numbers.A1]).reshape(numbers.shape)
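Each entry of the input matrix is snapped to the nearest value in quants; a tiny illustration with arbitrarily chosen levels:

from numpy import matrix as m

print(quantizise(m([[-1.2, 0.1, 0.9]]), [-1.0, 0.0, 1.0]))
# [[-1.  0.  1.]]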
Example #16
def matrix_addition(a, b):
    return (m(a) + m(b)).tolist()
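Usage is straightforward (assuming the from numpy import matrix as m alias used throughout these examples):

from numpy import matrix as m

print(matrix_addition([[1, 2], [3, 4]], [[5, 6], [7, 8]]))
# [[6, 8], [10, 12]]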
Example #17
 def __init__(self, y, X, k = 0, nocons = False, vce = "ROBUST", cluster = None):
     y, X = clearNaN(y, X)
     self.depname = y.name
     self.nocons = nocons
     self.X0 = X
     self.k = k
     try:
         self.klength = len(self.k)
         print("Using separate Ridge parameters for each eigenvalue")
     except TypeError:
         self.klength = 1
         print("Using a constant Ridge parameter for all eigenvalues")
     self.n = len(y)
     
     if nocons == False: 
         cons = pd.Series(np.ones(self.n), index = X.index, name = "Cons")
         X = pd.concat([X, cons], axis = 1)
     self.l = X.shape[1]
     self.dep = np.array(y.values, dtype = float)
     self.X = X.values
     self.Xt = t(self.X)
     self.varlist = X.columns
     if self.klength == 1:
         self.VarX = inv(self.Xt @ self.X + self.k * np.identity(self.l))
     elif self.klength > 1:
         self.lam, self.vec = eigh(self.Xt @ self.X)
         idx = self.lam.argsort()[::-1]
         self.vec = self.vec[:,idx]
         self.lam = self.lam[idx]
         self.lam1 = self.lam + self.k
         self.D = self.lam1 * np.identity(self.l)
         self.VarX = self.vec @ inv(self.D) @ self.vec.transpose()
     self.VarXOLS = inv(self.Xt @ self.X)
     self.Px = self.X @ self.VarX @ self.Xt
     self.Mx = np.identity(self.n) - self.Px
     self.CovXy = self.Xt @ self.dep
     self.b = self.VarX @ self.CovXy
     self.bOLS = self.VarXOLS @ self.CovXy
     self.df = np.trace(self.Mx)
     self.u_hat = self.dep - m(self.X, self.b)
     self.u1 = self.u_hat.reshape(self.n, 1)
     self.u2 = self.u_hat**2
     self.SSR = t(self.u_hat) @ self.u_hat
     self.SE = self.SSR/float(self.df)
     self.Varb = self.SE * (self.VarX @ self.Xt @ self.X @ self.VarX) #default
     if vce is None:
         vce = ""
     if vce.upper() == "ROBUST":
         self.u2 = self.u2*self.n/self.df
         self.ohm = np.zeros([self.n,self.n])
         for i in range(self.n):
            self.ohm[i][i] = self.u2[i]
         self.XOX = self.Xt @ self.ohm @ self.X
         self.Varb = self.VarX @ self.XOX @ self.VarX
     if vce.upper() == "HC2":
         self.u2 = self.u2/np.diag(self.Mx)
         self.ohm = np.zeros([self.n,self.n])
         for i in range(self.n):
            self.ohm[i][i] = self.u2[i]
         self.XOX = self.Xt @ self.ohm @ self.X
         if np.all(cluster) != None:
             for i in range(self.l):
                 for j in range(self.l):
                     if cluster.iloc[i] != cluster.iloc[j]:
                         self.XOX[i][j] = 0
         self.Varb = self.VarX @ self.XOX @ self.VarX
     if np.all(np.any(cluster) != None):
         if np.all(np.all(cluster) != None):
             print("Cluster ID Retrieved!")
             try:
                 clsize = cluster.shape[1] 
             except: 
                 clsize = 1
             if clsize == 1:
                 ncl = len(np.unique(cluster))
                 self.o1 = self.u1 @ t(self.u1)*ncl/(ncl-1)*(self.n-1)/self.df
                 self.ohm = np.zeros(self.o1.shape) 
                 for i in range(self.n):
                     for j in range(self.n):
                         if np.all(cluster.iloc[i] == cluster.iloc[j]):
                             self.ohm[i][j] = self.o1[i][j]
             elif clsize == 2:
                 print("Twoway clustering: ", cluster.columns)
                 ncl1 = len(np.unique(cluster.iloc[:,0]))
                 ncl2 = len(np.unique(cluster.iloc[:,1]))
                 ncl12 = len(np.unique(cluster, axis = 0))
                 self.o1 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl1/(ncl1-1)
                 self.o2 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl2/(ncl2-1)
                 self.o12 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl12/(ncl12-1)
                 #reghdfe in STATA did not adjust for cluster df properly
                 #This version is a better adjustment to cluster df
                 self.ohm1 = np.zeros(self.o1.shape)
                 self.ohm2 = np.zeros(self.o2.shape)
                 self.ohm12 = np.zeros(self.o12.shape)                    
                 print("Retrieving First VCE")
                 for i in range(self.n):
                     for j in range(self.n):
                         if cluster.iloc[i,0] == cluster.iloc[j,0]:
                             self.ohm1[i][j] = self.o1[i][j]
                 print("Retrieving Second VCE")
                 for i in range(self.n):
                     for j in range(self.n):
                         if cluster.iloc[i,1] == cluster.iloc[j,1]:
                             self.ohm2[i][j] = self.o2[i][j]
                 print("Retrieving Third VCE")
                 if ncl12 == self.n:
                     d1 = np.diag(self.o12)
                     self.ohm12 = d1 * np.identity(self.n)
                 else:
                     for i in range(self.n):
                         for j in range(self.n):
                             if np.any(cluster.iloc[i] == cluster.iloc[j]):
                                 self.ohm12[i][j] = self.o12[i][j]
                 self.ohm = self.ohm1 + self.ohm2 - self.ohm12
             else: 
                 print("Supports up to two way clustering only!")
         else:
             print("Incomplete Cluster ID!")
             print("HC1 Assumed")
             self.u2 = self.u2*self.n/self.df
             self.ohm = np.zeros([self.n,self.n])
             for i in range(self.n):
                 self.ohm[i][i] = self.u2[i]
         self.XOX = self.Xt @ self.ohm @ self.X
         self.Varb = self.VarX @ self.XOX @ self.VarX
         self.Varb1 = self.Varb
         if np.any(np.diag(self.Varb) < 0):
             print("Non Positive Semi-Definite VCE Matrix! Cameron, Gelbach & Miller (2011) Transformation Used")
             lb, vb = eigh(self.Varb)
             idx = lb.argsort()[::-1]
             vb = vb[:,idx]
             lb = lb[idx]
             for i in range(len(lb)):
                 lb[i] = max(0, lb[i])
             diag = lb * np.identity(len(lb))
             self.Varb = vb @ diag @ t(vb)
     self.SEb = np.sqrt(np.diag(self.Varb))
     if nocons == False:
         self.ypred = m(self.X, self.b) - np.mean(self.dep)
         self.ESS = t(self.ypred) @ self.ypred
     if nocons == True:
         self.ESS = t(m(self.X, self.b)) @ m(self.X, self.b)
     self.TSS = self.SSR + self.ESS
     self.R2 = self.ESS/self.TSS
     self.AR2 = 1 - (1-self.R2)*float(self.n-1)/float(self.df)
     self.ts = np.zeros(len(self.b))
     self.pvalue = np.zeros(len(self.b))
     for j in range(len(self.b)):
         self.ts[j] = self.b[j]/self.SEb[j]
         self.pvalue[j] = 2*ss.t.cdf(-abs(self.ts[j]), self.df)
     self.Mx = (np.identity(self.n) - self.Px)
     self.SSR1 = t(self.dep) @ self.Mx @ self.dep
Example #18
import numpy as np
from numpy import array as a
from numpy import matrix as m
from numpy import mat
'''
Problem set 1
Do problems:

23 and 28 from section 1.2
'''

A = mat('[1 2 3;2 5 2;6 -3 1]')
A * m([1, 1, 1]).T

A = mat('1 1 1;1 1 1;1 1 0')
v = mat('4 5 6').T
A * v

I = np.identity(3)
I
I * v

A = mat('1 2 3;4 5 6;7 8 9')
A[:, 0]

A = mat('1 2 3; 2 5 2;6 -3 1')
x = mat('0;0;2')
b = A * x

m([A[0, :] * x, A[1, :] * x, A[2, :] * x])
Example #19
 def __init__(self, y, X, nocons = False, vce = "ROBUST", cluster = None, gls = False):
     y, X = clearNaN(y, X) # get rid of null obs
     self.gls = gls
     self.depname = y.name 
     self.nocons = nocons 
     self.n = len(y) 
     if nocons == False: 
         cons = pd.Series(np.ones(self.n), index = X.index, name = "Cons")
         X = pd.concat([X, cons], axis = 1)
     self.l = X.shape[1]
     self.dep = np.array(y.values, dtype = float)
     self.X = X.values
     self.Xt = t(self.X)
     self.varlist = X.columns
     self.VarX = inv(self.Xt @ self.X) # the main inverse
     self.Px = self.X @ self.VarX @ self.Xt # Px matrix from Davidson Mackinnon
     self.Mx = np.identity(self.n) - self.Px # Mx matrix ""
     self.CovXy = self.Xt @ self.dep # Xty
     self.b = self.VarX @ self.CovXy # combining the inverse and the Xty
     self.df = np.trace(self.Mx) # degrees of freedom = trace(Mx)
     self.u_hat = self.dep - m(self.X, self.b) # get residuals
     self.u1 = self.u_hat.reshape(len(self.u_hat), 1) # data organisation
     self.u2 = self.u_hat**2 # get squared residuals 
     self.SSR = t(self.u_hat) @ self.u_hat # get SSR
     self.SE = self.SSR/float(self.df) # get sigma_u estimate
     self.Varb = self.SE * self.VarX #default 
     #Heteroskedasticity/Clustered/Serial Correlation works below 
     if vce is None:
         vce = ""
     if vce.upper() == "ROBUST":
         self.u2 = self.u2*self.n/self.df
         self.ohm = np.zeros([self.n,self.n])
         for i in range(self.n):
            self.ohm[i][i] = self.u2[i]
         self.XOX = self.Xt @ self.ohm @ self.X
         self.Varb = self.VarX @ self.XOX @ self.VarX
     if vce.upper() == "HC2": 
         self.u2 = self.u2/np.diag(self.Mx)
         self.ohm = np.zeros([self.n,self.n])
         for i in range(self.n):
            self.ohm[i][i] = self.u2[i]
         self.XOX = self.Xt @ self.ohm @ self.X
         self.Varb = self.VarX @ self.XOX @ self.VarX
     if np.all(np.any(cluster) != None):
         if np.all(np.all(cluster) != None):
             print("Cluster ID Retrieved!")
             try:
                 clsize = cluster.shape[1] 
             except: 
                 clsize = 1
             if clsize == 1:
                 ncl = len(np.unique(cluster))
                 self.o1 = self.u1 @ t(self.u1)*ncl/(ncl-1)*(self.n-1)/self.df
                 self.ohm = np.zeros(self.o1.shape)
                 for i in range(self.n):
                     for j in range(self.n):
                         if np.all(cluster.iloc[i] == cluster.iloc[j]):
                             self.ohm[i][j] = self.o1[i][j]
             elif clsize == 2:
                 print("Twoway clustering: ", cluster.columns)
                 ncl1 = len(np.unique(cluster.iloc[:,0]))
                 ncl2 = len(np.unique(cluster.iloc[:,1]))
                 ncl12 = len(np.unique(cluster, axis = 0))
                 self.o1 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl1/(ncl1-1)
                 self.o2 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl2/(ncl2-1)
                 self.o12 = self.u1 @ t(self.u1)*(self.n-1)/self.df*ncl12/(ncl12-1)
                 self.ohm1 = np.zeros(self.o1.shape)
                 self.ohm2 = np.zeros(self.o2.shape)
                 self.ohm12 = np.zeros(self.o12.shape)
                 print("Retrieving First VCE")
                 for i in range(self.n):
                     for j in range(self.n):
                         if cluster.iloc[i,0] == cluster.iloc[j,0]:
                             self.ohm1[i][j] = self.o1[i][j]
                 print("Retrieving Second VCE")
                 for i in range(self.n):
                     for j in range(self.n):
                         if cluster.iloc[i,1] == cluster.iloc[j,1]:
                             self.ohm2[i][j] = self.o2[i][j]
                 print("Retrieving Third VCE")
                 if ncl12 == self.n:
                     d1 = np.diag(self.o12)
                     self.ohm12 = d1 * np.identity(self.n)
                 else:
                     for i in range(self.n):
                         for j in range(self.n):
                             if np.any(cluster.iloc[i] == cluster.iloc[j]):
                                 self.ohm12[i][j] = self.o12[i][j]
                 self.ohm = self.ohm1 + self.ohm2 - self.ohm12
             else: 
                 print("Supports up to two way clustering only!")
         else:
             print("Incomplete Cluster ID!")
             print("HC1 Assumed")
             self.u2 = self.u2*self.n/self.df
             self.ohm = np.zeros([self.n,self.n])
             for i in range(self.n):
                 self.ohm[i][i] = self.u2[i]
         self.XOX = self.Xt @ self.ohm @ self.X
         self.Varb = self.VarX @ self.XOX @ self.VarX
         self.Varb1 = self.Varb
         if np.any(np.diag(self.Varb) < 0):
             print("Non Positive Semi-Definite VCE Matrix! Cameron, Gelbach & Miller (2011) Transformation Used")
             lb, vb = eigh(self.Varb)
             idx = lb.argsort()[::-1]
             vb = vb[:,idx]
             lb = lb[idx]
             for i in range(len(lb)):
                 lb[i] = max(0, lb[i])
             diag = lb * np.identity(len(lb))
             self.Varb = vb @ diag @ t(vb)
     if self.gls == True: 
         print("One step GLS")
         self.VarX = inv(self.XOX)
         self.CovXy = self.Xt @ self.ohm @ self.dep
         self.b = self.VarX @ self.CovXy
     self.SEb = np.sqrt(np.diag(self.Varb))
     if nocons == False:
         self.ypred = m(self.X, self.b) - np.mean(self.dep)
         self.ESS = t(self.ypred) @ self.ypred
     if nocons == True:
         self.ESS = t(m(self.X, self.b)) @ m(self.X, self.b)
     self.TSS = self.SSR + self.ESS
     self.R2 = self.ESS/self.TSS
     self.AR2 = 1 - (1-self.R2)*float(self.n-1)/float(self.df)
     self.ts = np.zeros(len(self.b))
     self.pvalue = np.zeros(len(self.b))
     for j in range(len(self.b)):
         self.ts[j] = self.b[j]/self.SEb[j]
         self.pvalue[j] = 2*ss.t.cdf(-abs(self.ts[j]), self.df)
     self.Mx = (np.identity(self.n) - self.Px)
     self.SSR1 = t(self.dep) @ self.Mx @ self.dep
     self.settest()
Example #20
import numpy as np
from numpy.linalg import inv
from numpy import matmul as m

X = np.random.standard_normal((20, 20))
y = 10 * (np.random.random(20))
y = np.int32(y)
Xt = X.transpose()

# ordinary least squares via the normal equations: theta = (X'X)^{-1} X'y
theta = m(m(inv(m(Xt, X)), Xt), y)

print("Matrix y: ", y)
print("Matrix x: ", X)
print("Matrix Theta: ", theta)