Example #1
0
    def __init__(self, folder, n, m, pad, detailedSave=False):
        """Attach to MPI.COMM_WORLD, look for a checkpoint and prepare outputs.

        folder       -- name used under "processedData/" and "results/".
        n, m         -- stored as self.n / self.m; the zero-filled
                        "results/<folder>_uc.npy" array has m * n rows.
        pad          -- stored as self.pad; numOfBlocks is n * (1 + pad).
        detailedSave -- stored as self.detailedSave.

        self.kCheckpoint becomes the highest step k whose checkpoint
        directory holds exactly 2 * numOfBlocks files, or 0 if none does.
        Directories and files are only ever created by rank 0.
        """
        comm = MPI.COMM_WORLD
        self.comm = comm
        self.size = comm.Get_size()
        self.rank = comm.Get_rank()
        self.n = n
        self.m = m
        self.pad = pad
        self.folder = folder
        self.blocks = Blocks()

        self.detailedSave = detailedSave
        self.numOfBlocks = n * (1 + pad)

        isRoot = self.rank == 0
        found = 0  # 0 = no checkpoint

        if not os.path.exists("processedData/" + folder + "/checkpoint"):
            if isRoot:
                os.makedirs("processedData/{0}/checkpoint/".format(folder))
        else:
            # Walk the steps from the last one down to 1 and stop at the
            # first directory that contains a complete set of files.
            for k in range(n * (1 + self.pad) - 1, 0, -1):
                stepDir = "processedData/{0}/checkpoint/{1}/".format(folder, k)
                if not os.path.exists(stepDir):
                    continue
                files = next(os.walk(stepDir))[2]
                if len(files) != 2 * self.numOfBlocks:
                    continue
                found = k
                if isRoot:
                    print("Using Checkpoint #{0}".format(k))
                break
        self.kCheckpoint = found

        for outDir in ("results", "results/{0}".format(folder)):
            if not os.path.exists(outDir) and isRoot:
                os.makedirs(outDir)

        ucPath = "results/{0}".format(folder + "_uc.npy")
        if isRoot and not os.path.exists(ucPath):
            np.save(ucPath, np.zeros((m * n, 1), dtype=complex))

        ## The broadcast from rank 0 keeps the other ranks from continuing
        ## before rank 0 has finished creating files and directories.
        self.comm.bcast(False, root=0)
    def __init__(self, folder, n,m, pad):
        """Attach to MPI.COMM_WORLD, look for a checkpoint and prepare outputs.

        folder -- name used under "processedData/" and "results/".
        n, m   -- stored as self.n / self.m; the zero-filled
                  "results/<folder>_uc.npy" array has m * n rows.
        pad    -- stored as self.pad; numOfBlocks is n * (1 + pad).

        self.kCheckpoint becomes the highest step k whose checkpoint
        directory holds exactly 2 * numOfBlocks files, or 0 if none does.
        Directories and files are only ever created by rank 0.
        """
        world = MPI.COMM_WORLD
        self.comm = world
        self.size = world.Get_size()
        self.rank = world.Get_rank()
        self.n = n
        self.m = m
        self.pad = pad
        self.folder = folder
        self.blocks = Blocks()

        self.numOfBlocks = n*(1 + pad)

        onRoot = self.rank == 0
        latest = 0  # 0 = no checkpoint

        if not os.path.exists("processedData/" + folder + "/checkpoint"):
            if onRoot:
                os.makedirs("processedData/{0}/checkpoint/".format(folder))
        else:
            # Newest step first; the first complete directory wins.
            for k in range(n*(1 + self.pad) - 1, 0, -1):
                candidate = "processedData/{0}/checkpoint/{1}/".format(folder, k)
                if not os.path.exists(candidate):
                    continue
                fileNames = next(os.walk(candidate))[2]
                if len(fileNames) != 2*self.numOfBlocks:
                    continue
                latest = k
                if onRoot:
                    print("Using Checkpoint #{0}".format(k))
                break
        self.kCheckpoint = latest

        for target in ("results", "results/{0}".format(folder)):
            if not os.path.exists(target) and onRoot:
                os.makedirs(target)

        ucFile = "results/{0}".format(folder + "_uc.npy")
        if onRoot and not os.path.exists(ucFile):
            np.save(ucFile, np.zeros((m*n, 1), dtype=complex))

        ## The broadcast from rank 0 keeps the other ranks from continuing
        ## before rank 0 has finished creating files and directories.
        self.comm.bcast(False, root=0)
class ToeplitzFactorizor:
    
    def __init__(self, folder, n,m, pad, detailedSave = False):
        """Attach to MPI.COMM_WORLD, look for a checkpoint and prepare outputs.

        folder       -- name used under "processedData/" and "results/".
        n, m         -- stored as self.n / self.m; the zero-filled
                        "results/<folder>_uc.npy" array has m * n rows.
        pad          -- stored as self.pad; numOfBlocks is n * (1 + pad).
        detailedSave -- stored as self.detailedSave.

        self.kCheckpoint becomes the highest step k whose checkpoint
        directory holds exactly 2 * numOfBlocks files, or 0 if none does.
        Directories and files are only ever created by rank 0.
        """
        comm = MPI.COMM_WORLD
        self.comm = comm
        self.size = comm.Get_size()
        self.rank = comm.Get_rank()
        self.n = n
        self.m = m
        self.pad = pad
        self.folder = folder
        self.blocks = Blocks()

        self.detailedSave = detailedSave
        self.numOfBlocks = n*(1 + pad)

        isRoot = self.rank == 0
        resumeStep = 0  # 0 = no checkpoint

        if not os.path.exists("processedData/" + folder + "/checkpoint"):
            if isRoot:
                os.makedirs("processedData/{0}/checkpoint/".format(folder))
        else:
            # Scan from the last step down to 1; the first directory with a
            # complete set of files is the one to resume from.
            for k in range(n*(1 + self.pad) - 1, 0, -1):
                stepDir = "processedData/{0}/checkpoint/{1}/".format(folder, k)
                if not os.path.exists(stepDir):
                    continue
                stepFiles = next(os.walk(stepDir))[2]
                if len(stepFiles) != 2*self.numOfBlocks:
                    continue
                resumeStep = k
                if isRoot:
                    print("Using Checkpoint #{0}".format(k))
                break
        self.kCheckpoint = resumeStep

        for outDir in ("results", "results/{0}".format(folder)):
            if not os.path.exists(outDir) and isRoot:
                os.makedirs(outDir)

        ucPath = "results/{0}".format(folder + "_uc.npy")
        if isRoot and not os.path.exists(ucPath):
            np.save(ucPath, np.zeros((m*n, 1), dtype=complex))

        ## The broadcast from rank 0 keeps the other ranks from continuing
        ## before rank 0 has finished creating files and directories.
        self.comm.bcast(False, root=0)
        
        
    def addBlock(self, rank):
        """Create the Block for `rank`, fill it with data and register it.

        When resuming (self.kCheckpoint != 0) A1 and A2 are loaded from the
        checkpoint directory.  Otherwise a block with rank >= self.n gets an
        m x m zero matrix via createA, and any other block gets the matrix
        stored in "processedData/<folder>/<rank>.npy" via setT.
        """
        block = Block(rank)
        step = self.kCheckpoint
        if step == 0:
            if rank < self.n:
                block.setT(
                    np.load("processedData/{0}/{1}.npy".format(self.folder, rank)))
            else:
                size = self.m
                block.createA(np.zeros((size, size), complex))
        else:
            template = "processedData/{0}/checkpoint/{1}/{2}{3}.npy"
            savedA1 = np.load(template.format(self.folder, step, rank, "A1"))
            savedA2 = np.load(template.format(self.folder, step, rank, "A2"))
            block.setA1(savedA1)
            block.setA2(savedA2)
        block.setName("results/{0}_uc.npy".format(self.folder))
        self.blocks.addBlock(block)

    def fact(self, method, p):
        """Run the factorization loop, checkpointing when time runs short.

        method -- one of SEQ, WY1, WY2, YTY1, YTY2.
        p      -- panel width handed to __block_reduc; must be >= 1 unless
                  method is SEQ.

        Raises InvalidMethodException for an unknown method and
        InvalidPException for p < 1 with a non-SEQ method.

        Steps run from self.kCheckpoint + 1 up to n * (1 + pad) - 1.  After
        every step rank 0 appends the step duration to the module-level
        `timePerLoop` and, if the elapsed time plus the longest step so far
        reaches MAXTIME, every rank saves A1/A2 of its blocks into the
        checkpoint directory for that step and calls exit().
        """
        if method not in np.array([SEQ, WY1, WY2, YTY1, YTY2]):
            raise InvalidMethodException(method)
        if p < 1 and method != SEQ:
            raise InvalidPException(p)

        pad = self.pad
        m = self.m
        n = self.n
        folder = self.folder

        if self.kCheckpoint == 0:
            self.__setup_gen()

            for b in self.blocks:
                if not pad and b.rank == n*(1 + pad) - 1:
                    b.updateuc(b.rank)

        if self.detailedSave:
            for b in self.blocks:
                np.save("results/{0}/L_{1}-{2}.npy".format(folder, 0, b.rank), b.getA1())

        for k in range(self.kCheckpoint + 1, n*(1 + pad)):
            self.k = k
            if self.rank == 1:
                print("Loop {0}".format(k))
            ##Build generator at step k [A1(:e1, :) A2(s2:e2, :)]
            s1, e1, s2, e2 = self.__set_curr_gen(k, n)
            if method == SEQ:
                self.__seq_reduc(s1, e1, s2, e2)
            else:
                self.__block_reduc(s1, e1, s2, e2, m, p, method, k)

            ##Save results immediately if we reached the end of the loop
            for b in self.blocks:
                if b.rank <= e1 and b.rank + k == n*(1 + pad) - 1:
                    b.updateuc(k % self.n)
                if b.rank <= e1 and self.detailedSave:
                    np.save("results/{0}/L_{1}-{2}.npy".format(folder, k, b.rank + k), b.getA1())

            ##CheckPoint
            saveCheckpoint = False
            if self.rank == 0:
                timePerLoop.append(time() - sum(timePerLoop) - startTime)

                elapsedTime = time() - startTime
                ##Max instead of np.mean, just to be safe
                if elapsedTime + max(timePerLoop) >= MAXTIME:
                    print("Saving Checkpoint #{0}".format(k))
                    checkpointDir = "processedData/{0}/checkpoint/{1}/".format(folder, k)
                    if not os.path.exists(checkpointDir):
                        try:
                            os.makedirs(checkpointDir)
                        except OSError:
                            # Best effort: a real failure surfaces in the
                            # np.save calls below.  Only OSError is ignored so
                            # that interrupts are no longer swallowed.
                            pass
                    saveCheckpoint = True
            # Every rank learns rank 0's decision.
            saveCheckpoint = self.comm.bcast(saveCheckpoint, root=0)

            if saveCheckpoint:
                for b in self.blocks:
                    ##Creating Checkpoint
                    np.save("processedData/{0}/checkpoint/{1}/{2}A1.npy".format(folder, k, b.rank), b.getA1())
                    np.save("processedData/{0}/checkpoint/{1}/{2}A2.npy".format(folder, k, b.rank), b.getA2())
                exit()
                    
            

    ##Private Methods
    def __setup_gen(self):
        n = self.n
        m = self.m
        pad = self.pad
        A1 = np.zeros((m, m),complex)
        A2 = np.zeros((m, m), complex)
        cinv = None
        
        ##The root rank will compute the cholesky decomposition
        if self.blocks.hasRank(0) :
#            print self.blocks.getBlock(0).getT(), 'getBlock'
            c = cholesky(self.blocks.getBlock(0).getT())
            c = np.conj(c.T)
            cinv = inv(c)
#            print cinv, 'cinv'
        cinv = self.comm.bcast(cinv, root=0)
        for b in self.blocks:
            if b.rank < self.n:
                b.createA(b.getT().dot(cinv))
                print 'A1', A1.shape, b.rank
                print 'A2', A2.shape, b.rank
            
        ##We are done with T. We shouldn't ever have a reason to use it again
        for b in self.blocks:
            b.deleteT()
        
        return A1, A2

    def __set_curr_gen(self, k, n):
        s1 = 0
        e1 = min(n, (n*(1 + self.pad) - k)) -1
        s2 = k
        e2 = e1 + s2
        
        for b in self.blocks:
            if s1 <= b.rank <=e1:
                b.setWork1(b.rank + k)
            else:
                b.setWork1(None)
            if e2 >= b.rank >= s2:
                b.setWork2(b.rank - k)
            else:
                b.setWork2(None)
        return s1, e1, s2, e2
    
    def __temp_Comm(self, k, n, b):
        """Broadcast b's temp buffer among the ranks active at step k.

        The same [s1, e1] / [s2, e2] ranges as in __set_curr_gen are
        recomputed; a temporary sub-communicator containing only the ranks
        of self.comm that fall inside either range is created, and
        ``Bcast(b.getTemp(), root=0)`` is performed on it.

        k -- step index (start of the second range).
        n -- block count used for the range computation.
        b -- block whose getTemp() buffer is broadcast in place.

        Returns the sorted array of participating ranks.

        Communicator creation is a collective call, so every process of
        self.comm has to enter this method.
        """
        s1 = 0
        e1 = min(n, (n*(1 + self.pad) - k)) - 1
        s2 = k
        e2 = e1 + s2

        # Separate name so the block-count parameter `n` is not shadowed.
        ranks = np.arange(0, self.size)

        # find processes that are needed
        # Exactly two conditions are combined per range: a third positional
        # argument to np.logical_and is its `out` buffer, not an operand.
        # NOTE(review): process ranks are compared directly against block
        # index ranges here, whereas the send/recv code maps block indices
        # to processes with "% self.size" -- confirm this is intended.
        inFirst = np.flatnonzero(np.logical_and(ranks >= s1, ranks <= e1))
        inSecond = np.flatnonzero(np.logical_and(ranks >= s2, ranks <= e2))
        union = np.union1d(inFirst, inSecond)

        exclusion = np.setxor1d(ranks, union) # find processes that are not needed
        newrank = np.arange(0, union.size)

        # making a new sub communicator between the processes that are needed
        group = self.comm.Get_group()
        newgroup = group.Excl(exclusion)
        newcomm = self.comm.Create(newgroup)

        if self.rank in exclusion:
            assert newcomm == MPI.COMM_NULL
        else:
            # Sanity-check the size and rank numbering of the new communicator
            # before using it.
            assert newcomm.size == self.size - exclusion.size
            assert newcomm.rank == newrank[np.where(self.rank == union)][0]
            newcomm.Bcast(b.getTemp(), root=0)

        group.Free(); newgroup.Free()
        if newcomm: newcomm.Free()
        return union

    def __block_reduc(self, s1, e1, s2, e2, m, p, method, k):
        """Reduce the generator at block offset s2 in panels of p columns.

        For each panel starting at column sb1 (stepping by p up to m):
          1. work is reset so that only block 0 and block s2 are paired;
          2. one Householder vector per panel column is produced by
             __house_vec and applied right away with __seq_update;
          3. the collected vectors (last column holds beta) are stored in the
             temp buffer of the last block iterated and, unless that block's
             condition flag is set, shared through __temp_Comm;
          4. the vectors are aggregated into S by __aggregate;
          5. work is restored with __set_curr_gen and the remaining blocks
             are updated by __block_update.

        s1 and k are accepted but not used here.
        """
        n = self.n

        for sb1 in range (0, m, p):

            # After this loop `b` stays bound to the last block of
            # self.blocks and is used as the carrier of the temp buffer.
            for b in self.blocks:
                b.setWork(None, None)
                if b.rank==0: b.setWork1(s2)
                if b.rank==s2: b.setWork2(0)

            sb2 = s2*m + sb1
            eb1 = min(sb1 + p, m) #next j
            eb2 = s2*m + eb1
            u1 = eb1
            u2 = eb2
            p_eff = min(p, m - sb1)

            # One row per Householder vector: m entries plus beta.
            temp = XX2 = np.zeros((p_eff, m+1), complex)
            if method == WY1 or method == WY2:
                S = np.array([np.zeros((m,p)),np.zeros((m,p))], complex)
            elif method == YTY1 or method == YTY2:
                # Both operands are compared against `method`; a bare
                # "or YTY2" would only test the truthiness of the constant.
                S = np.zeros((p, p), complex)

            for j in range(0, p_eff):
                j1 = sb1 + j
                j2 = sb2 + j
                data= self.__house_vec(j1, s2, j, b) ##s2 or sb2?
                temp[j] = data
                X2 = data[:self.m]
                beta = data[-1]
                self.__seq_update(X2, beta, eb1, eb2, s2, j1, m, n)

            b.createTemp(np.zeros((p_eff, m+1), complex))
            b.setTemp(temp)
            if b.getCond()[0]:
                # NOTE(review): `a` is not assigned on this path; on the first
                # panel the __aggregate call below would raise NameError and
                # on later panels a stale value is reused -- confirm intent.
                pass
            else:
                a = self.__temp_Comm(s2, n, b)

            temp = b.getTemp()
            XX2 = temp[:,:m]
            for j in range(0, p_eff):
                beta = temp[j][-1]
                j1 = sb1 + j
                j2 = sb2 + j
                S = self.__aggregate(S, XX2, beta, p, j, j1, j2, p_eff, method, a)
            self.__set_curr_gen(s2, n) ## Updates work
            self.__block_update(XX2, sb1, eb1, u1, e1, s2,  sb2, eb2, u2, e2, S, method)

        return
    
    def __block_update(self, X2, sb1, eb1, u1, e1,s2, sb2, eb2, u2, e2, S, method):
        """Apply the aggregated panel transformation to the paired blocks.

        X2     -- the panel's Householder vectors, one per row.
        sb1, eb1 -- column range of the panel inside A1.
        u1     -- first row to update on the blocks that produced the panel
                  (block 0 for A1, block s2 for A2).
        S      -- aggregate produced by __aggregate for `method`.
        method -- selects the nested update routine to run.

        sb2, eb2, u2, e1 and e2 are accepted; of these only e1 is read (for
        the unused local nru).
        """
        def yty2():
            """YTY2 update: exchange partial products between paired blocks."""
            invT = S
            # Side holding A2: send A2 * X2^H to the partner, wait for the
            # correction M and add M * X2 to A2.
            # NOTE(review): the blocking Recv below waits for a message the
            # partner only sends from the second loop; if both blocks of a
            # pair live on the same process this may never complete --
            # confirm against the block-to-process layout.
            for b in self.blocks:
                if b.work2 == None: 
                    continue
                s = 0 
                if b.rank == s2:
                    # Rows above u1 of block s2 were already handled while
                    # the panel was built.
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(X2[:p_eff, :m]).T)
                self.comm.Send(B2, dest=b.getWork2()%self.size, tag=3*num + b.getWork2())
                
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M, source=b.getWork2()%self.size, tag=4*num + b.getWork2())
                
                A2[s:, :m] = A2[s:,:m] + M.dot(X2)
                del A2
                
            # Side holding A1: receive the partner's product, form
            # M = (A1 panel - product) * inv(invT), send it back and add it
            # to the A1 panel columns.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s=u1
                
                A1 = b.getA1()
                B1 = A1[s:, sb1:eb1]
                
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2, source=b.getWork1()%self.size, tag=3*num + b.rank)  
                M = B1 - B2
                M = M.dot(inv(invT[:p_eff,:p_eff]))
                
                self.comm.Send(M, dest=b.getWork1()%self.size, tag=4*num + b.rank)
                A1[s:, sb1:eb1] = A1[s:, sb1:eb1] + M
                del A1   
            # (A former third loop that received M and updated A2 separately
            # has been folded into the first loop above.)
            return 
        
        
        # These locals are read by the nested routine through its closure,
        # so they must be bound before it is called.
        m = self.m
        n = self.n
        nru = e1*m - u1
        p_eff = eb1 - sb1 
        num = self.numOfBlocks
        
        # NOTE(review): no wy1/wy2/yty1 is defined inside this method; unless
        # they exist at module level those branches raise NameError.
        if method == WY1:
            return wy1()
        elif method == WY2:
            return wy2()
        elif method ==YTY1:
            return yty1()
        elif method == YTY2:
            return yty2()

    def __aggregate(self,S,  X2, beta, p, j, j1, j2, p_eff, method, a):
        """Fold the panel's Householder vectors into the aggregate S.

        S      -- aggregate being built; updated in place and returned.
        X2     -- the panel's Householder vectors, one per row.
        j      -- index of the current vector inside the panel.
        p_eff  -- number of vectors in the panel.
        method -- selects the nested aggregation routine.
        a      -- collection of block ranks that take part (the value
                  returned by __temp_Comm in the caller).

        beta, p, j1 and j2 are accepted; j1/j2 only feed the unused locals
        sb1/sb2.
        """
        def yty2():
            """On the last vector of the panel, build invT from X2 * X2^H."""
            invT = S
            for b in self.blocks:
                if b.rank  in  a:
                    if j == p_eff - 1:
                        # Upper triangle of X2 * X2^H, then every diagonal
                        # entry d is replaced by (d - 1) / 2.
                        invT[:p_eff, :p_eff] = triu(X2[:p_eff, :m].dot(np.conj(X2)[:p_eff, :m].T))
                        for jj in range(p_eff):
                            invT[jj,jj] = (invT[jj,jj] - 1.)/2.
                else: 
                    continue
            return invT           
        # m is read by the nested routine through its closure.
        m = self.m
        n = self.n
        sb1 = j1 - j
        sb2 = j2 - j
        v = np.zeros(m*(n + 1), complex) 
        # NOTE(review): no wy1/wy2/yty1 is defined inside this method; unless
        # they exist at module level those branches raise NameError.
        if method == WY1:
            return wy1()
        if method == WY2:
            return wy2()
        if method == YTY1:
            return yty1()
        if method == YTY2:
            return yty2()

    def __seq_reduc(self, s1, e1, s2, e2):
        """Reduce the current generator one column at a time (SEQ method).

        For every column j in [0, m) a Householder vector is requested from
        __house_vec and applied with __seq_update, which receives e1*m and
        e2*m as its end markers.  s1 is not used.
        """
        n = self.n
        m = self.m
        for j in range (0, self.m):
            # NOTE(review): the __house_vec defined in this class takes
            # (j, s2, j_count, b) and returns a single array, so this
            # two-argument call with tuple unpacking does not match it --
            # confirm which __house_vec the SEQ path is meant to use.
            X2, beta = self.__house_vec(j, s2)
            
            self.__seq_update(X2, beta, e1*m, e2*m, s2, j, m, n)

    def __seq_update(self,X2, beta, e1, e2, s2, j, m, n):
        """Apply one Householder vector to every paired block.

        X2   -- the Householder vector.
        beta -- its scaling factor.
        e1, e2 -- end markers; the block whose rank equals e/m stops at row
                  ``e % m or m`` instead of m.
        s2   -- rank of the block whose first rows (up to j) are skipped on
                the A2 side; block 0 plays that role on the A1 side.
        j    -- column of A1 being updated.
        m    -- block size.

        n is accepted but not used.
        """
        # First row that still needs updating on the blocks that produced
        # the vector.
        u = j + 1
        num = self.numOfBlocks
        
        # (computed but never read)
        nru = e1*m - (s2*m + j + 1)  
        # A2 side: send A2 * conj(X2) to the partner, wait for v and apply
        # the rank-one correction beta * v * X2.
        for b in self.blocks:
            if b.work2 == None: 
                continue
            B1 = np.dot(b.getA2(), np.conj(X2.T))
            start = 0
            end = m
            if b.rank == s2:
                start = u
            # NOTE(review): "/" is used as integer division here, which only
            # holds under Python 2 semantics -- confirm before porting.
            if b.rank == e2/m:
                end = e2 % m or m
            B1 = B1[start:end]
            self.comm.Send(B1, dest=b.getWork2()%self.size, tag=4*num + b.getWork2())
            
            v = np.empty(end-start,complex)
            self.comm.Recv(v, source=b.getWork2()%self.size, tag=5*num + b.rank)
            A2 = b.getA2()
            A2[start:end,:] -= beta*v[np.newaxis].T.dot(np.array([X2[:]]))
            del A2
        
        # A1 side: receive the partner's product, form v = A1[:, j] - product,
        # send v back and subtract beta * v from column j.
        for b in self.blocks:
            if b.work1 == None:
                continue 
            start = 0
            end = m
            if b.rank == 0:
                start = u
            if b.rank == e1/m:
                end = e1 % m or m
            B1 = np.empty(end-start, complex)
            
            self.comm.Recv(B1, source=b.getWork1()%self.size, tag=4*num + b.rank)
            A1 = b.getA1()
            B2 = A1[start:end, j]
                
            v = B2 - B1
            self.comm.Send(v, (b.getWork1())%self.size, 5*num + b.getWork1())
            A1[start:end,j] -= beta*v
            del A1

#        for b in self.blocks:
#            if b.work2 == None: 
#                continue
#            start = 0
#            end = m
#            if b.rank == s2:
#                start = u
#            if b.rank == e2/m :
#                end = e2 % m or m
#            v = np.empty(end-start,complex)
#            self.comm.Recv(v, source=b.getWork2()%self.size, tag=5*num + b.rank)
#            A2 = b.getA2()
#            A2[start:end,:] -= beta*v[np.newaxis].T.dot(np.array([X2[:]]))
#            #A2[start:end,:] -= beta*v.dot(np.conj(X2).T)
#            del A2
        
    def __house_vec(self, j, s2, j_count, b):
        """Compute the Householder data for row j of block s2 against block 0.

        j       -- row of A2 (block s2) and diagonal index of A1 (block 0).
        s2      -- rank of the block that owns the A2 row.
        j_count -- accepted but not used.
        b       -- block used to carry the zero-row flag (setFalse / setTrue /
                   getCond) and the result buffer (setTemp).

        Returns a complex array of length m + 1: the first m entries are the
        scaled row X2, the last entry is beta.  A process that holds neither
        block s2 nor block 0 returns all zeros.
        """
        isZero = np.array([0])
        b.setFalse(isZero)
        
        X2 = np.zeros(self.m, complex)
        data = np.zeros(self.m+1, complex)
        beta = 0
        blocks = self.blocks
        n = self.n
        num = self.numOfBlocks
        
        # The holder of block s2 flags a row whose entries are all below
        # 1e-13 in magnitude.
        if blocks.hasRank(s2):
            A2 = blocks.getBlock(s2).getA2()
            if np.all(np.abs(A2[j, :]) < 1e-13):
                isZero=np.array([1])
                b.setTrue(isZero)
                # NOTE(review): this Bcast is only reached on the process
                # holding block s2 and only for a zero row; no matching call
                # is visible on the other processes -- confirm this cannot
                # hang.
                self.comm.Bcast(b.getCond(), root=s2%self.size)
            del A2
        
        # Zero row: nothing to reflect, hand back the all-zero result.
        if b.getCond()[0]:
            print isZero
            data[:self.m] = X2
            data[-1] = beta  
            b.setTemp(data)
            return data
        
        # Holder of block s2: send sigma = row . conj(row) to rank 0, receive
        # z and beta, scale the row by 1/z in place and send the packed
        # result back to rank 0.
        # NOTE(review): if one process holds both block s2 and block 0, the
        # recv calls below wait for messages that the same process only
        # sends in the next section -- confirm this layout cannot occur.
        if blocks.hasRank(s2):
            A2 = blocks.getBlock(s2).getA2()
            sigma = A2[j, :].dot(np.conj(A2[j,:]))
            self.comm.send(sigma, dest=0, tag=2*num + s2)
            
            z = self.comm.recv(source=0, tag=3*num + s2)
            beta = self.comm.recv(source=0, tag=4*num + s2)

            X2 = A2[j,:]/z
            A2[j, :] = X2
 
            data[:self.m] = X2
            data[-1] = beta  
            b.setTemp(data)
            self.comm.send(data, dest=0, tag=5*num + s2)
            del A2
            
        # Holder of block 0: derive alpha, z and beta from the diagonal entry
        # and sigma, overwrite the diagonal entry with +/- alpha, and reply.
        if blocks.hasRank(0):
            A1 = blocks.getBlock(0).getA1()
            sigma = self.comm.recv(source=s2%self.size, tag=2*num + s2)
            alpha = (A1[j,j]**2 - sigma)**0.5            
            # Pick the sign that gives z the larger real part.
            if (np.real(A1[j,j] + alpha) < np.real(A1[j, j] - alpha)):
                z = A1[j, j]-alpha
                A1[j,j] = alpha 
            else:
                z = A1[j, j]+alpha
                A1[j,j] = -alpha
            self.comm.send(z, dest=s2%self.size, tag=3*num + s2)
            beta = 2*z*z/(-sigma + z*z)           
            self.comm.send(beta, dest=s2%self.size, tag=4*num + s2)
            
            data = self.comm.recv(source=s2%self.size, tag=5*num + s2)
            del A1
        
        # Earlier experiments that distributed `data` to the remaining
        # processes from here (a Bcast of b.getTemp() and a one-sided MPI
        # window) were disabled; the caller shares the result instead.
        return data#X2, beta
Example #4
0
class ToeplitzFactorizor:
    def __init__(self, folder, n, m, pad, detailedSave=False):
        """Attach to MPI.COMM_WORLD, look for a checkpoint and prepare outputs.

        folder       -- name used under "processedData/" and "results/".
        n, m         -- stored as self.n / self.m; the zero-filled
                        "results/<folder>_uc.npy" array has m * n rows.
        pad          -- stored as self.pad; numOfBlocks is n * (1 + pad).
        detailedSave -- stored as self.detailedSave.

        self.kCheckpoint becomes the highest step k whose checkpoint
        directory holds exactly 2 * numOfBlocks files, or 0 if none does.
        Directories and files are only ever created by rank 0.
        """
        world = MPI.COMM_WORLD
        self.comm = world
        self.size = world.Get_size()
        self.rank = world.Get_rank()
        self.n = n
        self.m = m
        self.pad = pad
        self.folder = folder
        self.blocks = Blocks()

        self.detailedSave = detailedSave
        self.numOfBlocks = n * (1 + pad)

        onRoot = self.rank == 0
        chosen = 0  # 0 = no checkpoint

        if not os.path.exists("processedData/" + folder + "/checkpoint"):
            if onRoot:
                os.makedirs("processedData/{0}/checkpoint/".format(folder))
        else:
            # Newest step first; stop at the first directory that holds a
            # complete set of files.
            for k in range(n * (1 + self.pad) - 1, 0, -1):
                stepDir = "processedData/{0}/checkpoint/{1}/".format(folder, k)
                if not os.path.exists(stepDir):
                    continue
                names = next(os.walk(stepDir))[2]
                if len(names) != 2 * self.numOfBlocks:
                    continue
                chosen = k
                if onRoot:
                    print("Using Checkpoint #{0}".format(k))
                break
        self.kCheckpoint = chosen

        for outDir in ("results", "results/{0}".format(folder)):
            if not os.path.exists(outDir) and onRoot:
                os.makedirs(outDir)

        ucPath = "results/{0}".format(folder + "_uc.npy")
        if onRoot and not os.path.exists(ucPath):
            np.save(ucPath, np.zeros((m * n, 1), dtype=complex))

        ## The broadcast from rank 0 keeps the other ranks from continuing
        ## before rank 0 has finished creating files and directories.
        self.comm.bcast(False, root=0)

    def addBlock(self, rank):
        """Create the Block for `rank`, fill it with data and register it.

        When resuming (self.kCheckpoint != 0) A1 and A2 are loaded from the
        checkpoint directory.  Otherwise a block with rank >= self.n gets an
        m x m zero matrix via createA, and any other block gets the matrix
        stored in "processedData/<folder>/<rank>.npy" via setT.
        """
        newBlock = Block(rank)
        resumeFrom = self.kCheckpoint
        if resumeFrom == 0:
            if rank < self.n:
                newBlock.setT(
                    np.load("processedData/{0}/{1}.npy".format(
                        self.folder, rank)))
            else:
                dim = self.m
                newBlock.createA(np.zeros((dim, dim), complex))
        else:
            pattern = "processedData/{0}/checkpoint/{1}/{2}{3}.npy"
            loadedA1 = np.load(
                pattern.format(self.folder, resumeFrom, rank, "A1"))
            loadedA2 = np.load(
                pattern.format(self.folder, resumeFrom, rank, "A2"))
            newBlock.setA1(loadedA1)
            newBlock.setA2(loadedA2)
        newBlock.setName("results/{0}_uc.npy".format(self.folder))
        self.blocks.addBlock(newBlock)

    def fact(self, method, p):
        """Run the factorization loop, checkpointing when time runs short.

        method -- one of SEQ, WY1, WY2, YTY1, YTY2.
        p      -- panel width handed to __block_reduc; must be >= 1 unless
                  method is SEQ.

        Raises InvalidMethodException for an unknown method and
        InvalidPException for p < 1 with a non-SEQ method.

        Steps run from self.kCheckpoint + 1 up to n * (1 + pad) - 1.  After
        every step rank 0 appends the step duration to the module-level
        `timePerLoop` and, if the elapsed time plus the longest step so far
        reaches MAXTIME, every rank saves A1/A2 of its blocks into the
        checkpoint directory for that step and calls exit().
        """
        if method not in np.array([SEQ, WY1, WY2, YTY1, YTY2]):
            raise InvalidMethodException(method)
        if p < 1 and method != SEQ:
            raise InvalidPException(p)

        pad = self.pad
        m = self.m
        n = self.n
        folder = self.folder

        if self.kCheckpoint == 0:
            self.__setup_gen()

            for b in self.blocks:
                if not pad and b.rank == n * (1 + pad) - 1:
                    b.updateuc(b.rank)
        if self.detailedSave:
            for b in self.blocks:
                np.save("results/{0}/L_{1}-{2}.npy".format(folder, 0, b.rank),
                        b.getA1())

        for k in range(self.kCheckpoint + 1, n * (1 + pad)):
            self.k = k
            if self.rank == 1:
                print("Loop {0}".format(k))
            ##Build generator at step k [A1(:e1, :) A2(s2:e2, :)]
            s1, e1, s2, e2 = self.__set_curr_gen(k, n)
            if method == SEQ:
                self.__seq_reduc(s1, e1, s2, e2)
            else:
                self.__block_reduc(s1, e1, s2, e2, m, p, method)

            ##Save results immediately if we reached the end of the loop
            for b in self.blocks:
                if b.rank <= e1 and b.rank + k == n * (1 + pad) - 1:
                    b.updateuc(k % self.n)
                if b.rank <= e1 and self.detailedSave:
                    np.save(
                        "results/{0}/L_{1}-{2}.npy".format(
                            folder, k, b.rank + k), b.getA1())

            ##CheckPoint
            saveCheckpoint = False
            if self.rank == 0:
                timePerLoop.append(time() - sum(timePerLoop) - startTime)

                elapsedTime = time() - startTime
                ##Max instead of np.mean, just to be safe
                if elapsedTime + max(timePerLoop) >= MAXTIME:
                    print("Saving Checkpoint #{0}".format(k))
                    checkpointDir = "processedData/{0}/checkpoint/{1}/".format(
                        folder, k)
                    if not os.path.exists(checkpointDir):
                        try:
                            os.makedirs(checkpointDir)
                        except OSError:
                            # Best effort: a real failure surfaces in the
                            # np.save calls below.  Only OSError is ignored so
                            # that interrupts are no longer swallowed.
                            pass
                    saveCheckpoint = True

            # Every rank learns rank 0's decision.
            saveCheckpoint = self.comm.bcast(saveCheckpoint, root=0)
            if saveCheckpoint:
                for b in self.blocks:
                    ##Creating Checkpoint
                    np.save(
                        "processedData/{0}/checkpoint/{1}/{2}A1.npy".format(
                            folder, k, b.rank), b.getA1())
                    np.save(
                        "processedData/{0}/checkpoint/{1}/{2}A2.npy".format(
                            folder, k, b.rank), b.getA2())
                exit()

    ##Private Methods
    def __setup_gen(self):
        """Build the initial generator blocks from the T matrices.

        The process holding block 0 inverts the conjugate transpose of the
        Cholesky factor of that block's T; the inverse is broadcast from
        rank 0.  Every block with rank < self.n is then given
        ``T.dot(cinv)`` through createA, and afterwards T is deleted on all
        blocks.

        Returns two m x m complex zero matrices that are never modified.
        """
        dim = self.m
        emptyA1 = np.zeros((dim, dim), complex)
        emptyA2 = np.zeros((dim, dim), complex)

        ##Only the holder of block 0 has something to contribute
        cinv = None
        if self.blocks.hasRank(0):
            lower = cholesky(self.blocks.getBlock(0).getT())
            cinv = inv(np.conj(lower.T))
        cinv = self.comm.bcast(cinv, root=0)

        for blk in self.blocks:
            if blk.rank >= self.n:
                continue
            blk.createA(blk.getT().dot(cinv))

        ##T is not needed past this point
        for blk in self.blocks:
            blk.deleteT()

        return emptyA1, emptyA2

    def __set_curr_gen(self, k, n):
        s1 = 0
        e1 = min(n, (n * (1 + self.pad) - k)) - 1
        s2 = k
        e2 = e1 + s2
        for b in self.blocks:
            if s1 <= b.rank <= e1:
                b.setWork1(b.rank + k)
            else:
                b.setWork1(None)
            if e2 >= b.rank >= s2:
                b.setWork2(b.rank - k)
            else:
                b.setWork2(None)
        return s1, e1, s2, e2

    def __block_reduc(self, s1, e1, s2, e2, m, p, method):
        """Reduce the generator at block offset s2 in panels of p columns.

        For each panel starting at column sb1 (stepping by p up to m):
          1. work is reset so that only block 0 and block s2 are paired;
          2. one Householder vector per panel column is produced by
             __house_vec, applied right away with __seq_update and folded
             into the aggregate S by __aggregate;
          3. work is restored with __set_curr_gen and the remaining blocks
             are updated by __block_update.

        s1 is accepted but not used here.
        """
        n = self.n

        for sb1 in range(0, m, p):

            for b in self.blocks:
                b.setWork(None, None)
                if b.rank == 0: b.setWork1(s2)
                if b.rank == s2: b.setWork2(0)

            sb2 = s2 * m + sb1
            eb1 = min(sb1 + p, m)  #next j
            eb2 = s2 * m + eb1
            u1 = eb1
            u2 = eb2
            p_eff = min(p, m - sb1)

            # One row per Householder vector of the panel.
            XX2 = np.zeros((p_eff, m), complex)
            if method == WY1 or method == WY2:
                S = np.array([np.zeros((m, p)), np.zeros((m, p))], complex)
            elif method == YTY1 or method == YTY2:
                # Both operands are compared against `method`; a bare
                # "or YTY2" would only test the truthiness of the constant.
                S = np.zeros((p, p), complex)
            for j in range(0, p_eff):
                j1 = sb1 + j
                j2 = sb2 + j
                X2, beta = self.__house_vec(j1, s2)  ##s2 or sb2?
                XX2[j] = X2
                self.__seq_update(X2, beta, eb1, eb2, s2, j1, m,
                                  n)  ##is this good?
                S = self.__aggregate(S, XX2, beta, p, j, j1, j2, p_eff, method)

            self.__set_curr_gen(s2, n)  ## Updates work
            self.__block_update(XX2, sb1, eb1, u1, e1, s2, sb2, eb2, u2, e2, S,
                                method)
        return

    def __block_update(self, X2, sb1, eb1, u1, e1, s2, sb2, eb2, u2, e2, S,
                       method):
        """Apply an accumulated panel of reflectors to the local blocks.

        Dispatches on ``method`` to one of four nested routines. Every routine
        makes the same three passes over ``self.blocks``:

        1. blocks with ``work2`` set multiply rows ``s:`` of their ``A2`` by
           conjugated reflector data and ``Send`` the product to process
           ``work2 % size``;
        2. blocks with ``work1`` set ``Recv`` that product, combine it with
           ``A1[s:, sb1:eb1]``, ``Send`` the result ``M`` back and update
           those ``A1`` columns;
        3. blocks with ``work2`` set ``Recv`` ``M`` and update rows ``s:`` of
           their ``A2``.

        ``s`` is ``u1`` on block ``s2`` (passes 1 and 3) and on block 0
        (pass 2); it is 0 on every other block.

        Args:
            X2: panel reflectors, one per row (rows ``:p_eff`` are used).
            sb1, eb1: first / one-past-last column of the panel in ``A1``.
            u1: number of leading rows skipped on block ``s2`` and block 0.
            e1: only used for the unused local ``nru``.
            s2: index of the block whose ``A2`` produced the reflectors.
            sb2, eb2, u2, e2: accepted but not read.
            S: accumulator built by ``__aggregate`` for ``method``.
            method: WY1, WY2, YTY1 or YTY2.

        Returns:
            None (also for a ``method`` that matches none of the four).

        NOTE(review): the receive buffers are shaped ``(m - s, p_eff)``, which
        presumes every ``A1``/``A2`` has ``m`` rows -- confirm against Block.
        """
        # The nested routines read m, p_eff and num from the enclosing scope;
        # those names are bound further down, before any routine is called.
        def wy1():
            Y1, Y2 = S
            if p_eff == 0: return
            # Pass 1: A2 * X2^H, sent to the partner block's process.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(X2)[:p_eff, :m].T)
                self.comm.Send(B2,
                               dest=b.getWork2() % self.size,
                               tag=3 * num + b.getWork2())
                del A2

            # Pass 2: M = A1 panel - received product; M goes back unscaled
            # and the A1 panel is updated with M * Y1^T.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s = u1
                A1 = b.getA1()
                B1 = A1[s:, sb1:eb1]
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2,
                               source=b.getWork1() % self.size,
                               tag=3 * num + b.rank)
                M = B1 - B2
                self.comm.Send(M,
                               dest=b.getWork1() % self.size,
                               tag=4 * num + b.rank)
                A1[s:,
                   sb1:eb1] = A1[s:, sb1:eb1] + M.dot(Y1[sb1:eb1, :p_eff].T)
                del A1

            # Pass 3: A2 += M * Y2^T.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M,
                               source=b.getWork2() % self.size,
                               tag=4 * num + b.getWork2())
                A2 = b.getA2()
                A2[s:, :m] = A2[s:, :m] + M.dot(Y2[:m, :p_eff].T)
                del A2

            return

        def wy2():
            W1, W2 = S
            if p_eff == 0: return
            # Pass 1: A2 * conj(W2).
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(W2[:m, :p_eff]))
                self.comm.Send(B2,
                               dest=b.getWork2() % self.size,
                               tag=3 * num + b.getWork2())
                del A2

            # Pass 2: M = A1 panel * W1 - received product; A1 panel += M.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s = u1
                A1 = b.getA1()
                B1 = B1 = A1[s:, sb1:eb1].dot(W1[sb1:eb1, :p_eff])
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2,
                               source=b.getWork1() % self.size,
                               tag=3 * num + b.rank)
                M = B1 - B2
                self.comm.Send(M,
                               dest=b.getWork1() % self.size,
                               tag=4 * num + b.rank)
                A1[s:, sb1:eb1] = A1[s:, sb1:eb1] + M
                del A1

            # Pass 3: A2 += M * X2.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M,
                               source=b.getWork2() % self.size,
                               tag=4 * num + b.getWork2())
                A2 = b.getA2()
                A2[s:, :m] = A2[s:, :m] + M.dot(X2)
                del A2
            return

        def yty1():
            T = S
            # Pass 1: A2 * X2^H.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(X2[:p_eff, :m]).T)
                self.comm.Send(B2,
                               dest=b.getWork2() % self.size,
                               tag=3 * num + b.getWork2())
                del A2

            # Pass 2: M = (A1 panel - received product) * T; A1 panel += M.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s = u1
                A1 = b.getA1()
                B1 = A1[s:, sb1:eb1]
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2,
                               source=b.getWork1() % self.size,
                               tag=3 * num + b.rank)
                M = B1 - B2
                M = M.dot(T[:p_eff, :p_eff])
                self.comm.Send(M,
                               dest=b.getWork1() % self.size,
                               tag=4 * num + b.rank)
                A1[s:, sb1:eb1] = A1[s:, sb1:eb1] + M
                del A1

            # Pass 3: A2 += M * X2.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M,
                               source=b.getWork2() % self.size,
                               tag=4 * num + b.getWork2())

                A2 = b.getA2()
                A2[s:, :m] = A2[s:, :m] + M.dot(X2)
                del A2

            return

        def yty2():
            invT = S
            # Pass 1: A2 * X2^H.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(X2[:p_eff, :m]).T)
                self.comm.Send(B2,
                               dest=b.getWork2() % self.size,
                               tag=3 * num + b.getWork2())
                del A2

            # Pass 2: as yty1, but S holds the inverse factor, so it is
            # inverted here (once per local work1 block) before use.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s = u1
                A1 = b.getA1()
                B1 = A1[s:, sb1:eb1]
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2,
                               source=b.getWork1() % self.size,
                               tag=3 * num + b.rank)
                M = B1 - B2
                M = M.dot(inv(invT[:p_eff, :p_eff]))
                self.comm.Send(M,
                               dest=b.getWork1() % self.size,
                               tag=4 * num + b.rank)
                A1[s:, sb1:eb1] = A1[s:, sb1:eb1] + M
                del A1
            # Pass 3: A2 += M * X2.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M,
                               source=b.getWork2() % self.size,
                               tag=4 * num + b.getWork2())

                A2 = b.getA2()
                A2[s:, :m] = A2[s:, :m] + M.dot(X2)
                del A2
            return

        # Values shared by the nested routines above (n and nru are unused).
        m = self.m
        n = self.n
        nru = e1 * m - u1
        p_eff = eb1 - sb1
        num = self.numOfBlocks

        if method == WY1:
            return wy1()
        elif method == WY2:
            return wy2()
        elif method == YTY1:
            return yty1()
        elif method == YTY2:
            return yty2()

    def __aggregate(self, S, X2, beta, p, j, j1, j2, p_eff, method):
        """Fold the ``j``-th reflector of the current panel into ``S``.

        ``X2`` holds the panel's reflectors (row ``j`` is the newest one),
        ``beta`` is its scalar and ``j1`` the column it was built for. The
        meaning of ``S`` and the return value depend on ``method``:

        * WY1  -- ``S`` is a pair ``(Y1, Y2)``; both are updated in place and
          returned as a tuple.
        * WY2  -- ``S`` is a pair ``(W1, W2)``; same convention.
        * YTY1 -- ``S`` is the matrix ``T``; column ``j`` is filled in.
        * YTY2 -- ``S`` is ``invT``; it is left alone until the last
          reflector of the panel (``j == p_eff - 1``) and then built in one
          go from ``X2``.

        Any other ``method`` returns None. ``p`` and ``j2`` are not read.
        """
        m = self.m
        panel_first = j1 - j  # column at which the current panel starts
        # Complex scratch vector; only its first ``j`` entries are ever used.
        coeffs = np.zeros(m * (self.n + 1), complex)

        if method == WY1:
            Y1 = S[0]
            Y2 = S[1]
            Y1[j1, j] = -beta
            Y2[:, j] = -beta * X2[j, :m]
            if j > 0:
                coeffs[:j] = beta * np.conj(X2)[j, :m].dot(Y2[:m, :j])
                Y1[j1, :j] = Y1[j1, :j] + coeffs[:j]
                as_column = X2[j, :m][np.newaxis].T
                Y2[:m, :j] = Y2[:m, :j] + as_column.dot(
                    coeffs[:j][np.newaxis])
            return Y1, Y2

        if method == WY2:
            W1 = S[0]
            W2 = S[1]
            W1[j1, j] = -beta
            W2[:, j] = -beta * X2[j, :m]
            if j > 0:
                coeffs[:j] = beta * X2[:j, :m].dot(np.conj(X2[j, :m].T))
                W1[panel_first:j1, j] = W1[panel_first:j1, :j].dot(
                    coeffs[:j])
                W2[:m, j] = W2[:m, j] + W2[:m, :j].dot(np.conj(coeffs)[:j])
            return W1, W2

        if method == YTY1:
            S[j, j] = -beta
            if j > 0:
                coeffs[:j] = beta * X2[:j, :m].dot(np.conj(X2)[j, :m].T)
                S[:j, j] = S[:j, :j].dot(coeffs[:j])
            return S

        if method == YTY2:
            if j == p_eff - 1:
                gram = X2[:p_eff, :m].dot(np.conj(X2)[:p_eff, :m].T)
                S[:p_eff, :p_eff] = triu(gram)
                for d in range(p_eff):
                    S[d, d] = (S[d, d] - 1.) / 2.
            return S

    def __seq_reduc(self, s1, e1, s2, e2):
        n = self.n
        m = self.m
        for j in range(0, self.m):
            X2, beta = self.__house_vec(j, s2)

            self.__seq_update(X2, beta, e1 * m, e2 * m, s2, j, m, n)

    def __seq_update(self, X2, beta, e1, e2, s2, j, m, n):
        """Apply one reflector ``(X2, beta)`` to column ``j`` of A1 and to A2.

        Three passes over the local blocks, paired through blocking MPI
        ``Send``/``Recv``:

        1. each block with ``work2`` set computes ``A2 . conj(X2)``, keeps
           rows ``start:end`` and sends them to process ``work2 % size``;
        2. each block with ``work1`` set receives that vector ``B1``, forms
           ``v = A1[start:end, j] - B1``, sends ``v`` back and subtracts
           ``beta * v`` from ``A1[start:end, j]``;
        3. each block with ``work2`` set receives ``v`` and subtracts
           ``beta * outer(v, X2)`` from ``A2[start:end, :]``.

        Row window: ``start`` is ``j + 1`` on block ``s2`` (passes 1 and 3)
        and on block 0 (pass 2), otherwise 0. ``end`` is ``e % m or m`` on the
        block whose index equals ``e / m`` (``e`` being ``e2`` resp. ``e1``),
        otherwise ``m``.

        ``n`` is accepted but not read. Returns None.

        NOTE(review): ``e1 / m`` and ``e2 / m`` rely on Python 2 integer
        division; under true division the comparison with ``b.rank`` would
        only match exact multiples -- confirm before porting.
        """
        #X2 = np.array([X2])
        u = j + 1
        num = self.numOfBlocks

        # nru is computed but never read.
        nru = e1 * m - (s2 * m + j + 1)
        # Pass 1: A2 side sends its projection onto the reflector.
        for b in self.blocks:
            if b.work2 == None:
                continue
            B1 = np.dot(b.getA2(), np.conj(X2.T))
            start = 0
            end = m
            if b.rank == s2:
                start = u
            if b.rank == e2 / m:
                end = e2 % m or m
            B1 = B1[start:end]
            self.comm.Send(B1,
                           dest=b.getWork2() % self.size,
                           tag=4 * num + b.getWork2())

        # Pass 2: A1 side receives it, answers with v and updates column j.
        for b in self.blocks:
            if b.work1 == None: continue
            start = 0
            end = m
            if b.rank == 0:
                start = u
            if b.rank == e1 / m:
                end = e1 % m or m

            B1 = np.empty(end - start, complex)

            self.comm.Recv(B1,
                           source=b.getWork1() % self.size,
                           tag=4 * num + b.rank)
            A1 = b.getA1()
            B2 = A1[start:end, j]

            v = B2 - B1
            self.comm.Send(v, (b.getWork1()) % self.size,
                           5 * num + b.getWork1())
            A1[start:end, j] -= beta * v
            del A1

        # Pass 3: A2 side receives v and applies the rank-one correction.
        for b in self.blocks:
            if b.work2 == None:
                continue
            start = 0
            end = m
            if b.rank == s2:
                start = u
            if b.rank == e2 / m:
                end = e2 % m or m
            v = np.empty(end - start, complex)
            self.comm.Recv(v,
                           source=b.getWork2() % self.size,
                           tag=5 * num + b.rank)
            A2 = b.getA2()
            A2[start:end, :] -= beta * v[np.newaxis].T.dot(np.array([X2[:]]))
            del A2

    def __house_vec(self, j, s2):
        """Build the reflector vector and scalar for row ``j`` of block ``s2``.

        Steps (every process runs the same code; only the holders of block
        ``s2`` and block 0 do the arithmetic):

        * the holder of block ``s2`` checks whether every entry of
          ``A2[j, :]`` is below ``1e-13`` in absolute value; the flag is
          broadcast from process ``s2 % size`` and, if set, the method
          returns a zero vector and ``beta = 0`` without further messages;
        * the holder of block ``s2`` sends ``sigma = A2[j, :] . conj(A2[j, :])``
          to process 0;
        * the holder of block 0 computes
          ``alpha = sqrt(A1[j, j]**2 - sigma)``, sets ``z`` to whichever of
          ``A1[j, j] +/- alpha`` has the larger real part, overwrites
          ``A1[j, j]`` with ``-alpha`` resp. ``+alpha``, sends ``z`` back and
          sets ``beta = 2*z*z / (z*z - sigma)``;
        * the holder of block ``s2`` divides row ``j`` of ``A2`` by ``z`` and
          stores the result back in place.

        Returns:
            ``(X2, beta)`` -- ``beta`` as broadcast from process 0 and ``X2``
            as broadcast from process ``s2 % size``.

        NOTE(review): sending to ``dest=0`` and broadcasting with ``root=0``
        presumes block 0 lives on MPI process 0 -- confirm against the block
        distribution.
        """
        isZero = False
        X2 = np.zeros(self.m, complex)
        beta = 0
        blocks = self.blocks
        n = self.n
        num = self.numOfBlocks

        if blocks.hasRank(s2):
            A2 = blocks.getBlock(s2).getA2()
            if np.all(np.abs(A2[j, :]) < 1e-13):
                isZero = True
            del A2
        isZero = self.comm.bcast(isZero, root=s2 % self.size)
        if isZero:
            return X2, beta

        if blocks.hasRank(s2):
            A2 = blocks.getBlock(s2).getA2()
            sigma = A2[j, :].dot(np.conj(A2[j, :]))
            self.comm.send(sigma, dest=0, tag=2 * num + s2)
            del A2
        if blocks.hasRank(0):
            A1 = blocks.getBlock(0).getA1()
            sigma = self.comm.recv(source=s2 % self.size, tag=2 * num + s2)
            alpha = (A1[j, j]**2 - sigma)**0.5
            # Keep the candidate with the larger real part as z.
            if (np.real(A1[j, j] + alpha) < np.real(A1[j, j] - alpha)):
                z = A1[j, j] - alpha
                A1[j, j] = alpha
            else:
                z = A1[j, j] + alpha
                A1[j, j] = -alpha
            self.comm.send(z, dest=s2 % self.size, tag=3 * num + s2)
            beta = 2 * z * z / (-sigma + z * z)
            del A1

        if blocks.hasRank(s2):
            z = self.comm.recv(source=0, tag=3 * num + s2)
            A2 = blocks.getBlock(s2).getA2()
            X2 = A2[j, :] / z
            A2[j, :] = X2
            del A2
        beta = self.comm.bcast(beta, root=0)
        X2 = self.comm.bcast(X2, root=s2 % self.size)
        return X2, beta
class ToeplitzFactorizor:
    
    def __init__(self, folder, n,m, pad):
        """Record the problem size, find a usable checkpoint, create outputs.

        Args:
            folder: data-set name; checkpoints are looked up under
                ``processedData/<folder>/checkpoint`` and results are written
                under ``results/``.
            n: stored as ``self.n``; ``numOfBlocks`` is ``n*(1 + pad)``.
            m: stored as ``self.m``; the initial ``uc`` file has ``m*n`` rows.
            pad: stored as ``self.pad``.

        The highest-numbered checkpoint directory that contains exactly
        ``2*numOfBlocks`` files is recorded in ``self.kCheckpoint``
        (0 when there is none). Directories and the ``<folder>_uc.npy`` file
        are created by rank 0 only.
        """
        self.comm = MPI.COMM_WORLD
        self.size = self.comm.Get_size()
        self.rank = self.comm.Get_rank()
        self.n = n
        self.m = m
        self.pad = pad
        self.folder = folder
        self.blocks = Blocks()

        self.numOfBlocks = n*(1 + pad)

        kCheckpoint = 0  # 0 = no checkpoint

        if os.path.exists("processedData/" + folder + "/checkpoint"):
            # Newest first: stop at the first complete checkpoint.
            for k in range(n*(1 + self.pad) - 1, 0, -1):
                stepDir = "processedData/{0}/checkpoint/{1}/".format(folder, k)
                if not os.path.exists(stepDir):
                    continue
                # next(...) instead of the Python-2-only generator.next()
                path, dirs, files = next(os.walk(stepDir))
                if len(files) == 2*self.numOfBlocks:
                    kCheckpoint = k
                    if self.rank == 0:
                        # Single-argument print(...) behaves the same as the
                        # print statement on Python 2 and parses on Python 3.
                        print("Using Checkpoint #{0}".format(k))
                    break
        else:
            if self.rank == 0:
                os.makedirs("processedData/{0}/checkpoint/".format(folder))
        self.kCheckpoint = kCheckpoint

        if not os.path.exists("results"):
            if self.rank == 0:
                os.makedirs("results")
        if not os.path.exists("results/{0}".format(folder)):
            if self.rank == 0:
                os.makedirs("results/{0}".format(folder))

        if self.rank == 0:
            ucFile = "results/{0}".format(folder + "_uc.npy")
            if not os.path.exists(ucFile):
                uc = np.zeros((m*n, 1), dtype=complex)
                np.save(ucFile, uc)

        ## So that the creation of files and directories is complete before
        ## the rest of the nodes continue
        initDone = False
        initDone = self.comm.bcast(initDone, root=0)
        
        
    def addBlock(self, rank):
        """Create block ``rank``, load its data and add it to ``self.blocks``.

        With an active checkpoint (``self.kCheckpoint != 0``) ``A1`` and
        ``A2`` are loaded from that checkpoint directory. Otherwise a block
        with ``rank >= self.n`` gets an all-zero ``(m, m)`` complex matrix
        through ``createA`` and any other block loads
        ``processedData/<folder>/<rank>.npy`` as its ``T``.

        Returns:
            None.
        """
        folder = self.folder
        b = Block(rank)
        k = self.kCheckpoint
        if k != 0:
            A1 = np.load("processedData/{0}/checkpoint/{1}/{2}A1.npy".format(folder, k, rank))
            A2 = np.load("processedData/{0}/checkpoint/{1}/{2}A2.npy".format(folder, k, rank))
            b.setA1(A1)
            b.setA2(A2)
        elif rank >= self.n:
            m = self.m
            b.createA(np.zeros((m, m), complex))
        else:
            T = np.load("processedData/{0}/{1}.npy".format(folder, rank))
            b.setT(T)
        b.setName("results/{0}_uc.npy".format(folder))
        # The method ends here; no tab-indented trailing ``return`` is needed
        # (mixed tab/space indentation is a TabError on Python 3).
        self.blocks.addBlock(b)

    def fact(self, method, p):
        """Run the factorization, resuming after ``self.kCheckpoint``.

        Args:
            method: one of SEQ, WY1, WY2, YTY1, YTY2.
            p: panel width for the blocked methods; must be at least 1 unless
                ``method`` is SEQ.

        Raises:
            InvalidMethodException: ``method`` is not one of the five
                selectors.
            InvalidPException: ``p < 1`` with a blocked method.

        After every step rank 0 compares the elapsed time plus the slowest
        step so far with ``MAXTIME``. If the budget would be exceeded, every
        process saves its blocks under
        ``processedData/<folder>/checkpoint/<k>/`` and calls ``exit()``.
        """
        if method not in np.array([SEQ, WY1, WY2, YTY1, YTY2]):
            raise InvalidMethodException(method)
        if p < 1 and method != SEQ:
            raise InvalidPException(p)

        pad = self.pad
        m = self.m
        n = self.n
        folder = self.folder

        if self.kCheckpoint == 0:
            self.__setup_gen()

            for b in self.blocks:
                if not pad and b.rank == n*(1 + pad) - 1:
                    b.updateuc(b.rank)

        for k in range(self.kCheckpoint + 1, n*(1 + pad)):
            self.k = k
            if self.rank == 1:
                # Single-argument print(...) is valid on Python 2 and 3.
                print("Loop {0}".format(k))
            ## Build generator at step k [A1(:e1, :) A2(s2:e2, :)]
            s1, e1, s2, e2 = self.__set_curr_gen(k, n)
            if method == SEQ:
                self.__seq_reduc(s1, e1, s2, e2)
            else:
                self.__block_reduc(s1, e1, s2, e2, m, p, method)

            ## Save results immediately if we reached the end of the loop
            for b in self.blocks:
                if b.rank <= e1 and b.rank + k == n*(1 + pad) - 1:
                    b.updateuc(k % self.n)

            ## Checkpoint decision is taken on rank 0 and then broadcast
            saveCheckpoint = False
            if self.rank == 0:
                timePerLoop.append(time() - sum(timePerLoop) - startTime)

                elapsedTime = time() - startTime
                ## max instead of np.mean, just to be safe
                if elapsedTime + max(timePerLoop) >= MAXTIME:
                    print("Saving Checkpoint #{0}".format(k))
                    if not os.path.exists("processedData/{0}/checkpoint/{1}/".format(folder, k)):
                        try:
                            os.makedirs("processedData/{0}/checkpoint/{1}/".format(folder, k))
                        except OSError:
                            # Best effort: the directory may have appeared in
                            # the meantime. Only filesystem errors are
                            # ignored; anything else propagates.
                            pass
                    saveCheckpoint = True

            saveCheckpoint = self.comm.bcast(saveCheckpoint, root=0)
            if saveCheckpoint:
                for b in self.blocks:
                    ## Creating checkpoint (np.save returns nothing to keep)
                    np.save("processedData/{0}/checkpoint/{1}/{2}A1.npy".format(folder, k, b.rank), b.getA1())
                    np.save("processedData/{0}/checkpoint/{1}/{2}A2.npy".format(folder, k, b.rank), b.getA2())
                exit()
                    
            

    ##Private Methods
    def __setup_gen(self):
        """Turn the stored ``T`` blocks into the initial generator blocks.

        The process for which ``self.blocks.hasRank(0)`` is true factors the
        ``T`` of block 0 with ``cholesky``, conjugate-transposes the factor
        and inverts it. The inverse is broadcast from MPI rank 0, and every
        local block whose index is below ``self.n`` calls
        ``createA(T.dot(cinv))``. Finally ``deleteT()`` is called on every
        local block.

        Returns:
            Two zero-filled ``(m, m)`` complex arrays.
        """
        size = self.m
        local_blocks = self.blocks

        ## Only the holder of block 0 has anything to contribute to the bcast
        inverse_factor = None
        if local_blocks.hasRank(0):
            factor = cholesky(local_blocks.getBlock(0).getT())
            inverse_factor = inv(np.conj(factor.T))
        inverse_factor = self.comm.bcast(inverse_factor, root=0)

        for blk in local_blocks:
            if blk.rank < self.n:
                blk.createA(blk.getT().dot(inverse_factor))

        ## T is not needed any more once the generators exist
        for blk in local_blocks:
            blk.deleteT()

        return np.zeros((size, size), complex), np.zeros((size, size), complex)

    def __set_curr_gen(self, k, n):
        s1 = 0
        e1 = min(n, (n*(1 + self.pad) - k)) -1
        s2 = k
        e2 = e1 + s2
        for b in self.blocks:
            if s1 <= b.rank <=e1:
                b.setWork1(b.rank + k)
            else:
                b.setWork1(None)
            if e2 >= b.rank >= s2:
                b.setWork2(b.rank - k)
            else:
                b.setWork2(None)
        return s1, e1, s2, e2

    def __block_reduc(self, s1, e1, s2, e2, m, p, method):


        n = self.n
       
        ch = 0
        for sb1 in range (0, m, p):
            
            for b in self.blocks:
                b.setWork(None, None)
                if b.rank==0: b.setWork1(s2)
                if b.rank==s2: b.setWork2(0)
        
            sb2 = s2*m + sb1
            eb1 = min(sb1 + p, m) #next j
            eb2 = s2*m + eb1
            u1 = eb1
            u2 = eb2
            p_eff = min(p, m - sb1)
            
            XX2 = np.zeros((p_eff, m), complex)
            if method == WY1 or method == WY2:
                S = np.array([np.zeros((m,p)),np.zeros((m,p))], complex)
            elif method == YTY1 or YTY2:
                S = np.zeros((p, p), complex)
            for j in range(0, p_eff):
                j1 = sb1 + j
                j2 = sb2 + j
                X2, beta= self.__house_vec(j1, s2) ##s2 or sb2?
                XX2[j] = X2
                self.__seq_update(X2, beta, eb1, eb2, s2, j1, m, n) ##is this good?
                S = self.__aggregate(S, XX2, beta, p, j, j1, j2, p_eff, method)

            self.__set_curr_gen(s2, n) ## Updates work
            self.__block_update(XX2, sb1, eb1, u1, e1, s2,  sb2, eb2, u2, e2, S, method)
            #if self.rank==1: print "block update"
            #raise Exception()
        return
        
    def __block_update(self, X2, sb1, eb1, u1, e1,s2, sb2, eb2, u2, e2, S, method):
        """Apply an accumulated panel of reflectors to the local blocks.

        Dispatches on ``method`` to one of four nested routines. Every routine
        makes the same three passes over ``self.blocks``:

        1. blocks with ``work2`` set multiply rows ``s:`` of their ``A2`` by
           conjugated reflector data and ``Send`` the product to process
           ``work2 % size``;
        2. blocks with ``work1`` set ``Recv`` that product, combine it with
           ``A1[s:, sb1:eb1]``, ``Send`` the result ``M`` back and update
           those ``A1`` columns;
        3. blocks with ``work2`` set ``Recv`` ``M`` and update rows ``s:`` of
           their ``A2``.

        ``s`` is ``u1`` on block ``s2`` (passes 1 and 3) and on block 0
        (pass 2); it is 0 on every other block.

        Args:
            X2: panel reflectors, one per row (rows ``:p_eff`` are used).
            sb1, eb1: first / one-past-last column of the panel in ``A1``.
            u1: number of leading rows skipped on block ``s2`` and block 0.
            e1: only used for the unused local ``nru``.
            s2: index of the block whose ``A2`` produced the reflectors.
            sb2, eb2, u2, e2: accepted but not read.
            S: accumulator built by ``__aggregate`` for ``method``.
            method: WY1, WY2, YTY1 or YTY2.

        Returns:
            None (also for a ``method`` that matches none of the four).

        NOTE(review): the receive buffers are shaped ``(m - s, p_eff)``, which
        presumes every ``A1``/``A2`` has ``m`` rows -- confirm against Block.
        """
        # The nested routines read m, p_eff and num from the enclosing scope;
        # those names are bound further down, before any routine is called.
        def wy1():
            Y1, Y2 = S
            if p_eff == 0: return
            # Pass 1: A2 * X2^H, sent to the partner block's process.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(X2)[:p_eff,:m].T)
                self.comm.Send(B2, dest=b.getWork2()%self.size, tag=3*num + b.getWork2())
                del A2


            # Pass 2: M = A1 panel - received product; M goes back unscaled
            # and the A1 panel is updated with M * Y1^T.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s=u1
                A1 = b.getA1()
                B1 = A1[s:, sb1:eb1]
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2, source=b.getWork1()%self.size, tag=3*num + b.rank)
                M = B1 - B2
                self.comm.Send(M, dest=b.getWork1()%self.size, tag=4*num + b.rank)
                A1[s:, sb1:eb1] = A1[s:, sb1:eb1] + M.dot(Y1[sb1:eb1, :p_eff].T)
                del A1

            # Pass 3: A2 += M * Y2^T.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M, source=b.getWork2()%self.size, tag=4*num + b.getWork2())
                A2 = b.getA2()
                A2[s:, :m] = A2[s:,:m] + M.dot(Y2[:m, :p_eff].T)
                del A2

            return

        def wy2():
            W1, W2 = S
            if p_eff == 0: return
            # Pass 1: A2 * conj(W2).
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(W2[:m,:p_eff]))
                self.comm.Send(B2, dest=b.getWork2()%self.size, tag=3*num + b.getWork2())
                del A2

            # Pass 2: M = A1 panel * W1 - received product; A1 panel += M.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s=u1
                A1 = b.getA1()
                B1 = B1 = A1[s:, sb1:eb1].dot(W1[sb1:eb1, :p_eff])
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2, source=b.getWork1()%self.size, tag=3*num + b.rank)
                M = B1 - B2
                self.comm.Send(M, dest=b.getWork1()%self.size, tag=4*num + b.rank)
                A1[s:, sb1:eb1] = A1[s:, sb1:eb1] + M
                del A1


            # Pass 3: A2 += M * X2.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M, source=b.getWork2()%self.size, tag=4*num + b.getWork2())
                A2 = b.getA2()
                A2[s:, :m] = A2[s:,:m] + M.dot(X2)
                del A2
            return


        def yty1():
            T = S
            # Pass 1: A2 * X2^H.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(X2[:p_eff, :m]).T)
                self.comm.Send(B2, dest=b.getWork2()%self.size, tag=3*num + b.getWork2())
                del A2

            # Pass 2: M = (A1 panel - received product) * T; A1 panel += M.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s=u1
                A1 = b.getA1()
                B1 = A1[s:, sb1:eb1]
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2, source=b.getWork1()%self.size, tag=3*num + b.rank)
                M = B1 - B2
                M = M.dot(T[:p_eff,:p_eff])
                self.comm.Send(M, dest=b.getWork1()%self.size, tag=4*num + b.rank)
                A1[s:, sb1:eb1] = A1[s:, sb1:eb1] + M
                del A1

            # Pass 3: A2 += M * X2.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M, source=b.getWork2()%self.size, tag=4*num + b.getWork2())

                A2 = b.getA2()
                A2[s:, :m] = A2[s:,:m] + M.dot(X2)
                del A2


            return

        def yty2():
            invT = S
            # Pass 1: A2 * X2^H.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                A2 = b.getA2()
                B2 = A2[s:, :m].dot(np.conj(X2[:p_eff, :m]).T)
                self.comm.Send(B2, dest=b.getWork2()%self.size, tag=3*num + b.getWork2())
                del A2

            # Pass 2: as yty1, but S holds the inverse factor, so it is
            # inverted here (once per local work1 block) before use.
            for b in self.blocks:
                if b.work1 == None: continue
                s = 0
                if b.rank == 0:
                    s=u1
                A1 = b.getA1()
                B1 = A1[s:, sb1:eb1]
                B2 = np.empty((m - s, p_eff), complex)
                self.comm.Recv(B2, source=b.getWork1()%self.size, tag=3*num + b.rank)
                M = B1 - B2
                M = M.dot(inv(invT[:p_eff,:p_eff]))
                self.comm.Send(M, dest=b.getWork1()%self.size, tag=4*num + b.rank)
                A1[s:, sb1:eb1] = A1[s:, sb1:eb1] + M
                del A1
            # Pass 3: A2 += M * X2.
            for b in self.blocks:
                if b.work2 == None:
                    continue
                s = 0
                if b.rank == s2:
                    s = u1
                M = np.empty((m - s, p_eff), complex)
                self.comm.Recv(M, source=b.getWork2()%self.size, tag=4*num + b.getWork2())

                A2 = b.getA2()
                A2[s:, :m] = A2[s:,:m] + M.dot(X2)
                del A2
            return


        # Values shared by the nested routines above (n and nru are unused).
        m = self.m
        n = self.n
        nru = e1*m - u1
        p_eff = eb1 - sb1
        num = self.numOfBlocks

        if method == WY1:
            return wy1()
        elif method == WY2:
            return wy2()
        elif method ==YTY1:
            return yty1()
        elif method == YTY2:
            return yty2()

    def __aggregate(self,S,  X2, beta, p, j, j1, j2, p_eff, method):
        """Fold the ``j``-th reflector of the current panel into ``S``.

        ``X2`` holds the panel's reflectors (row ``j`` is the newest one),
        ``beta`` is its scalar and ``j1`` the column it was built for. The
        meaning of ``S`` and the return value depend on ``method``:

        * WY1  -- ``S`` is a pair ``(Y1, Y2)``; both are updated in place and
          returned as a tuple.
        * WY2  -- ``S`` is a pair ``(W1, W2)``; same convention.
        * YTY1 -- ``S`` is the matrix ``T``; column ``j`` is filled in.
        * YTY2 -- ``S`` is ``invT``; it is left alone until the last
          reflector of the panel (``j == p_eff - 1``) and then built in one
          go from ``X2``.

        Any other ``method`` returns None. ``p`` and ``j2`` are not read.
        """
        m = self.m
        panel_first = j1 - j  # column at which the current panel starts
        # Complex scratch vector; only its first ``j`` entries are ever used.
        coeffs = np.zeros(m*(self.n + 1), complex)

        if method == WY1:
            Y1 = S[0]
            Y2 = S[1]
            Y1[j1, j] = -beta
            Y2[:, j] = -beta*X2[j, :m]
            if j > 0:
                coeffs[:j] = beta*np.conj(X2)[j, :m].dot(Y2[:m, :j])
                Y1[j1, :j] = Y1[j1, :j] + coeffs[:j]
                as_column = X2[j, :m][np.newaxis].T
                Y2[:m, :j] = Y2[:m, :j] + as_column.dot(coeffs[:j][np.newaxis])
            return Y1, Y2

        if method == WY2:
            W1 = S[0]
            W2 = S[1]
            W1[j1, j] = -beta
            W2[:, j] = -beta*X2[j, :m]
            if j > 0:
                coeffs[:j] = beta*X2[:j, :m].dot(np.conj(X2[j, :m].T))
                W1[panel_first:j1, j] = W1[panel_first:j1, :j].dot(coeffs[:j])
                W2[:m, j] = W2[:m, j] + W2[:m, :j].dot(np.conj(coeffs)[:j])
            return W1, W2

        if method == YTY1:
            S[j, j] = -beta
            if j > 0:
                coeffs[:j] = beta*X2[:j, :m].dot(np.conj(X2)[j, :m].T)
                S[:j, j] = S[:j, :j].dot(coeffs[:j])
            return S

        if method == YTY2:
            if j == p_eff - 1:
                gram = X2[:p_eff, :m].dot(np.conj(X2)[:p_eff, :m].T)
                S[:p_eff, :p_eff] = triu(gram)
                for d in range(p_eff):
                    S[d, d] = (S[d, d] - 1.)/2.
            return S



    def __seq_reduc(self, s1, e1, s2, e2):
        n = self.n
        m = self.m
        for j in range (0, self.m):
            X2, beta = self.__house_vec(j, s2)
            
            self.__seq_update(X2, beta, e1*m, e2*m, s2, j, m, n)

    def __seq_update(self,X2, beta, e1, e2, s2, j, m, n):
        """Apply one reflector ``(X2, beta)`` to column ``j`` of A1 and to A2.

        Three passes over the local blocks, paired through blocking MPI
        ``Send``/``Recv``:

        1. each block with ``work2`` set computes ``A2 . conj(X2)``, keeps
           rows ``start:end`` and sends them to process ``work2 % size``;
        2. each block with ``work1`` set receives that vector ``B1``, forms
           ``v = A1[start:end, j] - B1``, sends ``v`` back and subtracts
           ``beta*v`` from ``A1[start:end, j]``;
        3. each block with ``work2`` set receives ``v`` and subtracts
           ``beta * outer(v, X2)`` from ``A2[start:end, :]``.

        Row window: ``start`` is ``j + 1`` on block ``s2`` (passes 1 and 3)
        and on block 0 (pass 2), otherwise 0. ``end`` is ``e % m or m`` on the
        block whose index equals ``e/m`` (``e`` being ``e2`` resp. ``e1``),
        otherwise ``m``.

        ``n`` is accepted but not read. Returns None.

        NOTE(review): ``e1/m`` and ``e2/m`` rely on Python 2 integer
        division; under true division the comparison with ``b.rank`` would
        only match exact multiples -- confirm before porting.
        """
        #X2 = np.array([X2])
        u = j + 1
        num = self.numOfBlocks

        # nru is computed but never read.
        nru = e1*m - (s2*m + j + 1)
        # Pass 1: A2 side sends its projection onto the reflector.
        for b in self.blocks:
            if b.work2 == None:
                continue
            B1 = np.dot(b.getA2(), np.conj(X2.T))
            start = 0
            end = m
            if b.rank == s2:
                start = u
            if b.rank == e2/m:
                end = e2 % m or m
            B1 = B1[start:end]
            self.comm.Send(B1, dest=b.getWork2()%self.size, tag=4*num + b.getWork2())


        # Pass 2: A1 side receives it, answers with v and updates column j.
        for b in self.blocks:
            if b.work1 == None: continue
            start = 0
            end = m
            if b.rank == 0:
                start = u
            if b.rank == e1/m:
                end = e1 % m or m

            B1 = np.empty(end-start, complex)

            self.comm.Recv(B1, source=b.getWork1()%self.size, tag=4*num + b.rank)
            A1 = b.getA1()
            B2 = A1[start:end, j]

            v = B2 - B1
            self.comm.Send(v, (b.getWork1())%self.size, 5*num + b.getWork1())
            A1[start:end,j] -= beta*v
            del A1

        # Pass 3: A2 side receives v and applies the rank-one correction.
        for b in self.blocks:
            if b.work2 == None:
                continue
            start = 0
            end = m
            if b.rank == s2:
                start = u
            if b.rank == e2/m :
                end = e2 % m or m
            v = np.empty(end-start,complex)
            self.comm.Recv(v, source=b.getWork2()%self.size, tag=5*num + b.rank)
            A2 = b.getA2()
            A2[start:end,:] -= beta*v[np.newaxis].T.dot(np.array([X2[:]]))
            del A2
        

    def __house_vec(self, j, s2):
        """Compute the reflector ``(X2, beta)`` for row ``j`` of block ``s2``.

        Every process must call this method: it contains three collective
        ``bcast`` calls plus point-to-point messages between the process
        holding block ``s2`` and the process holding block ``0``.

        Steps:
          * If every entry of row ``j`` of ``A2`` (block ``s2``) is below
            ``1e-13`` in magnitude, return a zero vector of length ``self.m``
            and ``beta = 0`` on all processes.
          * Otherwise ``sigma = row . conj(row)`` is sent to the holder of
            block ``0``, which forms ``alpha = (A1[j, j]**2 - sigma)**0.5``,
            picks ``z = A1[j, j] +/- alpha`` (the candidate with the larger
            real part), overwrites ``A1[j, j]`` with ``-/+ alpha`` and
            computes ``beta``.
          * The holder of block ``s2`` divides row ``j`` of ``A2`` by ``z``
            in place; that scaled row is ``X2``.

        Returns:
            ``(X2, beta)``, identical on every process after the broadcasts.
        """
        comm = self.comm
        blocks = self.blocks
        tag_base = self.numOfBlocks
        owner = s2 % self.size  # process that holds block s2

        X2 = np.zeros(self.m, complex)
        beta = 0

        # --- negligible-row test, decided by the owner of block s2 ---------
        negligible = False
        if blocks.hasRank(s2):
            A2 = blocks.getBlock(s2).getA2()
            negligible = bool(np.all(np.abs(A2[j, :]) < 1e-13))
            del A2
        if comm.bcast(negligible, root=owner):
            return X2, beta

        # --- owner of block s2: send sigma to the holder of block 0 --------
        if blocks.hasRank(s2):
            A2 = blocks.getBlock(s2).getA2()
            sigma = A2[j, :].dot(np.conj(A2[j, :]))
            comm.send(sigma, dest=0, tag=2 * tag_base + s2)
            del A2

        # --- holder of block 0: new diagonal entry, z and beta -------------
        if blocks.hasRank(0):
            A1 = blocks.getBlock(0).getA1()
            sigma = comm.recv(source=owner, tag=2 * tag_base + s2)
            diag = A1[j, j]
            alpha = (diag ** 2 - sigma) ** 0.5
            # Take the sign that gives z the larger real part.
            if np.real(diag + alpha) < np.real(diag - alpha):
                z = diag - alpha
                A1[j, j] = alpha
            else:
                z = diag + alpha
                A1[j, j] = -alpha
            comm.send(z, dest=owner, tag=3 * tag_base + s2)
            beta = 2*z*z/(-sigma + z*z)
            del A1

        # --- owner of block s2: scale the row by 1/z and store it ----------
        if blocks.hasRank(s2):
            z = comm.recv(source=0, tag=3 * tag_base + s2)
            A2 = blocks.getBlock(s2).getA2()
            X2 = A2[j, :] / z
            A2[j, :] = X2
            del A2

        # --- make the result available everywhere --------------------------
        beta = comm.bcast(beta, root=0)
        X2 = comm.bcast(X2, root=owner)
        return X2, beta