def parseFile(filename, patIdx, medIdx, diagIdx, labelIdx, delim="|"):
    """
    Parse a delimited text file into a 3-mode sparse tensor.

    The resultant sparse tensor has patient on the 0th mode, diagnosis on
    the 1st mode, and medications on the 2nd mode.  Every input line adds
    one entry of value 1; axis indices are handed out in order of first
    appearance.

    Parameters
    ------------
    filename : path of the file to read
    patIdx : column holding the patient identifier
    medIdx : column holding the medication
    diagIdx : column holding the diagnosis
    labelIdx : column holding the patient class label
    delim : column delimiter

    Output
    -----------
    X : the sptensor built from the file
    tensorInfo : dictionary with the axis labels ('axis', 'pat', 'med',
                 'diag') and the per-patient labels ('class')
    """
    print("Creating the tensor for " + filename)
    patList = OrderedDict()
    medList = OrderedDict()
    diagList = OrderedDict()
    patClass = OrderedDict()
    entries = []
    # the with-block releases the handle even if a line fails to parse
    with open(filename) as datfile:
        for line in datfile:
            parse = line.rstrip('\r\n').split(delim)
            pat = parse[patIdx]
            diag = parse[diagIdx]
            med = parse[medIdx]
            # insert them into the maps if necessary
            if pat not in patList:
                patList[pat] = len(patList)
                # the label on a patient's first row is the one that is kept
                patClass[pat] = parse[labelIdx]
            if diag not in diagList:
                diagList[diag] = len(diagList)
            if med not in medList:
                medList[med] = len(medList)
            # every row is recorded; the earlier `i > 1` test silently
            # dropped the second line of the file
            entries.append([patList[pat], diagList[diag], medList[med]])
    # reshape keeps the (0, 3) layout for an empty file
    tensorIdx = np.array(entries, dtype=int).reshape(-1, 3)
    tensorVal = np.ones((tensorIdx.shape[0], 1))
    siz = np.array([len(patList), len(diagList), len(medList)])
    X = sptensor.sptensor(tensorIdx, tensorVal, siz)
    tensorInfo = {}
    tensorInfo['axis'] = [list(patList.keys()), list(diagList.keys()),
                          list(medList.keys())]
    tensorInfo['pat'] = list(patList.keys())
    tensorInfo['med'] = list(medList.keys())
    tensorInfo['diag'] = list(diagList.keys())
    tensorInfo['class'] = list(patClass.values())
    return X, tensorInfo
def evaluatePredictionAUC_3(self, experCount, testCount=4392):
    """
    Fit logistic CP-APR on the leading slice of self.X and export features.

    Entries of self.X whose mode-0 index is below testCount (together with
    self.Y[:testCount]) are factorized; the remaining entries are
    re-indexed from zero, projected onto the fitted factors with
    KLProjection and row-normalized.  Four files are written under
    self.data_dir + 'experiment/':

    trainingX_<experCount>.csv : first factor matrix of the fitted model
    trainingY_<experCount>.csv : self.Y[:testCount]
    testX_<experCount>.csv : normalized projection of the remaining slice
    testY_<experCount>.csv : self.Y[testCount:]

    Side effects: zeros in self.Y are replaced by -1 in place and the
    global numpy random seed is set to 10.

    Parameters
    ------------
    experCount : value embedded in the output file names
    testCount : mode-0 index at which the tensor is split (defaults to
                4392, the value that used to be hard-coded)
    """
    indexC1 = np.where(self.X.subs[:, 0] < testCount)
    indexC2 = np.where(self.X.subs[:, 0] >= testCount)
    # index-array selection copies, so self.X.subs itself is not modified
    subs1 = self.X.subs[indexC1]
    subs2 = self.X.subs[indexC2]
    subs2[:, 0] = subs2[:, 0] - testCount
    vals1 = self.X.vals[indexC1]
    vals2 = self.X.vals[indexC2]
    size1 = np.array([testCount, self.X.shape[1], self.X.shape[2]])
    size2 = np.array([self.X.shape[0] - testCount, self.X.shape[1],
                      self.X.shape[2]])
    # relabel 0 -> -1 (mutates self.Y)
    self.Y[self.Y == 0] = -1
    Y1 = self.Y[:testCount]
    Y2 = self.Y[testCount:]
    trainingX = sptensor.sptensor(subs1, vals1, size1)
    testX = sptensor.sptensor(subs2, vals2, size2)
    MCPR, cpstats, mstats = cp_apr_logis.cp_apr(trainingX, Y1, self.R,
                                                maxiters=100, maxinner=50)
    MCPR.normalize_sort(1)
    klproj = KLProjection.KLProjection(MCPR.U, self.R)
    np.random.seed(10)
    testMatrix = klproj.projectSlice(testX, 0)
    ## scale by summing across the rows
    totWeight = np.sum(testMatrix, axis=1)
    zeroIdx = np.where(totWeight < 1e-100)[0]
    if len(zeroIdx) > 0:
        # for the zero ones we're going to evenly distribute
        evenDist = np.repeat(1.0 / self.R, len(zeroIdx) * self.R)
        testMatrix[zeroIdx, :] = evenDist.reshape((len(zeroIdx), self.R))
        totWeight = np.sum(testMatrix, axis=1)
    twMat = np.repeat(totWeight, self.R).reshape(testMatrix.shape[0], self.R)
    testMatrix = testMatrix / twMat
    # all four outputs share one directory and the experiment suffix
    outPrefix = self.data_dir + 'experiment/'
    suffix = '_' + str(experCount) + '.csv'
    np.savetxt(outPrefix + 'trainingX' + suffix, MCPR.U[0])
    np.savetxt(outPrefix + 'trainingY' + suffix, Y1)
    np.savetxt(outPrefix + 'testX' + suffix, testMatrix)
    np.savetxt(outPrefix + 'testY' + suffix, Y2)
    print('OK')
def tensorSubset(X, sm, subsetIds):
    """
    Restrict a list of tensors to the entries selected by subsetIds.

    Parameters
    ------------
    X : a list of tensors to subset
    sm : a 2-d numpy array; each row holds (index into X, mode of that
         tensor) naming where the subset is taken
    subsetIds : a list of indices to keep

    Output
    -----------
    subsetX : a copy of the list in which every tensor named in sm has
              been replaced by its subset, with the indices rebased
    """
    subsetX = list(X)
    for row in range(sm.shape[0]):
        tensorIdx, tensorMode = sm[row, 0], sm[row, 1]
        source = X[tensorIdx]
        # rows of the coordinate list whose index on this mode is kept
        keepMask = np.in1d(source.subs[:, tensorMode].ravel(), subsetIds)
        keepRows = np.where(keepMask)[0]
        keptSubs = source.subs[keepRows, :]
        keptVals = source.vals[keepRows]
        keptSubs = rebase(subsetIds, keptSubs)
        newShape = list(source.shape)
        newShape[tensorMode] = len(subsetIds)
        subsetX[tensorIdx] = sptensor.sptensor(keptSubs, keptVals, newShape)
    return subsetX
def tensorSubset(X, sm, subsetIds):
    """
    Restrict a list of tensors to the entries selected by subsetIds.

    Parameters
    ------------
    X : a list of tensors to subset
    sm : a 2-d numpy array; each row holds (index into X, mode of that
         tensor) naming where the subset is taken
    subsetIds : a list of indices to keep

    Output
    -----------
    subsetX : a copy of the list in which every tensor named in sm has
              been replaced by its subset, with the indices rebased
    """
    subsetX = list(X)
    for row in range(sm.shape[0]):
        tensorIdx, tensorMode = sm[row, 0], sm[row, 1]
        source = X[tensorIdx]
        # rows of the coordinate list whose index on this mode is kept
        keepMask = np.in1d(source.subs[:, tensorMode].ravel(), subsetIds)
        keepRows = np.where(keepMask)[0]
        keptSubs = source.subs[keepRows, :]
        keptVals = source.vals[keepRows]
        keptSubs = rebase(subsetIds, keptSubs)
        newShape = list(source.shape)
        newShape[tensorMode] = len(subsetIds)
        subsetX[tensorIdx] = sptensor.sptensor(keptSubs, keptVals, newShape)
    return subsetX
def loadMultiTensor(inFilePattern):
    """
    Load the list of tensors from this input file format

    Parameters
    ------------
    inFilePattern : the input file pattern for the 2 files with the tensor
                    data and axis information; "{0}" is filled with "data"
                    and "info" respectively

    Output
    -----------
    X : the list of tensors in the file
    sharedModes : the 2-d array with the shared modes location
    axisDict : the axis information for all the tensors
    patClass : the patient cohort information
    """
    X = []
    # the arrays were written back to back, so they are read in that order;
    # the with-block closes the handle (it used to stay open)
    with open(inFilePattern.format("data"), "rb") as infile:
        lenX = np.load(infile)
        for _ in range(int(lenX)):
            subs = np.load(infile)
            vals = np.load(infile)
            siz = np.load(infile)
            X.append(sptensor.sptensor(subs, vals, siz))
        sharedModes = np.load(infile)
    tensorInfo = shelve.open(inFilePattern.format("info"), "r")
    try:
        axisDict = tensorInfo[AXIS]
        patClass = tensorInfo[CLASS]
    finally:
        # close the shelf even when a key is missing
        tensorInfo.close()
    return X, sharedModes, axisDict, patClass
def parseShared2DTensorFile(filename, axis0Dict, axis1Dict, axis0Idx, axis1Idx, valueIdx):
    """
    Build a 2-mode sparse tensor from a csv file.

    Parameters
    ------------
    filename : path of the csv file
    axis0Dict : label -> index map for mode 0, or None to start a new one;
                a supplied map is extended in place with unseen labels
    axis1Dict : same for mode 1
    axis0Idx : csv column holding the mode-0 label
    axis1Idx : csv column holding the mode-1 label
    valueIdx : csv column holding the integer value of the entry

    Output
    -----------
    tenX : the sptensor with one entry per csv row
    axisDict : {0: axis0Dict, 1: axis1Dict}
    """
    print("Creating tensor from file " + filename)
    ## initialize the dictionaries if nonexistent
    if axis0Dict is None:
        axis0Dict = OrderedDict()
    if axis1Dict is None:
        axis1Dict = OrderedDict()
    idxRows = []
    valRows = []
    # binary mode is what the Python 2 csv module expects
    with open(filename, "rb") as f:
        for row in csv.reader(f):
            key0 = row[axis0Idx]
            key1 = row[axis1Idx]
            ## see if we need to add them to the maps
            if key0 not in axis0Dict:
                axis0Dict[key0] = len(axis0Dict)
            if key1 not in axis1Dict:
                axis1Dict[key1] = len(axis1Dict)
            idxRows.append([axis0Dict[key0], axis1Dict[key1]])
            valRows.append([int(row[valueIdx])])
    # rows are collected in lists and converted once instead of re-copying
    # the arrays for every row; reshape keeps the 2-d layout when empty
    tensorIdx = np.array(idxRows, dtype=int).reshape(-1, 2)
    tensorVal = np.array(valRows, dtype=int).reshape(-1, 1)
    tenX = sptensor.sptensor(tensorIdx, tensorVal,
                             np.array([len(axis0Dict), len(axis1Dict)]))
    axisDict = {0: axis0Dict, 1: axis1Dict}
    return tenX, axisDict
def permute(verbose):
    """Build a 5x5x5 sparse tensor and print it and its [2, 0, 1] permutation."""
    coords = numpy.array([[1, 2, 3], [1, 1, 3], [2, 0, 1], [4, 3, 4],
                          [1, 0, 1], [1, 0, 0]])
    entries = numpy.array([[0.5], [1.5], [10], [3.5], [4.5], [5.5]])
    dims = numpy.array([5, 5, 5])
    obj = sptensor.sptensor(coords, entries, dims)
    # NOTE(review): the original line breaks were lost; both prints are
    # assumed to sit under the verbose guard -- confirm
    if not verbose:
        return
    print(obj)
    print(obj.permute([2, 0, 1]))
def ttmTests(verbose):
    """Print tensor-times-matrix products for two small sparse tensors.

    verbose is accepted for symmetry with the other tests but is not read.
    """
    # first tensor: size inferred from the subscripts, product along mode 0
    coords = numpy.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1],
                          [0, 1, 2]])
    entries = numpy.array([[1], [2], [3], [4], [5]])
    obj = sptensor.sptensor(coords, entries)
    A = numpy.array([[10, 20], [30, 40]])
    print(obj.ttm(A, 0))

    # second tensor: single-mode product, then a product over modes 1 and 2
    coords = numpy.array([[1, 2, 2], [1, 1, 2], [2, 0, 1], [1, 0, 1],
                          [1, 0, 0]])
    entries = numpy.array([[0.5], [1.5], [3.5], [4.5], [5.5]])
    obj = sptensor.sptensor(coords, entries)
    print(obj)
    A = numpy.arange(18).reshape([6, 3])
    print(obj.ttm(A, 2))
    print(obj.ttm([A, A], [1, 2]))
def ttmTests(verbose):
    """Print tensor-times-matrix products for two small sparse tensors.

    verbose is accepted for symmetry with the other tests but is not read.
    """
    # first tensor: size inferred from the subscripts, product along mode 0
    coords = numpy.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1],
                          [0, 1, 2]])
    entries = numpy.array([[1], [2], [3], [4], [5]])
    obj = sptensor.sptensor(coords, entries)
    A = numpy.array([[10, 20], [30, 40]])
    print(obj.ttm(A, 0))

    # second tensor: single-mode product, then a product over modes 1 and 2
    coords = numpy.array([[1, 2, 2], [1, 1, 2], [2, 0, 1], [1, 0, 1],
                          [1, 0, 0]])
    entries = numpy.array([[0.5], [1.5], [3.5], [4.5], [5.5]])
    obj = sptensor.sptensor(coords, entries)
    print(obj)
    A = numpy.arange(18).reshape([6, 3])
    print(obj.ttm(A, 2))
    print(obj.ttm([A, A], [1, 2]))
def permute(verbose):
    """Build a 5x5x5 sparse tensor and print it and its [2, 0, 1] permutation."""
    coords = numpy.array([[1, 2, 3], [1, 1, 3], [2, 0, 1], [4, 3, 4],
                          [1, 0, 1], [1, 0, 0]])
    entries = numpy.array([[0.5], [1.5], [10], [3.5], [4.5], [5.5]])
    dims = numpy.array([5, 5, 5])
    obj = sptensor.sptensor(coords, entries, dims)
    # NOTE(review): the original line breaks were lost; both prints are
    # assumed to sit under the verbose guard -- confirm
    if not verbose:
        return
    print(obj)
    print(obj.permute([2, 0, 1]))
def tosptensor(self):
    """ returns the sptensor object that contains the same value with the tensor object.

    Every position produced by tools.allIndices(self.shape) is stored,
    paired with the flattened contents of self.data as an (n, 1) column.
    """
    # the unused local `length` was dropped
    sub = tools.allIndices(self.shape)
    # flatten() copies, so the sptensor does not share memory with self.data
    vals = self.data.flatten().reshape(self.data.size, 1)
    return sptensor.sptensor(sub, vals, self.shape)
def mathops(verbose):
    """Print the results of comparison and arithmetic on two sparse tensors."""
    coordsA = numpy.array([[0, 0, 0], [0, 0, 2], [1, 1, 1], [3, 3, 3],
                           [0, 0, 0], [0, 0, 0]])
    coordsB = numpy.array([[0, 2, 4], [0, 0, 2], [1, 1, 1], [3, 3, 3],
                           [0, 0, 0], [0, 0, 0]])
    entriesA = numpy.array([[0.5], [1.5], [2.5], [3.5], [4.5], [5.5]])
    entriesB = numpy.array([[0.5], [1.5], [100], [3.5], [4.5], [5.5]])
    dims = numpy.array([4, 4, 4])
    obj = sptensor.sptensor(coordsA, entriesA, dims)
    obj2 = sptensor.sptensor(coordsB, entriesB, dims)
    # NOTE(review): the original line breaks were lost; every print is
    # assumed to sit under a `verbose == 1` guard -- confirm
    if verbose != 1:
        return
    # equality against a different tensor and against itself
    print(obj == obj2)
    print(obj == obj)
    # scalar arithmetic
    print(obj + 100)
    print(obj - 100)
    print(obj * 3.4)
    # tensor-tensor arithmetic
    print(obj + obj2)
    print(obj - obj2)
def tosptensor(self):
    """ returns the sptensor object that contains the same value with the tensor object.

    Every position produced by tools.allIndices(self.shape) is stored,
    paired with the flattened contents of self.data as an (n, 1) column.
    """
    # the unused local `length` was dropped
    sub = tools.allIndices(self.shape)
    # flatten() copies, so the sptensor does not share memory with self.data
    vals = self.data.flatten().reshape(self.data.size, 1)
    return sptensor.sptensor(sub, vals, self.shape)
def tosptensor(self):
    """ returns the sptensor object that contains the same value with the tensor object.

    Only the non-zero entries of self.data are stored.
    """
    nonzeroIdx = numpy.nonzero(self.data)
    nonzeroVals = self.data[nonzeroIdx]
    count = len(nonzeroVals)
    # values become an (n, 1) column, subscripts an (n, ndims) integer table
    valColumn = numpy.reshape(nonzeroVals, (count, 1))
    subsTable = numpy.zeros((count, self.ndims()), dtype='int')
    for mode in range(self.ndims()):
        subsTable[:, mode] = nonzeroIdx[mode]
    return sptensor.sptensor(subsTable, valColumn, self.shape)
def tosparsematTest(verbose):
    """Print a 4x5x6 sparse tensor, its sptenmat built with [1], and the sparse matrix.

    verbose is accepted for symmetry with the other tests but is not read.
    """
    coords = numpy.array([[1, 3, 5], [1, 1, 0], [2, 2, 2], [3, 4, 4],
                          [1, 1, 1], [1, 1, 1]])
    entries = numpy.array([[0.5], [1.5], [100], [3.5], [4.5], [5.5]])
    dims = numpy.array([4, 5, 6])
    spt = sptensor.sptensor(coords, entries, dims)
    print(spt)
    sptm = sptenmat.sptenmat(spt, [1])
    print(sptm)
    print(sptm.tosparsemat())
def generateRandomProblem(MFull):
    """
    Draw a Poisson observation tensor from a full model tensor.

    Parameters
    ------------
    MFull : tensor whose .data entries are used as the Poisson rates

    Output
    -----------
    X : sptensor of MFull.shape holding one Poisson draw per entry
    """
    nnz = np.nonzero(MFull.data)
    mfVals = MFull.data.flatten()
    # one draw per entry, kept as a Python-level loop so the sequence of
    # random numbers is unchanged
    xVals = np.reshape([np.random.poisson(l) for l in mfVals],
                       (len(mfVals), 1))
    # allocate integers directly: assigning to .dtype afterwards reinterprets
    # the float64 buffer and only works when the two item sizes match
    Xsubs = np.zeros((len(mfVals), MFull.ndims()), dtype='int')
    # assumes every entry of MFull.data is non-zero, otherwise nnz has fewer
    # rows than mfVals and this assignment fails -- TODO confirm
    for n in range(MFull.ndims()):
        Xsubs[:, n] = nnz[n]
    X = sptensor.sptensor(Xsubs, xVals, MFull.shape)
    ## return the observation
    return X
def tosptensor(self):
    """ returns the sptensor object that contains the same value with the tensor object.

    Only the non-zero entries of self.data are stored.
    """
    nnz = numpy.nonzero(self.data)
    vals = self.data[nnz]
    totVals = len(vals)
    vals = numpy.reshape(vals, (totVals, 1))
    # allocate the subscript table as integers up front; assigning to
    # subs.dtype afterwards reinterprets the float64 buffer and only works
    # when the integer item size happens to be 8 bytes
    subs = numpy.zeros((totVals, self.ndims()), dtype='int')
    for n in range(self.ndims()):
        subs[:, n] = nnz[n]
    return sptensor.sptensor(subs, vals, self.shape)
def tosparsematTest(verbose):
    """Print a 4x5x6 sparse tensor, its sptenmat built with [1], and the sparse matrix.

    verbose is accepted for symmetry with the other tests but is not read.
    """
    coords = numpy.array([[1, 3, 5], [1, 1, 0], [2, 2, 2], [3, 4, 4],
                          [1, 1, 1], [1, 1, 1]])
    entries = numpy.array([[0.5], [1.5], [100], [3.5], [4.5], [5.5]])
    dims = numpy.array([4, 5, 6])
    spt = sptensor.sptensor(coords, entries, dims)
    print(spt)
    sptm = sptenmat.sptenmat(spt, [1])
    print(sptm)
    print(sptm.tosparsemat())
def parseFile(filename, patIdx, medIdx, diagIdx, labelIdx, delim="|"):
    """
    Parse a delimited text file into a 3-mode sparse tensor.

    The resultant sparse tensor has patient on the 0th mode, diagnosis on
    the 1st mode, and medications on the 2nd mode.  Every input line adds
    one entry of value 1; axis indices are handed out in order of first
    appearance.

    Parameters
    ------------
    filename : path of the file to read
    patIdx : column holding the patient identifier
    medIdx : column holding the medication
    diagIdx : column holding the diagnosis
    labelIdx : column holding the patient class label
    delim : column delimiter

    Output
    -----------
    X : the sptensor built from the file
    tensorInfo : dictionary with the axis labels ('axis', 'pat', 'med',
                 'diag') and the per-patient labels ('class')
    """
    print("Creating the tensor for " + filename)
    patList = OrderedDict()
    medList = OrderedDict()
    diagList = OrderedDict()
    patClass = OrderedDict()
    entries = []
    # the with-block releases the handle even if a line fails to parse
    with open(filename) as datfile:
        for line in datfile:
            parse = line.rstrip('\r\n').split(delim)
            pat = parse[patIdx]
            diag = parse[diagIdx]
            med = parse[medIdx]
            # insert them into the maps if necessary
            if pat not in patList:
                patList[pat] = len(patList)
                # the label on a patient's first row is the one that is kept
                patClass[pat] = parse[labelIdx]
            if diag not in diagList:
                diagList[diag] = len(diagList)
            if med not in medList:
                medList[med] = len(medList)
            # every row is recorded; the earlier `i > 1` test silently
            # dropped the second line of the file
            entries.append([patList[pat], diagList[diag], medList[med]])
    # reshape keeps the (0, 3) layout for an empty file
    tensorIdx = np.array(entries, dtype=int).reshape(-1, 3)
    tensorVal = np.ones((tensorIdx.shape[0], 1))
    siz = np.array([len(patList), len(diagList), len(medList)])
    X = sptensor.sptensor(tensorIdx, tensorVal, siz)
    tensorInfo = {}
    tensorInfo['axis'] = [list(patList.keys()), list(diagList.keys()),
                          list(medList.keys())]
    tensorInfo['pat'] = list(patList.keys())
    tensorInfo['med'] = list(medList.keys())
    tensorInfo['diag'] = list(diagList.keys())
    tensorInfo['class'] = list(patClass.values())
    return X, tensorInfo
def mathops(verbose):
    """Print the results of comparison and arithmetic on two sparse tensors."""
    coordsA = numpy.array([[0, 0, 0], [0, 0, 2], [1, 1, 1], [3, 3, 3],
                           [0, 0, 0], [0, 0, 0]])
    coordsB = numpy.array([[0, 2, 4], [0, 0, 2], [1, 1, 1], [3, 3, 3],
                           [0, 0, 0], [0, 0, 0]])
    entriesA = numpy.array([[0.5], [1.5], [2.5], [3.5], [4.5], [5.5]])
    entriesB = numpy.array([[0.5], [1.5], [100], [3.5], [4.5], [5.5]])
    dims = numpy.array([4, 4, 4])
    obj = sptensor.sptensor(coordsA, entriesA, dims)
    obj2 = sptensor.sptensor(coordsB, entriesB, dims)
    # NOTE(review): the original line breaks were lost; every print is
    # assumed to sit under a `verbose == 1` guard -- confirm
    if verbose != 1:
        return
    # equality against a different tensor and against itself
    print(obj == obj2)
    print(obj == obj)
    # scalar arithmetic
    print(obj + 100)
    print(obj - 100)
    print(obj * 3.4)
    # tensor-tensor arithmetic
    print(obj + obj2)
    print(obj - obj2)
def ctor(verbose):
    """Print sptenmat objects built from a dense array and from an sptensor.

    verbose is accepted for symmetry with the other tests but is not read.
    """
    # construction from a dense 3x2x3 array with explicit row/column modes
    dense = numpy.array([[[0, 0, 0.9052], [0.9121, 0, 0.7363]],
                         [[0.1757, 0.2089, 0], [0, 0.7455, 0]],
                         [[0, 0, 0.6754], [0, 0, 0]]])
    obj = sptenmat.sptenmat(dense, [0], [1, 2], [10, 10, 10])
    print(obj)

    # construction from a 4x5x6 sparse tensor
    coords = numpy.array([[1, 3, 5], [1, 1, 0], [2, 2, 2], [3, 4, 4],
                          [1, 1, 1], [1, 1, 1]])
    entries = numpy.array([[0.5], [1.5], [100], [3.5], [4.5], [5.5]])
    dims = numpy.array([4, 5, 6])
    spt = sptensor.sptensor(coords, entries, dims)
    print(spt)
    obj = sptenmat.sptenmat(spt, [0, 1], [2])
    print(obj)
def tosptensor(self):
    """Rebuild the sptensor of shape self.tsize from this matricized form.

    Column 0 of self.subs is expanded over the modes listed in self.rdims
    and column 1 over those in self.cdims (via tools.ind2sub); the columns
    are then placed back in mode order.
    """
    newshape = self.tsize
    entryCount = len(self.subs)

    # per-entry subscripts along the row modes
    rowsubs = []
    if len(self.rdims) != 0:
        rowshape = [self.tsize[self.rdims[i]]
                    for i in range(len(self.rdims))]
        rowsubs = numpy.array([tools.ind2sub(rowshape, self.subs[i][0])
                               for i in range(entryCount)])

    # per-entry subscripts along the column modes
    colsubs = []
    if len(self.cdims) != 0:
        colshape = [self.tsize[self.cdims[i]]
                    for i in range(len(self.cdims))]
        colsubs = numpy.array([tools.ind2sub(colshape, self.subs[i][1])
                               for i in range(entryCount)])

    # start with one empty row per entry and append one column per mode
    newsubs = [[] for _ in range(entryCount)]
    for k in range(len(newshape)):
        find = tools.find(self.rdims, k)
        if find != -1:
            source = rowsubs
        else:
            find = tools.find(self.cdims, k)
            source = colsubs
        column = source[:, find].reshape([entryCount, 1])
        newsubs = numpy.concatenate((newsubs, column), axis=1)

    # the values carry over unchanged; the subscripts are cast to integers
    return sptensor.sptensor(numpy.array(newsubs, dtype="int"), self.vals,
                             newshape)
def ctor(verbose):
    """Print sptenmat objects built from a dense array and from an sptensor.

    verbose is accepted for symmetry with the other tests but is not read.
    """
    # construction from a dense 3x2x3 array with explicit row/column modes
    dense = numpy.array([[[0, 0, 0.9052], [0.9121, 0, 0.7363]],
                         [[0.1757, 0.2089, 0], [0, 0.7455, 0]],
                         [[0, 0, 0.6754], [0, 0, 0]]])
    obj = sptenmat.sptenmat(dense, [0], [1, 2], [10, 10, 10])
    print(obj)

    # construction from a 4x5x6 sparse tensor
    coords = numpy.array([[1, 3, 5], [1, 1, 0], [2, 2, 2], [3, 4, 4],
                          [1, 1, 1], [1, 1, 1]])
    entries = numpy.array([[0.5], [1.5], [100], [3.5], [4.5], [5.5]])
    dims = numpy.array([4, 5, 6])
    spt = sptensor.sptensor(coords, entries, dims)
    print(spt)
    obj = sptenmat.sptenmat(spt, [0, 1], [2])
    print(obj)
def ctor(verbose):
    """Construct sparse tensors with and without an explicit size and print them."""
    coordsA = numpy.array([[0, 0, 0], [0, 0, 2], [1, 1, 1], [3, 3, 3],
                           [0, 0, 0], [0, 0, 0]])
    coordsB = numpy.array([[0, 2, 3], [0, 0, 2], [1, 1, 1], [3, 3, 3],
                           [0, 0, 0], [0, 0, 0]])
    entriesA = numpy.array([[0.5], [1.5], [2.5], [3.5], [4.5], [5.5]])
    entriesB = numpy.array([[0.5], [1.5], [100], [3.5], [4.5], [5.5]])
    dims = numpy.array([5, 5, 5])
    # NOTE(review): the original line breaks were lost; the prints are
    # assumed to be guarded and the obj2 construction unguarded -- confirm
    show = (verbose == 1)
    if show:
        print(sptensor.sptensor(coordsA, entriesA, dims))
        print(sptensor.sptensor(coordsA, entriesA))
    obj2 = sptensor.sptensor(coordsB, entriesB, dims)
    if show:
        print(obj2)
        print(obj2.totensor())
        print(sptensor.sptensor(coordsA, entriesA).totensor())
def generateRandomProblem(MFull):
    """
    Draw a Poisson observation for every full model tensor in a list.

    Parameters
    ------------
    MFull : list of tensors whose .data entries are the Poisson rates

    Output
    -----------
    X : list of sptensors, one per input tensor, each holding only the
        non-zero draws
    """
    X = []
    for M in MFull:
        ## get the non-zero entries
        nnz = np.nonzero(M.data)
        mfVals = M.data.flatten()
        # one draw per entry, kept as a Python-level loop so the sequence
        # of random numbers is unchanged
        xVals = np.reshape([np.random.poisson(l) for l in mfVals],
                           (len(mfVals), 1))
        # allocate integers directly: assigning to .dtype afterwards
        # reinterprets the float64 buffer and only works when the two item
        # sizes match
        xSubs = np.zeros((len(mfVals), M.ndims()), dtype='int')
        # assumes every entry of M.data is non-zero, otherwise nnz has fewer
        # rows than mfVals and this assignment fails -- TODO confirm
        for n in range(M.ndims()):
            xSubs[:, n] = nnz[n]
        ## figure out which ones are non-zero and build X with it to avoid
        ## extraneous properties
        nnzX = np.nonzero(xVals)
        print("Number of nonzeros:" + str(len(nnzX[0])))
        xVals = xVals[nnzX[0], :]
        xSubs = xSubs[nnzX[0], :]
        X.append(sptensor.sptensor(xSubs, xVals, M.shape))
    ## return the observation
    return X
def ctor(verbose):
    """Construct sparse tensors with and without an explicit size and print them."""
    coordsA = numpy.array([[0, 0, 0], [0, 0, 2], [1, 1, 1], [3, 3, 3],
                           [0, 0, 0], [0, 0, 0]])
    coordsB = numpy.array([[0, 2, 3], [0, 0, 2], [1, 1, 1], [3, 3, 3],
                           [0, 0, 0], [0, 0, 0]])
    entriesA = numpy.array([[0.5], [1.5], [2.5], [3.5], [4.5], [5.5]])
    entriesB = numpy.array([[0.5], [1.5], [100], [3.5], [4.5], [5.5]])
    dims = numpy.array([5, 5, 5])
    # NOTE(review): the original line breaks were lost; the prints are
    # assumed to be guarded and the obj2 construction unguarded -- confirm
    show = (verbose == 1)
    if show:
        print(sptensor.sptensor(coordsA, entriesA, dims))
        print(sptensor.sptensor(coordsA, entriesA))
    obj2 = sptensor.sptensor(coordsB, entriesB, dims)
    if show:
        print(obj2)
        print(obj2.totensor())
        print(sptensor.sptensor(coordsA, entriesA).totensor())
def tosptensor(self):
    """Rebuild the sptensor of shape self.tsize from this matricized form.

    Column 0 of self.subs is expanded over the modes listed in self.rdims
    and column 1 over those in self.cdims (via tools.ind2sub); the columns
    are then placed back in mode order.
    """
    # extract the shape of sptensor
    newshape = self.tsize
    entryCount = len(self.subs)

    # extract the subscripts of sptensor along the row modes
    rowsubs = []
    if len(self.rdims) != 0:
        rowshape = [self.tsize[self.rdims[i]]
                    for i in range(len(self.rdims))]
        rowsubs = numpy.array([tools.ind2sub(rowshape, self.subs[i][0])
                               for i in range(entryCount)])

    # ... and along the column modes
    colsubs = []
    if len(self.cdims) != 0:
        colshape = [self.tsize[self.cdims[i]]
                    for i in range(len(self.cdims))]
        colsubs = numpy.array([tools.ind2sub(colshape, self.subs[i][1])
                               for i in range(entryCount)])

    # start with one empty row per entry and append one column per mode
    newsubs = [[] for _ in range(entryCount)]
    for k in range(len(newshape)):
        find = tools.find(self.rdims, k)
        if find != -1:
            source = rowsubs
        else:
            find = tools.find(self.cdims, k)
            source = colsubs
        column = source[:, find].reshape([entryCount, 1])
        newsubs = numpy.concatenate((newsubs, column), axis=1)

    # concatenating onto the empty float rows makes newsubs float64, so the
    # subscripts are cast back to integers before building the tensor
    newsubs = numpy.array(newsubs, dtype="int")
    # extract the values of sptensor
    newvals = self.vals
    return sptensor.sptensor(newsubs, newvals, newshape)
def generateRandomProblem(MFull):
    """
    Draw a Poisson observation for every full model tensor in a list.

    Parameters
    ------------
    MFull : list of tensors whose .data entries are the Poisson rates

    Output
    -----------
    X : list of sptensors, one per input tensor, each holding only the
        non-zero draws
    """
    X = []
    for M in MFull:
        ## get the non-zero entries
        nnz = np.nonzero(M.data)
        mfVals = M.data.flatten()
        # one draw per entry, kept as a Python-level loop so the sequence
        # of random numbers is unchanged
        xVals = np.reshape([np.random.poisson(l) for l in mfVals],
                           (len(mfVals), 1))
        # allocate integers directly: assigning to .dtype afterwards
        # reinterprets the float64 buffer and only works when the two item
        # sizes match
        xSubs = np.zeros((len(mfVals), M.ndims()), dtype='int')
        # assumes every entry of M.data is non-zero, otherwise nnz has fewer
        # rows than mfVals and this assignment fails -- TODO confirm
        for n in range(M.ndims()):
            xSubs[:, n] = nnz[n]
        ## figure out which ones are non-zero and build X with it to avoid
        ## extraneous properties
        nnzX = np.nonzero(xVals)
        print("Number of nonzeros:" + str(len(nnzX[0])))
        xVals = xVals[nnzX[0], :]
        xSubs = xSubs[nnzX[0], :]
        X.append(sptensor.sptensor(xSubs, xVals, M.shape))
    ## return the observation
    return X
def tensorSubset(origTensor, subsetIds, subsetShape):
    """
    Keep the entries of a tensor whose 0th-mode index is in subsetIds.

    Parameters
    ------------
    origTensor : the original tensor
    subsetIds : a list of 0th-mode indices to keep
    subsetShape : the shape of the new tensor

    Output
    -----------
    subsetX : the tensor with the indices rebased
    """
    # entries whose mode-0 subscript is one of the requested ids
    keepMask = np.in1d(origTensor.subs[:, 0].ravel(), subsetIds)
    keepRows = np.where(keepMask)[0]
    keptSubs = origTensor.subs[keepRows, :]
    keptVals = origTensor.vals[keepRows]
    # reindex the 0th mode
    keptSubs = rebase(subsetIds, keptSubs)
    return sptensor.sptensor(keptSubs, keptVals, subsetShape)
def constructTensor(med_file, diag_file):
    """
    Build a patient x diagnosis x medication sparse tensor.

    Each row of the table returned by diag_cross_med contributes one entry
    of value 1 for every (diagnosis, medication) pair of that row.

    Parameters
    ------------
    med_file : medication input passed on to diag_cross_med
    diag_file : diagnosis input passed on to diag_cross_med

    Output
    -----------
    tenX : the sptensor of size (patients, diagnoses, medications)
    axisDict : {0: patDict, 1: diagDict, 2: medDict} label -> index maps
    """
    diag_med_comb = diag_cross_med(med_file, diag_file)
    ## create index map for subject_id, icdcode, and med_name
    patDict = createIndexMap(diag_med_comb.subject_id)
    medDict = createIndexMap(np.hstack(diag_med_comb.med_name))
    diagDict = createIndexMap(np.hstack(diag_med_comb.code))
    # collect the per-row blocks and stack them once; the empty integer
    # block keeps vstack valid (and the dtype integral) with zero rows
    idxChunks = [np.empty((0, 3), dtype=int)]
    for i in range(diag_med_comb.shape[0]):
        # column 0 holds the diagnosis codes, column 1 the medication
        # names and column 2 the patient id of the row
        curDiag = [diagDict[x] for x in diag_med_comb.iloc[i, 0]]
        curMed = [medDict[x] for x in diag_med_comb.iloc[i, 1]]
        curPatId = patDict[diag_med_comb.iloc[i, 2]]
        dmCombo = extmath.cartesian((curDiag, curMed))
        idxChunks.append(np.column_stack(
            (np.repeat(curPatId, dmCombo.shape[0]), dmCombo)))
    tensorIdx = np.vstack(idxChunks)
    # builtin int replaces the np.int alias removed from recent NumPy
    tensorVal = np.ones((tensorIdx.shape[0], 1), dtype=int)
    tenX = sptensor.sptensor(
        tensorIdx, tensorVal,
        np.array([len(patDict), len(diagDict), len(medDict)]))
    axisDict = {0: patDict, 1: diagDict, 2: medDict}
    return tenX, axisDict
def _claimEntries(patIndex, claimDiag, claimHcpcs):
    """Return the (patient, diagnosis, procedure) index rows of one claim."""
    dpCombo = extmath.cartesian((claimDiag, claimHcpcs))
    return np.column_stack((np.repeat(patIndex, dpCombo.shape[0]), dpCombo))


def parseCarrier(f):
    """
    Build a patient x diagnosis x procedure sparse tensor from a claims csv.

    Consecutive rows sharing a claim id are merged into one claim; every
    claim with at least one diagnosis and one procedure category adds one
    entry of value 1 per (diagnosis, procedure) pair.

    Parameters
    ------------
    f : path of the csv file; the header row must contain the
        DESYNPUF_ID, CLM_ID, ICD9_DGNS* and HCPCS_CD* columns

    Output
    -----------
    tenX : the sptensor of size (patients, diagnoses, procedures)
    axisDict : {0: patDict, 1: diagDict, 2: procDict} label -> index maps
    """
    headerRow = True
    patientId = None
    claimId = None
    procHier = loadJSON("cpt.json")
    icdHier = loadJSON("icd.json")
    patDict = OrderedDict()
    diagDict = OrderedDict()
    procDict = OrderedDict()
    # index blocks are collected and stacked once; the empty integer block
    # keeps vstack valid when no claim qualifies
    idxChunks = [np.empty((0, 3), dtype=int)]
    claimDiag = []
    claimHcpcs = []
    # NOTE(review): pid is both incremented per new claim and overwritten
    # with the patient index whenever a claim is stored, so the 10000 cap
    # below is only approximate -- kept as in the original
    pid = 0
    # binary mode is what the Python 2 csv module expects
    with open(f, "rb") as carrierFile:
        for row in csv.reader(carrierFile):
            if pid > 10000:
                break
            # For the header, we will get the values we need
            if headerRow:
                pidIdx = [i for i, item in enumerate(row)
                          if re.search('DESYNPUF_ID', item)][0]
                claimIdx = [i for i, item in enumerate(row)
                            if re.search('CLM_ID', item)][0]
                diagIdx = [i for i, item in enumerate(row)
                           if re.search('ICD9_DGNS', item)]
                hcpcsIdx = [i for i, item in enumerate(row)
                            if re.search('HCPCS_CD', item)]
                headerRow = False
                continue
            ## get the diagnosis and procedure codes
            diagArray, diagCat = getDiagnosis(row, diagIdx, icdHier)
            for dc in set(diagCat):
                if dc not in diagDict:
                    diagDict[dc] = len(diagDict)
            hcpcsArray, hcpcsCat = getProc(row, hcpcsIdx, procHier)
            for pc in set(hcpcsCat):
                if pc not in procDict:
                    procDict[pc] = len(procDict)
            diagList = [diagDict[dc] for dc in diagCat]
            procList = [procDict[pc] for pc in hcpcsCat]
            if claimId == row[claimIdx]:
                ## same claim means same patient, so just add
                claimDiag.extend(diagList)
                claimHcpcs.extend(procList)
                continue
            ## otherwise claim is different - so store off the old claim
            if claimId is not None and claimDiag and claimHcpcs:
                pid = patDict[patientId]
                idxChunks.append(_claimEntries(pid, claimDiag, claimHcpcs))
            ## now we just update the new patient
            patientId = row[pidIdx]
            claimId = row[claimIdx]
            if patientId not in patDict:
                patDict[patientId] = len(patDict)
            claimDiag = diagList
            claimHcpcs = procList
            pid += 1
    # the claim still pending when the loop ends used to be dropped
    if claimId is not None and claimDiag and claimHcpcs:
        idxChunks.append(
            _claimEntries(patDict[patientId], claimDiag, claimHcpcs))
    tensorIdx = np.vstack(idxChunks)
    # builtin int replaces the np.int alias removed from recent NumPy
    tensorVal = np.ones((tensorIdx.shape[0], 1), dtype=int)
    tenX = sptensor.sptensor(
        tensorIdx, tensorVal,
        np.array([len(patDict), len(diagDict), len(procDict)]))
    axisDict = {0: patDict, 1: diagDict, 2: procDict}
    return tenX, axisDict
R = 40 iters = 70 samples = 10 pcaModel = RandomizedPCA(n_components=R) stats = np.zeros((1, 6)) parser = argparse.ArgumentParser() parser.add_argument("pat", type=int, help="number of patients") args = parser.parse_args() pn = args.pat patList = np.arange(pn) ix = np.in1d(X.subs[:, 0].ravel(), patList) idx = np.where(ix)[0] xprime = sptensor.sptensor(X.subs[idx, :], X.vals[idx], [pn, X.shape[1], X.shape[2]]) flatX = sptenmat.sptenmat(xprime, [0]).tocsrmat() # matricize along the first mode stats = np.zeros((1, 6)) ## NMF Timing for k in range(samples): startTime = time.time() nmfModel = nimfa.mf(flatX, method="nmf", max_iter=iters, rank=R) nmfResult = nimfa.mf_run(nmfModel) elapsed = time.time() - startTime stats = np.vstack((stats, np.array([R, iters, pn, k, "NMF", elapsed]))) ## PCA Timing for k in range(samples): startTime = time.time()
import tensor
import sptensor
import numpy as np
import CP_APR
import ktensor
import KLProjection
""" Test file associated with the CP decomposition using APR """
""" Test factorization of sparse matrix """
# observed data: five non-zeros in a 5x5x2 tensor
subs = np.array([[0, 3, 1], [1, 0, 1], [1, 2, 1], [1, 3, 1], [3, 0, 0]])
vals = np.array([[1], [1], [1], [1], [3]])
siz = np.array([5, 5, 2])  # 5x5x2 tensor
X = sptensor.sptensor(subs, vals, siz)

# fixed rank-4 starting factors, one matrix per mode
U0 = np.array([[0.7689, 0.8843, 0.7487, 0.0900],
               [0.1673, 0.5880, 0.8256, 0.1117],
               [0.8620, 0.1548, 0.7900, 0.1363],
               [0.9899, 0.1999, 0.3185, 0.6787],
               [0.5144, 0.4070, 0.5341, 0.4952]])
U1 = np.array([[0.1897, 0.5606, 0.8790, 0.9900],
               [0.4950, 0.9296, 0.9889, 0.5277],
               [0.1476, 0.6967, 0.0006, 0.4795],
               [0.0550, 0.5828, 0.8654, 0.8013],
               [0.8507, 0.8154, 0.6126, 0.2278]])
U2 = np.array([[0.4981, 0.5747, 0.7386, 0.2467],
               [0.9009, 0.8452, 0.5860, 0.6664]])
Minit = ktensor.ktensor(np.ones(4), [U0, U1, U2])
# score of the initial model against itself
fms = Minit.fms(Minit)

# factorize starting from Minit, then normalize and sort the components
Y, cpstats, modelStats = CP_APR.cp_apr(X, 4, Minit=Minit, maxiters=100)
Y.normalize_sort(1)

# a second, smaller tensor and a projection object built on the fitted factors
subs2 = np.array([[0, 3, 1], [1, 2, 0]])
vals2 = np.array([[1], [1]])
siz2 = np.array([2, 5, 2])
Xhat = sptensor.sptensor(subs2, vals2, siz2)
klproj = KLProjection.KLProjection(Y.U, 4)
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source
with open("./nparr_pt_jdrange_med.pickle", "rb") as matrix_pkl:
    nparr_pt_jdrange_med = pickle.load(matrix_pkl)
##########################################################################################
# build SPARSE tensor from our data
num_dims = len(nparr_pt_jdrange_med_binary.shape)
nnz = np.nonzero(nparr_pt_jdrange_med_binary)
data_values = nparr_pt_jdrange_med_binary[nnz].flatten()
data_values = np.reshape(data_values, (len(data_values), 1))
# allocate the subscripts as integers directly; assigning to .dtype
# afterwards reinterprets the float64 buffer instead of converting it
nonzero_subs = np.zeros((len(data_values), num_dims), dtype='int')
for n in range(num_dims):
    nonzero_subs[:, n] = nnz[n]
sparse_tensor_all_finite = sptensor.sptensor(nonzero_subs, data_values)
##classification for patients####
##classification for patients: use MAP_CHANGE < -2 as a positive change
#patients needed:
l_patients_for_tensor = np.sort(list(df_MAP_CHANGE_finite.RUID))
# NOTE(review): patDict is read here but re-created empty a few lines
# below; it must already exist from earlier in the script -- confirm
l_patDict_idx_patients_for_tensor = np.sort(
    [patDict[ruid] for ruid in l_patients_for_tensor])
nparr_pt_jdrange_med_binary_subset = nparr_pt_jdrange_med_binary[
    l_patDict_idx_patients_for_tensor]
#build axisDict
patDict = OrderedDict()  #axis dict, patient mode
medDict = OrderedDict()  #axis dict, med mode
import ktensor
import predictionModel
#set data dirs
data_dir = 'E:/test_project_python/XiamenData/'

# training tensor: after dropping PID the first three columns are the
# subscripts and the fourth is the value
# (.iloc/.values replace .ix/.as_matrix(), which pandas has removed)
trainX = pd.read_csv(
    data_dir + 'SubTrainTensor_2387.csv',
    index_col=0,
)
trainX = trainX.drop(['PID'], axis=1)
trainXIndex = np.array(trainX.iloc[:, :3].values, dtype='int')
trainXValue = np.array(trainX.iloc[:, 3].values, dtype='int').reshape(
    (trainXIndex.shape[0], 1))
trainXSize = np.array([2387, 247, 816])
trainTensor = sptensor.sptensor(trainXIndex, trainXValue, trainXSize)

# test tensor, same layout
testX = pd.read_csv(
    data_dir + 'SubTestTensor_1024.csv',
    index_col=0,
)
testX = testX.drop(['PID'], axis=1)
testXIndex = np.array(testX.iloc[:, :3].values, dtype='int')
testXValue = np.array(testX.iloc[:, 3].values, dtype='int').reshape(
    (testXIndex.shape[0], 1))
testXSize = np.array([1024, 247, 816])
testTensor = sptensor.sptensor(testXIndex, testXValue, testXSize)

# labels are the last column of the result file; 0 is recoded as -1
trainRe = pd.read_csv(data_dir + 'TrainResult_2387.csv')
trainY = trainRe.iloc[:, -1].values
trainY[trainY == 0] = -1
labelID = 1
outfile = 'results/iter-db-5-{0}.csv'
set_desc = 'HF Patients Level 0 seed 0'
sqlLoadFile = "results/iter-{0}.sql".format(modelID)
statsFile = "results/iter-stats-{0}.csv".format(modelID)
fmsFile = "results/iter-fms-{0}.csv".format(modelID)
# load the sparse tensor information; the three arrays were written back
# to back, and the with-block closes the file even if a load fails
with open("data/hf-tensor-label1-level0-data.dat", "rb") as infile:
    subs = np.load(infile)
    vals = np.load(infile)
    siz = np.load(infile)
# now factor it
X = sptensor.sptensor(subs, vals, siz)
# Create a random initialization
N = X.ndims()
np.random.seed(0)
F = [np.random.rand(X.shape[n], R) for n in range(N)]
Minit = ktensor.ktensor(np.ones(R), F)
# NOTE(review): `iter` must be a variable defined earlier in the script;
# otherwise this passes the builtin iter function -- confirm
Y, ystats, fmsStats, mstats = cp_apr(X, R, Minit=Minit, outputfile=outfile,
                                     maxiters=iter)
## automate the creation of the sql file
ystats = np.column_stack((np.repeat(modelID, ystats.shape[0]), ystats))
np.savetxt(statsFile, ystats, delimiter="|")
fmsStats = np.column_stack((np.repeat(modelID, fmsStats.shape[0]), fmsStats))
df_MAP_CHANGE_first_10_ruid['MAP_CHANGE_GOOD'] = df_MAP_CHANGE_first_10_ruid['MAP_CHANGE_GOOD'].astype('int') l_patClass = df_MAP_CHANGE_first_10_ruid['MAP_CHANGE_GOOD'] od_patClass_first_10_ruid = OrderedDict(zip(patDict.keys(), l_patClass)) # build SPARSE tensor from our data nparr_data_by_pt = np.array(l_data_pt_med_jdrange) num_dims = len(nparr_data_by_pt.shape) nnz = np.nonzero(nparr_data_by_pt) data_values = nparr_data_by_pt[nnz].flatten() data_values = np.reshape(data_values, (len(data_values), 1)) nonzero_subs = np.zeros((len(data_values), num_dims)) nonzero_subs.dtype = 'int' for n in range(num_dims): nonzero_subs[:, n] = nnz[n] sparse_tensor_first_10_ruid = sptensor.sptensor(nonzero_subs, data_values) #save the tensor tensorIO.saveSingleTensor(sparse_tensor_first_10_ruid, axisDict, od_patClass_first_10_ruid, "htn-first10-tensor-{0}.dat") # ### LEFT OFF HERE: june 25, 6pm ################################################################## ## load the tensor ####### loaded_X, loaded_axisDict, loaded_classDict = tensorIO.loadSingleTensor("htn-first10-tensor-{0}.dat") ## do the decomposition ###### #store the data in "data" data = {'exptID': exptID, 'size': MSize, 'sparsity': AFill, "rank": R, "alpha": alpha, "gamma": gamma} def calculateValues(TM, M):
import tensor
import sptensor
import numpy as np
import CP_APR
import ktensor
import KLProjection
""" Test file associated with the CP decomposition using APR """
""" Test factorization of sparse matrix """
# observed data: five non-zeros in a 5x5x2 tensor
subs = np.array([[0, 3, 1], [1, 0, 1], [1, 2, 1], [1, 3, 1], [3, 0, 0]])
vals = np.array([[1], [1], [1], [1], [3]])
siz = np.array([5, 5, 2])  # 5x5x2 tensor
X = sptensor.sptensor(subs, vals, siz)

# fixed rank-4 starting factors, one matrix per mode
U0 = np.array([[0.7689, 0.8843, 0.7487, 0.0900],
               [0.1673, 0.5880, 0.8256, 0.1117],
               [0.8620, 0.1548, 0.7900, 0.1363],
               [0.9899, 0.1999, 0.3185, 0.6787],
               [0.5144, 0.4070, 0.5341, 0.4952]])
U1 = np.array([[0.1897, 0.5606, 0.8790, 0.9900],
               [0.4950, 0.9296, 0.9889, 0.5277],
               [0.1476, 0.6967, 0.0006, 0.4795],
               [0.0550, 0.5828, 0.8654, 0.8013],
               [0.8507, 0.8154, 0.6126, 0.2278]])
U2 = np.array([[0.4981, 0.5747, 0.7386, 0.2467],
               [0.9009, 0.8452, 0.5860, 0.6664]])
Minit = ktensor.ktensor(np.ones(4), [U0, U1, U2])
# score of the initial model against itself
fms = Minit.fms(Minit)

# factorize starting from Minit, then normalize and sort the components
Y, cpstats, modelStats = CP_APR.cp_apr(X, 4, Minit=Minit, maxiters=100)
Y.normalize_sort(1)
# from spase representation file to spase tensor x = pd.read_csv(data_dir + 'validationTensor_3412.csv').drop(['PID'], axis=1) #print x matrixIndex = np.array(x.ix[:, :3].as_matrix(), dtype='int') matrixValue = np.array(x.ix[:, 3].as_matrix(), dtype='int').reshape( (matrixIndex.shape[0], 1)) matrixSize = np.array([3412, 247, 816]) #print matrixValue re = pd.read_csv(data_dir + 'vadationResult_3412.csv') #print re Y = np.array(re.ix[:, 'InHosLabel']) #print Y X = sptensor.sptensor(matrixIndex, matrixValue, matrixSize) #print X.subs #print X.subs.shape ''' demoX=pd.read_csv(data_notebook+'demoF.csv') demoX.index=demoX.ix[:,0] demoX=np.array(demoX.ix[:,1:]) #demoX=demoX[:,1:] print(demoX.shape) #print demoX[:3] ''' goNum = [10, 30, 50, 80, 100, 125, 150, 180, 200] for i in range(len(goNum)): phennum = goNum[i]
import tensor
import sptensor
import numpy as np
import CP_APR
import ktensor

""" Test file associated with the CP decomposition using APR """

""" Test factorization of sparse matrix """
# Coordinates (one row per nonzero entry) and the matching values, stored
# as a column vector.
subs = np.array([
    [0, 3, 1],
    [1, 0, 1],
    [1, 2, 1],
    [1, 3, 1],
    [3, 0, 0],
])
vals = np.array([
    [1],
    [1],
    [1],
    [1],
    [3],
])
siz = np.array([5, 5, 2])  # 5x5x2 tensor
X = sptensor.sptensor(subs, vals, siz)

# Fixed initial factor matrices, one per mode, each with four columns.
U0 = np.array([
    [0.7689, 0.8843, 0.7487, 0.0900],
    [0.1673, 0.5880, 0.8256, 0.1117],
    [0.8620, 0.1548, 0.7900, 0.1363],
    [0.9899, 0.1999, 0.3185, 0.6787],
    [0.5144, 0.4070, 0.5341, 0.4952],
])
U1 = np.array([
    [0.1897, 0.5606, 0.8790, 0.9900],
    [0.4950, 0.9296, 0.9889, 0.5277],
    [0.1476, 0.6967, 0.0006, 0.4795],
    [0.0550, 0.5828, 0.8654, 0.8013],
    [0.8507, 0.8154, 0.6126, 0.2278],
])
U2 = np.array([
    [0.4981, 0.5747, 0.7386, 0.2467],
    [0.9009, 0.8452, 0.5860, 0.6664],
])

# Initial model: unit weights plus the three factor matrices above.
Minit = ktensor.ktensor(np.ones(4), [U0, U1, U2])
# Score of the initial model against itself (presumably a sanity check --
# TODO confirm what ktensor.fms returns).
fms = Minit.fms(Minit)

# Run CP-APR from the fixed starting point; the second argument is
# presumably the rank -- verify against CP_APR.cp_apr.
Y, cpstats, modelStats = CP_APR.cp_apr(X, 4, Minit=Minit, maxiters=100)
Y.normalize_sort(1)

""" Test factorization of regular matrix """
# Experiment settings: number of components, iteration cap, timing repeats.
R = 40
iters = 70
samples = 10
pcaModel = RandomizedPCA(n_components=R)

# Command line: a single positional integer, the number of patients.
parser = argparse.ArgumentParser()
parser.add_argument("pat", type=int, help="number of patients")
args = parser.parse_args()
pn = args.pat

# Keep only the entries whose first-mode subscript is one of 0..pn-1.
patList = np.arange(pn)
ix = np.in1d(X.subs[:, 0].ravel(), patList)
idx = np.where(ix)[0]
xprime = sptensor.sptensor(
    X.subs[idx, :],
    X.vals[idx],
    [pn, X.shape[1], X.shape[2]],
)
# matricize along the first mode
flatX = sptenmat.sptenmat(xprime, [0]).tocsrmat()

# Results table, seeded with one all-zero row; each run stacks a row of
# (R, iters, pn, sample index, method label, elapsed seconds) under it.
stats = np.zeros((1, 6))

## NMF Timing
for k in range(samples):
    startTime = time.time()
    nmfModel = nimfa.mf(flatX, method="nmf", max_iter=iters, rank=R)
    nmfResult = nimfa.mf_run(nmfModel)
    elapsed = time.time() - startTime
    stats = np.vstack(
        (stats, np.array([R, iters, pn, k, "NMF", elapsed]))
    )

## PCA Timing
for k in range(samples):
    startTime = time.time()
    pcaModel.fit(flatX)
# Number of perturbations, their Poisson magnitudes, and the positions (into
# the value array) they hit.  The order of the np.random calls is kept as-is
# so a fixed seed reproduces the same draws.
noiseNum = int(totNonzero * noise)
noiseVals = np.random.poisson(lam=noiseParam, size=noiseNum)
noiseSubs = np.random.randint(low=0, high=totNonzero, size=noiseNum)

## draw 0 or 1 per perturbation: 0 means add, 1 means subtract
noiseOp = np.random.randint(low=0, high=2, size=noiseNum)

## do the addition on a copy of the original tensor
addIdx = np.where(noiseOp == 0)[0]
Y = X.copy()
Y.vals[noiseSubs[addIdx], 0] = (
    Y.vals[noiseSubs[addIdx], 0] + noiseVals[addIdx]
)

## do the subtraction
subtractIdx = np.where(noiseOp == 1)[0]
Y.vals[noiseSubs[subtractIdx], 0] = (
    Y.vals[noiseSubs[subtractIdx], 0] - noiseVals[subtractIdx]
)

## clamp anything that dropped to zero or below
nozIdx = np.where(Y.vals <= 0)[0]
Y.vals[nozIdx] = 0

## then add the same number of new entries at random subscripts, each with
## a Poisson(1) value bumped up to at least 1
nozVals = np.random.poisson(lam=1, size=len(nozIdx)).reshape(len(nozIdx), 1)
nozVals[nozVals == 0] = 1
nozSub0 = np.random.randint(low=0, high=Y.shape[0], size=len(nozVals))
nozSub1 = np.random.randint(low=0, high=Y.shape[1], size=len(nozVals))
nozSub2 = np.random.randint(low=0, high=Y.shape[2], size=len(nozVals))
nozSubs = np.column_stack((nozSub0, nozSub1, nozSub2))

# Append the new entries and rebuild the sparse tensor from the result.
Y.subs = np.vstack((Y.subs, nozSubs))
Y.vals = np.vstack((Y.vals, nozVals))
Y = sptensor.sptensor(Y.subs, Y.vals, Y.shape)

# Factor the noisy tensor, score it against the baseline factorization and
# log one JSON record per line.
noiseTF = factorTensor(Y)
fms = baseTF.greedy_fms(noiseTF)
outfile.write(
    json.dumps({"expt": exptID, "type": "add+subtract", "noise": noise,
                "seed": seed, "rank": R,
                "0": fms['0'], "1": fms['1'], "2": fms['2']})
    + "\n"
)
outfile.close()
# Subtract the sampled noise at the positions selected for subtraction.
Y.vals[noiseSubs[subtractIdx], 0] = (
    Y.vals[noiseSubs[subtractIdx], 0] - noiseVals[subtractIdx]
)

## clamp anything that dropped to zero or below
nozIdx = np.where(Y.vals <= 0)[0]
Y.vals[nozIdx] = 0

## then add the same number of new entries at random subscripts, each with
## a Poisson(1) value bumped up to at least 1 (np.random call order is kept
## so a fixed seed reproduces the same draws)
nozVals = np.random.poisson(lam=1, size=len(nozIdx)).reshape(len(nozIdx), 1)
nozVals[nozVals == 0] = 1
nozSub0 = np.random.randint(low=0, high=Y.shape[0], size=len(nozVals))
nozSub1 = np.random.randint(low=0, high=Y.shape[1], size=len(nozVals))
nozSub2 = np.random.randint(low=0, high=Y.shape[2], size=len(nozVals))
nozSubs = np.column_stack((nozSub0, nozSub1, nozSub2))

# Append the new entries and rebuild the sparse tensor from the result.
Y.subs = np.vstack((Y.subs, nozSubs))
Y.vals = np.vstack((Y.vals, nozVals))
Y = sptensor.sptensor(Y.subs, Y.vals, Y.shape)

# Factor the noisy tensor, score it against the baseline factorization and
# log one JSON record per line.
noiseTF = factorTensor(Y)
fms = baseTF.greedy_fms(noiseTF)
outfile.write(
    json.dumps({"expt": exptID, "type": "add+subtract", "noise": noise,
                "seed": seed, "rank": R,
                "0": fms['0'], "1": fms['1'], "2": fms['2']})
    + "\n"
)
##########################################################################################
# build SPARSE tensor from our data
num_dims = len(nparr_pt_jdrange_med_binary.shape)
nnz = np.nonzero(nparr_pt_jdrange_med_binary)
# Nonzero values as an (nnz, 1) column.
data_values = nparr_pt_jdrange_med_binary[nnz].reshape(-1, 1)
# One row per nonzero entry, one column per mode.  Built directly from the
# integer index arrays: the previous approach allocated a float64 array and
# assigned `.dtype = 'int'`, which reinterprets the buffer in place and only
# keeps its shape when the platform int has the same item size as float64.
nonzero_subs = np.transpose(nnz)
sparse_tensor_all_finite = sptensor.sptensor(nonzero_subs, data_values)

##classification for patients####
##classification for patients: use MAP_CHANGE < -2 as a positive change
#patients needed:
l_patients_for_tensor = np.sort(list(df_MAP_CHANGE_finite.RUID))
# Map each RUID to its patDict index, then slice those rows out of the dense
# array along its first axis.
l_patDict_idx_patients_for_tensor = np.sort([patDict[ruid] for ruid in l_patients_for_tensor])
nparr_pt_jdrange_med_binary_subset = nparr_pt_jdrange_med_binary[l_patDict_idx_patients_for_tensor]

#build axisDict
# Fresh, empty ordered dicts.  Note that patDict is rebound here, after the
# lookups above have already used its previous contents.
patDict = OrderedDict()      #axis dict, patient mode
medDict = OrderedDict()      #axis dict, med mode
jdDict = OrderedDict()       #axis dict, jd mode
jdrangeDict = OrderedDict()  #axis dict, jdrange mode
import tensor
import sptensor
import numpy as np
import sim_APR
import ktensor

""" Test factorization of sparse matrix """
# Coordinates (one row per nonzero entry) and the matching values, stored
# as a column vector.
subs = np.array([
    [0, 3, 1],
    [1, 0, 1],
    [1, 2, 1],
    [1, 3, 1],
    [3, 0, 0],
])
vals = np.array([
    [1],
    [1],
    [1],
    [1],
    [3],
])
siz = np.array([5, 5, 2])  # 5x5x2 tensor

# Two separately constructed tensors built from the same data.
X = [sptensor.sptensor(subs, vals, siz) for _ in range(2)]
# Passed through to SAPR as the shared-mode specification (exact layout is
# defined by sim_APR -- TODO confirm).
sharedModes = [np.array([[0, 0], [1, 1]])]

sapr = sim_APR.SAPR(X, 4, sharedModes)
sapr.factorize()
print(sapr.M)
import sys
# Make the bundled pytensor package importable before importing sptensor.
sys.path.insert(0, './pytensor')
import sptensor

# Set logging to DEBUG to see CP-ALS information
logging.basicConfig(level=logging.DEBUG, format='%(asctime)-15s %(message)s')

file = '../datasets/movielens-synthesized/ratings-synthesized-50k.csv'
logging.debug("Loading dataset from file: %s", file)
data = genfromtxt(file, delimiter=',', skip_header=1)
logging.debug("Loaded data")

# we need to convert data into two lists; subscripts/coordinates and values
n = len(data)
# Columns 0-1 of every row are reused twice, with a third subscript of 0 for
# the values in column 2 and of 1 for the values in column 3.
subs_1 = numpy.append(data[:, :2], numpy.zeros((n, 1)), 1)
subs_2 = numpy.append(data[:, :2], numpy.ones((n, 1)), 1)
subs = numpy.vstack([subs_1, subs_2])
subs = subs.astype(int)

# Values in the same order as the stacked subscripts, as a (2n, 1) column.
# A single reshape replaces the previous element-by-element Python list
# build (and the redundant flatten of an already 1-D array).
vals = numpy.hstack([data[:, 2], data[:, 3]]).reshape(-1, 1)

spten2 = sptensor.sptensor(subs, vals)
print(spten2.shape)