def runOutlierDetector(dir, distance=cosine_dense, removePercent=0.05, output=False, time_log=False): files = IO.getSortedFilelist(dir+'/') if output: counter = 0 for f in files: diseaseName = f[0:f.find('.mtx')] subTermDoc = IO.readInTDM(dir, diseaseName) if output: counter += 1 print 'Count:', counter # If sub term document matrix is empty, just skip it. if subTermDoc.shape[0]==1 or subTermDoc.shape[1]==1: continue if time_log: t1 = time.time() subTermDoc = outlierDetector(subTermDoc, distance, removePercent, output, time_log) if time_log: print 'Time for outlier detection on', diseaseName, ':', str(time.time() -t1)[:4] if output: print 'Writing', subTermDoc = sparse.coo_matrix(subTermDoc) IO.writeOutTDM(_subFolder+'/'+outlierRemoved+str(int(removePercent*100)), diseaseName, subTermDoc)
def _normalizeVectorLengths(M_lil): """ Normalize the length of a sparse lil_matrix. """ t1=time.time() # Create a norm-hash of each row-vector in the stemmed term-doc matrix. vectorLength=SearchTermDoc.createRLHash(M_lil, _RLHash,False) for row in range(1,M_lil.shape[0]): norm=vectorLength[row] for col in (M_lil.getrow(row).nonzero()[1])[1:]: M_lil[row,col]=(M_lil[row,col])/norm print "Normalized:",row t2=time.time() print "Total:"+str(t2-t1) # This is madness tfidfMatrix = M_lil # Save and overwrite the log_tfidf generated above IOmodule.writeOutTDM(_termDocDir, _tfidfName+'_norm', tfidfMatrix)
def saveMatrix(filename,matrix): t1 = time.time() IOmodule.writeOutTDM('testFolder',filename, matrix) t2 = time.time() print 'Time used: ',(t2-t1)
def _generateLogTFIDF(M_coo): """ Creates a Term-Frequency Inverse-Document-Frequency from a sparse coo_matrix, using log-transformation on TF and IDF. Returns a sparse lil_matrix to be used for vector-normalization. """ totalTime1=time.time() numberOfDocs = float(M_coo.shape[0]-1) print "Converting from coo to lil..." t1=time.time() tfidfMatrix=M_coo.tolil() t2=time.time() print "Matrix converted to lil in",(t2-t1) t1=time.time() for row in range(1,numberOfDocs+1): for col in (tfidfMatrix.getrow(row).nonzero()[1])[1:]: tf=tfidfMatrix[row,col] #if tf == 0: # print "Looked up zero-value at: "+str(docIndex)+" "+str(termVectorIndex) # raise Exception if tf <=0: print tf tf=0.000000000000001 # <---for svd try: tf = math.log(1 + tf) except: print tf if _termSum[col]==0: continue idf = math.log(numberOfDocs / _termSum[col]) tfidfMatrix[row,col]=tf*idf print "Row:",row t2=time.time() print "Total:"+str(t2-t1) # Save and overwrite the log_tfidf generate above IOmodule.writeOutTDM(_termDocDir, _tfidfName, tfidfMatrix) totalTime2=time.time() print "Total time: "+str(totalTime2-totalTime1) return tfidfMatrix
def createTermDoc(refreshHash=False): """ This function creates a large term-doc martix from a directory of sub term- doc matrices. It returns a matrix with dimensions given by the specified hash tables. It also saves the matrix for later use as a MatrixMarket .mtx file. """ t1 = time.time() if refreshHash: createTermAndPmidHashes() files = IOmodule.getSortedFilelist(_subMatrixDir+'/') # files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)]) termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash) pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash) # Need to add one due to non zero indexing m=len(pmidHashTable)+1 n=len(termHashTable)+1 termDoc = sparse.lil_matrix((m,n)) # Insert values representing hashes for i in range(m): termDoc[i,0]=i termDoc[0,:]=range(n) for file in files: subMatrix=IOmodule.readInTDM(_subMatrixDir, file) subMCopy=subMatrix.todok() for i,j,v in zip(subMatrix.row, subMatrix.col, subMatrix.data): m = subMCopy[i,0] n = subMCopy[0,j] # Make sure not to add index's if m==0 or n==0: continue termDoc[m,n] += v print "Added",file IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc) t2 = time.time() print 'Time elapsed:',str(t2-t1) return termDoc
def _normalizeVectorLengths(M_dense, M_lil, filename):
    """
    Normalize the length of a sparse matrix, represented as a dense and a
    lil - format.
    """
    rowNorms = SearchTermDoc.createRLHash(M_lil, None, False)
    for rowIndex in range(1, M_lil.shape[0]):
        rowNorm = rowNorms[rowIndex]
        # The first nonzero column is the index column - leave it untouched.
        for colIndex in (M_lil.getrow(rowIndex).nonzero()[1])[1:]:
            M_dense[rowIndex, colIndex] = (M_dense[rowIndex, colIndex]) / rowNorm

    # Save the matrix
    IOmodule.writeOutTDM(_termDocDir, filename, sparse.coo_matrix(M_dense))
def runAndSaveMatrices(): """ Transform a directory of matrices to a directory of decomposed matrices. """ files = IOmodule.getSortedFilelist(_oldMatrixDir+'/') for file in files: M_coo=IOmodule.readInTDM(_oldMatrixDir,file) # Make sure the matrix contains information (1-dim. is an empty matrix) if M_coo.shape[0]==1: continue print "Shape:"+str(M_coo.shape) # SVD does not run well single dimenstion matrices ## (remembering that the first dimension is indices and does not count) if M_coo.shape[0]>2: M_dense=M_coo.todense() # Run SVD U,Sig,Vt=_svd(M_dense) # Get the reduced semantic space S= _semanticSpace(U,Sig,Vt,_reduceBy) # Recombine the indices and the reduced matrix M_dense[1:,1:]=S.todense() # Save the matrix M_coo=sparse.coo_matrix(M_dense) IOmodule.writeOutTDM(_newMatrixDir, file, M_coo) print '' else: print "Dimensionality too low for svd" IOmodule.writeOutTDM(_newMatrixDir, file, M_coo) print ''
def medlineDir2MatrixDir(): """ This function converts a directory of MedLine records to a new directory of corresponding term-doc matrices. It takes the matrix dimensions (row: m, col: n). It creates a directory (in the home folder) named 'diseaseMatrices' and stores the matrices as 'MatrixMarket' .mtx files, named by the disease name. """ termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash) pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash) files = IOmodule.getSortedFilelist(_medlineDir+'/') # files = sorted([f for f in os.listdir(_medlineDir+"/") if os.path.isfile(_medlineDir+"/" + f)]) counter = 0 for file in files: data = _gatherMatrixData(file) # Get matrix dimensions (+1 for the [0,0] field) ## (Here follows a small 0.0001 sec. hack to get n = total number of terms) temp={} for pmid in data: for term in pmid[1]: temp[term[0]]=0 m=len(data)+1 n=len(temp)+1 M = _populateMatrix(m, n, data,termHashTable, pmidHashTable) diseaseName = file[0:file.find('.txt')] IOmodule.writeOutTDM(_subMatrixDir, diseaseName, M) counter += 1 print str(counter),"matrices made. Total number of terms:",len(M.getrow(0).nonzero()[0])
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):
    """
    Recieves a subMatrixDir goes through all the files and sums up the
    column of it, creating a single row vector containing the sum of all
    column in the sub term doc matrix. It then proceeds to making a
    disease term doc, based on these row vector

    Optional flags are: avg, takes the average over the columns of the sub
    matrices instead of the sum. output, makes the funtion produce
    additional output time_log, makes the function print out how much time
    is spend on what
    """
    if output:
        print 'Initialising...'
    if time_log:
        t1 = time.time()
    files = IO.getSortedFilelist(subMatrixDir)
    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    # NOTE(review): `diseaseHash` (and `label` further down) are module-level
    # names not visible in this chunk - confirm they are defined in the file.
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)
    # One row per disease file plus the index row; one column per term plus
    # the index column.
    diseaseMatrix=sparse.lil_matrix((len(files)+1,len(termHashTable)+1))
    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1,1))
    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1,len(termHashTable)+1))
    count = 0
    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]
    for f in files:
        if time_log:
            t2 = time.time()
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count+=1
            print 'Numbers remaining', len(files)-count
        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()
        # If the subTermDoc contains nothing, just skip it
        if(subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue
        subTermSum = getColumnSum(subTermDoc,avg)
        # NOTE(review): this [0,0] assignment is immediately overwritten by
        # the full-row assignment on the next line - confirm intent.
        subTermSum[0,0] = diseaseHashTable[diseaseName]
        subTermSum[0,:] = subTermDoc.getrow(0)
        diseaseMatrix[diseaseHashTable[diseaseName],0] = diseaseHashTable[diseaseName]
        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()
        if output:
            print 'Filling in values in disease matrix for', diseaseName
        # Copy each term's summed value into the disease row, using row 0 of
        # subTermSum as the term-hash (column) index.
        # NOTE(review): reading row 1 assumes getColumnSum returns at least a
        # 2-row matrix (row 0: term hashes, row 1: sums) - confirm, otherwise
        # this indexes out of bounds on the (1,1) placeholder shape.
        for columnIndex in range(1,subTermSum.shape[1]):
            diseaseMatrix[diseaseHashTable[diseaseName],subTermSum[0,columnIndex]] = subTermSum[1,columnIndex]
        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'
    # Hack way of making term hashes
    # NOTE(review): range has len(termHashTable) entries but row 0 has
    # len(termHashTable)+1 columns - verify this assignment matches shapes.
    diseaseMatrix[0,:] = range(0,len(termHashTable))
    if output:
        print 'Done making disease matrix, writing to'
    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)
    if output:
        print 'Done writing disease matrix.'
    return diseaseMatrix