def createTermDoc(refreshHash=False): """ This function creates a large term-doc martix from a directory of sub term- doc matrices. It returns a matrix with dimensions given by the specified hash tables. It also saves the matrix for later use as a MatrixMarket .mtx file. """ t1 = time.time() if refreshHash: createTermAndPmidHashes() files = IOmodule.getSortedFilelist(_subMatrixDir+'/') # files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)]) termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash) pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash) # Need to add one due to non zero indexing m=len(pmidHashTable)+1 n=len(termHashTable)+1 termDoc = sparse.lil_matrix((m,n)) # Insert values representing hashes for i in range(m): termDoc[i,0]=i termDoc[0,:]=range(n) for file in files: subMatrix=IOmodule.readInTDM(_subMatrixDir, file) subMCopy=subMatrix.todok() for i,j,v in zip(subMatrix.row, subMatrix.col, subMatrix.data): m = subMCopy[i,0] n = subMCopy[0,j] # Make sure not to add index's if m==0 or n==0: continue termDoc[m,n] += v print "Added",file IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc) t2 = time.time() print 'Time elapsed:',str(t2-t1) return termDoc
def medlineDir2MatrixDir(): """ This function converts a directory of MedLine records to a new directory of corresponding term-doc matrices. It takes the matrix dimensions (row: m, col: n). It creates a directory (in the home folder) named 'diseaseMatrices' and stores the matrices as 'MatrixMarket' .mtx files, named by the disease name. """ termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash) pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash) files = IOmodule.getSortedFilelist(_medlineDir+'/') # files = sorted([f for f in os.listdir(_medlineDir+"/") if os.path.isfile(_medlineDir+"/" + f)]) counter = 0 for file in files: data = _gatherMatrixData(file) # Get matrix dimensions (+1 for the [0,0] field) ## (Here follows a small 0.0001 sec. hack to get n = total number of terms) temp={} for pmid in data: for term in pmid[1]: temp[term[0]]=0 m=len(data)+1 n=len(temp)+1 M = _populateMatrix(m, n, data,termHashTable, pmidHashTable) diseaseName = file[0:file.find('.txt')] IOmodule.writeOutTDM(_subMatrixDir, diseaseName, M) counter += 1 print str(counter),"matrices made. Total number of terms:",len(M.getrow(0).nonzero()[0])
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):
    """
    Receive a subMatrixDir, go through all the files and sum up the
    columns of each sub term-doc matrix into a single row vector, then
    build a disease term-doc matrix from these row vectors.

    Optional flags:
      avg      -- take the average over the columns of the sub matrices
                  instead of the sum.
      output   -- produce additional progress output.
      time_log -- print how much time is spent on what.

    Returns the disease matrix (sparse lil_matrix); also writes it out
    via IO.writeOutTDM.

    NOTE(review): the free names `diseaseHash`, `getColumnSum` and
    `label` are presumably module-level definitions not visible in this
    chunk -- confirm they exist before running.
    """
    if output:
        print 'Initialising...'
    if time_log:
        t1 = time.time()
    files = IO.getSortedFilelist(subMatrixDir)
    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)
    # Row 0 / column 0 are reserved for hash indices, hence the +1.
    diseaseMatrix=sparse.lil_matrix((len(files)+1,len(termHashTable)+1))
    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1,1))
    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1,len(termHashTable)+1))
    count = 0
    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]
    for f in files:
        if time_log:
            t2 = time.time()
        # Disease name is the file name without the '.mtx' extension.
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count+=1
            print 'Numbers remaining', len(files)-count
        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()
        # If the subTermDoc contains nothing, just skip it
        # (a 1x1 matrix holds only the reserved [0,0] field).
        if(subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue
        # NOTE(review): the indexing below implies getColumnSum returns a
        # matrix with at least two rows -- row 0 carrying term hashes and
        # row 1 the summed/averaged values -- TODO confirm.
        subTermSum = getColumnSum(subTermDoc,avg)
        subTermSum[0,0] = diseaseHashTable[diseaseName]
        subTermSum[0,:] = subTermDoc.getrow(0)
        # Record the disease hash in column 0 of its row.
        diseaseMatrix[diseaseHashTable[diseaseName],0] = diseaseHashTable[diseaseName]
        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()
        if output:
            print 'Filling in values in disease matrix for', diseaseName
        # Scatter the summed values into the disease row: row 0 of
        # subTermSum holds the destination (term-hash) column indices,
        # row 1 holds the values.
        for columnIndex in range(1,subTermSum.shape[1]):
            diseaseMatrix[diseaseHashTable[diseaseName],subTermSum[0,columnIndex]] = subTermSum[1,columnIndex]
        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'
    # Hack way of making term hashes
    diseaseMatrix[0,:] = range(0,len(termHashTable))
    if output:
        print 'Done making disease matrix, writing to'
    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)
    if output:
        print 'Done writing disease matrix.'
    return diseaseMatrix
# Load the precomputed length of each column in the stemmed term-doc matrix #_termSum = IOmodule.pickleIn(_hashTablePath,_CLHash) ####################################################################################### #### Use stopword-removed, Porter-stemmed (english) and TFIDF-prefiltered TermDoc: #### ####################################################################################### # TFIDF-matrix file name _tfidfName = "label_TFIDFMatrix_reduced_90" # Vector-norm hash for then TFIDFMatrix #_RLHash = "RLHash_tfidf_stemmed" # Hash for the number of documents each term occur in _CLHash = "svdlabel_CLHash" # Load the precomputed length of each column in the stemmed term-doc matrix _termSum = IOmodule.pickleIn(_hashTablePath,_CLHash) #################################################################### print "Hashes loaded." def _generateLogTFIDF(M_coo): """ Creates a Term-Frequency Inverse-Document-Frequency from a sparse coo_matrix, using log-transformation on TF and IDF. Returns a sparse lil_matrix to be used for vector-normalization. """
# Main folder _path = os.getenv("HOME") + "/" + "The_Hive" # Sub folder _subFolder = _path + "/" + "term_doc" # Hashtable directory _hashTablePath = _subFolder + "/" + "hashTables" # Set True for Porter-stemming _stemmer = True # For term document set to True, for DiseaseMatrix set to false. _termdocumentmatrix = False # Cosine measure = True, Sum measure = False _cosineMeasure = False ############ if _termdocumentmatrix: # Disease label hash (for pmid lookup) _labelHash = IOmodule.pickleIn(_hashTablePath, "labelHash") print "Label hash loaded" else: # Disease label hash (for label lookup) _diseaseHash = IOmodule.pickleIn(_hashTablePath, "diseaseHash") #_reduced") _labelHash = dict(zip(_diseaseHash.values(), _diseaseHash.keys())) print "Disease hash loaded" ############ def search(M_lil, M_csc, queryString, top=20): """ This function is still a work in progress.. """ sanitizer = TextCleaner.sanitizeString()
# Term- and PMID-hash directory _hashTablesDir=_path+'/'+'term_doc'+'/'+"hashTables" # If subFolder do not exists if not os.path.isdir(_path+'/'+subFolder): os.mkdir(_path+'/'+subFolder) _stemmer=True #################################################################### #### Use stopword-removed TermDoc ################################## #################################################################### if not _stemmer: # Hashes termHashTable=IOmodule.pickleIn(_hashTablesDir, "termHash") pmidHashTable=IOmodule.pickleIn(_hashTablesDir, "pmidHash") revPmidHashTable=dict(zip(pmidHashTable.values(),pmidHashTable.keys())) #################################################################### #### Use stopword-removed and Porter-stemmed (english) TermDoc: #### #################################################################### else: # Stemmed hashes termHashTable=IOmodule.pickleIn(_hashTablesDir, "termHash_stemmed") pmidHashTable=IOmodule.pickleIn(_hashTablesDir, "pmidHash_stemmed") revPmidHashTable=dict(zip(pmidHashTable.values(),pmidHashTable.keys())) print "Hashes loaded"