def runOutlierDetector(dir, distance=cosine_dense, removePercent=0.05, output=False, time_log=False): files = IO.getSortedFilelist(dir+'/') if output: counter = 0 for f in files: diseaseName = f[0:f.find('.mtx')] subTermDoc = IO.readInTDM(dir, diseaseName) if output: counter += 1 print 'Count:', counter # If sub term document matrix is empty, just skip it. if subTermDoc.shape[0]==1 or subTermDoc.shape[1]==1: continue if time_log: t1 = time.time() subTermDoc = outlierDetector(subTermDoc, distance, removePercent, output, time_log) if time_log: print 'Time for outlier detection on', diseaseName, ':', str(time.time() -t1)[:4] if output: print 'Writing', subTermDoc = sparse.coo_matrix(subTermDoc) IO.writeOutTDM(_subFolder+'/'+outlierRemoved+str(int(removePercent*100)), diseaseName, subTermDoc)
def runTFIDF(): """ Create a normalized log-transformed TFIDF-matrix from a sparse coo_matrix. """ files = IOmodule.getSortedFilelist(_matrixDir + "/") # files = sorted([f for f in os.listdir(_matrixDir+"/") if os.path.isfile(_matrixDir+"/" + f)]) for file in files: file = file[:-4] subM_coo = IOmodule.readInTDM(_matrixDir, file) t1 = time.time() dense, lil = _generateLogTFIDF(subM_coo) t2 = time.time() print "Generated log_TFIDF in " + str(t2 - t1) t1 = time.time() _normalizeVectorLengths(dense, lil, file) t2 = time.time() print "Normalized vector lengths in " + str(t2 - t1) print "Done with: " + file + "\n"
def createDiseaseLabelHash(): """ Create and save a hash that connects every PMID with one or more diseases. """ t1 = time.time() files = IOmodule.getSortedFilelist(_subMatrixDir+'/') labelHash={} fileCount=0 for file in files: subMatrix=IOmodule.readInTDM(_subMatrixDir, file) colMatrix=subMatrix.tocsc() pmids=colMatrix.getcol(0)[1:].data for pmid in pmids: try: labelHash[pmid].append(file[:-4]) except: labelHash[pmid]=[] labelHash[pmid].append(file[:-4]) fileCount+=1 print "Remaining:",(len(files)-fileCount),"Completed",file[:-4] t2 = time.time() print 'Created disease label hash in:',str(t2-t1) IOmodule.pickleOut(_hashTablesDir, _labelHash,"btd", labelHash)
def createTermDoc(refreshHash=False): """ This function creates a large term-doc martix from a directory of sub term- doc matrices. It returns a matrix with dimensions given by the specified hash tables. It also saves the matrix for later use as a MatrixMarket .mtx file. """ t1 = time.time() if refreshHash: createTermAndPmidHashes() files = IOmodule.getSortedFilelist(_subMatrixDir+'/') # files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)]) termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash) pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash) # Need to add one due to non zero indexing m=len(pmidHashTable)+1 n=len(termHashTable)+1 termDoc = sparse.lil_matrix((m,n)) # Insert values representing hashes for i in range(m): termDoc[i,0]=i termDoc[0,:]=range(n) for file in files: subMatrix=IOmodule.readInTDM(_subMatrixDir, file) subMCopy=subMatrix.todok() for i,j,v in zip(subMatrix.row, subMatrix.col, subMatrix.data): m = subMCopy[i,0] n = subMCopy[0,j] # Make sure not to add index's if m==0 or n==0: continue termDoc[m,n] += v print "Added",file IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc) t2 = time.time() print 'Time elapsed:',str(t2-t1) return termDoc
def loadMatrix(dirPath,filename): t1=time.time() M = IOmodule.readInTDM(dirPath, filename) t2=time.time() print str(t2-t1) return M
def getSemanticKeywords(top=20): diseaseList=[("Fibrodysplasia ossificans progressiva","Boy, normal birth, deformity of both big toes (missing joint), quick development of bone tumor near spine and osteogenesis at biopsy"), ("Adrenoleukodystrophy autosomal neonatal form","Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions"), ("Papillon Lefevre syndrome","Boy age 14, yellow keratotic plaques on the skin of palms and soles going up onto the dorsal side. Both hands and feet are affected. swollen vulnerable gums, loss of permanent teeth"), ("Kleine Levin Syndrome","Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"), ("Schinzel Giedion syndrome","Male child, malformations at birth, midfacial retraction with a deep groove under the eyes, and hypertelorism, short nose with a low nasal bridge and large low-set ears, wide mouth and retrognathia. 
Hypertrichosis with bright reddish hair and a median frontal cutaneous angioma, short neck with redundant skin, Bilateral inguinal hernias, hypospadias with a megameatus, and cryptorchidism")] matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_90" #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_10" #matrixDir="/root/The_Hive/term_doc/new_diseaseMatrices_tfidf_stemmed_reduced90_outlierRemoved5" scoreDic={} totalTermDic={} for disease in diseaseList: filename=disease[0] symptoms=disease[1] symptoms=SearchTermDoc._modifySearchString(symptoms) symptoms=map(FilterInterface.porterStemmer,symptoms) M_coo=IOmodule.readInTDM(matrixDir,filename) totalTermDic[filename]=(M_coo.shape[1]-1) M_csc=M_coo.tocsc() termSum=[] for col in range(1,M_coo.shape[1]): term=revTermHashTable[M_csc[0,col]] termSum.append((sum(M_csc.getcol(col).data[:-1]),term)) termSum.sort() termSum.reverse() scoreDic[filename]={} for item in termSum: if item[1] in symptoms: scoreDic[filename][item[1]]=termSum.index(item) #return termSum[:top] for score in scoreDic.items(): print "Total number of terms for the disease:",totalTermDic[score[0]] print str(score[0])+'\t'+str(score[1].items()) print ''
def runAndSaveMatrices(): """ Transform a directory of matrices to a directory of decomposed matrices. """ files = IOmodule.getSortedFilelist(_oldMatrixDir+'/') for file in files: M_coo=IOmodule.readInTDM(_oldMatrixDir,file) # Make sure the matrix contains information (1-dim. is an empty matrix) if M_coo.shape[0]==1: continue print "Shape:"+str(M_coo.shape) # SVD does not run well single dimenstion matrices ## (remembering that the first dimension is indices and does not count) if M_coo.shape[0]>2: M_dense=M_coo.todense() # Run SVD U,Sig,Vt=_svd(M_dense) # Get the reduced semantic space S= _semanticSpace(U,Sig,Vt,_reduceBy) # Recombine the indices and the reduced matrix M_dense[1:,1:]=S.todense() # Save the matrix M_coo=sparse.coo_matrix(M_dense) IOmodule.writeOutTDM(_newMatrixDir, file, M_coo) print '' else: print "Dimensionality too low for svd" IOmodule.writeOutTDM(_newMatrixDir, file, M_coo) print ''
def createDiseaseHash(dir,output=False): """ Recieves a directory containing files to be hashed. It uses the filename as a key. It requires the files to be in .mtx format. The hashes starts from 1 ... number_of_files """ diseaseHashes={} files = IO.getSortedFilelist(dir) counter=0 for f in files: diseaseName=f[0:f.find('.mtx')] stdm=IO.readInTDM(dir,f) if stdm.shape[0]==1: continue if diseaseName not in diseaseHashes.keys(): counter+=1 if output: print 'Created', diseaseName, 'with hash', counter diseaseHashes[diseaseName]=counter IO.pickleOut(_hashTablesDir, diseaseHash,"btd", diseaseHashes)
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):
    """
    Receives a subMatrixDir, goes through all the files and sums up the
    columns of each, creating a single row vector containing the sum of
    all columns in the sub term-doc matrix. It then proceeds to making a
    disease term-doc based on these row vectors.

    Optional flags are:
    avg      -- take the average over the columns of the sub matrices
                instead of the sum.
    output   -- produce additional progress output.
    time_log -- print out how much time is spent on what.
    """
    if output:
        print 'Initialising...'
    if time_log:
        t1 = time.time()
    files = IO.getSortedFilelist(subMatrixDir)
    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)
    # One row per disease, one column per term; +1 because hashes start at 1
    # (row/column 0 hold the index values).
    diseaseMatrix = sparse.lil_matrix((len(files)+1, len(termHashTable)+1))
    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1, 1))
    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1, len(termHashTable)+1))
    count = 0
    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]
    for f in files:
        if time_log:
            t2 = time.time()
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count += 1
            print 'Numbers remaining', len(files) - count
        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()
        # If the subTermDoc contains nothing, just skip it
        if(subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue
        subTermSum = getColumnSum(subTermDoc, avg)
        subTermSum[0, 0] = diseaseHashTable[diseaseName]
        # NOTE(review): the next line overwrites all of row 0 — including the
        # [0,0] disease hash set on the previous line, which therefore looks
        # dead. Confirm whether these two assignments are in the intended order.
        subTermSum[0, :] = subTermDoc.getrow(0)
        diseaseMatrix[diseaseHashTable[diseaseName], 0] = diseaseHashTable[diseaseName]
        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()
        if output:
            print 'Filling in values in disease matrix for', diseaseName
        for columnIndex in range(1, subTermSum.shape[1]):
            # Row 0 of subTermSum is used as term hashes and row 1 as the
            # summed values — assumes getColumnSum returns a matrix with at
            # least two rows; TODO confirm against getColumnSum.
            diseaseMatrix[diseaseHashTable[diseaseName], subTermSum[0, columnIndex]] = subTermSum[1, columnIndex]
        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'
    # Hack way of making term hashes
    # NOTE(review): range(0, len(termHashTable)) is one element shorter than
    # the matrix width of len(termHashTable)+1 — verify the last column is
    # intentionally left without a hash.
    diseaseMatrix[0, :] = range(0, len(termHashTable))
    if output:
        print 'Done making disease matrix, writing to'
    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)
    if output:
        print 'Done writing disease matrix.'
    return diseaseMatrix