def runOutlierDetector(dir, distance=cosine_dense, removePercent=0.05, output=False, time_log=False):
    """
    Run outlier detection over every sub term-doc matrix (.mtx file) in `dir`
    and write the cleaned matrices to the outlier-removed folder.

    :param dir: directory containing the sub term-doc .mtx files.
    :param distance: distance function handed to outlierDetector (default cosine_dense).
    :param removePercent: fraction of outliers to remove; also encoded in the
        output folder name as a whole percentage.
    :param output: if True, print per-file progress.
    :param time_log: if True, print per-file timing.
    """
    files = IO.getSortedFilelist(dir+'/')
    if output:
        counter = 0
    for f in files:
        # Filename up to '.mtx' is the disease name.
        diseaseName = f[0:f.find('.mtx')]
        subTermDoc = IO.readInTDM(dir, diseaseName)
        if output:
            counter += 1
            print 'Count:', counter
        # If sub term document matrix is empty, just skip it.
        # (A 1-dimensional matrix is the project's empty-matrix placeholder.)
        if subTermDoc.shape[0]==1 or subTermDoc.shape[1]==1:
            continue
        if time_log:
            t1 = time.time()
        subTermDoc = outlierDetector(subTermDoc, distance, removePercent, output, time_log)
        if time_log:
            # str(...)[:4] truncates the elapsed seconds for compact logging.
            print 'Time for outlier detection on', diseaseName, ':', str(time.time() -t1)[:4]
        if output:
            # Trailing comma: Python 2 print without newline.
            print 'Writing',
        # Convert back to COO for MatrixMarket serialization.
        subTermDoc = sparse.coo_matrix(subTermDoc)
        IO.writeOutTDM(_subFolder+'/'+outlierRemoved+str(int(removePercent*100)), diseaseName, subTermDoc)
def runTFIDF(): """ Create a normalized log-transformed TFIDF-matrix from a sparse coo_matrix. """ files = IOmodule.getSortedFilelist(_matrixDir + "/") # files = sorted([f for f in os.listdir(_matrixDir+"/") if os.path.isfile(_matrixDir+"/" + f)]) for file in files: file = file[:-4] subM_coo = IOmodule.readInTDM(_matrixDir, file) t1 = time.time() dense, lil = _generateLogTFIDF(subM_coo) t2 = time.time() print "Generated log_TFIDF in " + str(t2 - t1) t1 = time.time() _normalizeVectorLengths(dense, lil, file) t2 = time.time() print "Normalized vector lengths in " + str(t2 - t1) print "Done with: " + file + "\n"
def createDiseaseLabelHash(): """ Create and save a hash that connects every PMID with one or more diseases. """ t1 = time.time() files = IOmodule.getSortedFilelist(_subMatrixDir+'/') labelHash={} fileCount=0 for file in files: subMatrix=IOmodule.readInTDM(_subMatrixDir, file) colMatrix=subMatrix.tocsc() pmids=colMatrix.getcol(0)[1:].data for pmid in pmids: try: labelHash[pmid].append(file[:-4]) except: labelHash[pmid]=[] labelHash[pmid].append(file[:-4]) fileCount+=1 print "Remaining:",(len(files)-fileCount),"Completed",file[:-4] t2 = time.time() print 'Created disease label hash in:',str(t2-t1) IOmodule.pickleOut(_hashTablesDir, _labelHash,"btd", labelHash)
def pmidDuplicateCounter(directory, number=None):
    """
    Count how many times each PMID occurs across the MedLine record files
    in `directory`.

    :param directory: path to a directory of record files (opened as
        directory + '/' + filename).
    :param number: unused in the active code; presumably limited the file
        list in the commented-out listing below -- TODO confirm.
    :return: dictionary {PMID: occurrence count}.
    """
    # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])[:number]
    files = IOmodule.getSortedFilelist(directory+'/')
    pmidCount={}
    counter=0
    for f in files:
        # Open file descriptor
        fd = open(directory+'/'+f,'r')
        # Read in from file
        # NOTE(review): eval on file content -- only safe for trusted,
        # locally generated record files.
        diseaseDic=eval(fd.read())
        # Close the file descriptor nicely again
        fd.close()
        medlineRecords=diseaseDic['records']
        for record in medlineRecords:
            pmidCount.setdefault(record['PMID'],0)
            pmidCount[record['PMID']]+=1
            # Progress marker every 1000 records processed.
            counter+=1
            if counter % 1000 == 0:
                print counter
        # print "Files remaining:",(len(files)-counter)
    return pmidCount
def countRecordfield(directory,field): """ This function counts the number of identical fields in the MedLine records. This could for instance be used for a view into how many identical PMIDs that have been downloaded on the cross of different disease searches. It takes a medline record directory (full path) and a field. It returns a dictionary on the form: {PMID: #ids,...} """ fieldSum={} # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)]) files = IOmodule.getSortedFilelist(directory+'/') counter=0 for f in files: diseaseDic=eval(open(directory+f,'r').read()) medlineRecords=diseaseDic['records'] for record in medlineRecords: item=record[field] fieldSum.setdefault(item,0) fieldSum[item]+=1 counter+=1 print "Files remaining:",(len(files)-counter) return fieldSum
def _readDiseases(indexStart=0,indexStop=None):
    """
    Function for returning the content of all or some of the crawled diseases.
    By default all are returned in a dictionary of diseases on the form:
    [{DiseaseName:{db='',terms:'',syn=[],uid:'',desc:''}}]

    The reason all the dictionaries are returned as a list is to be sure of
    the order.
    """
    path=_subFolder+"/"+diseaseFolder+'/'
    files = IOmodule.getSortedFilelist(path, startIndex=indexStart, stopIndex=indexStop)
    sortedcontents=[]
    for f in files:
        # Filename up to '.txt' is the disease name.
        diseaseName=f[0:f.find('.txt')]
        # Read and close the file explicitly; the original eval'ed an
        # anonymous open() and leaked the file handle.
        fd=open(path+f,'r')
        try:
            # NOTE: eval on file content -- only safe for trusted,
            # locally generated crawl output.
            diseaseAttr=eval(fd.read())
        finally:
            fd.close()
        contents={}
        contents[diseaseName]=diseaseAttr
        sortedcontents.append(contents)
    return sortedcontents
def createTermDoc(refreshHash=False): """ This function creates a large term-doc martix from a directory of sub term- doc matrices. It returns a matrix with dimensions given by the specified hash tables. It also saves the matrix for later use as a MatrixMarket .mtx file. """ t1 = time.time() if refreshHash: createTermAndPmidHashes() files = IOmodule.getSortedFilelist(_subMatrixDir+'/') # files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)]) termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash) pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash) # Need to add one due to non zero indexing m=len(pmidHashTable)+1 n=len(termHashTable)+1 termDoc = sparse.lil_matrix((m,n)) # Insert values representing hashes for i in range(m): termDoc[i,0]=i termDoc[0,:]=range(n) for file in files: subMatrix=IOmodule.readInTDM(_subMatrixDir, file) subMCopy=subMatrix.todok() for i,j,v in zip(subMatrix.row, subMatrix.col, subMatrix.data): m = subMCopy[i,0] n = subMCopy[0,j] # Make sure not to add index's if m==0 or n==0: continue termDoc[m,n] += v print "Added",file IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc) t2 = time.time() print 'Time elapsed:',str(t2-t1) return termDoc
def countFields(directory, fields): # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)]) files = IOmodule.getSortedFilelist(directory+'/') fieldSum={} counter=0 pmidCounter=0 emptyCounter=0 noDescription=0 for f in files: fd = open(directory+f,'r') diseaseDic=eval(fd.read()) fd.close() medlineRecords=diseaseDic['records'] descriptionField=diseaseDic['description'] if descriptionField== '': noDescription+=1 if medlineRecords == []: print "Found empty record" emptyCounter+=1 for record in medlineRecords: pmidCounter+=1 for label in fields: fieldSum.setdefault(label,0) if label in record: fieldSum[label]+=1 counter+=1 print "Files remaining:",(len(files)-counter) return fieldSum,{'pmid count': pmidCounter},{'empty diseases': emptyCounter}
def runAndSaveMatrices():
    """
    Transform a directory of matrices to a directory of decomposed matrices.

    Each matrix is reduced via SVD (keeping the index row/column intact) and
    written to _newMatrixDir; matrices too small for SVD are copied through
    unchanged.
    """
    files = IOmodule.getSortedFilelist(_oldMatrixDir+'/')
    for file in files:
        M_coo=IOmodule.readInTDM(_oldMatrixDir,file)
        # Make sure the matrix contains information (1-dim. is an empty matrix)
        if M_coo.shape[0]==1:
            continue
        print "Shape:"+str(M_coo.shape)
        # SVD does not run well single dimenstion matrices
        ## (remembering that the first dimension is indices and does not count)
        if M_coo.shape[0]>2:
            M_dense=M_coo.todense()
            # Run SVD
            U,Sig,Vt=_svd(M_dense)
            # Get the reduced semantic space
            S= _semanticSpace(U,Sig,Vt,_reduceBy)
            # Recombine the indices and the reduced matrix:
            # row 0 / column 0 are hash indices and are preserved, only the
            # data part [1:,1:] is replaced by the reduced space.
            M_dense[1:,1:]=S.todense()
            # Save the matrix
            M_coo=sparse.coo_matrix(M_dense)
            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
        else:
            # Too small to decompose; pass the matrix through unchanged.
            print "Dimensionality too low for svd"
            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
def medlineDir2MatrixDir(): """ This function converts a directory of MedLine records to a new directory of corresponding term-doc matrices. It takes the matrix dimensions (row: m, col: n). It creates a directory (in the home folder) named 'diseaseMatrices' and stores the matrices as 'MatrixMarket' .mtx files, named by the disease name. """ termHashTable=IOmodule.pickleIn(_hashTablesDir, _termHash) pmidHashTable=IOmodule.pickleIn(_hashTablesDir, _pmidHash) files = IOmodule.getSortedFilelist(_medlineDir+'/') # files = sorted([f for f in os.listdir(_medlineDir+"/") if os.path.isfile(_medlineDir+"/" + f)]) counter = 0 for file in files: data = _gatherMatrixData(file) # Get matrix dimensions (+1 for the [0,0] field) ## (Here follows a small 0.0001 sec. hack to get n = total number of terms) temp={} for pmid in data: for term in pmid[1]: temp[term[0]]=0 m=len(data)+1 n=len(temp)+1 M = _populateMatrix(m, n, data,termHashTable, pmidHashTable) diseaseName = file[0:file.find('.txt')] IOmodule.writeOutTDM(_subMatrixDir, diseaseName, M) counter += 1 print str(counter),"matrices made. Total number of terms:",len(M.getrow(0).nonzero()[0])
def createDiseaseHash(dir,output=False): """ Recieves a directory containing files to be hashed. It uses the filename as a key. It requires the files to be in .mtx format. The hashes starts from 1 ... number_of_files """ diseaseHashes={} files = IO.getSortedFilelist(dir) counter=0 for f in files: diseaseName=f[0:f.find('.mtx')] stdm=IO.readInTDM(dir,f) if stdm.shape[0]==1: continue if diseaseName not in diseaseHashes.keys(): counter+=1 if output: print 'Created', diseaseName, 'with hash', counter diseaseHashes[diseaseName]=counter IO.pickleOut(_hashTablesDir, diseaseHash,"btd", diseaseHashes)
def createTermAndPmidHashes(): """ This function creates two hash tables of the PMID's and terms to be used for the term-doc matrix. Note that the terms are sanitized for any non-alphanumerical characters. And it is default to remove stop words. """ medlineDir = _medlineDir hashTables = _hashTablesDir termHashTable={} pmidHashTable={} termCounter = 0 pmidCounter = 0 files = IOmodule.getSortedFilelist(medlineDir+'/') # files = sorted([f for f in os.listdir(medlineDir+"/") if os.path.isfile(medlineDir+"/"+f)]) # Get the regex pattern that sanitizeses strings. sanitizer = TextCleaner.sanitizeString() for file in files: records = RecordHandler.loadMedlineRecords(medlineDir, file) # *Note* # Parts of the following loops could be optimized by using dictionaries # for direct loopkups instead of linear lookups, but since it's not # important, optimization will have to wait for another day. # Hash PMID's for diseaseRecords in records.values(): for record in diseaseRecords: pmid=record[0] if pmid not in pmidHashTable: pmidCounter+=1 pmidHashTable[pmid]=pmidCounter information='' # Get the abstract try: information=' '+record[1]['AB'] except: print 'Unable to get abstract', record[0] try: information+=' '+record[1]['TI'] except: print 'Unable to get title for', record[0] if 'MH' in record[1]: for meshterm in record[1]['MH']: information+=' '+meshterm # We do not want to print this, as most of the # records do not have MeSH. # print 'Unable to get MeSH terms for', record[0] # Sanitize the information information=sanitizer.sub(' ', information) # remove stopwords from the abstract information=FilterInterface.stopwordRemover(information) # OPTIONAL: # Stem the abstract if _stemmer: information=FilterInterface.porterStemmer(information) termList = [word for word in information.split(' ') if word != ''] for term in termList: if term not in termHashTable: termCounter+=1 termHashTable[term]=termCounter else: continue print str(termCounter)+" terms hashed. 
"+str(pmidCounter)+" pmids hashed." IOmodule.pickleOut(hashTables, _termHash,"btd", termHashTable) IOmodule.pickleOut(hashTables, _pmidHash,"btd", pmidHashTable) return termHashTable, pmidHashTable
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):
    """
    Recieves a subMatrixDir goes through all the files and sums up the
    column of it, creating a single row vector containing the sum of all
    column in the sub term doc matrix. It then proceeds to making a disease
    term doc, based on these row vectors.

    Optional flags are:
    avg, takes the average over the columns of the sub matrices instead of
    the sum.
    output, makes the funtion produce additional output
    time_log, makes the function print out how much time is spend on what
    """
    if output:
        print 'Initialising...'
    if time_log:
        t1 = time.time()
    files = IO.getSortedFilelist(subMatrixDir)
    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)
    # One row per disease plus the index row; one column per term plus the
    # index column.
    diseaseMatrix=sparse.lil_matrix((len(files)+1,len(termHashTable)+1))
    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1,1))
    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1,len(termHashTable)+1))
    count = 0
    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]
    for f in files:
        if time_log:
            t2 = time.time()
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count+=1
            print 'Numbers remaining', len(files)-count
        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()
        # If the subTermDoc contains nothing, just skip it
        if(subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue
        # NOTE(review): getColumnSum presumably returns a 2-row matrix
        # (row 0: term hashes, row 1: sums) -- the loop below reads
        # subTermSum[1,...]. TODO confirm against getColumnSum.
        subTermSum = getColumnSum(subTermDoc,avg)
        subTermSum[0,0] = diseaseHashTable[diseaseName]
        # NOTE(review): this row assignment overwrites [0,0] set on the
        # previous line with subTermDoc's [0,0] index field -- looks
        # unintended; verify before relying on subTermSum[0,0].
        subTermSum[0,:] = subTermDoc.getrow(0)
        diseaseMatrix[diseaseHashTable[diseaseName],0] = diseaseHashTable[diseaseName]
        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()
        if output:
            print 'Filling in values in disease matrix for', diseaseName
        # Scatter each summed term value into the disease's row, addressed
        # by the term hash stored in row 0 of subTermSum.
        for columnIndex in range(1,subTermSum.shape[1]):
            diseaseMatrix[diseaseHashTable[diseaseName],subTermSum[0,columnIndex]] = subTermSum[1,columnIndex]
        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'
    # Hack way of making term hashes
    # NOTE(review): range has len(termHashTable) elements while row 0 has
    # len(termHashTable)+1 columns -- confirm this assignment behaves as
    # intended.
    diseaseMatrix[0,:] = range(0,len(termHashTable))
    if output:
        print 'Done making disease matrix, writing to'
    # 'label' is a module-level name -- the output matrix file name.
    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)
    if output:
        print 'Done writing disease matrix.'
    return diseaseMatrix