def getMedlineList(pmids):
    """
    This function takes a list of article-ids and returns a list of MedLine
    articles that contain an abstract.
    """
    records = []
    cleaned_records = []
    listLength = len(pmids)
    Entrez.email = '*****@*****.**'

    for i in range(0, listLength, 650):
        tempList = pmids[i:i + 650]
        handle = Entrez.efetch(db='pubmed', id=tempList, rettype='medline', retmode='text')
        try:
            records.extend(list(Medline.parse(handle)))
        except:
            IOmodule.writeOutTxt(_mainFolder+'/'+'errordir_medline_records', pmids[i], '')
        print 'Downloaded', len(records), 'MedLine articles.', str(listLength-len(records)), 'remaining...'

    for article in records:
        if 'AB' in article:
            cleaned_records.append(article)
    print 'Returned', len(cleaned_records), 'MedLine articles containing an abstract.'
    return cleaned_records
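# A minimal usage sketch for getMedlineList (not part of the original module).
# It assumes Biopython is installed and that the module-level names used above
# (IOmodule, _mainFolder) are configured; the PMIDs below are hypothetical.
def _example_getMedlineList_usage():
    example_pmids = ['11056654', '11059090']
    articles = getMedlineList(example_pmids)
    for article in articles:
        # Each record behaves like a dict keyed by MedLine field codes.
        print article['PMID'], article.get('TI', '')[:60]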
def runTFIDF():
    """
    Create a normalized log-transformed TFIDF-matrix from a sparse coo_matrix.
    """
    files = IOmodule.getSortedFilelist(_matrixDir + "/")
    # files = sorted([f for f in os.listdir(_matrixDir+"/") if os.path.isfile(_matrixDir+"/" + f)])

    for file in files:
        file = file[:-4]
        subM_coo = IOmodule.readInTDM(_matrixDir, file)

        t1 = time.time()
        dense, lil = _generateLogTFIDF(subM_coo)
        t2 = time.time()
        print "Generated log_TFIDF in " + str(t2 - t1)

        t1 = time.time()
        _normalizeVectorLengths(dense, lil, file)
        t2 = time.time()
        print "Normalized vector lengths in " + str(t2 - t1)
        print "Done with: " + file + "\n"
def createVLHash(M_lil):
    VLHash = {}
    for pmidHash in range(M_lil.shape[0]):
        VLHash[pmidHash] = linalg.norm((M_lil.getrow(pmidHash).data[0])[1:])
    IOmodule.pickleOut("/root/The_Hive/term_doc/hashTables", "VLHash", "btd", VLHash)
def createRLHash(M_lil, filename, save_file=True):
    """
    Precompute and save the norm of each row vector in the term-doc matrix.
    """
    t1 = time.time()
    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    RLHash = {}
    count = 0
    for pmidHash in range(1, M_lil.shape[0]):
        RLHash[pmidHash] = linalg.norm((M_lil.getrow(pmidHash).data[0])[1:])
        count += 1
        if save_file:
            print "Hashes created: " + str(count)

    if save_file:
        IOmodule.pickleOut(_hashTablesDir, filename, "btd", RLHash)
    else:
        return RLHash

    t2 = time.time()
    print "Created and saved RowLength-hash in: " + str(t2-t1)
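# A small self-contained sketch (not from the original code) of the row-norm
# pattern used in createVLHash/createRLHash: column 0 of each row holds the
# PMID hash, so it is sliced away before taking the Euclidean norm.
def _example_row_norm():
    from scipy import sparse
    from numpy import linalg

    M = sparse.lil_matrix((2, 4))
    M[1, 0] = 42      # PMID hash stored in column 0
    M[1, 1] = 3.0
    M[1, 2] = 4.0
    print linalg.norm((M.getrow(1).data[0])[1:])  # prints 5.0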
def createDiseaseLabelHash():
    """
    Create and save a hash that connects every PMID with one or more diseases.
    """
    t1 = time.time()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')

    labelHash = {}
    fileCount = 0
    for file in files:
        subMatrix = IOmodule.readInTDM(_subMatrixDir, file)
        colMatrix = subMatrix.tocsc()
        pmids = colMatrix.getcol(0)[1:].data
        for pmid in pmids:
            try:
                labelHash[pmid].append(file[:-4])
            except:
                labelHash[pmid] = []
                labelHash[pmid].append(file[:-4])
        fileCount += 1
        print "Remaining:", (len(files)-fileCount), "Completed", file[:-4]

    t2 = time.time()
    print 'Created disease label hash in:', str(t2-t1)

    IOmodule.pickleOut(_hashTablesDir, _labelHash, "btd", labelHash)
def _normalizeVectorLengths(M_lil):
    """
    Normalize the length of a sparse lil_matrix.
    """
    t1 = time.time()

    # Create a norm-hash of each row-vector in the stemmed term-doc matrix.
    vectorLength = SearchTermDoc.createRLHash(M_lil, _RLHash, False)

    for row in range(1, M_lil.shape[0]):
        norm = vectorLength[row]
        for col in (M_lil.getrow(row).nonzero()[1])[1:]:
            M_lil[row, col] = (M_lil[row, col]) / norm
        print "Normalized:", row

    t2 = time.time()
    print "Total:" + str(t2-t1)

    tfidfMatrix = M_lil

    # Save and overwrite the log_tfidf generated above
    IOmodule.writeOutTDM(_termDocDir, _tfidfName + '_norm', tfidfMatrix)
def runOutlierDetector(dir, distance=cosine_dense, removePercent=0.05, output=False, time_log=False):
    files = IO.getSortedFilelist(dir+'/')

    if output:
        counter = 0

    for f in files:
        diseaseName = f[0:f.find('.mtx')]
        subTermDoc = IO.readInTDM(dir, diseaseName)

        if output:
            counter += 1
            print 'Count:', counter

        # If sub term document matrix is empty, just skip it.
        if subTermDoc.shape[0] == 1 or subTermDoc.shape[1] == 1:
            continue

        if time_log:
            t1 = time.time()

        subTermDoc = outlierDetector(subTermDoc, distance, removePercent, output, time_log)

        if time_log:
            print 'Time for outlier detection on', diseaseName, ':', str(time.time() - t1)[:4]

        if output:
            print 'Writing',

        subTermDoc = sparse.coo_matrix(subTermDoc)

        IO.writeOutTDM(_subFolder+'/'+outlierRemoved+str(int(removePercent*100)), diseaseName, subTermDoc)
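# runOutlierDetector takes a distance function as an argument; a minimal
# sketch (an assumption, not the original cosine_dense implementation) of
# what such a measure could look like: 1 minus the cosine similarity of two
# dense vectors.
def _example_cosine_distance(u, v):
    import numpy as np
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    return 1.0 - np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))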
def createCLHash(M_coo, filename, save_file=True):
    """
    Precompute and save the length of each column vector in the term-doc
    matrix. Here the length refers to the number of elements.
    """
    t1 = time.time()
    if not os.path.isdir(_hashTablesDir):
        os.mkdir(_hashTablesDir)

    M_lil = (M_coo.transpose()).tolil()

    CLHash = {}
    count = 0
    for termHash in range(1, M_coo.shape[1]):
        termVectorLength = len((M_lil.getrow(termHash).nonzero()[0])[1:])
        CLHash[termHash] = termVectorLength
        count += 1
        if save_file:
            print "Hashes created: " + str(count) + ". Length:" + str(termVectorLength)

    t2 = time.time()
    if save_file:
        print "Created and saved ColumnLength-hash in: " + str(t2-t1)
        IOmodule.pickleOut(_hashTablesDir, filename, "btd", CLHash)
    else:
        return CLHash
def saveMatrix(filename, matrix):
    t1 = time.time()
    IOmodule.writeOutTDM('testFolder', filename, matrix)
    t2 = time.time()
    print 'Time used: ', (t2-t1)
def _generateLogTFIDF(M_coo):
    """
    Creates a Term-Frequency Inverse-Document-Frequency matrix from a sparse
    coo_matrix, using log-transformation on TF and IDF.

    Returns a sparse lil_matrix to be used for vector-normalization.
    """
    totalTime1 = time.time()
    numberOfDocs = float(M_coo.shape[0] - 1)

    print "Converting from coo to lil..."
    t1 = time.time()
    tfidfMatrix = M_coo.tolil()
    t2 = time.time()
    print "Matrix converted to lil in", (t2-t1)

    t1 = time.time()
    for row in range(1, int(numberOfDocs) + 1):
        for col in (tfidfMatrix.getrow(row).nonzero()[1])[1:]:
            tf = tfidfMatrix[row, col]
            #if tf == 0:
            #    print "Looked up zero-value at: "+str(docIndex)+" "+str(termVectorIndex)
            #    raise Exception
            if tf <= 0:
                print tf
                tf = 0.000000000000001  # <---for svd
            try:
                tf = math.log(1 + tf)
            except:
                print tf
            if _termSum[col] == 0:
                continue
            idf = math.log(numberOfDocs / _termSum[col])
            tfidfMatrix[row, col] = tf * idf
        print "Row:", row
    t2 = time.time()
    print "Total:" + str(t2-t1)

    # Save and overwrite the log_tfidf generated above
    IOmodule.writeOutTDM(_termDocDir, _tfidfName, tfidfMatrix)

    totalTime2 = time.time()
    print "Total time: " + str(totalTime2-totalTime1)

    return tfidfMatrix
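# A toy illustration (not part of the original module) of the weight computed
# above for a single cell: w = log(1 + tf) * log(N / df), where df is taken
# from _termSum in the real code. The numbers below are hypothetical.
def _example_log_tfidf_weight():
    import math
    tf = 3.0     # term count in one document
    df = 5.0     # number of documents containing the term
    N = 100.0    # total number of documents
    print math.log(1 + tf) * math.log(N / df)   # ~4.15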
def pmidDuplicateCounter(directory, number=None):
    # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])[:number]
    files = IOmodule.getSortedFilelist(directory+'/')

    pmidCount = {}
    counter = 0
    for f in files:
        # Open file descriptor
        fd = open(directory+'/'+f, 'r')
        # Read in from file
        diseaseDic = eval(fd.read())
        # Close the file descriptor nicely again
        fd.close()

        medlineRecords = diseaseDic['records']

        for record in medlineRecords:
            pmidCount.setdefault(record['PMID'], 0)
            pmidCount[record['PMID']] += 1

        counter += 1
        if counter % 1000 == 0:
            print counter
            # print "Files remaining:",(len(files)-counter)

    return pmidCount
def _readDiseases(indexStart=0, indexStop=None):
    """
    Function for returning the content of all or some of the crawled diseases.

    By default all are returned, as a list of dictionaries of the form:
    [{DiseaseName:{db='',terms:'',syn=[],uid:'',desc:''}}]

    The dictionaries are returned in a list to preserve their order.
    """
    path = _subFolder + "/" + diseaseFolder + '/'
    # files=sorted([f for f in os.listdir(path) if os.path.isfile(path+f)])
    files = IOmodule.getSortedFilelist(path, startIndex=indexStart, stopIndex=indexStop)

    sortedcontents = []
    for f in files:
        contents = {}
        diseaseName = f[0:f.find('.txt')]
        diseaseAttr = eval(open(path+f, 'r').read())
        contents[diseaseName] = diseaseAttr
        sortedcontents.append(contents)

    return sortedcontents
def countRecordfield(directory, field):
    """
    This function counts the number of identical fields in the MedLine
    records.

    This can for instance give a view of how many identical PMIDs have been
    downloaded across the different disease searches.

    It takes a MedLine record directory (full path) and a field.

    It returns a dictionary on the form: {PMID: #ids,...}
    """
    fieldSum = {}

    # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])
    files = IOmodule.getSortedFilelist(directory+'/')

    counter = 0
    for f in files:
        diseaseDic = eval(open(directory+f, 'r').read())
        medlineRecords = diseaseDic['records']
        for record in medlineRecords:
            item = record[field]
            fieldSum.setdefault(item, 0)
            fieldSum[item] += 1
        counter += 1
        print "Files remaining:", (len(files)-counter)

    return fieldSum
def gatherOfAllThings(startIndex=0, stopIndex=None):
    # Get the number of diseases, based on start and stop. If
    # stopIndex = None, then the whole range is returned.
    numberOfRareDiseases = len(_readDiseases(startIndex, stopIndex))

    # Default number per chunk, before writeout
    numberToGet = 1

    # Calculate the number of steps, for looping.
    steps = int(math.ceil(numberOfRareDiseases / float(numberToGet)))

    # Default directory to save information files in.
    directory = 'medline_records'

    # _path_medlinerecords is assumed to be a module-level path prefix.
    global _path_medlinerecords
    _path_medlinerecords += _mainFolder+'/'+directory
    if not os.path.isdir(_path_medlinerecords):
        os.mkdir(_path_medlinerecords)

    # Read in the range of diseases we want to get information about,
    # as a list; it needs to be sorted to support resume.
    d = _readDiseases(startIndex, stopIndex)

    for i in range(steps):
        # Read in a chunk of diseases as a list
        diseaseList = d[i * numberToGet:i * numberToGet + numberToGet]
        diseaseDictionary = {}
        for j in range(len(diseaseList)):
            # Transfer the ordered disease list into an unordered dictionary
            diseaseDictionary[diseaseList[j].keys()[0]] = diseaseList[j].values()[0]

        dictionary = {}
        # Runs through the disease dictionary and gets all the PMIDs for each disease
        diseaseDictionary = getArticleIDs(diseaseDictionary)
        for disease in diseaseDictionary:
            dictionary[disease] = {}
            dictionary[disease]['records'] = []
            dictionary[disease]['description'] = diseaseDictionary[disease]['description']
            dictionary[disease]['records'].extend(getMedlineList(diseaseDictionary[disease]['PMIDs']))
            IOmodule.writeOutTxt(_path_medlinerecords, disease, dictionary[disease], 'w')
def _normalizeVectorLengths(M_dense, M_lil, filename):
    """
    Normalize the length of a sparse matrix, represented as a dense and a
    lil format.
    """
    vectorLength = SearchTermDoc.createRLHash(M_lil, None, False)

    for row in range(1, M_lil.shape[0]):
        norm = vectorLength[row]
        for col in (M_lil.getrow(row).nonzero()[1])[1:]:
            M_dense[row, col] = (M_dense[row, col]) / norm

    tfidfMatrix = sparse.coo_matrix(M_dense)

    # Save the matrix
    IOmodule.writeOutTDM(_termDocDir, filename, tfidfMatrix)
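# A small sketch (not original code) showing the effect of the normalization
# above: after dividing a row vector by its norm, the vector has length 1.
def _example_unit_length():
    import numpy as np
    row = np.array([3.0, 4.0])
    normalized = row / np.linalg.norm(row)
    print np.linalg.norm(normalized)   # 1.0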
def loadMatrix(dirPath, filename):
    t1 = time.time()
    M = IOmodule.readInTDM(dirPath, filename)
    t2 = time.time()
    print str(t2-t1)
    return M
def calculate_model_with_setted_parameter(t_input):
    (s_address_file_model, s_address_file_drug, s_address_file_inputnode,
     s_address_folder_output, i_iteration, i_length, i_count_starting,
     i_resolution_drug, l_s_targets) = t_input

    obj_iterator = Iterator_of_model_simulator(
        update_function,  # imported module
        s_address_file_model,
        s_address_file_drug,
        s_address_file_inputnode,
        i_iteration,
        i_length,
        i_count_starting,
        l_s_targets,
        i_resolution_drug)
    obj_iterator.calculate_model_for_each_drug_concentrations()

    s_file_drug = os.path.split(s_address_file_drug)[1]
    s_drug = os.path.splitext(s_file_drug)[0]
    s_address_folder_output = os.path.join(s_address_folder_output, s_drug)
    IOmodule.make_forder_for_output(s_address_folder_output)
    IOmodule.save_output_as_txtfile(obj_iterator, s_address_folder_output)

    return s_address_file_model
def medlineDir2MatrixDir():
    """
    This function converts a directory of MedLine records to a new directory
    of corresponding term-doc matrices. The matrix dimensions (rows: m,
    columns: n) are derived from the data.

    It creates a directory (in the home folder) named 'diseaseMatrices' and
    stores the matrices as 'MatrixMarket' .mtx files, named by the disease
    name.
    """
    termHashTable = IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable = IOmodule.pickleIn(_hashTablesDir, _pmidHash)

    files = IOmodule.getSortedFilelist(_medlineDir+'/')
    # files = sorted([f for f in os.listdir(_medlineDir+"/") if os.path.isfile(_medlineDir+"/" + f)])

    counter = 0
    for file in files:
        data = _gatherMatrixData(file)

        # Get matrix dimensions (+1 for the [0,0] field)
        ## (Here follows a small 0.0001 sec. hack to get n = total number of terms)
        temp = {}
        for pmid in data:
            for term in pmid[1]:
                temp[term[0]] = 0
        m = len(data) + 1
        n = len(temp) + 1

        M = _populateMatrix(m, n, data, termHashTable, pmidHashTable)
        diseaseName = file[0:file.find('.txt')]
        IOmodule.writeOutTDM(_subMatrixDir, diseaseName, M)
        counter += 1
        print str(counter), "matrices made. Total number of terms:", len(M.getrow(0).nonzero()[0])
def createDiseaseHash(dir, output=False):
    """
    Receives a directory containing files to be hashed, using the filename as
    a key. It requires the files to be in .mtx format.

    The hashes run from 1 ... number_of_files.
    """
    diseaseHashes = {}

    files = IO.getSortedFilelist(dir)

    counter = 0
    for f in files:
        diseaseName = f[0:f.find('.mtx')]
        stdm = IO.readInTDM(dir, f)
        if stdm.shape[0] == 1:
            continue
        if diseaseName not in diseaseHashes:
            counter += 1
            if output:
                print 'Created', diseaseName, 'with hash', counter
            diseaseHashes[diseaseName] = counter

    IO.pickleOut(_hashTablesDir, diseaseHash, "btd", diseaseHashes)
def createTermDoc(refreshHash=False):
    """
    This function creates a large term-doc matrix from a directory of sub
    term-doc matrices.

    It returns a matrix with dimensions given by the specified hash tables.

    It also saves the matrix for later use as a MatrixMarket .mtx file.
    """
    t1 = time.time()

    if refreshHash:
        createTermAndPmidHashes()

    files = IOmodule.getSortedFilelist(_subMatrixDir+'/')
    # files = sorted([f for f in os.listdir(_subMatrixDir+"/") if os.path.isfile(_subMatrixDir+"/" + f)])

    termHashTable = IOmodule.pickleIn(_hashTablesDir, _termHash)
    pmidHashTable = IOmodule.pickleIn(_hashTablesDir, _pmidHash)

    # Need to add one due to non-zero indexing
    m = len(pmidHashTable) + 1
    n = len(termHashTable) + 1

    termDoc = sparse.lil_matrix((m, n))

    # Insert values representing hashes
    for i in range(m):
        termDoc[i, 0] = i
    termDoc[0, :] = range(n)

    for file in files:
        subMatrix = IOmodule.readInTDM(_subMatrixDir, file)
        subMCopy = subMatrix.todok()
        for i, j, v in zip(subMatrix.row, subMatrix.col, subMatrix.data):
            m = subMCopy[i, 0]
            n = subMCopy[0, j]

            # Make sure not to add the index fields
            if m == 0 or n == 0:
                continue

            termDoc[m, n] += v
        print "Added", file

    IOmodule.writeOutTDM(_termDocDir, _termDoc, termDoc)

    t2 = time.time()
    print 'Time elapsed:', str(t2-t1)

    return termDoc
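# A minimal sketch (not from the original code) of the matrix convention used
# throughout these modules: row 0 holds the term hashes, column 0 holds the
# PMID hashes, and cell [0,0] is unused, which is why the loops start at 1.
def _example_termdoc_layout():
    from scipy import sparse
    M = sparse.lil_matrix((3, 3))
    M[0, :] = range(3)      # term hashes along the first row
    M[1, 0] = 1             # PMID hash 1
    M[2, 0] = 2             # PMID hash 2
    M[1, 2] = 5             # term 2 occurs 5 times in document 1
    print M.todense()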
def getSemanticKeywords(top=20):
    diseaseList = [
        ("Fibrodysplasia ossificans progressiva",
         "Boy, normal birth, deformity of both big toes (missing joint), quick development of bone tumor near spine and osteogenesis at biopsy"),
        ("Adrenoleukodystrophy autosomal neonatal form",
         "Normally developed boy age 5, progessive development of talking difficulties, seizures, ataxia, adrenal insufficiency and degeneration of visual and auditory functions"),
        ("Papillon Lefevre syndrome",
         "Boy age 14, yellow keratotic plaques on the skin of palms and soles going up onto the dorsal side. Both hands and feet are affected. swollen vulnerable gums, loss of permanent teeth"),
        ("Kleine Levin Syndrome",
         "Jewish boy age 16, monthly seizures, sleep aggressive and irritable when woken, highly increased sexual appetite and hunger"),
        ("Schinzel Giedion syndrome",
         "Male child, malformations at birth, midfacial retraction with a deep groove under the eyes, and hypertelorism, short nose with a low nasal bridge and large low-set ears, wide mouth and retrognathia. Hypertrichosis with bright reddish hair and a median frontal cutaneous angioma, short neck with redundant skin, Bilateral inguinal hernias, hypospadias with a megameatus, and cryptorchidism")]

    matrixDir = "/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_90"
    #matrixDir = "/root/The_Hive/term_doc/new_diseaseMatrices_stemmed_reduced_10"
    #matrixDir = "/root/The_Hive/term_doc/new_diseaseMatrices_tfidf_stemmed_reduced90_outlierRemoved5"

    scoreDic = {}
    totalTermDic = {}
    for disease in diseaseList:
        filename = disease[0]
        symptoms = disease[1]

        symptoms = SearchTermDoc._modifySearchString(symptoms)
        symptoms = map(FilterInterface.porterStemmer, symptoms)

        M_coo = IOmodule.readInTDM(matrixDir, filename)
        totalTermDic[filename] = (M_coo.shape[1] - 1)
        M_csc = M_coo.tocsc()

        termSum = []
        for col in range(1, M_coo.shape[1]):
            term = revTermHashTable[M_csc[0, col]]
            termSum.append((sum(M_csc.getcol(col).data[:-1]), term))

        termSum.sort()
        termSum.reverse()

        scoreDic[filename] = {}
        for item in termSum:
            if item[1] in symptoms:
                scoreDic[filename][item[1]] = termSum.index(item)
        #return termSum[:top]

    for score in scoreDic.items():
        print "Total number of terms for the disease:", totalTermDic[score[0]]
        print str(score[0]) + '\t' + str(score[1].items())
        print ''
def countFields(directory, fields):
    # files=sorted([f for f in os.listdir(directory) if os.path.isfile(directory+f)])
    files = IOmodule.getSortedFilelist(directory+'/')

    fieldSum = {}
    counter = 0
    pmidCounter = 0
    emptyCounter = 0
    noDescription = 0
    for f in files:
        fd = open(directory+f, 'r')
        diseaseDic = eval(fd.read())
        fd.close()

        medlineRecords = diseaseDic['records']
        descriptionField = diseaseDic['description']

        if descriptionField == '':
            noDescription += 1
        if medlineRecords == []:
            print "Found empty record"
            emptyCounter += 1

        for record in medlineRecords:
            pmidCounter += 1
            for label in fields:
                fieldSum.setdefault(label, 0)
                if label in record:
                    fieldSum[label] += 1

        counter += 1
        print "Files remaining:", (len(files)-counter)

    return fieldSum, {'pmid count': pmidCounter}, {'empty diseases': emptyCounter}
def main():
    """
    Make models from the txt files in the model list folder, and get drug
    information from drug_probability.txt.

    Input the number of iterations (calculations of one trajectory) and the
    length of the trajectory.

    Input the resolution of the drug concentration change (if 10 is entered,
    the cases 0, 0.1, 0.2, ..., 0.9, 1 are calculated).

    Finally, input the node names to observe. This function then writes text
    files of the observed node pattern for each model to the output folder.
    """
    s_address_NSC = os.path.dirname(os.path.realpath(__file__))
    l_parameters = IOmodule.set_config(s_address_NSC)
    p = Pool(l_parameters[6])

    l_list_mutation_files = [
        os.path.join(l_parameters[0], s_filename_mutation)
        for s_filename_mutation in os.listdir(l_parameters[0])
    ]
    l_list_drug_files = [
        os.path.join(l_parameters[3], s_drug_file)
        for s_drug_file in os.listdir(l_parameters[3])
    ]
    s_address_file_inputnode = os.path.join(s_address_NSC, "input_nodes.txt")

    l_list_inputs = [
        (s_address_file_model, s_address_drug_file, s_address_file_inputnode,
         l_parameters[2], l_parameters[7], l_parameters[8], l_parameters[9],
         l_parameters[10], l_parameters[11])
        for s_address_file_model in l_list_mutation_files
        for s_address_drug_file in l_list_drug_files
    ]

    #print(s_address_folder_models, s_address_file_drug, i_iteration, type(i_iteration), l_s_targets)
    l_list_for_check = p.map(calculate_model_with_setted_parameter, l_list_inputs)

    if l_list_mutation_files != l_list_for_check:
        for s_mutation_file in l_list_mutation_files:
            if s_mutation_file not in l_list_for_check:
                print(s_mutation_file + " was not calculated, due to a multiprocessing error!")
def main(fname, output=None):
    print("Input name", fname)
    print("Output name", output)

    tellpath = "/home/jneal/Phd/Codes/Phd-codes/WavelengthCalibration/testfiles/"  # Updated for git repo

    #ext = 0  # for testing
    #hdr, data = Get_DRACS(fname, ext)  # Get_DRACS only finds the relevant file in the path folder
    hdr = fits.getheader(fname)
    #print("Observation time ", hdr[])  # header keyword missing in the original
    data = fits.getdata(fname)

    uncalib_combined = data["Combined"]
    #uncalib_noda = uncalib_data["Nod A"]
    #uncalib_nodb = uncalib_data["Nod B"]
    uncalib_data = [range(1, len(uncalib_combined) + 1), uncalib_combined]

    # Get hdr information and then find the corresponding telluric spectra
    calib_data = IOmodule.read_2col(tellpath + "Telluric_spectra_CRIRES_Chip-" + str(1) + ".txt")

    gf.print_fit_instructions()

    rough_a, rough_b = gf.get_rough_peaks(uncalib_data[0], uncalib_data[1], calib_data[0], calib_data[1])
    rough_x_a = [coord[0] for coord in rough_a]
    rough_x_b = [coord[0] for coord in rough_b]
    good_a, good_b = gf.adv_wavelength_fitting(uncalib_data[0], uncalib_data[1], rough_x_a,
                                               calib_data[0], calib_data[1], rough_x_b)

    wl_map = gf.wavelength_mapping(good_a, good_b)
    calibrated_wl = np.polyval(wl_map, uncalib_data[0])

    plt.plot(calibrated_wl, uncalib_data[1], label="Calibrated spectra")
    plt.plot(calib_data[0], calib_data[1], label="Telluric spectra")
    plt.title("Calibration Output")
    plt.show()
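# A small sketch (not from the original script) of the pixel-to-wavelength
# step above: wavelength_mapping is assumed to return polynomial coefficients
# that np.polyval evaluates at each pixel position. Values are hypothetical.
def _example_wavelength_mapping():
    import numpy as np
    pixels = np.array([100.0, 500.0, 900.0])
    wavelengths = np.array([2112.5, 2118.6, 2122.9])
    coeffs = np.polyfit(pixels, wavelengths, 1)    # linear fit for illustration
    print(np.polyval(coeffs, pixels))              # mapped wavelengths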
def runAndSaveMatrices():
    """
    Transform a directory of matrices to a directory of decomposed matrices.
    """
    files = IOmodule.getSortedFilelist(_oldMatrixDir+'/')

    for file in files:
        M_coo = IOmodule.readInTDM(_oldMatrixDir, file)

        # Make sure the matrix contains information (1-dim. is an empty matrix)
        if M_coo.shape[0] == 1:
            continue

        print "Shape:" + str(M_coo.shape)

        # SVD does not run well on single-dimension matrices
        ## (remembering that the first dimension is indices and does not count)
        if M_coo.shape[0] > 2:
            M_dense = M_coo.todense()

            # Run SVD
            U, Sig, Vt = _svd(M_dense)

            # Get the reduced semantic space
            S = _semanticSpace(U, Sig, Vt, _reduceBy)

            # Recombine the indices and the reduced matrix
            M_dense[1:, 1:] = S.todense()

            # Save the matrix
            M_coo = sparse.coo_matrix(M_dense)
            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
        else:
            print "Dimensionality too low for svd"
            IOmodule.writeOutTDM(_newMatrixDir, file, M_coo)
            print ''
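# A minimal sketch (an assumption, not the original _svd/_semanticSpace code)
# of the rank reduction performed above: keep only the k largest singular
# values and recombine the factors.
def _example_svd_reduction(k=2):
    import numpy as np
    A = np.random.rand(5, 4)                       # toy dense matrix
    U, sig, Vt = np.linalg.svd(A, full_matrices=False)
    A_reduced = np.dot(np.dot(U[:, :k], np.diag(sig[:k])), Vt[:k, :])
    print A_reduced.shape                          # same shape as A, lower effective rank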
# Main folder
_path = os.getenv("HOME") + "/" + "The_Hive"
# Sub folder
_subFolder = _path + "/" + "term_doc"
# Hashtable directory
_hashTablePath = _subFolder + "/" + "hashTables"
# Set True for Porter-stemming
_stemmer = True
# For term document set to True, for DiseaseMatrix set to False.
_termdocumentmatrix = False
# Cosine measure = True, Sum measure = False
_cosineMeasure = False

############

if _termdocumentmatrix:
    # Disease label hash (for pmid lookup)
    _labelHash = IOmodule.pickleIn(_hashTablePath, "labelHash")
    print "Label hash loaded"
else:
    # Disease label hash (for label lookup)
    _diseaseHash = IOmodule.pickleIn(_hashTablePath, "diseaseHash")  #_reduced")
    _labelHash = dict(zip(_diseaseHash.values(), _diseaseHash.keys()))
    print "Disease hash loaded"

############

def search(M_lil, M_csc, queryString, top=20):
    """
    This function is still a work in progress..
    """
    sanitizer = TextCleaner.sanitizeString()
# Term- and PMID-hash directory
_hashTablesDir = _path + '/' + 'term_doc' + '/' + "hashTables"

# If subFolder does not exist, create it
if not os.path.isdir(_path+'/'+subFolder):
    os.mkdir(_path+'/'+subFolder)

_stemmer = True

####################################################################
#### Use stopword-removed TermDoc ##################################
####################################################################
if not _stemmer:
    # Hashes
    termHashTable = IOmodule.pickleIn(_hashTablesDir, "termHash")
    pmidHashTable = IOmodule.pickleIn(_hashTablesDir, "pmidHash")
    revPmidHashTable = dict(zip(pmidHashTable.values(), pmidHashTable.keys()))
####################################################################
#### Use stopword-removed and Porter-stemmed (english) TermDoc: ####
####################################################################
else:
    # Stemmed hashes
    termHashTable = IOmodule.pickleIn(_hashTablesDir, "termHash_stemmed")
    pmidHashTable = IOmodule.pickleIn(_hashTablesDir, "pmidHash_stemmed")
    revPmidHashTable = dict(zip(pmidHashTable.values(), pmidHashTable.keys()))

print "Hashes loaded"
# Load the precomputed length of each column in the stemmed term-doc matrix
#_termSum = IOmodule.pickleIn(_hashTablePath, _CLHash)

#######################################################################################
#### Use stopword-removed, Porter-stemmed (english) and TFIDF-prefiltered TermDoc: ####
#######################################################################################

# TFIDF-matrix file name
_tfidfName = "label_TFIDFMatrix_reduced_90"
# Vector-norm hash for the TFIDF-matrix
#_RLHash = "RLHash_tfidf_stemmed"
# Hash for the number of documents each term occurs in
_CLHash = "svdlabel_CLHash"
# Load the precomputed length of each column in the stemmed term-doc matrix
_termSum = IOmodule.pickleIn(_hashTablePath, _CLHash)

####################################################################

print "Hashes loaded."


def _generateLogTFIDF(M_coo):
    """
    Creates a Term-Frequency Inverse-Document-Frequency matrix from a sparse
    coo_matrix, using log-transformation on TF and IDF.

    Returns a sparse lil_matrix to be used for vector-normalization.
    """
if __name__ == "__main__":
    # Do the test case HD30501-1 Chip-1
    path = "/home/jneal/Phd/Codes/Phd-codes/WavelengthCalibration/testfiles/"  # Updated for git repo
    #global param_nums
    #param_nums = 3
    Dracspath = "/home/jneal/Phd/data/Crires/BDs-DRACS/"
    obj = "HD30501-1"
    objpath = Dracspath + obj + "/"
    chip = 0  # for testing

    hdr, DracsUncalibdata = Get_DRACS(objpath, 0)
    UnCalibdata_comb = DracsUncalibdata["Combined"]
    #UnCalibdata_noda = DracsUncalibdata["Nod A"]
    #UnCalibdata_nodb = DracsUncalibdata["Nod B"]
    UnCalibdata = [range(1, 1025), UnCalibdata_comb]

    Calibdata = IOmodule.read_2col(path + "Telluric_spectra_CRIRES_Chip-" + str(1) + ".txt")
    #print(Calibdata)

    #get_rough_peaks()
    Test_pxl_pos = [58.4375, 189.0625, 583.0, 688.1875, 715.6875, 741.8125, 971.4375]
    Test_wl_pos = [2112.4897175403221, 2114.0333233870965, 2118.5533179435483,
                   2119.748622983871, 2120.0573441532256, 2120.358149395161,
                   2122.9624895161287]
    CoordsA = Test_pxl_pos  # first one for now
    CoordsB = Test_wl_pos

    print_fit_instructions()

    # Comment the line below to skip this and work on the next part
    good_coords_a, good_coords_b = adv_wavelength_fitting(UnCalibdata[0], UnCalibdata[1],
                                                          CoordsA, Calibdata[0], Calibdata[1],
                                                          CoordsB)
def createTermAndPmidHashes():
    """
    This function creates two hash tables of the PMID's and terms to be used
    for the term-doc matrix.

    Note that the terms are sanitized for any non-alphanumerical characters,
    and stop words are removed by default.
    """
    medlineDir = _medlineDir
    hashTables = _hashTablesDir

    termHashTable = {}
    pmidHashTable = {}
    termCounter = 0
    pmidCounter = 0

    files = IOmodule.getSortedFilelist(medlineDir+'/')
    # files = sorted([f for f in os.listdir(medlineDir+"/") if os.path.isfile(medlineDir+"/"+f)])

    # Get the regex pattern that sanitizes strings.
    sanitizer = TextCleaner.sanitizeString()

    for file in files:
        records = RecordHandler.loadMedlineRecords(medlineDir, file)
        # *Note*
        # Parts of the following loops could be optimized by using dictionaries
        # for direct lookups instead of linear lookups, but since it's not
        # important, optimization will have to wait for another day.

        # Hash PMID's
        for diseaseRecords in records.values():
            for record in diseaseRecords:
                pmid = record[0]
                if pmid not in pmidHashTable:
                    pmidCounter += 1
                    pmidHashTable[pmid] = pmidCounter

                information = ''
                # Get the abstract
                try:
                    information = ' ' + record[1]['AB']
                except:
                    print 'Unable to get abstract', record[0]
                try:
                    information += ' ' + record[1]['TI']
                except:
                    print 'Unable to get title for', record[0]
                if 'MH' in record[1]:
                    for meshterm in record[1]['MH']:
                        information += ' ' + meshterm
                # We do not want to print this, as most of the
                # records do not have MeSH.
                # print 'Unable to get MeSH terms for', record[0]

                # Sanitize the information
                information = sanitizer.sub(' ', information)

                # Remove stopwords from the abstract
                information = FilterInterface.stopwordRemover(information)

                # OPTIONAL:
                # Stem the abstract
                if _stemmer:
                    information = FilterInterface.porterStemmer(information)

                termList = [word for word in information.split(' ') if word != '']
                for term in termList:
                    if term not in termHashTable:
                        termCounter += 1
                        termHashTable[term] = termCounter
                    else:
                        continue

    print str(termCounter) + " terms hashed. " + str(pmidCounter) + " pmids hashed."
    IOmodule.pickleOut(hashTables, _termHash, "btd", termHashTable)
    IOmodule.pickleOut(hashTables, _pmidHash, "btd", pmidHashTable)

    return termHashTable, pmidHashTable
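# A toy illustration (not part of the original module) of the hashing scheme
# above: every previously unseen term gets the next integer as its index.
def _example_term_hash():
    termHashTable = {}
    termCounter = 0
    for term in ['ataxia', 'seizure', 'ataxia']:
        if term not in termHashTable:
            termCounter += 1
            termHashTable[term] = termCounter
    print termHashTable    # {'ataxia': 1, 'seizure': 2}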
def fetchPubmedDiseaseTerms(pages):
    """
    Takes a URL-list of pages to crawl for pubmed terms, uids and optional
    descriptions.

    Returns a dictionary on the form:
    {DiseaseName:{db='',terms:'',syn=[],uid:'',desc:''}}
    """
    pubmedURLs = {}
    printvar = 0
    pagenumber = 0
    desccounter = 0

    for page in pages:
        pagenumber += 1

        # Open the page, retrying up to three times.
        for i in range(3):
            try:
                c = urllib2.urlopen(page)
                break
            except:
                print "Could not open %s" % page
                print "Attempt", str(i+1), "out of 3"
                sleep(5)
                if i == 2:
                    print "Could not open page. Terminating.."
                    raise StopIteration()

        try:
            soup = BeautifulSoup(c.read())
        except HTMLParseError:
            print 'Experienced difficulties opening %s' % page
            IOmodule.writeOutTxt(_subFolder+"/"+diseaseFolder+'/'+errorFolder, strftime('%H%M%S'), page)
            continue

        # Get disease name.
        title = soup.html.head.title.string

        # Some pages are 'officially' not working. Catch them here.
        if title == 'NIH Office of Rare Diseases Research (ORDR) - Error':
            IOmodule.writeOutTxt(_subFolder+"/"+diseaseFolder+'/'+errorFolder, 'Page error'+strftime('%H%M%S'), page)
            print 'Page Error on %s' % page
            continue

        # Allocate dictionary.
        pubmedURLs[title] = {}
        pubmedURLs[title]['db'] = 'pubmed'  # ..database to search in (pubmed by default)
        pubmedURLs[title]['terms'] = ''     # ..handcrafted search term
        pubmedURLs[title]['syn'] = []       # ..disease synonyms
        pubmedURLs[title]['uid'] = ''       # ..search id
        pubmedURLs[title]['desc'] = ''      # ..optional disease description

        # Check for PubMed direct links.
        links = soup('a')
        found = False
        for link in links:
            if (link.contents):
                if ((link.contents[0] == 'PubMed')) & ('href' in dict(link.attrs)):
                    urlString = link['href'].lower()
                    # If there is a PubMed direct link and it's an id:
                    if ('uid=' in urlString) | ('uids=' in urlString) | ('idsfromresult=' in urlString):
                        tokens = _parseURL(urlString)
                        uid = tokens['from_uid']
                        pubmedURLs[title]['uid'] = uid
                        pubmedURLs[title]['db'] = tokens['db']
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title
                        continue
                    # If there is a PubMed direct link and it's a handcrafted term:
                    elif ('term=' in urlString):
                        tokens = _parseURL(urlString)
                        terms = tokens['term']
                        pubmedURLs[title]['terms'] = terms
                        pubmedURLs[title]['db'] = tokens['db']
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title
                        continue
                    # Special case 1: If there is a PubMed direct link but the uid is not part of the tokens.
                    elif ('/entrez/' not in urlString):
                        start = urlString.find('/pubmed/') + 8
                        if '?' in urlString:
                            end = urlString.find('?')
                            uid = urlString[start:end]
                        else:
                            uid = urlString[start:]
                        print uid
                        pubmedURLs[title]['uid'] = uid
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title, '. (Special case 1: No tokens)'
                    # Special case 2: If there is a webenv, the url is (by experience) not working
                    # but the disease name is still valuable for a pubmed search.
                    elif '&webenv=' in urlString:
                        printvar += 1
                        found = True
                        print 'Found', str(printvar), 'PubMed terms/uids.', title, '. (Special case 2: WebEnv)'

            # Terminate if an unexpected url shows up
            if link.contents:
                if (not found) & (link.contents[0] == 'PubMed'):
                    print 'Could not fetch url'
                    raise StopIteration()

        # A simple addition to the printouts.
        if not found:
            printvar += 1
            print 'Found', str(printvar), 'Diseases.', title, '(no uid or term).'

        # Notify if an unexpected database shows up.
        if ((pubmedURLs[title]['db'] != '') & (pubmedURLs[title]['db'] != 'omim') & (pubmedURLs[title]['db'] != 'pubmed')):
            print "*****Found different db:", pubmedURLs[title]['db']
            print 'Could not fetch url'
            raise StopIteration()

        # Disease synonyms are also added to the term list.
        lis = soup('li')
        for li in lis:
            if ('synonym' in str(li.parent)):
                synonym = li.contents[0]
                if (',') in str(synonym):
                    additionalSynonyms = synonym.split(',')
                    for syn in additionalSynonyms:
                        pubmedURLs[title]['syn'].append(syn)
                        print ' ' + syn
                else:
                    pubmedURLs[title]['syn'].append(synonym)
                    print ' ' + synonym

        # Look for an optional disease description on rarediseases.info.nih.gov.
        descs = soup('span')
        for desc in descs:
            if ('id' in dict(desc.attrs)):
                idString = desc['id'].lower()
                if (('descriptionquestion' in idString) & ('#003366' not in str(desc))):
                    desc = _cleanString(str(desc))
                    pubmedURLs[title]['desc'] = desc
                    desccounter += 1
                    print ' *Found optional disease description'
        print ''

        if ((pagenumber % 20) == 0):
            # Print status report
            print '*****************************************************'
            print 'Total pages looked in:', str(pagenumber), '\nPages found:', str(printvar), '\nRemaining in total:', (len(pages)-printvar), 'out of', len(pages), '\nDescriptions found:', str(desccounter)
            print '*****************************************************'
            print 'Writing to files...'

            # Write out and flush dictionary
            for disease in pubmedURLs:
                # Remove some problematic tokens from the file name
                content = pubmedURLs[disease]
                disease = removeSlashes.sub(' ', disease)
                disease = removeCommas.sub(' ', disease)
                # Write out
                IOmodule.writeOutTxt(diseaseFolder, disease, content)
            pubmedURLs = {}
            print 'Wrote successfully. Dictionary flushed.'
def constructDiseaseMatrix(subMatrixDir, avg=False, output=False, time_log=False):
    """
    Receives a subMatrixDir, goes through all the files and sums up the
    columns, creating a single row vector containing the sum of all columns
    in the sub term-doc matrix. It then proceeds to making a disease term-doc
    matrix based on these row vectors.

    Optional flags are:
    avg, takes the average over the columns of the sub matrices instead of the sum.
    output, makes the function produce additional output.
    time_log, makes the function print out how much time is spent on what.
    """
    if output:
        print 'Initialising...'
    if time_log:
        t1 = time.time()

    files = IO.getSortedFilelist(subMatrixDir)
    termHashTable = IO.pickleIn(_hashTablesDir, _termHash)
    diseaseHashTable = IO.pickleIn(_hashTablesDir, diseaseHash)

    diseaseMatrix = sparse.lil_matrix((len(files)+1, len(termHashTable)+1))

    # Initialize subTermSum to something
    subTermSum = sparse.lil_matrix((1, 1))

    if output:
        print 'Done initialising disease matrix of size', str((len(files)+1, len(termHashTable)+1))

    count = 0
    if time_log:
        print 'Time for initialization:', str(time.time() - t1)[:4]

    for f in files:
        if time_log:
            t2 = time.time()
        diseaseName = f[0:f.find('.mtx')]
        if output:
            print 'Processing', diseaseName
            count += 1
            print 'Numbers remaining', len(files)-count

        subTermDoc = IO.readInTDM(subMatrixDir, diseaseName)
        subTermDoc = subTermDoc.tolil()

        # If the subTermDoc contains nothing, just skip it
        if (subTermDoc.shape[0] == 1 and subTermDoc.shape[1] == 1):
            continue

        subTermSum = getColumnSum(subTermDoc, avg)
        subTermSum[0, 0] = diseaseHashTable[diseaseName]
        subTermSum[0, :] = subTermDoc.getrow(0)

        diseaseMatrix[diseaseHashTable[diseaseName], 0] = diseaseHashTable[diseaseName]

        if time_log:
            print 'Time for', diseaseName, str(time.time() - t2)[:4]
            t3 = time.time()
        if output:
            print 'Filling in values in disease matrix for', diseaseName

        for columnIndex in range(1, subTermSum.shape[1]):
            diseaseMatrix[diseaseHashTable[diseaseName], subTermSum[0, columnIndex]] = subTermSum[1, columnIndex]

        if time_log:
            print 'Values filled into disease matrix in', str(time.time() - t3)[:4]
        if output:
            print 'Completed filling in values.'

    # Hack way of making term hashes
    diseaseMatrix[0, :] = range(0, len(termHashTable))

    if output:
        print 'Done making disease matrix, writing to'
    IO.writeOutTDM(_termDocDir, label, diseaseMatrix)
    if output:
        print 'Done writing disease matrix.'

    return diseaseMatrix
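# getColumnSum is not shown in this section; a minimal sketch of what it is
# assumed to do: sum (or average) each column of the data block of a sub
# term-doc matrix, ignoring the hash row/column that the caller handles
# separately. The toy matrix below is hypothetical.
def _example_column_sum(avg=False):
    from scipy import sparse
    M = sparse.lil_matrix([[0, 101, 102],
                           [1, 2.0, 0.0],
                           [2, 4.0, 6.0]])
    sums = M.tocsc()[1:, 1:].sum(axis=0)     # column sums over the data block
    if avg:
        sums = sums / float(M.shape[0] - 1)
    print sums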