def searchDisease(M_lil, M_csc, queryString, top=20):
    """
    Rank labels for a free-text query. This function is still a work in
    progress.

    The query is sanitized (and stemmed when the module-level `_stemmer`
    flag is set), scored with SearchInterface.sumMeasure, and the `top`
    highest-scoring hits are mapped to their label through `_labelHash`.

    Parameters:
        M_lil, M_csc -- passed unchanged to SearchInterface.sumMeasure
                        (presumably the same matrix in LIL and CSC sparse
                        formats -- TODO confirm).
        queryString  -- the raw query text.
        top          -- number of best-scoring hits to keep (default 20).

    Returns:
        A list of (label, score) tuples sorted by score, highest first
        (ties are ordered by label, also descending). It holds at most one
        entry per retained hit, so never more than `top` entries.
    """
    # Replace everything the sanitizer matches with a single space.
    sanitizer = TextCleaner.sanitizeString()
    queryString = sanitizer.sub(' ', queryString)

    # OPTIONAL: stem the sanitized query.
    if _stemmer:
        queryString = FilterInterface.porterStemmer(queryString)

    # NOTE(review): the original code branched on `_cosineMeasure` here but
    # called sumMeasure in both arms, so the flag never had any effect in
    # this function. The behaviour is kept; confirm whether the cosine
    # heuristic was meant to be selectable here as it is in search().
    results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)

    # Highest score first.
    results.sort(reverse=True)

    # Each hit is indexed as item[0] = score, item[1] = pmid.
    # NOTE(review): hits are visited from best to worst, so when several
    # hits share a label the label ends up with the LOWEST of their scores.
    # Kept as in the original -- confirm this is intended.
    resultDic = {}
    for item in results[:top]:
        resultDic[_labelHash[item[1]]] = item[0]

    # A single-argument key function replaces the tuple-parameter lambda,
    # which only parses on Python 2. The result is already bounded by `top`
    # through the slice above, so no hard-coded [:20] cut is applied.
    return sorted(resultDic.items(),
                  key=lambda pair: (pair[1], pair[0]),
                  reverse=True)
def search(M_lil, M_csc, queryString, top=20):
    """
    Rank labels for a free-text query using three aggregation schemes.
    This function is still a work in progress.

    The query is sanitized (and stemmed when the module-level `_stemmer`
    flag is set) and scored with SearchInterface.cosineMeasure or
    SearchInterface.sumMeasure, depending on the module-level
    `_cosineMeasure` flag. The scores of the `top` best hits are then
    grouped by label; `_labelHash[pmid]` is iterated, so one hit can
    contribute to several labels.

    Parameters:
        M_lil, M_csc -- passed unchanged to the SearchInterface heuristic
                        (presumably the same matrix in LIL and CSC sparse
                        formats -- TODO confirm).
        queryString  -- the raw query text.
        top          -- number of best-scoring hits to aggregate
                        (default 20).

    Returns:
        [resultList_mean, resultList_median, resultList_max], where each
        element is a list of (label, score) tuples sorted by score, highest
        first (ties are ordered by label, also descending):
          * resultList_mean   -- the SUM of the label's scores. Despite the
                                 name, no division by the number of scores
                                 takes place (normalisation is disabled).
          * resultList_median -- for more than two scores, the element at
                                 index n // 2 of the ascending-sorted
                                 scores; for one or two scores, the
                                 smallest score.
          * resultList_max    -- the largest of the label's scores.
    """
    # Replace everything the sanitizer matches with a single space.
    sanitizer = TextCleaner.sanitizeString()
    queryString = sanitizer.sub(' ', queryString)

    # OPTIONAL: stem the sanitized query.
    if _stemmer:
        queryString = FilterInterface.porterStemmer(queryString)

    # CHOOSE HEURISTIC: search-heuristic used to retrieve the results.
    if _cosineMeasure:
        results = SearchInterface.cosineMeasure(M_lil, M_csc, queryString)
    else:
        results = SearchInterface.sumMeasure(M_lil, M_csc, queryString)

    # Highest score first.
    results.sort(reverse=True)

    # Group the scores of the best hits by label in a single pass. Each hit
    # is indexed as item[0] = score, item[1] = pmid. setdefault replaces the
    # former bare `except:` initialisation, which would also have hidden
    # unrelated errors.
    scoresByLabel = {}
    for item in results[:top]:
        score = item[0]
        for label in _labelHash[item[1]]:
            scoresByLabel.setdefault(label, []).append(score)

    sumDic = {}
    medianDic = {}
    maxDic = {}
    for label, scores in scoresByLabel.items():
        # 1: "Mean" -- accumulated in hit order, starting from the first
        # score, exactly as the original running `+=` did.
        sumDic[label] = sum(scores[1:], scores[0])

        # 2: Median -- floor division keeps the index an integer on
        # Python 3 as well as Python 2.
        numOfScores = len(scores)
        if numOfScores > 2:
            medianIndex = numOfScores // 2
        else:
            medianIndex = 0
        medianDic[label] = sorted(scores)[medianIndex]

        # 3: Max
        maxDic[label] = max(scores)

    def byScoreThenLabel(pair):
        # pair is a (label, score) item; order by score first, then label.
        return (pair[1], pair[0])

    resultList_mean = sorted(sumDic.items(),
                             key=byScoreThenLabel, reverse=True)
    resultList_median = sorted(medianDic.items(),
                               key=byScoreThenLabel, reverse=True)
    resultList_max = sorted(maxDic.items(),
                            key=byScoreThenLabel, reverse=True)

    return [resultList_mean, resultList_median, resultList_max]