def collectExpansionWords(titleFile, snipFile):
    """Count candidate expansion words per query and per feedback document.

    Both files are read line by line.  Blank lines are skipped and every other
    line is split on tabs.  A line whose first field consists only of digits
    opens a query section (in the title file only when that id has not been
    seen yet); any other line is treated as one feedback document whose text
    is the second tab-separated field.  Documents are numbered from 0 inside
    each query section.

    Words are kept when they are non-empty, not purely numeric, and accepted
    by ``support.isNotStopWords``; they are stored lower-cased.

    Args:
        titleFile: path of the first input file (document titles, going by
            the parameter name).
        snipFile: path of the second input file (document snippets, going by
            the parameter name).

    Returns:
        dict: queryId (str) -> lower-cased word -> feedbackDocId (int) ->
        number of occurrences of the word in that document, summed over both
        files.
    """
    stopWordsDic = support.loadStopWord()
    expansionWordsCollect = {}

    def tallyDocument(text, queryId, feedbackDocId):
        # Add every usable word of one document line to the query's counters.
        wordsOfQuery = expansionWordsCollect[queryId]
        for word in support.removePunctuation(text).strip().split(' '):
            if word == '' or word.isdigit():
                continue
            wordLower = word.lower()
            if not support.isNotStopWords(wordLower, stopWordsDic):
                continue
            docCounts = wordsOfQuery.setdefault(wordLower, {})
            docCounts[feedbackDocId] = docCounts.get(feedbackDocId, 0) + 1

    # Open both files up front (as the original did) so a missing file fails
    # before any work is done; ``with`` guarantees both handles get closed.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            stripped = line.strip()
            if stripped == '':
                continue
            lineArr = stripped.split('\t')
            if lineArr[0].isdigit() and lineArr[0] not in expansionWordsCollect:
                queryId = lineArr[0]
                expansionWordsCollect[queryId] = {}
                feedbackDocId = 0
            else:
                tallyDocument(lineArr[1], queryId, feedbackDocId)
                feedbackDocId += 1

        for line in fpSnip:
            stripped = line.strip()
            if stripped == '':
                continue
            lineArr = stripped.split('\t')
            if lineArr[0].isdigit():
                queryId = lineArr[0]
                # Tolerate a query that only appears in the snippet file
                # instead of failing later with a KeyError.
                expansionWordsCollect.setdefault(queryId, {})
                feedbackDocId = 0
            else:
                tallyDocument(lineArr[1], queryId, feedbackDocId)
                feedbackDocId += 1

    return expansionWordsCollect
def collectExpansionMeshTerms(titleFile, snipFile, hasSynonym):
    """Collect the MeSH terms that occur in each query's feedback documents.

    The term list comes from ``support.loadMeshTerms('disease')``; when
    ``hasSynonym`` equals the string ``'hasSynonym'`` it is extended through
    ``loadMeshTermsSynonym`` and ``meshTermDictAddSynonym``.

    Both files are read line by line.  Blank lines are skipped and every other
    line is split on tabs.  A line whose first field consists only of digits
    opens a query section (in the title file only when that id has not been
    seen yet); any other line is a document whose text is the second field.
    The text is cleaned with ``support.removePunctuation`` and
    ``support.sentenceLower`` and every term found in it is recorded.

    NOTE: matching is a plain substring test, so a term also matches inside a
    longer word.

    Args:
        titleFile: path of the first input file.
        snipFile: path of the second input file.
        hasSynonym: pass ``'hasSynonym'`` to add synonyms to the term list.

    Returns:
        dict: queryId (str) -> MeSH term -> 0.  The value is always 0; only
        the presence of the key carries information.
    """
    # Terms that are never reported even when they occur in the text.
    stopWordMeshTermDict = {'pain', 'disease'}
    meshTermsDict = support.loadMeshTerms('disease')
    if hasSynonym == 'hasSynonym':
        meshTermSynonymDict = loadMeshTermsSynonym()
        meshTermsDict = meshTermDictAddSynonym(meshTermsDict,
                                               meshTermSynonymDict)
    expansionMeshWordsCollect = {}

    def recordTerms(text, queryId):
        # Mark every not-yet-seen, non-excluded term contained in one document.
        sentence = support.sentenceLower(support.removePunctuation(text))
        termsOfQuery = expansionMeshWordsCollect[queryId]
        for meshTerm in meshTermsDict:
            if meshTerm in termsOfQuery or meshTerm in stopWordMeshTermDict:
                continue
            if sentence.find(meshTerm) != -1:
                termsOfQuery[meshTerm] = 0

    # ``with`` closes both handles; the original never closed them.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            stripped = line.strip()
            if stripped == '':
                continue
            lineArr = stripped.split('\t')
            if (lineArr[0].isdigit()
                    and lineArr[0] not in expansionMeshWordsCollect):
                queryId = lineArr[0]
                expansionMeshWordsCollect[queryId] = {}
            else:
                recordTerms(lineArr[1], queryId)

        for line in fpSnip:
            stripped = line.strip()
            if stripped == '':
                continue
            lineArr = stripped.split('\t')
            if lineArr[0].isdigit():
                queryId = lineArr[0]
                # A query present only in the snippet file gets its own entry
                # rather than triggering a KeyError.
                expansionMeshWordsCollect.setdefault(queryId, {})
            else:
                recordTerms(lineArr[1], queryId)

    return expansionMeshWordsCollect
def loadMeshTermsSynonym():
    """Load the MeSH synonym table from the hard-coded MeSHWords.txt file.

    Each usable line has the form ``term:entry0|entry1|entry2...``.  The part
    before the first ':' is the term; the part after it is split on '|' after
    leading/trailing '|' characters are removed.  The first entry is skipped
    and every following entry is stored as a synonym.  Term and synonyms are
    passed through ``support.removePunctuation``.

    Lines that contain no ':' (for example blank lines) are ignored.

    Returns:
        dict: term -> {synonym: 1}.  A term without synonyms maps to an empty
        dict.
    """
    meshTermSynonymFile = 'I:\\bibm2016\\experiments\\mesh\\MeSHWords.txt'
    meshTermSynonymDict = {}
    # ``with`` closes the handle; the original left the file open.
    with open(meshTermSynonymFile) as fp:
        for line in fp:
            lineArr = line.strip().split(':')
            if len(lineArr) < 2:
                # No ':' separator -> nothing to parse (the original raised
                # IndexError here).
                continue
            meshTerm = support.removePunctuation(lineArr[0])
            synonymsOfTerm = meshTermSynonymDict.setdefault(meshTerm, {})
            entries = lineArr[1].strip('|').split('|')
            # entries[0] is deliberately skipped, exactly as the original
            # index loop started at 1.
            for rawSynonym in entries[1:]:
                synonymsOfTerm[support.removePunctuation(rawSynonym)] = 1
    return meshTermSynonymDict
def collectExpansionWords(titleFile, snipFile):
    """Count candidate expansion words per query over both input files.

    Both files are read line by line.  Blank lines are skipped and every other
    line is split on tabs.  A line whose first field consists only of digits
    opens a query section (in the title file only when that id has not been
    seen yet); any other line is a document whose text is the second
    tab-separated field.

    Words are kept when they are non-empty, not purely numeric, and accepted
    by ``support.isNotStopWords``; they are stored lower-cased.

    Args:
        titleFile: path of the first input file.
        snipFile: path of the second input file.

    Returns:
        dict: queryId (str) -> lower-cased word -> total number of
        occurrences across all documents of that query in both files.
    """
    stopWordsDic = support.loadStopWord()
    expansionWordsCollect = {}

    def tallyDocument(text, queryId):
        # Add every usable word of one document line to the query's counters.
        wordCounts = expansionWordsCollect[queryId]
        for word in support.removePunctuation(text).strip().split(' '):
            if word == '' or word.isdigit():
                continue
            wordLower = word.lower()
            if support.isNotStopWords(wordLower, stopWordsDic):
                wordCounts[wordLower] = wordCounts.get(wordLower, 0) + 1

    # ``with`` closes both handles; the original never closed them.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            stripped = line.strip()
            if stripped == '':
                continue
            lineArr = stripped.split('\t')
            if lineArr[0].isdigit() and lineArr[0] not in expansionWordsCollect:
                queryId = lineArr[0]
                expansionWordsCollect[queryId] = {}
            else:
                tallyDocument(lineArr[1], queryId)

        for line in fpSnip:
            stripped = line.strip()
            if stripped == '':
                continue
            lineArr = stripped.split('\t')
            if lineArr[0].isdigit():
                queryId = lineArr[0]
                # A query present only in the snippet file gets its own entry
                # rather than triggering a KeyError.
                expansionWordsCollect.setdefault(queryId, {})
            else:
                tallyDocument(lineArr[1], queryId)

    return expansionWordsCollect
def loadMeshTermsSynonym():
    """Read MeSH terms and their synonyms from the fixed MeSHWords.txt path.

    A usable line looks like ``term:entry0|entry1|...``.  Everything before
    the first ':' is the term.  The text after it has surrounding '|'
    characters stripped and is split on '|'; the first entry is skipped and
    the remaining ones are registered as synonyms.  Both the term and the
    synonyms are normalised with ``support.removePunctuation``.

    Lines without a ':' separator (such as blank lines) are skipped.

    Returns:
        dict: term -> {synonym: 1}; terms with no synonym map to ``{}``.
    """
    meshTermSynonymFile = 'I:\\bibm2016\\experiments\\mesh\\MeSHWords.txt'
    meshTermSynonymDict = {}
    # The context manager closes the file; the original leaked the handle.
    with open(meshTermSynonymFile) as fp:
        for line in fp:
            fields = line.strip().split(':')
            if len(fields) < 2:
                # The original raised IndexError on such lines.
                continue
            meshTerm = support.removePunctuation(fields[0])
            if meshTerm not in meshTermSynonymDict:
                meshTermSynonymDict[meshTerm] = {}
            synonymEntries = fields[1].strip('|').split('|')
            # Index 0 is intentionally left out, matching the original loop
            # that started at index 1.
            for rawSynonym in synonymEntries[1:]:
                synonym = support.removePunctuation(rawSynonym)
                meshTermSynonymDict[meshTerm][synonym] = 1
    return meshTermSynonymDict
def collectExpansionMeshTerms(titleFile, snipFile):
    """Collect the MeSH terms that occur in each query's feedback documents.

    The term list comes from ``support.loadMeshTerms('all')``; no terms are
    excluded and no synonyms are added in this variant.

    Both files are read line by line.  Blank lines are skipped and every other
    line is split on tabs.  A line whose first field consists only of digits
    opens a query section (in the title file only when that id has not been
    seen yet); any other line is a document whose text is the second field.
    The text is cleaned with ``support.removePunctuation`` and
    ``support.sentenceLower`` and every term found in it is recorded.

    NOTE: matching is a plain substring test, so a term also matches inside a
    longer word.

    Args:
        titleFile: path of the first input file.
        snipFile: path of the second input file.

    Returns:
        dict: queryId (str) -> MeSH term -> 0.  The value is always 0; only
        the presence of the key carries information.
    """
    # Exclusion list, currently empty (kept as a dict like the original).
    stopWordMeshTermDict = {}
    meshTermsDict = support.loadMeshTerms('all')
    expansionMeshWordsCollect = {}

    def recordTerms(text, queryId):
        # Mark every not-yet-seen, non-excluded term contained in one document.
        sentence = support.sentenceLower(support.removePunctuation(text))
        termsOfQuery = expansionMeshWordsCollect[queryId]
        for meshTerm in meshTermsDict:
            if meshTerm in termsOfQuery or meshTerm in stopWordMeshTermDict:
                continue
            if sentence.find(meshTerm) != -1:
                termsOfQuery[meshTerm] = 0

    # ``with`` closes both handles; the original never closed them.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            stripped = line.strip()
            if stripped == '':
                continue
            lineArr = stripped.split('\t')
            if (lineArr[0].isdigit()
                    and lineArr[0] not in expansionMeshWordsCollect):
                queryId = lineArr[0]
                expansionMeshWordsCollect[queryId] = {}
            else:
                recordTerms(lineArr[1], queryId)

        for line in fpSnip:
            stripped = line.strip()
            if stripped == '':
                continue
            lineArr = stripped.split('\t')
            if lineArr[0].isdigit():
                queryId = lineArr[0]
                # A query present only in the snippet file gets its own entry
                # rather than triggering a KeyError.
                expansionMeshWordsCollect.setdefault(queryId, {})
            else:
                recordTerms(lineArr[1], queryId)

    return expansionMeshWordsCollect
def findDiagnosis(expansionWordsDashboard):
    """Print the topics whose diagnosis text is one of their expansion terms.

    Every ``<diagnosis>...</diagnosis>`` element of the hard-coded topic file
    is extracted in file order and numbered consecutively starting at 11.  The
    text is normalised with ``support.removePunctuation`` and
    ``support.sentenceLower``.  For each topic whose normalised diagnosis is a
    key of ``expansionWordsDashboard[queryId]``, one line is printed with the
    query id, the diagnosis, the dashboard value and the entry of
    ``loadMeshTreeCode()`` for that diagnosis.

    Args:
        expansionWordsDashboard: mapping queryId (str) -> mapping keyed by
            expansion term.  Topics missing from it are skipped.

    Returns:
        Always 0.

    Raises:
        KeyError: if a matched diagnosis has no entry in the tree-code table.
    """
    import re

    meshTreeCodeDict = loadMeshTreeCode()
    # Read the whole topic file and close it right away (the original leaked
    # the handle).
    with open('I:\\trec2015\\2015bquery18.txt') as fp:
        txt = fp.read()

    # (queryId, diagnosis) pairs in file order, so the report is printed in
    # topic order instead of arbitrary dict order.
    diagnoses = []
    nextQueryNumber = 11
    for match in re.finditer(r"<diagnosis[\s\S]*?<\/diagnosis>", txt):
        diagnosisTxt = match.group()
        diagnosisTxt = diagnosisTxt.replace('<diagnosis>', '')
        diagnosisTxt = diagnosisTxt.replace('</diagnosis>', '')
        diagnosisTxt = support.removePunctuation(diagnosisTxt)
        diagnosisTxt = support.sentenceLower(diagnosisTxt)
        diagnoses.append((str(nextQueryNumber), diagnosisTxt))
        nextQueryNumber += 1

    for queryId, diagnosisTxt in diagnoses:
        if queryId not in expansionWordsDashboard:
            continue
        termsOfQuery = expansionWordsDashboard[queryId]
        if diagnosisTxt in termsOfQuery:
            # Single pre-formatted argument: prints the same space-separated
            # line under the Python 2 print statement.
            print('%s %s %s %s' % (queryId, diagnosisTxt,
                                   termsOfQuery[diagnosisTxt],
                                   meshTreeCodeDict[diagnosisTxt]))
    return 0
def findDiagnosis(expansionWordsDashboard, MeshTreeField):
    """Print the topics whose diagnosis text is one of their expansion terms.

    Every ``<diagnosis>...</diagnosis>`` element of the hard-coded topic file
    is extracted in file order and numbered consecutively starting at 11.  The
    text is normalised with ``support.removePunctuation`` and
    ``support.sentenceLower``.  For each topic whose normalised diagnosis is a
    key of ``expansionWordsDashboard[queryId]``, one line is printed with the
    query id, the diagnosis, the dashboard value and the entry of
    ``loadMeshTreeCode(MeshTreeField)`` for that diagnosis.

    Args:
        expansionWordsDashboard: mapping queryId (str) -> mapping keyed by
            expansion term.  Topics missing from it are skipped.
        MeshTreeField: passed unchanged to ``loadMeshTreeCode``.

    Returns:
        Always 0.

    Raises:
        KeyError: if a matched diagnosis has no entry in the tree-code table.
    """
    import re

    meshTreeCodeDict = loadMeshTreeCode(MeshTreeField)
    # Read the whole topic file and close it right away (the original leaked
    # the handle).
    with open('I:\\trec2015\\2015bquery18.txt') as fp:
        txt = fp.read()

    # (queryId, diagnosis) pairs in file order, so the report is printed in
    # topic order instead of arbitrary dict order.
    diagnoses = []
    nextQueryNumber = 11
    for match in re.finditer(r"<diagnosis[\s\S]*?<\/diagnosis>", txt):
        diagnosisTxt = match.group()
        diagnosisTxt = diagnosisTxt.replace('<diagnosis>', '')
        diagnosisTxt = diagnosisTxt.replace('</diagnosis>', '')
        diagnosisTxt = support.removePunctuation(diagnosisTxt)
        diagnosisTxt = support.sentenceLower(diagnosisTxt)
        diagnoses.append((str(nextQueryNumber), diagnosisTxt))
        nextQueryNumber += 1

    for queryId, diagnosisTxt in diagnoses:
        if queryId not in expansionWordsDashboard:
            continue
        termsOfQuery = expansionWordsDashboard[queryId]
        if diagnosisTxt in termsOfQuery:
            # Single pre-formatted argument: prints the same space-separated
            # line under the Python 2 print statement.
            print('%s %s %s %s' % (queryId, diagnosisTxt,
                                   termsOfQuery[diagnosisTxt],
                                   meshTreeCodeDict[diagnosisTxt]))
    return 0
def collectExpansionMeshTerms(titleFile, snipFile, hasSynonym, MeshTreeField):
    """Count MeSH term occurrences per query and per feedback document.

    The term list comes from ``support.loadMeshTerms(MeshTreeField)``; when
    ``hasSynonym`` equals the string ``'hasSynonym'`` it is extended through
    ``loadMeshTermsSynonym`` and ``meshTermDictAddSynonym``.

    Both files are read line by line.  Blank lines are skipped and every other
    line is split on tabs.  A line whose first field consists only of digits
    opens a query section (in the title file only when that id has not been
    seen yet); any other line is one feedback document whose text is the
    second field.  Documents are numbered from 0 inside each query section.

    Matching slides a window over the document's words.  The window is as
    many words wide as the term has space-separated parts, and each window
    that contains the term adds 1 to the count.  Unlike the original, the
    final window is included, so a term ending at the last word of a document
    is now counted.

    A running line counter is printed as progress output.

    Args:
        titleFile: path of the first input file.
        snipFile: path of the second input file.
        hasSynonym: pass ``'hasSynonym'`` to add synonyms to the term list.
        MeshTreeField: passed unchanged to ``support.loadMeshTerms``.

    Returns:
        dict: queryId (str) -> MeSH term -> feedbackDocId (int) -> number of
        matching windows in that document, summed over both files.
    """
    meshTermsDict = support.loadMeshTerms(MeshTreeField)
    if hasSynonym == 'hasSynonym':
        meshTermSynonymDict = loadMeshTermsSynonym()
        meshTermsDict = meshTermDictAddSynonym(meshTermsDict,
                                               meshTermSynonymDict)
    print('load meshTermsDict done')
    expansionMeshWordsCollect = {}

    def tallyDocument(text, queryId, feedbackDocId):
        # Count, for every term, the word windows of one document containing it.
        sentence = support.sentenceLower(support.removePunctuation(text))
        words = sentence.strip().split(' ')
        termsOfQuery = expansionMeshWordsCollect[queryId]
        # The windows depend only on the term's word count, so build them once
        # per width instead of once per term.
        windowsByLength = {}
        for meshTerm in meshTermsDict:
            termLength = len(meshTerm.split(' '))
            if termLength not in windowsByLength:
                # "+ 1" includes the final window; the original stopped one
                # window early.  Each window keeps the trailing space the
                # original concatenation produced.
                windowsByLength[termLength] = [
                    ' '.join(words[start:start + termLength]) + ' '
                    for start in range(len(words) - termLength + 1)
                ]
            hits = 0
            for window in windowsByLength[termLength]:
                if window.find(meshTerm) != -1:
                    hits += 1
            if hits:
                docCounts = termsOfQuery.setdefault(meshTerm, {})
                docCounts[feedbackDocId] = docCounts.get(feedbackDocId, 0) + hits

    count = 0
    # ``with`` closes both handles; the original never closed them.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            print(count)
            stripped = line.strip()
            if stripped != '':
                lineArr = stripped.split('\t')
                if (lineArr[0].isdigit()
                        and lineArr[0] not in expansionMeshWordsCollect):
                    queryId = lineArr[0]
                    expansionMeshWordsCollect[queryId] = {}
                    feedbackDocId = 0
                else:
                    tallyDocument(lineArr[1], queryId, feedbackDocId)
                    feedbackDocId += 1
            count += 1

        for line in fpSnip:
            stripped = line.strip()
            if stripped != '':
                lineArr = stripped.split('\t')
                if lineArr[0].isdigit():
                    queryId = lineArr[0]
                    # A query present only in the snippet file gets its own
                    # entry rather than triggering a KeyError.
                    expansionMeshWordsCollect.setdefault(queryId, {})
                    feedbackDocId = 0
                else:
                    tallyDocument(lineArr[1], queryId, feedbackDocId)
                    feedbackDocId += 1
            print(count)
            count += 1

    return expansionMeshWordsCollect
def collectExpansionMeshTerms(titleFile, snipFile, hasSynonym, MeshTreeField):
    """Count MeSH term occurrences per query and per feedback document.

    The term list comes from ``support.loadMeshTerms(MeshTreeField)``; when
    ``hasSynonym`` equals the string ``'hasSynonym'`` it is extended through
    ``loadMeshTermsSynonym`` and ``meshTermDictAddSynonym``.

    Both files are read line by line.  Blank lines are skipped and every other
    line is split on tabs.  A line whose first field consists only of digits
    opens a query section (in the title file only when that id has not been
    seen yet); any other line is one feedback document whose text is the
    second field.  Documents are numbered from 0 inside each query section.

    Matching slides a window over the document's words.  The window is as
    many words wide as the term has space-separated parts, and each window
    that contains the term adds 1 to the count.  Unlike the original, the
    final window is included, so a term ending at the last word of a document
    is now counted.

    A running line counter is printed as progress output.

    Args:
        titleFile: path of the first input file.
        snipFile: path of the second input file.
        hasSynonym: pass ``'hasSynonym'`` to add synonyms to the term list.
        MeshTreeField: passed unchanged to ``support.loadMeshTerms``.

    Returns:
        dict: queryId (str) -> MeSH term -> feedbackDocId (int) -> number of
        matching windows in that document, summed over both files.
    """
    meshTermsDict = support.loadMeshTerms(MeshTreeField)
    if hasSynonym == 'hasSynonym':
        meshTermSynonymDict = loadMeshTermsSynonym()
        meshTermsDict = meshTermDictAddSynonym(meshTermsDict,
                                               meshTermSynonymDict)
    print('load meshTermsDict done')
    expansionMeshWordsCollect = {}

    def countTermsInDocument(text, queryId, feedbackDocId):
        # Count, for every term, the word windows of one document containing it.
        sentence = support.sentenceLower(support.removePunctuation(text))
        words = sentence.strip().split(' ')
        termsOfQuery = expansionMeshWordsCollect[queryId]
        # Windows depend only on the term's word count: cache them per width
        # rather than rebuilding them for every term.
        windowCache = {}
        for meshTerm in meshTermsDict:
            width = len(meshTerm.split(' '))
            windows = windowCache.get(width)
            if windows is None:
                # "+ 1" makes the range reach the final window, which the
                # original loop skipped.  The trailing space mirrors the
                # original "word + ' '" concatenation.
                windows = [
                    ' '.join(words[first:first + width]) + ' '
                    for first in range(len(words) - width + 1)
                ]
                windowCache[width] = windows
            matches = sum(1 for window in windows
                          if window.find(meshTerm) != -1)
            if matches:
                perDoc = termsOfQuery.setdefault(meshTerm, {})
                perDoc[feedbackDocId] = perDoc.get(feedbackDocId, 0) + matches

    count = 0
    # ``with`` closes both handles; the original never closed them.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            print(count)
            stripped = line.strip()
            if stripped != '':
                lineArr = stripped.split('\t')
                if (lineArr[0].isdigit()
                        and lineArr[0] not in expansionMeshWordsCollect):
                    queryId = lineArr[0]
                    expansionMeshWordsCollect[queryId] = {}
                    feedbackDocId = 0
                else:
                    countTermsInDocument(lineArr[1], queryId, feedbackDocId)
                    feedbackDocId += 1
            count += 1

        for line in fpSnip:
            stripped = line.strip()
            if stripped != '':
                lineArr = stripped.split('\t')
                if lineArr[0].isdigit():
                    queryId = lineArr[0]
                    # A query present only in the snippet file gets its own
                    # entry rather than triggering a KeyError.
                    expansionMeshWordsCollect.setdefault(queryId, {})
                    feedbackDocId = 0
                else:
                    countTermsInDocument(lineArr[1], queryId, feedbackDocId)
                    feedbackDocId += 1
            print(count)
            count += 1

    return expansionMeshWordsCollect