Python removePunctuationの例、support.removePunctuation Pythonの例

コード例 #1

0

ファイルを表示

ファイル: expansionByRandomness.py プロジェクト: heyunh2015/bibm2016

def collectExpansionWords(titleFile, snipFile):
    """Count candidate expansion words per query and per feedback document.

    Both files are read line by line.  Blank lines are skipped and every
    other line is split on tabs.  In the title file, a line whose first
    field is all digits and not yet known starts a new query; in the snippet
    file, any line whose first field is all digits switches to that query.
    Either way the feedback-document counter is reset to 0.  Every other
    line is treated as one feedback document whose text is the second tab
    field.

    The text goes through support.removePunctuation and is split on single
    spaces.  A word is counted under its lower-cased form when it is
    non-empty, not purely numeric and accepted by support.isNotStopWords.

    Args:
        titleFile: path of the file with the feedback document titles.
        snipFile: path of the file with the feedback document snippets.

    Returns:
        dict: {queryId: {word: {feedbackDocId: count}}}.  queryId is the
        digit string read from the file; feedbackDocId is the 0-based
        position of the document below its query header.

    Raises:
        IndexError: if a document line has no second tab-separated field.
        KeyError: if a counted snippet word belongs to a query id that the
            title file never introduced.
    """
    stopWordsDic = support.loadStopWord()
    expansionWordsCollect = {}

    def countWords(text, queryId, feedbackDocId):
        # Tally the acceptable words of a single feedback document.
        sentence = support.removePunctuation(text)
        for word in sentence.strip().split(' '):
            wordLower = word.lower()
            if (not word.isdigit() and word != ''
                    and support.isNotStopWords(wordLower, stopWordsDic)):
                docCounts = expansionWordsCollect[queryId].setdefault(
                    wordLower, {})
                docCounts[feedbackDocId] = docCounts.get(feedbackDocId, 0) + 1

    # Both inputs are still opened up front, but are now reliably closed.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            line = line.strip()
            if line == '':
                continue
            lineArr = line.split('\t')
            if lineArr[0].isdigit() and lineArr[0] not in expansionWordsCollect:
                queryId = lineArr[0]
                expansionWordsCollect[queryId] = {}
                feedbackDocId = 0
            else:
                countWords(lineArr[1], queryId, feedbackDocId)
                feedbackDocId += 1

        for line in fpSnip:
            line = line.strip()
            if line == '':
                continue
            lineArr = line.split('\t')
            if lineArr[0].isdigit():
                queryId = lineArr[0]
                feedbackDocId = 0
            else:
                countWords(lineArr[1], queryId, feedbackDocId)
                feedbackDocId += 1

    return expansionWordsCollect

コード例 #2

0

ファイルを表示

ファイル: adjustParameters.py プロジェクト: heyunh2015/TIST

def collectExpansionMeshTerms(titleFile, snipFile, hasSynonym):
    """Collect the MeSH terms that occur in each query's feedback documents.

    The term list comes from support.loadMeshTerms('disease') and is
    extended through loadMeshTermsSynonym / meshTermDictAddSynonym when
    hasSynonym equals the string 'hasSynonym'.  Both files are read line by
    line: blank lines are skipped, lines are split on tabs, a line whose
    first field is all digits selects the current query, and any other line
    is a feedback document whose text is the second tab field.

    A term is recorded for a query when it appears as a substring of the
    punctuation-stripped, lower-cased document text and is not one of the
    locally excluded terms ('pain', 'disease').

    Args:
        titleFile: path of the file with the feedback document titles.
        snipFile: path of the file with the feedback document snippets.
        hasSynonym: pass 'hasSynonym' to add synonyms to the term list.

    Returns:
        dict: {queryId: {meshTerm: 0}}; the value 0 is only a placeholder.

    Raises:
        IndexError: if a document line has no second tab-separated field.
        KeyError: if the snippet file uses a query id that the title file
            never introduced.
    """
    stopWordMeshTermDict = {'pain', 'disease'}
    meshTermsDict = support.loadMeshTerms('disease')
    if hasSynonym == 'hasSynonym':
        meshTermSynonymDict = loadMeshTermsSynonym()
        meshTermsDict = meshTermDictAddSynonym(meshTermsDict,
                                               meshTermSynonymDict)

    expansionMeshWordsCollect = {}

    def recordMeshTerms(text, queryId):
        # Mark every not-yet-seen term found in one feedback document.
        sentenceLower = support.sentenceLower(support.removePunctuation(text))
        for meshTerm in meshTermsDict:
            if (meshTerm not in expansionMeshWordsCollect[queryId]
                    and meshTerm in sentenceLower
                    and meshTerm not in stopWordMeshTermDict):
                expansionMeshWordsCollect[queryId][meshTerm] = 0

    # Both inputs are still opened up front, but are now reliably closed.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            line = line.strip()
            if line == '':
                continue
            lineArr = line.split('\t')
            if (lineArr[0].isdigit()
                    and lineArr[0] not in expansionMeshWordsCollect):
                queryId = lineArr[0]
                expansionMeshWordsCollect[queryId] = {}
            else:
                recordMeshTerms(lineArr[1], queryId)

        for line in fpSnip:
            line = line.strip()
            if line == '':
                continue
            lineArr = line.split('\t')
            if lineArr[0].isdigit():
                queryId = lineArr[0]
            else:
                recordMeshTerms(lineArr[1], queryId)

    return expansionMeshWordsCollect

コード例 #3

0

ファイルを表示

ファイル: expansionByRandomness_disease.py プロジェクト: heyunh2015/bibm2016

def loadMeshTermsSynonym(
        meshTermSynonymFile='I:\\bibm2016\\experiments\\mesh\\MeSHWords.txt'):
    """Load the MeSH term -> synonyms table from a text file.

    Each non-blank line is split on ':'.  The first field, passed through
    support.removePunctuation, is the term.  The second field is a
    '|'-separated list (leading/trailing '|' stripped) whose entries after
    the first are stored as synonyms, also via support.removePunctuation.

    Args:
        meshTermSynonymFile: path of the synonym file; defaults to the
            location the experiments were run from.

    Returns:
        dict: {meshTerm: {synonym: 1}}; a term without synonyms maps to an
        empty dict.
    """
    meshTermSynonymDict = {}
    with open(meshTermSynonymFile) as fp:
        for line in fp:
            line = line.strip()
            if line == '':
                # A blank (e.g. trailing) line carries no term; previously
                # it crashed the loader with an IndexError.
                continue
            lineArr = line.split(':')
            meshTerm = support.removePunctuation(lineArr[0])
            synonyms = meshTermSynonymDict.setdefault(meshTerm, {})
            # A line without ':' simply has no synonym field.
            synonymField = lineArr[1] if len(lineArr) > 1 else ''
            # The first '|' entry is skipped, as in the original loop
            # (presumably it repeats the term itself -- TODO confirm).
            for rawSynonym in synonymField.strip('|').split('|')[1:]:
                synonyms[support.removePunctuation(rawSynonym)] = 1

    return meshTermSynonymDict

コード例 #4

0

ファイルを表示

ファイル: googleExpansionBaseline.py プロジェクト: heyunh2015/TIST

def collectExpansionWords(titleFile, snipFile):
    """Count candidate expansion words per query over titles and snippets.

    Both files are read line by line.  Blank lines are skipped and every
    other line is split on tabs.  In the title file, a line whose first
    field is all digits and not yet known starts a new query; in the snippet
    file, any line whose first field is all digits switches to that query.
    Every other line is a feedback document whose text is the second tab
    field.

    The text goes through support.removePunctuation and is split on single
    spaces.  A word is counted under its lower-cased form when it is
    non-empty, not purely numeric and accepted by support.isNotStopWords.

    Args:
        titleFile: path of the file with the feedback document titles.
        snipFile: path of the file with the feedback document snippets.

    Returns:
        dict: {queryId: {word: count}} with counts summed over both files.

    Raises:
        IndexError: if a document line has no second tab-separated field.
        KeyError: if a counted snippet word belongs to a query id that the
            title file never introduced.
    """
    stopWordsDic = support.loadStopWord()
    expansionWordsCollect = {}

    def countWords(text, queryId):
        # Tally the acceptable words of a single feedback document.
        sentence = support.removePunctuation(text)
        for word in sentence.strip().split(' '):
            wordLower = word.lower()
            if (not word.isdigit() and word != ''
                    and support.isNotStopWords(wordLower, stopWordsDic)):
                wordCounts = expansionWordsCollect[queryId]
                wordCounts[wordLower] = wordCounts.get(wordLower, 0) + 1

    # Both inputs are still opened up front, but are now reliably closed.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            line = line.strip()
            if line == '':
                continue
            lineArr = line.split('\t')
            if lineArr[0].isdigit() and lineArr[0] not in expansionWordsCollect:
                queryId = lineArr[0]
                expansionWordsCollect[queryId] = {}
            else:
                countWords(lineArr[1], queryId)

        for line in fpSnip:
            line = line.strip()
            if line == '':
                continue
            lineArr = line.split('\t')
            if lineArr[0].isdigit():
                queryId = lineArr[0]
            else:
                countWords(lineArr[1], queryId)

    return expansionWordsCollect

コード例 #5

0

ファイルを表示

def loadMeshTermsSynonym(
        meshTermSynonymFile='I:\\bibm2016\\experiments\\mesh\\MeSHWords.txt'):
    """Read the MeSH synonym file into a nested dictionary.

    Every non-blank line is split on ':'.  Field 0, cleaned with
    support.removePunctuation, names the term.  Field 1 is a '|'-separated
    list (surrounding '|' stripped); all entries except the first are
    registered as synonyms after the same cleaning.

    Args:
        meshTermSynonymFile: path of the synonym file; the default is the
            path that used to be hard-coded in this function.

    Returns:
        dict: {meshTerm: {synonym: 1}}; terms without synonyms map to {}.
    """
    meshTermSynonymDict = {}
    with open(meshTermSynonymFile) as fp:
        for line in fp:
            line = line.strip()
            if line == '':
                # Blank lines used to abort loading with an IndexError.
                continue
            lineArr = line.split(':')
            meshTerm = support.removePunctuation(lineArr[0])
            synonyms = meshTermSynonymDict.setdefault(meshTerm, {})
            # Tolerate a term line that has no ':' / synonym field at all.
            synonymField = lineArr[1] if len(lineArr) > 1 else ''
            # Entry 0 of the '|' list is deliberately left out, matching the
            # original index range (presumably the term itself -- verify).
            for rawSynonym in synonymField.strip('|').split('|')[1:]:
                synonyms[support.removePunctuation(rawSynonym)] = 1

    return meshTermSynonymDict

コード例 #6

0

ファイルを表示

ファイル: googleExpansionPreExperiment.py プロジェクト: heyunh2015/bibm2016

def collectExpansionMeshTerms(titleFile, snipFile):
    """Collect the MeSH terms that occur in each query's feedback documents.

    The term list comes from support.loadMeshTerms('all').  Both files are
    read line by line: blank lines are skipped, lines are split on tabs, a
    line whose first field is all digits selects the current query, and any
    other line is a feedback document whose text is the second tab field.

    A term is recorded for a query when it appears as a substring of the
    punctuation-stripped, lower-cased document text.

    Args:
        titleFile: path of the file with the feedback document titles.
        snipFile: path of the file with the feedback document snippets.

    Returns:
        dict: {queryId: {meshTerm: 0}}; the value 0 is only a placeholder.

    Raises:
        IndexError: if a document line has no second tab-separated field.
        KeyError: if the snippet file uses a query id that the title file
            never introduced.
    """
    # Exclusion list; left empty in this variant, so no term is filtered.
    stopWordMeshTermDict = {}
    meshTermsDict = support.loadMeshTerms('all')

    expansionMeshWordsCollect = {}

    def recordMeshTerms(text, queryId):
        # Mark every not-yet-seen term found in one feedback document.
        sentenceLower = support.sentenceLower(support.removePunctuation(text))
        for meshTerm in meshTermsDict:
            if (meshTerm not in expansionMeshWordsCollect[queryId]
                    and meshTerm in sentenceLower
                    and meshTerm not in stopWordMeshTermDict):
                expansionMeshWordsCollect[queryId][meshTerm] = 0

    # Both inputs are still opened up front, but are now reliably closed.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            line = line.strip()
            if line == '':
                continue
            lineArr = line.split('\t')
            if (lineArr[0].isdigit()
                    and lineArr[0] not in expansionMeshWordsCollect):
                queryId = lineArr[0]
                expansionMeshWordsCollect[queryId] = {}
            else:
                recordMeshTerms(lineArr[1], queryId)

        for line in fpSnip:
            line = line.strip()
            if line == '':
                continue
            lineArr = line.split('\t')
            if lineArr[0].isdigit():
                queryId = lineArr[0]
            else:
                recordMeshTerms(lineArr[1], queryId)

    return expansionMeshWordsCollect

コード例 #7

0

ファイルを表示

ファイル: expansionByRandomness.py プロジェクト: heyunh2015/bibm2016

def findDiagnosis(expansionWordsDashboard,
                  queryFile='I:\\trec2015\\2015bquery18.txt',
                  firstQueryId=11):
    """Print the queries whose diagnosis text is itself an expansion word.

    Every <diagnosis>...</diagnosis> element of the query file is extracted
    in document order, stripped of its tags, cleaned with
    support.removePunctuation and support.sentenceLower, and assigned
    consecutive query ids starting at firstQueryId.  For each query whose
    diagnosis is a key of expansionWordsDashboard[queryId], one line is
    printed with the query id, the diagnosis, the dashboard entry and the
    diagnosis' entry in the table returned by loadMeshTreeCode().

    Args:
        expansionWordsDashboard: dict {queryId(str): {word: value}}.
        queryFile: path of the query file; defaults to the path that used
            to be hard-coded here.
        firstQueryId: id given to the first diagnosis element found.

    Returns:
        int: always 0.

    Raises:
        KeyError: if a query id is missing from expansionWordsDashboard or a
            matched diagnosis is missing from the MeSH tree-code table.
    """
    import re

    meshTreeCodeDict = loadMeshTreeCode()
    with open(queryFile) as fp:
        txt = fp.read()

    diagnosisDict = {}
    nextQueryId = firstQueryId
    for match in re.finditer(r"<diagnosis[\s\S]*?<\/diagnosis>", txt):
        diagnosisTxt = match.group()
        diagnosisTxt = diagnosisTxt.replace('<diagnosis>', '').replace(
            '</diagnosis>', '')
        diagnosisTxt = support.removePunctuation(diagnosisTxt)
        diagnosisTxt = support.sentenceLower(diagnosisTxt)
        diagnosisDict[str(nextQueryId)] = diagnosisTxt
        nextQueryId += 1

    for queryId in diagnosisDict:
        diagnosis = diagnosisDict[queryId]
        # Direct dict membership instead of scanning a .keys() list.
        if diagnosis in expansionWordsDashboard[queryId]:
            # One space-separated line; this form prints identically under
            # Python 2 and Python 3.
            print('%s %s %s %s' % (queryId, diagnosis,
                                   expansionWordsDashboard[queryId][diagnosis],
                                   meshTreeCodeDict[diagnosis]))
    return 0

コード例 #8

0

ファイルを表示

def findDiagnosis(expansionWordsDashboard, MeshTreeField,
                  queryFile='I:\\trec2015\\2015bquery18.txt',
                  firstQueryId=11):
    """Print the queries whose diagnosis text is itself an expansion word.

    Every <diagnosis>...</diagnosis> element of the query file is extracted
    in document order, stripped of its tags, cleaned with
    support.removePunctuation and support.sentenceLower, and assigned
    consecutive query ids starting at firstQueryId.  For each query whose
    diagnosis is a key of expansionWordsDashboard[queryId], one line is
    printed with the query id, the diagnosis, the dashboard entry and the
    diagnosis' entry in the table returned by
    loadMeshTreeCode(MeshTreeField).

    Args:
        expansionWordsDashboard: dict {queryId(str): {word: value}}.
        MeshTreeField: forwarded unchanged to loadMeshTreeCode.
        queryFile: path of the query file; defaults to the path that used
            to be hard-coded here.
        firstQueryId: id given to the first diagnosis element found.

    Returns:
        int: always 0.

    Raises:
        KeyError: if a query id is missing from expansionWordsDashboard or a
            matched diagnosis is missing from the MeSH tree-code table.
    """
    import re

    meshTreeCodeDict = loadMeshTreeCode(MeshTreeField)
    with open(queryFile) as fp:
        txt = fp.read()

    diagnosisDict = {}
    nextQueryId = firstQueryId
    for match in re.finditer(r"<diagnosis[\s\S]*?<\/diagnosis>", txt):
        diagnosisTxt = match.group()
        diagnosisTxt = diagnosisTxt.replace('<diagnosis>',
                                            '').replace('</diagnosis>', '')
        diagnosisTxt = support.removePunctuation(diagnosisTxt)
        diagnosisTxt = support.sentenceLower(diagnosisTxt)
        diagnosisDict[str(nextQueryId)] = diagnosisTxt
        nextQueryId += 1

    for queryId in diagnosisDict:
        diagnosis = diagnosisDict[queryId]
        # Direct dict membership instead of scanning a .keys() list.
        if diagnosis in expansionWordsDashboard[queryId]:
            # One space-separated line; this form prints identically under
            # Python 2 and Python 3.
            print('%s %s %s %s' % (queryId, diagnosis,
                                   expansionWordsDashboard[queryId][diagnosis],
                                   meshTreeCodeDict[diagnosis]))
    return 0

コード例 #9

0

ファイルを表示

def collectExpansionMeshTerms(titleFile, snipFile, hasSynonym, MeshTreeField):
    """Count MeSH term occurrences per query and per feedback document.

    The term list comes from support.loadMeshTerms(MeshTreeField) and is
    extended through loadMeshTermsSynonym / meshTermDictAddSynonym when
    hasSynonym equals the string 'hasSynonym'.  Both files are read line by
    line.  Blank lines are skipped and every other line is split on tabs.
    In the title file, a line whose first field is all digits and not yet
    known starts a new query; in the snippet file, any line whose first
    field is all digits switches to that query.  Either way the
    feedback-document counter is reset to 0.  Every other line is one
    feedback document whose text is the second tab field.

    For each document the punctuation-stripped, lower-cased text is split on
    single spaces.  For each term a window of as many words as the term has
    is slid over the text, and every window that contains the term as a
    substring adds one to the count.

    Progress is reported on stdout: a running line counter is printed for
    every input line.

    Args:
        titleFile: path of the file with the feedback document titles.
        snipFile: path of the file with the feedback document snippets.
        hasSynonym: pass 'hasSynonym' to add synonyms to the term list.
        MeshTreeField: forwarded unchanged to support.loadMeshTerms.

    Returns:
        dict: {queryId: {meshTerm: {feedbackDocId: count}}}.

    Raises:
        IndexError: if a document line has no second tab-separated field.
        KeyError: if a matched snippet term belongs to a query id that the
            title file never introduced.
    """
    meshTermsDict = support.loadMeshTerms(MeshTreeField)
    if hasSynonym == 'hasSynonym':
        meshTermSynonymDict = loadMeshTermsSynonym()
        meshTermsDict = meshTermDictAddSynonym(meshTermsDict,
                                               meshTermSynonymDict)
    print('load meshTermsDict done')

    # Word count of every term, computed once instead of once per document.
    meshTermLengths = [(meshTerm, len(meshTerm.split(' ')))
                       for meshTerm in meshTermsDict]
    expansionMeshWordsCollect = {}

    def countMeshTerms(text, queryId, feedbackDocId):
        # Slide a term-sized word window over one feedback document.
        sentenceLower = support.sentenceLower(support.removePunctuation(text))
        wordsList = sentenceLower.strip().split(' ')
        for meshTerm, meshTermLength in meshTermLengths:
            # "+ 1" makes the window that ends on the last word part of the
            # scan; without it a term at the end of the text (or equal to
            # the whole text) could never match.
            windowCount = len(wordsList) - meshTermLength + 1
            for wordIndex in range(0, windowCount):
                window = ' '.join(
                    wordsList[wordIndex:wordIndex + meshTermLength]) + ' '
                if meshTerm in window:
                    docCounts = expansionMeshWordsCollect[queryId].setdefault(
                        meshTerm, {})
                    docCounts[feedbackDocId] = (
                        docCounts.get(feedbackDocId, 0) + 1)

    count = 0
    # Both inputs are still opened up front, but are now reliably closed.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            print(count)
            line = line.strip()
            if line != '':
                lineArr = line.split('\t')
                if (lineArr[0].isdigit()
                        and lineArr[0] not in expansionMeshWordsCollect):
                    queryId = lineArr[0]
                    expansionMeshWordsCollect[queryId] = {}
                    feedbackDocId = 0
                else:
                    countMeshTerms(lineArr[1], queryId, feedbackDocId)
                    feedbackDocId += 1
            count += 1

        for line in fpSnip:
            line = line.strip()
            if line != '':
                lineArr = line.split('\t')
                if lineArr[0].isdigit():
                    queryId = lineArr[0]
                    feedbackDocId = 0
                else:
                    countMeshTerms(lineArr[1], queryId, feedbackDocId)
                    feedbackDocId += 1
            print(count)
            count += 1

    return expansionMeshWordsCollect

コード例 #10

0

ファイルを表示

ファイル: expansionByRandomness_disease.py プロジェクト: heyunh2015/bibm2016

def collectExpansionMeshTerms(titleFile, snipFile, hasSynonym, MeshTreeField):
    """Count MeSH term occurrences per query and per feedback document.

    Terms are loaded with support.loadMeshTerms(MeshTreeField); when
    hasSynonym is the string 'hasSynonym' they are extended with the output
    of loadMeshTermsSynonym via meshTermDictAddSynonym.

    Title and snippet files share one layout: blank lines are ignored and
    the rest are tab-separated.  In the title file an all-digit first field
    that is not yet known opens a new query; in the snippet file any
    all-digit first field selects the query.  Both reset the
    feedback-document counter to 0.  Any other line is one feedback document
    with its text in the second field.

    The document text is stripped of punctuation, lower-cased and split on
    single spaces.  Each term is compared against every window of
    consecutive words that has the term's word count; a window containing
    the term as a substring counts as one occurrence.

    A running line counter is printed to stdout for every input line.

    Args:
        titleFile: path of the file with the feedback document titles.
        snipFile: path of the file with the feedback document snippets.
        hasSynonym: pass 'hasSynonym' to add synonyms to the term list.
        MeshTreeField: forwarded unchanged to support.loadMeshTerms.

    Returns:
        dict: {queryId: {meshTerm: {feedbackDocId: count}}}.

    Raises:
        IndexError: if a document line has no second tab-separated field.
        KeyError: if a matched snippet term belongs to a query id that the
            title file never introduced.
    """
    meshTermsDict = support.loadMeshTerms(MeshTreeField)
    if hasSynonym == 'hasSynonym':
        meshTermSynonymDict = loadMeshTermsSynonym()
        meshTermsDict = meshTermDictAddSynonym(meshTermsDict, meshTermSynonymDict)
    print('load meshTermsDict done')

    # The number of words in each term does not depend on the document, so
    # it is worked out a single time here.
    meshTermLengths = [(meshTerm, len(meshTerm.split(' '))) for meshTerm in meshTermsDict]
    expansionMeshWordsCollect = {}

    def countMeshTerms(text, queryId, feedbackDocId):
        # Match every term against the word windows of one document.
        sentenceLower = support.sentenceLower(support.removePunctuation(text))
        wordsList = sentenceLower.strip().split(' ')
        for meshTerm, meshTermLength in meshTermLengths:
            # The last valid window starts at len(words) - termLength, hence
            # the "+ 1"; the earlier bound stopped one window short and
            # missed terms at the very end of the text.
            windowCount = len(wordsList) - meshTermLength + 1
            for wordIndex in range(0, windowCount):
                window = ' '.join(wordsList[wordIndex:wordIndex + meshTermLength]) + ' '
                if meshTerm in window:
                    docCounts = expansionMeshWordsCollect[queryId].setdefault(meshTerm, {})
                    docCounts[feedbackDocId] = docCounts.get(feedbackDocId, 0) + 1

    count = 0
    # Context manager closes both files even if parsing fails midway.
    with open(titleFile) as fpTitle, open(snipFile) as fpSnip:
        for line in fpTitle:
            print(count)
            line = line.strip()
            if line != '':
                lineArr = line.split('\t')
                if lineArr[0].isdigit() and lineArr[0] not in expansionMeshWordsCollect:
                    queryId = lineArr[0]
                    expansionMeshWordsCollect[queryId] = {}
                    feedbackDocId = 0
                else:
                    countMeshTerms(lineArr[1], queryId, feedbackDocId)
                    feedbackDocId += 1
            count += 1

        for line in fpSnip:
            line = line.strip()
            if line != '':
                lineArr = line.split('\t')
                if lineArr[0].isdigit():
                    queryId = lineArr[0]
                    feedbackDocId = 0
                else:
                    countMeshTerms(lineArr[1], queryId, feedbackDocId)
                    feedbackDocId += 1
            print(count)
            count += 1

    return expansionMeshWordsCollect