Example #1
    def getCorefChains(self, doc, dictSentCoref, a_tree_bank):
        common = cUtil()
        sentsDict = common.getSentences(a_tree_bank)

        #dictSentCoref format: {docID: {sentNo: [(start_word_index, end_word_index, corefChainID, string, used_up), ...]}}

        for corefC in doc:
            docID = str(corefC.document_id)
            docID = docID.split('/')[3].split('@')[0]
            if docID not in dictSentCoref:
                dictSentCoref[docID] = {}
            corefChainID = int(corefC.identifier)

            for corefL in corefC:
                sentNo = corefL.sentence_index + 1

                corefStringWords = corefL.string.split(' ')
                start_word_index = corefL.start_word_index
                if corefStringWords[0].lower() in ['the','mrs','ms','miss','mr','mrs.','ms.','mr.']:
                    corefStringWords.pop(0)
                    start_word_index += 1
                string = ' '.join(corefStringWords)

                # nschneid: modified, was
                '''
                deduct = 0
                for word in corefStringWords:
                    if word.lower() not in ['the','mrs.','mr.']:
                            string += word + ' '
                    else:
                            deduct += 1
                string = string.rstrip(' ')
                start_word_index = corefL.start_word_index + deduct
                '''


                end_word_index = corefL.end_word_index + 1
                #print "COREFL", corefL.start_word_index, corefL.end_word_index, corefL.start_token_index, corefL.end_token_index
                adjusted = common.adjustIndices(sentsDict[docID][sentNo][1], sentsDict[docID][sentNo][0], start_word_index, end_word_index)
                start_word_index = adjusted[0]
                end_word_index = adjusted[1]
                if corefL.type not in ['ATTRIB','HEAD','APPOS']:
                    if sentNo not in dictSentCoref[docID]: # First time sentence seen
                        dictSentCoref[docID][sentNo] = [(start_word_index, end_word_index, corefChainID, string, False)]
                    else: # Entries for sentence already in dictionary
                        dictSentCoref[docID][sentNo].append((start_word_index, end_word_index, corefChainID, string, False))


        # Sort the list of tuples for each sentence in the dictionary so that they are ordered by start_word_index
        for docID in dictSentCoref:
            for sentNo in dictSentCoref[docID]:
                tempList = dictSentCoref[docID][sentNo]
                tempList.sort()
                fDist = {}
                listCorefMult = []
                listCorefChains = []
                listFinal = []
                # Now swap any items so that if there are two elements with the same start_word_index, the one with the larger span appears first
                if len(tempList) > 1:
                    for i in range(0,len(tempList)-1):
                        if tempList[i][0] == tempList[i+1][0] and tempList[i][1] < tempList[i+1][1]:
                            tempList[i+1], tempList[i] = tempList[i], tempList[i+1]
                        # As we are only interested in sentence internal coreference, compile a list of corefChainIDs in the sentence
                        listCorefChains.append(int(tempList[i][2]))
                    # Add corefChainID for the last entry in the list
                    listCorefChains.append(int(tempList[i+1][2]))
                    fDist = nltk.FreqDist(listCorefChains) # Find how many times each corefChainID appears in the sentence (assumes a module-level "import nltk")
                    for e in fDist:
                        if fDist[e] > 1:
                            listCorefMult.append(e)
                # Construct list of sentence internal coreference instances and write back to dictionary
                for element in tempList:
                    if int(element[2]) in listCorefMult:
                        listFinal.append(element)
                dictSentCoref[docID][sentNo] = listFinal
        return dictSentCoref
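A minimal sketch of the final filtering step above, run on toy data in the (start_word_index, end_word_index, corefChainID, string, used_up) tuple format the method stores. Here collections.Counter stands in for nltk.FreqDist (both map items to counts), and the entries are invented for illustration:

    from collections import Counter

    # Toy entries for one sentence: chain 7 is mentioned twice, chain 3 once.
    entries = [(9, 10, 7, 'it', False),
               (0, 2, 7, 'company', False),
               (5, 6, 3, 'Tuesday', False)]

    entries.sort()  # order by start_word_index (then end_word_index)

    # Keep only chains mentioned more than once in this sentence,
    # i.e. sentence-internal coreference.
    counts = Counter(e[2] for e in entries)
    filtered = [e for e in entries if counts[e[2]] > 1]

    print(filtered)  # [(0, 2, 7, 'company', False), (9, 10, 7, 'it', False)]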
Example #2
    def getCorefChains(self, corefDict, docID, sentsDict):
        #print "corefDict, docID, sentsDict", corefDict, docID, sentsDict
        common = cUtil()
        dictSentCoref = {} # {docID: {sentNo: [(start_word_index, end_word_index, identifier, string, used_up), ...]}}


        lCorefChains = []
        lCorefChainTemp = []

        for sentenceNr, sentenceCoref in enumerate(corefDict):
            for lCorefChain in sentenceCoref:
                lCorefChainTemp = []
                for lCorefPair in lCorefChain:
                    refExpr = lCorefPair[0]
                    antecedent = lCorefPair[1]
                    refExpr[1] = sentenceNr  # fix: sentence indices are always 0 here because sentences are processed individually
                    antecedent[1] = sentenceNr

                    if antecedent not in lCorefChainTemp:
                        lCorefChainTemp.append(antecedent)
                    if refExpr[0] not in antecedent[0]: # Exclude referring expressions whose string is contained within the antecedent string
                        lCorefChainTemp.append(refExpr)
                lCorefChains.append(lCorefChainTemp)

        identifier = 0 # The ID of the chain

        if docID not in dictSentCoref:
            dictSentCoref[docID] = {}  # {docID: {sentNo: [(start_word_index, end_word_index, identifier, string, used_up), ...]}}

        for listChain in lCorefChains:
            for coref in listChain:
                sentNo = int(coref[1]) + 1

                start_word_index = coref[3]
                corefStringWords = coref[0].split(' ')
                if corefStringWords[0].lower() in ['the']:
                    corefStringWords.pop(0)
                    start_word_index += 1
                string = ' '.join(corefStringWords)

                # nschneid: modified; was:
                '''
                string = coref[0]
                corefStringWords = string.split(' ')
                deduct = 0
                string = ''
                for word in corefStringWords:
                    if word.lower() not in ['the']:
                        string += word + ' '
                    else:
                        deduct += 1
                string = string.rstrip(' ')
                start_word_index = coref[3] + deduct
                '''

                end_word_index = coref[4]
                adjusted = common.adjustIndices(sentsDict[docID][sentNo][1], sentsDict[docID][sentNo][0], start_word_index, end_word_index)
                start_word_index = adjusted[0]
                end_word_index = adjusted[1]
                if sentNo not in dictSentCoref[docID]:
                    dictSentCoref[docID][sentNo] = [(start_word_index, end_word_index, identifier, string, False)]
                else:
                    dictSentCoref[docID][sentNo].append((start_word_index, end_word_index, identifier, string, False))

            identifier += 1

        dictSentCoref = self.formatChains(dictSentCoref)
        return dictSentCoref
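The corefDict shape this variant expects can be read off the loop structure above: one entry per sentence, each a list of chains, each chain a list of (referring expression, antecedent) pairs, where each mention is a mutable list with the string at index 0, the sentence index at index 1, and the start/end word indices at indices 3 and 4 (index 2 is not used here). A hypothetical input built to that inferred shape:

    # Mention layout (inferred): [string, sentence_index, <unused here>, start_word_index, end_word_index]
    refExpr = ['it', 0, None, 9, 10]
    antecedent = ['the company', 0, None, 0, 2]

    corefDict = [                        # one entry per sentence
        [                                # chains in this sentence
            [(refExpr, antecedent)],     # (refExpr, antecedent) pairs in one chain
        ],
    ]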
Example #3
    def getNamedEnts(self, docID, folderLocationNE, folderLocationDep, treebank):
        dictNameEnt = {} # {docID: {sentNo: [(start_word_index, end_word_index, named_entity_string, type, sub_type, descriptor, consumed?),...]}}

        # Pre-load the Stanford dependency parse files as dependency information will be used in the construction of AMR fragments
        dictDepParse = self.buildDepDict(docID,folderLocationDep)
        #print 'dictDepParse:', dictDepParse

        common = cUtil()

        # Get "clean" and "tree" sentences - both are used in computing the word indices
        dictSents = common.getSentences(treebank)

        # Extract Named Entities from the BBN corpus files:
        sentNo = 1 # was 0
        bRead = False
        prevStartIndices = []

        # Pick the named entity file according to the doc ID - there are 4 files per OntoNotes "folder"
        quadrant = ''
        docExt = int(docID[-2:])
        if docExt < 25:
            quadrant = 'a'
        elif 25 <= docExt < 50:
            quadrant = 'b'
        elif 50 <= docExt < 75:
            quadrant = 'c'
        else:
            quadrant = 'd'

        # Open and read in named entity XML file
        fileName = 'wsj' + docID[4:6] + quadrant + '.qa'
        nameEntFile = open(folderLocationNE + '/' + fileName, 'r')

        for line in nameEntFile:
            #print line
            if '<DOCNO>' in line: # Obtain docID and reset list of previous start indices
                prevStartIndices = []
                rawID = line.split('> ')[1].split(' <')[0]  # e.g. 'WSJ0231'
                docIDInFile = (rawID[:3] + '_' + rawID[3:]).lower()  # -> 'wsj_0231'
                if docIDInFile == docID:
                    bRead = True
                else:
                    bRead = False
                continue
            if 'DOC>' in line or 'ROOT>' in line:
                continue
            if bRead: # If this is the document that you are looking for...
                print 'looking at line', line
                if line.startswith("'    "):
                    line = line[1:]
                line = line.rstrip('\n').rstrip('\r').rstrip(' ').replace('     ','')
                # Extract clean string and return as a list of words (from which start and end indices can be extracted)
                #wordList = self.getWordList(line)
                #print dictSents
                #print docID
                #print dictSents[docID]
                #print dictSents[docID][sentNo]
                treeString = dictSents[docID][sentNo][0]
                cleanString = dictSents[docID][sentNo][1]
                line = self.add_spaces(cleanString, line)
                line = '<SENTENCE>' + line + '</SENTENCE>' # Wrappers so that each sentence can be read as a separate XML string
                print line
                parsedLine = parseString(line) # assumes a module-level "from xml.dom.minidom import parseString"
                #parseTree = ElementTree.parse(line)
                # Iterate over every entity tag (<tag attr>data</tag>) that the parser finds ('chain' is itertools.chain):
                for element in chain( parsedLine.getElementsByTagName('ENAMEX'),
                                      parsedLine.getElementsByTagName('NUMEX'),
                                      parsedLine.getElementsByTagName('TIMEX')):
                    dependents = []
                    xmlTag = element.toxml()
                    xmlTagName = xmlTag.split()[0][1:]
                    xmlTagContent = xmlTag.split('>')[1]
                    xmlTagContent = xmlTagContent.split('<')[0]
                    xmlTagAttr = xmlTag.split('=\"')[1].split('\">')[0]
                    listTagAttr = xmlTagAttr.split(':')
                    entType = listTagAttr[0]
                    if len(listTagAttr) > 1:
                        entSubType = listTagAttr[1]
                    else:
                        entSubType = ''
                    # Find start and end word indices in the 'clean' string version of the line
                    indices = self.getWordIndices(xmlTagContent,cleanString.split(' '),prevStartIndices)
                    #print sentNo
                    #print xmlTagContent
                    #print indices
                    # Get dependents of the words in the xmlTagContent - for PERSON, ORGANIZATION, GPE, PRODUCT and FAC types (which can take a descriptor)
                    if '<ENAMEX' in xmlTag:
                        if entType in ['PERSON', 'ORGANIZATION', 'GPE', 'PRODUCT', 'FAC']:
                            taggedWordList = xmlTagContent.split(' ')
                            cleanWordList = cleanString.split(' ')
                            for taggedIndex in range(indices[0],indices[1]):
                                if taggedIndex in dictDepParse[docID][sentNo]:
                                    for depIndex in dictDepParse[docID][sentNo][taggedIndex]:
                                        dependent = cleanWordList[depIndex]
                                        if dependent not in taggedWordList:
                                            dependents.append(dependent)
                    prevStartIndices.append(indices[0])
                    # Adjust indices to use 'tree' string indices, not clean string indices
                    indices = common.adjustIndices(cleanString, treeString, indices[0], indices[1])
                    #print indices
                    #print '---------'
                    if docID not in dictNameEnt:
                        dictNameEnt[docID] = {}
                    if sentNo not in dictNameEnt[docID]:
                        dictNameEnt[docID][sentNo] = []
                    dictNameEnt[docID][sentNo].append((indices[0],indices[1],xmlTagContent,entType,entSubType,dependents,xmlTag))
                prevStartIndices = []
                sentNo += 1
        nameEntFile.close()

        # Modify dictNameEnt entries to merge PERSON, ORGANIZATION, GPE, PRODUCT and FAC entities with their entity description NEs (X_DESC)
        dictNameEnt = self.mergeEntDesc(dictNameEnt)
        print dictNameEnt
        return dictNameEnt
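The file-selection logic at the top of this method splits each OntoNotes section's documents into four BBN .qa files of 25 documents each, keyed by the last two digits of the document ID. A standalone sketch of that mapping, assuming the 'wsj_NNNN' docID format implied by the slicing in the method:

    def bbn_filename(docID):
        # Last two digits pick the quadrant: 00-24 -> a, 25-49 -> b, 50-74 -> c, 75-99 -> d.
        docExt = int(docID[-2:])
        quadrant = 'abcd'[docExt // 25]
        # Section number (characters 4-5 of e.g. 'wsj_0231') plus quadrant letter.
        return 'wsj' + docID[4:6] + quadrant + '.qa'

    print(bbn_filename('wsj_0231'))  # wsj02b.qa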