Example #1
# nltk is the only third-party dependency; connExtractFeat (feature
# extraction) and matchConnectiveList (lookup of multi-word connectives in a
# token list) are project-local helpers this example assumes are defined or
# imported elsewhere in the module.
import nltk


def classifyConnective(sentence, wordNum, connClassifier):
    """Return (connective string, 'Y'/'N' label, tokens to skip) for the token
    at wordNum, or ('False', 'N', 0) if no candidate connective starts there."""
    parsetree = nltk.ParentedTree.fromstring(sentence['parsetree'])
    """
    wordList=parsetree.leaves()
    word=wordList[wordNum]
    wordString,connHead=connMatching.matchConnective(parsetree,word,wordNum)
    indices=[]
    for word in wordString:
        if word in wordList:
            indices.append(wordList.index(word))
    #print 'conn ',wordString
    """
    wordList = sentence['words']
    wordString, skip = matchConnectiveList(wordList, wordNum)

    # matchConnectiveList signals "no connective starts at wordNum" with the
    # string 'False'; skip is the number of extra tokens the connective spans.
    if wordString == 'False':
        return 'False', 'N', 0

    if parsetree.leaves() != []:
        # Classify the candidate from features over the tokens it covers.
        connLabel = connClassifier.classify(
            connExtractFeat.getfeatures(parsetree,
                                        range(wordNum, wordNum + skip + 1)))
        return wordString, connLabel, skip
    return 'False', 'N', 0
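

# --- Usage sketch (not from the original example; names and data invented) ---
# `sentence` is one entry of the parsed-corpus JSON: a Penn-Treebank parse
# string under 'parsetree' and a list of (token, attribute_dict) pairs under
# 'words'. `connClassifier` is assumed to be an nltk classifier trained on the
# (feature_dict, label) pairs produced by dataProcess below; the call is left
# commented out because it needs that trained model.
toy_sentence = {
    'parsetree': '(ROOT (S (NP (PRP He)) (VP (VBD left) (SBAR (IN because) '
                 '(S (NP (PRP he)) (VP (VBD was) (ADJP (JJ tired)))))) (. .)))',
    'words': [
        ('He', {'CharacterOffsetBegin': 0, 'CharacterOffsetEnd': 2}),
        ('left', {'CharacterOffsetBegin': 3, 'CharacterOffsetEnd': 7}),
        ('because', {'CharacterOffsetBegin': 8, 'CharacterOffsetEnd': 15}),
        ('he', {'CharacterOffsetBegin': 16, 'CharacterOffsetEnd': 18}),
        ('was', {'CharacterOffsetBegin': 19, 'CharacterOffsetEnd': 22}),
        ('tired', {'CharacterOffsetBegin': 23, 'CharacterOffsetEnd': 28}),
        ('.', {'CharacterOffsetBegin': 28, 'CharacterOffsetEnd': 29}),
    ],
}
# wordNum = 2 points at 'because':
# wordString, label, skip = classifyConnective(toy_sentence, 2, connClassifier)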
def dataProcess(discourseBank, treeBank, connectiveList):
    """Walk the parsed documents in step with the discourse relations and
    build (feature_dict, label) training pairs: 'Y' when a candidate
    connective belongs to an Explicit relation's connective head, 'N'
    otherwise. connectiveList is currently unused."""
    featureSets = []
    docList = sorted(treeBank.keys())
    totalDiscourses = len(discourseBank)
    lastDiscourse = totalDiscourses - 1
    dBIterator = 0                    # index of the current relation
    oldExplicitIterator = dBIterator
    explicit = 0                      # Explicit relations seen so far
    oldexplicit = explicit
    i = 0                             # positive ('Y') examples collected
    j = 0                             # tokens falling inside a connective span
    k = 0                             # Explicit relations passed without a match
    for doc in docList:

        sentenceList = treeBank[doc]['sentences']
        for sentenceOffset, sentence in enumerate(sentenceList):

            wordList = sentence['words']
            #print wordList
            lengthWordList = len(wordList)
            wordNum = 0
            while (wordNum < lengthWordList):
                wordStructure = wordList[wordNum]
                word = wordStructure[0]
                word = word.lower()
                #print word
                wordDictionary = wordStructure[1]
                #print wordDictionary
                #if not matchConnectiveList(connectiveList,word):
                #    continue
                relation = discourseBank[dBIterator]

                # Advance to the next Explicit relation; only Explicit
                # relations carry a connective to align against.
                while True:
                    if (relation['Type'] == 'Explicit'
                            or dBIterator == lastDiscourse):
                        break
                    dBIterator += 1
                    relation = discourseBank[dBIterator]
                    if relation['Type'] == 'Explicit':
                        connective = relation['ConnectiveHead']
                        explicit += 1

                # Numeric document ids (e.g. a PDTB-style 'wsj_2100' -> 2100)
                # and character offsets, used to order the current word
                # against the relation's connective span.
                docWord = int(doc[4:])
                docConnective = int(relation['DocID'][4:])
                cOBWord = wordDictionary['CharacterOffsetBegin']
                cOEWord = wordDictionary['CharacterOffsetEnd']

                if relation['Type'] == 'Explicit':
                    connective = relation['ConnectiveHead']
                    spans = relation['Connective']['CharacterSpanList']
                    cOBConnective = spans[0][0]
                    cOEConnective = spans[-1][1]

                # Case 1: the word lies entirely before the next connective,
                # so any candidate connective found here is a negative example.
                if ((docConnective > docWord) or
                    (docWord == docConnective and cOEWord < cOBConnective)):

                    result, skip = matchConnectiveList(wordList, wordNum)

                    if result != 'False':
                        label = 'N'
                        tokenNo = range(wordNum, wordNum + skip + 1)
                        parsetree = nltk.ParentedTree.fromstring(
                            sentence['parsetree'])
                        if parsetree.leaves() != []:
                            featureSets.append((connExtractFeat.getfeatures(
                                parsetree, tokenNo), label))
                    wordNum += skip

                # Case 2: the word falls inside the connective's character
                # span. Important: match potential connectives against the
                # connective head, not the connective's raw text.
                elif ((docWord == docConnective) and
                      (cOBConnective <= cOBWord and cOEWord <= cOEConnective)):
                    headTokens = [tok.lower() for tok in connective.split()]
                    if word in headTokens:
                        # (Matching the connective's character offsets directly
                        # would have been cleaner than matching head tokens.)
                        result, skip = matchConnectiveList(wordList, wordNum)

                        if result == 'if then':
                            # Debug trace for the discontinuous 'if .. then'.
                            print(1, word, headTokens, result, connective)
                        if result != 'False':
                            label = 'Y'
                            tokenNo = range(wordNum, wordNum + skip + 1)
                            parsetree = nltk.ParentedTree.fromstring(
                                sentence['parsetree'])
                            if parsetree.leaves() != []:
                                featureSets.append(
                                    (connExtractFeat.getfeatures(
                                        parsetree, tokenNo), label))

                            # Count Explicit relations that went by without a
                            # matched token (sanity check).
                            if (explicit - oldexplicit > 1):
                                k += 1
                            oldexplicit = explicit
                            oldExplicitIterator = dBIterator

                            i += 1
                        wordNum += skip

                    else:
                        # The word sits inside the connective span but before
                        # the connective head; these candidates are skipped.
                        result, skip = matchConnectiveList(wordList, wordNum)
                        wordNum += skip
                        if result != 'False':
                            label = 'N'

                    j += 1

                # Case 3: the connective lies entirely before the current
                # word, so move on to the next relation and re-examine this
                # word against it.
                elif ((docConnective < docWord) or
                      (docConnective == docWord and cOEConnective < cOBWord)):
                    if (dBIterator > totalDiscourses):
                        print('something is wrong')
                    if dBIterator != lastDiscourse:
                        dBIterator += 1
                        relation = discourseBank[dBIterator]
                        if relation['Type'] == 'Explicit':
                            explicit += 1
                        # Stay on the same word for the new relation.
                        wordNum -= 1
                #print doc,sentenceOffset,wordNum,cOBWord,word
                wordNum += 1
                #print i,dBIterator

    print(i, j, k)
    return featureSets
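

# --- Training sketch (assumption: how the pieces above fit together) ---
# featureSets is a list of (feature_dict, label) pairs with labels 'Y'/'N',
# the format nltk's trainable classifiers expect, so a plausible way to build
# the connClassifier used by classifyConnective is:
#
#     featureSets = dataProcess(trainRelations, trainParses, connectiveList)
#     connClassifier = nltk.NaiveBayesClassifier.train(featureSets)
#     wordString, label, skip = classifyConnective(sentence, wordNum, connClassifier)
#
# trainRelations / trainParses stand for the PDTB-style relations and the
# parses of the training split; both names are placeholders, not part of the
# original example.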