def classifyConnective(sentence, wordNum, connClassifier):
    """Decide whether the token run starting at wordNum is a discourse connective.

    Parameters:
        sentence: dict with at least 'parsetree' (s-expression string parseable
            by nltk.ParentedTree.fromstring) and 'words' (token list) keys.
        wordNum: index into sentence['words'] where connective matching starts.
        connClassifier: trained classifier; its .classify() is applied to the
            features built by connExtractFeat.getfeatures().

    Returns:
        (wordString, connLabel, skip): the matched connective text, the
        classifier's label, and the number of extra tokens consumed.
        Returns the sentinel triple ('False', 'N', 0) when no connective
        matches at wordNum or the parse tree has no leaves.
    """
    wordList = sentence['words']
    # matchConnectiveList signals "no match" with the string 'False'
    # (a string sentinel, not the boolean) -- callers compare against it.
    wordString, skip = matchConnectiveList(wordList, wordNum)
    if wordString == 'False':
        return 'False', 'N', 0
    # Parse lazily: the tree is only needed once a candidate matched.
    parsetree = nltk.ParentedTree.fromstring(sentence['parsetree'])
    if not parsetree.leaves():
        # Empty/degenerate parse: features cannot be extracted.
        return 'False', 'N', 0
    connLabel = connClassifier.classify(
        connExtractFeat.getfeatures(parsetree,
                                    range(wordNum, wordNum + skip + 1)))
    return wordString, connLabel, skip
def classifyConnective(sentence, wordNum, connClassifier):
    """Label the candidate connective at position wordNum of *sentence*.

    Returns a (connective_text, label, skip) triple; the sentinel triple
    ('False', 'N', 0) means either no connective matched at wordNum or the
    sentence's parse tree is empty.

    NOTE(review): this is a duplicate of the earlier classifyConnective
    definition in this file -- at import time this later def shadows it.
    """
    no_match = ('False', 'N', 0)
    tree = nltk.ParentedTree.fromstring(sentence['parsetree'])
    matched, skip = matchConnectiveList(sentence['words'], wordNum)
    if matched == 'False':
        # 'False' is the helper's string sentinel for "no match here".
        return no_match
    if tree.leaves() == []:
        return no_match
    token_span = range(wordNum, wordNum + skip + 1)
    features = connExtractFeat.getfeatures(tree, token_span)
    return matched, connClassifier.classify(features), skip
def dataProcess(discourseBank, treeBank, connectiveList):
    """Build (features, label) training pairs for connective classification.

    Walks every token of every document in treeBank in docID order while
    stepping through discourseBank's Explicit relations in parallel
    (dBIterator), comparing character offsets to decide whether a token run
    matched by matchConnectiveList falls inside an Explicit relation's
    connective span (label 'Y') or before it (label 'N').

    Parameters:
        discourseBank: ordered list of relation dicts ('Type', 'DocID',
            'ConnectiveHead', 'Connective' with 'CharacterSpanList'/'TokenList').
        treeBank: dict docID -> {'sentences': [...]}, each sentence holding
            'words' and 'parsetree'.
        connectiveList: unused in the live code below (only referenced in a
            commented-out line) -- presumably matchConnectiveList already
            knows the list; verify against its definition.

    Returns:
        featureSets: list of (feature_dict, 'Y'|'N') pairs.

    NOTE(review): recovered from a whitespace-mangled source; the nesting of
    a few statements (counter updates, the wordNum rewind) is a best-effort
    reconstruction -- confirm against the original file.
    """
    featureSets = []
    docList = treeBank.keys()          # Python 2: keys() returns a list
    docList.sort()                     # process documents in docID order
    totalDiscourses = len(discourseBank)
    lastDiscourse = totalDiscourses - 1
    #print totalDiscourses
    dBIterator = 0                     # index of the relation currently aligned
    oldExplicitIterator = dBIterator
    explicit = 0                       # running count of Explicit relations seen
    oldexplicit = explicit
    i = 0                              # 'Y' examples emitted (diagnostic)
    j = 0                              # matches seen before the connective head (diagnostic)
    k = 0                              # times >1 Explicit relation passed between 'Y' hits (diagnostic)
    for doc in docList:
        sentenceList = treeBank[doc]['sentences']
        for sentenceOffset, sentence in enumerate(sentenceList):
            wordList = sentence['words']
            #print wordList
            lengthWordList = len(wordList)
            wordNum = 0
            # while (not for): wordNum advances by a variable skip and is
            # even rewound when the relation pointer moves.
            while (wordNum < lengthWordList):
                wordStructure = wordList[wordNum]
                word = wordStructure[0]
                word = word.lower()
                #print word
                wordDictionary = wordStructure[1]
                #print wordDictionary
                #if not matchConnectiveList(connectiveList,word):
                #    continue
                relation = discourseBank[dBIterator]
                #print relation['Type']
                # Fast-forward to the next Explicit relation (or stop at end),
                # counting Explicit relations passed along the way.
                while (1):
                    #print 'Consecutive:%d'%dBIterator
                    if (relation['Type'] == 'Explicit' or dBIterator == lastDiscourse):
                        break
                    dBIterator += 1
                    relation = discourseBank[dBIterator]
                    if (relation['Type'] == 'Explicit'):
                        connective = relation['ConnectiveHead']
                        #print 1,explicit,connective
                        explicit += 1
                #print relation['Type']
                # Numeric part of each docID; assumes a fixed 4-char prefix
                # (e.g. 'wsj_') -- TODO confirm the docID format.
                docWord = int(doc[4:])
                docConnective = int(relation['DocID'][4:])
                cOBWord = wordDictionary['CharacterOffsetBegin']
                cOEWord = wordDictionary['CharacterOffsetEnd']
                if relation['Type'] == 'Explicit':
                    connective = relation['ConnectiveHead']
                    connectiveLength = len(relation['Connective']['CharacterSpanList'])
                    # Overall character span of the (possibly discontinuous)
                    # connective: start of first span, end of last span.
                    cOBConnective = relation['Connective']['CharacterSpanList'][0][0]
                    cOEConnective = relation['Connective']['CharacterSpanList'][connectiveLength - 1][1]
                if ((docConnective > docWord) or (docWord == docConnective and cOEWord < cOBConnective)):
                    # Word lies strictly before the current relation's
                    # connective: any match here is a negative ('N') example.
                    # if word=='in':
                    #     print doc,sentenceOffset
                    #     print sentence
                    #     print relation
                    result, skip = matchConnectiveList(wordList, wordNum)
                    #if relation['Type']=='Explicit' and explicit==40:
                    #    print 1,explicit,word,connective
                    if result != 'False':
                        #print 3,word,result,connective
                        label = 'N'
                        tokenNo = range(wordNum, wordNum + skip + 1)
                        #tokenNo=[words[4] for words in tokenNumberLists]
                        tokens = [token[0] for token in wordList]
                        #if word=='either':
                        #    print sentenceList[sentenceOffset-1]
                        #    print word,wordNum
                        #    print tokens
                        #    print relation
                        parsetree = nltk.ParentedTree.fromstring(sentence['parsetree'])
                        if parsetree.leaves() != []:
                            featureSets.append((connExtractFeat.getfeatures(parsetree, tokenNo), label))
                        wordNum += skip
                elif ((docWord == docConnective) and (cOBConnective <= cOBWord and cOEWord <= cOEConnective)):
                    # Word falls inside the connective's character span.
                    # Important: match the potential connectives to the
                    # connective head and not the connective's raw text.
                    l = connective.split()
                    l = [string.lower() for string in l]
                    if (word in l):
                        # A better approach would have been to just match the
                        # character offset beginning and end of the connective.
                        result, skip = matchConnectiveList(wordList, wordNum)
                        if result == 'if then':
                            print 1, word, l, result, connective
                        if result != 'False':
                            label = 'Y'
                            tokenNumberLists = relation['Connective']['TokenList']
                            tokenNo = range(wordNum, wordNum + skip + 1)
                            #tokenNo=[words[4] for words in tokenNumberLists]
                            ##tokens=[token[0] for token in wordList]
                            #print tokens,tokenNo,word
                            #print relation
                            parsetree = nltk.ParentedTree.fromstring(sentence['parsetree'])
                            if parsetree.leaves() != []:
                                # print sentence['parsetree']
                                # print doc,word,sentenceOffset
                                # print tokens
                                # print relation
                                featureSets.append((connExtractFeat.getfeatures(parsetree, tokenNo), label))
                            if (explicit - oldexplicit > 1):
                                # More than one Explicit relation was passed
                                # since the last 'Y' hit.
                                # print explicit
                                # print doc,word,connective
                                # print discourseBank[oldExplicitIterator]
                                # print relation
                                k += 1
                            oldexplicit = explicit
                            oldExplicitIterator = dBIterator
                            #if relation['Type']=='Explicit' and explicit==40:
                            #    print 2,explicit,word,connective
                            #print explicit,word,connective
                            i += 1
                            wordNum += skip
                    else:
                        # These lines are not required; they are the cases in
                        # which a word appears before the connective head.
                        result, skip = matchConnectiveList(wordList, wordNum)
                        #print 2,word,l,result,connective
                        wordNum += skip
                        if result != 'False':
                            label = 'N'
                            j += 1
                            #getFeatureVector()
                elif ((docConnective < docWord) or (docConnective == docWord and cOEConnective < cOBWord)):
                    # Relation lies behind the current word: advance the
                    # relation pointer and re-examine the same word.
                    #if relation['Type']=='Explicit' and explicit==40:
                    #    print 3,explicit,word,connective,sentenceOffset
                    if (dBIterator > totalDiscourses):
                        # Sanity check; "kuch galat hai" is Hindi for
                        # "something is wrong".
                        print 'kuch galat hai'
                    if dBIterator != lastDiscourse:
                        dBIterator += 1
                        relation = discourseBank[dBIterator]
                        if (relation['Type'] == 'Explicit'):
                            #print 2,explicit,connective
                            explicit += 1
                        # Rewind so the unconditional += 1 below leaves wordNum
                        # unchanged and the word is retried on the new relation.
                        wordNum -= 1
                    #print doc,sentenceOffset,wordNum,cOBWord,word
                wordNum += 1
                #print i,dBIterator
    print i, j, k
    return featureSets
def dataProcess(discourseBank, treeBank, connectiveList):
    """Build (features, 'Y'|'N') training pairs for connective classification.

    NOTE(review): duplicate of the earlier dataProcess definition in this
    file (same logic, compact statement style); this later def shadows the
    first at import time. Recovered from a whitespace-mangled source -- the
    nesting of some statements is a best-effort reconstruction.

    Tokens of each treeBank document are aligned against discourseBank's
    Explicit relations by character offsets: a matchConnectiveList hit inside
    a connective's span is labelled 'Y', one occurring before it 'N'.
    Returns featureSets, a list of (feature_dict, label) pairs.
    """
    featureSets = []
    docList = treeBank.keys()          # Python 2 list of docIDs
    docList.sort()
    totalDiscourses = len(discourseBank)
    lastDiscourse = totalDiscourses - 1
    #print totalDiscourses
    dBIterator = 0                     # current relation index
    oldExplicitIterator = dBIterator
    explicit = 0                       # Explicit relations seen so far
    oldexplicit = explicit
    i = 0; j = 0; k = 0                # diagnostic counters (printed at end)
    for doc in docList:
        sentenceList = treeBank[doc]['sentences']
        for sentenceOffset, sentence in enumerate(sentenceList):
            wordList = sentence['words']
            #print wordList
            lengthWordList = len(wordList)
            wordNum = 0
            # while-loop because wordNum is advanced by a variable skip and
            # rewound when the relation pointer moves.
            while(wordNum < lengthWordList):
                wordStructure = wordList[wordNum]
                word = wordStructure[0]
                word = word.lower()
                #print word
                wordDictionary = wordStructure[1]
                #print wordDictionary
                #if not matchConnectiveList(connectiveList,word):
                #    continue
                relation = discourseBank[dBIterator]
                #print relation['Type']
                # Step to the next Explicit relation (or stop at the end).
                while(1):
                    #print 'Consecutive:%d'%dBIterator
                    if(relation['Type'] == 'Explicit' or dBIterator == lastDiscourse):
                        break
                    dBIterator += 1
                    relation = discourseBank[dBIterator]
                    if (relation['Type'] == 'Explicit'):
                        connective = relation['ConnectiveHead']
                        #print 1,explicit,connective
                        explicit += 1
                #print relation['Type']
                # Numeric docID parts; assumes a 4-char prefix -- TODO confirm.
                docWord = int(doc[4:]); docConnective = int(relation['DocID'][4:])
                cOBWord = wordDictionary['CharacterOffsetBegin']
                cOEWord = wordDictionary['CharacterOffsetEnd']
                if relation['Type'] == 'Explicit':
                    connective = relation['ConnectiveHead']
                    connectiveLength = len(relation['Connective']['CharacterSpanList'])
                    # Full character span of the connective (first start, last end).
                    cOBConnective = relation['Connective']['CharacterSpanList'][0][0]
                    cOEConnective = relation['Connective']['CharacterSpanList'][connectiveLength - 1][1]
                if ((docConnective > docWord) or (docWord == docConnective and cOEWord < cOBConnective)):
                    # Word precedes the relation's connective: negative example.
                    # if word=='in':
                    #     print doc,sentenceOffset
                    #     print sentence
                    #     print relation
                    result, skip = matchConnectiveList(wordList, wordNum)
                    #if relation['Type']=='Explicit' and explicit==40:
                    #    print 1,explicit,word,connective
                    if result != 'False':
                        #print 3,word,result,connective
                        label = 'N'
                        tokenNo = range(wordNum, wordNum + skip + 1)
                        #tokenNo=[words[4] for words in tokenNumberLists]
                        tokens = [token[0] for token in wordList]
                        #if word=='either':
                        #    print sentenceList[sentenceOffset-1]
                        #    print word,wordNum
                        #    print tokens
                        #    print relation
                        parsetree = nltk.ParentedTree.fromstring(sentence['parsetree'])
                        if parsetree.leaves() != []:
                            featureSets.append((connExtractFeat.getfeatures(parsetree, tokenNo), label))
                        wordNum += skip
                elif((docWord == docConnective) and (cOBConnective <= cOBWord and cOEWord <= cOEConnective)):
                    # Word is inside the connective's span. Important: match
                    # against the connective head, not the raw connective text.
                    l = connective.split()
                    l = [string.lower() for string in l]
                    if(word in l):
                        # Matching character offsets directly would have been cleaner.
                        result, skip = matchConnectiveList(wordList, wordNum)
                        if result == 'if then':
                            print 1, word, l, result, connective
                        if result != 'False':
                            label = 'Y'
                            tokenNumberLists = relation['Connective']['TokenList']
                            tokenNo = range(wordNum, wordNum + skip + 1)
                            #tokenNo=[words[4] for words in tokenNumberLists]
                            ##tokens=[token[0] for token in wordList]
                            #print tokens,tokenNo,word
                            #print relation
                            parsetree = nltk.ParentedTree.fromstring(sentence['parsetree'])
                            if parsetree.leaves() != []:
                                # print sentence['parsetree']
                                # print doc,word,sentenceOffset
                                # print tokens
                                # print relation
                                featureSets.append((connExtractFeat.getfeatures(parsetree, tokenNo), label))
                            if (explicit - oldexplicit > 1):
                                # >1 Explicit relation passed since last 'Y' hit.
                                # print explicit
                                # print doc,word,connective
                                # print discourseBank[oldExplicitIterator]
                                # print relation
                                k += 1
                            oldexplicit = explicit
                            oldExplicitIterator = dBIterator
                            #if relation['Type']=='Explicit' and explicit==40:
                            #    print 2,explicit,word,connective
                            #print explicit,word,connective
                            i += 1
                            wordNum += skip
                    else:
                        # These lines are not required; they are the cases in
                        # which a word appears before the connective head.
                        result, skip = matchConnectiveList(wordList, wordNum)
                        #print 2,word,l,result,connective
                        wordNum += skip
                        if result != 'False':
                            label = 'N'
                            j += 1
                            #getFeatureVector()
                elif((docConnective < docWord) or (docConnective == docWord and cOEConnective < cOBWord)):
                    # Relation lies behind the word: advance the relation
                    # pointer and retry the same word.
                    #if relation['Type']=='Explicit' and explicit==40:
                    #    print 3,explicit,word,connective,sentenceOffset
                    if (dBIterator > totalDiscourses):
                        # Sanity check; Hindi for "something is wrong".
                        print 'kuch galat hai'
                    if dBIterator != lastDiscourse:
                        dBIterator += 1
                        relation = discourseBank[dBIterator]
                        if (relation['Type'] == 'Explicit'):
                            #print 2,explicit,connective
                            explicit += 1
                        # Cancel the unconditional += 1 below so the current
                        # word is re-examined against the new relation.
                        wordNum -= 1
                    #print doc,sentenceOffset,wordNum,cOBWord,word
                wordNum += 1
                #print i,dBIterator
    print i, j, k
    return featureSets