예제 #1
0
def constructFeatures(discourseBank, treeBank):
    """Build (features, label) training pairs from the explicit relations.

    Only relations whose ``'Type'`` is ``'Explicit'`` are considered.  The
    label is derived from element 3 of every Arg1/Arg2 token entry (the same
    field is used further down as the sentence index of the connective):

    * ``'PS'`` when the largest Arg1 value is smaller than the smallest Arg2
      value (presumably "previous sentence" -- confirm with the consumer of
      these labels);
    * ``'SS'`` when Arg1 and Arg2 each have exactly one distinct value and
      the two values are equal (presumably "same sentence").

    A relation that matches neither rule, or whose Arg1/Arg2 token list is
    empty, is skipped.  The label is computed afresh for every relation, so
    it can never be unbound or carried over from an earlier relation.

    Args:
        discourseBank: iterable of relation dicts providing the keys
            ``'Type'``, ``'Arg1'``, ``'Arg2'``, ``'Connective'`` and
            ``'DocID'``.
        treeBank: mapping indexed as
            ``treeBank[DocID]['sentences'][i]['parsetree']`` that yields a
            bracketed parse string.

    Returns:
        A list of ``(features, label)`` tuples, where ``features`` is the
        value returned by ``argPositionFeat.getFeatures``.
    """
    featureSet = []
    for relation in discourseBank:
        if relation['Type'] != 'Explicit':
            continue

        arg1Sentences = sorted(
            {token[3] for token in relation['Arg1']['TokenList']})
        arg2Sentences = sorted(
            {token[3] for token in relation['Arg2']['TokenList']})
        if not arg1Sentences or not arg2Sentences:
            # Nothing to compare; no position label can be derived.
            continue

        # The two rules are mutually exclusive: 'SS' needs equal values,
        # 'PS' needs a strict "less than".
        label = None
        if arg1Sentences[-1] < arg2Sentences[0]:
            label = 'PS'
        elif len(arg1Sentences) == 1 and arg1Sentences == arg2Sentences:
            label = 'SS'
        if label is None:
            # Neither PS nor SS: emit no training example for this relation.
            continue

        connective = relation['Connective']['TokenList']
        connectiveRawText = relation['Connective']['RawText']
        # The parse tree is taken from the sentence of the first connective
        # token; element 4 of each token entry is passed on as the
        # connective's token positions.
        sentenceOffset = connective[0][3]
        connectiveTokens = [token[4] for token in connective]

        doc = relation['DocID']
        ptree = treeBank[doc]['sentences'][sentenceOffset]['parsetree']
        ptree = nltk.ParentedTree.fromstring(ptree)
        if ptree.leaves():
            featureSet.append(
                (argPositionFeat.getFeatures(ptree, connectiveRawText,
                                             connectiveTokens), label))
    return featureSet
def constructFeatures(discourseBank,treeBank):
    """Collect argument-position training examples for explicit relations.

    For every relation with ``relation['Type'] == 'Explicit'`` a label is
    computed from element 3 of the Arg1 and Arg2 token entries (that field
    is also used below as the sentence index of the connective):

    * ``'PS'`` -- every Arg1 value is strictly smaller than every Arg2 value
      (presumably "previous sentence"; confirm with the label consumer);
    * ``'SS'`` -- both arguments have a single distinct value and it is the
      same one (presumably "same sentence").

    Relations that fit neither rule, or that have an empty Arg1/Arg2 token
    list, produce no example.  The label is recomputed for each relation
    and is therefore never unbound nor inherited from a previous relation.

    Args:
        discourseBank: iterable of relation dicts with the keys ``'Type'``,
            ``'Arg1'``, ``'Arg2'``, ``'Connective'`` and ``'DocID'``.
        treeBank: mapping indexed as
            ``treeBank[DocID]['sentences'][i]['parsetree']`` giving a
            bracketed parse string.

    Returns:
        List of ``(features, label)`` tuples; ``features`` is whatever
        ``argPositionFeat.getFeatures`` returns.
    """
    featureSet = []
    for relation in discourseBank:
        if relation['Type'] != 'Explicit':
            continue

        arg1Values = {entry[3] for entry in relation['Arg1']['TokenList']}
        arg2Values = {entry[3] for entry in relation['Arg2']['TokenList']}
        if not arg1Values or not arg2Values:
            # An empty argument gives nothing to compare.
            continue

        if max(arg1Values) < min(arg2Values):
            label = 'PS'
        elif len(arg1Values) == 1 and arg1Values == arg2Values:
            label = 'SS'
        else:
            # Neither rule applies: skip rather than reuse an old label.
            continue

        connective = relation['Connective']['TokenList']
        connectiveRawText = relation['Connective']['RawText']
        # Parse tree of the sentence holding the first connective token;
        # element 4 of each entry is forwarded as the token positions.
        sentenceOffset = connective[0][3]
        connectiveTokens = [entry[4] for entry in connective]

        doc = relation['DocID']
        ptree = treeBank[doc]['sentences'][sentenceOffset]['parsetree']
        ptree = nltk.ParentedTree.fromstring(ptree)
        if ptree.leaves():
            features = argPositionFeat.getFeatures(
                ptree, connectiveRawText, connectiveTokens)
            featureSet.append((features, label))
    return featureSet
예제 #3
0
def _alignWordsToLeaves(leaves, words):
    """Return, for each word in ``words``, the index of a matching leaf.

    Leaves are consumed left to right: each word is searched for only after
    the position matched for the previous word, so repeated words map to
    successive leaves.  The starting point is found by locating the first
    two words as a substring of the space-joined leaves; when they are not
    found the search starts at the first leaf.

    Raises:
        ValueError: if a word does not occur among the remaining leaves.
    """
    position = -1
    sentenceText = ' '.join(leaves)
    charOffset = sentenceText.find(' '.join(words[:2]))
    if charOffset != -1:
        # Number of whitespace-separated pieces before the match, minus one,
        # i.e. the leaf just before the one where the match begins.
        position = len(sentenceText[:charOffset].split()) - 1
        # NOTE(review): kept from the original.  A match starting at leaf 1
        # gives position 0, which is reset so the search restarts at leaf 0;
        # confirm whether that is intended.
        if position == 0:
            position = -1

    indices = []
    for word in words:
        position += leaves[position + 1:].index(word) + 1
        indices.append(position)
    return indices


def classifyOther(sentence, wordString, wordNum, skip, argPosClassifier,
                  senseClassifier, argClassifier):
    """Classify one connective occurrence and locate its two arguments.

    Args:
        sentence: dict whose ``'parsetree'`` entry is a bracketed parse
            string; the dict is also passed to
            ``explicitSenseFeat.featureExtraction``.
        wordString: text of the connective; surrounding whitespace is
            stripped before use.
        wordNum: leaf position of the first connective word.
        skip: number of further leaves covered by the connective; the
            positions ``wordNum .. wordNum + skip`` are used.
        argPosClassifier: object with ``classify(features)`` used for the
            argument-position label.
        senseClassifier: object with ``classify(features)`` used for the
            sense label.
        argClassifier: passed through to ``finalArgsExtractor.argsExtract``.

    Returns:
        ``(argPosLabel, senseLabel, arg1, arg2)`` where ``arg1`` and
        ``arg2`` are lists of leaf indices for the words of the extracted
        arguments.  When the parse tree has no leaves nothing can be
        classified and ``(None, None, [], [])`` is returned.

    Raises:
        ValueError: if a word of an extracted argument cannot be found
            among the parse-tree leaves.
    """
    parsetree = nltk.ParentedTree.fromstring(sentence['parsetree'])
    if not parsetree.leaves():
        # Without leaves none of the results below can be computed; return
        # neutral values instead of reading unbound names.
        return None, None, [], []

    leaf_index = range(wordNum, wordNum + skip + 1)
    c = wordString.strip()

    fv1 = argPositionFeat.getFeatures(parsetree, c, leaf_index)
    argPosLabel = argPosClassifier.classify(fv1)

    argDict = finalArgsExtractor.argsExtract(argClassifier, parsetree,
                                             leaf_index)

    # Leaves are read after argument extraction, as in the original order.
    leaves = parsetree.leaves()
    arg1words = argDict['arg1'].split()
    arg2words = argDict['arg2'].split()
    arg1 = _alignWordsToLeaves(leaves, arg1words)
    arg2 = _alignWordsToLeaves(leaves, arg2words)

    fv2 = explicitSenseFeat.featureExtraction(parsetree, leaf_index,
                                              sentence, argDict, c)
    senseLabel = senseClassifier.classify(fv2)
    return argPosLabel, senseLabel, arg1, arg2
def _alignWordsToLeaves(leaves, words):
    """Map every word in ``words`` to the index of a matching leaf.

    The search moves strictly forward: a word is looked up only in the
    leaves after the one matched for the previous word.  The first two
    words, joined by a space, are looked up in the space-joined leaves to
    choose where the search starts; if they do not occur, it starts at the
    first leaf.

    Raises:
        ValueError: if a word is not present in the remaining leaves.
    """
    position = -1
    sentenceText = ' '.join(leaves)
    charOffset = sentenceText.find(' '.join(words[:2]))
    if charOffset != -1:
        # Count of whitespace-separated pieces before the match, minus one.
        position = len(sentenceText[:charOffset].split()) - 1
        # NOTE(review): kept from the original.  Position 0 (match starting
        # at leaf 1) is reset, so the search restarts from leaf 0; confirm
        # this is intended.
        if position == 0:
            position = -1

    indices = []
    for word in words:
        position += leaves[position + 1:].index(word) + 1
        indices.append(position)
    return indices


def classifyOther(sentence,wordString,wordNum,skip,argPosClassifier,senseClassifier,argClassifier):
    """Label a connective's argument position and sense, and find its args.

    Args:
        sentence: dict with a ``'parsetree'`` entry holding a bracketed
            parse string; also handed to
            ``explicitSenseFeat.featureExtraction``.
        wordString: connective text, stripped of surrounding whitespace
            before use.
        wordNum: leaf position of the first connective word.
        skip: extra leaves spanned by the connective; leaf positions
            ``wordNum .. wordNum + skip`` are used.
        argPosClassifier: object exposing ``classify(features)``; produces
            the argument-position label.
        senseClassifier: object exposing ``classify(features)``; produces
            the sense label.
        argClassifier: forwarded to ``finalArgsExtractor.argsExtract``.

    Returns:
        ``(argPosLabel, senseLabel, arg1, arg2)``; ``arg1`` and ``arg2`` are
        lists of leaf indices of the extracted argument words.  If the parse
        tree has no leaves, ``(None, None, [], [])`` is returned.

    Raises:
        ValueError: if an extracted argument word is not found among the
            parse-tree leaves.
    """
    parsetree = nltk.ParentedTree.fromstring(sentence['parsetree'])
    if not parsetree.leaves():
        # Nothing to classify; avoid returning names that were never bound.
        return None, None, [], []

    leaf_index = range(wordNum, wordNum + skip + 1)
    c = wordString.strip()

    fv1 = argPositionFeat.getFeatures(parsetree, c, leaf_index)
    argPosLabel = argPosClassifier.classify(fv1)

    argDict = finalArgsExtractor.argsExtract(argClassifier, parsetree, leaf_index)

    # Read the leaves only after argument extraction, as the original did.
    leaves = parsetree.leaves()
    arg1words = argDict['arg1'].split()
    arg2words = argDict['arg2'].split()
    arg1 = _alignWordsToLeaves(leaves, arg1words)
    arg2 = _alignWordsToLeaves(leaves, arg2words)

    fv2 = explicitSenseFeat.featureExtraction(parsetree, leaf_index, sentence, argDict, c)
    senseLabel = senseClassifier.classify(fv2)
    return argPosLabel, senseLabel, arg1, arg2