def build(tree_head_dict):
    """Print one feature line per usable PropBank argument.

    Walks every instance returned by ``propbank_ptb.instances()``.  An
    instance is skipped when its string form contains ``*`` or ``,``
    (presumably chain / split tree pointers -- TODO confirm against the
    corpus reader).  For each remaining instance the predicate subtree and
    the full tree are re-parsed with ``parseExpr``, functional tags are
    stripped with ``remove_functional_tags``, and each argument is located
    in the parsed tree with ``traverse_tree_depth``.  One line of
    space-separated ``name=value`` features, terminated by the argument
    label, is printed for every argument kept.

    Args:
        tree_head_dict: Passed unchanged to ``extract_head`` to look up the
            head word and head POS of each argument.

    Returns:
        None.  All output goes to standard output.
    """
    for inst in propbank_ptb.instances():
        inst_text = str(inst)
        if '*' in inst_text or ',' in inst_text:
            continue

        tree = inst.tree
        pred, _ = parseExpr(str(inst.predicate.select(tree)), 0, 0)
        parsed, _ = parseExpr(str(tree), 0, 0)
        remove_functional_tags(parsed)

        # Collect (argument node, label) pairs for this instance.
        arguments = []
        for argloc, argid in inst.arguments:
            # Skip arguments whose selected subtree has '*' before its first
            # ')' (presumably trace / empty elements -- TODO confirm).
            if '*' in str(argloc.select(tree)).split(')')[0]:
                continue
            # str(argloc) has the form "<word number>:<height>".
            pointer = str(argloc).split(':')
            arg = traverse_tree_depth(parsed, int(pointer[0]), int(pointer[1]))
            arguments.append((arg, argid))

        t_word = pred.word
        pred_parent = find_pred_parrent(parsed, inst.predicate.wordnum, None)
        subcat = find_subcat(pred_parent)

        for arg, label in arguments:
            path = ''.join(get_path(arg, parsed, inst.predicate.wordnum))
            head, head_pos = extract_head(arg, tree_head_dict)
            pt = arg.data.rstrip()
            t_word_pls_pt = str(t_word) + str(pt)
            t_word_pls_h_word = str(t_word) + str(head)
            subcat_star = find_subcat(arg.parent)
            subcat_at = find_subcat(arg)

            # NOTE(review): 'h='/'h_pos=' carry the same values as
            # 'h_word='/'h_word_pos=', and the predicate word only appears
            # inside the combined features -- confirm this is intended.
            # A single parenthesised expression prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(
                'h=' + str(head)
                + ' h_pos=' + str(head_pos)
                + ' h_word=' + str(head)
                + ' h_word_pos=' + str(head_pos)
                + ' path=' + str(path)
                + ' t_word_pls_pt=' + t_word_pls_pt
                + ' t_word_pls_h_word=' + t_word_pls_h_word
                + ' subcat=' + str(subcat)
                + ' subcatAt=' + str(subcat_at)
                + ' subcatStar=' + str(subcat_star)
                + ' ' + label
            )
from argext import *
from nltk.corpus import propbank_ptb
from math import floor
from sys import stdout

# Script entry point: extract ARGInstances from the PropBank corpus.
if __name__ == '__main__':
    print('\n- acquiring experiment data -\n')

    # vars
    exp_name = 'SemanticArgumentClassification'  # the experiment's name
    # the output filenames
    files = [exp_name + '_data_train.arff',
             exp_name + '_data_dev.arff',
             exp_name + '_data_test.arff']
    ratios = [0.6, 0.2, 0.2]  # their corresponding ratios (60%, 20%, 20%)
    pbi_ratio = 1.  # ratio of the total PropBank corpus to acquire data from

    # init
    pbi = propbank_ptb.instances()
    featurelist = ['predicate', 'path', 'phraseType', 'position', 'voice', 'class']
    # initialize ARGInstanceBuilder with featurelist
    arg_the_builder = ARGInstanceBuilder(dict.fromkeys(featurelist))
    arglist = []  # arglist for the extracted ARGInstances

    # extract ARGInstances
    # int() guarantees range() receives an integer even where floor()
    # returns a float.
    pbi_ratio_index = int(floor(len(pbi) * pbi_ratio))
    for i in range(pbi_ratio_index):
        # Refresh the in-place progress indicator every 20 instances.
        if (i % 20) == 0:
            stdout.write("\rextracting ARGInstances...%.2f%%" % (i * 100 / pbi_ratio_index))
            stdout.flush()
        try:
            # add extracted ARGInstances from current Propbank Instance to arglist
            arglist += arg_the_builder.get_arginstances(pbi[i])
        except Exception:
            # Best effort: report the failing instance and carry on.  Catching
            # Exception rather than everything keeps Ctrl-C and SystemExit
            # able to stop this long-running loop.
            print("Error at PropBankInstance with index : " + str(i))
    stdout.write("\rextracting ARGInstances...done \n")
def build(tree_head_dict):
    """Print one feature line, including context labels, per argument.

    Walks every instance returned by ``propbank_ptb.instances()`` and keeps
    those whose section number (the second ``/``-separated field of
    ``str(inst)``) lies strictly between 1 and 22 -- the training split for
    the CoNLL 2005 task according to the original comment, which named
    section 23 for test data.  Instances whose string form contains ``*`` or
    ``,`` are skipped (presumably chain / split tree pointers -- TODO
    confirm).

    For each kept argument a line of space-separated ``name=value`` features
    is printed, ending with the argument label.  ``context_labels`` holds the
    gold labels of all *other* kept arguments of the same instance, joined
    with ``:``.

    Args:
        tree_head_dict: Passed unchanged to ``extract_head`` to look up the
            head word and head POS of each argument.

    Returns:
        None.  All output goes to standard output.
    """
    for inst in propbank_ptb.instances():
        inst_text = str(inst)
        # Plain decimal bounds: the old ``01`` literal is a syntax error on
        # Python 3.
        section = int(inst_text.split('/')[1])
        if not 1 < section < 22:
            continue
        if '*' in inst_text or ',' in inst_text:
            continue

        tree = inst.tree
        pred, _ = parseExpr(str(inst.predicate.select(tree)), 0, 0)
        parsed, _ = parseExpr(str(tree), 0, 0)

        # Collect (argument node, word number, label) triples.
        arguments = []
        for argloc, argid in inst.arguments:
            # Skip arguments whose selected subtree has '*' before its first
            # ')' (presumably trace / empty elements -- TODO confirm).
            if '*' in str(argloc.select(tree)).split(')')[0]:
                continue
            # str(argloc) has the form "<word number>:<height>".
            pointer = str(argloc).split(':')
            word_num = int(pointer[0])
            arg = traverse_tree_depth(parsed, word_num, int(pointer[1]))
            arguments.append((arg, word_num, argid))

        t_word = pred.word
        t_w_pos = pred.data
        pred_parent = find_pred_parrent(parsed, inst.predicate.wordnum, None)
        subcat = find_subcat(pred_parent)
        # Word and POS of the predicate's parent node.
        parent_word = pred_parent.word if pred_parent.word is not None else 'none'
        parent_word_pos = pred_parent.data.rstrip()

        # Gold labels of the kept arguments, in argument order.
        gold_labels = [argid for _, _, argid in arguments]

        # NOTE(review): the collapsed source was ambiguous about whether the
        # per-argument counter increment was active; it is treated as active,
        # since otherwise every row would omit only the first label.
        for index, (arg, word_num, label) in enumerate(arguments):
            path = ''.join(get_path(arg, parsed, inst.predicate.wordnum))
            head, head_pos = extract_head(arg, tree_head_dict)
            pt = arg.data.rstrip()
            t_word_pls_pt = str(t_word) + str(pt)
            t_word_pls_h_word = str(t_word) + str(head)
            subcat_star = find_subcat(arg.parent)
            subcat_at = find_subcat(arg)
            position = 'before' if word_num < inst.wordnum else 'after'
            # Every gold label except this argument's own.
            context_labels = ':'.join(gold_labels[:index] + gold_labels[index + 1:])

            # A single parenthesised expression prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(
                't_word=' + str(t_word)
                + ' t_w_pos=' + str(t_w_pos)
                + ' h_word=' + str(head)
                + ' h_word_pos=' + str(head_pos)
                + ' path=' + str(path)
                + ' t_word_pls_pt=' + t_word_pls_pt
                + ' t_word_pls_h_word=' + t_word_pls_h_word
                + ' subcat=' + str(subcat)
                + ' subcatAt=' + str(subcat_at)
                + ' subcatStar=' + str(subcat_star)
                + ' pt=' + pt
                + ' position=' + position
                + ' ParentWord=' + parent_word
                + ' ParentWordPos=' + parent_word_pos
                + ' context_labels=' + context_labels
                + ' ' + label
            )