def build(tree_head_dict):
	"""Print one classifier feature line per (argument, label) pair for
	every PropBank instance whose string form contains no traces ('*')
	and no split constituents (',').

	tree_head_dict -- mapping consumed by extract_head() to look up the
	head word and head POS of an argument subtree (exact schema defined
	in argext -- presumably subtree string -> (word, pos); confirm there).

	Side effect only: writes feature lines to stdout, returns None.
	"""
	pb_instances = propbank_ptb.instances()
	for inst in pb_instances:
		# The simple string parser below cannot handle traces or
		# discontinuous arguments, so such instances are skipped entirely.
		if str(inst).find('*') == -1 and str(inst).find(',') == -1:
			arguments = []
			tree = inst.tree
			pred_tree = inst.predicate.select(tree)
			(pred, r) = parseExpr(str(pred_tree), 0, 0)
			(parsed, r) = parseExpr(str(tree), 0, 0)
			remove_functional_tags(parsed)
			for (argloc, argid) in inst.arguments:
				# Keep only arguments whose first constituent is not a trace.
				if str(argloc.select(tree)).split(')')[0].find('*') == -1:
					# argloc prints as "wordnum:height".
					wordNum = int(str(argloc).split(':')[0])
					h = int(str(argloc).split(':')[1])
					arg = traverse_tree_depth(parsed, wordNum, h)
					arguments.append((arg, argid))
			t_word = pred.word   # predicate (target) word
			t_w_pos = pred.data  # predicate POS tag
			pred_parrent = find_pred_parrent(parsed, inst.predicate.wordnum, None)
			subcat = find_subcat(pred_parrent)
			for (arg, label) in arguments:
				path_list = get_path(arg, parsed, inst.predicate.wordnum)
				(h, h_pos) = extract_head(arg, tree_head_dict)
				path = ''.join(path_list)
				pt = arg.data.rstrip()  # phrase type of the argument node
				t_word_pls_pt = str(t_word) + str(pt)
				t_word_pls_h_word = str(t_word) + str(h)
				subcatStar = find_subcat(arg.parent)
				subcatAt = find_subcat(arg)
				# BUG FIX: the first two fields used to print the head
				# word/POS a second time under the labels 'h'/'h_pos';
				# the later variant of build() in this file shows they
				# should be the predicate word and POS (t_word / t_w_pos).
				print('t_word='+str(t_word)+' t_w_pos='+str(t_w_pos)+' h_word='+str(h)+' h_word_pos='+str(h_pos)+' path='+str(path)+' t_word_pls_pt='+t_word_pls_pt+' t_word_pls_h_word='+t_word_pls_h_word+' subcat='+str(subcat)+' subcatAt='+str(subcatAt)+' subcatStar='+str(subcatStar)+' '+label)
from argext import *
from nltk.corpus import propbank_ptb
from math import floor
from sys import stdout

if __name__ == '__main__' :
	print('\n- acquiring experiment data -\n')

	# vars
	exp_name = 'SemanticArgumentClassification' # the experiment's name
	files = [exp_name + '_data_train.arff', exp_name + '_data_dev.arff', exp_name + '_data_test.arff'] # the output filenames
	ratios = [0.6, 0.2, 0.2] # their corresponding ratios (60%, 20%, 20%)
	pbi_ratio = 1. # ratio of the total PropBank corpus to acquire data from
	# init
	pbi = propbank_ptb.instances()
	featurelist = ['predicate', 'path', 'phraseType', 'position', 'voice', 'class'] # initialize ARGInstanceBuilder with featurelist
	arg_the_builder = ARGInstanceBuilder(dict.fromkeys(featurelist))
	arglist = [] # arglist for the extracted ARGInstances

	# extract ARGInstances
	# BUG FIX: math.floor returns a float on Python 2, which range()
	# only tolerates with a deprecation warning (and rejects on Python 3).
	pbi_ratio_index = int(floor(len(pbi) * pbi_ratio))
	for i in range(pbi_ratio_index) :
		if (i % 20) == 0 :
			# 100.0 keeps the division in floats now that the bound is an int
			stdout.write("\rextracting ARGInstances...%.2f%%" % (i * 100.0 / pbi_ratio_index))
			stdout.flush()
		try :
			arglist += arg_the_builder.get_arginstances(pbi[i]) # add extracted ARGInstances from current Propbank Instance to arglist
		except Exception : # BUG FIX: bare except also trapped KeyboardInterrupt/SystemExit
			print("Error at PropBankInstance with index : " + str(i))
	stdout.write("\rextracting ARGInstances...done   \n")
def build(tree_head_dict):
	"""Print one feature line (including context labels) per argument for
	PropBank instances from the CoNLL-2005 training split.

	tree_head_dict -- mapping consumed by extract_head() to look up the
	head word and head POS of an argument subtree.

	Only instances from WSJ sections 02-21 (the CoNLL-2005 training
	sections) whose string form contains no traces ('*') and no split
	constituents (',') are used.  Writes to stdout, returns None.
	"""
	pb_instances = propbank_ptb.instances()
	##predicted_context_labels = read_without_context_labels() ##
	##f = 0 ##
	##t = 0 ##
	for inst in pb_instances:
		# The WSJ section number is the second '/'-component of the
		# instance's file id.
		# BUG FIX: the lower bound was written as the deprecated octal
		# literal 01 (a syntax error on Python 3); its value is 1.
		section = int(str(inst).split('/')[1])
		if section <= 1 or section >= 22:	# training data: sections 02-21 (CoNLL 2005)
			continue
		#if section != 23: continue		# testing data: section 23 (CoNLL 2005)
		if str(inst).find('*') != -1 or str(inst).find(',') != -1:
			continue	# parser below cannot handle traces / discontinuous args
		arguments = []
		tree = inst.tree
		pred_tree = inst.predicate.select(tree)
		(pred, r) = parseExpr(str(pred_tree), 0, 0)
		(parsed, r) = parseExpr(str(tree), 0, 0)
		#remove_functional_tags(parsed)
		gold_context_labels_list = []
		for (argloc, argid) in inst.arguments:
			# Keep only arguments whose first constituent is not a trace.
			if str(argloc.select(tree)).split(')')[0].find('*') == -1:
				# argloc prints as "wordnum:height".
				wordNum = int(str(argloc).split(':')[0])
				h = int(str(argloc).split(':')[1])
				arg = traverse_tree_depth(parsed, wordNum, h)
				arguments.append((arg, wordNum, argid))
				gold_context_labels_list.append(argid)
		t_word = pred.word   # predicate (target) word
		t_w_pos = pred.data  # predicate POS tag
		pred_parrent = find_pred_parrent(parsed, inst.predicate.wordnum, None)
		subcat = find_subcat(pred_parrent)
		if pred_parrent.word != None:
			ParentWord = pred_parrent.word # word of the parrent node of pred
		else:
			ParentWord = 'none'
		ParentWordPos = pred_parrent.data.rstrip() # pos of the parrent node of pred
		# Gold context labels (training mode).  For testing, switch the
		# commented ## lines on to take the labels predicted by the
		# context-free model instead.
		context_labels_all = gold_context_labels_list # for training
		##t = t + len(arguments) ##
		##context_labels_all = predicted_context_labels[f:t] ## for testing
		##f = t ##
		d = 0
		for (arg, wordNum, label) in arguments:
			path_list = get_path(arg, parsed, inst.predicate.wordnum)
			(h, h_pos) = extract_head(arg, tree_head_dict)
			path = ''.join(path_list)
			pt = arg.data.rstrip()  # phrase type of the argument node
			t_word_pls_pt = str(t_word) + str(pt)
			t_word_pls_h_word = str(t_word) + str(h)
			subcatStar = find_subcat(arg.parent)
			subcatAt = find_subcat(arg)
			position = 'before' if wordNum < inst.wordnum else 'after'
			# Context labels: the gold labels of every OTHER argument of
			# this predicate, joined with ':'.
			context_labels = ':'.join([context_labels_all[i] for i in range(len(context_labels_all)) if i != d])
			d = d + 1
			# with context labels
			print('t_word='+str(t_word)+' t_w_pos='+str(t_w_pos)+' h_word='+str(h)+' h_word_pos='+str(h_pos)+' path='+str(path)+' t_word_pls_pt='+t_word_pls_pt+' t_word_pls_h_word='+t_word_pls_h_word+' subcat='+str(subcat)+' subcatAt='+str(subcatAt)+' subcatStar='+str(subcatStar)+' pt='+pt+' position='+position+' ParentWord='+ParentWord+' ParentWordPos='+ParentWordPos+' context_labels='+context_labels+' '+label)