def average_and_std_type(all_results, index):
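    """Aggregate the metrics stored at position `index` across runs.

    Stacks a_o_e, a_o_t and the two policy-difference traces from every run
    in `all_results`, attaches their element-wise mean and standard
    deviation, and reduces the collected e_on_e / e_o_t values to their
    scalar means.
    """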
    avg = EmptyObject()

    for n, run in enumerate(all_results):
        if n == 0:
            avg.a_o_e = np.array(run[index].a_o_e)
            avg.a_o_t = np.array(run[index].a_o_t)
            avg.policy_diff1 = run[index].policy_diff1
            avg.policy_diff2 = run[index].policy_diff2
            avg.e_on_e = [run[index].e_on_e]
            avg.e_o_t = [run[index].e_o_t]
        else:
            avg.a_o_e = np.vstack((avg.a_o_e, np.array(run[index].a_o_e)))
            avg.a_o_t = np.vstack((avg.a_o_t, np.array(run[index].a_o_t)))
            avg.policy_diff1 = np.vstack(
                (avg.policy_diff1, np.array(run[index].policy_diff1)))
            avg.policy_diff2 = np.vstack(
                (avg.policy_diff2, np.array(run[index].policy_diff2)))
            avg.e_on_e.append(run[index].e_on_e)
            avg.e_o_t.append(run[index].e_o_t)
    avg.mean_a_o_e = np.mean(avg.a_o_e, axis=0)
    avg.std_a_o_e = np.std(avg.a_o_e, axis=0)
    avg.mean_a_o_t = np.mean(avg.a_o_t, axis=0)
    avg.std_a_o_t = np.std(avg.a_o_t, axis=0)
    avg.mean_policy_diff1 = np.mean(avg.policy_diff1, axis=0)
    avg.std_policy_diff1 = np.std(avg.policy_diff1, axis=0)
    avg.mean_policy_diff2 = np.mean(avg.policy_diff2, axis=0)
    avg.std_policy_diff2 = np.std(avg.policy_diff2, axis=0)
    avg.e_on_e = np.mean(avg.e_on_e)
    avg.e_o_t = np.mean(avg.e_o_t)
    return avg
def learn_from_failure(expert1,
                       expert2,
                       apprentice,
                       iterations,
                       steps,
                       initial_states,
                       test_states,
                       failure="false",
                       initial_bad_states=None):
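    """Train the apprentice against two experts.

    Each iteration runs `inference` for the apprentice and, depending on
    `failure`, updates the reward weights `apprentice.w` and the constraint
    weights `apprentice.zeta` from the feature-expectation gaps to expert1
    and expert2 (the taboo / failure demonstrations).  Supported modes:
    "false" (match expert1 only), "L1", "slow", "cvx", "sign" and "only".
    Per-iteration evaluations and policy differences are collected in the
    returned EmptyObject.
    """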
    #initialise the lagrange multipliers to 1
    print "INITIALISED LEARNING. LEARNING FROM FAILURE = ", failure
    direc = "results/"
    fn.make_dir(direc)
    # learning rate
    rate = 0.08
    rate2 = 0.08
    C = 5.0
    D = .7
    delta_c = .96
    delay = 0
    disc = expert1.disc

    a, s, f = expert1.feature_f.shape
    #experts
    exp1_policy, ignore, exp1_state_exp, exp1_all = inference(expert1,
                                                              steps,
                                                              initial_states,
                                                              discount=0.9)
    if initial_bad_states is None:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_states, discount=0.9)
    else:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_bad_states, discount=0.9)
    #print "POLICYY", exp1_policy.shape

    exp1_feature_avg = np.dot(exp1_state_exp.reshape(s * a, order="F"),
                              expert1.feature_f.reshape(s * a, f, order="F"))
    exp2_feature_avg = np.dot(exp2_state_exp.reshape(s * a, order="F"),
                              expert2.feature_f.reshape(s * a, f, order="F"))

    e_on_e = eval_value(expert1.w, exp1_policy, expert1, test_states, steps)
    t_o_t = eval_value(expert2.w, exp2_policy, expert2, test_states, steps)
    expert_on_taboo = eval_value(expert2.w, exp1_policy, expert2, test_states,
                                 steps)
    z_stat = None

    #initiate results structure
    results = EmptyObject()
    results.a_o_e = []
    results.a_o_t = []
    results.policy_diff1 = []
    results.policy_diff2 = []
    results.e_on_e = e_on_e
    results.t_o_t = t_o_t
    results.e_o_t = expert_on_taboo

    for i in range(iterations):
        apprentice_policy, z_stat, a_state_exp, a_all = inference(
            apprentice, steps, initial_states, z_states=None, discount=0.9)
        apprentice_feature_avg = np.dot(
            a_state_exp.reshape(s * a, order="F"),
            apprentice.feature_f.reshape(s * a, f, order="F"))
        difference_exp1 = exp1_feature_avg - apprentice_feature_avg
        if initial_bad_states is None:
            difference_exp2 = exp2_feature_avg - apprentice_feature_avg
        else:
            apprentice_policy, z_stat, a_state_exp_bad, a_all = inference(
                apprentice,
                steps,
                initial_bad_states,
                z_states=None,
                discount=0.9)
            apprentice_feature_avg_bad = np.dot(
                a_state_exp_bad.reshape(s * a, order="F"),
                apprentice.feature_f.reshape(s * a, f, order="F"))
            difference_exp2 = apprentice_feature_avg_bad - exp2_feature_avg
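        # keep the first-iteration mismatch with expert2; the "sign" mode reuses it as difference_random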
        if i == 0:
            difference_random = np.copy(difference_exp2)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg * 0
        # update the reward weights w and the constraint weights zeta
        if failure == "L1":
            apprentice.w = apprentice.w + rate * difference_exp1
            #apprentice.zeta = apprentice.zeta + rate2*(difference_exp2+ D*apprentice.zeta)
            apprentice.zeta = 0.9 * difference_exp2
        elif failure == "false":
            apprentice.w = apprentice.w + rate * difference_exp1
        elif failure == "slow":
            apprentice.w = apprentice.w + rate * difference_exp1
            C = C * delta_c
            if 1. / C > D:
                C = 1 / D
            if i > delay:
                apprentice.zeta = -difference_exp2 / (C)
            #print "ZETAAA",apprentice.zeta
            #print "-------------------------------------------"
        elif failure == "cvx":
            delay = 0
            apprentice.w = apprentice.w + rate * difference_exp1
            #sings = difference_random*difference_exp2
            #print sings
            #idx = np.where(sings < 0)
            #difference_exp2[idx]=0
            rho = 0.01
            #if rho>0.8:
            #	rho=0.8
            #apprentice.zeta = apprentice.zeta + rate2*(difference_exp2+ D*apprentice.zeta)
            if i > delay:
                apprentice.zeta = 0.9 * (apprentice_feature_avg_bad_prev -
                                         rho * apprentice_feature_avg_bad +
                                         (rho - 1) * exp2_feature_avg)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg_bad
            #apprentice.zeta = difference_random - 0.2*difference_exp2
        elif failure == "sign":
            apprentice.w = apprentice.w + rate * difference_exp1
            rho = 0.01
            apprentice.zeta = np.sign(difference_random)
        elif failure == "only":
            apprentice.zeta = apprentice.zeta - rate2 * (difference_exp2 +
                                                         D * apprentice.zeta)
            apprentice.zeta = -2 * difference_exp2
            #print "ZETAAA",apprentice.zeta
            #print "-------------------------------------------"

        apprentice.reward_f = apprentice.buildRewardFunction()
        #evaluation
        a_on_e = eval_value(expert1.w, apprentice_policy, apprentice,
                            test_states, steps)
        a_o_t = eval_value(expert2.w, apprentice_policy, apprentice,
                           test_states, steps)
        #if i ==iterations-1:
        if i < iterations:
            print "failure", failure
            print "Iteration", i
            print "Aprentice on Expert", a_on_e
            print "Expert on expert", e_on_e
            print "Apprentice on Taboo", a_o_t
            print "Taboo on Taboo", t_o_t
            print "Expert on Taboo", expert_on_taboo
            print "______________________________________"
        results.a_o_e.append(a_on_e)
        results.a_o_t.append(a_o_t)
        results.policy_diff1.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp1_policy))) /
            (2 * disc.tot_states) * 100)
        results.policy_diff2.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp2_policy))) /
            (2 * disc.tot_states) * 100)
        if i == iterations - 1:
            print "Policy Difference", results.policy_diff1[-1]
            print "Policy Difference", results.policy_diff2[-1]
    return results
def learn_from_failure(expert1,
                       expert2,
                       apprentice,
                       iterations,
                       steps,
                       initial_states,
                       test_states,
                       failure="false",
                       initial_bad_states=None):
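    """Variant of learn_from_failure with an "L2" update rule.

    Same structure as the definition above, but the supported `failure`
    modes are "L2" (which passes the updated weights through
    fn.pin_to_threshold), "L1", "false" and "slow"; the "cvx", "sign" and
    "only" modes are absent.  Being defined later in the module, this
    definition shadows the one above.
    """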
    #initialise the lagrange multipliers to 1
    print "INITIALISED LEARNING. LEARNING FROM FAILURE = ", failure
    direc = "results/"
    fn.make_dir(direc)
    C = 5.0
    D = .7
    delta_c = .96
    disc = expert1.disc

    a, s, f = expert1.feature_f.shape
    #experts
    exp1_policy, ignore, exp1_state_exp, exp1_all = inference(expert1,
                                                              steps,
                                                              initial_states,
                                                              discount=0.90)
    if initial_bad_states is None:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_states, discount=0.90)
    else:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(
            expert2, steps, initial_bad_states, discount=0.90)
    #print "POLICYY", exp1_policy.shape

    exp1_feature_avg = np.dot(exp1_state_exp.reshape(s * a, order="F"),
                              expert1.feature_f.reshape(s * a, f, order="F"))
    exp2_feature_avg = np.dot(exp2_state_exp.reshape(s * a, order="F"),
                              expert2.feature_f.reshape(s * a, f, order="F"))

    e_on_e = eval_value(expert1.w, exp1_policy, expert1, test_states, steps)
    t_o_t = eval_value(expert2.w, exp2_policy, expert2, test_states, steps)
    expert_on_taboo = eval_value(expert2.w, exp1_policy, expert2, test_states,
                                 steps)
    z_stat = None

    #initiate results structure
    results = EmptyObject()
    results.a_o_e = []
    results.a_o_t = []
    results.policy_diff1 = []
    results.policy_diff2 = []
    results.e_on_e = e_on_e
    results.t_o_t = t_o_t
    results.e_o_t = expert_on_taboo
    # learning rate
    rate = 0.08
    rate2 = 0.08
    # delay before the failure data is included; larger values avoid oscillations
    delay = 0

    for i in range(iterations):
        apprentice_policy, z_stat, a_state_exp, a_all = inference(
            apprentice, steps, initial_states, z_states=None, discount=0.95)
        apprentice_feature_avg = np.dot(
            a_state_exp.reshape(s * a, order="F"),
            apprentice.feature_f.reshape(s * a, f, order="F"))
        difference_exp1 = exp1_feature_avg - apprentice_feature_avg
        if initial_bad_states is None:
            difference_exp2 = exp2_feature_avg - apprentice_feature_avg
        else:
            apprentice_policy, z_stat, a_state_exp_bad, a_all = inference(
                apprentice,
                steps,
                initial_bad_states,
                z_states=None,
                discount=0.95)
            apprentice_feature_avg_bad = np.dot(
                a_state_exp_bad.reshape(s * a, order="F"),
                apprentice.feature_f.reshape(s * a, f, order="F"))
            difference_exp2 = apprentice_feature_avg_bad - exp2_feature_avg
        if i == 0:
            difference_random = np.copy(difference_exp2)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg * 0
        if failure == "L2":
            #first update the alphas according to their gradient.
            apprentice.w = fn.pin_to_threshold(
                apprentice.w + rate * difference_exp1, C, -C)
            if i > delay:
                apprentice.zeta = -difference_exp2
            #print "ZETAAA",apprentice.zeta
            #print "-------------------------------------------"s
        elif failure == "L1":
            apprentice.w = apprentice.w + rate * difference_exp1
            #apprentice.zeta = apprentice.zeta + rate2*(difference_exp2+ D*apprentice.zeta)
            apprentice.zeta = 0.9 * difference_exp2
        elif failure == "false":
            apprentice.w = apprentice.w + rate * difference_exp1
        elif failure == "slow":
            apprentice.w = apprentice.w + rate * difference_exp1
            C = C * delta_c
            if 1. / C > D:
                C = 1 / D
            if i > delay:
                apprentice.zeta = -difference_exp2 / (C)
        if initial_bad_states is not None:
            apprentice_feature_avg_bad_prev = apprentice_feature_avg_bad
        apprentice.reward_f = apprentice.buildRewardFunction()
        #evaluation
        a_on_e = eval_value(expert1.w, apprentice_policy, apprentice,
                            test_states, steps)
        a_o_t = eval_value(expert2.w, apprentice_policy, apprentice,
                           test_states, steps)
        #if i ==iterations-1:
        if i < iterations:
            print "failure", failure
            print "Iteration", i
            print "Aprentice on Expert", a_on_e
            print "Expert on expert", e_on_e
            print "Apprentice on Taboo", a_o_t
            print "Taboo on Taboo", t_o_t
            print "Expert on Taboo", expert_on_taboo
            print "______________________________________"
        results.a_o_e.append(a_on_e)
        results.a_o_t.append(a_o_t)
        results.policy_diff1.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp1_policy))) /
            (2 * disc.tot_states) * 100)
        results.policy_diff2.append(
            np.sum(np.sum(np.absolute(apprentice_policy - exp2_policy))) /
            (2 * disc.tot_states) * 100)
        if i == iterations - 1:
            print "Policy Difference", results.policy_diff1[-1]
            print "Policy Difference", results.policy_diff2[-1]
    return results