def experiment_overlapping(expert_feature=toy_problem_simple, apprentice_feature=toy_problem_simple,
                           name="simple_feature", iterations_per_run=60, steps=15, runs=20):
    direc = "results/aamas"
    #initial_states = [disc.quantityToState([0,0,1,2,2]),disc.quantityToState([0,0,3,4,1]),disc.quantityToState([0,1,2,2,2]),disc.quantityToState([0,0,3,2,1])]
    #test_states = [disc.quantityToState([0,0,2,2,1]),disc.quantityToState([0,0,2,4,2]),disc.quantityToState([0,0,3,1,3]),disc.quantityToState([0,0,3,2,1])]
    fn.make_dir(direc + "/" + name)
    results_array = []
    disc = DiscModel(target=[4, 4], boundaries=[4, 4], feature=expert_feature)
    disc_a = DiscModel(target=[4, 4], boundaries=[4, 4], feature=apprentice_feature)
    expert2 = Model(disc, "obstacle2_reach", load_saved=False)
    expert1 = Model(disc, "avoid_reach", load_saved=True)
    test_states = np.random.randint(0, disc.tot_states, 10)
    bad_states = np.random.randint(0, disc.tot_states, 5)
    for i in range(runs):
        apprentice = Model(disc_a, "dual_reward", load_saved=True)
        initial_states = np.random.randint(0, disc.tot_states, 5)
        results_failure = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                             initial_states, test_states, failure="L1",
                                             initial_bad_states=bad_states)
        if i == 0:
            apprentice.visualise_reward()
        apprentice = Model(disc_a, "dual_reward", load_saved=True)
        results_normal = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                            initial_states, test_states, failure="false",
                                            initial_bad_states=bad_states)
        if i == 0:
            apprentice.visualise_reward()
        apprentice = Model(disc_a, "dual_reward", load_saved=True)
        results_slow = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                          initial_states, test_states, failure="slow",
                                          initial_bad_states=bad_states)
        if i == 0:
            apprentice.visualise_reward()
        results_array.append([results_failure, results_normal, results_slow])
        fn.pickle_saver(results_array, direc + "/" + name + ".pkl")

def experiment_contrasting(expert_feature=toy_problem_simple, apprentice_feature=toy_problem_simple,
                           name="simple_feature", iterations_per_run=60, steps=15, runs=20):
    direc = "results/aamas"
    #initial_states = [disc.quantityToState([0,0,1,2,2]),disc.quantityToState([0,0,3,4,1]),disc.quantityToState([0,1,2,2,2]),disc.quantityToState([0,0,3,2,1])]
    #test_states = [disc.quantityToState([0,0,2,2,1]),disc.quantityToState([0,0,2,4,2]),disc.quantityToState([0,0,3,1,3]),disc.quantityToState([0,0,3,2,1])]
    fn.make_dir(direc + "/" + name)
    results_array = []
    disc = DiscModel(target=[4, 4], boundaries=[4, 4], feature=expert_feature)
    disc_a = DiscModel(target=[4, 4], boundaries=[4, 4], feature=apprentice_feature)
    expert2 = Model(disc, "obstacle2", load_saved=False)
    expert1 = Model(disc, "avoid_reach", load_saved=True)
    test_states = np.random.randint(0, disc.tot_states, 100)
    bad_states = np.random.randint(0, disc.tot_states, 5)
    for i in range(runs):
        apprentice = Model(disc_a, "dual_reward", load_saved=True)
        initial_states = np.random.randint(0, disc.tot_states, 10)
        results_failure = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                             initial_states, test_states, failure="L1",
                                             initial_bad_states=bad_states)
        if i == 0:
            apprentice.visualise_reward()
        apprentice = Model(disc_a, "uniform", load_saved=True)
        results_normal = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                            initial_states, test_states, failure="false",
                                            initial_bad_states=bad_states)
        if i == 0:
            apprentice.visualise_reward()
        apprentice = Model(disc_a, "dual_reward", load_saved=True)
        results_slow = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                          initial_states, test_states, failure="slow",
                                          initial_bad_states=bad_states)
        if i == 0:
            apprentice.visualise_reward()
        results_array.append([results_failure, results_normal, results_slow])
        fn.pickle_saver(results_array, direc + "/" + name + ".pkl")

def experiment_data_size(expert_feature=toy_problem_simple, apprentice_feature=toy_problem_simple,
                         name="simple_feature", iterations_per_run=50, steps=15, runs=6):
    direc = "results/aamas"
    #initial_states = [disc.quantityToState([0,0,1,2,2]),disc.quantityToState([0,0,3,4,1]),disc.quantityToState([0,1,2,2,2]),disc.quantityToState([0,0,3,2,1])]
    #test_states = [disc.quantityToState([0,0,2,2,1]),disc.quantityToState([0,0,2,4,2]),disc.quantityToState([0,0,3,1,3]),disc.quantityToState([0,0,3,2,1])]
    fn.make_dir(direc + "/" + name)
    results_array = []
    disc = DiscModel(target=[4, 4], boundaries=[4, 4], feature=expert_feature)
    disc_a = DiscModel(target=[4, 4], boundaries=[4, 4], feature=apprentice_feature)
    training_sizes = [2, 5, 25, 50, 100]
    fail = np.zeros([len(training_sizes), runs])
    normal = np.zeros([len(training_sizes), runs])
    slow = np.zeros([len(training_sizes), runs])
    if expert_feature != apprentice_feature:
        expert_2_test = Model(disc, "obstacle2", load_saved=False)
        expert_1_test = Model(disc, "avoid_reach", load_saved=True)
        expert2 = Model(disc_a, "obstacle2", load_saved=False)
        expert2.reward_f = expert_2_test.reward_f
        expert1 = Model(disc_a, "avoid_reach", load_saved=True)
        expert1.reward_f = expert_1_test.reward_f
    else:
        expert2 = Model(disc, "obstacle2", load_saved=False)
        expert1 = Model(disc, "avoid_reach", load_saved=True)
    test_states = np.random.randint(0, disc.tot_states, 10)
    bad_states = np.random.randint(0, disc.tot_states, 5)
    for enn, size in enumerate(training_sizes):
        print "SIZE=", size
        print "============================================================================"
        for n, i in enumerate(range(runs)):
            print "RUN", i
            apprentice = Model(disc_a, "dual_reward", load_saved=True)
            #initial_states = np.random.randint(0,disc.tot_states,5)
            initial_states = np.random.randint(0, disc.tot_states, size)
            results_failure = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                                 initial_states, test_states, failure="L1",
                                                 initial_bad_states=bad_states)
            fail[enn, i] = results_failure.e_on_e - results_failure.a_o_e[-1]
            apprentice = Model(disc_a, "uniform", load_saved=True)
            results_normal = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                                initial_states, test_states, failure="false",
                                                initial_bad_states=bad_states)
            normal[enn, i] = results_normal.e_on_e - results_normal.a_o_e[-1]
            apprentice = Model(disc_a, "dual_reward", load_saved=True)
            results_slow = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                              initial_states, test_states, failure="slow",
                                              initial_bad_states=bad_states)
            slow[enn, i] = results_slow.e_on_e - results_slow.a_o_e[-1]
            results_array.append([results_failure, results_normal, results_slow])
            fn.pickle_saver((results_array, fail, normal, slow), direc + "/" + name + ".pkl")

# NOTE: second definition of experiment_data_size; being later in the file it shadows the
# version above and swaps the failure modes used for the first and third runs ("sign" and "L1").
def experiment_data_size(expert_feature=toy_problem_simple, apprentice_feature=toy_problem_simple,
                         name="simple_feature", iterations_per_run=50, steps=15, runs=6):
    direc = "results/aamas"
    #initial_states = [disc.quantityToState([0,0,1,2,2]),disc.quantityToState([0,0,3,4,1]),disc.quantityToState([0,1,2,2,2]),disc.quantityToState([0,0,3,2,1])]
    #test_states = [disc.quantityToState([0,0,2,2,1]),disc.quantityToState([0,0,2,4,2]),disc.quantityToState([0,0,3,1,3]),disc.quantityToState([0,0,3,2,1])]
    fn.make_dir(direc + "/" + name)
    results_array = []
    disc = DiscModel(target=[4, 4], boundaries=[4, 4], feature=expert_feature)
    disc_a = DiscModel(target=[4, 4], boundaries=[4, 4], feature=apprentice_feature)
    training_sizes = [2, 5, 25, 50, 100]
    fail = np.zeros([len(training_sizes), runs])
    normal = np.zeros([len(training_sizes), runs])
    slow = np.zeros([len(training_sizes), runs])
    if expert_feature != apprentice_feature:
        expert_2_test = Model(disc, "obstacle2", load_saved=False)
        expert_1_test = Model(disc, "avoid_reach", load_saved=True)
        expert2 = Model(disc_a, "obstacle2", load_saved=False)
        expert2.reward_f = expert_2_test.reward_f
        expert1 = Model(disc_a, "avoid_reach", load_saved=True)
        expert1.reward_f = expert_1_test.reward_f
    else:
        expert2 = Model(disc, "obstacle2", load_saved=False)
        expert1 = Model(disc, "avoid_reach", load_saved=True)
    test_states = np.random.randint(0, disc.tot_states, 10)
    bad_states = np.random.randint(0, disc.tot_states, 5)
    for enn, size in enumerate(training_sizes):
        print "SIZE=", size
        print "============================================================================"
        for n, i in enumerate(range(runs)):
            print "RUN", i
            apprentice = Model(disc_a, "dual_reward", load_saved=True)
            #initial_states = np.random.randint(0,disc.tot_states,5)
            initial_states = np.random.randint(0, disc.tot_states, size)
            results_failure = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                                 initial_states, test_states, failure="sign",
                                                 initial_bad_states=bad_states)
            fail[enn, i] = results_failure.e_on_e - results_failure.a_o_e[-1]
            apprentice = Model(disc_a, "uniform", load_saved=True)
            results_normal = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                                initial_states, test_states, failure="false",
                                                initial_bad_states=bad_states)
            normal[enn, i] = results_normal.e_on_e - results_normal.a_o_e[-1]
            apprentice = Model(disc_a, "dual_reward", load_saved=True)
            results_slow = learn_from_failure(expert1, expert2, apprentice, iterations_per_run, steps,
                                              initial_states, test_states, failure="L1",
                                              initial_bad_states=bad_states)
            slow[enn, i] = results_slow.e_on_e - results_slow.a_o_e[-1]
            results_array.append([results_failure, results_normal, results_slow])
            fn.pickle_saver((results_array, fail, normal, slow), direc + "/" + name + ".pkl")

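# A minimal post-processing sketch (an assumption, not part of the original experiments):
# fn.pickle_saver presumably writes an ordinary pickle, so the tuple saved by
# experiment_data_size can be reloaded with the standard pickle module. The fail/normal/slow
# arrays hold the value gap e_on_e - a_o_e[-1] per training-set size and run, so a per-size
# mean summarises each condition. The helper name and default path below are only examples.
import pickle

def _summarise_data_size_sketch(path="results/aamas/simple_feature.pkl"):
    with open(path, "rb") as handle:
        results_array, fail, normal, slow = pickle.load(handle)
    # mean value gap (expert value minus final apprentice value) for each training-set size
    return fail.mean(axis=1), normal.mean(axis=1), slow.mean(axis=1)
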
def learn_from_failure(expert1, expert2, apprentice, iterations, steps, initial_states, test_states,
                       failure="false", initial_bad_states=None):
    #initialise the lagrange multipliers to 1
    print "INITIALISED LEARNING. LEARNING FROM FAILURE = ", failure
    direc = "results/"
    fn.make_dir(direc)
    C = 5.0
    D = .7
    delta_c = .96
    disc = expert1.disc
    a, s, f = expert1.feature_f.shape
    #experts
    exp1_policy, ignore, exp1_state_exp, exp1_all = inference(expert1, steps, initial_states, discount=0.9)
    if initial_bad_states is None:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(expert2, steps, initial_states, discount=0.9)
    else:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(expert2, steps, initial_bad_states, discount=0.9)
    #print "POLICYY", exp1_policy.shape
    exp1_feature_avg = np.dot(exp1_state_exp.reshape(s * a, order="F"),
                              expert1.feature_f.reshape(s * a, f, order="F"))
    exp2_feature_avg = np.dot(exp2_state_exp.reshape(s * a, order="F"),
                              expert2.feature_f.reshape(s * a, f, order="F"))
    e_on_e = eval_value(expert1.w, exp1_policy, expert1, test_states, steps)
    t_o_t = eval_value(expert2.w, exp2_policy, expert2, test_states, steps)
    expert_on_taboo = eval_value(expert2.w, exp1_policy, expert2, test_states, steps)
    z_stat = None
    #initiate results structure
    results = EmptyObject()
    results.a_o_e = []
    results.a_o_t = []
    results.policy_diff1 = []
    results.policy_diff2 = []
    results.e_on_e = e_on_e
    results.t_o_t = t_o_t
    results.e_o_t = expert_on_taboo
    # learning rates
    rate = 0.08
    rate2 = 0.08
    # delay before failure data is included. Large numbers avoid oscillations
    delay = 0
    for i in range(iterations):
        apprentice_policy, z_stat, a_state_exp, a_all = inference(apprentice, steps, initial_states,
                                                                  z_states=None, discount=0.9)
        apprentice_feature_avg = np.dot(a_state_exp.reshape(s * a, order="F"),
                                        apprentice.feature_f.reshape(s * a, f, order="F"))
        difference_exp1 = exp1_feature_avg - apprentice_feature_avg
        if initial_bad_states is None:
            difference_exp2 = exp2_feature_avg - apprentice_feature_avg
        else:
            apprentice_policy, z_stat, a_state_exp_bad, a_all = inference(apprentice, steps, initial_bad_states,
                                                                          z_states=None, discount=0.9)
            apprentice_feature_avg_bad = np.dot(a_state_exp_bad.reshape(s * a, order="F"),
                                                apprentice.feature_f.reshape(s * a, f, order="F"))
            difference_exp2 = apprentice_feature_avg_bad - exp2_feature_avg
        if i == 0:
            difference_random = np.copy(difference_exp2)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg * 0
        if failure == "L2":
            #first update the alphas according to their gradient.
            apprentice.w = fn.pin_to_threshold(apprentice.w + rate * difference_exp1, C, -C)
            if i > delay:
                apprentice.zeta = -difference_exp2
            #print "ZETAAA",apprentice.zeta
            #print "-------------------------------------------"
        elif failure == "L1":
            apprentice.w = apprentice.w + rate * difference_exp1
            #apprentice.zeta = apprentice.zeta + rate2*(difference_exp2+ D*apprentice.zeta)
            apprentice.zeta = -0.9 * difference_exp2
        elif failure == "false":
            apprentice.w = apprentice.w + rate * difference_exp1
        elif failure == "slow":
            apprentice.w = apprentice.w + rate * difference_exp1
            C = C * delta_c
            if 1. / C > D:
                C = 1 / D
            if i > delay:
                apprentice.zeta = -difference_exp2 / C
            apprentice_feature_avg_bad_prev = apprentice_feature_avg_bad
        apprentice.reward_f = apprentice.buildRewardFunction()
        #evaluation
        a_on_e = eval_value(expert1.w, apprentice_policy, apprentice, test_states, steps)
        a_o_t = eval_value(expert2.w, apprentice_policy, apprentice, test_states, steps)
        #if i ==iterations-1:
        if i < iterations:
            print "failure", failure
            print "Iteration", i
            print "Apprentice on Expert", a_on_e
            print "Expert on Expert", e_on_e
            print "Apprentice on Taboo", a_o_t
            print "Taboo on Taboo", t_o_t
            print "Expert on Taboo", expert_on_taboo
            print "______________________________________"
        results.a_o_e.append(a_on_e)
        results.a_o_t.append(a_o_t)
        results.policy_diff1.append(np.sum(np.sum(np.absolute(apprentice_policy - exp1_policy)))
                                    / (2 * disc.tot_states) * 100)
        results.policy_diff2.append(np.sum(np.sum(np.absolute(apprentice_policy - exp2_policy)))
                                    / (2 * disc.tot_states) * 100)
        if i == iterations - 1:
            print "Policy Difference", results.policy_diff1[-1]
            print "Policy Difference", results.policy_diff2[-1]
    return results

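# A minimal sketch of the clipping helper used in the "L2" update above (an assumption:
# fn.pin_to_threshold appears to keep every weight inside the box [-C, C]; the project's
# own fn module remains authoritative). The helper name below is hypothetical.
def _pin_to_threshold_sketch(w, upper, lower):
    # element-wise clip of the weight vector to the interval [lower, upper]
    return np.clip(w, lower, upper)
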
def learn_from_failure(expert1, expert2, apprentice, iterations, steps, initial_states, test_states,
                       failure="false", initial_bad_states=None):
    #initialise the lagrange multipliers to 1
    print "INITIALISED LEARNING. LEARNING FROM FAILURE = ", failure
    direc = "results/"
    fn.make_dir(direc)
    # learning rates
    rate = 0.08
    rate2 = 0.08
    C = 5.0
    D = .7
    delta_c = .96
    delay = 0
    disc = expert1.disc
    a, s, f = expert1.feature_f.shape
    #experts
    exp1_policy, ignore, exp1_state_exp, exp1_all = inference(expert1, steps, initial_states, discount=0.9)
    if initial_bad_states is None:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(expert2, steps, initial_states, discount=0.9)
    else:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(expert2, steps, initial_bad_states, discount=0.9)
    #print "POLICYY", exp1_policy.shape
    exp1_feature_avg = np.dot(exp1_state_exp.reshape(s * a, order="F"),
                              expert1.feature_f.reshape(s * a, f, order="F"))
    exp2_feature_avg = np.dot(exp2_state_exp.reshape(s * a, order="F"),
                              expert2.feature_f.reshape(s * a, f, order="F"))
    e_on_e = eval_value(expert1.w, exp1_policy, expert1, test_states, steps)
    t_o_t = eval_value(expert2.w, exp2_policy, expert2, test_states, steps)
    expert_on_taboo = eval_value(expert2.w, exp1_policy, expert2, test_states, steps)
    z_stat = None
    #initiate results structure
    results = EmptyObject()
    results.a_o_e = []
    results.a_o_t = []
    results.policy_diff1 = []
    results.policy_diff2 = []
    results.e_on_e = e_on_e
    results.t_o_t = t_o_t
    results.e_o_t = expert_on_taboo
    for i in range(iterations):
        apprentice_policy, z_stat, a_state_exp, a_all = inference(apprentice, steps, initial_states,
                                                                  z_states=None, discount=0.9)
        apprentice_feature_avg = np.dot(a_state_exp.reshape(s * a, order="F"),
                                        apprentice.feature_f.reshape(s * a, f, order="F"))
        difference_exp1 = exp1_feature_avg - apprentice_feature_avg
        if initial_bad_states is None:
            difference_exp2 = exp2_feature_avg - apprentice_feature_avg
        else:
            apprentice_policy, z_stat, a_state_exp_bad, a_all = inference(apprentice, steps, initial_bad_states,
                                                                          z_states=None, discount=0.9)
            apprentice_feature_avg_bad = np.dot(a_state_exp_bad.reshape(s * a, order="F"),
                                                apprentice.feature_f.reshape(s * a, f, order="F"))
            difference_exp2 = apprentice_feature_avg_bad - exp2_feature_avg
        if i == 0:
            difference_random = np.copy(difference_exp2)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg * 0
        #updates
        if failure == "L1":
            apprentice.w = apprentice.w + rate * difference_exp1
            #apprentice.zeta = apprentice.zeta + rate2*(difference_exp2+ D*apprentice.zeta)
            apprentice.zeta = 0.9 * difference_exp2
        elif failure == "false":
            apprentice.w = apprentice.w + rate * difference_exp1
        elif failure == "slow":
            apprentice.w = apprentice.w + rate * difference_exp1
            C = C * delta_c
            if 1. / C > D:
                C = 1 / D
            if i > delay:
                apprentice.zeta = -difference_exp2 / C
            #print "ZETAAA",apprentice.zeta
            #print "-------------------------------------------"
        elif failure == "cvx":
            delay = 0
            apprentice.w = apprentice.w + rate * difference_exp1
            #sings = difference_random*difference_exp2
            #print sings
            #idx = np.where(sings < 0)
            #difference_exp2[idx]=0
            rho = 0.01
            #if rho>0.8:
            #    rho=0.8
            #apprentice.zeta = apprentice.zeta + rate2*(difference_exp2+ D*apprentice.zeta)
            if i > delay:
                apprentice.zeta = 0.9 * (apprentice_feature_avg_bad_prev - rho * apprentice_feature_avg_bad
                                         + (rho - 1) * exp2_feature_avg)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg_bad
            #apprentice.zeta = difference_random - 0.2*difference_exp2
        elif failure == "sign":
            apprentice.w = apprentice.w + rate * difference_exp1
            rho = 0.01
            apprentice.zeta = np.sign(difference_random)
        elif failure == "only":
            apprentice.zeta = apprentice.zeta - rate2 * (difference_exp2 + D * apprentice.zeta)
            # note: the next assignment overwrites the update above
            apprentice.zeta = -2 * difference_exp2
            #print "ZETAAA",apprentice.zeta
            #print "-------------------------------------------"
        apprentice.reward_f = apprentice.buildRewardFunction()
        #evaluation
        a_on_e = eval_value(expert1.w, apprentice_policy, apprentice, test_states, steps)
        a_o_t = eval_value(expert2.w, apprentice_policy, apprentice, test_states, steps)
        #if i ==iterations-1:
        if i < iterations:
            print "failure", failure
            print "Iteration", i
            print "Apprentice on Expert", a_on_e
            print "Expert on Expert", e_on_e
            print "Apprentice on Taboo", a_o_t
            print "Taboo on Taboo", t_o_t
            print "Expert on Taboo", expert_on_taboo
            print "______________________________________"
        results.a_o_e.append(a_on_e)
        results.a_o_t.append(a_o_t)
        results.policy_diff1.append(np.sum(np.sum(np.absolute(apprentice_policy - exp1_policy)))
                                    / (2 * disc.tot_states) * 100)
        results.policy_diff2.append(np.sum(np.sum(np.absolute(apprentice_policy - exp2_policy)))
                                    / (2 * disc.tot_states) * 100)
        if i == iterations - 1:
            print "Policy Difference", results.policy_diff1[-1]
            print "Policy Difference", results.policy_diff2[-1]
    return results

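# A small self-contained check of the feature-expectation computation used in every update
# above (an assumption about layout: the visitation array is taken to be (actions, states),
# matching feature_f's leading axes; all shapes below are made up for the check). Flattening
# both arrays in Fortran order and taking a dot product equals an explicit visitation-weighted
# sum of feature vectors.
def _feature_avg_check():
    a, s, f = 2, 3, 4                        # actions, states, features (arbitrary)
    state_exp = np.random.rand(a, s)         # state-action visitation frequencies
    feature_f = np.random.rand(a, s, f)      # one feature vector per (action, state) pair
    flat = np.dot(state_exp.reshape(s * a, order="F"),
                  feature_f.reshape(s * a, f, order="F"))
    explicit = sum(state_exp[ai, si] * feature_f[ai, si, :]
                   for ai in range(a) for si in range(s))
    return np.allclose(flat, explicit)
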
# Final definition of learn_from_failure; being last in the file it shadows the variants above.
def learn_from_failure(expert1, expert2, apprentice, iterations, steps, initial_states, test_states,
                       failure="false", initial_bad_states=None):
    #initialise the lagrange multipliers to 1
    print "INITIALISED LEARNING. LEARNING FROM FAILURE = ", failure
    direc = "results/"
    fn.make_dir(direc)
    C = 5.0
    D = .7
    delta_c = .96
    disc = expert1.disc
    a, s, f = expert1.feature_f.shape
    #experts
    exp1_policy, ignore, exp1_state_exp, exp1_all = inference(expert1, steps, initial_states, discount=0.90)
    if initial_bad_states is None:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(expert2, steps, initial_states, discount=0.90)
    else:
        exp2_policy, ignore, exp2_state_exp, exp2_all = inference(expert2, steps, initial_bad_states, discount=0.90)
    #print "POLICYY", exp1_policy.shape
    exp1_feature_avg = np.dot(exp1_state_exp.reshape(s * a, order="F"),
                              expert1.feature_f.reshape(s * a, f, order="F"))
    exp2_feature_avg = np.dot(exp2_state_exp.reshape(s * a, order="F"),
                              expert2.feature_f.reshape(s * a, f, order="F"))
    e_on_e = eval_value(expert1.w, exp1_policy, expert1, test_states, steps)
    t_o_t = eval_value(expert2.w, exp2_policy, expert2, test_states, steps)
    expert_on_taboo = eval_value(expert2.w, exp1_policy, expert2, test_states, steps)
    z_stat = None
    #initiate results structure
    results = EmptyObject()
    results.a_o_e = []
    results.a_o_t = []
    results.policy_diff1 = []
    results.policy_diff2 = []
    results.e_on_e = e_on_e
    results.t_o_t = t_o_t
    results.e_o_t = expert_on_taboo
    # learning rates
    rate = 0.08
    rate2 = 0.08
    # delay before failure data is included. Large numbers avoid oscillations
    delay = 0
    for i in range(iterations):
        apprentice_policy, z_stat, a_state_exp, a_all = inference(apprentice, steps, initial_states,
                                                                  z_states=None, discount=0.95)
        apprentice_feature_avg = np.dot(a_state_exp.reshape(s * a, order="F"),
                                        apprentice.feature_f.reshape(s * a, f, order="F"))
        difference_exp1 = exp1_feature_avg - apprentice_feature_avg
        if initial_bad_states is None:
            difference_exp2 = exp2_feature_avg - apprentice_feature_avg
        else:
            apprentice_policy, z_stat, a_state_exp_bad, a_all = inference(apprentice, steps, initial_bad_states,
                                                                          z_states=None, discount=0.95)
            apprentice_feature_avg_bad = np.dot(a_state_exp_bad.reshape(s * a, order="F"),
                                                apprentice.feature_f.reshape(s * a, f, order="F"))
            difference_exp2 = apprentice_feature_avg_bad - exp2_feature_avg
        if i == 0:
            difference_random = np.copy(difference_exp2)
            apprentice_feature_avg_bad_prev = apprentice_feature_avg * 0
        if failure == "L2":
            #first update the alphas according to their gradient.
            apprentice.w = fn.pin_to_threshold(apprentice.w + rate * difference_exp1, C, -C)
            if i > delay:
                apprentice.zeta = -difference_exp2
            #print "ZETAAA",apprentice.zeta
            #print "-------------------------------------------"
        elif failure == "L1":
            apprentice.w = apprentice.w + rate * difference_exp1
            #apprentice.zeta = apprentice.zeta + rate2*(difference_exp2+ D*apprentice.zeta)
            apprentice.zeta = 0.9 * difference_exp2
        elif failure == "false":
            apprentice.w = apprentice.w + rate * difference_exp1
        elif failure == "slow":
            apprentice.w = apprentice.w + rate * difference_exp1
            C = C * delta_c
            if 1. / C > D:
                C = 1 / D
            if i > delay:
                apprentice.zeta = -difference_exp2 / C
            apprentice_feature_avg_bad_prev = apprentice_feature_avg_bad
        apprentice.reward_f = apprentice.buildRewardFunction()
        #evaluation
        a_on_e = eval_value(expert1.w, apprentice_policy, apprentice, test_states, steps)
        a_o_t = eval_value(expert2.w, apprentice_policy, apprentice, test_states, steps)
        #if i ==iterations-1:
        if i < iterations:
            print "failure", failure
            print "Iteration", i
            print "Apprentice on Expert", a_on_e
            print "Expert on Expert", e_on_e
            print "Apprentice on Taboo", a_o_t
            print "Taboo on Taboo", t_o_t
            print "Expert on Taboo", expert_on_taboo
            print "______________________________________"
        results.a_o_e.append(a_on_e)
        results.a_o_t.append(a_o_t)
        results.policy_diff1.append(np.sum(np.sum(np.absolute(apprentice_policy - exp1_policy)))
                                    / (2 * disc.tot_states) * 100)
        results.policy_diff2.append(np.sum(np.sum(np.absolute(apprentice_policy - exp2_policy)))
                                    / (2 * disc.tot_states) * 100)
        if i == iterations - 1:
            print "Policy Difference", results.policy_diff1[-1]
            print "Policy Difference", results.policy_diff2[-1]
    return results

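# Usage sketch (an assumption about how this script is driven, not shown in the original
# code): each experiment_* function above is a self-contained driver, so a typical entry
# point just picks one of them.
if __name__ == "__main__":
    # run the overlapping-experts study with its defaults; swap in experiment_contrasting()
    # or experiment_data_size() for the other studies
    experiment_overlapping()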