acc = np.zeros(ITER)
for _ in range(ITER):
    print " Iteration: " + str(_)
    print " Retraining with " + str(len(dagger.net.data)) + " examples"
    dagger.retrain()
    acc[_] = dagger.svm.acc()
    iteration_states = []
    dagger.record = True
    for i in range(SAMP):
        if i >= LIMIT_DATA:
            dagger.record = False
        dagger.rollout()
        iteration_states += dagger.get_recent_rollout_states().tolist()
        r[_] = r[_] + dagger.get_reward() / SAMP
    if _ == ITER - 1 and t == 0:
        dagger_analysis.count_states(np.array(iteration_states))
        dagger_analysis.save_states("comparisons/boost_dt_comparisons/boost_dt_dagger_final.png")
        dagger_analysis.show_states()

if t == 0:
    dagger_analysis.reset_density()
    dagger_analysis.count_states(dagger.get_states())
    dagger_analysis.save_states("comparisons/boost_dt_comparisons/boost_dt_dagger.png")
    dagger_analysis.show_states()
    plotter.plot_state_actions(mdp.pi, rewards=rewards, sinks=sinks,
                               filename='comparisons/boost_dt_comparisons/boost_dt_dagger_state_action.png')

dagger_data[t,:] = r
dagger_acc[t,:] = acc

# print value_iter_data
# print classic_il_data
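# Note on LIMIT_DATA above: record is switched off once i >= LIMIT_DATA, so
# only the first LIMIT_DATA rollouts of each iteration add supervisor-labeled
# states to the dataset; the remaining SAMP - LIMIT_DATA rollouts are
# evaluation-only and just feed the reward average.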
# Imports assumed by this script (the project-local modules -- plot_class,
# scenarios, and the grid/MDP/learner classes -- come from wherever they live
# in this repo):
import os
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier


def run(ne, lr):
    plotter = plot_class.Plotter()
    comparisons_directory, data_directory = make_name(ne, lr)

    if not os.path.exists(comparisons_directory):
        os.makedirs(comparisons_directory)
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    #ITER = 25
    #TRIALS = 10
    #SAMP = 20
    ITER = 10
    TRIALS = 3
    SAMP = 10
    LIMIT_DATA = 1
    DEPTH = 6

    H = 15
    W = 15
    grid = BasicGrid(H, W)
    rewards = scenarios.scenario3['rewards']
    sinks = scenarios.scenario3['sinks']
    grid.reward_states = rewards
    grid.sink_states = sinks

    mdp = ClassicMDP(ClassicPolicy(grid), grid)
    #mdp.value_iteration()
    #mdp.save_policy('scen4.p')
    mdp.load_policy('scen4.p')

    value_iter_pi = mdp.pi
    plotter.plot_state_actions(value_iter_pi, rewards=grid.reward_states, sinks=grid.sink_states,
                               filename=comparisons_directory + 'value_iter_state_action.png')

    value_iter_data = np.zeros([TRIALS, ITER])
    classic_il_data = np.zeros([TRIALS, ITER])
    classic_il_acc = np.zeros([TRIALS, ITER])
    classic_il_loss = np.zeros([TRIALS, ITER])

    for t in range(TRIALS):
        print "\nIL Trial: " + str(t)
        mdp.load_policy('scen4.p')

        # AdaBoost over a linear SVC; 'SAMME' because plain SVC provides no
        # predict_proba, which 'SAMME.R' would require.
        boost = SVC(kernel='linear')
        boost = AdaBoostClassifier(base_estimator=boost, algorithm='SAMME',
                                   n_estimators=ne, learning_rate=lr)
        sup = ScikitSupervise(grid, mdp, Classifier=boost)
        sup.sample_policy()

        value_iter_analysis = Analysis(W, H, ITER, rewards=rewards, sinks=sinks,
                                       desc='Value iter policy')
        value_iter_r = np.zeros(ITER)
        classic_il_r = np.zeros(ITER)
        acc = np.zeros(ITER)
        loss = np.zeros(ITER)

        sup.record = True
        #for _ in range(4):
        #    sup.rollout()

        for i in range(ITER):
            print " Iteration: " + str(i)
            mdp.pi = value_iter_pi
            sup.record = True
            for _ in range(SAMP):
                if _ >= LIMIT_DATA:
                    sup.record = False
                sup.rollout()
                value_iter_r[i] += sup.get_reward() / SAMP

            sup.record = False
            print " Training on " + str(len(sup.net.data)) + " examples"
            sup.train()
            acc[i] = sup.svm.acc()
            for _ in range(SAMP):
                sup.record = False
                sup.rollout()
                loss[i] += sup.get_loss() / float(SAMP)
                classic_il_r[i] += sup.get_reward() / SAMP
        #print acc

        if t == 0:
            plotter.plot_state_actions(mdp.pi, rewards=rewards, sinks=sinks,
                                       filename=comparisons_directory + 'svm_classic_il_state_action.png')

        classic_il_data[t,:] = classic_il_r
        value_iter_data[t,:] = value_iter_r
        classic_il_acc[t,:] = acc
        classic_il_loss[t,:] = loss

    # DAGGER
    dagger_data = np.zeros((TRIALS, ITER))
    dagger_analysis = Analysis(H, W, ITER, rewards=grid.reward_states, sinks=grid.sink_states,
                               desc="Dagger's policy progression")
    dagger_acc = np.zeros((TRIALS, ITER))
    dagger_loss = np.zeros((TRIALS, ITER))

    for t in range(TRIALS):
        print "DAgger Trial: " + str(t)
        mdp.load_policy('scen4.p')
        dagger = SVMDagger(grid, mdp, depth=DEPTH)
        dagger.svm.nonlinear = False
        dagger.record = True
        dagger.rollout()
        #for _ in range(5):
        #    dagger.rollout()

        r = np.zeros(ITER)
        acc = np.zeros(ITER)
        loss = np.zeros(ITER)
        for _ in range(ITER):
            print " Iteration: " + str(_)
            print " Retraining with " + str(len(dagger.net.data)) + " examples"
            dagger.retrain()
            acc[_] = dagger.svm.acc()
            iteration_states = []
            dagger.record = True
            for i in range(SAMP):
                if i >= LIMIT_DATA:
                    dagger.record = False
                dagger.rollout()
                loss[_] += dagger.get_loss() / float(SAMP)
                iteration_states += dagger.get_recent_rollout_states().tolist()
                r[_] = r[_] + dagger.get_reward() / SAMP

            #if _ == ITER - 1 and t == 0:
            if _ == 0 and t == 0:
                dagger_analysis.count_states(np.array(iteration_states))
                dagger_analysis.save_states(comparisons_directory + "svm_dagger_final.png")
                dagger_analysis.show_states()

        if t == 0:
            dagger_analysis.reset_density()
            dagger_analysis.count_states(dagger.get_states())
            dagger_analysis.save_states(comparisons_directory + "svm_dagger.png")
            dagger_analysis.show_states()
            plotter.plot_state_actions(mdp.pi, rewards=rewards, sinks=sinks,
                                       filename=comparisons_directory + 'svm_dagger_state_action.png')

        dagger_data[t,:] = r
        dagger_acc[t,:] = acc
        dagger_loss[t,:] = loss

    # print value_iter_data
    # print classic_il_data
    # print dagger_data
    print classic_il_loss
    print dagger_loss

    np.save(data_directory + 'svm_sup_data.npy', value_iter_data)
    np.save(data_directory + 'svm_classic_il_data.npy', classic_il_data)
    np.save(data_directory + 'svm_dagger_data.npy', dagger_data)
    np.save(data_directory + 'svm_dagger_acc.npy', dagger_acc)
    np.save(data_directory + 'svm_classic_il_acc.npy', classic_il_acc)

    analysis = Analysis(H, W, ITER, rewards=rewards, sinks=sinks, desc="General comparison")
    analysis.get_perf(value_iter_data)
    analysis.get_perf(classic_il_data)
    analysis.get_perf(dagger_data)

    #analysis.plot(names=['Value iteration', 'Adaboost IL'],
    #              filename=comparisons_directory + 'svm_reward_comparison.png', ylims=[-60, 100])
    analysis.plot(names=['Value iteration', 'LSVM Boosted IL', 'LSVM DAgger'],
                  filename=comparisons_directory + 'svm_reward_comparison.png', ylims=[-60, 100])
    print "Saving analysis to: " + comparisons_directory + 'svm_reward_comparison.png'

    acc_analysis = Analysis(H, W, ITER, rewards=grid.reward_states, sinks=grid.sink_states,
                            desc="Accuracy comparison")
    acc_analysis.get_perf(classic_il_acc)
    acc_analysis.get_perf(dagger_acc)
    acc_analysis.plot(names=['LSVM Boosted Acc.', 'LSVM DAgger Acc.'], label='Accuracy',
                      filename=comparisons_directory + 'svm_acc_comparison.png', ylims=[0, 1])
    #acc_analysis.plot(names=['Adaboost IL Acc.'], label='Accuracy',
    #                  filename=comparisons_directory + 'svm_acc_comparison.png', ylims=[0, 1])

    loss_analysis = Analysis(H, W, ITER, rewards=rewards, sinks=sinks, desc="Loss plot")
    loss_analysis.get_perf(classic_il_loss)
    loss_analysis.get_perf(dagger_loss)
    loss_analysis.plot(names=['LSVM Boosted IL loss', 'LSVM DAgger loss'],
                       filename=comparisons_directory + 'loss_plot.png', ylims=[0, 1])
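# A possible entry point (assumed, not part of the original script): sweep
# run() over a small grid of AdaBoost settings. The particular grids are
# illustrative; make_name(ne, lr) builds the per-setting output directories.
if __name__ == '__main__':
    for ne in [10, 20, 50]:       # n_estimators
        for lr in [0.5, 1.0]:     # learning_rate
            run(ne, lr)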
classic_il_acc = np.zeros([TRIALS, ITER])
for t in range(TRIALS):
    mdp.load_policy()
    sup = SVMSupervise(grid, mdp)
    sup.sample_policy()

    value_iter_analysis = Analysis(W, H, ITER, rewards=rewards, sinks=sinks,
                                   desc='Value iter policy')
    r = 0.0
    for _ in range(ITER * SAMP):
        sup.rollout()
        r = r + sup.get_reward() / (ITER * SAMP)
    print "Value iter reward: " + str(r)

    if t == 0:
        value_iter_analysis.count_states(sup.get_states())
        value_iter_analysis.save_states("comparisons/svm_comparisons/value_iter.png")
        value_iter_analysis.show_states()

    sup.train()
    classic_il_acc[t,:] = np.zeros(ITER) + sup.svm.acc()
    value_iter_data[t,:] = np.zeros(ITER) + r

    r = 0.0
    sup.net.clear_data()
    sup.sample_policy()
    il_analysis = Analysis(H, W, ITER, rewards=rewards, sinks=sinks, desc="IL's policy")
    for _ in range(SAMP * ITER):
        sup.animate = False
        sup.rollout()
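# Note: the supervised learner above is trained once per trial, so its
# accuracy and the value-iteration reward are scalars; np.zeros(ITER) + x
# simply tiles them across ITER so they plot as flat baselines against the
# per-iteration DAgger curves.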
sup_data = np.zeros([TRIALS, ITER])
classic_il_data = np.zeros([TRIALS, ITER])
for t in range(TRIALS):
    mdp.load_policy()
    sup = Supervise(grid, mdp)
    sup.sample_policy()

    supervisor_analysis = Analysis(H, W, ITER, rewards=grid.reward_states,
                                   sinks=grid.sink_states, desc="Supervisor's policy")
    r = 0.0
    for _ in range(ITER * SAMP):
        sup.rollout()
        r = r + sup.get_reward() / (ITER * SAMP)
    print "Value iter reward: " + str(r)

    if t == 0:
        supervisor_analysis.count_states(sup.get_states())
        supervisor_analysis.save_states("comparisons/comparisons/value_iter.png")
        supervisor_analysis.show_states()

    sup.train()
    classic_train, classic_test = sup.net.return_stats()
    # Note: these tiled arrays are rebuilt on every trial, so after the loop
    # they hold the stats of the final trial only.
    classic_train = np.zeros((TRIALS, ITER)) + classic_train
    classic_test = np.zeros((TRIALS, ITER)) + classic_test
    sup_data[t,:] = np.zeros(ITER) + r

    r = 0.0
    sup.net.clear_data()
    sup.sample_policy()
    il_analysis = Analysis(H, W, ITER, rewards=grid.reward_states,
                           sinks=grid.sink_states, desc="IL's policy")
    print sup.get_states()
mdp.load_policy()
nsupervise = Supervise(grid, mdp)

# Collect noisy supervise samples
for t in range(ITER * SAMP):
    nsupervise.rollout()
nsupervise.train()

# Evaluate policy
r = 0.0
for t in range(SAMP):
    nsupervise.rollout()
    r = r + nsupervise.get_reward() / SAMP
r_SN = np.zeros(ITER) + r
data[k,:] = r_SN
analysis.count_states(nsupervise.get_states())
test_loss_n[k] = nsupervise.get_test_loss()
train_loss_n[k] = nsupervise.get_train_loss()

analysis.show_states()
analysis.get_perf(data)

# #####NOISY SUPERVISOR LOGISTIC#####
# data = np.zeros([TRIALS, ITER])
# test_loss = np.zeros([TRIALS])
# train_loss = np.zeros([TRIALS])
# for k in range(TRIALS):
#     mdp.load_policy()
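# The block above is labeled "noisy" but constructs a plain Supervise object,
# so the noise is presumably injected elsewhere. A minimal sketch of one way
# to corrupt demonstrations -- NoisyPolicy and eps are illustrative
# assumptions, not part of this repo:
import random

class NoisyPolicy:
    def __init__(self, pi, actions, eps=0.2):
        self.pi = pi            # underlying supervisor policy
        self.actions = actions  # action set to sample corruptions from
        self.eps = eps          # probability of corrupting a demonstration
    def get_next(self, state):
        # With probability eps, demonstrate a uniformly random action.
        if random.random() < self.eps:
            return random.choice(self.actions)
        return self.pi.get_next(state)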
plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states, sinks=grid.sink_states)

q.rollout()
a = mdp.pi.get_next(State(0, 0))
print "action: " + str(a)
tup = q.Q.preprocess(0, 0, a)
print q.Q.dataset[tup]
print "Actual: " + str(np.mean(q.Q.dataset[tup]))
print "predicted: " + str(q.Q.get(State(0, 0), a))

for ac in mdp.pi.available_actions:
    if ac != a:
        print "Seeing for action: " + str(ac)
        tup = q.Q.preprocess(0, 0, ac)
        if tup in q.Q.dataset:
            print "Actual: " + str(np.mean(q.Q.dataset[tup]))
            #print np.mean(q.Q.dataset[tup])
        else:
            print "No actual"
        print "predicted: " + str(q.Q.get(State(0, 0), ac))

#q.animate = True
#q.rollout()
an.count_states(q.get_states())
an.show_states()
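# Reading the checks above: q.Q.preprocess(x, y, a) builds the key under
# which observed returns for a (state, action) pair are stored, so
# np.mean(q.Q.dataset[tup]) is the empirical Q-value, while q.Q.get(s, a)
# is the estimator's prediction for the same pair; the prints compare the
# two for every action available at state (0, 0).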
mdp.load_policy()

####DAgger##########
data = np.zeros([TRIALS, ITER])
for k in range(TRIALS):
    mdp.load_policy()
    dagger = SVMDagger(grid, mdp)
    dagger.rollout()  # rollout with supervisor policy
    r_D = np.zeros(ITER)
    for t in range(ITER):
        dagger.retrain()
        for i in range(SAMP):
            dagger.rollout()
            r_D[t] = r_D[t] + dagger.get_reward() / SAMP
    analysis.count_states(dagger.get_states())
    data[k,:] = r_D

analysis.show_states()
analysis.get_perf(data)
analysis.save("test.p")

####SUPERVISE########
# data = np.zeros([TRIALS, ITER])
# for k in range(TRIALS):
#     mdp.load_policy()
#     supervise = Supervise(grid, mdp)
#     # Collect supervise samples
#     for t in range(ITER * SAMP):
#         supervise.rollout()
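# The loop above is the standard DAgger recipe: seed the dataset with a
# supervisor rollout, then alternate retraining with on-policy rollouts,
# aggregating the supervisor's labels on the states the learner visits.
# Schematically (pseudocode, not this repo's API):
#
#   D = (states, supervisor labels) from rollout(pi_supervisor)
#   for t in 1..ITER:
#       pi_t = train(D)
#       D = D union { (s, pi_supervisor(s)) : s visited by pi_t }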
q.guide()  # assumed: roll out under the supervisor while updating the Q estimates
r = r + q.get_reward() / ITER
print "Value iter reward: " + str(r)
value_iter_data[t,:] = np.zeros(ITER) + r

r = 0.0
q.clear_states()
mdp.pi = QPolicy(q)  # switch to the learned Q policy for evaluation
a = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')
for i in range(ITER * SAMP):
    q.rollout()
    r = r + q.get_reward() / (ITER * SAMP)
print "Q learn reward: " + str(r)

if t == 0:
    a.count_states(q.get_states())
    a.show_states()
    plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states, sinks=grid.sink_states)
classic_q_data[t,:] = np.zeros(ITER) + r

# DAGGER
dagger_data = np.zeros((TRIALS, ITER))
dagger_analysis = Analysis(H, W, ITER, rewards=grid.reward_states, sinks=grid.sink_states,
                           desc="Dagger's policy progression")
for t in range(TRIALS):
    print "Trial: " + str(t)
    mdp.load_policy(filename='scen1.p')
    dagger = SVMDagger(grid, mdp)
    dagger.rollout()