#sum_diffs += sv_mc.mc_update( s_hash, alpha, G) #sum_diffs += sv_td.td0_update( s_hash=s_hash, alpha=alpha, # gamma=gamma, sn_hash=sn_hash, # reward=reward) for s_hash in mc_updateD.keys(): mc_updateD[s_hash] *= alpha td_updateD[s_hash] *= alpha sum_diffs = sum( [abs(v) for v in mc_updateD.values()] + \ [abs(v) for v in td_updateD.values()] ) # update state values for s_hash in mc_updateD.keys(): sv_mc.delta_update( s_hash, mc_updateD[s_hash] ) sv_td.delta_update( s_hash, td_updateD[s_hash] ) if sum_diffs < 1e-3: break if inner_loop >= LOOP_LIMIT: print('LOOP EXIT') # add this loops state values to running_ave mc_rms_raveL[i_loop].add_val( sv_mc.calc_rms_error( true_valueD ) ) td_rms_raveL[i_loop].add_val( sv_td.calc_rms_error( true_valueD ) ) mc_rmsL = [R.get_ave() for R in mc_rms_raveL] td_rmsL = [R.get_ave() for R in td_rms_raveL]
# NOTE(review): this excerpt was collapsed onto a single physical line;
# the line breaks and indentation below are reconstructed.  The first
# statements are the tail of a per-step episode loop whose header lies
# before this excerpt -- presumably an `if s_hash not in mc_avegD:`
# guard precedes the first assignment (it mirrors the td_averD guard
# kept below).  TODO confirm against the full file.

# Create running averages on first visit, then fold in this step's
# return G (Monte Carlo) and one-step reward (TD(0) transition stats).
mc_avegD[s_hash] = RunningAve()
if (s_hash,sn_hash) not in td_averD:
    td_averD[(s_hash,sn_hash)] = RunningAve()
mc_avegD[s_hash].add_val( G )
td_averD[(s_hash,sn_hash)].add_val( reward )

# set the Monte Carlo V(s) values for this experiment
# (V(s) = average of the observed returns for each state)
for s_hash, G in mc_avegD.items():
    sv_mc.set_Vs( s_hash, G.get_ave() )

# set the TD(0) values for this experiment
# (20 sweeps of error-driven delta updates with step size alpha)
for update_loop in range(20):
    errD, total_err = calc_td_error( show_values=False )
    for s_hash, err in errD.items():
        sv_td.delta_update( s_hash=s_hash, delta=err*alpha)

# add this loop's RMS of state values (vs. the known true values)
# to the RMS running averages
mc_rms_raveL[i_loop].add_val( sv_mc.calc_rms_error( true_valueD ) )
td_rms_raveL[i_loop].add_val( sv_td.calc_rms_error( true_valueD ) )

# Averaged RMS-error learning curves for plotting.
mc_rmsL = [R.get_ave() for R in mc_rms_raveL]
td_rmsL = [R.get_ave() for R in td_rms_raveL]

# Plot MC vs TD(0) RMS-error curves.
fig, ax = plt.subplots()
ax.plot(mc_rmsL, 'r-', label='MC')
ax.plot(td_rmsL, 'c-', label='TD(0)')

# Hard-coded reference TD error values ("td_erros" is the original
# spelling -- sic).  The list literal continues past this excerpt; its
# closing bracket is outside the visible chunk.
td_erros = [0.23570226, 0.23500565, 0.14095847, 0.13294523, 0.12816648,
            0.12551351, 0.12472649, 0.12393498, 0.1234716 , 0.12199879,