        actions.append(a)
        rewards.append(r)
        timesteps += t

    # compute advantages
    advantages = get_advantages(rewards, rollout_limit, discount_factor)

    # policy gradient update
    loss, _ = sess.run(fetches=[loss_f, train_f], feed_dict={
        states_pl: np.concatenate(states),
        actions_pl: np.column_stack((np.arange(timesteps), np.concatenate(actions))),
        advantages_pl: np.concatenate(advantages),
        learning_rate_pl: learning_rate
    })

    # validation (disabled)
    # val_rewards = [get_rollout(sess, env, rollout_limit, stochastic=True, seed=(epochs + i))[2] for i in range(10)]
    # mvr = np.mean(np.sort([np.sum(r) for r in val_rewards])[5:-5])

    # store and print training statistics
    mtr = np.mean([np.sum(r) for r in rewards])
    statistics.append([epoch, env.get_nbactions(), mtr, loss, win_rate])

    if epoch % 10 == 0:
        print('Epoch %4d. training reward: %6.2f, loss: %7.4f' % (epoch + 1, mtr, loss))

    if epoch % 100 == 0:
        saver.save(sess, "{}/{}.ckpt".format(model, model))

    if epoch % 400 == 0:
        # evaluate win rate and keep the best-performing checkpoint
        win_rate = get_winrate(sess, env)
        print("Win rate:", win_rate)
        if win_rate > win_rate_best:
            win_rate_best = win_rate
            saver.save(sess, "{}/{}_best.ckpt".format(model, model))

print('done')
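
# The loop above calls a get_advantages helper that is presumably defined
# elsewhere in this source. A minimal sketch of such a helper, assuming it
# returns per-rollout discounted rewards-to-go (the rollout_limit argument is
# accepted for signature compatibility but unused in this sketch), could look
# like the following. It relies on the numpy import (np) already used above.

def get_advantages(rewards, rollout_limit, discount_factor):
    """Compute discounted returns (rewards-to-go) for each rollout.

    `rewards` is a list of per-rollout reward sequences; each output sequence
    has the same length as its input, so it lines up with the concatenated
    states and actions fed to the policy gradient update.
    """
    advantages = []
    for r in rewards:
        returns = np.zeros(len(r))
        running = 0.0
        # accumulate the discounted reward-to-go from the end of the episode
        for t in reversed(range(len(r))):
            running = r[t] + discount_factor * running
            returns[t] = running
        advantages.append(returns)
    return advantages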