def __init__(self, train_mode=0):
    self.train_mode = train_mode
    agent = DQNAgent(mode=self.train_mode)
    user = user_simulator()
    self.manager = dialog_manager(agent, user, self.train_mode, maximum_turn=20)
    self.simulation_epoch_size = 800
                 gamma=gamma, prior_eps=prior_eps, alpha=alpha, beta=beta,
                 v_min=v_min, v_max=v_max, atom_size=atom_size, support=support,
                 batch_size=batch_size)

# train
agent = DQNAgent(algorithm=algorithm, env=env, memory_size=memory_size,
                 batch_size=batch_size, obs_dim=obs_dim, action_dim=action_dim,
                 target_update=target_update, gamma=gamma, alpha=alpha, beta=beta,
                 n_step=n_step, device=device)

#%%
agent.train(num_frames, plot=False)

#%%
# agent.env = gym.wrappers.Monitor(env, "videos", force=True)
agent.test(render=True)
from real_user import real_user
from state_tracker import state_tracker
from DQN_agent import DQNAgent
from natural_language_understanding import NL_understanding as NLU
from natural_language_generator_rule import NL_rule_generator as NLG

nlu = NLU()
nlg = NLG()
agent = DQNAgent(mode=1)
agent.initialize()

def respond(msg, user, state_keeper):
    # user turn
    user.update_sentence(msg)
    user_action, episode_over, dialog_status = user.generate_user_response()
    user_action = nlu.convert_nl_to_state(user_action)
    print('User State: {}\nEpisode_over: {}'.format(user_action, episode_over))
    state_keeper.update(user_action=user_action)

    # agent turn
    agent_state = state_keeper.get_agent_input_vector()
    agent_action, action, episode_over = agent.generate_agent_response(
        agent_state, state_keeper.all_slots['user_informed_slots'], state_keeper.act)
    if nlg.convert_state_to_nl(agent_action) != '':
        agent_action['sentence'] = nlg.convert_state_to_nl(agent_action)
    else:
        agent_action['sentence'] = 'Response not available ...'
    print('Agent State: {}'.format(agent_action))
    state_keeper.update(agent_action=agent_action)
    return agent_action['sentence']
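# A minimal driver loop built on respond(). This is only a sketch: the
# no-argument constructors for real_user() and state_tracker() and the
# 'quit'/'exit' convention are assumptions, so adapt them to the actual
# interfaces in this repo.
if __name__ == '__main__':
    user = real_user()
    state_keeper = state_tracker()
    while True:
        msg = input('You: ')
        if msg.strip().lower() in ('quit', 'exit'):
            break
        print('Agent: {}'.format(respond(msg, user, state_keeper)))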
def execute_experiment(args):
    #### HARD RULES
    if args['parallel'] != 0:
        args['use_gpu'] = 0
    if args['agent_type'] == 'human':
        args['use_gpu'] = 0
        args['render_delay'] = 0
        args['mode'] = 'play'
        args['display_prob'] = 1
        # args['action_repeat'] = 1
    if args['env_name'] == 'key_mdp-v0':
        args['action_repeat'] = 1

    arch_names = [n for n in args.keys() if 'architecture' in n]
    for arch_name in arch_names:
        if args[arch_name] is None:
            continue
        args[arch_name] = args[arch_name].split('-')

    cnf = configuration.Configuration()

    # Global settings
    gl_st = configuration.GlobalSettings(args)
    cnf.set_global_settings(gl_st)

    # Agent settings
    if args['agent_type'] == 'dqn':
        ag_st = configuration.DQNSettings(args['scale'])
    elif args['agent_type'] == 'hdqn':
        ag_st = configuration.hDQNSettings(args['scale'])
    elif args['agent_type'] == 'human':
        ag_st = configuration.HumanSettings()
    else:
        raise ValueError("Wrong agent %s" % args['agent_type'])
    ag_st.update(args)
    cnf.set_agent_settings(ag_st)

    # Environment settings
    utils.insert_dirs(cnf.gl.env_dirs)
    if args['env_name'] == 'SF-v0':
        # Space Fortress
        env_st = configuration.SpaceFortressSettings(new_attrs=args)
    elif args['env_name'] == 'key_mdp-v0':
        # MDP
        env_st = configuration.Key_MDPSettings(new_attrs=args)
    else:
        raise ValueError("Wrong env_name {} (env_names: {})".format(
            args['env_name'], ', '.join(CT.env_names)))
    env_st.set_reward_function()
    cnf.set_environment_settings(env_st)

    environment = Environment(cnf)

    tf.set_random_seed(gl_st.random_seed)
    random.seed(gl_st.random_seed)

    if gl_st.gpu_fraction == '':
        raise ValueError("--gpu_fraction should be defined")
    if not gl_st.use_gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = "-1"

    frac = utils.calc_gpu_fraction(gl_st.gpu_fraction)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=frac)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        if ag_st.agent_type == 'dqn':
            agent = DQNAgent(cnf, environment, sess)
        elif ag_st.agent_type == 'hdqn':
            agent = HDQNAgent(cnf, environment, sess)
        elif ag_st.agent_type == 'human':
            agent = HumanAgent(cnf, environment)
        else:
            raise ValueError("Wrong agent %s" % ag_st.agent_type)

        if ag_st.mode == 'train':
            agent.train()
        elif ag_st.mode == 'play':
            agent.play()
        elif ag_st.mode == 'graph':
            pass
        else:
            raise ValueError("Wrong mode " + str(ag_st.mode))
        #agent.show_attrs()

    tf.reset_default_graph()
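# Illustrative call site for execute_experiment(). The dict below only covers
# keys read directly inside the function; the configuration.*Settings classes
# likely expect further entries (seeds, gpu_fraction, display_prob, ...), so
# treat this purely as a sketch of the expected shape, not a working config.
example_args = {
    'parallel': 0,             # non-zero forces use_gpu to 0 per the hard rules
    'agent_type': 'dqn',       # one of 'dqn', 'hdqn', 'human'
    'env_name': 'key_mdp-v0',  # 'SF-v0' (Space Fortress) or 'key_mdp-v0'
    'use_gpu': 0,
    'scale': 1,
    'mode': 'train',           # 'train', 'play' or 'graph'
    'architecture': '64-64',   # split on '-' into a list of layer sizes
}
# execute_experiment(example_args)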
target_update = 100
seed = 0

env = LunarLander()

# whether or not to use wandb. All wandb code is commented out so that the
# script can be run without it.
log = False

def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # benchmark must be disabled for cuDNN to behave deterministically
        torch.backends.cudnn.benchmark = False

seed_everything(seed)

agent = DQNAgent(env, memory_size, batch_size, target_update, log=log, seed=seed)
agent.train(num_frames)
BATCH_SIZE = 32
TAU = 0.001
EPSILON = 0.99
GAMMA = 0.97
LR = 1e-3
MEMORY_SIZE = 10000
f1 = 128

env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape
action_dim = env.action_space.n

agent = DQNAgent(state_dim=state_dim, action_dim=action_dim, tau=TAU, epsilon=EPSILON,
                 mem_size=MEMORY_SIZE, batch_size=BATCH_SIZE, gamma=GAMMA, lr=LR)
# tf.summary.FileWriter('logs/', agent.sess.graph)

# initialize the buffer with some transitions
counter = 0
while counter < 5 * BATCH_SIZE:
    s = env.reset()
    while True:
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)
        agent.store(s, a, r, s_, done)
        s = s_  # advance to the next state before choosing the next action
        counter += 1
        if done:
            break
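# Sketch of the training loop that would typically follow the warm-up above,
# reusing choose_action()/store() from this snippet. agent.learn() is an
# assumed name for the gradient-update step; swap in this agent's real call.
EPISODES = 200
for ep in range(EPISODES):
    s = env.reset()
    ep_reward = 0.0
    while True:
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)
        agent.store(s, a, r, s_, done)
        # agent.learn()  # hypothetical update hook, uncomment once wired up
        ep_reward += r
        s = s_
        if done:
            break
    print('episode {}: reward {:.1f}'.format(ep, ep_reward))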