# Project-local names (DtRewardWrapper, seed, DDPG, utils, evaluate_policy,
# and the experiment logger `exp`) come from the surrounding codebase.
env = DtRewardWrapper(env)

# Set seeds
seed(args.seed)

state_dim = env.observation_space.shape
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy
policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")

replay_buffer = utils.ReplayBuffer(args.replay_buffer_max_size)

# Evaluate untrained policy
evaluations = [evaluate_policy(env, policy)]
exp.metric("rewards", evaluations[0])

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
episode_reward = None
env_counter = 0

while total_timesteps < args.max_timesteps:
    if done:
        if total_timesteps != 0:
            # Log per-episode stats; the argument tuple is implied by the
            # format string (episode_timesteps is maintained inside the loop).
            print(("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                  (total_timesteps, episode_num, episode_timesteps, episode_reward))
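# For reference, a minimal sketch of the evaluate_policy helper used above:
# it rolls out the deterministic policy for a few episodes and returns the
# mean return. The method name `predict` and the `eval_episodes` default are
# assumptions; the real helper is defined elsewhere in the project.
import numpy as np

def evaluate_policy(env, policy, eval_episodes=10):
    avg_reward = 0.0
    for _ in range(eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = policy.predict(np.array(obs))
            obs, reward, done, _ = env.step(action)
            avg_reward += reward
    return avg_reward / eval_episodes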
# Variant setup: a TD3 policy with optional prioritized replay, evaluated
# across several environments at once.
import random
import numpy as np

# NOTE: the source snippet starts mid-call; "policy = TD3(state_dim," is
# inferred from the TD3 checkpoint names loaded below.
policy = TD3(state_dim, action_dim, max_action, net_type=args.net_type, args=args)

if args.load_model:
    policy.load("TD3_2_best_start_0_best", "./pytorch_models/TD3_2_best_start_0")
    print("load succeeded!")

if not args.priority_replay:
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
else:
    replay_buffer = PriReplayMemory(args, args.replay_buffer_max_size)

# Evaluate untrained policy
evaluation = np.mean([evaluate_policy(e, policy) for e in envs])
evaluations = [evaluation]
exp.metric("rewards", evaluations[0])

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
episode_reward = None
env_counter = 0
best_eval_rew = -np.inf  # sentinel: no best evaluation recorded yet
best_eval_index = 0

# Train against a randomly chosen environment from the pool.
env = envs[random.randint(0, len(envs) - 1)]

while total_timesteps < args.max_timesteps:
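# For reference, a minimal proportional prioritized replay memory as a
# stand-in for the PriReplayMemory branch above. The class and method names
# here are assumptions; the real implementation takes (args, max_size) and
# its API is not shown in this excerpt.
import numpy as np

class PrioritizedReplay:
    def __init__(self, max_size, alpha=0.6):
        self.max_size = max_size
        self.alpha = alpha            # how strongly priorities skew sampling
        self.storage = []
        self.priorities = []
        self.ptr = 0

    def add(self, transition):
        # New transitions get the current max priority so they are sampled soon.
        max_prio = max(self.priorities, default=1.0)
        if len(self.storage) < self.max_size:
            self.storage.append(transition)
            self.priorities.append(max_prio)
        else:
            self.storage[self.ptr] = transition
            self.priorities[self.ptr] = max_prio
            self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size):
        # Sample indices with probability proportional to priority**alpha.
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        idxs = np.random.choice(len(self.storage), batch_size, p=probs)
        return [self.storage[i] for i in idxs], idxs

    def update_priorities(self, idxs, td_errors, eps=1e-6):
        # Priority is the magnitude of the TD error plus a small epsilon.
        for i, err in zip(idxs, td_errors):
            self.priorities[i] = abs(float(err)) + eps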