import time
from collections import deque

import numpy as np
import torch

# NOTE: reset_experience and the env_prototype/model_prototype factories are
#       project-local helpers, assumed to be provided by the surrounding package


def evaluator(process_ind, args,
              global_logs,
              evaluator_logs,
              env_prototype,
              model_prototype,
              global_model):
    # logs
    print("---------------------------->", process_ind, "evaluator")
    # env
    env = env_prototype(args.env_params, process_ind)
    # memory
    # model
    local_device = torch.device('cuda')  # or torch.device('cpu') when no GPU is available
    local_model = model_prototype(args.model_params,
                                  args.state_shape,
                                  args.action_space,
                                  args.action_shape).to(local_device)
    # sync global model to local
    local_model.load_state_dict(global_model.state_dict())

    # params

    # setup
    local_model.eval()
    torch.set_grad_enabled(False)

    last_eval_time = time.time()
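    # run one evaluation round every args.agent_params.evaluator_freq seconds (the loop
    # polls every 5 s) until the learner has taken args.agent_params.steps training steps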
    while global_logs.learner_step.value < args.agent_params.steps:
        time.sleep(5)
        if time.time() - last_eval_time > args.agent_params.evaluator_freq:
            # sync global model to local
            local_model.load_state_dict(global_model.state_dict())
            # main control loop
            experience = reset_experience()
            # counters
            step = 0
            episode_steps = 0
            episode_reward = 0.
            total_steps = 0
            total_reward = 0.
            nepisodes = 0
            nepisodes_solved = 0
            # flags
            flag_reset = True   # True when: terminal1 | episode_steps + 1 >= args.env_params.early_stop
            # while step < args.agent_params.evaluator_steps:
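            # NOTE: each evaluation round runs a fixed number of episodes (hard-coded to 2)
            #       instead of the step budget in args.agent_params.evaluator_steps above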
            while nepisodes < 2:
                # deal w/ reset
                if flag_reset:
                    # reset episode stats
                    episode_steps = 0
                    episode_reward = 0.
                    # reset game
                    experience = env.reset()
                    assert experience.state1 is not None
                    # flags
                    flag_reset = False

                # run a single step
                action, _, _ = local_model.get_action(experience.state1, device=local_device)
                experience = env.step(action)

                # check conditions & update flags
                if experience.terminal1:
                    nepisodes_solved += 1
                    flag_reset = True
                if args.env_params.early_stop and (episode_steps + 1) >= args.env_params.early_stop:
                    flag_reset = True

                # update counters & stats
                step += 1
                episode_steps += 1
                episode_reward += experience.reward
                if flag_reset:
                    nepisodes += 1
                    total_steps += episode_steps
                    total_reward += episode_reward

            # report stats
            # push local stats to logger
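            # NOTE: logger_lock doubles as a "fresh stats" flag: it is set to True here and is
            #       presumably read & cleared by the logging/main process before the next round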
            with evaluator_logs.logger_lock.get_lock():
                evaluator_logs.total_steps.value = total_steps
                evaluator_logs.total_reward.value = total_reward
                evaluator_logs.nepisodes.value = nepisodes
                evaluator_logs.nepisodes_solved.value = nepisodes_solved
                evaluator_logs.logger_lock.value = True

            # save model
            print("Saving model " + args.model_name + " ...")
            torch.save(local_model.state_dict(), args.model_name)
            print("Saved  model " + args.model_name + ".")

            last_eval_time = time.time()


def ddpg_actor(process_ind, args,
               global_logs,
               actor_logs,
               env_prototype,
               model_prototype,
               global_memory,
               global_model):
    # logs
    print("---------------------------->", process_ind, "actor")

    # env
    env = env_prototype(args.env_params, process_ind, args.num_envs_per_actor)
    # memory
    # model
    local_device = torch.device('cuda')  # or torch.device('cpu') when no GPU is available
    local_model = model_prototype(args.model_params,
                                  args.state_shape,
                                  args.action_space,
                                  args.action_shape).to(local_device)
    # sync global model to local
    local_model.load_state_dict(global_model.state_dict())

    # params
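    # exploration noise for the deterministic policy; with these theta/sigma arguments this is
    # typically an Ornstein-Uhlenbeck process, annealed over n_steps_annealing actor steps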
    random_process = args.agent_params.random_process(size=args.action_space,
        theta=0.15, sigma=0.3, n_steps_annealing=args.memory_params.memory_size*100)

    # setup
    local_model.eval()
    torch.set_grad_enabled(False)

    # main control loop
    experience = reset_experience()
    # counters
    step = 0
    episode_steps = 0
    episode_reward = 0.
    total_steps = 0
    total_reward = 0.
    nepisodes = 0
    nepisodes_solved = 0
    # flags
    flag_reset = True   # True when: terminal1 | episode_steps + 1 >= args.env_params.early_stop
    # local buffers for nstep
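    # states_nstep needs nstep + 2 slots (s_0 .. s_{n+1}) so that states_nstep[-2] sits exactly
    # nstep steps after states_nstep[0] once the buffers are full; the action/reward/terminal
    # buffers only need nstep + 1 slots since they hold one entry per transition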
    states_nstep     = deque(maxlen=args.agent_params.nstep + 2)
    actions_nstep    = deque(maxlen=args.agent_params.nstep + 1)
    rewards_nstep    = deque(maxlen=args.agent_params.nstep + 1)
    terminal1s_nstep = deque(maxlen=args.agent_params.nstep + 1)
    while global_logs.learner_step.value < args.agent_params.steps:
        # deal w/ reset
        if flag_reset:
            # reset episode stats
            episode_steps = 0
            episode_reward = 0.
            # reset game
            experience = env.reset()
            assert experience.state1 is not None
            # local buffers for nstep
            states_nstep.clear()
            states_nstep.append(experience.state1)
            actions_nstep.clear()
            rewards_nstep.clear()
            terminal1s_nstep.clear()
            # flags
            flag_reset = False

        # run a single step
        action, _, _ = local_model.get_action(experience.state1, random_process.sample(), device=local_device)
        experience = env.step(action)

        # local buffers for nstep
        states_nstep.append(experience.state1)
        actions_nstep.append(experience.action)
        rewards_nstep.append(experience.reward)
        terminal1s_nstep.append(experience.terminal1)

        # push to memory
        # NOTE: now states_nstep[-1] has not yet been passed through the model
        # NOTE: so its qvalue & max_qvalue are not yet available for calculating the tderr for priority
        # NOTE: so here we only push the second most recent tuple [-2] into the memory
        # NOTE: and do an extra forward of [-1] only when the current episode terminates
        # NOTE: then push the most recent tuple into memory
        # read as: from state0, take action0, accumulate rewards_between in n step, arrive at stateN, results in terminalN
        # state0: states_nstep[0]
        # action0: actions_nstep[0]
        # rewards_between: discounted sum over rewards_nstep[0] ~ rewards_nstep[-2]
        # stateN: states_nstep[-2]
        # terminalN: terminal1s_nstep[-2]
        # qvalue0: qvalues_nstep[0]
        # max_qvalueN: max_qvalues_nstep[-1] # NOTE: this stores the value for states_nstep[-2]
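        # e.g. with nstep = 3 and full buffers, this pushes the 3-step transition
        # (s0, a0, r0 + gamma*r1 + gamma^2*r2, gamma^3, s3, t3); during the first couple of
        # steps of an episode shorter 1- and 2-step transitions are pushed while the buffers fill
        # (no q-values are computed in this actor, so the priority is simply fed as 0.)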
        if len(states_nstep) >= 3:
            rewards_between = np.sum([rewards_nstep[i] * np.power(args.agent_params.gamma, i) for i in range(len(rewards_nstep) - 1)])
            gamma_sn = np.power(args.agent_params.gamma, len(states_nstep) - 2)
            priority = 0.
            global_memory.feed((states_nstep[0],
                                actions_nstep[0],
                                [rewards_between],
                                [gamma_sn],
                                states_nstep[-2],
                                terminal1s_nstep[-2]),
                                priority)

        # check conditions & update flags
        if experience.terminal1:
            nepisodes_solved += 1
            flag_reset = True
        if args.env_params.early_stop and (episode_steps + 1) >= args.env_params.early_stop:
            flag_reset = True

        # NOTE: now we do the extra forward step of the most recent state
        # NOTE: then push the tuple into memory, if the current episode ends
        if flag_reset:
            if len(states_nstep) >= (args.agent_params.nstep + 2):    # (nstep+1) experiences available, use states_nstep[1] as s0
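                # NOTE: the (i - 1) exponent re-bases the discount at rewards_nstep[1],
                #       i.e. the first reward earned from states_nstep[1], which now plays the role of s0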
                rewards_between = np.sum([rewards_nstep[i] * np.power(args.agent_params.gamma, i - 1) for i in range(1, len(rewards_nstep))])
                gamma_sn = np.power(args.agent_params.gamma, len(states_nstep) - 2)
                priority = 0.
                global_memory.feed((states_nstep[1],
                                    actions_nstep[1],
                                    [rewards_between],
                                    [gamma_sn],
                                    states_nstep[-1],
                                    terminal1s_nstep[-1]),
                                    priority)
            else:                                   # not all available, just use the oldest states_nstep[0] as s0
                rewards_between = np.sum([rewards_nstep[i] * np.power(args.agent_params.gamma, i) for i in range(len(rewards_nstep))])
                gamma_sn = np.power(args.agent_params.gamma, len(states_nstep) - 1)
                priority = 0.
                global_memory.feed((states_nstep[0],
                                    actions_nstep[0],
                                    [rewards_between],
                                    [gamma_sn],
                                    states_nstep[-1],
                                    terminal1s_nstep[-1]),
                                    priority)

        # update counters & stats
        with global_logs.actor_step.get_lock():
            global_logs.actor_step.value += 1
        step += 1
        episode_steps += 1
        episode_reward += experience.reward
        if flag_reset:
            nepisodes += 1
            total_steps += episode_steps
            total_reward += episode_reward

        # sync global model to local
        if step % args.agent_params.actor_sync_freq == 0:
            local_model.load_state_dict(global_model.state_dict())

        # report stats
        if step % args.agent_params.actor_freq == 0: # then push local stats to logger & reset local
            # push local stats to logger
            with actor_logs.nepisodes.get_lock():
                actor_logs.total_steps.value += total_steps
                actor_logs.total_reward.value += total_reward
                actor_logs.nepisodes.value += nepisodes
                actor_logs.nepisodes_solved.value += nepisodes_solved
            # reset local stats
            total_steps = 0
            total_reward = 0.
            nepisodes = 0
            nepisodes_solved = 0


def tester(process_ind, args, env_prototype, model_prototype):
    # logs
    print("---------------------------->", process_ind, "tester")
    # env
    env = env_prototype(args.env_params, process_ind)
    # memory
    # model
    local_device = torch.device('cpu')
    local_model = model_prototype(args.model_params, args.state_shape,
                                  args.action_space,
                                  args.action_shape).to(local_device)
    # sync global model to local
    local_model.load_state_dict(torch.load(args.model_file, map_location=local_device))  # map_location lets CUDA-trained weights load on CPU

    # params

    # setup
    local_model.eval()
    torch.set_grad_enabled(False)

    # main control loop
    experience = reset_experience()
    # counters
    step = 0
    episode_steps = 0
    episode_reward = 0.
    total_steps = 0
    total_reward = 0.
    nepisodes = 0
    nepisodes_solved = 0
    # flags
    flag_reset = True  # True when: terminal1 | episode_steps + 1 >= args.env_params.early_stop
    while nepisodes < args.agent_params.tester_nepisodes:
        # deal w/ reset
        if flag_reset:
            # reset episode stats
            episode_steps = 0
            episode_reward = 0.
            # reset game
            experience = env.reset()
            assert experience.state1 is not None
            # flags
            flag_reset = False

        # run a single step
        action, _, _ = local_model.get_action(experience.state1)
        experience = env.step(action)

        # check conditions & update flags
        if experience.terminal1:
            nepisodes_solved += 1
            flag_reset = True
        if args.env_params.early_stop and (episode_steps + 1) >= args.env_params.early_stop:
            flag_reset = True

        # update counters & stats
        step += 1
        episode_steps += 1
        episode_reward += experience.reward
        if flag_reset:
            nepisodes += 1
            total_steps += episode_steps
            total_reward += episode_reward
            print("Testing Episode ", nepisodes)

    # report stats
    print("nepisodes:", nepisodes)
    print("avg_steps:", total_steps / nepisodes)
    print("avg_reward:", total_reward / nepisodes)
    print("nepisodes_solved:", nepisodes_solved)
    print("repisodes_solved:", nepisodes_solved / nepisodes)