def learn(self):
        # For logging
        time_str = time.strftime("%Y%m%d-%H%M%S")
        logger_ins = logger.Logger('/home/swn/catkin_ws/src/turtlebot3_waypoint_navigation/src/log', output_formats=[logger.HumanOutputFormat(sys.stdout)])
        board_logger = tensorboard_logging.Logger(os.path.join(logger_ins.get_dir(), "tf_board", time_str))

        # Copy class members into local variables for readability
        total_timesteps = int(self.total_timesteps)
        nenvs = 1
        #nenvs = env.num_envs # for multiple instance training
        ob_space = self.env.observation_space
        ac_space = self.env.action_space
        nbatch = nenvs * self.nsteps
        nminibatches = self.nminibatches
        nbatch_train = nbatch // nminibatches
        noptepochs = self.noptepochs
        nsteps = self.nsteps
        save_interval = self.save_interval
        log_interval = self.log_interval
        restore_path = self.restore_path
        gamma = self.gamma
        lam = self.lam
        lr = self.lr
        cliprange = self.cliprange
        deterministic = self.deterministic
        step_reward = [[0.0, 0.0]]
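        # step_reward collects [update index, mean episode reward] rows for the MATLAB export below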

        # Define a function to make Actor-Critic Model 
        make_model = lambda : Model(policy=self.policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                                    nsteps=self.nsteps, ent_coef=self.ent_coef, vf_coef=self.vf_coef,
                                    max_grad_norm=self.max_grad_norm, deterministic=self.deterministic)
        
        # Save function 
        # if save_interval and logger_ins.get_dir():
        #     import cloudpickle
        #     with open(osp.join(logger_ins.get_dir(), 'make_model.pkl'), 'wb') as fh:
        #         fh.write(cloudpickle.dumps(make_model))

        # Make a model
        model = make_model()

        # Restore when the path is provided
        if restore_path is not None:
            model.restore(restore_path)
        
        # Create a runner instance (generating samples with nsteps)
        runner = Runner(env=self.env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

        # Double ended queue with max size 100 to store episode info
        epinfobuf = deque(maxlen=100)

        # Get the start time
        tfirststart = time.time()

        # Calculate the number of updates (iterations)
        nupdates = total_timesteps//nbatch
        assert(nupdates > 0)

        '''
        PPO (iterating):
        1. Run the policy in the environment for T timesteps
        2. Compute advantage estimates (in the Model class)
        3. Optimise the surrogate loss w.r.t. the policy weights for K epochs,
           using minibatches of size M <= N (# of actors) * T (timesteps)
        4. Update the weights (in the Model class)
        '''
        # In every update, one loop of PPO algorithm is executed
        for update in range(1, nupdates+1):
            
            # INITIALISE PARAMETERS
            assert nbatch % nminibatches == 0
            nbatch_train = nbatch // nminibatches
            tstart = time.time()
            frac = 1.0 - (update - 1.0) / nupdates
            lrnow = lr(frac)
            cliprangenow = cliprange(frac)
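            # frac decays linearly from 1.0 towards 1/nupdates; lr and cliprange are
            # callables of this fraction, so schedules such as lambda f: f * base
            # anneal the learning rate and clip range over training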

            # 1. Run policy and get samples for nsteps
            ids, obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
            epinfobuf.extend(epinfos)
            mblossvals = []

            # Do not train or log if in deterministic mode:
            if deterministic:
                continue
            
            # 3. Optimise the surrogate loss w.r.t. the policy weights, for K epochs
            #    over minibatches of size M <= N (# of actors) * T (timesteps)
            if states is None: # nonrecurrent version
                inds = np.arange(nbatch)
                # Update weights using optimiser by noptepochs
                for _ in range(noptepochs):
                    #np.random.shuffle(inds)

                    # In each epoch, sweep the whole batch in minibatches of size nbatch_train
                    # (nbatch = nminibatches * nbatch_train)
                    for start in range(0, nbatch, nbatch_train):
                        end = start + nbatch_train
                        mbinds = inds[start:end]
                        # 4. Update weights
                        mblossvals.append(model.train(lrnow, cliprangenow, ids[mbinds], [obs[i] for i in mbinds],
                                        returns[mbinds], masks[mbinds], actions[mbinds],
                                        values[mbinds], neglogpacs[mbinds]))

            else: # recurrent version
                assert nenvs % nminibatches == 0
                envsperbatch = nenvs // nminibatches
                envinds = np.arange(nenvs)
                flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
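                # flatinds maps (env, step) pairs to flat sample indices, so each
                # minibatch keeps whole per-environment trajectories together as
                # required for propagating the recurrent policy's hidden state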
                envsperbatch = nbatch_train // nsteps
                for _ in range(noptepochs):
                    #np.random.shuffle(envinds)
                    for start in range(0, nenvs, envsperbatch):
                        end = start + envsperbatch
                        mbenvinds = envinds[start:end]
                        mbflatinds = flatinds[mbenvinds].ravel()
                        mbstates = states[mbenvinds]
                        # 4. Update weights
                        mblossvals.append(model.train(lrnow, cliprangenow,
                                        [obs[i] for i in mbflatinds], returns[mbflatinds], masks[mbflatinds], actions[mbflatinds],
                                        values[mbflatinds], neglogpacs[mbflatinds], mbstates))

            # Calculate mean loss
            lossvals = np.mean(mblossvals, axis=0)

            tnow = time.time()
            fps = int(nbatch / (tnow - tstart))
            
            '''
            Logging and saving model & weights
            '''
            if update % log_interval == 0 or update == 1:
                #ev = explained_variance(values, returns)
                logger_ins.logkv("serial_timesteps", update*nsteps)
                logger_ins.logkv("nupdates", update)
                logger_ins.logkv("total_timesteps", update*nbatch)
                logger_ins.logkv("fps", fps)
                #logger.logkv("explained_variance", float(ev))
                logger_ins.logkv('eprewmean', self.safemean([epinfo['r'] for epinfo in epinfobuf]))
                logger_ins.logkv('eplenmean', self.safemean([epinfo['l'] for epinfo in epinfobuf]))
                logger_ins.logkv('time_elapsed', tnow - tfirststart)
                for (lossval, lossname) in zip(lossvals, model.loss_names):
                    logger_ins.logkv(lossname, lossval)
                logger_ins.dumpkvs()
                for (lossval, lossname) in zip(lossvals, model.loss_names):
                    board_logger.log_scalar(lossname, lossval, update)
                board_logger.log_scalar("eprewmean", self.safemean([epinfo['r'] for epinfo in epinfobuf]), update)
                board_logger.flush()

                reward_arr = np.asarray([epinfo['r'] for epinfo in epinfobuf])
                #reward_new = np.delete(reward_arr, np.where(reward_arr == 0.0))
                step_reward = np.append(step_reward, [[update, self.safemean(reward_arr)]], axis=0)
                sio.savemat('/home/swn/catkin_ws/src/Turtlebot3_Pheromone/src/log/MATLAB/step_reward_{}.mat'.format(self.time_str), {'data':step_reward},True,'5',False,False,'row')
            if save_interval and (update % save_interval == 0 or update == 1) and logger_ins.get_dir():
                checkdir = osp.join(logger_ins.get_dir(), 'checkpoints', '{}'.format(self.time_str))
                if not os.path.isdir(checkdir):
                    os.makedirs(checkdir)
                savepath = osp.join(checkdir, '%.5i'%update +"r"+"{:.2f}".format(self.safemean([epinfo['r'] for epinfo in epinfobuf])))
                print('Saving to', savepath)
                model.save(savepath)
        print("Done with training. Exiting.")
        self.env.close()
        return model
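# --- Hedged helper sketch (assumption, modelled on OpenAI baselines' ppo2) ---
# learn() above calls self.safemean() when logging 'eprewmean' and 'eplenmean'.
# A minimal standalone version returns NaN instead of raising while the
# episode-info buffer is still empty:
import numpy as np

def safemean(xs):
    # Mean of xs, or NaN when no episodes have finished yet
    return np.nan if len(xs) == 0 else np.mean(xs)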
Example #2
def main(args):
    time_str = time.strftime("%Y%m%d-%H%M%S")
    logger_ins = logger.Logger(
        HOME + '/catkin_ws/src/Turtlebot3_Pheromone/src/log',
        output_formats=[logger.HumanOutputFormat(sys.stdout)])
    board_logger = tensorboard_logging.Logger(
        os.path.join(logger_ins.get_dir(), "tf_board", time_str))
    ########################################################
    game_state = phero_turtlebot_exp2.Env(
    )  # game_state has frame_step(action) function
    actor_critic = ActorCritic(game_state)
    random.seed(args.random_seed)
    ########################################################
    num_trials = 600
    trial_len = 256
    log_interval = 5
    train_indicator = 1
    tfirststart = time.time()

    # Reward Logging
    with open(HOME +
              '/catkin_ws/src/Turtlebot3_Pheromone/src/log/csv/{}.csv'.format(
                  actor_critic.file_name),
              mode='w') as csv_file:
        csv_writer = csv.writer(csv_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(['Episode', 'Average Reward'])

    # Double ended queue with max size 100 to store episode info
    epinfobuf = deque(maxlen=100)

    # Experiment related
    num_robots = game_state.num_robots

    current_state = game_state.reset()

    # actor_critic.read_human_data()

    step_reward = np.array([0, 0]).reshape(1, 2)
    #step_Q = [0,0]
    step = 0

    if (train_indicator == 2):
        for i in range(num_trials):
            print("trial:" + str(i))
            #game_state.step(0.3, 0.2, 0.0)
            #game_state.reset()

            current_state = game_state.reset()
            ##############################################################################################
            total_reward = 0

            for j in range(100):
                step = step + 1
                #print("step is %s", step)

                ###########################################################################################
                #print('wanted value is %s:', game_state.observation_space.shape[0])
                current_state = current_state.reshape(
                    (1, game_state.observation_space.shape[0]))
                action, eps = actor_critic.act(current_state)
                action = action.reshape((1, game_state.action_space.shape[0]))
                print("action is speed: %s, angular: %s", action[0][1],
                      action[0][0])
                _, new_state, reward, done, _ = game_state.step(
                    0.1, action[0][1] * 5, action[0][0] * 5
                )  # the environment returns the next state and reward; 'done' flags a crash or episode end
                total_reward = total_reward + reward

    if (train_indicator == 1):

        # actor_critic.actor_model.load_weights("actormodel-90-1000.h5")
        # actor_critic.critic_model.load_weights("criticmodel-90-1000.h5")
        step_reward = np.array([0, 0]).reshape(1, 2)
        for i in range(num_trials):
            print("trial:" + str(i))

            #game_state.step(0.3, 0.2, 0.0)
            #game_state.reset()
            ''' Get states of multiple robots (num_robots x num_states) '''
            _, current_states = game_state.reset()
            ##############################################################################################
            #total_reward = 0
            epinfos = []
            for j in range(trial_len):

                ###########################################################################################
                #print('wanted value is %s:', game_state.observation_space.shape[0])
                current_states = current_states.reshape(
                    (num_robots, game_state.observation_space.shape[0]))
                actions = []
                for k in range(num_robots):
                    action, eps = actor_critic.act(current_states[k])
                    action = action.reshape(
                        (1, game_state.action_space.shape[0]))
                    actions.append(action)
                actions = np.squeeze(np.asarray(actions))
                #print("Actions: {}".format(actions))
                #print("action is speed: %s, angular: %s", action[0][1], action[0][0])
                _, new_states, rewards, dones, infos = game_state.step(
                    actions, 0.1
                )  # the environment returns next states and rewards; 'dones' flag crashes or episode ends
                #print("Rewards: {}".format(rewards))
                #total_reward = total_reward + reward
                ###########################################################################################

                if j == (trial_len - 1):
                    dones = np.array([True] * game_state.num_robots).reshape(
                        game_state.num_robots, 1)
                    #print("this is reward:", total_reward)
                    #print('eps is', eps)

                step = step + 1
                #plot_reward(step,reward,ax,fig)
                #step_reward = np.append(step_reward,[step,reward])
                #step_start = time.time()
                #sio.savemat('step_reward.mat',{'data':step_reward},True,'5', False, False,'row')
                #print("step is %s", step)
                #print("info: {}".format(info[0]['episode']['r']))
                #Q_values = actor_critic.read_Q_values(current_state, action)
                #step_Q = np.append(step_Q,[step,Q_values[0][0]])
                #print("step_Q is %s", Q_values[0][0])
                #sio.savemat('step_Q.mat',{'data':step_Q},True,'5', False, False,'row')
                #print("Train_step time: {}".format(time.time() - step_start))

                epinfos.append(infos[0]['episode'])

                start_time = time.time()

                if (j % 5 == 0):
                    actor_critic.train(j)
                    actor_critic.update_target()

                end_time = time.time()
                print("Train time: {}".format(end_time - start_time))
                #print("new_state: {}".format(new_state))
                new_states = new_states.reshape(
                    (num_robots, game_state.observation_space.shape[0]))

                # print shape of current_state
                #print("current_state is %s", current_state)
                ##########################################################################################
                actor_critic.remember(current_states, actions, rewards,
                                      new_states, dones)
                actor_critic.replay_buffer.add(current_states, actions,
                                               rewards, new_states, dones)
                current_states = new_states

                ##########################################################################################
            if (i % 10 == 0):
                actor_critic.save_weight(i, trial_len)
            epinfobuf.extend(epinfos)
            tnow = time.time()
            #fps = int(nbatch / (tnow - tstart))

            ##################################################
            ##      Logging and saving model & weights      ##
            ##################################################

            if i % log_interval == 0 or i == 0:
                #ev = explained_variance(values, returns)
                reward_mean = safemean([epinfo['r'] for epinfo in epinfobuf])
                logger_ins.logkv("serial_timesteps", i * trial_len)
                logger_ins.logkv("nupdates", i)
                logger_ins.logkv("total_timesteps", i * trial_len)
                logger_ins.logkv('eprewmean', reward_mean)
                logger_ins.logkv(
                    'eplenmean',
                    safemean([epinfo['l'] for epinfo in epinfobuf]))
                logger_ins.logkv('time_elapsed', tnow - tfirststart)
                # for (lossval, lossname) in zip(lossvals, model.loss_names):
                #     logger_ins.logkv(lossname, lossval)
                # logger_ins.dumpkvs()
                # for (lossval, lossname) in zip(lossvals, model.loss_names):
                #     board_logger.log_scalar(lossname, lossval, update)
                board_logger.log_scalar("eprewmean", reward_mean, i)

                board_logger.flush()
                with open(
                        HOME +
                        '/catkin_ws/src/Turtlebot3_Pheromone/src/log/csv/{}.csv'
                        .format(actor_critic.file_name),
                        mode='a') as csv_file:
                    csv_writer = csv.writer(csv_file,
                                            delimiter=',',
                                            quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                    csv_writer.writerow(['%i' % i, '%0.2f' % reward_mean])
                step_reward = np.append(step_reward,
                                        [[i, reward_mean]],
                                        axis=0)
                sio.savemat(
                    HOME +
                    '/catkin_ws/src/Turtlebot3_Pheromone/src/log/MATLAB/step_reward_{}.mat'
                    .format(actor_critic.time_str), {'data': step_reward},
                    True, '5', False, False, 'row')

    if train_indicator == 0:
        for i in range(num_trials):
            print("trial:" + str(i))
            current_state = game_state.reset()

            actor_critic.actor_model.load_weights(path +
                                                  "actormodel-2950-256.h5")
            actor_critic.critic_model.load_weights(path +
                                                   "criticrmodel-2950-256.h5")
            ##############################################################################################
            total_reward = 0

            for j in range(trial_len):

                ###########################################################################################
                current_state = current_state.reshape(
                    (1, game_state.observation_space.shape[0]))

                start_time = time.time()
                action = actor_critic.play(
                    current_state
                )  # TODO: check whether the network output range should be rescaled to [0, 2*pi]
                action = action.reshape((1, game_state.action_space.shape[0]))
                end_time = time.time()
                print(1 / (end_time - start_time),
                      "fps for calculating next step")

                _, new_state, reward, done, _ = game_state.step(
                    0.1, action[0][1], action[0][0]
                )  # the environment returns the next state and reward; 'done' flags a crash or episode end
                total_reward = total_reward + reward
                ###########################################################################################

                if j == (trial_len - 1):
                    done = 1
                    print("this is reward:", total_reward)

                # if (j % 5 == 0):
                # 	actor_critic.train()
                # 	actor_critic.update_target()

                new_state = new_state.reshape(
                    (1, game_state.observation_space.shape[0]))
                # actor_critic.remember(cur_state, action, reward, new_state, done)   # remember all the data using memory, memory data will be samples to samples automatically.
                # cur_state = new_state

                ##########################################################################################
                #actor_critic.remember(current_state, action, reward, new_state, done)
                current_state = new_state
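# --- Hedged interface sketch (assumption, inferred from the training branch above) ---
# main() with train_indicator == 1 expects phero_turtlebot_exp2.Env to expose a
# multi-robot API roughly like the skeleton below: reset() returning
# (ids, states), step(actions, dt) returning (ids, states, rewards, dones, infos),
# gym-style observation_space / action_space attributes, and infos[k]['episode']
# dicts with 'r' (reward) and 'l' (length) used for logging. Names and the
# placeholder state size are illustrative only.
import numpy as np

class EnvSkeletonExample(object):
    num_robots = 2
    state_size = 8      # placeholder; the real env defines observation_space.shape[0]

    def reset(self):
        ids = np.arange(self.num_robots)
        states = np.zeros((self.num_robots, self.state_size))
        return ids, states

    def step(self, actions, dt):
        ids = np.arange(self.num_robots)
        states = np.zeros((self.num_robots, self.state_size))
        rewards = np.zeros((self.num_robots, 1))
        dones = np.zeros((self.num_robots, 1), dtype=bool)
        infos = [{'episode': {'r': 0.0, 'l': 0}} for _ in range(self.num_robots)]
        return ids, states, rewards, dones, infos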
Example #3
def learn(policy,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          restore_path=None,
          deterministic=False):

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
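    # lr and cliprange may be given as constants or as schedules; constfn()
    # (see the sketch after this example) wraps a float so both cases expose the
    # same fn(frac) interface used in the update loop below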
    total_timesteps = int(total_timesteps)

    time_str = time.strftime("%m_%d_%H_%M_%S")
    board_logger = tensorboard_logging.Logger(
        os.path.join(logger.get_dir(), "tf_board", time_str))

    nenvs = 1
    #nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               deterministic=deterministic)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if restore_path is not None:
        model.restore(restore_path)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    assert (nupdates > 0)
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        ids, obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []

        # Do not train or log if in deterministic mode:
        if deterministic:
            continue

        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                #np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, ids[mbinds],
                                    [obs[i] for i in mbinds], returns[mbinds],
                                    masks[mbinds], actions[mbinds],
                                    values[mbinds], neglogpacs[mbinds]))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                #np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow,
                                    [obs[i] for i in mbflatinds],
                                    returns[mbflatinds], masks[mbflatinds],
                                    actions[mbflatinds], values[mbflatinds],
                                    neglogpacs[mbflatinds], mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            #ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            #logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
            board_logger.log_scalar(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]),
                update)
            board_logger.flush()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            if not os.path.isdir(checkdir):
                os.makedirs(checkdir)
            savepath = osp.join(
                checkdir, "r" +
                "{:.2f}".format(safemean([epinfo['r']
                                          for epinfo in epinfobuf])) +
                '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    print("Done with training. Exiting.")
    env.close()
    return model
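# --- Hedged helper sketch (assumption, modelled on OpenAI baselines' ppo2) ---
# learn() accepts lr and cliprange either as floats or as schedules; constfn()
# wraps a constant so both cases can be called uniformly as fn(frac) inside the
# update loop. A minimal version:
def constfn(val):
    def f(_):
        # Ignore the progress fraction and always return the constant value
        return val
    return f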