def obs_model(args, data=[], **kwargs):
    """Build an ObservableModel from exploration trajectories and train an RNN filter on them."""
    X_obs, X_act = data
    x_dim = X_obs[0].shape[1]
    model = ObservableModel(obs_dim=x_dim)
    filter = rnn_filter.ObservableRNNFilter(model)
    filter.train(X_obs, X_act, on_unused_input='ignore')
    return model, filter
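The observation dimensionality here is read straight off the exploration data: each entry of X_obs is a (timesteps x obs_dim) array, so X_obs[0].shape[1] gives obs_dim. A standalone numpy sketch with made-up trajectory shapes (not data from the repo):

import numpy as np

# Two illustrative exploration trajectories with 4-dim observations and 2-dim actions.
X_obs = [np.random.randn(50, 4), np.random.randn(80, 4)]
X_act = [np.random.randn(50, 2), np.random.randn(80, 2)]
x_dim = X_obs[0].shape[1]   # -> 4, passed as obs_dim to ObservableModel
print(x_dim)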
Example #2
    def mctsrun(self, env_exp, model, numsteps=100, render=False):
        """Run a single MCTS-driven episode and accumulate discounted and undiscounted returns."""
        rngs = []
        if model is None:
            model = ObservableModel(self.dimensions[0])
        print(env_exp)
        mct = MCTS(model)
        rngs.append(self.rng().get_state())
        o = env_exp.reset()
        obs = model._process_obs(o)
        print("obs", obs)
        init_q = model.initial_state
        a = np.zeros(2)
        act = model._process_act(a)
        q = mct.update_state(init_q, obs, act)
        o0 = np.copy(o)
        q0 = np.copy(q)
        undiscountedReturn = 0
        discountedReturn = 0
        discount = 1
        discountconstant = 0.9
        print(model)

        done = False
        while not done:

            if render: self.render()
            a = mct.SelectAction(q)
            if a == 1:
                action = np.array([10, 0])
            else:
                action = np.array([-10, 0])
            o, r, done = self.step(action)
            action = np.array([0, a])
            print(o)
            #print(action)
            q = mct.update_state(q, o, action)
            undiscountedReturn += r
            discountedReturn += r * discount
            discount *= discountconstant
            print('return:', undiscountedReturn)
            if done: break
        print('discountedreturn:', discountedReturn)
        print('undiscountedreturn:', undiscountedReturn)
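The return bookkeeping above multiplies the discount in after each reward is added, so the first reward is counted at full weight. A standalone sketch with illustrative rewards and the same 0.9 discount constant:

rewards = [1.0, 0.5, 0.25]           # illustrative per-step rewards
discount_constant = 0.9

undiscounted_return = 0.0
discounted_return = 0.0
discount = 1.0
for r in rewards:
    undiscounted_return += r
    discounted_return += r * discount
    discount *= discount_constant

print(undiscounted_return)           # 1.75
print(discounted_return)             # 1.0 + 0.5*0.9 + 0.25*0.81 = 1.6525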
Example #3
def load_rpsp_policy(args, model_exp, **kwargs):
    """
    Load an RPSP policy and policy updater
    @param args: command line arguments
    @param model_exp: observable model
    @param kwargs: policy updater keyword args
    @return: observable model, policy updater, and logger
    """
    model = ObservableModel(obs_dim=args.x_dim)

    X_obs, X_act = get_exploration_trajs(args, model_exp, kwargs.get('env'),
                                         args.a_dim,
                                         kwargs.get('min_traj_length'))
    tic = time()
    psr, filter = model_call(args, data=[X_obs, X_act], x_dim=args.x_dim)
    print('INIT RPSP without refinement takes:', time() - tic)
    state_dim = filter.state_dimension
    pi_react = get_policy[args.pi_exp](x_dim=state_dim,
                                       output_dim=args.a_dim,
                                       num_layers=args.nL,
                                       nh=args.nh,
                                       activation=args.nn_act,
                                       rng=args.rng,
                                       min_std=args.min_std)
    if isinstance(filter, rffpsr_rnn.RFFPSR_RNN):
        pi = psrlite_policy.RFFPSRNetworkPolicy(filter, pi_react,
                                                np.zeros((args.a_dim)))
    else:
        pi = psrlite_policy.PSRLitePolicy(filter, pi_react,
                                          np.zeros((args.a_dim)))

    pp = Log(args, args.flname, pred_model=filter)
    print('Building policy psr graph')

    tic = time()
    PiUpdater = policy_updater[args.vr][args.method](pi, **kwargs)
    print('took ', time() - tic)
    return model, PiUpdater, pp
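The updater is looked up through a nested dictionary, policy_updater[args.vr][args.method](pi, **kwargs). A standalone sketch of that dispatch pattern; the keys and updater classes below are placeholders, not the repo's actual registry:

class VRPGUpdater:
    def __init__(self, policy, lr=0.01, **kwargs):
        self.policy, self.lr = policy, lr

class TRPOUpdater:
    def __init__(self, policy, trpo_step=0.01, **kwargs):
        self.policy, self.trpo_step = policy, trpo_step

# Outer key: variance-reduction flag; inner key: method name (placeholder names).
policy_updater = {True:  {'gradient': VRPGUpdater, 'trpo': TRPOUpdater},
                  False: {'gradient': VRPGUpdater, 'trpo': TRPOUpdater}}

updater = policy_updater[True]['trpo'](policy='pi_placeholder', trpo_step=0.01)
print(type(updater).__name__)        # TRPOUpdater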
Example #4
def load_rpsp_policy(args, model_exp, **kwargs):
    """
    Load an RPSP policy and policy updater
    @param args: command line arguments
    @param model_exp: observable model
    @param kwargs: policy updater keyword args
    @return: observable model, policy updater, and logger
    """
    model = ObservableModel(obs_dim=args.x_dim)

    X_obs, X_act = get_exploration_trajs(args,
                                         model_exp,
                                         kwargs.get('env'),
                                         args.a_dim,
                                         kwargs.get('min_traj_length'))
    tic = time()
    psr, filter = model_call(args, data=[X_obs, X_act], x_dim=args.x_dim)
    print('INIT RPSP without refinement takes:', time() - tic)
    state_dim = filter.state_dimension

    pp = Log(args, args.flname, pred_model=filter)
    print('took ', time() - tic)
    return filter, pp
Example #5
    def run(self,
            model,
            policy,
            max_traj_length,
            min_traj_length=0,
            num_trajs=0,
            num_samples=0,
            render=False):
        '''
        Generate trajectories of length up to max_traj_length each.
        
        Returns:
        A list of trajectories (See models.Trajectory).
        
        Additional parameters:
        - model: An object that implements models.FilteringModel interface. 
            Used to track the state. If None, an ObservableModel is used,
            which returns the current observation.
        - policy: An object that implements policies.Policy interface. 
            Used to provide actions.
        - render: Whether to render generated trajectories in real-time. 
            This calls the 'render' method, which needs to be implemented.
        - num_trajs: Number of trajectories to return.
        - num_samples: Total number of samples in generated trajectories.
            
        Must set num_trajs or num_samples (but not both) to a positive number.
        '''
        trajs = []
        rngs = []

        if model is None:
            model = ObservableModel(self.dimensions[0])

        if (num_samples > 0) == (num_trajs > 0):
            raise ValueError(
                'Must specify exactly one of num_trajs and num_samples')

        done_all = False
        d_o, d_a = self.dimensions
        i_sample = 0
        tic = time.time()
        best_traj = [0, [0.0]]

        while not done_all:
            obs = np.empty((max_traj_length, d_o))
            act = np.empty((max_traj_length, d_a))
            rwd = np.empty((max_traj_length, 1))
            vel = np.empty((max_traj_length, 1))
            act_probs = np.empty((max_traj_length, 1))
            env_states = []
            states = np.empty((max_traj_length, model.state_dimension))
            dbg_info = {}
            rngs.append(self.rng().get_state())

            # Make a reset for each trajectory
            policy.reset()
            o = self.reset()
            q = model.reset(o)
            o0 = np.copy(o)
            q0 = np.copy(q)
            env_states.append(self.env_state)
            forward_pos = self.env_state[0][0]

            for j in range(max_traj_length):
                if render: self.render()
                a, p, info = policy.sample_action(q)
                o, r, done = self.step(a)
                env_states.append(self.env_state)
                q = model.update_state(o, a)
                act[j, :] = a
                obs[j, :] = o
                rwd[j] = r
                states[j, :] = q
                act_probs[j, :] = p
                vel[j] = (self.env_state[0][0] - forward_pos) / float(self.dt)
                forward_pos = self.env_state[0][0]
                for (k, v) in info.items():
                    if j == 0:
                        # Build arrays for diagnostic info
                        if type(v) is np.ndarray:
                            dbg_info[k] = np.empty((max_traj_length, v.size))
                        else:
                            dbg_info[k] = np.empty((max_traj_length, 1))

                    dbg_info[k][j, :] = v  # act variance

                if done: break

            j += 1
            drop_traj = False

            if j >= min_traj_length:
                # Check if we need to truncate trajectory to maintain num_samples
                if num_samples > 0 and i_sample + j >= num_samples:
                    j -= (i_sample + j - num_samples)
                    done_all = True
                # TODO: remove? This will never happen because of the outer if.
                if j < min_traj_length:
                    # Last trajectory is too short. Ignore it.
                    drop_traj = True

                if not drop_traj:
                    i_sample += j

                    new_traj = Trajectory(obs=obs[:j, :],
                                          states=states[:j, :],
                                          act=act[:j, :],
                                          rewards=rwd[:j, :],
                                          act_probs=act_probs[:j, :],
                                          obs0=o0,
                                          state0=q0,
                                          rng=rngs[-1],
                                          vel=vel[:j, :])

                    for (k, v) in dbg_info.items():
                        dbg_info[k] = v[:j, :]
                    new_traj.dbg_info = dbg_info
                    trajs.append(new_traj)

                    if np.sum(rwd[:j, :]) >= np.sum(
                            trajs[best_traj[0]].rewards):
                        best_traj[0] = len(trajs) - 1
                        best_traj[1] = env_states

                    if num_trajs > 0 and len(trajs) == num_trajs:
                        done_all = True
        print('Gathering trajectories took:', time.time() - tic)
        # add best trajectory
        trajs[best_traj[0]].env_states = best_traj[1]  # save env states for best trajectory
        trajs[-1].bib = best_traj[0]  # save best in batch on last trajectory
        return trajs
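As the docstring notes, exactly one of num_trajs and num_samples must be positive; the (num_samples > 0) == (num_trajs > 0) test rejects both the "neither" and the "both" case. A standalone sketch of that check (check_stopping_spec is just an illustrative helper):

def check_stopping_spec(num_trajs=0, num_samples=0):
    if (num_samples > 0) == (num_trajs > 0):
        raise ValueError('Must specify exactly one of num_trajs and num_samples')

check_stopping_spec(num_trajs=10)          # OK: stop after 10 trajectories
check_stopping_spec(num_samples=2000)      # OK: stop after 2000 total samples
# check_stopping_spec()                    # raises: neither was given
# check_stopping_spec(num_trajs=10, num_samples=2000)   # raises: both were given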
Example #6
def run_policy_continuous(args, flname):
    """
    Train a continuous RPSPnet from commandline arguments
    @param args: command line args
    @param flname: filename to store results
    @return: logger results to save
    """
    args.flname = flname
    env = load_environment(args)
    (x_dim, a_dim) = env.dimensions
    args.a_dim = a_dim
    args.x_dim = x_dim
    print('x_dim:', x_dim)
    model_exp = ObservableModel(x_dim)
    pi_exp = policies.RandomGaussianPolicy(x_dim, rng=args.rng)
    baseline = args.b
    min_traj_length = getattr(args, 'mintrajlen', args.past + args.fut + 2)
    PiUpdater = None
    fkwargs = {'baseline': baseline, 'lr': args.lr, 'beta_reinf': args.wrwd,
               'beta_pred': args.wpred, 'beta_pred_decay': args.wdecay,
               'beta_only_reinf': args.wrwd_only, 'gamma': args.gamma,
               'grad_step': args.grad_step, 'trpo_step': args.trpo_step,
               'past': args.past, 'fut': args.fut, 'cg_opt': args.cg_opt,
               'max_traj_length': args.len, 'num_trajs': args.numtrajs,
               'normalize_grad': args.norm_g, 'hvec': args.hvec,
               'env': env, 'min_traj_len': min_traj_length}
    print('build updater ... ', args.method)

    #run the observable model with reactive policy
    if args.method == 'obsVR':
        model, PiUpdater, pp = load_observable_policy(args, model_exp, **fkwargs)
    elif args.method == 'arVR':
        model, PiUpdater, pp = load_finite_mem_policy(args, model_exp, **fkwargs)
    else:
        #run the psr network with obs model or psr model
        model, pp = load_rpsp_policy(args, model_exp, **fkwargs)
    print('done building updater')
    print('len:', args.len, 'num trajs:', args.numtrajs, 'iter:', args.iter)
    state_shape = (1, model._state_dim)
    num_actions = 64
    batch_size = 8
    q_learner = dqn.Agent(state_shape, num_actions, batch_size=batch_size)
    best_mean_rewards = -100
    best_rewards = -100
    MAX_EPISODES = 8000
    MAX_STEPS = 50
    mct = MCTS(model)
    episode_history = deque(maxlen=25)
    for i in range(MAX_EPISODES):

        # initialize
        action = np.zeros(2)
        _act = np.zeros(2)
        o = env.reset()
        obs = model._process_obs(o)
        init_q = model.initial_state
        a = np.zeros(2)
        act = model._process_act(a)
        state = mct.update_state(init_q, obs, act)
        total_rewards = 0

        for t in range(MAX_STEPS):
            #env.render()
            a = q_learner.choose_action(state)
            _act[0] = int('{:0>6b}'.format(a)[0:3], 2)
            _act[1] = int('{:0>6b}'.format(a)[3:6], 2)
            for n in range(2):
                action[n] = 1.4 - 0.2 * _act[n]
            action = np.array([action[0], action[1]])
            next_obs, reward, done = env.step(action)
            if t == 48:
                done = True
            t_next_obs = mct.model._process_obs(next_obs)
            t_act = mct.model._process_act(action)
            total_rewards += reward
            next_state = mct.update_state(state, t_next_obs, t_act)
            q_learner.update_buffer(state, a, reward, next_state, done)
            
            # Only start learning after buffer has some experience in it
            if i > 50:
                q_learner.update_policy()


            state = next_state
            if done:
                break
        episode_history.append(total_rewards)
        mean_rewards = np.mean(episode_history)
        print("Episode {}".format(i))
        print("Finished after {} timesteps".format(t+1))
        print("Reward for this episode: {}".format(total_rewards))
        print("Average reward for last 100 episodes: {:.2f}".format(mean_rewards))

        if mean_rewards >= best_mean_rewards:
            best_mean_rewards = mean_rewards
        if total_rewards >= best_rewards:
            best_rewards = total_rewards
        # 'file' is assumed to be an open results-file handle in the enclosing scope
        print(mean_rewards, file=file)
    print("best reward", best_rewards)
    print("best_mean_reward", best_mean_rewards)
Example #7
def run_policy_continuous(args, flname):
    """
    Train a continuous RPSPnet from commandline arguments
    @param args: command line args
    @param flname: filename to store results
    @return: logger results to save
    """
    args.flname = flname
    env = load_environment(args)
    (x_dim, a_dim) = env.dimensions
    args.a_dim = a_dim
    args.x_dim = x_dim
    model_exp = ObservableModel(x_dim)
    pi_exp = policies.RandomGaussianPolicy(x_dim, rng=args.rng)
    baseline = args.b
    min_traj_length = getattr(args, 'mintrajlen', args.past + args.fut + 2)
    PiUpdater = None
    fkwargs = {
        'baseline': baseline,
        'lr': args.lr,
        'beta_reinf': args.wrwd,
        'beta_pred': args.wpred,
        'beta_pred_decay': args.wdecay,
        'beta_only_reinf': args.wrwd_only,
        'gamma': args.gamma,
        'grad_step': args.grad_step,
        'trpo_step': args.trpo_step,
        'past': args.past,
        'fut': args.fut,
        'cg_opt': args.cg_opt,
        'max_traj_length': args.len,
        'num_trajs': args.numtrajs,
        'normalize_grad': args.norm_g,
        'hvec': args.hvec,
        'env': env,
        'min_traj_len': min_traj_length
    }
    print('build updater ... ', args.method)

    #run the observable model with reactive policy
    if args.method == 'obsVR':
        model, PiUpdater, pp = load_observable_policy(args, model_exp,
                                                      **fkwargs)
    elif args.method == 'arVR':
        model, PiUpdater, pp = load_finite_mem_policy(args, model_exp,
                                                      **fkwargs)
    else:
        #run the psr network with obs model or psr model
        model, PiUpdater, pp = load_rpsp_policy(args, model_exp, **fkwargs)
    print('done building updater')
    print('len:', args.len, 'num trajs:', args.numtrajs, 'iter:', args.iter)

    def run_experiment():
        if args.loadfile != '':
            PiUpdater._load(args.params)
        elif args.load_reactive != '':
            re_params = load_params(args.load_reactive)
            try:
                PiUpdater._policy._policy._load(re_params)
            except AttributeError:
                pass

        learn_policy(PiUpdater,
                     model,
                     env,
                     min_traj_length=0,
                     max_traj_len=args.len,
                     num_trajs=args.numtrajs,
                     num_samples=args.numsamples,
                     num_iter=args.iter,
                     logger=pp.logger)

    try:
        run_experiment()
    except AssertionError as exc:
        print('WARNING: Got AssertionError !')
        print('Message: %s' % exc)
        print('Stacktrace:')
        traceback.print_exc()
        return None
    pp._results['params'] = PiUpdater._save()
    if args.addobs or args.method == 'arVR':
        try:
            re_params = PiUpdater._policy._policy._save()
        except AttributeError:
            re_params = PiUpdater._policy._save()
        save_params(re_params, 're_pi_{}.pkl'.format(args.seed), args.tfile)
    env.close()
    return pp._results
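The reactive-policy parameters are saved and restored through a try/except AttributeError fallback because the policy may or may not wrap another policy one level deeper. A standalone sketch of that pattern with placeholder classes (not the repo's real policy types):

class ReactivePolicy:
    def _save(self):
        return {'W': [0.0]}

class NetworkPolicy:
    """Wraps a reactive policy one level deeper (placeholder for the PSR-network case)."""
    def __init__(self):
        self._policy = ReactivePolicy()

class Updater:
    def __init__(self, policy):
        self._policy = policy

for updater in (Updater(NetworkPolicy()), Updater(ReactivePolicy())):
    try:
        re_params = updater._policy._policy._save()   # deep path (wrapped policy)
    except AttributeError:
        re_params = updater._policy._save()           # flat path (reactive policy)
    print(re_params)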