Example #1
    def run(self, algo, T, **kwargs):
        visualiser = GraphViz(description='tmp')
        params, pomdp = self.params, None
        total_rewards, budget = 0, params.budget

        log.info('~~~ initialising ~~~')
        with PomdpParser(params.env_config) as ctx:
            # creates model and solver
            model = self.create_model(ctx.copy_env())
            pomdp = self.create_solver(algo, model)

            # supply additional algo params
            belief = ctx.random_beliefs(
            ) if params.random_prior else ctx.generate_beliefs()

            if algo == 'pbvi':
                belief_points = ctx.generate_belief_points(kwargs['stepsize'])
                pomdp.add_configs(belief_points)
            elif algo == 'pomcp':
                pomdp.add_configs(budget, belief, **kwargs)

        # have fun!
        log.info('''
        ++++++++++++++++++++++
            Starting State:  {}
            Starting Budget:  {}
            Time Horizon: {}
            Max Play: {}
        ++++++++++++++++++++++'''.format(model.curr_state, budget, T,
                                         params.max_play))

        for i in range(params.max_play):
            # plan, take action and receive environment feedbacks
            pomdp.solve(T)
            action = pomdp.get_action(belief)
            new_state, obs, reward, cost = pomdp.take_action(action)

            if params.snapshot and isinstance(pomdp, POMCP):
                # takes snapshot of belief tree before it gets updated
                self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

            # update states
            belief = pomdp.update_belief(belief, action, obs)
            total_rewards += reward
            budget -= cost

            # print info
            log.info('\n'.join([
                'Taking action: {}'.format(action),
                'Observation: {}'.format(obs),
                'Reward: {}'.format(reward),
                'Budget: {}'.format(budget),
                'New state: {}'.format(new_state),
                ## 'New Belief: {}'.format(belief),
                '=' * 20
            ]))

            if budget <= 0:
                log.info('Budget spent.')

            if action == 'Catch' and ('tagged' in new_state):
                break

            input("Pulsa intro para ejecutar el siguiente paso del algoritmo")

        log.info('{} games played. Total reward = {}'.format(
            i + 1, total_rewards))
        return pomdp
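The run method above delegates the filtering step to pomdp.update_belief(belief, action, obs). The sketch below is the standard discrete Bayes update that such a call computes, not the library's own implementation; the transition and observation arrays are purely illustrative.

import numpy as np

def update_belief(belief, T_a, O_ao):
    # belief: (S,) prior over states
    # T_a:    (S, S) transition matrix for the chosen action, T_a[s, s2] = P(s2 | s, a)
    # O_ao:   (S,) observation likelihood of the received obs, O_ao[s2] = P(o | s2, a)
    predicted = belief @ T_a             # predict step: P(s2 | a, b)
    posterior = O_ao * predicted         # correct step: weight by observation likelihood
    return posterior / posterior.sum()   # normalise

# tiny tiger-style check: two states, 'listen' keeps the state, the observation is 85% accurate
b = np.array([0.5, 0.5])
T_listen = np.eye(2)
O_hear_left = np.array([0.85, 0.15])
print(update_belief(b, T_listen, O_hear_left))   # -> [0.85 0.15]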
Example #2
    def run(self, modo, problema, algo, T, **kwargs):
        steps = np.array([], dtype=float)    # numpy (imported as np); the math module has no array/append
        rewards = np.array([], dtype=float)
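        # "Benchmark" mode runs 30 episodes; any other mode runs a single episode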
        if modo == "Benchmark":
            c = 0
        else:
            c = 29
        while c < 30:
            c += 1
            if modo == "Benchmark":
                log.info("===================== Ejecucion " + str(c) +
                         '=====================')
            visualiser = GraphViz(description='tmp')
            params, pomdp = self.params, None
            total_rewards, budget = 0, params.budget

            with PomdpParser(params.env_config) as ctx:
                model = self.create_model(ctx.copy_env())
                pomdp = self.create_solver(algo, model)

                belief = ctx.random_beliefs(
                ) if params.random_prior else ctx.generate_beliefs()

                if algo == 'pbvi':
                    belief_points = ctx.generate_belief_points(
                        kwargs['stepsize'])
                    pomdp.add_configs(belief_points)
                elif algo == 'pomcp':
                    pomdp.add_configs(budget, belief, **kwargs)

            if modo != "Benchmark":
                log.info('''
                ++++++++++++++++++++++
                    Starting state:  {}
                    Starting budget:  {}
                    Belief: {}
                    Time horizon: {}
                    Max number of plays: {}
                ++++++++++++++++++++++'''.format(model.curr_state, budget,
                                                 belief, T, params.max_play))
            condicion_parada = False
            i = 0
            while not condicion_parada and params.max_play > i:
                i += 1
                pomdp.solve(T, modo)
                action = pomdp.get_action(belief)
                new_state, obs, reward, cost = pomdp.take_action(action)

                # problem-specific terminal actions end the episode
                if problema == "Tigre":
                    condicion_parada = action in ("open-left", "open-right")
                elif problema == "LaserTag":
                    condicion_parada = action == "Catch"
                elif problema == "Recipientes":
                    condicion_parada = action in ("bebe-izq", "bebe-med", "bebe-der")

                if params.snapshot and isinstance(pomdp, POMCP):
                    self.snapshot_tree(visualiser, pomdp.tree,
                                       '{}.gv'.format(i))

                belief = pomdp.update_belief(belief, action, obs)
                total_rewards += reward
                budget -= cost

                if modo == "Interactivo":
                    log.info('\n'.join([
                        'Accion tomada: {}'.format(action),
                        'Observacion: {}'.format(obs),
                        'Recompensa: {}'.format(reward),
                        'Presupuesto: {}'.format(budget),
                        'Nuevo estado: {}'.format(new_state),
                        'Nueva creencia: {}'.format(belief),
                        'Paso numero: {}'.format(i), '=' * 20
                    ]))

                if budget <= 0:
                    log.info('The configured budget has been exceeded.')
                if params.max_play != 'inf' and params.max_play <= i:
                    log.info(
                        'The configured maximum number of steps has been exceeded.'
                    )

            log.info('{} steps executed. Total accumulated reward = {}\n'.format(
                i, total_rewards))
            steps = np.append(steps, i)
            rewards = np.append(rewards, total_rewards)

        if modo == "Benchmark":
            mean_steps = steps.mean()
            std_steps = steps.std()
            mean_rewards = rewards.mean()
            std_rewards = rewards.std()
            print(
                "#########################################################################################"
            )
            print("#    RESULTADOS DEL BENCHMARK:")
            print("#    Valor medio pasos: ", mean_steps)
            print("#    Desviacion tipica pasos: ", std_steps)
            print("#    Valor medio recompensas: ", mean_rewards)
            print("#    Desviacion tipica recompensas: ", std_rewards)
            print(
                "#########################################################################################"
            )

        return pomdp
Example #3
    def replay(self, algo, T, **kwargs):
        visualiser = GraphViz(description='tmp')
        params, pomdp = self.params, None
        total_rewards, budget = 0, params.budget

        log.info('~~~ initialising experience replay ~~~')
        # replay 4 experiences from the experience file
        with PomdpParser(params.env_config) as ctx:

            for simulation in range(4):
                log.info('~~~ initialising simulation: ' + str(simulation) +
                         '~~~')

                # creates model and solver
                model = self.create_model(ctx.copy_env())
                pomdp = self.create_solver(algo, model)

                # supply additional algo params
                belief = ctx.random_beliefs(
                ) if params.random_prior else ctx.generate_beliefs()

                if algo == 'pbvi':
                    # loading alpha-vector policy file
                    # belief_points = pomdp.generate_reachable_belief_points(belief, 50)
                    # pomdp.add_configs(belief_points)
                    pomdp.charging_policy(params.policyfile)
                    # pomdp.solve(T)

                elif algo == 'pomcp':
                    pomdp.add_configs(budget, belief, **kwargs)

                total_rewards = 0
                # have fun!
                log.info('''
                ++++++++++++++++++++++
                Init Belief: {}
                Max Play: {}
                ++++++++++++++++++++++'''.format(belief, params.max_play))

                obs, plotting = None, None  # defined before their first possible use inside the loop
                for i in range(params.max_play):
                    # plan, take action and receive environment feedbacks
                    if algo == 'pomcp':
                        pomdp.solve(T)
                    # take action
                    action = pomdp.get_action(belief)
                    # new_state, obs, reward, cost = pomdp.take_action(action)
                    # getting exp action
                    exp_action = self.getting_mode_from_expfile(
                        i, simulation, pomdp)

                    if exp_action == -1:
                        log.info('\n'.join(
                            ['Observation: {}'.format(obs), 'Mission ended']))
                        if plotting is not None:
                            plotting.destroy()
                        break

                    if params.snapshot and isinstance(pomdp, POMCP):
                        # takes snapshot of belief tree before it gets updated
                        self.snapshot_tree(visualiser, pomdp.tree,
                                           '{}.gv'.format(i))

                    if i == 0:
                        plotting = AnimateBeliefPlot(belief, action,
                                                     exp_action)
                    else:
                        plotting.update(belief, action, exp_action, obs)

                    # getting features to play symbolic observation
                    features = self.getting_features_from_expfile(
                        i, simulation, pomdp)
                    #print(features)
                    label = self.classif_model.predict(features)
                    #print(label)
                    # transforming label in pomdp observation
                    available_observations = pomdp.model.observations
                    obs = available_observations[int(label[0])]

                    belief = pomdp.update_belief(belief, exp_action, obs)

                    # print log info
                    log.info('\n'.join([
                        'Observation: {}'.format(obs),
                        'POMDP would take action: {}'.format(action),
                        'action taken during EXPERIMENT: {}'.format(
                            exp_action), 'New Belief: {}'.format(belief),
                        '=' * 20
                    ]))

        return pomdp
Example #4
    def run(self, algo, T, **kwargs):
        visualiser = GraphViz(description='tmp')
        params, pomdp = self.params, None
        total_rewards, budget = 0, params.budget
        environment = params.env_config
        benchmark = params.benchmark

        log.info('~~~ initialising ~~~')
        with PomdpParser(params.env_config) as ctx:
            # creates model and solver
            model = self.create_model(ctx.copy_env())
            pomdp = self.create_solver(algo, model)

            # supply additional algo params
            belief = ctx.random_beliefs(
            ) if params.random_prior else ctx.generate_beliefs()

            if algo == 'pbvi':
                belief_points = ctx.generate_belief_points(kwargs['stepsize'])
                pomdp.add_configs(belief_points)
            elif algo == 'pomcp':
                pomdp.add_configs(budget, belief, **kwargs)

        # have fun!
        log.info('''
        ++++++++++++++++++++++
            Starting State:  {}
            Starting Budget:  {}
            Init Belief: {}
            Time Horizon: {}
            Max Play: {}
        ++++++++++++++++++++++'''.format(model.curr_state, budget, belief, T,
                                         params.max_play))

        for i in range(params.max_play):
            # plan, take action and receive environment feedbacks
            pomdp.solve(T)
            action = pomdp.get_action(belief)
            new_state, obs, reward, cost = pomdp.take_action(action)

            if params.snapshot and isinstance(pomdp, POMCP):
                # takes snapshot of belief tree before it gets updated
                self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

            # update states
            belief = pomdp.update_belief(belief, action, obs)
            total_rewards += reward
            budget -= cost

            # Printing the details for every step of the interactive simulation
            # log.info('\n'.join([
            #     'Taking action: {}'.format(action),
            #     'Observation: {}'.format(obs),
            #     'Reward: {}'.format(reward),
            #     'Budget: {}'.format(budget),
            #     'New state: {}'.format(new_state),
            #     'New Belief: {}'.format(belief),
            #     '=' * 20
            # ]))

            # Tiger problem ----------------------------------------------------------------
            # When an open action is selected the tiger problem ends (the person either escapes or is eaten by the tiger), so the loop has to stop.
            if "Tiger-2D.POMDP" in environment:
                if "open" in action:
                    log.info('\n'.join([
                        'Taking action: {}'.format(action),
                        'Observation: {}'.format(obs),
                        'Reward: {}'.format(reward), '=' * 20
                    ]))
                    break
                log.info('\n'.join([
                    'Taking action: {}'.format(action),
                    'Observation: {}'.format(obs), 'Reward: {}'.format(reward),
                    'New Belief: {}'.format(belief), '=' * 20
                ]))
            # Web ads problem ----------------------------------------------------------------
            # When an adv action is selected the web ads problem ends (the person gets either a tie or a skate advertisement), so the loop has to stop.
            if "Web.POMDP" in environment:
                if params.benchmark == 0:
                    if "adv" in action:
                        log.info('\n'.join([
                            'Taking action: {}'.format(action),
                            'Observation: {}'.format(obs),
                            'Reward: {}'.format(reward), '=' * 20
                        ]))
                        break
                    log.info('\n'.join([
                        'Taking action: {}'.format(action),
                        'Observation: {}'.format(obs),
                        'Reward: {}'.format(reward),
                        'New Belief: {}'.format(belief), '=' * 20
                    ]))

            # Landing problem ----------------------------------------------------------------
            # When the arrive action is selected the landing problem ends (the crew either finds a treasure or dies horribly to the creatures at the landing site), so the loop has to stop.
            if "Landing.POMDP" in environment:
                if "arrive" in action:
                    log.info('\n'.join([
                        'Taking action: {}'.format(action),
                        'Observation: {}'.format(obs),
                        'Reward: {}'.format(reward), '=' * 20
                    ]))
                    break
                log.info('\n'.join([
                    'Taking action: {}'.format(action),
                    'Observation: {}'.format(obs), 'Reward: {}'.format(reward),
                    'New Belief: {}'.format(belief), '=' * 20
                ]))

            # Tag problem ----------------------------------------------------------------
            # When the state is tagged, robot s has caught robot t and the tag problem ends, so the loop has to stop.
            if "Tag.POMDP" in environment:
                if params.benchmark == 0:
                    if "tagged" in model.curr_state:
                        log.info('\n'.join([
                            'Taking action: {}'.format(action),
                            'Observation: {}'.format(obs),
                            'Reward: {}'.format(reward), '=' * 20
                        ]))
                        break
                    log.info('\n'.join([
                        'Taking action: {}'.format(action),
                        'Observation: {}'.format(obs),
                        'Reward: {}'.format(reward),
                        'New state: {}'.format(new_state),
                        #'New Belief: {}'.format(belief),
                        '=' * 20
                    ]))

        # Printing the total steps and reward when the loop ends.
        if params.benchmark == 0:
            log.info(
                'Simulation ended after {} steps. Total reward = {}'.format(
                    i + 1, total_rewards))

        return pomdp
Example #5
    def runBench(self, algo, T, **kwargs):
        visualiser = GraphViz(description='tmp')
        params, pomdp = self.params, None
        total_rewards, budget = 0, params.budget
        environment = params.env_config
        benchmark = params.benchmark

        log.info('~~~ Initialising simulation ~~~')
        with PomdpParser(params.env_config) as ctx:
            # creates model and solver
            model = self.create_model(ctx.copy_env())
            pomdp = self.create_solver(algo, model)

            # supply additional algo params
            belief = ctx.random_beliefs(
            ) if params.random_prior else ctx.generate_beliefs()

            if algo == 'pbvi':
                belief_points = ctx.generate_belief_points(kwargs['stepsize'])
                pomdp.add_configs(belief_points)
            elif algo == 'pomcp':
                pomdp.add_configs(budget, belief, **kwargs)

        # have fun!
        log.info('''
           ++++++++++++++++++++++
               Starting State:  {}
               Starting Budget:  {}
               Init Belief: {}
               Time Horizon: {}
               Max Play: {}
           ++++++++++++++++++++++'''.format(model.curr_state, budget, belief,
                                            T, params.max_play))

        for i in range(params.max_play):
            # plan, take action and receive environment feedbacks
            pomdp.solve(T)
            action = pomdp.get_action(belief)
            new_state, obs, reward, cost = pomdp.take_action(action)

            if params.snapshot and isinstance(pomdp, POMCP):
                # takes snapshot of belief tree before it gets updated
                self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

            # update states
            belief = pomdp.update_belief(belief, action, obs)
            total_rewards += reward
            budget -= cost

            # Compute the final results when a problem reaches a terminal condition
            if "open" in action or "tagged" in model.curr_state or "adv" in action or "arrive" in action:
                log.info('Ended simulation after {} steps. Total reward = {}'.
                         format(i + 1, total_rewards))
                self.step_list.append(i + 1)
                self.fReward_list.append(total_rewards)
                self.steps += i + 1
                self.fReward += total_rewards

                break

            # Printing the details for every step of the interactive simulation
            # log.info('\n'.join([
            #   'Taking action: {}'.format(action),
            #   'Observation: {}'.format(obs),
            #   'Reward: {}'.format(reward),
            #   'Budget: {}'.format(budget),
            #   'New state: {}'.format(new_state),
            #   'New Belief: {}'.format(belief),
            #   '=' * 20
            # ]))

            if budget <= 0:
                log.info('Budget spent.')

        return pomdp
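Example #5 only appends to self.step_list and self.fReward_list; how they are summarised is not shown. A small stand-alone sketch, under the assumption that the aggregates are reported with mean and standard deviation the same way Example #6 reports rewards (the function name below is made up):

import numpy as np

def summarise_bench(step_list, freward_list):
    # step_list / fReward_list are filled by runBench above, one entry per episode
    print('mean steps  = {:.2f} (std {:.2f})'.format(np.mean(step_list), np.std(step_list)))
    print('mean reward = {:.2f} (std {:.2f})'.format(np.mean(freward_list), np.std(freward_list)))

summarise_bench([9, 11, 10], [-4.0, 3.5, 1.0])   # illustrative numbers only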
Example #6
    def replay(self, algo, T, **kwargs):
        visualiser = GraphViz(description='tmp')
        params, pomdp = self.params, None
        total_rewards, budget = 0, params.budget

        log.info('~~~ initialising experience replay ~~~')
        with PomdpParser(params.env_config) as ctx:
            total_rewards_simulations = []
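            # run params.sim independent replay episodes and collect the return of each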
            for simulation in range(params.sim):
                log.info('~~~ initialising simulation: ' + str(simulation) +
                         '~~~')

                # creates model and solver
                model = self.create_model(ctx.copy_env())
                pomdp = self.create_solver(algo, model)

                # supply additional algo params
                belief = ctx.random_beliefs(
                ) if params.random_prior else ctx.generate_beliefs()

                if algo == 'pbvi':
                    # loading alpha-vector policy file
                    # belief_points = pomdp.generate_reachable_belief_points(belief, 50)
                    # pomdp.add_configs(belief_points)
                    pomdp.charging_policy(params.policyfile)
                    # pomdp.solve(T)

                elif algo == 'pomcp':
                    pomdp.add_configs(budget, belief, **kwargs)

                total_rewards = 0
                # have fun!
                log.info('''
                ++++++++++++++++++++++
                Starting State:  {}
                Init Belief: {}
                Max Play: {}
                ++++++++++++++++++++++'''.format(model.curr_state, belief,
                                                 params.max_play))

                for i in range(params.max_play):
                    # plan, take action and receive environment feedbacks
                    if algo == 'pomcp':
                        pomdp.solve(T)
                    action = pomdp.get_action(belief)
                    new_state, obs, reward, cost = pomdp.take_action(action)

                    if params.snapshot and isinstance(pomdp, POMCP):
                        # takes snapshot of belief tree before it gets updated
                        self.snapshot_tree(visualiser, pomdp.tree,
                                           '{}.gv'.format(i))

                    # update states
                    belief = pomdp.update_belief(belief, action, obs)
                    total_rewards += reward
                    budget -= cost

                    # print info
                    log.info('\n'.join([
                        'Taking action: {}'.format(action),
                        'Observation: {}'.format(obs),
                        'Reward: {}'.format(reward),
                        'Budget: {}'.format(budget),
                        'New state: {}'.format(new_state),
                        'New Belief: {}'.format(belief), '=' * 20
                    ]))

                    if budget <= 0:
                        log.info('Budget spent.')
                log.info('{} games played. Total reward = {}'.format(
                    i + 1, total_rewards))
                total_rewards_simulations.append(total_rewards)

            exp_total_reward = np.mean(total_rewards_simulations)
            std_exp_total_reward = np.std(total_rewards_simulations)
            print(params.sim, 'simulations played.')
            print('Exp total reward = ', exp_total_reward)
            print('Std Exp total reward = ', std_exp_total_reward)
            log.info('{} simulations played. Exp total reward = {}'.format(
                params.sim, exp_total_reward))
            log.info('Total rewards observed = {}'.format(
                total_rewards_simulations))
        return pomdp