Code Example #1
    def example_to_str(self, example, controller, rewards):
        verbose_str = []
        from core.price_tracker import PriceScaler
        # One line per agent: its role and acceptable price range from its KB.
        for session_id, session in enumerate(controller.sessions):
            bottom, top = PriceScaler.get_price_range(session.kb)
            s = 'Agent[{}: {}], bottom ${}, top ${}'.format(
                session_id, session.kb.role, bottom, top)
            verbose_str.append(s)

        # The dialogue transcript, one utterance per line.
        strs = example.to_text()
        for line in strs:
            verbose_str.append(line)
        # Final reward of each agent.
        s = "reward: [0]{}\nreward: [1]{}".format(rewards[0], rewards[1])
        verbose_str.append(s)
        return verbose_str
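
For reference, a minimal self-contained sketch of the same formatting pattern, with made-up roles, price ranges, transcript lines, and rewards standing in for the cocoa Controller/Session/Example objects; it only reproduces the structure of the list this method returns:

# Mock data; none of these values come from the project.
mock_agents = [('seller', 10, 15), ('buyer', 8, 12)]   # (role, bottom, top)
mock_transcript = ['seller: asking $15', 'buyer: $11?', 'seller: deal']
mock_rewards = [0.6, 0.4]

verbose_str = []
for session_id, (role, bottom, top) in enumerate(mock_agents):
    verbose_str.append('Agent[{}: {}], bottom ${}, top ${}'.format(
        session_id, role, bottom, top))
verbose_str.extend(mock_transcript)
verbose_str.append('reward: [0]{}\nreward: [1]{}'.format(
    mock_rewards[0], mock_rewards[1]))
print('\n'.join(verbose_str))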
Code Example #2
File: a2c_trainer.py    Project: princeton-nlp/cocoa
    def example_to_str(self, example, controller, rewards, sid=None):
        verbose_str = []
        from core.price_tracker import PriceScaler
        if sid is not None:
            verbose_str.append('[Scenario id: {}]'.format(sid))
        # One line per agent: its role and acceptable price range from its KB.
        for session_id, session in enumerate(controller.sessions):
            bottom, top = PriceScaler.get_price_range(session.kb)
            s = 'Agent[{}: {}], bottom ${}, top ${}'.format(
                session_id, session.kb.role, bottom, top)
            verbose_str.append(s)
        # `session` here is the last session from the loop above; its KB carries the item category.
        verbose_str.append("They are negotiating for " + session.kb.facts['item']['Category'])

        # The dialogue transcript, one utterance per line.
        strs = self.example_to_text(example)
        for line in strs:
            verbose_str.append(line)
        # Final reward of each agent.
        s = "reward: [0]{}\nreward: [1]{}".format(rewards[0], rewards[1])
        verbose_str.append(s)
        return verbose_str
Code Example #3
File: rl_trainer.py    Project: TimerChen/cocoa
    def learn(self, args):
        if args.model_type == 'reinforce':
            train_policy = True
            train_critic = False
        elif args.model_type == 'critic':
            train_policy = False
            train_critic = True
        elif args.model_type == 'tom':
            train_policy = False
            train_critic = False
        else:
            raise ValueError('Unknown model_type: {}'.format(args.model_type))

        rewards = [None] * 2
        s_rewards = [None] * 2

        critic_report_stats = RLStatistics()
        critic_stats = RLStatistics()
        last_time = time.time()

        tensorboard_every = 1
        history_train_losses = [[], []]

        for i in range(args.num_dialogues):
            # Rollout
            scenario = self._get_scenario()
            controller = self._get_controller(scenario, split='train')
            # print('set controller for{} {}.'.format(self.training_agent, controller))
            controller.sessions[0].set_controller(controller)
            controller.sessions[1].set_controller(controller)
            example = controller.simulate(args.max_turns, verbose=args.verbose)

            for session_id, session in enumerate(controller.sessions):
                # if args.only_run != True and session_id != self.training_agent:
                #     continue

                # Compute reward
                reward = self.get_reward(example, session)
                # Standardize the reward
                all_rewards = self.all_rewards[session_id]
                all_rewards.append(reward)
                s_reward = (reward - np.mean(all_rewards)) / max(
                    1e-4, np.std(all_rewards))

                rewards[session_id] = reward
                s_rewards[session_id] = s_reward

            for session_id, session in enumerate(controller.sessions):
                # Only train one agent
                if session_id != self.training_agent:
                    continue

                batch_iter = session.iter_batches()
                T = next(batch_iter)

                if train_policy:
                    # Use this agent's own reward; the bare `reward` left over
                    # from the loop above would be the last session's value.
                    loss = self.update(batch_iter,
                                       rewards[session_id],
                                       self.model,
                                       discount=args.discount_factor)
                    history_train_losses[session_id].append(loss)

                if train_critic:
                    stats = self.update_critic(batch_iter,
                                               rewards[session_id],
                                               self.critic,
                                               discount=args.discount_factor)
                    critic_report_stats.update(stats)
                    critic_stats.update(stats)

            # print('verbose: ', args.verbose)

            if args.verbose:
                if train_policy or args.model_type == 'tom':
                    from core.price_tracker import PriceScaler
                    for session_id, session in enumerate(controller.sessions):
                        bottom, top = PriceScaler.get_price_range(session.kb)
                        print('Agent[{}: {}], bottom ${}, top ${}'.format(
                            session_id, session.kb.role, bottom, top))

                    strs = example.to_text()
                    for line in strs:
                        print(line)
                    print("reward: [0]{}\nreward: [1]{}".format(
                        self.all_rewards[0][-1], self.all_rewards[1][-1]))
                    # print("Standard reward: [0]{} [1]{}".format(s_rewards[0], s_rewards[1]))

            # Save logs on tensorboard
            if (i + 1) % tensorboard_every == 0:
                for j in range(2):
                    self.writer.add_scalar(
                        'agent{}/reward'.format(j),
                        np.mean(self.all_rewards[j][-tensorboard_every:]), i)
                    if len(history_train_losses[j]) >= tensorboard_every:
                        tmp = np.concatenate(
                            history_train_losses[j][-tensorboard_every:],
                            axis=0)
                        tmp = np.mean(tmp, axis=0)
                        self.writer.add_scalar('agent{}/total_loss'.format(j),
                                               tmp[0], i)
                        self.writer.add_scalar('agent{}/logp_loss'.format(j),
                                               tmp[1], i)
                        self.writer.add_scalar('agent{}/intent_loss'.format(j),
                                               tmp[2], i)
                        self.writer.add_scalar('agent{}/price_loss'.format(j),
                                               tmp[3], i)

            if ((i + 1) % args.report_every) == 0:
                import seaborn as sns
                import matplotlib.pyplot as plt
                if args.histogram:
                    sns.set_style('darkgrid')

                if train_policy:
                    for j in range(2):
                        print('agent={}'.format(j), end=' ')
                        print('step:', i, end=' ')
                        print('reward:', rewards[j], end=' ')
                        print('scaled reward:', s_rewards[j], end=' ')
                        print('mean reward:', np.mean(self.all_rewards[j]))
                        if args.histogram:
                            self.agents[j].env.dialogue_generator.get_policyHistogram()

                if train_critic:
                    critic_report_stats.output(i + 1, 0, 0, last_time)
                    critic_report_stats = RLStatistics()

                print('-' * 10)
                if args.histogram:
                    plt.show()

                last_time = time.time()

            # Save model
            if (i > 0 and i % 100 == 0) and not args.only_run:
                if train_policy:
                    valid_stats = self.validate(args)
                    self.drop_checkpoint(
                        args, i, valid_stats,
                        model_opt=self.agents[self.training_agent].env.model_args)
                    self.update_opponent('policy')

                elif train_critic:
                    # TODO: reverse!
                    self.drop_checkpoint(
                        args, i, critic_stats,
                        model_opt=self.agents[self.training_agent].env.model_args)
                    critic_stats = RLStatistics()
                else:
                    valid_stats = self.validate(args)
                    print('valid result: ', valid_stats.str_loss())
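
For illustration, a hedged sketch of how learn() might be driven. The attribute names below are exactly the ones the method reads; the values, and the trainer object itself, are assumptions and not taken from the project.

from argparse import Namespace

# Hypothetical arguments for learn(); only the attribute names are taken
# from the method above, the values are illustrative.
args = Namespace(
    model_type='reinforce',   # 'reinforce', 'critic', or 'tom'
    num_dialogues=5000,       # number of self-play rollouts
    max_turns=20,             # turn limit per simulated dialogue
    verbose=False,            # print transcripts and rewards during training
    discount_factor=0.95,     # discount passed to the policy/critic updates
    report_every=100,         # console/plot reporting interval
    histogram=False,          # draw policy histograms when reporting
    only_run=False,           # if True, skip checkpointing
)

# trainer = ...              # a trainer instance exposing learn(); construction not shown here
# trainer.learn(args)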