def example_to_str(self, example, controller, rewards):
    verbose_str = []
    from core.price_tracker import PriceScaler
    for session_id, session in enumerate(controller.sessions):
        bottom, top = PriceScaler.get_price_range(session.kb)
        s = 'Agent[{}: {}], bottom ${}, top ${}'.format(
            session_id, session.kb.role, bottom, top)
        verbose_str.append(s)
    strs = example.to_text()
    for line in strs:
        verbose_str.append(line)
    s = "reward: [0]{}\nreward: [1]{}".format(rewards[0], rewards[1])
    verbose_str.append(s)
    return verbose_str
def example_to_str(self, example, controller, rewards, sid=None):
    verbose_str = []
    from core.price_tracker import PriceScaler
    if sid is not None:
        verbose_str.append('[Scenario id: {}]'.format(sid))
    for session_id, session in enumerate(controller.sessions):
        bottom, top = PriceScaler.get_price_range(session.kb)
        s = 'Agent[{}: {}], bottom ${}, top ${}'.format(
            session_id, session.kb.role, bottom, top)
        verbose_str.append(s)
        verbose_str.append("They are negotiating for " + session.kb.facts['item']['Category'])
    strs = self.example_to_text(example)
    for line in strs:
        verbose_str.append(line)
    s = "reward: [0]{}\nreward: [1]{}".format(rewards[0], rewards[1])
    verbose_str.append(s)
    return verbose_str
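# Usage sketch (an assumption, not part of the original code): example_to_str only
# builds a list of strings, so a caller is expected to join and emit them itself,
# e.g. when logging a finished rollout. `trainer`, `example`, `controller`,
# `rewards`, and `scenario_id` below stand in for whatever objects the surrounding
# training loop actually holds.
#
#   lines = trainer.example_to_str(example, controller, rewards, sid=scenario_id)
#   print('\n'.join(lines))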
def learn(self, args):
    if args.model_type == 'reinforce':
        train_policy = True
        train_critic = False
    elif args.model_type == 'critic':
        train_policy = False
        train_critic = True
    elif args.model_type == 'tom':
        train_policy = False
        train_critic = False
    else:
        raise ValueError('unknown model_type: {}'.format(args.model_type))

    rewards = [None] * 2
    s_rewards = [None] * 2
    critic_report_stats = RLStatistics()
    critic_stats = RLStatistics()
    last_time = time.time()
    tensorboard_every = 1
    history_train_losses = [[], []]

    for i in range(args.num_dialogues):
        # Rollout
        scenario = self._get_scenario()
        controller = self._get_controller(scenario, split='train')
        controller.sessions[0].set_controller(controller)
        controller.sessions[1].set_controller(controller)
        example = controller.simulate(args.max_turns, verbose=args.verbose)

        for session_id, session in enumerate(controller.sessions):
            # Compute the reward for each agent
            reward = self.get_reward(example, session)
            # Standardize the reward against this agent's reward history
            all_rewards = self.all_rewards[session_id]
            all_rewards.append(reward)
            s_reward = (reward - np.mean(all_rewards)) / max(1e-4, np.std(all_rewards))
            rewards[session_id] = reward
            s_rewards[session_id] = s_reward

        for session_id, session in enumerate(controller.sessions):
            # Only train one agent
            if session_id != self.training_agent:
                continue
            # Update with this agent's own reward
            reward = rewards[session_id]
            batch_iter = session.iter_batches()
            T = next(batch_iter)
            if train_policy:
                loss = self.update(batch_iter, reward, self.model,
                                   discount=args.discount_factor)
                history_train_losses[session_id].append(loss)
            if train_critic:
                stats = self.update_critic(batch_iter, reward, self.critic,
                                           discount=args.discount_factor)
                critic_report_stats.update(stats)
                critic_stats.update(stats)

        if args.verbose:
            if train_policy or args.model_type == 'tom':
                from core.price_tracker import PriceScaler
                for session_id, session in enumerate(controller.sessions):
                    bottom, top = PriceScaler.get_price_range(session.kb)
                    print('Agent[{}: {}], bottom ${}, top ${}'.format(
                        session_id, session.kb.role, bottom, top))
                for line in example.to_text():
                    print(line)
                print("reward: [0]{}\nreward: [1]{}".format(
                    self.all_rewards[0][-1], self.all_rewards[1][-1]))

        # Save logs on tensorboard
        if (i + 1) % tensorboard_every == 0:
            for j in range(2):
                self.writer.add_scalar(
                    'agent{}/reward'.format(j),
                    np.mean(self.all_rewards[j][-tensorboard_every:]), i)
                if len(history_train_losses[j]) >= tensorboard_every:
                    tmp = np.concatenate(
                        history_train_losses[j][-tensorboard_every:], axis=0)
                    tmp = np.mean(tmp, axis=0)
                    self.writer.add_scalar('agent{}/total_loss'.format(j), tmp[0], i)
                    self.writer.add_scalar('agent{}/logp_loss'.format(j), tmp[1], i)
                    self.writer.add_scalar('agent{}/intent_loss'.format(j), tmp[2], i)
                    self.writer.add_scalar('agent{}/price_loss'.format(j), tmp[3], i)

        # Periodic report
        if ((i + 1) % args.report_every) == 0:
            import seaborn as sns
            import matplotlib.pyplot as plt
            if args.histogram:
                sns.set_style('darkgrid')
            if train_policy:
                for j in range(2):
                    print('agent={}'.format(j), end=' ')
                    print('step:', i, end=' ')
                    print('reward:', rewards[j], end=' ')
                    print('scaled reward:', s_rewards[j], end=' ')
                    print('mean reward:', np.mean(self.all_rewards[j]))
                    if args.histogram:
                        self.agents[j].env.dialogue_generator.get_policyHistogram()
            if train_critic:
                critic_report_stats.output(i + 1, 0, 0, last_time)
                critic_report_stats = RLStatistics()
            print('-' * 10)
            if args.histogram:
                plt.show()
            last_time = time.time()

        # Save model
        if (i > 0 and i % 100 == 0) and not args.only_run:
            if train_policy:
                valid_stats = self.validate(args)
                self.drop_checkpoint(
                    args, i, valid_stats,
                    model_opt=self.agents[self.training_agent].env.model_args)
                self.update_opponent('policy')
            elif train_critic:
                # TODO: reverse!
                self.drop_checkpoint(
                    args, i, critic_stats,
                    model_opt=self.agents[self.training_agent].env.model_args)
                critic_stats = RLStatistics()
            else:
                valid_stats = self.validate(args)
                print('valid result: ', valid_stats.str_loss())
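# Sketch of the hyperparameters learn() reads from `args` (an assumption pieced
# together from the attribute accesses above, not a definitive config): a minimal
# namespace for a smoke test might look like the following; the chosen values are
# placeholders.
#
#   from argparse import Namespace
#   args = Namespace(model_type='reinforce', num_dialogues=1000, max_turns=20,
#                    verbose=False, discount_factor=0.95, report_every=50,
#                    histogram=False, only_run=False)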