Example #1
 def write_conversations_format(self, outfile, world):
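     # Persist the buffered episode logs to `outfile` in the Conversations
     # format, recording the world options and whether this was a self-chat run.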
     Conversations.save_conversations(
         self._logs,
         outfile,
         world.opt,
         self_chat=world.opt.get('selfchat_task', False),
     )
Example #2
 def write_conversations_format(self, outfile, world):
     logging.info(f'Saving log to {outfile} in Conversations format')
     Conversations.save_conversations(
         self._logs,
         outfile,
         world.opt,
         self_chat=world.opt.get('selfchat_task', False),
     )
Example #3
 def test_world_logging(self):
     with testing_utils.tempdir() as tmpdir:
         save_report = os.path.join(tmpdir, 'report')
         testing_utils.eval_model(
             dict(
                 model_file='zoo:unittest/transformer_generator2/model',
                 task='integration_tests:multiturn_candidate',
                 save_world_logs=True,
                 report_filename=save_report,
                 truncate=1024,
                 dynamic_batching='full',
                 batchsize=4,
             )
         )
         convo_file = (
             str(save_report)
             + '_integration_tests:multiturn_candidate_replies.jsonl'
         )
         convos = Conversations(convo_file)
         for convo in convos:
             self.assertEqual(len(convo), 2 * 4)  # 4 turns per episode, 2 messages per turn
             # now assert that they are all from the same dynamic batch index
             dyn_batch_idx = convo[0]['dyn_batch_idx']
             for i, turn in enumerate(convo):
                 if i % 2 == 0 and i > 0:
                     # we log the batch index in the teacher acts only
                     self.assertEqual(dyn_batch_idx, turn['dyn_batch_idx'])
Example #4
 def test_world_logging_buffersize(self):
     """
     Test world logging with dynamic batching.
     
     Checks when the number of examples exceeds the buffersize.
     """
     with testing_utils.tempdir() as tmpdir:
         save_report = os.path.join(tmpdir, 'report')
         testing_utils.eval_model(
             dict(
                 model_file='zoo:unittest/transformer_generator2/model',
                 task='integration_tests:RepeatTeacher:2000',
                 world_logs=save_report + '.jsonl',
                 report_filename=save_report,
                 truncate=1024,
                 dynamic_batching='full',
                 batchsize=4,
             ),
             valid_datatype='train:evalmode',
             skip_test=True,
         )
         convo_file = str(save_report) + '.jsonl'
         convos = Conversations(convo_file)
         # we expect there to be 2000 episodes logged in the convos
         self.assertEqual(len(convos), 2000)
Example #5
    def load_from_chunk(self, chunk_idx: int) -> List[ChunkOutput]:
        """
        Given the chunk index, load examples from that chunk.

        Return a list of tuples. The function `_create_message` will take these tuples
        to form the Message object that is returned by the teacher.
        """
        convs = Conversations(_path(self.opt, self.data_files[chunk_idx]))
        chunk = []
        for conv in convs:
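            # Drop any leading 'context' pseudo-turns, then split the rest
            # into alternating first-speaker / second-speaker messages.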
            turns = [t for t in conv.turns if t.get('id') != 'context']
            ep = self._get_ep_from_turns(turns[::2], turns[1::2])
            if not ep:
                continue
            chunk += ep
        return chunk
Example #6
    def _build_pairings_file(self):
        """
        Build and save pairings to pairings file.
        """
        onboarding_pairs = []
        with open(self.onboarding_path) as f:
            for line in f:
                onboarding_pairs.append(json.loads(line))

        pairings_filepath = self._get_vs_path('pairings_files')

        self._print_progress(f'building pairings file, saving at {pairings_filepath}')
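        # Load the logged chats for each config id so they can be paired
        # against one another in the pairings file.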
        conversations = {
            config_id: Conversations(self.chat_files[config_id])
            for config_id in self.config_ids
        }
        pairs = self._build_conversation_pairs(conversations)

        with open(pairings_filepath, 'w') as f:
            # Write the onboarding convo
            pairs = onboarding_pairs + pairs
            for pair in pairs:
                f.write(json.dumps(pair) + "\n")
Example #7
    def test_conversations(self):
        act_list = [
            [
                [
                    {'id': 'Emily', 'text': 'Hello, do you like this test?'},
                    {'id': 'Stephen', 'text': 'Why yes! I love this test!'},
                ],
                [
                    {'id': 'Emily', 'text': 'So will you stamp this diff?'},
                    {'id': 'Stephen', 'text': 'Yes, I will do it right now!'},
                ],
            ],
            [
                [
                    {
                        'id': 'A',
                        'text': 'Somebody once told me the world is gonna roll me',
                    },
                    {'id': 'B', 'text': 'I aint the sharpest tool in the shed'},
                ],
                [
                    {
                        'id': 'A',
                        'text': 'She was looking kind of dumb with her finger and her thumb',
                    },
                    {'id': 'B', 'text': 'In the shape of an L on her forehead'},
                ],
            ],
        ]
        self.opt = {
            'A': 'B',
            'C': 'D',
            'E': 'F',
        }

        self.convo_datapath = os.path.join(self.datapath, 'convo1')
        Conversations.save_conversations(
            act_list,
            self.convo_datapath,
            self.opt,
            self_chat=False,
            other_info='Blah blah blah',
        )
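        # save_conversations writes two files: the episodes themselves
        # (.jsonl) and a companion .metadata file.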
        assert os.path.exists(self.convo_datapath + '.jsonl')
        assert os.path.exists(self.convo_datapath + '.metadata')

        convos = Conversations(self.convo_datapath + '.jsonl')

        # test conversations loaded
        self.assertEqual(convos.num_conversations, 2)

        # test speakers saved
        speakers = {'Stephen', 'Emily', 'A', 'B'}
        self.assertEqual(set(convos.metadata.speakers), speakers)

        # test opt saved
        for x in ['A', 'C', 'E']:
            self.assertEqual(
                self.opt[x], convos.metadata.opt[x],
            )

        # test kwargs
        self.assertEqual({'other_info': 'Blah blah blah'}, convos.metadata.extra_data)

        # test reading conversations
        with testing_utils.capture_output() as out:
            convos.read_conv_idx(0)
        str_version = (
            'Emily: Hello, do you like this test?\n'
            'Stephen: Why yes! I love this test!\n'
            'Emily: So will you stamp this diff?\n'
            'Stephen: Yes, I will do it right now!\n'
        )
        self.assertIn(str_version, out.getvalue())
Example #8
    def run(self):
        opt = self.opt
        if len(self.opt["agent_suffixes"]) % 2 != 0:
            raise RuntimeError("Agent suffix input should be even")
        suffixes = {}
        for i in range(len(self.opt["agent_suffixes"]) // 2):
            agent = self.opt["agent_suffixes"][2 * i]
            suffix = self.opt["agent_suffixes"][2 * i + 1]
            suffixes[agent] = suffix

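        # Pull per-conversation TOD metrics from the saved report; these are
        # attached to each cleaned conversation as context below.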
        with PathManager.open(opt["report_path"]) as r:
            report = json.load(r)["report"]
        tod_metrics = report["tod_metrics"]

        if opt["num_conversations"] > -1:
            tod_metrics = tod_metrics[:opt["num_conversations"]]

        source = self.opt["source_file"].replace(".jsonl", "")
        if self.opt["out_file"]:
            out = self.opt["out_file"]
        else:
            if "conversations" in source:
                # just to make sure we don't overwrite anything...
                out = source.replace("conversations", "cleaned_conversations")
            else:
                out = "cleaned_" + source

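        # Collect every speaker id that survives filtering so the new
        # metadata file lists the correct speaker set.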
        speakers = []
        with PathManager.open(out + ".jsonl", "w") as f:
            conversations = Conversations(source + ".jsonl")
            for i, conversation in enumerate(conversations):
                if opt["num_conversations"] >= 0 and i >= opt["num_conversations"]:
                    break
                cleaned_dialog = []
                for parlay_round in conversation.episode["dialog"]:
                    cleaned_parlay_round = []
                    for turn in parlay_round:
                        turn_type = self._get_turn_type(turn)
                        if turn_type in self.opt["included_speakers"]:
                            if turn_type in suffixes:
                                turn["id"] += suffixes[turn_type]
                            if turn["id"] not in speakers:
                                speakers.append(turn["id"])
                            cleaned_parlay_round.append(turn)
                    if len(cleaned_parlay_round) > 0:
                        cleaned_dialog.append(cleaned_parlay_round)
                convo = {}
                convo["dialog"] = cleaned_dialog
                convo["metadata_path"] = Metadata._get_path(out)
                convo["context"] = [
                    {
                        "synthetic_task_success": tod_metrics[i]["synthetic_task_success"],
                        "goal_text": tod_metrics[i]["goal"]["text"],
                    }
                ]
                json_convo = json.dumps(convo)
                f.write(json_convo + "\n")

            old_meta = Metadata(source + ".jsonl")
            Metadata.save_metadata(
                out, old_meta.opt, old_meta.self_chat, speakers, **old_meta.extra_data
            )
Example #9
    def test_conversations(self):
        act_list = [
            [
                [
                    {'id': 'Emily', 'text': 'Hello, do you like this test?'},
                    {'id': 'Stephen', 'text': 'Why yes! I love this test!'},
                ],
                [
                    {'id': 'Emily', 'text': 'So will you stamp this diff?'},
                    {'id': 'Stephen', 'text': 'Yes, I will do it right now!'},
                ],
            ],
            [
                [
                    {
                        'id': 'A',
                        'text': 'Somebody once told me the world is gonna roll me',
                    },
                    {'id': 'B', 'text': 'I aint the sharpest tool in the shed'},
                ],
                [
                    {
                        'id': 'A',
                        'text': 'She was looking kind of dumb with her finger and her thumb',
                    },
                    {'id': 'B', 'text': 'In the shape of an L on her forehead'},
                ],
            ],
        ]
        self.opt = {'A': 'B', 'C': 'D', 'E': 'F'}

        self.convo_datapath = os.path.join(self.datapath, 'convo1')
        Conversations.save_conversations(
            act_list,
            self.convo_datapath,
            self.opt,
            self_chat=False,
            other_info='Blah blah blah',
        )
        assert os.path.exists(self.convo_datapath + '.jsonl')
        assert os.path.exists(self.convo_datapath + '.metadata')

        convos = Conversations(self.convo_datapath + '.jsonl')

        # test conversations loaded
        self.assertEqual(len(convos), 2)

        # test speakers saved
        speakers = {'Stephen', 'Emily', 'A', 'B'}
        self.assertEqual(set(convos.metadata.speakers), speakers)

        # test opt saved
        for x in ['A', 'C', 'E']:
            self.assertEqual(self.opt[x], convos.metadata.opt[x])

        # test kwargs
        self.assertEqual({'other_info': 'Blah blah blah'}, convos.metadata.extra_data)

        # test reading conversations
        with self.assertLogs(logger=logging.logger, level='DEBUG') as cm:
            convos.read_conv_idx(0)
            str_version = (
                'Emily: Hello, do you like this test?\n'
                'Stephen: Why yes! I love this test!\n'
                'Emily: So will you stamp this diff?\n'
                'Stephen: Yes, I will do it right now!\n'
            )
            self.assertIn(str_version, "\n".join(cm.output))

        # test getting a specific turn
        first = convos[0]  # Conversation
        self.assertEqual(first[0].id, 'Emily')
        self.assertEqual(first[3].text, 'Yes, I will do it right now!')
Example #10
def dump_data(opt):
    """
    Dump task data to ACUTE-Eval.
    """
    # create repeat label agent and assign it to the specified task
    agent = RepeatLabelAgent(opt)
    world = create_task(opt, agent)
    task = opt.get('task')
    speaker_0_id = opt.get('speaker_0_id') or f'{task}_as_human'
    speaker_1_id = opt.get('speaker_1_id') or f'{task}_as_model'
    if opt['outfile'] is None:
        outfile = tempfile.mkstemp(
            prefix='{}_{}_'.format(opt['task'], opt['datatype']), suffix='.txt'
        )[1]
    else:
        outfile = opt['outfile']

    num_episodes = (
        world.num_episodes()
        if opt['num_episodes'] == -1
        else min(opt['num_episodes'], world.num_episodes())
    )
    log_timer = TimeLogger()

    print(f'[ starting to convert, saving output to {outfile} ]')
    dialogues = []
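    # Each episode is a list of turns; each turn pairs the human-side text
    # with a randomly chosen label as the model-side response.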
    for _ in range(num_episodes):
        episode = []
        episode_done = False
        while not episode_done:
            world.parley()
            acts = world.get_acts()
            text = acts[0].get('text')
            split_text = text.split('\n')
            label = random.choice(
                acts[0].get('labels', acts[0].pop('eval_labels', None))
            )
            if not episode and opt.get('prepended_context'):
                # first turn
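                # split the context lines off the prompt and log them as a
                # separate 'context' turn for both speaker columns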
                context = split_text[:-1]
                text = split_text[-1]
                context_turn = [
                    {'text': context, 'episode_done': False, 'id': 'context'}
                    for _ in range(2)
                ]
                episode.append(context_turn)
            turn = [
                {'text': text, 'episode_done': False, 'id': speaker_0_id},
                {'text': label, 'episode_done': False, 'id': speaker_1_id},
            ]
            episode.append(turn)
            if acts[0].get('episode_done', False):
                episode[-1][-1]['episode_done'] = True
                episode_done = True
                dialogues.append(episode)

            if log_timer.time() > opt['log_every_n_secs']:
                text, _log = log_timer.log(world.total_parleys, world.num_examples())
                print(text)

        if world.epoch_done():
            break

    Conversations.save_conversations(dialogues, outfile, opt)