コード例 #1
0
    def _setup_data(self, opt):
        """
        Load the gender-labelled examples and optionally add unknown-class data.

        When ``self.add_unknown_classes`` is set, the inferred "about" data and
        a true-neutral PARTNER copy of every loaded example are collected as
        extra data. If ``opt['unknown_temp']`` is below 1.0 only that fraction
        of the extra data (randomly sampled) is added; otherwise all of it is.
        The result is shuffled when ``self.is_train`` is set.

        :param opt: options dict; ``opt['datatype']`` is read here, while
            ``task``, ``datatype`` and ``unknown_temp`` are read from
            ``self.opt``.
        :return: list of example dicts.
        """
        # check that the data was downloaded and set up properly
        self._check_data_downloaded(opt)
        # Load map from image ID to gender; copy so that extending/shuffling
        # below never mutates the list object handed back by the loader
        data = list(self._load_gender_data(opt['datatype']))

        if self.add_unknown_classes:
            # load about data (unknown but inferred)
            # NOTE(review): other callers in this project pass the whole opt
            # dict as the second argument -- confirm which one is expected.
            extra_data = gend_utils.get_inferred_about_data(
                self.opt['task'], self.opt['datatype'])

            # now create partner/TO data: true neutral
            for ex in data:
                partner_ex = deepcopy(ex)
                partner_ex['labels'] = [f'PARTNER:{gend_utils.NEUTRAL}']
                partner_ex['class_type'] = 'neutral'
                # append the relabelled copy, not the original example
                extra_data.append(partner_ex)

            # add the extra data exactly once, down-sampled if requested
            sample_rate = self.opt['unknown_temp']
            if sample_rate < 1.0:
                to_samp = int(sample_rate * len(extra_data))
                data += random.sample(extra_data, to_samp)
            else:
                data += extra_data

        if self.is_train:
            random.shuffle(data)

        return data
コード例 #2
0
    def _setup_data(self, opt):
        """
        Build ``self.data`` from the LIGHT dialogues and the persona file.

        Every utterance whose speaker and partner character IDs are both
        present in the persona map yields up to two examples: one labelled
        ``SELF:<gender>`` and one labelled ``PARTNER:<gender>``, depending on
        ``self.labels_to_use``. Neutral personas are relabelled as unknown.
        When ``self.labels_to_use == 'all'`` and ``self.add_unknown_classes``
        is set, the inferred "about" data is appended, down-sampled by
        ``self.opt['unknown_temp']`` when that is below 1.0. Label totals are
        printed at the end.

        :param opt: options dict; ``datatype`` and ``datapath`` are read here.
        """
        build(opt)
        dt = opt['datatype'].split(':')[0]

        # Build a map from persona ID to name and gender
        persona_path = os.path.join(opt['datapath'], PERSONA_PATH)
        with open(persona_path, 'rb') as f:
            personas = json.load(f)['old']
        persona_map = {}
        for gender, lst in personas.items():
            for x in lst:
                persona_map[int(x['char_id'])] = {
                    'name': x['name'],
                    'gender': gender,
                }

        # Build a list of dialogue utterances and associated persona IDs
        # NOTE(security): unpickling executes arbitrary code; this is only
        # safe as long as the file under datapath comes from a trusted source.
        light_path = os.path.join(opt['datapath'], LIGHT_DATA_PATH.format(dt))
        with open(light_path, 'rb') as f:
            light_world = pickle.load(f)

        utt_to_pers = []
        for x in light_world:
            characters = x['conv_info']['characters']
            for act in x['conv_info']['acts']:
                p_uid = act['id'].lower()
                self_char_id = None
                self_name = None
                partner_char_id = None
                partner_name = None
                for y in characters:
                    name = y[0].lower()
                    if name == p_uid:
                        # the speaker of this utterance
                        self_char_id = y[1]['id']
                        self_name = name
                    else:
                        # anyone else is the partner (the last one wins)
                        partner_char_id = y[1]['id']
                        partner_name = name
                if self_char_id is not None and partner_char_id is not None:
                    utt_to_pers.append({
                        'text': act['text'],
                        'self_id': self_char_id,
                        'self_name': self_name,
                        'partner_id': partner_char_id,
                        'partner_name': partner_name,
                    })

        self.data = []

        counts = {
            'partner': {
                gend_utils.UNKNOWN: 0,
                gend_utils.FEM: 0,
                gend_utils.MASC: 0,
            },
            'self': {
                gend_utils.UNKNOWN: 0,
                gend_utils.FEM: 0,
                gend_utils.MASC: 0,
            },
        }

        for x in utt_to_pers:
            if x['self_id'] not in persona_map or x['partner_id'] not in persona_map:
                # no gender information for one of the two characters
                continue

            self_gender = persona_map[x['self_id']]['gender']
            partner_gender = persona_map[x['partner_id']]['gender']
            act = {
                'text': x['text'].lower(),
                'self_id': x['self_id'],
                'partner_id': x['partner_id'],
                'id': 'LIGHT Gender',
                'episode_done': True,
            }
            if self_gender == gend_utils.NEUTRAL:
                # not True neutral
                self_gender = gend_utils.UNKNOWN
            if partner_gender == gend_utils.NEUTRAL:
                # not True neutral
                partner_gender = gend_utils.UNKNOWN
            if self_gender is not None and self.labels_to_use != 'partner':
                self_act = deepcopy(act)
                self_act['labels'] = [f'SELF:{self_gender}']
                self_act['class_type'] = 'self'
                self.data.append(self_act)
            if partner_gender is not None and self.labels_to_use != 'self':
                partner_act = deepcopy(act)
                partner_act['labels'] = [f'PARTNER:{partner_gender}']
                partner_act['class_type'] = 'partner'
                self.data.append(partner_act)

            counts['partner'][partner_gender] += 1
            counts['self'][self_gender] += 1

        if self.labels_to_use == 'all' and self.add_unknown_classes:
            # load about data
            all_about_data = gend_utils.get_inferred_about_data(
                self.opt['task'], self.opt)
            sample_rate = self.opt['unknown_temp']
            if sample_rate < 1.0:
                # keep only a random fraction of the inferred data
                to_samp = int(sample_rate * len(all_about_data))
                self.data += random.sample(all_about_data, to_samp)
            else:
                self.data += all_about_data

        total = len(self.data)
        print(f'Total: {total}')
        for x in ['self', 'partner']:
            print(f'Totals for {x}:')
            subtot = sum(counts[x].values())
            for k, v in counts[x].items():
                # report 0.0 instead of dividing by zero when nothing matched
                frac = v / subtot if subtot else 0.0
                print(f'\t{k}: {v} ({frac})')
コード例 #3
0
    def _setup_data(self, opt):
        """
        Rebuild the original ConvAI2 episodes as gender-labelled examples.

        Each episode of the original teacher is read in full, the persona
        lines are stripped from its first utterance, and
        ``self.get_genders`` is asked for the gender of both speakers. Every
        utterance then yields up to two single-turn examples, labelled
        ``SELF:<gender>`` and ``PARTNER:<gender>``, depending on
        ``self.labels_to_use``. When ``self.labels_to_use == 'all'`` and
        ``self.add_unknown_classes`` is set, the inferred "about" data is
        appended, down-sampled by ``self.opt['unknown_temp']`` when that is
        below 1.0. The result is shuffled for training, stored in
        ``self.data``, and label totals are printed.

        :param opt: options dict; ``datatype`` is read here and the dict is
            passed on to the original ConvAI2 teacher.
        """
        counts = {
            'partner': {
                gend_utils.UNKNOWN: 0,
                gend_utils.FEM: 0,
                gend_utils.MASC: 0,
            },
            'self': {
                gend_utils.UNKNOWN: 0,
                gend_utils.FEM: 0,
                gend_utils.MASC: 0,
            },
        }

        if opt['datatype'].split(':')[0] == 'test':
            # NOTE(review): only a warning is emitted here; opt is handed to
            # the original teacher unchanged -- confirm that the switch to
            # valid actually happens there.
            warn_once('No test set; switching to valid')

        # build data
        print('[ Building data ... ]')
        new_eps = []
        orig_teacher = OrigConvai2Teacher(opt)
        total_exs = orig_teacher.num_examples()
        num_exs = 0
        while num_exs < total_exs:
            current_episode = []
            episode_done = False

            while not episode_done:
                # TODO: eventually all teachers should return Messages, so
                # we should assert this
                action = Message(orig_teacher.act())
                current_episode.append(action)
                episode_done = action.get('episode_done', False)
                num_exs += 1

            # now we have the entire episode: split the persona lines off the
            # first utterance
            first_ex_text = []
            partner_persona = []
            your_persona = []
            for line in current_episode[0]['text'].split('\n'):
                # NOTE: we flip "your" and "partner" here since we are taking the 'text'
                # field instead of the 'label'
                if 'partner\'s persona: ' in line:
                    your_persona.append(line.split('partner\'s persona: ')[1])
                elif 'your persona: ' in line:
                    partner_persona.append(line.split('your persona: ')[1])
                else:
                    first_ex_text.append(line)

            your, your_prob, partner, partner_prob = self.get_genders(
                your_persona, partner_persona)

            # every utterance of the episode counts once per speaker; a None
            # gender carries no label below, so it is not counted either
            if your is not None:
                counts['self'][your] += len(current_episode)
            if partner is not None:
                counts['partner'][partner] += len(current_episode)

            # the persona strings are the same for the whole episode
            your_persona_text = '\n'.join(your_persona)
            partner_persona_text = '\n'.join(partner_persona)

            for i, ex in enumerate(current_episode):
                # the first utterance is used without its persona lines
                text = '\n'.join(first_ex_text) if i == 0 else ex['text']
                new_ex = {
                    'text': text,
                    'episode_done': True,
                    'your_persona': your_persona_text,
                    'partner_persona': partner_persona_text,
                    'id': 'ConvAI2 Gender',
                }
                # the probabilities are attached only when use_probably is off
                if not self.use_probably:
                    new_ex['partner_prob'] = partner_prob
                    new_ex['your_prob'] = your_prob

                if your is not None and self.labels_to_use != 'partner':
                    # Get the your task
                    your_ex = deepcopy(new_ex)
                    your_ex['labels'] = [f'SELF:{your}']
                    your_ex['class_type'] = 'self'
                    new_eps.append(your_ex)

                if partner is not None and self.labels_to_use != 'self':
                    # Get the partner task
                    partner_ex = deepcopy(new_ex)
                    partner_ex['labels'] = [f'PARTNER:{partner}']
                    partner_ex['class_type'] = 'partner'
                    new_eps.append(partner_ex)

        if self.labels_to_use == 'all' and self.add_unknown_classes:
            # load about data
            all_about_data = gend_utils.get_inferred_about_data(
                self.opt['task'], self.opt)
            sample_rate = self.opt['unknown_temp']
            if sample_rate < 1.0:
                # keep only a random fraction of the inferred data
                to_samp = int(sample_rate * len(all_about_data))
                new_eps += random.sample(all_about_data, to_samp)
            else:
                new_eps += all_about_data

        if self.is_train:
            random.shuffle(new_eps)

        self.data = new_eps
        print(f'Missing cnt: {self.missing_cnt} / {len(self.data) * 2}')
        for x in ['self', 'partner']:
            print(f'Totals for {x}:')
            subtot = sum(counts[x].values())
            for k, v in counts[x].items():
                # report 0.0 instead of dividing by zero when nothing was counted
                frac = v / subtot if subtot else 0.0
                print(f'\t{k}: {v} ({frac})')