def _load_gender_data(self, opt):
    """
    Return the path of the image-chat gender captions file.

    The file name is derived from the part of ``opt['datatype']`` that
    precedes the first ``:``; the file lives under
    ``<datapath>/md_gender/data_to_release/image_chat``.

    :param opt: options dict; ``'datatype'`` and ``'datapath'`` are read.
    :return: path to the ``.jsonl`` file as a string (the file is not opened
        here).
    """
    build(opt)
    split_name = opt['datatype'].partition(':')[0]
    filename = 'engaging_imagechat_gender_captions_hashed.{}.jsonl'.format(
        split_name
    )
    return os.path.join(
        opt['datapath'], 'md_gender', 'data_to_release', 'image_chat', filename
    )
def _get_convos(self, opt):
    """
    Load every conversation stored in the data file.

    The file at ``<datapath>/NEW_DATAFILE`` is read in full; each line is
    parsed as one JSON document.

    :param opt: options dict; ``'datapath'`` is read.
    :return: list with one parsed JSON object per line of the file.
    """
    build(opt)
    datafile = os.path.join(opt['datapath'], NEW_DATAFILE)
    with open(datafile, 'r') as handle:
        raw_lines = handle.read().splitlines()
    return [json.loads(raw) for raw in raw_lines]
def _load_persona_map(self, opt):
    """
    Read the ConvAI2 persona map and store it on ``self.persona_map``.

    :param opt: options dict; ``'datapath'`` is read.
    """
    build(opt)
    relative_parts = (
        'md_gender',
        'data_to_release',
        'convai2',
        'convai2_all_personas_map.json',
    )
    map_file = os.path.join(opt['datapath'], *relative_parts)
    with open(map_file, 'rb') as stream:
        loaded = json.load(stream)
    self.persona_map = loaded
def _load_gender_data(self, opt):
    """
    Load the wizard examples and tag each one as an ``about`` example.

    Every line of ``<datapath>/md_gender/data_to_release/wizard/<dt>.jsonl``
    (``dt`` being the part of ``opt['datatype']`` before the first ``:``) is
    parsed as JSON. Each parsed example gets ``class_type`` set to ``'about'``
    and ``label`` set to ``'ABOUT:<gender>'``, using its own ``gender`` field.

    :param opt: options dict; ``'datatype'`` and ``'datapath'`` are read.
    :return: list of the augmented example dicts, in file order.
    """
    build(opt)
    dt = opt['datatype'].split(':')[0]
    jsonl_path = os.path.join(
        opt['datapath'], 'md_gender', 'data_to_release', f'wizard/{dt}.jsonl'
    )
    with open(jsonl_path, 'r') as handle:
        raw_lines = handle.read().splitlines()

    examples = []
    for raw in raw_lines:
        record = json.loads(raw)
        record['class_type'] = 'about'
        record['label'] = 'ABOUT:{}'.format(record['gender'])
        examples.append(record)
    return examples
def get_explicitly_gendered_words(opt):
    """
    Load the lists of explicitly gendered words.

    The word lists originate from
    <https://github.com/uclanlp/gn_glove/blob/main/wordlist/>.
    Examples include brother, girl, actress, husbands, etc.

    :param opt: options dict; ``'datapath'`` is read.
    :return: tuple ``(male, female)``, each a list holding the lines of the
        corresponding word file.
    """
    build(opt)
    word_dir = os.path.join(
        opt['datapath'], 'md_gender', 'data_to_release', 'word_list'
    )

    def _read_lines(filename):
        # One entry per line of the given word file.
        with open(os.path.join(word_dir, filename), 'r') as handle:
            return handle.read().splitlines()

    male = _read_lines('male_word_file.txt')
    female = _read_lines('female_word_file.txt')
    return male, female
def _setup_data(self, opt):
    """
    Build ``self.data`` from LIGHT utterances labelled with persona genders.

    Steps:

    1. Load the persona file and map each integer character id to its name
       and gender.
    2. Load the pickled LIGHT dialogues for the current datatype and pair
       every utterance with the speaker's and the partner's character id.
    3. For each utterance whose two characters are both in the persona map,
       emit a ``SELF:<gender>`` and/or ``PARTNER:<gender>`` example,
       depending on ``self.labels_to_use``.
    4. Optionally append (a sample of) the inferred "about" data when
       ``self.labels_to_use == 'all'`` and ``self.add_unknown_classes``.
    5. Print the total number of examples and per-gender counts.

    :param opt: options dict; ``'datatype'`` and ``'datapath'`` are read.
        ``self.opt`` (``'task'``, ``'unknown_temp'``) is read for step 4.
    """
    build(opt)
    datatype = opt['datatype']
    dt = datatype.split(':')[0]

    # Build a map from persona to gender. The context manager guarantees
    # the file handle is closed once parsing is done.
    with open(os.path.join(opt['datapath'], PERSONA_PATH), 'rb') as f:
        personas = json.load(f)['old']
    persona_map = {
        int(x['char_id']): {'name': x['name'], 'gender': gender}
        for gender, lst in personas.items()
        for x in lst
    }

    # Build a list of dialogue utterances and associated persona IDs.
    # NOTE(review): pickle.load can execute arbitrary code; this is only
    # safe if the file is trusted (presumably it is the one prepared by
    # build(opt) -- confirm).
    light_path = os.path.join(opt['datapath'], LIGHT_DATA_PATH.format(dt))
    with open(light_path, 'rb') as f:
        light_world = pickle.load(f)

    utt_to_pers = []
    for x in light_world:
        conv_info = x['conv_info']
        for act in conv_info['acts']:
            text = act['text']
            p_uid = act['id'].lower()
            self_char_id = None
            partner_char_id = None
            for y in conv_info['characters']:
                char_name = y[0].lower()
                if char_name == p_uid:
                    # the speaker of this utterance
                    self_char_id = y[1]['id']
                    self_name = char_name
                else:
                    # any other character is treated as the partner
                    partner_char_id = y[1]['id']
                    partner_name = char_name
            # Names are only read when both ids were set in this pass, so
            # they can never be stale values from a previous utterance.
            if self_char_id is not None and partner_char_id is not None:
                utt_to_pers.append(
                    {
                        'text': text,
                        'self_id': self_char_id,
                        'self_name': self_name,
                        'partner_id': partner_char_id,
                        'partner_name': partner_name,
                    }
                )

    self.data = []
    counts = {
        'partner': {gend_utils.UNKNOWN: 0, gend_utils.FEM: 0, gend_utils.MASC: 0},
        'self': {gend_utils.UNKNOWN: 0, gend_utils.FEM: 0, gend_utils.MASC: 0},
    }
    for x in utt_to_pers:
        # Skip utterances whose speaker or partner has no known persona.
        if x['self_id'] not in persona_map or x['partner_id'] not in persona_map:
            continue

        self_gender = persona_map[x['self_id']]['gender']
        partner_gender = persona_map[x['partner_id']]['gender']
        act = {
            'text': x['text'].lower(),
            'self_id': x['self_id'],
            'partner_id': x['partner_id'],
            'id': 'LIGHT Gender',
            'episode_done': True,
        }
        if self_gender == gend_utils.NEUTRAL:
            # not True neutral
            self_gender = gend_utils.UNKNOWN
        if partner_gender == gend_utils.NEUTRAL:
            # not True neutral
            partner_gender = gend_utils.UNKNOWN

        if self_gender is not None and self.labels_to_use != 'partner':
            self_act = deepcopy(act)
            self_act['labels'] = [f'SELF:{self_gender}']
            self_act['class_type'] = 'self'
            self.data.append(self_act)
        if partner_gender is not None and self.labels_to_use != 'self':
            partner_act = deepcopy(act)
            partner_act['labels'] = [f'PARTNER:{partner_gender}']
            partner_act['class_type'] = 'partner'
            self.data.append(partner_act)

        # Counts are tracked for both roles regardless of labels_to_use.
        counts['partner'][partner_gender] += 1
        counts['self'][self_gender] += 1

    if self.labels_to_use == 'all' and self.add_unknown_classes:
        # load about data
        all_about_data = gend_utils.get_inferred_about_data(
            self.opt['task'], self.opt
        )
        sample_rate = self.opt['unknown_temp']
        if sample_rate < 1.0:
            # keep only a random fraction (sample_rate) of the about data
            to_samp = int(sample_rate * len(all_about_data))
            self.data += random.sample(all_about_data, to_samp)
        else:
            self.data += all_about_data

    total = len(self.data)
    print(f'Total: {total}')
    for x in ['self', 'partner']:
        print(f'Totals for {x}:')
        subtot = sum(counts[x].values())
        for k, v in counts[x].items():
            # Guard against an empty dataset: report 0.0 instead of
            # raising ZeroDivisionError when nothing was counted.
            frac = v / subtot if subtot else 0.0
            print(f'\t{k}: {v} ({frac})')