def _parseGoal(self, goal, true_goal, domain): """Parses user goal into dictionary format.""" goal[domain] = {} goal[domain] = {'informable': {}, 'requestable': [], 'booking': []} if 'info' in true_goal[domain]: if domain == 'train': # we consider dialogues only where train had to be booked! if 'book' in true_goal[domain]: goal[domain]['requestable'].append('reference') if 'reqt' in true_goal[domain]: if 'id' in true_goal[domain]['reqt']: goal[domain]['requestable'].append('id') else: if 'reqt' in true_goal[domain]: for s in true_goal[domain]['reqt']: # addtional requests: if s in [ 'phone', 'address', 'postcode', 'reference', 'id' ]: # ones that can be easily delexicalized goal[domain]['requestable'].append(s) if 'book' in true_goal[domain]: goal[domain]['requestable'].append("reference") for s, v in true_goal[domain]['info'].items(): s_, v_ = clean_slot_values(domain, s, v) if len(v_.split()) > 1: v_ = ' '.join( [token.text for token in self.reader.nlp(v_)]).strip() goal[domain]["informable"][s_] = v_ if 'book' in true_goal[domain]: goal[domain]["booking"] = true_goal[domain]['book'] return goal
def preprocess_db(db_paths):
    """Normalise every domain database and save it next to the original.

    For each domain in ``ontology.all_domains`` the JSON file at
    ``db_paths[domain]`` is read in lower case.  Every string-valued field
    of every entry is passed through ``clean_slot_values`` and re-tokenised
    with spaCy; fields of any other type are copied unchanged.  The result
    is written to the same path with ``.json`` replaced by
    ``_processed.json``.
    """
    dbs = {}
    tokenizer = spacy.load('en_core_web_sm')

    for domain in ontology.all_domains:
        source_path = db_paths[domain]
        with open(source_path, 'r') as f:
            dbs[domain] = json.loads(f.read().lower())

        for idx, entry in enumerate(dbs[domain]):
            cleaned = copy.deepcopy(entry)
            for raw_key, raw_value in entry.items():
                if type(raw_value) is not str:
                    continue
                # Remove the raw field and re-insert it under the key that
                # clean_slot_values returns.
                del cleaned[raw_key]
                key, value = clean_slot_values(domain, raw_key, raw_value)
                cleaned[key] = ' '.join(
                    tok.text for tok in tokenizer(value)).strip()
            dbs[domain][idx] = cleaned

        target_path = source_path.replace('.json', '_processed.json')
        with open(target_path, 'w') as f:
            json.dump(dbs[domain], f, indent=2)
        print('[%s] DB processed! ' % domain)
def preprocess_main(self, save_path=None, is_test=False):
    """Flatten every annotated dialogue into per-turn records.

    Iterates over ``self.convlab_data`` (file name -> raw dialogue).  A user
    turn (empty ``metadata``) only stores its cleaned and delexicalised
    text; the following system turn completes the record with the
    delexicalised response, the accumulated belief state, the system
    dialogue acts, the DB/booking pointer and the active turn domain(s),
    and the record is appended to the dialogue log.

    Side effects:
        * ``self.unique_da`` is reset and filled with every ``domain-act``
          pair that occurs;
        * tokens of the records are added to ``self.vocab``, which is then
          constructed and saved to ``data/multi-woz-processed/vocab``;
        * per-turn system acts and the act inventory are written to
          ``data/multi-woz-analysis/dialog_acts.json`` and
          ``data/multi-woz-analysis/dialog_act_type.json``;
        * requestable slot names inside the raw goals are normalised in
          place.

    Args:
        save_path: Not used by this method.
        is_test: Not used by this method.

    Returns:
        dict: file name -> ``{'goal': <non-empty goal parts>, 'log': [...]}``.
    """
    data = {}
    self.unique_da = {}
    ordered_sysact_dict = {}

    # Booking acts are renamed and attributed to the active domain.
    booking_act_map = {'inform': 'offerbook', 'book': 'offerbooked'}
    # Acts that are placed after all other acts of the same turn.
    trailing_acts = ('request', 'reqmore', 'bye', 'offerbook')

    for fn, raw_dial in tqdm(list(self.convlab_data.items())):
        # ---- goal: keep non-empty parts, normalise requestable slots ----
        compressed_goal = {}
        dial_domains = []
        for dom, g in raw_dial['goal'].items():
            if dom == 'topic' or dom == 'message' or not g:
                continue
            if g.get('reqt'):
                for i, req_slot in enumerate(g['reqt']):
                    if ontology.normlize_slot_names.get(req_slot):
                        g['reqt'][i] = ontology.normlize_slot_names[req_slot]
            compressed_goal[dom] = g
            if dom in ontology.all_domains:
                dial_domains.append(dom)

        dial = {'goal': compressed_goal, 'log': []}
        single_turn = {}
        constraint_dict = OrderedDict()
        prev_constraint_dict = {}
        prev_turn_domain = ['general']
        ordered_sysact_dict[fn] = {}

        for dial_turn in raw_dial['log']:
            dial_state = dial_turn['metadata']

            if not dial_state:  # user
                u = ' '.join(clean_text(dial_turn['text']).split())
                if dial_turn['span_info']:
                    u_delex = clean_text(self.delex_by_annotation(dial_turn))
                else:
                    u_delex = self.delex_by_valdict(dial_turn['text'])
                single_turn['user'] = u
                single_turn['user_delex'] = u_delex
                continue

            # ---- system turn ----
            if dial_turn['span_info']:
                s_delex = clean_text(self.delex_by_annotation(dial_turn))
            else:
                if not dial_turn['text']:
                    print(fn)
                s_delex = self.delex_by_valdict(dial_turn['text'])
            single_turn['resp'] = s_delex

            # get belief state ('semi' slots first, then 'book' slots)
            for domain in dial_domains:
                if not constraint_dict.get(domain):
                    constraint_dict[domain] = OrderedDict()
                for section in ('semi', 'book'):
                    for s, v in dial_state[domain][section].items():
                        if section == 'book' and s == 'booked':
                            continue
                        s, v = clean_slot_values(domain, s, v)
                        if len(v.split()) > 1:
                            v = ' '.join(
                                [token.text for token in self.nlp(v)]).strip()
                        if v != '':
                            constraint_dict[domain][s] = v

            constraints = []
            cons_delex = []
            turn_dom_bs = []  # domains whose belief state changed this turn
            for domain, info_slots in constraint_dict.items():
                if not info_slots:
                    continue
                constraints.append('[' + domain + ']')
                cons_delex.append('[' + domain + ']')
                for slot, value in info_slots.items():
                    constraints.append(slot)
                    constraints.extend(value.split())
                    cons_delex.append(slot)
                if (domain not in prev_constraint_dict
                        or prev_constraint_dict[domain]
                        != constraint_dict[domain]):
                    turn_dom_bs.append(domain)

            # Domains mentioned by the dialogue acts.  De-duplicate in
            # first-seen order: a plain set() would make the order (and with
            # it turn_domain / match_dom) depend on string-hash
            # randomisation between interpreter runs.
            turn_dom_da = list(dict.fromkeys(
                act.split('-')[0] for act in dial_turn['dialog_act']))
            if len(turn_dom_da) != 1 and 'general' in turn_dom_da:
                turn_dom_da.remove('general')
            if len(turn_dom_da) != 1 and 'booking' in turn_dom_da:
                turn_dom_da.remove('booking')

            # get turn domain
            turn_domain = turn_dom_bs
            for dom in turn_dom_da:
                if dom != 'booking' and dom not in turn_domain:
                    turn_domain.append(dom)
            if not turn_domain:
                turn_domain = prev_turn_domain
            if len(turn_domain) == 2 and 'general' in turn_domain:
                turn_domain.remove('general')
            if len(turn_domain) == 2:
                # put the domain carried over from the previous turn first
                if (len(prev_turn_domain) == 1
                        and prev_turn_domain[0] == turn_domain[1]):
                    turn_domain = turn_domain[::-1]

            # get system action
            sys_act_dict = {}
            for dom in turn_domain:
                sys_act_dict[dom] = {}
            add_to_last_collect = []
            for act, params in dial_turn['dialog_act'].items():
                if act == 'general-greet':
                    continue
                d, a = act.split('-')
                if d == 'general' and d not in sys_act_dict:
                    sys_act_dict[d] = {}
                if d == 'booking':
                    d = turn_domain[0]
                    a = booking_act_map.get(a, a)
                add_p = []
                for param in params:
                    p = param[0]
                    if p == 'none':
                        continue
                    elif ontology.da_abbr_to_slot_name.get(p):
                        p = ontology.da_abbr_to_slot_name[p]
                    if p not in add_p:
                        add_p.append(p)
                if a in trailing_acts:
                    add_to_last_collect.append((d, a, add_p))
                else:
                    sys_act_dict[d][a] = add_p
            for d, a, add_p in add_to_last_collect:
                sys_act_dict[d][a] = add_p

            # Iterate over a snapshot of the keys: entries are deleted below.
            for d in list(sys_act_dict):
                acts = sys_act_dict[d]
                if not acts:
                    del sys_act_dict[d]
                    continue
                if 'inform' in acts and 'offerbooked' in acts:
                    # fold the informed slots into the 'offerbooked' act
                    for s in acts['inform']:
                        acts['offerbooked'].append(s)
                    del acts['inform']

            ordered_sysact_dict[fn][len(dial['log'])] = sys_act_dict

            sys_act = []
            if 'general-greet' in dial_turn['dialog_act']:
                sys_act.extend(['[general]', '[greet]'])
            for d, acts in sys_act_dict.items():
                sys_act += ['[' + d + ']']
                for a, slots in acts.items():
                    self.unique_da[d + '-' + a] = 1
                    sys_act += ['[' + a + ']']
                    sys_act += slots

            # get db pointers
            matnums = self.db.get_match_num(constraint_dict)
            match_dom = (turn_domain[0] if len(turn_domain) == 1
                         else turn_domain[1])
            match = matnums[match_dom]
            dbvec = self.db.addDBPointer(match_dom, match)
            bkvec = self.db.addBookingPointer(dial_turn['dialog_act'])

            single_turn['pointer'] = ','.join(
                [str(d) for d in dbvec + bkvec])
            single_turn['match'] = str(match)
            single_turn['constraint'] = ' '.join(constraints)
            single_turn['cons_delex'] = ' '.join(cons_delex)
            single_turn['sys_act'] = ' '.join(sys_act)
            single_turn['turn_num'] = len(dial['log'])
            single_turn['turn_domain'] = ' '.join(
                ['[' + d + ']' for d in turn_domain])

            prev_turn_domain = copy.deepcopy(turn_domain)
            prev_constraint_dict = copy.deepcopy(constraint_dict)

            # A system turn without a preceding user turn is dropped.
            if 'user' in single_turn:
                dial['log'].append(single_turn)
                for t in (single_turn['user'].split()
                          + single_turn['resp'].split()
                          + constraints + sys_act):
                    self.vocab.add_word(t)
                for t in single_turn['user_delex'].split():
                    if ('[' in t and ']' in t and not t.startswith('[')
                            and not t.endswith(']')):
                        # Placeholder glued to surrounding characters: keep
                        # only the bracketed part.  str.replace returns a
                        # new string, so the result must be stored back
                        # (it used to be discarded, making this a no-op).
                        single_turn['user_delex'] = (
                            single_turn['user_delex'].replace(
                                t, t[t.index('['):t.index(']') + 1]))
                    elif not self.vocab.has_word(t):
                        self.vocab.add_word(t)

            single_turn = {}

        data[fn] = dial

    self.vocab.construct()
    self.vocab.save_vocab('data/multi-woz-processed/vocab')
    with open('data/multi-woz-analysis/dialog_acts.json', 'w') as f:
        json.dump(ordered_sysact_dict, f, indent=2)
    with open('data/multi-woz-analysis/dialog_act_type.json', 'w') as f:
        json.dump(self.unique_da, f, indent=2)
    return data
def get_delex_valdict(self, ):
    """Build the value -> slot lookup tables and save them as JSON.

    Every database entry in ``self.db.dbs`` contributes its values, except
    for the slots listed per domain in ``skip_entry_type``.  For the taxi
    domain only the first DB entry is read and each listed value maps to
    ``'car'``.  A value seen under two different slots is ambiguous and is
    removed from the lookup; ``'cambridge'`` is never reported as ambiguous
    and is left out of both tables.  A few entries are then set by hand.

    The tables are written to ``self.delex_sg_valdict_path`` (single-token
    values), ``self.delex_mt_valdict_path`` (multi-token values) and
    ``self.ambiguous_val_path``.

    Returns:
        tuple: ``(single_token_values, multi_token_values,
        ambiguous_entities)``.  The first two are ``OrderedDict`` objects
        sorted by value length, longest first; the last is a sorted list.

    Raises:
        TypeError: if a non-skipped DB field does not hold a string.
    """
    skip_entry_type = {
        'taxi': ['taxi_phone'],
        'police': ['id'],
        'hospital': ['id'],
        'hotel': [
            'id', 'location', 'internet', 'parking', 'takesbookings',
            'stars', 'price', 'n', 'postcode', 'phone'
        ],
        'attraction': [
            'id', 'location', 'pricerange', 'price', 'openhours',
            'postcode', 'phone'
        ],
        'train': ['price', 'id'],
        'restaurant': [
            'id', 'location', 'introduction', 'signature', 'type',
            'postcode', 'phone'
        ],
    }
    entity_value_to_slot = {}
    ambiguous = set()

    for domain, db_data in self.db.dbs.items():
        print('Processing entity values in [%s]' % domain)
        if domain == 'taxi':
            # taxi db specific: a single entry whose fields are value lists
            for slot, ent_list in db_data[0].items():
                if slot not in skip_entry_type[domain]:
                    for ent in ent_list:
                        entity_value_to_slot[ent] = 'car'
            continue

        for db_entry in db_data:
            for slot, value in db_entry.items():
                if slot in skip_entry_type[domain]:
                    continue
                if not isinstance(value, str):
                    raise TypeError(
                        "value '%s' in domain '%s' should be rechecked" %
                        (slot, domain))
                slot, value = clean_slot_values(domain, slot, value)
                value = ' '.join(
                    [token.text for token in self.nlp(value)]).strip()
                if (value in entity_value_to_slot
                        and entity_value_to_slot[value] != slot):
                    ambiguous.add(value)
                entity_value_to_slot[value] = slot

    # discard() instead of remove(): 'cambridge' is not guaranteed to have
    # been flagged, and remove() would raise KeyError in that case.
    ambiguous.discard('cambridge')
    # Sorted so the saved file and the return value do not depend on set
    # iteration order.
    ambiguous_entities = sorted(ambiguous)
    for amb_ent in ambiguous_entities:
        # departure or destination? arrive time or leave time?
        entity_value_to_slot.pop(amb_ent)

    # Hand-set entries (applied after the ambiguity filtering).
    entity_value_to_slot['parkside'] = 'address'
    entity_value_to_slot['parkside, cambridge'] = 'address'
    entity_value_to_slot['cambridge belfry'] = 'name'
    entity_value_to_slot['hills road'] = 'address'
    entity_value_to_slot['hills rd'] = 'address'
    entity_value_to_slot['Parkside Police Station'] = 'name'

    single_token_values = {}
    multi_token_values = {}
    for val, slt in entity_value_to_slot.items():
        if val == 'cambridge':
            continue
        if len(val.split()) > 1:
            multi_token_values[val] = slt
        else:
            single_token_values[val] = slt

    # Longest values first (stable for equal lengths).
    single_token_values = OrderedDict(
        sorted(single_token_values.items(),
               key=lambda kv: len(kv[0]),
               reverse=True))
    multi_token_values = OrderedDict(
        sorted(multi_token_values.items(),
               key=lambda kv: len(kv[0]),
               reverse=True))

    with open(self.delex_sg_valdict_path, 'w') as f:
        json.dump(single_token_values, f, indent=2)
    print('single delex value dict saved!')
    with open(self.delex_mt_valdict_path, 'w') as f:
        json.dump(multi_token_values, f, indent=2)
    print('multi delex value dict saved!')
    with open(self.ambiguous_val_path, 'w') as f:
        json.dump(ambiguous_entities, f, indent=2)
    print('ambiguous value dict saved!')

    return single_token_values, multi_token_values, ambiguous_entities
def get_db_values(value_set_path):
    """Collect the known values per slot and the belief-span word list.

    Reads ``value_set_path`` and ``db/ontology.json`` (both lower-cased),
    cleans and re-tokenises every value, and writes two files:

    * ``value_set_path`` with ``.json`` replaced by ``_processed.json``:
      domain -> slot -> list of values;
    * ``data/multi-woz-processed/bspn_word_collection.json``: the list of
      domain tokens, slot names and value words that were collected.

    NOTE(review): this file defines ``get_db_values`` a second time further
    down; the later definition replaces this one when the module is loaded.
    """
    processed = {}
    bspn_word = []
    tokenizer = spacy.load('en_core_web_sm')

    # Ontology slot spellings and the names they are stored under.
    ontology_slot_names = {
        'price range': 'pricerange',
        'book stay': 'stay',
        'book day': 'day',
        'book people': 'people',
        'book time': 'time',
        'arrive by': 'arrive',
        'leave at': 'leave',
        'leaveat': 'leave',
    }

    def retokenize(text):
        # Tokenise with spaCy and join the tokens back with single spaces.
        return ' '.join(tok.text for tok in tokenizer(text)).strip()

    def collect_words(text):
        # Record every word of ``text`` that has not been recorded yet.
        for word in text.split():
            if word not in bspn_word:
                bspn_word.append(word)

    with open(value_set_path, 'r') as f:
        value_set = json.loads(f.read().lower())
    with open('db/ontology.json', 'r') as f:
        otlg = json.loads(f.read().lower())

    # Pass 1: domain tokens and informable slot names.
    for domain, slots in value_set.items():
        processed[domain] = {}
        bspn_word.append('[' + domain + ']')
        for raw_slot, _ in slots.items():
            slot_name = ontology.normlize_slot_names.get(raw_slot, raw_slot)
            if slot_name in ontology.informable_slots[domain]:
                bspn_word.append(slot_name)
                processed[domain][slot_name] = []

    # Pass 2: values of the informable slots from the value set.
    for domain, slots in value_set.items():
        for raw_slot, raw_values in slots.items():
            slot_name = ontology.normlize_slot_names.get(raw_slot, raw_slot)
            if slot_name not in ontology.informable_slots[domain]:
                continue
            for raw_value in raw_values:
                _, cleaned = clean_slot_values(domain, raw_slot, raw_value)
                cleaned = retokenize(cleaned)
                processed[domain][slot_name].append(cleaned)
                collect_words(cleaned)

    # Pass 3: slots and values that only the ontology file lists.
    for domain_slot, raw_values in otlg.items():
        domain, slot = domain_slot.split('-')
        if domain == 'bus':
            domain = 'taxi'
        slot = ontology_slot_names.get(slot, slot)
        if slot not in processed[domain]:
            processed[domain][slot] = []
            bspn_word.append(slot)
        for raw_value in raw_values:
            _, cleaned = clean_slot_values(domain, slot, raw_value)
            cleaned = retokenize(cleaned)
            if cleaned not in processed[domain][slot]:
                processed[domain][slot].append(cleaned)
                collect_words(cleaned)

    with open(value_set_path.replace('.json', '_processed.json'), 'w') as f:
        json.dump(processed, f, indent=2)
    with open('data/multi-woz-processed/bspn_word_collection.json', 'w') as f:
        json.dump(bspn_word, f, indent=2)
    print('DB value set processed! ')
def get_db_values(value_set_path):
    """Collect the known values per slot and the belief-span word list.

    Args:
        value_set_path: Path of the value-set JSON file
            (domain -> slot -> list of values).  It is read in lower case.

    ``db/ontology.json`` (keys of the form ``domain-slot``) is read in lower
    case as well.  All values are passed through ``clean_slot_values`` and
    re-tokenised with spaCy.  Two files are written:

    * ``value_set_path`` with ``.json`` replaced by ``_processed.json``:
      domain -> slot -> list of values;
    * ``data/multi-woz-processed/bspn_word_collection.json``: domain tokens
      (``[domain]``), slot names and value words, in the order collected.

    Membership tests use sets kept next to the output lists, so the output
    is the same as with plain list scans but no longer quadratic in the
    number of collected words/values.

    Raises:
        KeyError: if the ontology names a domain missing from the value set.
    """
    processed = {}
    bspn_word = []
    seen_words = set()  # mirrors the content of bspn_word for O(1) lookups
    nlp = spacy.load('en_core_web_sm')

    # Ontology slot spellings and the names they are stored under.
    ontology_slot_names = {
        'price range': 'pricerange',
        'book stay': 'stay',
        'book day': 'day',
        'book people': 'people',
        'book time': 'time',
        'arrive by': 'arrive',
        'leave at': 'leave',
        'leaveat': 'leave',
    }

    def add_token(token):
        # Domain tokens and slot names are appended unconditionally
        # (duplicates are kept, as before).
        bspn_word.append(token)
        seen_words.add(token)

    def add_new_words(text):
        # Words of a value are only appended the first time they are seen.
        for word in text.split():
            if word not in seen_words:
                add_token(word)

    def clean_value(domain, slot, value):
        # Clean the value, then tokenise and re-join with single spaces.
        _, cleaned = clean_slot_values(domain, slot, value)
        return ' '.join([token.text for token in nlp(cleaned)]).strip()

    with open(value_set_path, 'r') as f:
        value_set = json.loads(f.read().lower())
    with open('db/ontology.json', 'r') as f:
        otlg = json.loads(f.read().lower())

    # Pass 1: register domain tokens and informable slot names, and create
    # the value list of every informable slot.
    for domain, slots in value_set.items():
        processed[domain] = {}
        add_token('[' + domain + ']')
        for slot in slots:
            s_p = ontology.normlize_slot_names.get(slot, slot)
            if s_p in ontology.informable_slots[domain]:
                add_token(s_p)
                processed[domain][s_p] = []

    # Pass 2: add the values of informable slots and their words.
    for domain, slots in value_set.items():
        for slot, values in slots.items():
            s_p = ontology.normlize_slot_names.get(slot, slot)
            if s_p not in ontology.informable_slots[domain]:
                continue
            for v in values:
                v_p = clean_value(domain, slot, v)
                processed[domain][s_p].append(v_p)
                add_new_words(v_p)

    # Pass 3: add slots and values that only the ontology file lists.
    for domain_slot, values in otlg.items():
        domain, slot = domain_slot.split('-')
        if domain == 'bus':
            domain = 'taxi'
        slot = ontology_slot_names.get(slot, slot)
        if slot not in processed[domain]:
            processed[domain][slot] = []
            add_token(slot)
        slot_values = processed[domain][slot]
        known_values = set(slot_values)
        for v in values:
            v_p = clean_value(domain, slot, v)
            if v_p not in known_values:
                slot_values.append(v_p)
                known_values.add(v_p)
                add_new_words(v_p)

    with open(value_set_path.replace('.json', '_processed.json'), 'w') as f:
        json.dump(processed, f, indent=2)
    with open('data/multi-woz-processed/bspn_word_collection.json', 'w') as f:
        json.dump(bspn_word, f, indent=2)
    print('DB value set processed! ')