def delexicaliseReferenceNumber(sent, turn):
    """Replace booking values from the belief state with placeholder tokens.

    For each domain whose ``book.booked`` list is non-empty, every value of
    the first booking record (e.g. the reference number that was generated
    randomly during data gathering) is looked up in ``sent`` in three
    spellings -- as is, prefixed with ``#`` and prefixed with ``ref#`` -- and
    replaced by ``[<domain>_<slot>]``.

    Every replacement attempt pads the sentence with one space on each side,
    so the result carries extra surrounding spaces whenever a booking record
    was inspected. When ``turn['metadata']`` is empty or nothing is booked,
    ``sent`` is returned untouched.
    """
    metadata = turn['metadata']
    if not metadata:
        return sent

    # 'police' is not part of the scanned domains
    for domain in ('restaurant', 'hotel', 'attraction', 'train', 'taxi',
                   'hospital'):
        bookings = metadata[domain]['book']['booked']
        if not bookings:
            continue

        record = bookings[0]
        for slot in record:
            token = ' [' + domain + '_' + slot + '] '
            # plain value first, then the "#..." and "ref#..." spellings
            for prefix in ('', '#', 'ref#'):
                raw = prefix + record[slot] if prefix else record[slot]
                needle = ' ' + normalize(raw) + ' '
                sent = (' ' + sent + ' ').replace(needle, token)

    return sent
def queryResult(domain, turn):
    """Count the database rows of ``domain`` that satisfy the belief state.

    The constraints are read from ``turn['metadata'][domain]['semi']``.
    Slots whose value is empty, "not mentioned" or one of the "don't care"
    spellings are skipped. ``leaveAt`` is matched with ``>``, ``arriveBy``
    with ``<`` and every other slot with ``=``.

    Returns the number of rows fetched from ``dbs[domain]``.
    """
    skipped_values = ("", "dont care", 'not mentioned', "don't care",
                      "dontcare", "do n't care")
    # train time slots are range constraints, everything else is an equality
    comparators = {'leaveAt': " > ", 'arriveBy': " < "}

    sql_query = "select * from {}".format(domain)
    is_first = True
    for key, val in turn['metadata'][domain]['semi'].items():
        if val in skipped_values:
            continue

        # single quotes are doubled both before and after normalisation
        escaped = normalize(val.replace("'", "''")).replace("'", "''")

        if is_first:
            lead = " where " + " "
            is_first = False
        else:
            lead = " and "
        sql_query += lead + key + comparators.get(key, "=") + "'" + escaped + "'"

    rows = dbs[domain].execute(sql_query).fetchall()
    return len(rows)
def createDelexData():
    """Delexicalise the whole corpus and write it to disk.

    Calls ``loadData()``, reads ``data/multi-woz/data.json`` and
    ``data/multi-woz/dialogue_acts.json`` and, for every turn of every
    dialogue:

    1) normalises the text,
    2) delexicalises it (slot-value dictionary, booking references, then
       every remaining run of digits becomes ``[value_count]``),
    3) on system turns (odd index) stores the database/booking pointer on
       the preceding turn under ``'db_pointer'``,
    4) passes the dialogue through ``fixDelex``.

    The result is dumped to ``data/multi-woz/delex.json`` and returned as a
    dict keyed by dialogue name.

    NOTE(review): a second ``createDelexData(dialogue)`` is defined later in
    this file and rebinds the name, so this version is shadowed at import
    time. It also treats ``addDBPointer`` as returning a single vector while
    the later definition unpacks three values -- confirm which is current.
    """
    # download the data
    loadData()

    # create dictionary of delexicalised values that we then search against,
    # order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    # context managers so the handles are closed even if parsing fails
    with open('data/multi-woz/data.json', 'r') as fin1:
        data = json.load(fin1)
    with open('data/multi-woz/dialogue_acts.json', 'r') as fin2:
        data2 = json.load(fin2)

    # compiled once; raw string avoids the invalid '\d' escape sequence
    digitpat = re.compile(r'\d+')

    for dialogue_name in tqdm(data):
        dialogue = data[dialogue_name]
        idx_acts = 1

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])
            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # parsing reference number GIVEN belief state
            sent = delexicaliseReferenceNumber(sent, turn)

            # changes to numbers only here
            sent = digitpat.sub('[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)
                # add booking pointer
                pointer_vector = addBookingPointer(dialogue, turn,
                                                   pointer_vector)
                # the pointer is attached to the preceding turn
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

            # FIXING delexicalization:
            dialogue = fixDelex(dialogue_name, dialogue, data2, idx, idx_acts)
            idx_acts += 1

        delex_data[dialogue_name] = dialogue

    with open('data/multi-woz/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data
def createDelexData(dialogue):
    """Delexicalise one in-memory dialogue and regroup it by field.

    ``dialogue['cur']['log']`` is processed turn by turn (and modified in
    place):

    1) the text is normalised,
    2) it is delexicalised (slot-value dictionary, booking references, then
       every remaining run of digits becomes ``[value_count]``),
    3) on system turns (odd index) the database/booking pointer is stored on
       the preceding turn under ``'db_pointer'``.

    The processed dialogue is then handed to ``get_dial``; when that returns
    a non-empty sequence, its items are split into the parallel lists
    ``'usr'``, ``'sys'``, ``'db'`` and ``'bs'`` (items 0..3 of each entry).

    Nothing is written to disk. Returns a dict with the single key ``'cur'``.
    """
    # create dictionary of delexicalised values that we then search against,
    # order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    dial = dialogue['cur']

    # compiled once; raw string avoids the invalid '\d' escape sequence
    digitpat = re.compile(r'\d+')

    for idx, turn in enumerate(dial['log']):
        # normalization, split and delexicalization of the sentence
        sent = normalize(turn['text'])
        sent = delexicalize.delexicalise(' '.join(sent.split()), dic)

        # parsing reference number GIVEN belief state
        sent = delexicaliseReferenceNumber(sent, turn)

        # changes to numbers only here
        sent = digitpat.sub('[value_count]', sent)

        # delexicalized sentence added to the dialogue
        dial['log'][idx]['text'] = sent

        if idx % 2 == 1:  # if it's a system turn
            # add database pointer; only the vector is used here
            pointer_vector, _db_results, _num_entities = addDBPointer(turn)
            # add booking pointer
            pointer_vector = addBookingPointer(dial, turn, pointer_vector)
            # the pointer is attached to the preceding turn
            dial['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

    dial = get_dial(dial)

    # NOTE(review): indentation was lost in the reviewed copy; the final
    # assignment is kept outside the `if`, so a falsy get_dial() result
    # returns the input dialogue under 'cur' -- confirm.
    grouped = dialogue
    if dial:
        grouped = {'usr': [], 'sys': [], 'db': [], 'bs': []}
        for turn in dial:
            grouped['usr'].append(turn[0])
            grouped['sys'].append(turn[1])
            grouped['db'].append(turn[2])
            grouped['bs'].append(turn[3])
    delex_data['cur'] = grouped

    return delex_data
def prepareSlotValuesIndependent():
    """Build the ordered ``(surface form, placeholder)`` delexicalisation list.

    Every ``<domain>_db.json`` under ``../db`` (relative to this file) is
    scanned and its addresses, names, postcodes, phones, train ids and
    departments are mapped to ``[<domain>_<slot>]`` placeholders, together
    with a few spelling variants (road/rd, street/st, b & b/bed and
    breakfast, names without "hotel"/"restaurant"). Hard-coded hospital and
    police entries, train departure/destination places and week days follow.
    The generic ``[value_area]``, ``[value_food]`` and ``[value_pricerange]``
    entries are appended last.

    A database that cannot be read or parsed is skipped (best effort), except
    for ``train_db.json`` in the final pass, whose errors propagate.

    Returns:
        list of ``(text, placeholder)`` tuples in the order described above.
    """
    domains = [
        'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital',
        'police'
    ]
    db_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'db')

    dic = []
    dic_area = []
    dic_food = []
    dic_price = []

    # read databases
    for domain in domains:
        address_tag = '[' + domain + '_' + 'address' + ']'
        name_tag = '[' + domain + '_' + 'name' + ']'
        postcode_tag = '[' + domain + '_' + 'postcode' + ']'
        phone_tag = '[' + domain + '_' + 'phone' + ']'

        try:
            # `with` closes the handle even when json.load fails
            with open(os.path.join(db_dir, domain + '_db.json')) as fin:
                db_json = json.load(fin)

            for ent in db_json:
                for key, val in ent.items():
                    if val == '?' or val == 'free':
                        pass
                    elif key == 'address':
                        dic.append((normalize(val), address_tag))
                        # add one alternative spelling. "street" has to be
                        # tested before "st": every string containing
                        # "street" also contains "st", so the other order
                        # would never reach the street -> st rewrite.
                        if "road" in val:
                            val = val.replace("road", "rd")
                            dic.append((normalize(val), address_tag))
                        elif "rd" in val:
                            val = val.replace("rd", "road")
                            dic.append((normalize(val), address_tag))
                        elif "street" in val:
                            val = val.replace("street", "st")
                            dic.append((normalize(val), address_tag))
                        elif "st" in val:
                            val = val.replace("st", "street")
                            dic.append((normalize(val), address_tag))
                    elif key == 'name':
                        dic.append((normalize(val), name_tag))
                        if "b & b" in val:
                            val = val.replace("b & b", "bed and breakfast")
                            dic.append((normalize(val), name_tag))
                        elif "bed and breakfast" in val:
                            val = val.replace("bed and breakfast", "b & b")
                            dic.append((normalize(val), name_tag))
                        elif "hotel" in val and 'gonville' not in val:
                            val = val.replace("hotel", "")
                            dic.append((normalize(val), name_tag))
                        elif "restaurant" in val:
                            val = val.replace("restaurant", "")
                            dic.append((normalize(val), name_tag))
                    elif key == 'postcode':
                        dic.append((normalize(val), postcode_tag))
                    elif key == 'phone':
                        # phone numbers are stored without normalisation
                        dic.append((val, phone_tag))
                    elif key == 'trainID':
                        dic.append(
                            (normalize(val), '[' + domain + '_' + 'id' + ']'))
                    elif key == 'department':
                        dic.append((normalize(val),
                                    '[' + domain + '_' + 'department' + ']'))

                    # NORMAL DELEX
                    elif key == 'area':
                        dic_area.append(
                            (normalize(val), '[' + 'value' + '_' + 'area' + ']'))
                    elif key == 'food':
                        dic_food.append(
                            (normalize(val), '[' + 'value' + '_' + 'food' + ']'))
                    elif key == 'pricerange':
                        dic_price.append(
                            (normalize(val),
                             '[' + 'value' + '_' + 'pricerange' + ']'))
                    else:
                        pass
                    # TODO car type?
        except Exception:
            # best effort: a missing or unexpected database is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            pass

        if domain == 'hospital':
            dic.append((normalize('Hills Rd'), address_tag))
            dic.append((normalize('Hills Road'), address_tag))
            dic.append((normalize('CB20QQ'), postcode_tag))
            dic.append(('01223245151', phone_tag))
            dic.append(('1223245151', phone_tag))
            dic.append(('0122324515', phone_tag))
            dic.append((normalize('Addenbrookes Hospital'), name_tag))
        elif domain == 'police':
            dic.append((normalize('Parkside'), address_tag))
            dic.append((normalize('CB11JG'), postcode_tag))
            dic.append(('01223358966', phone_tag))
            dic.append(('1223358966', phone_tag))
            dic.append((normalize('Parkside Police Station'), name_tag))

    # add at the end places from trains
    with open(os.path.join(db_dir, 'train_db.json'), 'r') as fin:
        db_json = json.load(fin)

    place_tag = '[' + 'value' + '_' + 'place' + ']'
    for ent in db_json:
        for key, val in ent.items():
            if key == 'departure' or key == 'destination':
                dic.append((normalize(val), place_tag))

    # add specific values:
    day_tag = '[' + 'value' + '_' + 'day' + ']'
    for day in [
            'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
            'saturday', 'sunday'
    ]:
        dic.append((normalize(day), day_tag))

    # more general values add at the end
    dic.extend(dic_area)
    dic.extend(dic_food)
    dic.extend(dic_price)

    return dic
def _venueConstraint(key, val):
    """Return the SQL condition ``<key> <op> '<val>'`` for one slot."""
    # single quotes are doubled both before and after normalisation
    val2 = val.replace("'", "''")
    val2 = normalize(val2)
    val2 = val2.replace("'", "''")

    # train time slots are range constraints, everything else is an equality
    if key == 'leaveAt':
        return key + " > " + "'" + val2 + "'"
    if key == 'arriveBy':
        return key + " < " + "'" + val2 + "'"
    return key + "=" + "'" + val2 + "'"


def queryResultVenues(domain, turn, real_belief=False):
    """Fetch the rows of ``domain`` that satisfy the given belief state.

    Args:
        domain: table name, also the key into ``dbs``.
        turn: where the constraints come from, depending on ``real_belief``:
            * ``True`` -- ``turn`` itself is a mapping slot -> value;
            * ``'tracking'`` -- ``turn[domain]`` is a sequence whose items
              carry a ``"<x>-<slot>-<value>"`` string at index 0;
            * anything else -- ``turn['metadata'][domain]['semi']``.
        real_belief: selects the input format, see above.

    Returns:
        The list returned by ``fetchall()``. In ``'tracking'`` mode a failing
        query yields ``[]``; in the other modes the exception propagates.
    """
    # query the db
    sql_query = "select * from {}".format(domain)

    if real_belief == 'tracking':
        renamed = {
            "price range": "pricerange",
            "leave at": "leaveAt",
            "arrive by": "arriveBy",
        }
        conditions = []
        for slot in turn[domain]:
            fields = slot[0].split("-")
            key = renamed.get(fields[1], fields[1])
            val = fields[2]
            if val == "do n't care":
                continue
            conditions.append(_venueConstraint(key, val))

        if conditions:
            sql_query += " where " + " and ".join(conditions)

        # The query runs once, after every tracked slot has been added, so
        # that all constraints apply and an empty slot list is handled.
        try:  # "select * from attraction where name = 'queens college'"
            return dbs[domain].execute(sql_query).fetchall()
        except Exception:
            return []  # TODO test it

    # `== True` (not `is True`) is kept so truthy-equal flags such as 1
    # still select this branch
    if real_belief == True:
        items = turn.items()
    else:
        items = turn['metadata'][domain]['semi'].items()

    skipped_values = ("", "dontcare", 'not mentioned', "don't care",
                      "dont care", "do n't care")
    conditions = [
        _venueConstraint(key, val) for key, val in items
        if val not in skipped_values
    ]
    if conditions:
        sql_query += " where " + " and ".join(conditions)

    # "select * from attraction where name = 'queens college'"
    # database errors are deliberately left to propagate to the caller
    return dbs[domain].execute(sql_query).fetchall()