def delexicaliseReferenceNumber(sent, turn):
    """Based on the belief state, find the reference number that was
    created randomly during data gathering."""
    domains = ['restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital']  # , 'police']
    if turn['metadata']:
        for domain in domains:
            if turn['metadata'][domain]['book']['booked']:
                for slot in turn['metadata'][domain]['book']['booked'][0]:
                    if slot == 'reference':
                        val = '[' + domain + '_' + slot + ']'
                    else:
                        val = '[' + domain + '_' + slot + ']'
                    key = normalize(turn['metadata'][domain]['book']['booked'][0][slot])
                    sent = (' ' + sent + ' ').replace(' ' + key + ' ', ' ' + val + ' ')

                    # try reference with hashtag
                    key = normalize("#" + turn['metadata'][domain]['book']['booked'][0][slot])
                    sent = (' ' + sent + ' ').replace(' ' + key + ' ', ' ' + val + ' ')

                    # try reference with ref#
                    key = normalize("ref#" + turn['metadata'][domain]['book']['booked'][0][slot])
                    sent = (' ' + sent + ' ').replace(' ' + key + ' ', ' ' + val + ' ')
    return sent

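# A minimal standalone sketch (not part of the original module; sentence, reference
# value and placeholder below are made up) of the padded-space replacement trick used
# above: wrapping both the sentence and the search key in spaces means only whole-token
# matches get delexicalised, while the '#' and 'ref#' variants catch common surface forms.
def _demo_reference_delex():
    sent = 'your booking was successful , reference number is ab12cd34 .'
    ref = 'ab12cd34'
    val = '[hotel_reference]'
    for key in (ref, '#' + ref, 'ref#' + ref):
        sent = (' ' + sent + ' ').replace(' ' + key + ' ', ' ' + val + ' ').strip()
    return sent  # -> '... reference number is [hotel_reference] .'
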
def get_goal(did, dialogue_goal, dsv_dict):
    goal = {}
    booking_goal = {}
    for domain in DOMAINS:
        domain_goal = dialogue_goal[domain]
        domain_sv = dsv_dict[domain]
        if len(domain_goal) == 0:
            continue
        goal[domain] = {}
        booking_goal[domain] = {}

        for k, v in domain_goal.items():
            if k not in GOAL_TYPES:
                if k in DOMAINS:
                    if v != False:
                        goal[domain]['name'] = v
                else:
                    print("ID: {} Odd goal type in domain goal {} vs. {}".format(
                        did, domain_goal, GOAL_TYPES))

        for k, v in domain_goal['info'].items():
            if k in domain_sv.keys() or domain == 'taxi':
                goal[domain][k] = normalize(v)
            else:
                print("ID: {} DB-goal unmatched slot type in domain info goal {} vs. {}"
                      .format(did, domain_goal['info'], domain_sv.keys()))

        for k, v in domain_goal['fail_info'].items():
            if k in domain_sv.keys() or domain == 'taxi':
                goal[domain]['fail_' + k] = normalize(v)
            else:
                print("ID: {} DB-goal unmatched slot type in domain fail-info goal {} vs. {}"
                      .format(did, domain_goal['fail_info'], domain_sv.keys()))

        if 'reqt' in domain_goal:
            goal[domain]['request'] = []
            for k in domain_goal['reqt']:
                if k in domain_sv.keys() or domain == 'taxi':
                    if ' ' in k:  # phone number, entrance fee
                        k = k.replace(' ', '')
                    goal[domain]['request'].append(k)
                else:
                    print("ID: {} DB-goal unmatched slot type in domain reqt goal {} vs. {}"
                          .format(did, domain_goal['reqt'], domain_sv.keys()))

        if 'book' in domain_goal:
            for k, v in domain_goal['book'].items():
                if k in ['pre_invalid', 'invalid']:
                    continue
                booking_goal[domain][k] = normalize(v)

        if 'fail_book' in domain_goal:
            for k, v in domain_goal['fail_book'].items():
                if k in ['pre_invalid', 'invalid']:
                    continue
                booking_goal[domain]['fail_' + k] = normalize(v)

    return goal, booking_goal

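# A minimal sketch (invented values, not taken from the corpus) of the per-domain goal
# structure get_goal() consumes; the keys it reads above are 'info', 'fail_info',
# 'reqt', 'book' and 'fail_book'.
_example_domain_goal = {
    'info': {'food': 'chinese', 'area': 'centre'},
    'fail_info': {},
    'reqt': ['phone', 'address'],
    'book': {'people': '4', 'day': 'saturday', 'time': '18:00'},
    'fail_book': {},
}
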
def get_global_entity():
    hospital_db = load_json('data/WOZ/db/hospital_db.json')
    attraction_db = load_json('data/WOZ/db/attraction_db.json')
    hotel_db = load_json('data/WOZ/db/hotel_db.json')
    police_db = load_json('data/WOZ/db/police_db.json')
    restaurant_db = load_json('data/WOZ/db/restaurant_db.json')
    taxi_db = load_json('data/WOZ/db/taxi_db.json')
    train_db = load_json('data/WOZ/db/train_db.json')
    db_list = [
        hospital_db, attraction_db, hotel_db, police_db, restaurant_db,
        taxi_db, train_db
    ]
    types = [
        "address", "area", "name", "phone", "postcode", "type", "department",
        "pricerange", "stars", "food", "taxi_colors", "taxi_types", "arriveBy",
        "day", "departure", "destination", "duration", "leaveAt", "price",
        "trainID"
    ]
    global_entity = set()
    for db in db_list:
        for elem in db:
            for ty in types:
                if ty in elem:
                    global_entity.add(normalize(str(elem[ty])).replace(" ", "_"))
    global_entity.remove("?")
    return global_entity

def queryResult(domain, turn):
    """Returns the number of entities for a given domain based on
    the annotation of the belief state."""
    # query the db
    sql_query = "select * from {}".format(domain)

    flag = True
    # print turn['metadata'][domain]['semi']
    for key, val in turn['metadata'][domain]['semi'].items():
        if val == "" or val == "dont care" or val == 'not mentioned' or val == "don't care" or val == "dontcare" or val == "do n't care":
            pass
        else:
            if flag:
                sql_query += " where "
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                val2 = val2.replace("'", "''")
                # change query for trains
                if key == 'leaveAt':
                    sql_query += r" " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                flag = False
            else:
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                val2 = val2.replace("'", "''")
                if key == 'leaveAt':
                    sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" and " + key + "=" + r"'" + val2 + r"'"

    # try:  # "select * from attraction where name = 'queens college'"
    # print sql_query
    # print domain
    num_entities = len(dbs[domain].execute(sql_query).fetchall())

    return num_entities

def queryResultVenues(self, domain, turn, real_belief=False):
    # query the db
    sql_query = "select * from {}".format(domain)

    if real_belief == True:
        items = turn.items()
    else:
        items = turn['metadata'][domain]['semi'].items()

    flag = True
    for key, val in items:
        if val == "" or val == "dontcare" or val == 'not mentioned' or val == "don't care" or val == "dont care" or val == "do n't care":
            pass
        else:
            if flag:
                sql_query += " where "
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                if key == 'leaveAt':
                    sql_query += r" " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                flag = False
            else:
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                if key == 'leaveAt':
                    sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" and " + key + "=" + r"'" + val2 + r"'"

    try:  # "select * from attraction where name = 'queens college'"
        return self.dbs[domain].execute(sql_query).fetchall()
    except:
        return []  # TODO test it

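# A minimal standalone sketch (in-memory table and values are hypothetical, not the
# project's db files) of the query construction that queryResult / queryResultVenues
# perform: every non-"dontcare" belief-state slot becomes a WHERE condition, and
# leaveAt / arriveBy are compared as ranges rather than by equality.
import sqlite3

def _demo_belief_query():
    conn = sqlite3.connect(':memory:')
    conn.execute("create table train (trainID text, destination text, leaveAt text)")
    conn.execute("insert into train values ('tr1234', 'cambridge', '09:30')")
    belief = {'destination': 'cambridge', 'leaveAt': '09:00'}
    sql = "select * from train"
    conditions = []
    for key, val in belief.items():
        op = '>' if key == 'leaveAt' else ('<' if key == 'arriveBy' else '=')
        conditions.append("{} {} '{}'".format(key, op, val.replace("'", "''")))
    if conditions:
        sql += " where " + " and ".join(conditions)
    return conn.execute(sql).fetchall()  # -> [('tr1234', 'cambridge', '09:30')]
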
def createDelexData():
    """Main function of the script - loads the delexicalisation dictionary,
    goes through each dialogue and does:
    1) data normalization
    2) delexicalization
    3) addition of database pointer
    4) saves the delexicalized data
    """
    # create a dictionary of delexicalised values that we will then search against; order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    fin1 = open('data/woz2/data.json')
    data = json.load(fin1)

    for dialogue_name in tqdm(data):
        if 'WOZ' not in dialogue_name:
            continue
        dialogue = data[dialogue_name]
        # print dialogue_name

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])

            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # change remaining digits only here
            digitpat = re.compile(r'\d+')
            sent = re.sub(digitpat, '[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)
                # print pointer_vector
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

        delex_data[dialogue_name] = dialogue

    with open('data/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data

def get_state(did, tid, utt_state, dsv_dict):
    state = {}
    unnormalized_state = {}
    booking_state = {}
    for domain in DOMAINS:
        domain_state = utt_state[domain]
        domain_sv = dsv_dict[domain]

        b_state = domain_state['book']
        for k, v in b_state.items():
            if len(v) > 0:
                if domain not in booking_state:
                    booking_state[domain] = {}
                if k == 'booked':
                    bookings = []
                    for b in v:
                        booking = {}
                        for b_k, b_v in b.items():
                            booking[b_k] = normalize(b_v)
                        bookings.append(booking)
                    booking_state[domain][k] = bookings
                    continue
                booking_state[domain][k] = normalize(v)

        info_state = domain_state['semi']
        for k, v in info_state.items():
            if len(v) == 0 or v == 'not mentioned' or v in DONTCARE:
                continue
            if domain not in state:
                state[domain] = {}
                unnormalized_state[domain] = {}
            if k in domain_sv.keys() or domain == 'taxi':
                state[domain][k] = normalize(v)
                unnormalized_state[domain][k] = v
            else:
                print("ID: {} Turn: {} DB-State unmatched slot type in domain info state {} vs. {}"
                      .format(did, tid, info_state, domain_sv.keys()))

    return state, booking_state, unnormalized_state

def createDelexData(sent, sent_act, bs, dic, turn, option):
    # normalization, split and delexicalization of the sentence
    sent = normalize(sent)

    words = sent.split()
    sent = delexicalize.delexicalise(' '.join(words), dic)

    # parsing reference number GIVEN belief state
    sent = delexicaliseReferenceNumber(sent, turn)

    # change remaining digits only here
    digitpat = re.compile(r'\d+')
    sent = re.sub(digitpat, '[value_count]', sent)

    if option == 'user':
        sent = fixDelex(sent, None, bs)
    if option == 'sys':
        sent = fixDelex(sent, sent_act, None)

    return sent.strip()

def create_autotag_graph(
        corpus,
        concept_labels,
        quiet,
        language,
        semantic_threshold,
        min_concepts,
):
    progress_print(quiet, 'Loading dictionary used to find similar words.')
    progress_print(quiet, 'This is known to take 7 minutes or more, hang on…')
    comparator = word_similarity.get_comparator(language)
    comparator.ensure_loaded_corpus()
    progress_print(quiet, 'Done loading dictionary.')

    similarity_graph = create_bound_graph()
    for dataset, text in clint.textui.progress.bar(
            corpus.items(),
            'Associating datasets to concepts',
            hide=quiet or None,
            expected_size=len(corpus),
    ):
        tokens = word_tokenize(text)
        tokens = tuple(normalize(remove_stopwords_nb(tokens)))

        # Compute the relevance for each concept
        scorelist = []
        for concept, labels in concept_labels.items():
            scores = tuple(
                map(lambda token: compute_score(token, labels, comparator), tokens))
            score = max(scores, default=0.0)
            scorelist.append((concept, score))

        sorted_scores = sorted(scorelist, key=lambda x: x[1], reverse=True)
        filtered_score = [(c, s) for c, s in sorted_scores if s >= semantic_threshold]
        if len(filtered_score) < min_concepts:
            filtered_score = sorted_scores[:min_concepts]

        # Add the scores to a graph
        for concept, score in filtered_score:
            add_similarity_link(similarity_graph, dataset, concept, score)

    return similarity_graph

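# A minimal standalone sketch (toy scores, hypothetical concept names) of the concept
# selection rule used above: keep every concept scoring at or above the semantic
# threshold, and fall back to the top min_concepts concepts when too few pass.
def _demo_select_concepts(scorelist, semantic_threshold, min_concepts):
    sorted_scores = sorted(scorelist, key=lambda x: x[1], reverse=True)
    kept = [(c, s) for c, s in sorted_scores if s >= semantic_threshold]
    if len(kept) < min_concepts:
        kept = sorted_scores[:min_concepts]
    return kept

# _demo_select_concepts([('health', 0.9), ('transport', 0.4), ('culture', 0.1)],
#                       semantic_threshold=0.5, min_concepts=2)
# -> [('health', 0.9), ('transport', 0.4)]
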
def get_act(did, tid, acts):
    output_act = {}
    if did[:-5] not in acts:
        return None, output_act
    if str(tid) not in acts[did[:-5]]:
        return None, output_act
    sys_act = acts[did[:-5]][str(tid)]
    if sys_act == 'No Annotation':
        return None, output_act

    for domain_act, slots in sys_act.items():
        domain, act = domain_act.split('-')
        domain = domain.lower()
        act = act.lower()
        if domain not in output_act:
            output_act[domain] = {}

        if domain == 'general':
            output_act[domain][act] = ''
            continue

        if domain == 'booking':
            if act == 'request':
                output_act[domain][act] = []
                for slot in slots:
                    output_act[domain][act].append(slot[0].lower())
            else:
                output_act[domain][act] = {}
                for slot in slots:
                    if slot[0] == 'none':
                        continue
                    output_act[domain][act][slot[0].lower()] = slot[1].lower()
            continue

        if act == 'request':
            output_act[domain][act] = []
            for slot in slots:
                output_act[domain][act].append(slot[0].lower())
        if act == 'inform':
            output_act[domain][act] = {}
            for slot in slots:
                output_act[domain][act][slot[0].lower()] = normalize(slot[1])

    return sys_act, output_act

def get_entity_by_domain():
    hospital_db = load_json('data/WOZ/db/hospital_db.json')
    attraction_db = load_json('data/WOZ/db/attraction_db.json')
    hotel_db = load_json('data/WOZ/db/hotel_db.json')
    police_db = load_json('data/WOZ/db/police_db.json')
    restaurant_db = load_json('data/WOZ/db/restaurant_db.json')
    taxi_db = load_json('data/WOZ/db/taxi_db.json')
    train_db = load_json('data/WOZ/db/train_db.json')
    db_list = {
        "hospital": hospital_db,
        "attraction": attraction_db,
        "hotel": hotel_db,
        "police": police_db,
        "restaurant": restaurant_db,
        "taxi": taxi_db,
        "train": train_db
    }
    types = [
        "address", "area", "name", "phone", "postcode", "type", "department",
        "pricerange", "stars", "food", "taxi_colors", "taxi_types", "arriveBy",
        "day", "departure", "destination", "duration", "leaveAt", "price",
        "trainID"
    ]
    entity_by_domain = {}
    for domain, db in db_list.items():
        global_entity = set()
        for elem in db:
            for ty in types:
                if ty in elem:
                    global_entity.add(normalize(str(elem[ty])).replace(" ", "_"))
        if "?" in global_entity:
            global_entity.remove("?")
        entity_by_domain[domain] = list(global_entity)
    return entity_by_domain

def createDelexData():
    """Main function of the script - loads the delexicalisation dictionary,
    goes through each dialogue and does:
    1) data normalization
    2) delexicalization
    3) addition of database pointer
    4) saves the delexicalized data
    """
    # download the data
    loadData()

    # create a dictionary of delexicalised values that we will then search against; order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    with open('data/multi-woz/data.json') as fin1:
        data = json.load(fin1)

    with open('data/multi-woz/dialogue_acts.json') as fin2:
        data2 = json.load(fin2)

    cnt = 10
    for dialogue_name in tqdm(data):
        dialogue = data[dialogue_name]
        # print(dialogue_name)

        idx_acts = 1
        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])

            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # parsing reference number GIVEN belief state
            sent = delexicaliseReferenceNumber(sent, turn)

            # change remaining digits only here
            digitpat = re.compile(r'\d+')
            sent = re.sub(digitpat, '[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)
                # add booking pointer
                pointer_vector = addBookingPointer(dialogue, turn, pointer_vector)
                # print(pointer_vector)
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

            # FIXING delexicalization:
            dialogue = fixDelex(dialogue_name, dialogue, data2, idx, idx_acts)
            idx_acts += 1

        delex_data[dialogue_name] = dialogue

    with open('data/multi-woz/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data

def prepareSlotValuesIndependent():
    domains = [
        'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital',
        'police'
    ]
    requestables = ['phone', 'address', 'postcode', 'reference', 'id']
    dic = []
    dic_area = []
    dic_food = []
    dic_price = []
    mdit = {}

    # read databases
    for domain in domains:
        # try:
        fin = open('db/' + domain + '_db.json')
        db_json = json.load(fin)
        fin.close()

        for ent in db_json:
            for key, val in ent.items():
                if val == '?' or val == 'free':
                    pass
                elif key == 'address':
                    dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                    if "road" in val:
                        val = val.replace("road", "rd")
                        dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                    elif "rd" in val:
                        val = val.replace("rd", "road")
                        dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                    elif "st" in val:
                        val = val.replace("st", "street")
                        dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                    elif "street" in val:
                        val = val.replace("street", "st")
                        dic.append((normalize(val), '[' + domain + '_' + 'address' + ']'))
                elif key == 'name':
                    dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                    if "b & b" in val:
                        val = val.replace("b & b", "bed and breakfast")
                        dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                    elif "bed and breakfast" in val:
                        val = val.replace("bed and breakfast", "b & b")
                        dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                    elif "hotel" in val and 'gonville' not in val:
                        val = val.replace("hotel", "")
                        dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                    elif "restaurant" in val:
                        val = val.replace("restaurant", "")
                        dic.append((normalize(val), '[' + domain + '_' + 'name' + ']'))
                elif key == 'postcode':
                    dic.append((normalize(val), '[' + domain + '_' + 'postcode' + ']'))
                elif key == 'phone':
                    dic.append((val, '[' + domain + '_' + 'phone' + ']'))
                elif key == 'trainID':
                    dic.append((normalize(val), '[' + domain + '_' + 'id' + ']'))
                elif key == 'department':
                    dic.append((normalize(val), '[' + domain + '_' + 'department' + ']'))
                # NORMAL DELEX
                elif key == 'area':
                    dic_area.append((normalize(val), '[' + 'value' + '_' + 'area' + ']'))
                elif key == 'food':
                    dic_food.append((normalize(val), '[' + 'value' + '_' + 'food' + ']'))
                elif key == 'pricerange':
                    dic_price.append((normalize(val), '[' + 'value' + '_' + 'pricerange' + ']'))
                else:
                    pass  # TODO car type?

                # multiple domain disambiguation
                if (normalize(val) in mdit and domain != mdit[normalize(val)]
                        and key not in ['area', 'food', 'pricerange', 'id']
                        and 'value' not in normalize(val)):
                    print(key, '\\', normalize(val), '\\', mdit[normalize(val)], '\\', domain)
                else:
                    mdit[normalize(val)] = domain
        # except:
        #     pass

        if domain == 'hospital':
            dic.append((normalize('Hills Rd'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('Hills Road'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('CB20QQ'), '[' + domain + '_' + 'postcode' + ']'))
            dic.append(('01223245151', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('1223245151', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('0122324515', '[' + domain + '_' + 'phone' + ']'))
            dic.append((normalize('Addenbrookes Hospital'), '[' + domain + '_' + 'name' + ']'))
        elif domain == 'police':
            dic.append((normalize('Parkside'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('CB11JG'), '[' + domain + '_' + 'postcode' + ']'))
            dic.append(('01223358966', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('1223358966', '[' + domain + '_' + 'phone' + ']'))
            dic.append((normalize('Parkside Police Station'), '[' + domain + '_' + 'name' + ']'))

    # add at the end places from trains
    fin = open('db/' + 'train' + '_db.json')
    db_json = json.load(fin)
    fin.close()

    for ent in db_json:
        for key, val in ent.items():
            if key == 'departure' or key == 'destination':
                dic.append((normalize(val), '[' + 'value' + '_' + 'place' + ']'))

    # add specific values:
    for key in [
            'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
            'saturday', 'sunday'
    ]:
        dic.append((normalize(key), '[' + 'value' + '_' + 'day' + ']'))

    # more general values are added at the end
    dic.extend(dic_area)
    dic.extend(dic_food)
    dic.extend(dic_price)

    return dic

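# The module's delexicalise() itself is not shown in this section; below is a minimal
# standalone sketch (an assumption, not the original implementation) of how a list of
# (surface value, placeholder) pairs like the one returned above is typically applied:
# entries are tried in order, so more specific values must come first, which is why the
# generic area/food/pricerange pairs are appended last.
def _demo_apply_dic(sent, dic):
    sent = ' ' + sent + ' '
    for value, placeholder in dic:
        sent = sent.replace(' ' + value + ' ', ' ' + placeholder + ' ')
    return sent.strip()

# _demo_apply_dic('i want a cheap restaurant in the centre',
#                 [('the centre', '[value_area]'), ('cheap', '[value_pricerange]'),
#                  ('centre', '[value_area]')])
# -> 'i want a [value_pricerange] restaurant in [value_area]'
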
def queryResultVenues(domain, turn, real_belief=False):
    # query the db
    sql_query = "select * from {}".format(domain)

    flag = True
    if real_belief == True:
        items = turn.items()
    elif real_belief == 'tracking':
        for slot in turn[domain]:
            key = slot[0].split("-")[1]
            val = slot[0].split("-")[2]
            if key == "price range":
                key = "pricerange"
            elif key == "leave at":
                key = "leaveAt"
            elif key == "arrive by":
                key = "arriveBy"
            if val == "do n't care":
                pass
            else:
                if flag:
                    sql_query += " where "
                    val2 = val.replace("'", "''")
                    val2 = normalize(val2)
                    val2 = val2.replace("'", "''")
                    if key == 'leaveAt':
                        sql_query += key + " > " + r"'" + val2 + r"'"
                    elif key == 'arriveBy':
                        sql_query += key + " < " + r"'" + val2 + r"'"
                    else:
                        sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                    flag = False
                else:
                    val2 = val.replace("'", "''")
                    val2 = normalize(val2)
                    val2 = val2.replace("'", "''")
                    if key == 'leaveAt':
                        sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                    elif key == 'arriveBy':
                        sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                    else:
                        sql_query += r" and " + key + "=" + r"'" + val2 + r"'"

        try:  # "select * from attraction where name = 'queens college'"
            return dbs[domain].execute(sql_query).fetchall()
        except:
            return []  # TODO test it
        pass
    else:
        items = turn['metadata'][domain]['semi'].items()

    flag = True
    for key, val in items:
        if val == "" or val == "dontcare" or val == 'not mentioned' or val == "don't care" or val == "dont care" or val == "do n't care":
            pass
        else:
            if flag:
                sql_query += " where "
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                val2 = val2.replace("'", "''")
                if key == 'leaveAt':
                    sql_query += r" " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                flag = False
            else:
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                val2 = val2.replace("'", "''")
                if key == 'leaveAt':
                    sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" and " + key + "=" + r"'" + val2 + r"'"

    try:  # "select * from attraction where name = 'queens college'"
        return dbs[domain].execute(sql_query).fetchall()
    except:
        raise
        return []  # TODO test it

def queryResultVenues(self, domain, turn, bs=None, real_belief=False):
    # query the db
    sql_query = "select * from {}".format(domain)
    # import pdb
    # pdb.set_trace()
    if real_belief == True:
        items = turn.items()
    else:
        items = turn['metadata'][domain]['semi'].items()

    # if bs is None:
    #     return []
    if bs is not None:
        items = bs.items()
        # # print(bs, turn.items())
        if len(items) == 0:
            return []
        # import pdb
        # pdb.set_trace()
    # else:
    #     items = []

    # if bs['domain'] == domain:
    #     items = bs.items()
    #     bs['domain'] = ''
    # else:
    #     items = []

    # items_ = bs.items()
    # items_remains = {}
    # items_all = dict(items)
    # for k, v in items_:
    #     # try:
    #     #     items_remains[k] = items_all[k]
    #     # except Exception:
    #     #     continue
    #     if k in items_all.keys():
    #         # items_remains[k] = items_all[k]
    #         items_remains[k] = v
    # items = items_remains.items()
    # # print(items)
    # items = items_
    # import pdb

    flag = True
    for key, val in items:
        if val == "" or val == "dontcare" or val == 'not mentioned' or val == "don't care" or val == "dont care" or val == "do n't care" or val == "none":
            pass
        else:
            if flag:
                sql_query += " where "
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                if key.lower() == 'leaveAt'.lower():
                    sql_query += r" " + key + " > " + r"'" + val2 + r"'"
                elif key.lower() == 'arriveBy'.lower():
                    sql_query += r" " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                flag = False
            else:
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                if key.lower() == 'leaveAt'.lower():
                    sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                elif key.lower() == 'arriveBy'.lower():
                    sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" and " + key + "=" + r"'" + val2 + r"'"

    try:  # "select * from attraction where name = 'queens college'"
        # print(sql_query)
        return self.dbs[domain].execute(sql_query).fetchall()
    except:
        return []  # TODO test it

def loadVocab(self):
    # iterate through the dialogues and build the vocabularies
    self.inputvocab = ['[VALUE_DONTCARE]', '[VALUE_COUNT]']
    self.outputvocab = ['[VALUE_DONTCARE]', '[VALUE_COUNT]']
    self.vocab = []

    # init inputvocab with informable values
    for s, vs in self.s2v['informable'].iteritems():
        for v in vs:
            if v == 'none':
                continue
            self.inputvocab.extend(v.split())
        self.inputvocab.extend(['[SLOT_' + s.upper() + ']', '[VALUE_' + s.upper() + ']'])
        self.outputvocab.extend(['[SLOT_' + s.upper() + ']', '[VALUE_' + s.upper() + ']'])

    # init inputvocab with requestable values
    for s, vs in self.s2v['requestable'].iteritems():
        for v in vs:
            if v == 'none':
                continue
            self.inputvocab.extend(v.split())
        self.inputvocab.extend(['[SLOT_' + s.upper() + ']', '[VALUE_' + s.upper() + ']'])
        self.outputvocab.extend(['[SLOT_' + s.upper() + ']', '[VALUE_' + s.upper() + ']'])

    # add every word in semidict into vocab
    for s in self.semidict.keys():
        for v in self.semidict[s]:
            self.inputvocab.extend(v.split())

    # for grouping sentences
    sentKeys = {}
    self.sentGroup = []

    # lemmatizer
    lmtzr = WordNetLemmatizer()

    # form lexicon
    ivocab = []
    ovocab = []
    for i in range(len(self.dialog)):
        print '\tsetting up vocab, finishing ... %.2f%%\r' %\
            (100.0*float(i)/float(len(self.dialog))),
        sys.stdout.flush()

        # parsing dialog
        for j in range(len(self.dialog[i]['dial'])):
            # text normalisation
            self.dialog[i]['dial'][j]['sys']['sent'] = normalize(
                self.dialog[i]['dial'][j]['sys']['sent'])
            self.dialog[i]['dial'][j]['usr']['transcript'] = normalize(
                self.dialog[i]['dial'][j]['usr']['transcript'])

            # this turn
            turn = self.dialog[i]['dial'][j]

            # system side
            words, _, _, _, _ = self.extractSeq(turn['sys']['sent'],
                                                type='target', index=False)
            ovocab.extend(words)

            # sentence group key
            key = tuple(set(sorted([
                lmtzr.lemmatize(w) for w in words if w not in self.stopwords
            ])))
            if key in sentKeys:
                sentKeys[key][1] += 1
                self.sentGroup.append(sentKeys[key][0])
            else:
                sentKeys[key] = [len(sentKeys), 1]
                self.sentGroup.append(sentKeys[key][0])

            # user side
            words = self.delexicalise(turn['usr']['transcript']).split()
            mwords, words, _, _, _ = self.extractSeq(turn['usr']['transcript'],
                                                     type='source', index=False)
            ivocab.extend(mwords)
            ivocab.extend(words)
            """
            for hyp in t['usr']['asr']:
                words = self.delexicalise(normalize(hyp['asr-hyp'])).split()
                ivocab.extend(words)
            """

    # re-assigning sentence groups w.r.t. their frequency
    mapping = {}
    idx = 0
    cnt = 0
    for key, val in sorted(sentKeys.iteritems(), key=lambda x: x[1][1], reverse=True):
        mapping[val[0]] = idx
        # print idx, val[1], key
        if idx < self.dl - 1:
            cnt += val[1]
        idx += 1
    # raw_input()
    print '\tsemi-supervised action examples: %2.2f%%' % \
        (float(cnt)/float(len(self.sentGroup))*100)
    for i in range(len(self.sentGroup)):
        self.sentGroup[i] = min(mapping[self.sentGroup[i]], self.dl - 1)

    # set threshold for input vocab
    counts = dict()
    for w in ivocab:
        counts[w] = counts.get(w, 0) + 1
    self.inputvocab = ['<unk>', '</s>', '<slot>', '<value>'] + \
        sorted(list(set(self.inputvocab +
            [w for w, c in sorted(counts.iteritems(), key=operator.itemgetter(1)) if c > 1])))

    # set threshold for output vocab
    counts = dict()
    for w in ovocab:
        counts[w] = counts.get(w, 0) + 1
    self.outputvocab = ['<unk>', '</s>'] + \
        sorted(list(set(self.outputvocab + ['thank', 'you', 'goodbye'] +
            [w for w, c in sorted(counts.iteritems(), key=operator.itemgetter(1))])))

    # the whole vocab
    self.vocab = ['<unk>', '</s>', '<slot>', '<value>'] + \
        list(set(self.inputvocab[4:]).union(self.outputvocab[2:]))

    # create snapshot dimension
    self.snapshots = ['OFFERED', 'CHANGED']
    for w in self.outputvocab:
        if w.startswith('[VALUE'):
            self.snapshots.append(w)
    self.snapshots = sorted(self.snapshots)

def prepareSlotValues(self):
    """
    Add the requestable values whose keys already exist in the KB into s2v.
    :return:
    """
    print '\tprepare slot value templates ...'
    # put db requestable values into s2v
    for e in self.db:
        for s, v in e.iteritems():
            if self.s2v['requestable'].has_key(s):
                self.s2v['requestable'][s].append(v.lower())
            if self.s2v['other'].has_key(s):
                self.s2v['other'][s].append(v.lower())

    # sort values
    for s, vs in self.s2v['informable'].iteritems():
        self.s2v['informable'][s] = sorted(list(set(vs)))
    for s, vs in self.s2v['requestable'].iteritems():
        self.s2v['requestable'][s] = sorted(list(set(vs)))
    for s, vs in self.s2v['other'].iteritems():
        self.s2v['other'][s] = sorted(list(set(vs)))

    # make a 1-on-1 mapping for delexicalisation, extracted from the semidict
    self.supervalues = []  # the canonical, most abstract value, e.g. food
    self.values = []       # surface forms: type of food, cuisine, ...
    self.slots = []        # placeholders, e.g. SLOT_FOOD
    for s, vs in self.s2v['informable'].iteritems():
        # adding slot delexicalisation
        self.supervalues.extend([s for x in self.semidict[s]])
        self.values.extend([normalize(x) for x in self.semidict[s]])
        self.slots.extend(['[SLOT_' + s.upper() + ']' for x in self.semidict[s]])
        # adding value delexicalisation
        for v in vs:
            self.supervalues.extend([v for x in self.semidict[v]])
            self.values.extend([normalize(x) for x in self.semidict[v]])
            self.slots.extend(['[VALUE_' + s.upper() + ']' for x in self.semidict[v]])

    for s, vs in self.s2v['requestable'].items() + self.s2v['other'].items():
        # adding value delexicalisation
        self.values.extend([normalize(v) for v in vs])
        self.supervalues.extend([v for v in vs])
        self.slots.extend(['[VALUE_' + s.upper() + ']' for v in vs])
        # adding slot delexicalisation
        self.supervalues.extend([s for x in self.semidict[s]])
        self.values.extend([normalize(x) for x in self.semidict[s]])
        self.slots.extend(['[SLOT_' + s.upper() + ']' for x in self.semidict[s]])

    # incorporate dontcare values
    self.values.extend([normalize(v) for v in self.semidict['any']])
    self.supervalues.extend(['dontcare' for v in self.semidict['any']])
    self.slots.extend(['[VALUE_DONTCARE]' for v in self.semidict['any']])

    # sorting according to length, keeping the three lists in sync
    self.values, self.supervalues, self.slots = zip(*sorted(
        zip(self.values, self.supervalues, self.slots),
        key=lambda x: len(x[0]), reverse=True))

    # for generating semantic labels
    self.infovs = []    # e.g. {'area=centre', 'area=east', ...}
    self.infoseg = [0]  # segment boundaries for the informable slots, e.g. [0, 7, 100, 105]
    self.reqs = []      # slot=exist / slot=none combinations
    self.reqseg = [0]
    self.dontcare = []  # indices of dontcare values; if the user does not specify a slot, sample from here
    for s in sorted(self.s2v['informable'].keys()):
        self.infovs.extend([s + '=' + v for v in self.s2v['informable'][s]])
        self.infovs.append(s + '=dontcare')
        self.infovs.append(s + '=none')
        self.infoseg.append(len(self.infovs))
        # dont care values
        self.dontcare.append(len(self.infovs) - 1)
        self.dontcare.append(len(self.infovs) - 2)  # fixme
    for s in sorted(self.s2v['informable'].keys()):
        self.reqs.extend([s + '=exist', s + '=none'])
        self.reqseg.append(len(self.reqs))
    for s in sorted(self.s2v['requestable'].keys()):
        self.reqs.extend([s + '=exist', s + '=none'])
        self.reqseg.append(len(self.reqs))

    # for ngram indexing
    self.ngs2v = []  # e.g. [('area', ['centre', 'east', 'north', 'south', 'west', 'any', 'none']), ...]
    for s in sorted(self.s2v['informable'].keys()):
        self.ngs2v.append((s, self.s2v['informable'][s] + ['any', 'none']))
    for s in sorted(self.s2v['informable'].keys()):
        self.ngs2v.append((s, ['exist', 'none']))
    for s in sorted(self.s2v['requestable'].keys()):
        self.ngs2v.append((s, ['exist', 'none']))

def extractSeq(self, sent, type='source', normalise=False, index=True):
    '''
    :param sent:
    :param type:
    :param normalise:
    :param index:
    :return:
    '''
    # setup vocab
    if type == 'source':
        vocab = self.vocab
    elif type == 'target':
        vocab = self.vocab

    # standardise sentences
    if normalise:
        sent = normalize(sent)

    # preprocessing
    words = sent.split()
    if type == 'source':
        if len(words) == 0:
            words = ['<unk>']
    elif type == 'target':
        words = ['</s>'] + words + ['</s>']

    # indexing, non-delexicalised
    if index:
        idx = map(lambda w: vocab.index(w) if w in vocab else 0, words)
    else:
        idx = words

    # delexicalise all: replace values with their slot keys
    sent = self.delexicalise(' '.join(words), mode='all')
    # regex: replace digits with the [VALUE_COUNT] placeholder
    sent = re.sub(digitpat, '[VALUE_COUNT]', sent)
    words = sent.split()

    # formulate delex positions
    allvs = self.infovs + self.reqs
    sltpos = [[] for x in allvs]
    valpos = [[] for x in allvs]
    names = []
    for i in range(len(words)):
        if '::' not in words[i]:
            continue
        # handling offer changing
        if words[i].startswith('[VALUE_NAME]'):
            name = words[i].replace('[VALUE_NAME]::', '')
            names.append(name)
        # remove pos identifier
        tok, ID = words[i].split("::")
        words[i] = tok
        # record position
        mytok, sov = tok[1:-1].lower().split('_')
        ID = ID.replace('-', ' ')
        mylist = sltpos if mytok == 'slot' else valpos
        for j in range(len(allvs)):
            s, v = allvs[j].split('=')
            comp = s if mytok == 'slot' else v
            if comp == ID:
                if mytok == 'slot':
                    sltpos[j].append(i)
                else:
                    valpos[j].append(i)

    # indexing, delexicalised
    if index:
        midx = map(lambda w: vocab.index(w) if w in vocab else 0, words)
    else:
        midx = words

    return midx, idx, sltpos, valpos, names

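# A minimal standalone sketch (made-up token) of the position-identifier format that
# extractSeq() parses above: a delexicalised token such as '[VALUE_FOOD]::chinese'
# carries its slot/value identifier after '::', with spaces in the identifier encoded
# as '-'. The loop above strips the identifier and records the word position.
def _demo_parse_delex_token(token):
    tok, ident = token.split('::')
    kind, slot = tok[1:-1].lower().split('_')  # e.g. ('value', 'food')
    ident = ident.replace('-', ' ')
    return kind, slot, ident

# _demo_parse_delex_token('[VALUE_FOOD]::north-african')
# -> ('value', 'food', 'north african')
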