def createDelexData():
    """Main function of the script - delexicalises the WOZ dialogues.

    Loads the delexicalisation dictionary, then goes through each dialogue
    in ``data/woz2/data.json`` whose name contains ``'WOZ'`` and does:
    1) data normalization
    2) delexicalization (dictionary based, then every remaining run of
       digits is replaced by ``[value_count]``)
    3) addition of the database pointer: for every odd-indexed (system)
       turn, the pointer computed from that turn is stored on the
       preceding turn under ``'db_pointer'``
    4) saves the delexicalized data to ``data/delex.json``

    Returns:
        dict mapping each processed dialogue name to its dialogue, with
        the ``'text'`` of every turn in ``dialogue['log']`` rewritten.
    """
    # create dictionary of delexicalised values that then we will search
    # against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    # context manager so the input handle is closed once parsing is done
    with open('data/woz2/data.json') as fin1:
        data = json.load(fin1)

    # compiled once up front; raw string avoids the invalid '\d' escape
    digitpat = re.compile(r'\d+')

    for dialogue_name in tqdm(data):
        # only dialogues whose name contains 'WOZ' are processed
        if 'WOZ' not in dialogue_name:
            continue
        dialogue = data[dialogue_name]

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence;
            # split + join collapses every whitespace run to a single space
            sent = normalize(turn['text'])
            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # changes to numbers only here
            sent = digitpat.sub('[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer (stored on the preceding turn)
                pointer_vector = addDBPointer(turn)
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

        delex_data[dialogue_name] = dialogue

    with open('data/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data
def createDelexData(sent, sent_act, bs, dic, turn, option):
    """Normalize and delexicalize a single sentence.

    Args:
        sent: the sentence text to process.
        sent_act: forwarded to ``fixDelex`` when ``option == 'sys'``.
        bs: forwarded to ``fixDelex`` when ``option == 'user'``.
        dic: delexicalisation dictionary handed to
            ``delexicalize.delexicalise``.
        turn: forwarded to ``delexicaliseReferenceNumber`` together with
            the sentence.
        option: ``'user'`` or ``'sys'``; selects which extra argument
            ``fixDelex`` receives. For any other value ``fixDelex`` is
            not called.

    Returns:
        The delexicalized sentence with leading and trailing whitespace
        stripped.
    """
    # normalization, split and delexicalization of the sentence;
    # split + join collapses every whitespace run to a single space
    sent = normalize(sent)
    words = sent.split()
    sent = delexicalize.delexicalise(' '.join(words), dic)

    # parsing reference number GIVEN belief state
    sent = delexicaliseReferenceNumber(sent, turn)

    # changes to numbers only here (raw string: '\d' is not a valid
    # escape sequence in a plain string literal)
    sent = re.sub(r'\d+', '[value_count]', sent)

    # the two options are mutually exclusive
    if option == 'user':
        sent = fixDelex(sent, None, bs)
    elif option == 'sys':
        sent = fixDelex(sent, sent_act, None)

    return sent.strip()
def createDelexData():
    """Main function of the script - delexicalises the MultiWOZ dialogues.

    Calls ``loadData()``, loads the delexicalisation dictionary, then goes
    through each dialogue in ``data/multi-woz/data.json`` and does:
    1) data normalization
    2) delexicalization (dictionary based, then reference numbers, then
       every remaining run of digits is replaced by ``[value_count]``)
    3) addition of the database and booking pointers: for every
       odd-indexed (system) turn, the pointer is stored on the preceding
       turn under ``'db_pointer'``
    4) a ``fixDelex`` pass that also receives the contents of
       ``data/multi-woz/dialogue_acts.json``
    5) saves the delexicalized data to ``data/multi-woz/delex.json``

    Returns:
        dict mapping each dialogue name to its processed dialogue.
    """
    # download the data
    loadData()

    # create dictionary of delexicalised values that then we will search
    # against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    with open('data/multi-woz/data.json') as fin1:
        data = json.load(fin1)

    with open('data/multi-woz/dialogue_acts.json') as fin2:
        data2 = json.load(fin2)

    # compiled once up front; raw string avoids the invalid '\d' escape
    digitpat = re.compile(r'\d+')

    for dialogue_name in tqdm(data):
        dialogue = data[dialogue_name]

        idx_acts = 1

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence;
            # split + join collapses every whitespace run to a single space
            sent = normalize(turn['text'])
            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # parsing reference number GIVEN belief state
            sent = delexicaliseReferenceNumber(sent, turn)

            # changes to numbers only here
            sent = digitpat.sub('[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)
                # add booking pointer
                pointer_vector = addBookingPointer(dialogue, turn, pointer_vector)

                # stored on the preceding turn
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

            # FIXING delexicalization:
            # NOTE(review): the nesting here was reconstructed from a
            # whitespace-mangled source. As written this runs for every
            # turn (not only system turns), so idx_acts is always idx + 1.
            # Confirm against the intended layout.
            dialogue = fixDelex(dialogue_name, dialogue, data2, idx, idx_acts)
            idx_acts += 1

        delex_data[dialogue_name] = dialogue

    with open('data/multi-woz/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data