def turn_read_content(fileDatas, dataIdx, feature_type):
    # NOTE: `label_index`, `label_name`, `vocab`, and the `self.*`
    # attributes referenced below are not defined in this scope; the
    # function appears to have been lifted from a tracker method and
    # still expects that context from its enclosing class/module.
    sentences, acts, scores, labels = [], [], [], []
    for fileData in fileDatas[dataIdx]:
        dialog_sentences, dialog_scores, machine_acts, dialog_labels = \
            read_nbest_dialog_content(fileData, label_index)
        sentence_turn, act_turn, score_turn, label_turn = [], [], [], []
        for turn_id in xrange(len(dialog_sentences)):
            cur_sentence = []
            # user sentence feature
            for nbest_id in range(len(dialog_sentences[turn_id])):
                if feature_type in ['sentsent', 'sentbow']:
                    cur_sentbest = default_text2id(
                        dialog_sentences[turn_id][nbest_id], vocab)
                elif feature_type in ['bowsent', 'bowbow']:
                    cur_sentbest = text2bow(
                        dialog_sentences[turn_id][nbest_id],
                        self.vocab[label_name])
                cur_sentence.append(cur_sentbest)
            # sys act feature
            if feature_type in ['sentbow', 'bowbow']:
                cur_act = text2bow(machine_acts[turn_id],
                                   self.vocab[label_name])
            elif feature_type in ['sentsent', 'bowsent']:
                cur_act = default_text2id(machine_acts[turn_id],
                                          self.vocab[label_name])
            cur_score = dialog_scores[turn_id]
            cur_label = dialog_labels[turn_id]
            tmp_label_out = len(cur_label) if self.track_dict[
                label_name].output_type == 'sigmoid' else 1
            sentence_turn.append(cur_sentence)
            act_turn.append(cur_act)
            score_turn.append(cur_score)
            label_turn.append(cur_label)
        sentences.append(sentence_turn)
        scores.append(score_turn)
        acts.append(act_turn)
        labels.append(label_turn)
    return sentences, scores, acts, labels, tmp_label_out
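# The four feature_type values combine two featurizations for user
# sentences and system acts ('sent*' = id sequence, 'bow*' = bag of
# words). Minimal sketches of the two helpers as they appear to be used
# above; the real default_text2id / text2bow may differ in tokenization
# and OOV handling:
def _text2id_sketch(text, vocab):
    # sequence-of-ids feature: one vocab id per known token
    return [vocab[w] for w in text.split() if w in vocab]

def _text2bow_sketch(text, vocab):
    # bag-of-words feature: a len(vocab)-dimensional count vector
    vec = [0.0] * len(vocab)
    for w in text.split():
        if w in vocab:
            vec[vocab[w]] += 1.0
    return vec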
def get_batch_new_state(self, fileDatas):
    """[Offline] Generate new states for several dialogs at once. Calling
    this method does not change the state of `self`. Note that the number
    of samples must not exceed LecTrack's batch_size. Each example in the
    batch is one dialog whose turns are concatenated into a single long
    sentence; the output is one prediction per example.
    """
    assert len(fileDatas) <= self.batch_size
    tracker_outputs = []
    for i in xrange(len(fileDatas)):
        tracker_outputs.append({
            "goal-labels": {},
            "method-label": {"none": 1.0},
            "requested-slots": {}
        })
    for label_index, label_name in enumerate(self.label_index_list):
        sentences, scores, labels = [], [], []
        for fileData in fileDatas:
            dialog_sentences, dialog_scores, dialog_labels = \
                read_1best_dialog_content(fileData, label_index)
            cur_sentence, cur_score, cur_label = (dialog_sentences[-1],
                                                  dialog_scores[-1],
                                                  dialog_labels[-1])
            cur_sentence = default_text2id(cur_sentence,
                                           self.vocab[label_name])
            assert len(cur_sentence) > 0 and \
                len(cur_sentence) == len(cur_score)
            sentences.append(cur_sentence)
            scores.append(cur_score)
            labels.append(cur_label)
        tmp_label_out = len(cur_label) if self.track_dict[
            label_name].output_type == 'sigmoid' else 1
        data_batch = self.track_dict[label_name].multiWordSentBatch(
            sentences, scores, labels, tmp_label_out)
        outputs = self.track_dict[label_name].predict(data_batch)[0]
        for i in xrange(len(tracker_outputs)):
            cur_outputs = outputs[i].asnumpy()
            self._updateState(tracker_outputs[i], cur_outputs, label_name,
                              top_n=10)
    # remove "signature" from requested-slots
    for i in xrange(len(tracker_outputs)):
        if "signature" in tracker_outputs[i]["requested-slots"]:
            del tracker_outputs[i]["requested-slots"]["signature"]
    return tracker_outputs
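# Example of one element of the returned tracker_outputs after
# _updateState has filled it in. The three top-level keys are the ones
# the method constructs above; the slot names and scores shown are
# hypothetical:
#
#     {"goal-labels": {"food": {"indian": 0.91}},
#      "method-label": {"byconstraints": 0.83},
#      "requested-slots": {"phone": 0.42}}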
def get_new_state(self, dm_output, asr_output, pre_state=None,
                  us_live_goal=None):
    """[Online] Generate a new state. Calling this method changes the
    state of `self`: the current input turn is treated as a continuation
    of the previous turns.
    """
    self.turn += 1
    cur_state = {
        "goal-labels": {},
        "method-label": {"none": 1.0},
        "requested-slots": {}
    }
    self._updateHistory(dm_output, asr_output, us_live_goal)
    # construct data format for generating DataBatch
    fileData = {"turns": []}
    for i in xrange(len(self.history_label["turns"])):
        turnData = genTurnData_nbest(self.history_log["turns"][i],
                                     self.history_label["turns"][i])
        fileData["turns"].append(turnData)
    # update state
    for label_index, label_name in enumerate(self.label_index_list):
        dialog_sentences, dialog_scores, dialog_labels = \
            read_1best_dialog_content(fileData, label_index)
        cur_sentence, cur_score, cur_label = (dialog_sentences[-1],
                                              dialog_scores[-1],
                                              dialog_labels[-1])
        cur_sentence = default_text2id(cur_sentence, self.vocab[label_name])
        assert len(cur_sentence) > 0 and len(cur_sentence) == len(cur_score)
        tmp_label_out = len(cur_label) if self.track_dict[
            label_name].output_type == 'sigmoid' else 1
        data_batch = self.track_dict[label_name].oneSentenceBatch(
            cur_sentence, cur_score, cur_label, tmp_label_out)
        cur_outputs = self.track_dict[label_name].predict(data_batch)[0]
        cur_outputs = cur_outputs[0].asnumpy()
        self._updateState(cur_state, cur_outputs, label_name, top_n=5)
    # remove "signature" from requested-slots
    if "signature" in cur_state["requested-slots"]:
        del cur_state["requested-slots"]["signature"]
    return cur_state
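# Hedged usage sketch of the online API above: one get_new_state call per
# turn, with `self` accumulating history across calls. `make_tracker`,
# `dm_output`, and `asr_output` are hypothetical stand-ins for an
# instantiated tracker and the per-turn log/label payloads that
# _updateHistory expects:
#
#     tracker = make_tracker(...)  # construction not shown in this file
#     for dm_output, asr_output in dialog_turns:
#         state = tracker.get_new_state(dm_output, asr_output)
#     print(state["goal-labels"], state["requested-slots"])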
def get_x_y_from_data(data_json_file, labelIdx):
    raw_sentences, scores, labels = default_read_content(
        data_json_file, labelIdx)
    sentences = []
    for i in xrange(len(raw_sentences)):
        raw_sentence = raw_sentences[i]
        sentences.append(default_text2id(raw_sentence, vocab))

    # pad each sentence to the maximum sentence length with '</s>'
    padding_word = '</s>'
    sequence_length = 360
    padded_sentences = []
    for i in xrange(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [vocab[padding_word]] * num_padding
        padded_sentences.append(new_sentence)

    # convert to np arrays
    x = np.array(padded_sentences)
    y = np.array(labels)
    return x, y
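# A minimal runnable sketch of the fixed-length padding above, isolated
# from the data readers (the pad token id stands in for vocab['</s>']).
# Note that, as in get_x_y_from_data, a sentence longer than
# sequence_length is left as-is, because multiplying a list by a negative
# count yields []:
def _pad_to_length(sentence_ids, pad_id, sequence_length):
    # right-pad a list of token ids with pad_id up to sequence_length
    return sentence_ids + [pad_id] * (sequence_length - len(sentence_ids))

# _pad_to_length([4, 7], 0, 5) -> [4, 7, 0, 0, 0]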
def __init__(self, path, labelIdx, vocab, vocab1, buckets, batch_size,
             max_nbest, max_sentlen, init_states, data_components,
             label_out=1, feature_type='bowbow'):
    super(MATTurnSentIter, self).__init__()
    self.vocab = vocab
    self.vocab1 = vocab1
    self.padding_id = self.vocab['</s>']
    self.label_out = label_out
    self.max_nbest = max_nbest
    self.max_sentlen = max_sentlen
    self.feature_type = feature_type
    self.len_sent = self.max_sentlen if self.feature_type in [
        'sentsent', 'sentbow'] else len(self.vocab)
    self.len_act_sent = self.max_sentlen if self.feature_type in [
        'sentsent', 'bowsent'] else len(self.vocab1)

    sentences, scores, acts, labels = turn_read_content(path, labelIdx[0], 0)
    # read the labels of every slot in labelIdx and regroup them so that
    # each turn carries one label per tracked slot
    lab = []
    for i in labelIdx:
        se, sc, ac, l = turn_read_content(path, i, 0)
        lab.append(l)
    labels0 = []
    for i in range(len(labels)):
        d0 = []
        for j in range(len(labels[i])):
            d0.append([lb[i][j] for lb in lab])
        labels0.append(d0)
    labels = labels0
    """
    sentences: (dialog_num, turn_num, nbest_num, sentence_len)
    scores:    (dialog_num, turn_num, nbest_num)
    acts:      (dialog_num, turn_num, machine_act_len)
    labels:    (dialog_num, turn_num, len(labelIdx))
    """
    buckets.sort()
    self.buckets = buckets
    self.data = [[] for _ in buckets]
    self.data_act = [[] for _ in buckets]
    self.data_score = [[] for _ in buckets]
    self.label = [[] for _ in buckets]
    # pre-allocate with the largest bucket for better memory sharing
    self.default_bucket_key = max(buckets)

    for i in range(len(sentences)):
        sentence = sentences[i]
        score = scores[i]
        act = acts[i]
        label = labels[i]
        for turn_id in range(len(sentence)):
            # user sentence feature
            for nbest_id in range(len(sentence[turn_id])):
                if self.feature_type in ['sentsent', 'sentbow']:
                    sentence[turn_id][nbest_id] = default_text2id(
                        sentence[turn_id][nbest_id], self.vocab)
                elif self.feature_type in ['bowsent', 'bowbow']:
                    sentence[turn_id][nbest_id] = text2bow(
                        sentence[turn_id][nbest_id], self.vocab)
            # sys act feature
            if self.feature_type in ['sentbow', 'bowbow']:
                act[turn_id] = text2bow(act[turn_id], self.vocab1)
            elif self.feature_type in ['sentsent', 'bowsent']:
                act[turn_id] = default_text2id(act[turn_id], self.vocab1)
        for i_bkt, bkt in enumerate(buckets):
            if bkt == len(sentence):
                self.data[i_bkt].append(sentence)
                self.data_score[i_bkt].append(score)
                self.data_act[i_bkt].append(act)
                self.label[i_bkt].append(label)
                break
        # dialogs whose turn count matches no bucket are simply dropped
    """
    sentence: (turn_num, nbest_num, len_sent)
    score:    (turn_num, nbest_num)
    act:      (turn_num, len_act_sent)
    label:    (turn_num, label_out)
    """

    # embed the slot names and their ontology values with the pre-trained
    # word embeddings
    embed_weight = mx.nd.array(np.load('embed_vN3.npy'))
    slotsent = "food pricerange name area"
    slota = default_text2id(slotsent, self.vocab)
    slotarr = slotsent.split()
    label_len = len(labelIdx)
    val_len = []
    for i in labelIdx:
        val_len.append(len(ontologyDict[u'informable'][slotarr[i]]))
    vl = []
    for i in labelIdx:
        vla = []
        for key in ontologyDict[u'informable'][slotarr[i]]:
            v = default_text2id(key, self.vocab)
            tmp = mx.nd.array(v)
            tmp = mx.nd.Embedding(data=tmp, input_dim=len(self.vocab),
                                  weight=embed_weight, output_dim=300,
                                  name='embed')
            # a multi-word value is represented by the sum of its word
            # embeddings
            tmp = mx.nd.sum(tmp, axis=0)
            vla.append(tmp.asnumpy())
        vl.append(np.asarray(vla))
    sa = []
    for i in labelIdx:
        tmp = mx.nd.array([slota[i]])
        tmp = mx.nd.Embedding(data=tmp, input_dim=len(self.vocab),
                              weight=embed_weight, output_dim=300,
                              name='embed')
        sa.append(tmp.asnumpy())
    slota = np.squeeze(np.asarray(sa))

    # replicate the slot/value embeddings across the batch dimension
    slot = np.zeros((batch_size, label_len, 300))
    for i in range(batch_size):
        slot[i] = slota
    value = []
    for j in range(label_len):
        tmp = np.zeros((batch_size, val_len[j], 300))
        for i in range(batch_size):
            tmp[i] = vl[j]
        value.append(tmp)
    vl_name = []
    for i in range(label_len):
        vl_name.append("value_%d" % i)
    self.vl_name = vl_name

    # convert data into ndarrays for better speed during training
    data_mask_len = [np.zeros((len(x),)) for i, x in enumerate(self.data)]
    datatmp = [np.full((len(x), buckets[i], self.max_nbest, self.len_sent),
                       self.padding_id) for i, x in enumerate(self.data)]
    data_act = [np.full((len(x), buckets[i], self.len_act_sent), 0.0)
                for i, x in enumerate(self.data_act)]
    data_score = [np.zeros((len(x), buckets[i], self.max_nbest))
                  for i, x in enumerate(self.data_score)]
    label = [np.zeros((len(x), buckets[i], self.label_out))
             for i, x in enumerate(self.label)]
    data = [np.zeros((len(x), buckets[i], self.len_sent, 300))
            for i, x in enumerate(self.data)]
    for i_bucket in range(len(self.buckets)):
        for i_diag in range(len(self.data[i_bucket])):
            data_mask_len[i_bucket][i_diag] = \
                len(self.data[i_bucket][i_diag])
            for i_turn in range(len(self.data[i_bucket][i_diag])):
                act = self.data_act[i_bucket][i_diag][i_turn]
                data_act[i_bucket][i_diag, i_turn, :len(act)] = act
                label[i_bucket][i_diag, i_turn, :] = \
                    self.label[i_bucket][i_diag][i_turn]
                # be careful: max_nbest can be smaller than the current
                # turn's n-best size; extra hypotheses are truncated
                tempsent = []
                for i_nbest in range(min(
                        len(self.data[i_bucket][i_diag][i_turn]),
                        self.max_nbest)):
                    sentence = self.data[i_bucket][i_diag][i_turn][i_nbest]
                    datatmp[i_bucket][i_diag, i_turn, i_nbest,
                            :len(sentence)] = sentence
                    tmp = mx.nd.array(
                        datatmp[i_bucket][i_diag, i_turn, i_nbest])
                    tmp = mx.nd.Embedding(data=tmp,
                                          input_dim=len(self.vocab),
                                          weight=embed_weight,
                                          output_dim=300, name='embed')
                    sentence = tmp.asnumpy()
                    score = \
                        self.data_score[i_bucket][i_diag][i_turn][i_nbest]
                    # weight each embedded hypothesis by its confidence
                    tempsent.append(sentence * score)
                    data_score[i_bucket][i_diag, i_turn, i_nbest] = score
                # confidence-weighted sum over the n-best list
                scoredsent = np.sum(np.asarray(tempsent), axis=0)
                data[i_bucket][i_diag, i_turn] = scoredsent
    """
    data:       (bucket_num, dialog_num, turn_num, len_sent, 300)
    data_score: (bucket_num, dialog_num, turn_num, max_nbest)
    data_act:   (bucket_num, dialog_num, turn_num, len_act_sent)
    label:      (bucket_num, dialog_num, turn_num, label_out)
    """
    self.data_mask_len = data_mask_len
    self.data = data
    self.data_act = data_act
    self.data_score = data_score
    self.label = label
    self.slot = slot
    self.value = value

    # backup corpus
    self.all_data_mask_len = copy.deepcopy(self.data_mask_len)
    self.all_data = copy.deepcopy(self.data)
    self.all_data_act = copy.deepcopy(self.data_act)
    self.all_data_score = copy.deepcopy(self.data_score)
    self.all_label = copy.deepcopy(self.label)

    # get the size of each bucket, so that we can sample uniformly
    # from the buckets
    sizeS = 0
    bucket_sizes = [len(x) for x in self.data]
    print("Summary of dataset ==================")
    for bkt, size in zip(buckets, bucket_sizes):
        sizeS += size
        print("bucket of len %3d : %d samples" % (bkt, size))
    self.batch_size = batch_size
    self.init_states = init_states
    self.data_components = data_components
    self.size = int(sizeS / batch_size)
    self.provide_data = self.data_components + self.init_states
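# The inner n-best loop above reduces to a confidence-weighted sum of the
# embedded hypotheses. A numpy-only sketch of that pooling (toy shapes;
# the real code obtains the embeddings via mx.nd.Embedding with the
# pre-trained embed_vN3.npy weights):
def _pool_nbest(embedded, confidences):
    # embedded:    (nbest, len_sent, dim) per-hypothesis embeddings
    # confidences: (nbest,) ASR confidence scores
    # returns:     (len_sent, dim) score-weighted sum over hypotheses
    return np.sum(embedded * confidences[:, None, None], axis=0)

# _pool_nbest(np.ones((3, 4, 300)), np.array([0.6, 0.3, 0.1])).shape
# -> (4, 300)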
def multiTurnBatch(self, labelIdx, sentences, acts, scores, labels,
                   label_out, vocab, vocab1, feature_type='bowbow'):
    assert len(sentences) <= self.batch_size
    cur_bucket_key = self.getMatchKey(max([len(s) for s in sentences]))
    padding_id = vocab['</s>']
    len_sent = self.max_sentlen if feature_type in [
        'sentsent', 'sentbow'] else len(vocab)
    len_act_sent = self.max_sentlen if feature_type in [
        'sentsent', 'bowsent'] else len(vocab1)
    embed_weight = mx.nd.array(np.load('embed_vN3.npy'))

    # embed the tracked slot's name and its ontology values
    slotsent = "food pricerange name area"
    slota = default_text2id(slotsent, vocab)
    slotarr = slotsent.split()
    val_len = len(ontologyDict[u'informable'][slotarr[labelIdx]])
    vl = []
    for key in ontologyDict[u'informable'][slotarr[labelIdx]]:
        v = default_text2id(key, vocab)
        tmp = mx.nd.array(v)
        tmp = mx.nd.Embedding(data=tmp, input_dim=len(vocab),
                              weight=embed_weight, output_dim=300,
                              name='embed')
        # a multi-word value is represented by the sum of its word
        # embeddings
        tmp = mx.nd.sum(tmp, axis=0)
        vl.append(tmp.asnumpy())
    vl = np.asarray(vl)
    tmp = mx.nd.array([slota[labelIdx]])
    tmp = mx.nd.Embedding(data=tmp, input_dim=len(vocab),
                          weight=embed_weight, output_dim=300, name='embed')
    slota = tmp.asnumpy()
    value = np.zeros((self.batch_size, val_len, 300))
    slot = np.zeros((self.batch_size, 300))
    for i in range(self.batch_size):
        slot[i] = slota
        value[i] = vl

    datatmp = np.full((self.batch_size, cur_bucket_key, 2, self.max_nbest,
                       len_sent), padding_id, dtype=np.double)
    data_act = np.full((self.batch_size, cur_bucket_key, 2, len_act_sent),
                       padding_id, dtype=np.double)
    data_score = np.zeros((self.batch_size, cur_bucket_key, self.max_nbest))
    label = np.zeros((self.batch_size, cur_bucket_key, label_out))
    data = np.full((self.batch_size, cur_bucket_key, 2, len_sent, 300),
                   padding_id, dtype=np.double)
    for i_diag in range(len(sentences)):
        for i_turn in range(len(sentences[i_diag])):
            act = acts[i_diag][i_turn]
            for i in range(2):
                data_act[i_diag, i_turn, i, :len(act[i])] = act[i]
            label[i_diag, i_turn, :] = labels[i_diag][i_turn]
            # be careful: max_nbest can be smaller than the current turn's
            # n-best size; extra hypotheses are truncated
            for i_data in range(2):
                tempsent = []
                for i_nbest in range(min(
                        len(sentences[i_diag][i_turn][i_data]),
                        self.max_nbest)):
                    sentence = sentences[i_diag][i_turn][i_data][i_nbest]
                    datatmp[i_diag, i_turn, i_data, i_nbest,
                            :len(sentence)] = sentence
                    tmp = mx.nd.array(datatmp[i_diag, i_turn, i_data,
                                              i_nbest])
                    tmp = mx.nd.Embedding(data=tmp, input_dim=len(vocab),
                                          weight=embed_weight,
                                          output_dim=300, name='embed')
                    sentence = tmp.asnumpy()
                    score = scores[i_diag][i_turn][i_nbest]
                    # weight each embedded hypothesis by its confidence
                    tempsent.append(sentence * score)
                    data_score[i_diag, i_turn, i_nbest] = score
                # confidence-weighted sum over the n-best list
                scoredsent = np.sum(np.asarray(tempsent), axis=0)
                data[i_diag, i_turn, i_data] = scoredsent

    data_names = [x[0] for x in self.default_provide_data]
    init_state_arrays = [mx.nd.zeros(x[1]) for x in self.init_states]
    data_all = [mx.nd.array(data), mx.nd.array(data_act)]
    if 'score' in data_names:
        data_all += [mx.nd.array(data_score)]
    if 'slot' in data_names:
        data_all += [mx.nd.array(slot)]
    if 'value' in data_names:
        data_all += [mx.nd.array(value)]
    data_all += init_state_arrays
    label_names = ['softmax_label']
    label_all = [mx.nd.array(label)]
    data_batch = SimpleBatch(data_names, data_all, label_names, label_all,
                             cur_bucket_key)
    return data_batch
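# getMatchKey is defined elsewhere; presumably it returns the smallest
# bucket key that can hold the longest dialog in the batch. A minimal
# sketch of that rule, under this assumption:
def _match_key_sketch(max_len, buckets):
    for bkt in sorted(buckets):
        if bkt >= max_len:
            return bkt
    raise ValueError("dialog longer than the largest bucket: %d" % max_len)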
def __init__(self, path, labelIdx, vocab, buckets, batch_size, max_nbest,
             max_sentlen, init_states, data_components, label_out=1,
             feature_type='bowbow'):
    super(DSTTurnSentIter, self).__init__()
    self.vocab = vocab
    self.padding_id = self.vocab['</s>']
    self.label_out = label_out
    self.max_nbest = max_nbest
    self.max_sentlen = max_sentlen
    self.feature_type = feature_type
    self.len_sent = self.max_sentlen if self.feature_type in [
        'sentsent', 'sentbow'] else len(self.vocab)
    self.len_act_sent = self.max_sentlen if self.feature_type in [
        'sentsent', 'bowsent'] else len(self.vocab)

    sentences, scores, acts, labels = turn_read_content(path, labelIdx)
    """
    sentences: (dialog_num, turn_num, nbest_num, sentence_len)
    scores:    (dialog_num, turn_num, nbest_num)
    acts:      (dialog_num, turn_num, machine_act_len)
    labels:    (dialog_num, turn_num,)
    """
    buckets.sort()
    self.buckets = buckets
    self.data = [[] for _ in buckets]
    self.data_act = [[] for _ in buckets]
    self.data_score = [[] for _ in buckets]
    self.label = [[] for _ in buckets]
    # pre-allocate with the largest bucket for better memory sharing
    self.default_bucket_key = max(buckets)

    for i in range(len(sentences)):
        sentence = sentences[i]
        score = scores[i]
        act = acts[i]
        label = labels[i]
        for turn_id in range(len(sentence)):
            # user sentence feature
            for nbest_id in range(len(sentence[turn_id])):
                if self.feature_type in ['sentsent', 'sentbow']:
                    sentence[turn_id][nbest_id] = default_text2id(
                        sentence[turn_id][nbest_id], self.vocab)
                elif self.feature_type in ['bowsent', 'bowbow']:
                    sentence[turn_id][nbest_id] = text2bow(
                        sentence[turn_id][nbest_id], self.vocab)
            # sys act feature
            if self.feature_type in ['sentbow', 'bowbow']:
                act[turn_id] = text2bow(act[turn_id], self.vocab)
            elif self.feature_type in ['sentsent', 'bowsent']:
                act[turn_id] = default_text2id(act[turn_id], self.vocab)
        for i_bkt, bkt in enumerate(buckets):
            if bkt == len(sentence):
                self.data[i_bkt].append(sentence)
                self.data_score[i_bkt].append(score)
                self.data_act[i_bkt].append(act)
                self.label[i_bkt].append(label)
                break
        # dialogs whose turn count matches no bucket are simply dropped
    """
    sentence: (turn_num, nbest_num, len_sent)
    score:    (turn_num, nbest_num)
    act:      (turn_num, len_act_sent)
    label:    (turn_num, label_out)
    """

    # convert data into ndarrays for better speed during training
    data = [np.full((len(x), buckets[i], self.max_nbest, self.len_sent),
                    self.padding_id) for i, x in enumerate(self.data)]
    data_act = [np.full((len(x), buckets[i], self.len_act_sent),
                        self.padding_id)
                for i, x in enumerate(self.data_act)]
    data_score = [np.zeros((len(x), buckets[i], self.max_nbest))
                  for i, x in enumerate(self.data_score)]
    label = [np.zeros((len(x), buckets[i], self.label_out))
             for i, x in enumerate(self.label)]
    for i_bucket in range(len(self.buckets)):
        for i_diag in range(len(self.data[i_bucket])):
            for i_turn in range(len(self.data[i_bucket][i_diag])):
                act = self.data_act[i_bucket][i_diag][i_turn]
                data_act[i_bucket][i_diag, i_turn, :len(act)] = act
                label[i_bucket][i_diag, i_turn, :] = \
                    self.label[i_bucket][i_diag][i_turn]
                # be careful: max_nbest can be smaller than the current
                # turn's n-best size; extra hypotheses are truncated
                for i_nbest in range(min(
                        len(self.data[i_bucket][i_diag][i_turn]),
                        self.max_nbest)):
                    sentence = self.data[i_bucket][i_diag][i_turn][i_nbest]
                    score = \
                        self.data_score[i_bucket][i_diag][i_turn][i_nbest]
                    data[i_bucket][i_diag, i_turn, i_nbest,
                         :len(sentence)] = sentence
                    data_score[i_bucket][i_diag, i_turn, i_nbest] = score
    """
    data:       (bucket_num, dialog_num, turn_num, max_nbest, len_sent)
    data_score: (bucket_num, dialog_num, turn_num, max_nbest)
    data_act:   (bucket_num, dialog_num, turn_num, len_act_sent)
    label:      (bucket_num, dialog_num, turn_num, label_out)
    """
    self.data = data
    self.data_act = data_act
    self.data_score = data_score
    self.label = label

    # backup corpus
    self.all_data = copy.deepcopy(self.data)
    self.all_data_act = copy.deepcopy(self.data_act)
    self.all_data_score = copy.deepcopy(self.data_score)
    self.all_label = copy.deepcopy(self.label)

    # get the size of each bucket, so that we can sample uniformly
    # from the buckets
    bucket_sizes = [len(x) for x in self.data]
    print("Summary of dataset ==================")
    for bkt, size in zip(buckets, bucket_sizes):
        print("bucket of len %3d : %d samples" % (bkt, size))

    self.batch_size = batch_size
    self.init_states = init_states
    self.data_components = data_components
    self.provide_data = self.data_components + self.init_states
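# Both iterators above assign a dialog to the bucket whose key exactly
# equals its turn count (`bkt == len(sentence)`), so dialogs with no
# matching bucket are silently dropped. A compact sketch of that
# assignment rule:
def _bucket_by_exact_length(dialogs, buckets):
    binned = dict((b, []) for b in buckets)
    for d in dialogs:
        if len(d) in binned:
            binned[len(d)].append(d)
    return binned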