def read_file(filename):
    """Load (utterance, class_string) pairs from a delimited text file.

    Each line is expected to contain an utterance and a class part joined
    by the literal separator ``\\t<=>\\t``.  Pairs whose processed class
    string or utterance tokenizes to nothing are dropped.
    """
    with codecs.open(filename, 'r') as f:
        raw_lines = f.readlines()
    pairs = [raw.split('\t<=>\t') for raw in raw_lines]

    datas = []
    for pair in pairs:
        utterance = pair[0]
        class_string = process_class(pair[1])
        # Keep the pair only when both sides survive tokenization.
        if process_sent(class_string) and process_sent(utterance):
            datas.append((utterance, class_string))
    return datas
def convert_file_to_ids(self, filename):
    """Convert each ``sentence\\t<=>\\tlabel`` line of *filename* into ids.

    Returns a list of ``(sent_ids, label_ids, label_lis)`` tuples.  Words
    missing from ``self.word2idx`` map to ``Constants.UNK``; sentences that
    tokenize to nothing are skipped entirely.
    """
    with codecs.open(filename, 'r') as f:
        lines = f.readlines()
    lines = [line.split('\t<=>\t') for line in lines]
    lis = []
    for (sent, label) in lines:
        sent_lis = process_sent(sent)
        if len(sent_lis) == 0:
            continue
        # dict.get with a default replaces the original redundant
        # "get(w) if w in dict else UNK" double lookup.
        sent_ids = [self.word2idx.get(w, Constants.UNK) for w in sent_lis]
        if label.strip() == '':
            label_lis = []
        else:
            label_lis = label.strip().split(';')
        label_ids = []
        for l in label_lis:
            # NOTE(review): classes absent from self.class2idx are silently
            # dropped.  When class2idx was built from train only, a valid
            # set can contain unseen classes, so its label ids may be
            # incomplete.  Kept as-is for pipeline coherence.
            if l in self.class2idx:
                label_ids.append(self.class2idx.get(l))
        lis.append((sent_ids, label_ids, label_lis))
    print('Total data num: {}'.format(len(lis)))
    return lis
def judge_utt_label(utterance, class_string, memory, cuda):
    """Randomly mask slot values of *class_string* that appear in *utterance*.

    Each ';'-separated class is split into three space-separated fields;
    when the third field (the value) occurs verbatim in the utterance it is
    replaced, with probability 0.5, by one 'unk' token per value word.
    Returns an encoder id tensor for the masked string, or ``None`` when
    nothing was masked.
    """
    rewritten = []
    for raw_cls in class_string.strip().split(';'):
        cls = raw_cls.strip()
        parts = cls.split(' ', 2)
        masked = cls
        if len(parts) == 3 and parts[2] in utterance:
            # Coin flip decides whether this slot value gets masked.
            if random.random() > 0.5:
                n_value_words = len(parts[2].strip().split())
                masked = ' '.join(parts[:2] + ['unk'] * n_value_words)
        rewritten.append(masked)
    new_class_string = ' ; '.join(rewritten)

    if new_class_string == class_string:
        return None

    word2idx = memory['enc2idx']
    tokens = process_sent(new_class_string)
    ids = [word2idx[w] if w in word2idx else Constants.UNK for w in tokens]
    data = torch.tensor(ids).view(1, -1)
    if cuda:
        data = data.cuda()
    return data
def data_info(string, memory, cuda):
    """Encode *string* for the encoder plus pointer-network extras.

    Returns ``(data, lengths, extra_zeros, enc_batch_extend_vocab_idx,
    oov_list)`` where ``lengths`` is always ``None`` and ``extra_zeros`` is
    ``None`` when no out-of-vocabulary word was found.

    Raises:
        Exception: if the string tokenizes to nothing.
    """
    tokens = process_sent(string)
    if not tokens:
        raise Exception("Input string can not be empty string")

    enc_vocab = memory['enc2idx']
    enc_ids = [enc_vocab[w] if w in enc_vocab else Constants.UNK
               for w in tokens]
    data = torch.tensor(enc_ids).view(1, -1)

    # Extended ids index decoder-vocab OOVs past the normal vocab range.
    dec_vocab = memory['dec2idx']
    ext_ids, oov_list = seq2extend_ids(tokens, dec_vocab)
    enc_batch_extend_vocab_idx = torch.tensor(ext_ids).view(1, -1)
    extra_zeros = torch.zeros((1, len(oov_list))) if oov_list else None

    if cuda:
        data = data.cuda()
        enc_batch_extend_vocab_idx = enc_batch_extend_vocab_idx.cuda()
        if extra_zeros is not None:
            extra_zeros = extra_zeros.cuda()
    return data, None, extra_zeros, enc_batch_extend_vocab_idx, oov_list
def data_info(string, memory, cuda):
    """Encode a ';'-separated string as one encoder tensor per segment.

    The pointer-network extras (extended vocab ids, OOV list) are computed
    over the concatenation of all segment tokens.  Returns ``(datas,
    lengths, extra_zeros, enc_batch_extend_vocab_idx, oov_list)`` where
    ``datas`` is a list of tensors and ``lengths`` is always ``None``.
    """
    segments = string.strip().split(';')
    token_lists = []
    all_tokens = []
    for segment in segments:
        toks = process_sent(segment)
        token_lists.append(toks)
        all_tokens.extend(toks)

    enc_vocab = memory['enc2idx']
    datas = []
    for toks in token_lists:
        ids = [enc_vocab[w] if w in enc_vocab else Constants.UNK
               for w in toks]
        tensor = torch.tensor(ids).view(1, -1)
        if cuda:
            tensor = tensor.cuda()
        datas.append(tensor)

    dec_vocab = memory['dec2idx']
    ext_ids, oov_list = seq2extend_ids(all_tokens, dec_vocab)
    enc_batch_extend_vocab_idx = torch.tensor(ext_ids).view(1, -1)
    extra_zeros = torch.zeros((1, len(oov_list))) if oov_list else None

    if cuda:
        enc_batch_extend_vocab_idx = enc_batch_extend_vocab_idx.cuda()
        if extra_zeros is not None:
            extra_zeros = extra_zeros.cuda()
    return datas, None, extra_zeros, enc_batch_extend_vocab_idx, oov_list
def judge_utt_label(utterance, triple, class_string, memory, cuda):
    """Randomly mask slot-value words of *class_string* found in *utterance*.

    *triple* holds ';'-separated, '-'-delimited (act-slot-value) entries.
    When a value occurs verbatim in the utterance, each of its words is
    (with probability 0.5 per entry) replaced by 'unk' everywhere in
    *class_string* via plain string replacement.  Returns a list of encoder
    id tensors (one per ';'-segment of the masked string), or ``None`` when
    nothing changed.
    """
    new_string = class_string
    for entry in triple.strip().split(';'):
        fields = entry.strip().split('-', 2)
        if len(fields) == 3 and fields[2] in utterance:
            # Coin flip per entry; note replace() hits every occurrence.
            if random.random() > 0.5:
                for word in fields[2].strip().split():
                    new_string = new_string.replace(word, 'unk')

    if new_string == class_string:
        return None

    word2idx = memory['enc2idx']
    datas = []
    for segment in new_string.strip().split(';'):
        tokens = process_sent(segment)
        ids = [word2idx[w] if w in word2idx else Constants.UNK
               for w in tokens]
        tensor = torch.tensor(ids).view(1, -1)
        if cuda:
            tensor = tensor.cuda()
        datas.append(tensor)
    return datas
def build_word_vocab(self, filename, class_file, frequency=1):
    """Build a word->index vocabulary from a data file and a class file.

    Special tokens come first, then the hard-wired 'dontcare' token, then
    act/slot tokens from *class_file*, then corpus words with count >=
    *frequency* in most-common order.

    Returns:
        dict: word -> contiguous integer index.
    """
    sents = []
    with codecs.open(filename, 'r') as f:
        lines = f.readlines()
    for line in lines:
        # Split once per line (the original split the same line twice).
        parts = line.split('\t<=>\t')
        sents.append(parts[0])
        sents.append(process_class(parts[1]))

    words = []
    for sent in sents:
        words.extend(process_sent(sent))
    counter = Counter(words)
    lis = counter.most_common()
    print('Total words num: {}'.format(len(lis)))

    num = 0
    for (word, count) in lis:
        if count < frequency:
            break  # most_common is sorted by count, so we can stop early
        num += 1
    print('Words num with frequency >= {}: {}'.format(frequency, num))

    word2idx = {
        Constants.PAD_WORD: Constants.PAD,
        Constants.UNK_WORD: Constants.UNK,
        Constants.BOS_WORD: Constants.BOS,
        Constants.EOS_WORD: Constants.EOS
    }
    # 'dontcare' is a special slot value that must always be in-vocabulary.
    word2idx['dontcare'] = len(word2idx)
    # Act/slot tokens are indexed ahead of ordinary corpus words.
    class_vocab = self.get_act_slot_vocab(class_file)
    for word in class_vocab:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
    for (word, count) in lis:
        if count >= frequency and word not in word2idx:
            word2idx[word] = len(word2idx)
    # Typo fixed: 'voacb' -> 'vocab'.
    print('Final vocab size: {}'.format(len(word2idx)))
    print('==========================================')
    return word2idx
def build_word_vocab(self, filename, frequency=1):
    """Build a word->index vocabulary from utterances and slot values.

    Vocabulary sources are the utterance side of each line plus the value
    field of every '-'-delimited class triple.  Special tokens come first,
    then 'dontcare', then corpus words with count >= *frequency* in
    most-common order.

    Returns:
        dict: word -> contiguous integer index.
    """
    words = []
    with codecs.open(filename, 'r') as f:
        lines = f.readlines()
    sents = [line.split('\t<=>\t')[0].strip() for line in lines]
    # Also harvest slot values from the class side of each line.
    for line in lines:
        classes = line.split('\t<=>\t')[1].strip().split(';')
        for cls in classes:
            fields = cls.strip().split('-', 2)
            if len(fields) == 3:
                sents.append(fields[2].strip())

    for sent in sents:
        words.extend(process_sent(sent))
    counter = Counter(words)
    lis = counter.most_common()
    print('Total words num: {}'.format(len(lis)))

    num = 0
    for (word, count) in lis:
        if count < frequency:
            break  # most_common is sorted by count, so we can stop early
        num += 1
    print('Words num with frequency >= {}: {}'.format(frequency, num))

    word2idx = {
        Constants.PAD_WORD: Constants.PAD,
        Constants.UNK_WORD: Constants.UNK,
        Constants.BOS_WORD: Constants.BOS,
        Constants.EOS_WORD: Constants.EOS
    }
    # 'dontcare' is a special slot value that must always be in-vocabulary.
    word2idx['dontcare'] = len(word2idx)
    for (word, count) in lis:
        if count >= frequency and word not in word2idx:
            word2idx[word] = len(word2idx)
    # Typo fixed: 'voacb' -> 'vocab'.
    print('Final vocab size: {}'.format(len(word2idx)))
    print('==========================================')
    return word2idx
def label_info(string, memory, enc_oov_list, cuda):
    """Build decoder input/target id tensors for a label string.

    The input sequence is BOS-prefixed regular-vocab ids; the target is the
    extended-vocab ids (covering encoder OOVs) with EOS appended.  Returns
    ``(inp_ids, out_ids)`` with ``inp_ids`` shaped ``(1, T)``.
    """
    tokens = process_sent(string)
    dec_vocab = memory['dec2idx']
    inp = [Constants.BOS] + value2ids(tokens, dec_vocab)
    out = value2extend_ids(tokens, dec_vocab, enc_oov_list) + [Constants.EOS]
    inp_ids = torch.tensor(inp).view(1, -1)
    out_ids = torch.tensor(out)
    if cuda:
        inp_ids = inp_ids.cuda()
        out_ids = out_ids.cuda()
    return inp_ids, out_ids
def decode_utterance(model, class_string, memory, cuda, nbest):
    """Decode up to *nbest* utterances from a dialogue-act class string.

    Runs the encoder over the class string and beam-searches the decoder;
    ids beyond the decoder vocabulary are mapped back through the
    pointer-network OOV list.  Returns a list of utterance strings, or
    ``['']`` when the class string tokenizes to nothing.
    """
    if len(process_sent(class_string)) == 0:
        return ['']
    data, lengths, extra_zeros, enc_batch_extend_vocab_idx, oov_list = \
        DADataset.data_info(class_string, memory, cuda)

    # Encoder pass; its final hidden state initializes the decoder.
    outputs, hiddens = model.encoder(data, lengths)
    s_decoder = model.enc_to_dec(hiddens)
    # Dead code removed: the original also built a BOS tensor `y_t` (and an
    # alias `s_t_1`) that beam_search never received.
    out_lis = beam_search(model.decoder, extra_zeros,
                          enc_batch_extend_vocab_idx, s_decoder, outputs,
                          lengths, len(memory['dec2idx']), cuda,
                          nbest=nbest)

    # Map ids back to words; ids past the vocab size index the OOV list
    # (pointer-generator copy mechanism).
    res = [[] for _ in range(len(out_lis))]
    vocab_size = len(memory['idx2dec'])
    for i, seq in enumerate(out_lis):
        for vid in seq[1:-1]:  # strip BOS/EOS
            if vid < vocab_size:
                res[i].append(memory['idx2dec'][vid])
            else:
                res[i].append(oov_list[vid - vocab_size])
    return [' '.join(word_lis) for word_lis in res]