Example #1
0
 def read_file(filename):
     """Load (utterance, class_string) pairs from a tab-separated corpus file.

     Each line is expected to look like ``utterance\t<=>\tclass_string``.
     Pairs whose tokenized encoder or decoder side is empty are dropped.

     Args:
         filename: path to the corpus file.

     Returns:
         list of (utterance, class_string) tuples that survived filtering.
     """
     with codecs.open(filename, 'r') as f:
         lines = f.readlines()
     lines = [line.split('\t<=>\t') for line in lines]
     datas = []
     for line in lines:
         utterance = line[0]
         class_string = process_class(line[1])
         enc_lis = process_sent(class_string)
         dec_lis = process_sent(utterance)
         # keep only pairs where both sides tokenize to something
         # (was `len(enc_lis) > 0 and len(dec_lis)` — now consistent truthiness)
         if enc_lis and dec_lis:
             datas.append((utterance, class_string))
     return datas
Example #2
0
 def convert_file_to_ids(self, filename):
     """Convert each corpus line into (sent_ids, label_ids, label_lis).

     Lines look like ``sentence\t<=>\tlabel1;label2;...``.  Sentences that
     tokenize to nothing are skipped.  Words missing from ``self.word2idx``
     map to ``Constants.UNK``.

     Returns:
         list of (sent_ids, label_ids, label_lis) tuples.
     """
     with codecs.open(filename, 'r') as f:
         lines = f.readlines()
     lines = [line.split('\t<=>\t') for line in lines]
     lis = []
     for (sent, label) in lines:
         sent_lis = process_sent(sent)
         if not sent_lis:
             continue
         # dict.get with a default replaces the redundant membership test
         sent_ids = [self.word2idx.get(w, Constants.UNK) for w in sent_lis]
         label = label.strip()
         # ''.split(';') would yield [''], so guard the empty label explicitly
         label_lis = label.split(';') if label else []
         # NOTE(review): classes absent from self.class2idx are silently
         # dropped, so label_ids can be shorter than label_lis — e.g. when a
         # class2idx built from train does not cover all classes in valid.
         # Kept as-is for code coherence with the original.
         label_ids = [self.class2idx[l] for l in label_lis if l in self.class2idx]
         lis.append((sent_ids, label_ids, label_lis))
     print('Total data num: {}'.format(len(lis)))
     return lis
Example #3
0
    def judge_utt_label(utterance, class_string, memory, cuda):
        """Randomly mask slot values that literally appear in the utterance.

        For each ';'-separated class whose third field occurs in *utterance*,
        the value words are replaced by 'unk' with probability 0.5.  Returns
        None when nothing changed, otherwise an encoder id tensor of shape
        (1, seq_len) for the masked class string.
        """
        rebuilt = []
        for part in class_string.strip().split(';'):
            fields = part.strip().split(' ', 2)
            if len(fields) == 3 and fields[2] in utterance \
                    and random.random() > 0.5:
                # mask each value word with 'unk', keep act/slot fields
                masked = ['unk'] * len(fields[2].strip().split())
                rebuilt.append(' '.join(fields[:2] + masked))
            else:
                rebuilt.append(part.strip())
        masked_string = ' ; '.join(rebuilt)

        if masked_string == class_string:
            return None

        word2idx = memory['enc2idx']
        tokens = process_sent(masked_string)
        ids = [word2idx.get(w, Constants.UNK) for w in tokens]
        data = torch.tensor(ids).view(1, -1)
        return data.cuda() if cuda else data
Example #4
0
    def data_info(string, memory, cuda):
        """Encode *string* for the encoder plus pointer-copy bookkeeping.

        Raises an Exception on an effectively-empty input.  Returns
        (data, None, extra_zeros, enc_batch_extend_vocab_idx, oov_list),
        all tensors shaped (1, seq_len) and moved to GPU when *cuda*.
        """
        tokens = process_sent(string)
        if not tokens:
            raise Exception("Input string can not be empty string")

        # encoder-side ids, unknowns mapped to UNK
        enc_vocab = memory['enc2idx']
        enc_ids = [enc_vocab.get(w, Constants.UNK) for w in tokens]
        data = torch.tensor(enc_ids).view(1, -1)

        # decoder-side extended ids for copy mechanism
        ext_ids, oov_list = seq2extend_ids(tokens, memory['dec2idx'])
        enc_batch_extend_vocab_idx = torch.tensor(ext_ids).view(1, -1)

        # one zero slot per out-of-vocabulary word, or nothing at all
        extra_zeros = torch.zeros((1, len(oov_list))) if oov_list else None

        if cuda:
            data = data.cuda()
            enc_batch_extend_vocab_idx = enc_batch_extend_vocab_idx.cuda()
            if extra_zeros is not None:
                extra_zeros = extra_zeros.cuda()

        return data, None, extra_zeros, enc_batch_extend_vocab_idx, oov_list
Example #5
0
    def data_info(string, memory, cuda):
        """Encode a ';'-separated *string* into one encoder tensor per segment.

        Also builds the pointer-copy bookkeeping over the concatenation of
        all segment tokens.  Returns
        (datas, None, extra_zeros, enc_batch_extend_vocab_idx, oov_list)
        where *datas* is a list of (1, seq_len) tensors.
        """
        # renamed loop variable: the original `for string in results` shadowed
        # the `string` parameter
        segments = string.strip().split(';')
        lis = []
        tol = []
        for segment in segments:
            toks = process_sent(segment)
            lis.append(toks)
            tol.extend(toks)

        datas = []
        word2idx = memory['enc2idx']
        for toks in lis:
            ids = [word2idx.get(w, Constants.UNK) for w in toks]
            data = torch.tensor(ids).view(1, -1)
            if cuda:
                data = data.cuda()
            datas.append(data)

        # extended vocabulary ids over the full token stream
        word2idx = memory['dec2idx']
        ids, oov_list = seq2extend_ids(tol, word2idx)
        enc_batch_extend_vocab_idx = torch.tensor(ids).view(1, -1)

        # one zero slot per OOV word, or None when there are no OOVs
        if len(oov_list) == 0:
            extra_zeros = None
        else:
            extra_zeros = torch.zeros((1, len(oov_list)))

        if cuda:
            enc_batch_extend_vocab_idx = enc_batch_extend_vocab_idx.cuda()
            if extra_zeros is not None:
                extra_zeros = extra_zeros.cuda()

        return datas, None, extra_zeros, enc_batch_extend_vocab_idx, oov_list
Example #6
0
    def judge_utt_label(utterance, triple, class_string, memory, cuda):
        """Randomly mask slot-value words of *class_string* and re-encode it.

        For each '-'-separated triple whose value appears in *utterance*,
        every value word is replaced by 'unk' in *class_string* with
        probability 0.5.  Returns None when nothing changed, otherwise a
        list of (1, seq_len) encoder id tensors, one per ';'-segment.
        """
        classes = triple.strip().split(';')
        new_string = class_string
        for cls in classes:
            lis = cls.strip().split('-', 2)
            if len(lis) == 3 and lis[2] in utterance:
                if random.random() > 0.5:
                    # NOTE(review): str.replace substitutes every occurrence of
                    # the word anywhere in new_string, not just inside this
                    # slot value — confirm that is intended.
                    for word in lis[2].strip().split():
                        new_string = new_string.replace(word, 'unk')
        if new_string == class_string:
            return None

        # hoisted out of the segment loop: the vocab lookup is loop-invariant
        word2idx = memory['enc2idx']
        datas = []
        for r in new_string.strip().split(';'):
            toks = process_sent(r)
            ids = [word2idx.get(w, Constants.UNK) for w in toks]
            data = torch.tensor(ids).view(1, -1)
            if cuda:
                data = data.cuda()
            datas.append(data)
        return datas
Example #7
0
    def build_word_vocab(self, filename, class_file, frequency=1):
        """Build the word-to-index vocabulary from the corpus and class file.

        Order of insertion: special tokens, 'dontcare', act/slot class
        vocabulary, then corpus words with count >= *frequency* in
        descending-frequency order.

        Returns:
            dict mapping word -> index.
        """
        sents = []
        with codecs.open(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            # split each line once instead of twice
            parts = line.split('\t<=>\t')
            sents.append(parts[0])
            sents.append(process_class(parts[1]))

        words = []
        for sent in sents:
            words.extend(process_sent(sent))

        counter = Counter(words)
        lis = counter.most_common()
        print('Total words num: {}'.format(len(lis)))
        # most_common() is sorted by descending count, so counting all
        # qualifying words equals the original early-break prefix count
        num = sum(1 for _, count in lis if count >= frequency)
        print('Words num with frequency >= {}: {}'.format(frequency, num))

        word2idx = {
            Constants.PAD_WORD: Constants.PAD,
            Constants.UNK_WORD: Constants.UNK,
            Constants.BOS_WORD: Constants.BOS,
            Constants.EOS_WORD: Constants.EOS
        }

        # ===========================================
        # reserved token for "don't care" slot values
        word2idx['dontcare'] = len(word2idx)
        # ===========================================

        # act/slot class tokens are inserted before corpus words so their
        # ids do not depend on corpus frequencies
        class_vocab = self.get_act_slot_vocab(class_file)
        for word in class_vocab:
            if word not in word2idx:
                word2idx[word] = len(word2idx)

        for (word, count) in lis:
            if count >= frequency and word not in word2idx:
                word2idx[word] = len(word2idx)
        # fixed typo: 'voacb' -> 'vocab'
        print('Final vocab size: {}'.format(len(word2idx)))
        print('==========================================')
        return word2idx
Example #8
0
    def build_word_vocab(self, filename, frequency=1):
        """Build the word-to-index vocabulary from sentences and slot values.

        Order of insertion: special tokens, 'dontcare', then corpus words
        with count >= *frequency* in descending-frequency order.

        Returns:
            dict mapping word -> index.
        """
        words = []
        with codecs.open(filename, 'r') as f:
            lines = f.readlines()
            sents = [line.split('\t<=>\t')[0].strip() for line in lines]

        # also collect the value field of every 'act-slot-value' class so
        # those words enter the vocabulary as well
        for line in lines:
            classes = line.split('\t<=>\t')[1].strip().split(';')
            for cls in classes:
                lis = cls.strip().split('-', 2)
                if len(lis) == 3:
                    sents.append(lis[2].strip())

        for sent in sents:
            words.extend(process_sent(sent))

        counter = Counter(words)
        lis = counter.most_common()
        print('Total words num: {}'.format(len(lis)))
        # most_common() is sorted by descending count, so counting all
        # qualifying words equals the original early-break prefix count
        num = sum(1 for _, count in lis if count >= frequency)
        print('Words num with frequency >= {}: {}'.format(frequency, num))

        word2idx = {
            Constants.PAD_WORD: Constants.PAD,
            Constants.UNK_WORD: Constants.UNK,
            Constants.BOS_WORD: Constants.BOS,
            Constants.EOS_WORD: Constants.EOS
        }

        # ===========================================
        # reserved token for "don't care" slot values
        word2idx['dontcare'] = len(word2idx)
        # ===========================================

        for (word, count) in lis:
            if count >= frequency and word not in word2idx:
                word2idx[word] = len(word2idx)
        # fixed typo: 'voacb' -> 'vocab'
        print('Final vocab size: {}'.format(len(word2idx)))
        print('==========================================')
        return word2idx
Example #9
0
    def label_info(string, memory, enc_oov_list, cuda):
        """Build decoder input/target id tensors for a label string.

        The input sequence is BOS-prefixed and shaped (1, seq_len); the
        target sequence is EOS-suffixed and uses the extended (copy)
        vocabulary via *enc_oov_list*.  Both move to GPU when *cuda*.
        """
        tokens = process_sent(string)
        dec2idx = memory['dec2idx']

        # decoder input: BOS + plain vocab ids
        inp_ids = torch.tensor(
            [Constants.BOS] + value2ids(tokens, dec2idx)).view(1, -1)
        # decoder target: extended-vocab ids + EOS
        out_ids = torch.tensor(
            value2extend_ids(tokens, dec2idx, enc_oov_list) + [Constants.EOS])

        if cuda:
            inp_ids, out_ids = inp_ids.cuda(), out_ids.cuda()

        return inp_ids, out_ids
Example #10
0
def decode_utterance(model, class_string, memory, cuda, nbest):
    """Beam-decode up to *nbest* utterances from a class string.

    Runs the encoder, maps its state to the decoder, beam-searches, and
    converts id sequences back to words (copy-mechanism OOVs come from
    *oov_list*).  Returns a list of utterance strings; [''] when the
    class string tokenizes to nothing.
    """
    #class_string = process_class(class_string)
    sent_lis = process_sent(class_string)
    if len(sent_lis) == 0:
        return ['']

    data, lengths, extra_zeros, enc_batch_extend_vocab_idx, oov_list = \
            DADataset.data_info(class_string, memory, cuda)

    # Model processing
    ## encoder
    outputs, hiddens = model.encoder(data, lengths)

    # removed unused locals from the original: `s_t_1` (assigned, never read)
    # and `y_t` (built and moved to GPU but never passed to beam_search)
    s_decoder = model.enc_to_dec(hiddens)
    out_lis = beam_search(model.decoder,
                          extra_zeros,
                          enc_batch_extend_vocab_idx,
                          s_decoder,
                          outputs,
                          lengths,
                          len(memory['dec2idx']),
                          cuda,
                          nbest=nbest)

    idx2dec = memory['idx2dec']
    vocab_size = len(idx2dec)  # hoisted: was recomputed per id
    res = [[] for _ in range(len(out_lis))]
    for i, hyp in enumerate(out_lis):
        # strip BOS/EOS, then map ids back to words (OOV ids index oov_list)
        for vid in hyp[1:-1]:
            if vid < vocab_size:
                res[i].append(idx2dec[vid])
            else:
                res[i].append(oov_list[vid - vocab_size])

    # renamed from the original's reuse of `out_lis`, which clobbered the
    # beam-search result list mid-function
    utts = [' '.join(tokens) for tokens in res]
    return utts