Esempio n. 1
0
 def __init__(self, wordlm, subwordlm, word_dict, subword_dict,
              subwordlm_weight=0.8, oov_penalty=1.0, open_vocab=True):
     """Store both language models and precompute vocabulary lookups.

     Args:
         wordlm: Word-level language model, kept as ``self.wordlm``.
         subwordlm: Subword-level language model, kept as
             ``self.subwordlm``.
         word_dict: Mapping that must contain the keys ``"<eos>"`` and
             ``"<unk>"``; it is also handed to ``make_lexical_tree``.
         subword_dict: Mapping that must contain the keys ``"<space>"``
             and ``"<eos>"``; it is also handed to ``make_lexical_tree``
             and its length is stored as ``self.subword_dict_size``.
         subwordlm_weight: Stored unchanged as ``self.subwordlm_weight``.
         oov_penalty: Number whose natural logarithm is stored as
             ``self.log_oov_penalty``; ``math.log`` rejects values <= 0.
         open_vocab: Stored unchanged as ``self.open_vocab``.
     """
     super(MultiLevelLM, self).__init__()

     # The two wrapped models, plus settings that are copied verbatim.
     self.wordlm = wordlm
     self.subwordlm = subwordlm
     self.subwordlm_weight = subwordlm_weight
     self.open_vocab = open_vocab
     self.normalized = True

     # Word-level special ids, kept both as plain values and as
     # one-element LongTensors.
     eos_id = word_dict["<eos>"]
     unk_id = word_dict["<unk>"]
     self.word_eos = eos_id
     self.word_unk = unk_id
     self.var_word_eos = torch.LongTensor([eos_id])
     self.var_word_unk = torch.LongTensor([unk_id])

     # Subword-level special ids.
     space_id = subword_dict["<space>"]
     subword_eos_id = subword_dict["<eos>"]
     self.space = space_id
     self.eos = subword_eos_id

     # Lexical tree built by the external helper from both dictionaries
     # and the word-level <unk> id.
     self.lexroot = make_lexical_tree(word_dict, subword_dict, unk_id)

     # The penalty is stored in the log domain.
     self.log_oov_penalty = math.log(oov_penalty)
     self.subword_dict_size = len(subword_dict)
Esempio n. 2
0
 def __init__(self, wordlm, word_dict, subword_dict,
              oov_penalty=0.0001, open_vocab=True):
     """Store the word language model and precompute vocabulary lookups.

     Args:
         wordlm: Word-level language model, kept as ``self.wordlm``.
         word_dict: Mapping that must contain the keys ``"<eos>"`` and
             ``"<unk>"``; it is also handed to ``make_lexical_tree``.
         subword_dict: Mapping that must contain the keys ``"<space>"``
             and ``"<eos>"``; it is also handed to ``make_lexical_tree``
             and its length is stored as ``self.subword_dict_size``.
         oov_penalty: Stored unchanged as ``self.oov_penalty``.
         open_vocab: Stored unchanged as ``self.open_vocab``.
     """
     super(LookAheadWordLM, self).__init__()

     # The wrapped model, plus settings that are copied verbatim.
     self.wordlm = wordlm
     self.oov_penalty = oov_penalty
     self.open_vocab = open_vocab
     self.normalized = True

     # Word-level special ids, kept both as plain values and as
     # one-element LongTensors.
     eos_id = word_dict["<eos>"]
     unk_id = word_dict["<unk>"]
     self.word_eos = eos_id
     self.word_unk = unk_id
     self.var_word_eos = torch.LongTensor([eos_id])
     self.var_word_unk = torch.LongTensor([unk_id])

     # Subword-level special ids.
     space_id = subword_dict["<space>"]
     subword_eos_id = subword_dict["<eos>"]
     self.space = space_id
     self.eos = subword_eos_id

     # Lexical tree built by the external helper from both dictionaries
     # and the word-level <unk> id.
     self.lexroot = make_lexical_tree(word_dict, subword_dict, unk_id)
     self.subword_dict_size = len(subword_dict)

     # ``self.zero`` is not set in this method; it is expected to be
     # available on the instance already (e.g. as a class attribute).
     self.zero_tensor = torch.FloatTensor([self.zero])
Esempio n. 3
0
 def __init__(
     self, wordlm, word_dict, subword_dict, oov_penalty=0.0001, open_vocab=True
 ):
     """Store the word language model and precompute vocabulary lookups.

     Args:
         wordlm: Word-level language model, kept as ``self.wordlm``.
         word_dict: Mapping that must contain the keys ``"<eos>"`` and
             ``"<unk>"``; it is also handed to ``make_lexical_tree``.
         subword_dict: Mapping that must contain the keys ``"<space>"``
             and ``"<eos>"``; it is also handed to ``make_lexical_tree``
             and its length is stored as ``self.subword_dict_size``.
         oov_penalty: Stored unchanged as ``self.oov_penalty``.
         open_vocab: Stored unchanged as ``self.open_vocab``.
     """
     super(LookAheadWordLM, self).__init__()

     # The wrapped model, plus settings that are copied verbatim.
     self.wordlm = wordlm
     self.oov_penalty = oov_penalty
     self.open_vocab = open_vocab
     self.normalized = True

     # Word-level special ids, kept both as plain values and as
     # one-element "i"-typed arrays created through ``self.xp``
     # (``self.xp`` comes from outside this method -- presumably the
     # array module supplied by the base class; confirm).
     eos_id = word_dict["<eos>"]
     unk_id = word_dict["<unk>"]
     self.word_eos = eos_id
     self.word_unk = unk_id
     self.xp_word_eos = self.xp.full(1, eos_id, "i")
     self.xp_word_unk = self.xp.full(1, unk_id, "i")

     # Subword-level special ids.
     space_id = subword_dict["<space>"]
     subword_eos_id = subword_dict["<eos>"]
     self.space = space_id
     self.eos = subword_eos_id

     # Lexical tree built by the external helper from both dictionaries
     # and the word-level <unk> id.
     self.lexroot = make_lexical_tree(word_dict, subword_dict, unk_id)
     self.subword_dict_size = len(subword_dict)
Esempio n. 4
0
    def __init__(self, vocabulary, meetingpath, charlist, bpe=False):
        """Meeting-wise KB in decoder.

        Every entry of the ``meetingpath`` directory is opened as a text
        file holding one word per line. For each file this stores the ids
        returned by ``vocabulary.get_ids`` and a lexical tree built by
        ``make_lexical_tree``. Afterwards all id lists are padded to a
        common length of ``longest list + 1`` and a matching mask is built.

        Args:
            vocabulary: Object providing ``get_idx``, ``get_ids`` and
                ``get_syms``; kept as ``self.vocab``.
            meetingpath: Directory whose entries are read one by one.
            charlist: Sequence of symbols; the position of each symbol
                becomes its value in ``self.chardict``.
            bpe: When true, each line is split on whitespace into a tuple
                of pieces; otherwise the stripped line is used as the word.
        """
        self.charlist = charlist
        self.bpe = bpe
        self.chardict = {sym: idx for idx, sym in enumerate(charlist)}

        self.meetingdict = {}
        self.meetingdict_sym = {}
        self.meetingmask = {}
        self.meetinglextree = {}

        self.maxlen = 0
        self.unkidx = vocabulary.get_idx('<unk>')
        for fname in os.listdir(meetingpath):
            entries = []
            entry_pos = {}
            with open(os.path.join(meetingpath, fname)) as handle:
                for line in handle:
                    if bpe:
                        entry = tuple(line.split())
                    else:
                        entry = line.strip()
                    entries.append(entry)
                    # 1-based position; a repeated entry keeps its last one.
                    entry_pos[entry] = len(entries)
            self.meetingdict[fname] = vocabulary.get_ids(
                entries, oov_sym='<blank>')
            self.meetinglextree[fname] = make_lexical_tree(
                entry_pos, self.chardict, -1)
            self.maxlen = max(self.maxlen, len(entries))

        # pad meeting wordlist
        for meeting, ids in self.meetingdict.items():
            self.meetingdict_sym[meeting] = vocabulary.get_syms(ids)
            n_real = len(ids)
            n_pad = self.maxlen - n_real
            # One slot more than the longest list, filled with the <unk> id.
            self.meetingdict[meeting] = ids + [self.unkidx] * (n_pad + 1)
            # Mask is 0 on real entries, 1 on padding, 0 on the final slot.
            self.meetingmask[meeting] = [0] * n_real + [1] * n_pad + [0]

        # From here on ``unkidx`` is the index of that final slot and
        # ``maxlen`` is the padded length.
        self.unkidx = self.maxlen
        self.maxlen += 1
        self.vocab = vocabulary

        char_tables = self.get_character_dict()
        (self.char_worddict, self.char_dictmask, self.charind,
         self.char_wordlist) = char_tables
Esempio n. 5
0
 def __init__(
     self,
     wordlm,
     subwordlm,
     word_dict,
     subword_dict,
     subwordlm_weight=0.8,
     oov_penalty=1.0,
     open_vocab=True,
 ):
     """Store both language models and precompute vocabulary lookups.

     Args:
         wordlm: Word-level language model, kept as ``self.wordlm``.
         subwordlm: Subword-level language model, kept as
             ``self.subwordlm``.
         word_dict: Mapping that must contain the keys ``'<eos>'`` and
             ``'<unk>'``; it is also handed to ``make_lexical_tree``.
         subword_dict: Mapping that must contain the keys ``'<space>'``
             and ``'<eos>'``; it is also handed to ``make_lexical_tree``
             and its length is stored as ``self.subword_dict_size``.
         subwordlm_weight: Stored unchanged as ``self.subwordlm_weight``.
         oov_penalty: Number whose natural logarithm is stored as
             ``self.log_oov_penalty``; ``math.log`` rejects values <= 0.
         open_vocab: Stored unchanged as ``self.open_vocab``.
     """
     super(MultiLevelLM, self).__init__()

     # The two wrapped models, plus settings that are copied verbatim.
     self.wordlm = wordlm
     self.subwordlm = subwordlm
     self.subwordlm_weight = subwordlm_weight
     self.open_vocab = open_vocab
     self.normalized = True

     # Word-level special ids, kept both as plain values and as
     # one-element 'i'-typed arrays created through ``self.xp``
     # (``self.xp`` comes from outside this method -- presumably the
     # array module supplied by the base class; confirm).
     eos_id = word_dict['<eos>']
     unk_id = word_dict['<unk>']
     self.word_eos = eos_id
     self.word_unk = unk_id
     self.xp_word_eos = self.xp.full(1, eos_id, 'i')
     self.xp_word_unk = self.xp.full(1, unk_id, 'i')

     # Subword-level special ids.
     space_id = subword_dict['<space>']
     subword_eos_id = subword_dict['<eos>']
     self.space = space_id
     self.eos = subword_eos_id

     # Lexical tree built by the external helper from both dictionaries
     # and the word-level <unk> id.
     self.lexroot = make_lexical_tree(word_dict, subword_dict, unk_id)

     # The penalty is stored in the log domain.
     self.log_oov_penalty = math.log(oov_penalty)
     self.subword_dict_size = len(subword_dict)