def __init__(self,
             lm_weight=2.0,
             lexicon_path="WER_data/lexicon.txt",
             token_path="WER_data/letters.lst",
             lm_path="WER_data/4-gram.bin"):
    """Build the lexicon trie, KenLM model and decoder options.

    Orphaned module-level duplicate of ``WlDecoder.__init__`` (the class
    appears further down in this file); kept functionally identical to it.
    NOTE(review): the original fragment ended with a stray
    ``return np.frombuffer(file.read(N * N * 4), dtype=np.float32)`` that
    referenced undefined names (``file``, ``N``) — residue from a binary
    loader function — and would have raised NameError; it has been removed.

    Args:
        lm_weight: weight applied to the LM score inside the decoder.
        lexicon_path: word -> token-spelling lexicon file.
        token_path: acoustic-model token set, one token per line.
        lm_path: KenLM binary/ARPA language model file.
    """
    lexicon = load_words(lexicon_path)
    word_dict = create_word_dict(lexicon)

    self.token_dict = Dictionary(token_path)
    self.lm = KenLM(lm_path, word_dict)

    # "|" is the word-separator token; "#" is the CTC blank appended here.
    self.sil_idx = self.token_dict.get_index("|")
    self.unk_idx = word_dict.get_index("<unk>")
    self.token_dict.add_entry("#")
    self.blank_idx = self.token_dict.get_index('#')

    self.trie = Trie(self.token_dict.index_size(), self.sil_idx)
    start_state = self.lm.start(start_with_nothing=False)

    # Score each lexicon word once from the LM start state and insert
    # every spelling of that word into the trie with that score.
    for word, spellings in lexicon.items():
        usr_idx = word_dict.get_index(word)
        _, score = self.lm.score(start_state, usr_idx)
        for spelling in spellings:
            # max_reps should be 1; using 0 here to match DecoderTest bug
            spelling_idxs = tkn_to_idx(spelling,
                                       self.token_dict,
                                       max_reps=0)
            self.trie.insert(spelling_idxs, usr_idx, score)

    self.trie.smear(SmearingMode.MAX)
    self.opts = DecoderOptions(beam_size=2500,
                               beam_threshold=100.0,
                               lm_weight=lm_weight,
                               word_score=2.0,
                               unk_score=-math.inf,
                               log_add=False,
                               sil_weight=-1,
                               criterion_type=CriterionType.CTC)


def assert_near(x, y, tol):
    """Assert that *x* and *y* differ by at most *tol* (absolute difference)."""
    difference = abs(x - y)
    assert difference <= tol


# load test files
#
# NOTE(review): this section calls camelCase helpers (loadWords,
# createWordDict, Dictionary.addEntry) while the WlDecoder class below uses
# snake_case equivalents (load_words, create_word_dict, add_entry). These
# come from different wav2letter binding versions — confirm which API the
# installed bindings actually expose before running this script.

# T = number of time frames, N = number of tokens (per the sibling
# fragment's comment "load time and number of tokens") — TODO confirm
# against load_TN's implementation.
T, N = load_TN(os.path.join(testing_data_path, "TN.bin"))
# dumped acoustic-model emission scores
emissions = load_emissions(os.path.join(testing_data_path, "emission.bin"))
# token-to-token transition matrix (from ASG loss optimization)
transitions = load_transitions(
    os.path.join(testing_data_path, "transition.bin"))
# lexicon maps each word to its token spellings
lexicon = loadWords(os.path.join(testing_data_path, "words.lst"))
wordDict = createWordDict(lexicon)
tokenDict = Dictionary(os.path.join(testing_data_path, "letters.lst"))
# "1" is the ASG repetition symbol (see the matching comment further down)
tokenDict.addEntry("1")
lm = KenLM(os.path.join(testing_data_path, "lm.arpa"), wordDict)

# test LM

#sentence = ["the", "cat", "sat", "on", "the", "mat"]
#lm_state = lm.start(False)
#total_score = 0
#lm_score_target = [-1.05971, -4.19448, -3.33383, -2.76726, -1.16237, -4.64589]
#for i in range(len(sentence)):
#    lm_state, lm_score = lm.score(lm_state, wordDict.getIndex(sentence[i]))
#    assert_near(lm_score, lm_score_target[i], 1e-5)
#    total_score += lm_score
#lm_state, lm_score = lm.finish(lm_state)
#total_score += lm_score
class WlDecoder:
    """
    Wav2Letter-based decoder. Follows the official examples for the python bindings, 
    see https://github.com/facebookresearch/wav2letter/blob/master/bindings/python/examples/decoder_example.py
    """

    def __init__(self,
                 lm_weight=2.0,
                 lexicon_path="WER_data/lexicon.txt",
                 token_path="WER_data/letters.lst",
                 lm_path="WER_data/4-gram.bin"):
        """Build the lexicon trie, KenLM model and decoder options.

        Args:
            lm_weight: weight applied to the LM score inside the decoder.
            lexicon_path: word -> token-spelling lexicon file.
            token_path: acoustic-model token set, one token per line.
            lm_path: KenLM binary/ARPA language model file.
        """
        lexicon = load_words(lexicon_path)
        word_dict = create_word_dict(lexicon)

        self.token_dict = Dictionary(token_path)
        self.lm = KenLM(lm_path, word_dict)

        # "|" is the word-separator token; "#" is the CTC blank appended here.
        self.sil_idx = self.token_dict.get_index("|")
        self.unk_idx = word_dict.get_index("<unk>")
        self.token_dict.add_entry("#")
        self.blank_idx = self.token_dict.get_index('#')

        self.trie = Trie(self.token_dict.index_size(), self.sil_idx)
        start_state = self.lm.start(start_with_nothing=False)

        # Score each lexicon word once from the LM start state and insert
        # every spelling of that word into the trie with that score.
        for word, spellings in lexicon.items():
            usr_idx = word_dict.get_index(word)
            _, score = self.lm.score(start_state, usr_idx)
            for spelling in spellings:
                # max_reps should be 1; using 0 here to match DecoderTest bug
                spelling_idxs = tkn_to_idx(spelling,
                                           self.token_dict,
                                           max_reps=0)
                self.trie.insert(spelling_idxs, usr_idx, score)

        self.trie.smear(SmearingMode.MAX)
        self.opts = DecoderOptions(beam_size=2500,
                                   beam_threshold=100.0,
                                   lm_weight=lm_weight,
                                   word_score=2.0,
                                   unk_score=-math.inf,
                                   log_add=False,
                                   sil_weight=-1,
                                   criterion_type=CriterionType.CTC)

    def collapse(self, prediction):
        """CTC-collapse a token sequence.

        Merges consecutive duplicate tokens, drops blank tokens ('#') and
        maps the word separator ('|') to a space character.

        Args:
            prediction: iterable of single-character token strings.

        Returns:
            list of characters with repeats/blanks removed and '|' -> ' '.
        """
        result = []

        for p in prediction:
            if result and p == result[-1]:
                continue
            result.append(p)

        blank = '#'
        space = '|'

        # Single pass: drop blanks and turn the word separator into ' '.
        # (The original did the blank filter twice — the first pass was
        # redundant and has been removed; output is unchanged.)
        result = [(x if x != space else ' ') for x in result if x != blank]
        return result

    def predictions(self, emissions):
        """Decode an emission matrix into a collapsed token list.

        Args:
            emissions: (time, n_tokens) tensor of per-frame token scores;
                must provide ``.size()`` and ``.cpu()`` (torch-style —
                TODO confirm against callers).

        Returns:
            list of characters, CTC-collapsed, with ' ' at word boundaries.
        """
        t, n = emissions.size()

        emissions = emissions.cpu().numpy()
        decoder = WordLMDecoder(self.opts, self.trie, self.lm, self.sil_idx,
                                self.blank_idx, self.unk_idx, [])
        results = decoder.decode(emissions.ctypes.data, t, n)

        # Take the best hypothesis; negative token ids are padding.
        prediction = [
            self.token_dict.get_entry(x) for x in results[0].tokens if x >= 0
        ]
        prediction = self.collapse(prediction)

        return prediction
# Beispiel #4  (stray snippet-collection separator — not executable code)
# 0
    # load test files
    # load time and number of tokens for dumped acoustic scores
    T, N = load_tn(os.path.join(data_path, "TN.bin"))
    # load emissions [Batch=1, Time, Ntokens]
    emissions = load_emissions(os.path.join(data_path, "emission.bin"))
    # load transitions (from ASG loss optimization) [Ntokens, Ntokens]
    transitions = load_transitions(os.path.join(data_path, "transition.bin"))
    # load lexicon file, which defines spelling of words
    # the format word and its tokens spelling separated by the spaces,
    # for example for letters tokens with ASG loss:
    # ann a n 1 |
    lexicon = load_words(os.path.join(data_path, "words.lst"))
    # read lexicon and store it in the w2l dictionary
    word_dict = create_word_dict(lexicon)
    # create w2l dict with tokens set (letters in this example)
    token_dict = Dictionary(os.path.join(data_path, "letters.lst"))
    # add repetition symbol as soon as we have ASG acoustic model
    token_dict.add_entry("1")
    # create Kenlm language model
    lm = KenLM(os.path.join(data_path, "lm.arpa"), word_dict)

    # test LM
    sentence = ["the", "cat", "sat", "on", "the", "mat"]
    # start LM with nothing, get its current state
    lm_state = lm.start(False)
    total_score = 0
    lm_score_target = [
        -1.05971, -4.19448, -3.33383, -2.76726, -1.16237, -4.64589
    ]
    # iterate over words in the sentence
    for i in range(len(sentence)):