Example 1
def test_new_model_order():
    lm = ARPAModelSimple()
    assert lm.order() is None

    for p in PARSERS:
        lm = arpa.loadf(TEST_ARPA, parser=p)[0]
        assert lm.order() == 5
Example 2
def load_adaptation_sample(filename):
    if filename.endswith(".gz"):
        A_models = arpa.load(gzip.open(filename, mode='rt'))
    else:
        A_models = arpa.loadf(filename)
    A = A_models[0]
    return A
Example 3
def test_loadf_dumpf():
    lm = arpa.loadf(TEST_ARPA)[0]
    out = tempfile.NamedTemporaryFile(mode='w+t', delete=False)
    arpa.dumpf(lm, out.name)
    out.close()
    assert filecmp.cmp(TEST_ARPA, out.name, shallow=False)
    os.unlink(out.name)
Example 4
def get_sentence_score(sentence_indexes, models, test_set, probabilities, guesses, alpha_start, alpha_transition):
    logger = logging.getLogger('recognizer')
    # print("Alpha start {}".format(alpha_start))
    # print("Alpha transition {}".format(alpha_transition))

    top_best = 3

    lm_models = arpa.loadf("ukn.3.lm")
    lm = lm_models[0]

    emission_scores = get_emission_scores(sentence_indexes, models, test_set)

    if (alpha_start and alpha_transition):
        guess = get_viterbi_sentence(emission_scores, alpha_start, alpha_transition)
    else:
        guess = list(emission_scores.idxmax(axis = 0))

    guesses.extend(guess)

    word_probabilities = [v for k, v in emission_scores.to_dict().items()]
    probabilities.extend(word_probabilities)


    logger.debug("Guess {}".format(guess))
    logger.debug("Probability {}".format(word_probabilities))


    return emission_scores
Example 5
def test_loadf_dumpf():
    lm = arpa.loadf(TEST_ARPA)[0]
    out = tempfile.NamedTemporaryFile(mode="w+t", delete=False)
    arpa.dumpf(lm, out.name)
    out.close()
    assert filecmp.cmp(TEST_ARPA, out.name, shallow=False)
    os.unlink(out.name)
Example 6
 def __init__(self,
              labels,
              model_path=None,
              alpha=0.5,
              beta=0.5,
              cutoff_top_n=40,
              cutoff_prob=-2.1,
              beam_width=64,
              blank_id=0,
              space_id=60,
              vocab=None,
              trie_path=None):
     self.NEG_INF = -float("inf")
     self.labels = labels
     self.model_path = model_path
     self.beam_size = beam_width
     self.alpha = alpha
     self.beta = beta
     self.blank_id = blank_id
     self.cutoff_top_n = cutoff_top_n
     self.cutoff_prob = cutoff_prob
     self.vocab = vocab
     self.space_id = space_id
     self.lm = arpa.loadf(self.model_path)[0]
     self.trie_path = trie_path
     self.trie_root = CustomUnpickler(open(self.trie_path, 'rb')).load()
Example 7
 def load_realigning_LM(self):
     self.N_range = (
         self.realigning_lm_params['min_number_of_words'],
         self.realigning_lm_params['max_number_of_words'],
     )
     self.stt_end_tokens = ['</s>', '<s>']
     logging.info(f"Loading LM for realigning: {self.realigning_lm_params['arpa_language_model']}")
     return arpa.loadf(self.realigning_lm_params['arpa_language_model'])[0]
Example 8
def test_loadf_dumpf_write():
    for p in PARSERS:
        for suf in ['.arpa', '.gz']:
            # read
            lm1 = arpa.loadf(TEST_ARPA, parser=p)[0]
            # write
            out1 = tempfile.NamedTemporaryFile(mode='w+t', suffix=suf, delete=False)
            arpa.dumpf(lm1, out1.name)
            out1.close()
            # read again
            lm2 = arpa.loadf(out1.name, parser=p)[0]
            # write again
            out2 = tempfile.NamedTemporaryFile(mode='w+t', suffix='.arpa', delete=False)
            arpa.dumpf(lm2, out2.name)
            out2.close()
            # compare
            assert filecmp.cmp(TEST_ARPA, out2.name, shallow=False)
            os.unlink(out2.name)
Example 9
def test_manual_contains():
    lm = arpa.loadf(TEST_ARPA)[0]
    assert 'foo' in lm
    with pytest.raises(ValueError):
        assert ('foo', ) in lm
    with pytest.raises(ValueError):
        assert 'a little' in lm
    with pytest.raises(ValueError):
        assert ('a', 'little') in lm
Example 10
def recognize_SLM(models: dict, test_set: SinglesData):
    # recognizer with SLM
    SLMmodel = arpa.loadf("ukn.3.lm")[0]
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    Xlengths = test_set.get_all_Xlengths()
    wordlist = test_set.wordlist
    # Iterate the testing data.
    for video in test_set.sentences_index:
        # Create a word sequence of each video
        word_sequence = [
            wordlist[index] for index in test_set.sentences_index[video]
        ]
        # Enumerate words in a video
        for i, word_index in enumerate(test_set.sentences_index[video]):
            score_dict = {}
            best_word = ""
            max_score = float('-inf')
            X, lengths = Xlengths[word_index]
            prefix = ""
            """
        # for 2-gram and 3-gram
        if i > 1:
          prefix = word_sequence[i-2: i]
        elif i == 1:
          prefix = word_sequence[i - 1]
        """

            # for 2-gram
            if i > 0:
                prefix = word_sequence[i - 1]

            # Iterate all possible words in models, and calculate the log likelihood
            for key in models:
                try:
                    # strip() avoids a leading space when there is no prefix yet
                    likelihood = SLMmodel.log_p((prefix + " " + key).strip())
                except:
                    likelihood = 0

                try:
                    model = models[key]
                    logL = model.score(X, lengths) + SLMmodel.log_p(key)
                    score_dict[key] = logL

                    # store the best guess words
                    if logL > max_score:
                        max_score = logL
                        best_word = key
                except:
                    score_dict[key] = -1

            # add prob dictionary and the best guess of each word
            probabilities.append(score_dict)
            guesses.append(best_word)

    return probabilities, guesses
Example 11
def _test_log_s(sentences, sos, eos):
    lm_me = arpa.loadf(TEST_ARPA)[0]
    lm_ken = kenlm.LanguageModel(TEST_ARPA)
    results_me = []
    results_ken = []
    for sentence in sentences:
        score_me = lm_me.log_s(sentence, sos=sos, eos=eos)
        score_ken = lm_ken.score(sentence, bool(sos), bool(eos))
        results_me.append(score_me)
        results_ken.append(score_ken)
    assert all(round(m - k, 2) == 0 for m, k in zip(results_me, results_ken))
Example 12
def test_loadf_dumpf_read():
    for p in PARSERS:
        for src in [TEST_ARPA, TEST_ARPA_GZ]:
            # read
            lm = arpa.loadf(src, parser=p)[0]
            # write
            out = tempfile.NamedTemporaryFile(mode='w+t', suffix='.arpa', delete=False)
            arpa.dumpf(lm, out.name)
            out.close()
            # compare
            assert filecmp.cmp(TEST_ARPA, out.name, shallow=False)
            os.unlink(out.name)
Example 13
def _test_log_p(queries):
    lm_me = arpa.loadf(TEST_ARPA)[0]
    lm_ken = kenlm.LanguageModel(TEST_ARPA)
    results_me = []
    results_ken = []
    for ngram in queries:
        prob_me = lm_me.log_p(ngram)
        prob_ken = list(lm_ken.full_scores(' '.join(ngram), False,
                                           False))[-1][0]
        results_me.append(prob_me)
        results_ken.append(prob_ken)
    assert all(round(m - k, 4) == 0 for m, k in zip(results_me, results_ken))
Example 14
def recognize(models: dict, test_set: SinglesData):
    """ Recognize test word sequences from word models set

   :param models: dict of trained models
       {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
   :param test_set: SinglesData object
   :return: (list, list)  as probabilities, guesses
       both lists are ordered by the test set word_id
       probabilities is a list of dictionaries where each key is a word and each value is its log likelihood
           [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            ]
       guesses is a list of the best guess words ordered by the test set word_id
           ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2',...]
   """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []

    #num_cores = multiprocessing.cpu_count()

    startTime = int(round(time.time() * 1000))

    p = os.path.join('data', 'ukn.3.lm')
    language_model = arpa.loadf(p)[0]

    idx = 0
    #Parallel(n_jobs=num_cores)(process_by_sq(models, X, length, probabilities, guesses) for _, (X, length) in test_set.get_all_Xlengths().items())
    for _, (X, length) in test_set.get_all_Xlengths().items():
        probability_by_word = {}
        bare_prob = {}
        sq = [si for k, si in test_set.sentences_index.items() if idx in si]
        for word, model in models.items():
            w_score = -float(math.inf)
            try:
                w_score = model.score(X, length)
            except:
                pass
            bare_prob[word] = w_score
            probability_by_word[word] = w_score + calculate_with_LM(
                guesses, idx, language_model, sq[0], word)

        idx += 1
        best_guess = max(probability_by_word.keys(),
                         key=(lambda w: probability_by_word[w]))
        best_bare = max(bare_prob.values())
        guesses.append((best_guess, best_bare))  # store as a tuple; only the word is returned below
        probabilities.append(probability_by_word)

    endTime = int(round(time.time() * 1000))
    print(endTime - startTime, 'ms')

    return probabilities, [g[0] for g in guesses]
Example 15
def test_loadf_dumpf_write():
    for p in PARSERS:
        for suf in ['.arpa', '.gz']:
            # read
            lm1 = arpa.loadf(TEST_ARPA, parser=p)[0]
            # write
            out1 = tempfile.NamedTemporaryFile(mode='w+t',
                                               suffix=suf,
                                               delete=False)
            arpa.dumpf(lm1, out1.name)
            out1.close()
            # read again
            lm2 = arpa.loadf(out1.name, parser=p)[0]
            # write again
            out2 = tempfile.NamedTemporaryFile(mode='w+t',
                                               suffix='.arpa',
                                               delete=False)
            arpa.dumpf(lm2, out2.name)
            out2.close()
            # compare
            assert filecmp.cmp(TEST_ARPA, out2.name, shallow=False)
            os.unlink(out2.name)
Example 16
def test_loadf_dumpf_read():
    for p in PARSERS:
        for src in [TEST_ARPA, TEST_ARPA_GZ]:
            # read
            lm = arpa.loadf(src, parser=p)[0]
            # write
            out = tempfile.NamedTemporaryFile(mode='w+t',
                                              suffix='.arpa',
                                              delete=False)
            arpa.dumpf(lm, out.name)
            out.close()
            # compare
            assert filecmp.cmp(TEST_ARPA, out.name, shallow=False)
            os.unlink(out.name)
Example 17
def test_input_equality():
    lm = ARPAModelSimple()
    with pytest.raises(KeyError):
        assert lm.p('foo') == lm.p(('foo', ))
    with pytest.raises(KeyError):
        assert lm.p('xxx') == lm.p(('xxx', ))
    with pytest.raises(KeyError):
        assert lm.p('a little') == lm.p(('a', 'little'))
    with pytest.raises(KeyError):
        assert lm.p('xxx little') == lm.p(('xxx', 'little'))

    lm = arpa.loadf(TEST_ARPA)[0]
    assert lm.p('foo') == lm.p(('foo', ))
    assert lm.p('xxx') == lm.p(('xxx', ))
    assert lm.p('a little') == lm.p(('a', 'little'))
    assert lm.p('xxx little') == lm.p(('xxx', 'little'))
Example 18
def load_background(filename):
    if filename.endswith(".gz"):
        B_models = arpa.load(gzip.open(filename, mode='rt'))
    else:
        B_models = arpa.loadf(filename)
    B = B_models[0]  # ARPA files may contain several models.

    # We can recover f_B_star (i.e., discounted probabilities) from interpolated probabilities
    # As B is an interpolated model, i.e., p_B(w|h) = f_B_star(w|h) + bow_B(h) * p_B(w|h')
    # Thus,
    #
    #    f_B_star(w|h) = p_B(w|h) - bow_B(h) * p_B(w|h')
    #
    # where h' = h[1:]
    f_B_star = dict()
    for n in range(2, B.order() + 1):
        print("%d-gram" % n)
        # progress_count = 0
        for e in B._entries(n):  # entry format: (log10(prob), hw, log10(bow))
            hw = e[1]
            h = hw[:-1]
            h_prime_w = hw[1:]
            f_B_star[hw] = B._base**float(e[0]) - B._base**(
                float(B._bos[h]) + float(log_p(B, h_prime_w)))
            # assert f_B_star[hw] >= 0

            # progress_count += 1
            # if progress_count % 2000 == 0:
            #     print(progress_count)

    # Index structure:
    # len(h) --> h --> {w | hw is seen in the corpus}, where len(h) >= 1
    B_hist_index = [defaultdict(list) for i in range(B.order())]
    for n in range(2, B.order() + 1):
        print("%d-gram" % n)
        # progress_count = 0
        for e in B._entries(n):
            hw = e[1]
            h = hw[:-1]
            w = hw[-1]
            B_hist_index[len(h)][h].append(w)

            # progress_count += 1
            # if progress_count % 2000 == 0:
            #     print(progress_count)

    return B, f_B_star, B_hist_index
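The comment block inside load_background derives f_B_star(w|h) = p_B(w|h) - bow_B(h) * p_B(w|h'). A toy numeric check of that identity with made-up values (not taken from any real model):

# Toy check of f*_B(w|h) = p_B(w|h) - bow_B(h) * p_B(w|h'); all values are invented.
p_w_given_h = 0.20         # interpolated probability p_B(w|h)
bow_h = 0.35               # backoff weight bow_B(h)
p_w_given_h_prime = 0.10   # lower-order probability p_B(w|h')
f_star = p_w_given_h - bow_h * p_w_given_h_prime
print(f_star)  # 0.165, the discounted (non-interpolated) probability mass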
Example 19
    def __init__(self, ngram, vocab=None, base=np.exp(1)):
        self.lm = arpa.loadf(ngram)[0]
        self.base = base
        with open(ngram) as f:
            for line in f:
                if line.startswith("\\1-grams:"):
                    break
                if line.startswith("ngram "):
                    self.n = int(line.replace("ngram ", "").split("=")[0])
        self.context = ["<s>"]
        self.vocab = None
        is_first_line = True
        is_subword_nmt = True
        if vocab:
            self.vocab = list()
            with open(vocab) as f:
                for line in f:
                    w = line.strip()
                    if is_first_line:
                        is_first_line = False
                        if w == "{":
                            is_subword_nmt = True
                            continue
                        else:
                            is_subword_nmt = False

                    if is_subword_nmt:
                        if w == "}":
                            break
                        else:
                            p = w.split(": ")
                            v = p[0][1:len(p[0]) - 1]
                            i = int(p[1][0:len(p[1]) - 1])
                            if v == "<s>":
                                continue
                            else:
                                self.vocab.append(v)
                                if self.vocab[i] != v:
                                    print(
                                        "Wrong word index!! index: %d vocab in the file: %s vocab in the list: %s"
                                        % (i, v, self.vocab[i]))
                    else:
                        self.vocab.append(
                            w.split("\t")[0].split(" ")[0].strip())
            print("%d vocabs were loaded for shallow fusion w/ arpa" %
                  len(self.vocab))
Example 20
def error_detector(lmAdaptPath, sentence, threshold):
    # Reading input language model.
    models = arpa.loadf(lmAdaptPath)
    # ARPA files may contain several models.
    lm = models[0]
    words = sentence.split()
    scores = dict(zip(words, [0] * len(words)))
    n_grams = list(ngrams(words, 3))
    for n_gram in n_grams:
        prop = lm.p(n_gram)
        if prop < threshold:
            for word in n_gram:
                scores[word] += 1
    sent_errors = ['0']
    for n_gram in n_grams:
        if scores[n_gram[1]] > 1:
            sent_errors.append('1')
        else:
            sent_errors.append('0')
    sent_errors.append('0')
    return " ".join(sent_errors)
Example 21
import arpa
import re
import pickle
import pandas as pd
import itertools
from functools import reduce

import numpy as np

from asl_data import SinglesData
from asl_utils import show_errors

ukn3 = arpa.loadf("lm/ukn.3.lm")
lm = ukn3[0]

probabilities = pickle.load(open("data/probabilities.pkl", "rb"))
test_set = pickle.load(open("data/test_set.pkl", "rb"))
df_probs = pd.DataFrame(probabilities)
# print(df_prob.head())
lm_factor = 20.0


def score_with_lm1():
    for video_num, indices in test_set.sentences_index.items():
        #     visual_model_guesses = df_probs.iloc[indices,:].idxmax(axis=1)
        ngram_indices = []
        for sentence_idx, word_idx in enumerate(indices):
            if ngram_indices:
                ngram_prefix = df_probs.iloc[ngram_indices, :].idxmax(
                    axis=1).tolist()
                row = df_probs.iloc[word_idx, :]
                          help="If set, save errors in pickle format",
                          action='store_true')

    args = parser.parse_args()
    input = args.input
    lm = args.lm
    n = args.n
    threshold = args.threshold
    print_words = args.print_words
    save = args.save

    # Get sentences of asr output.
    sentences = get_hypothesis(input, True)

    # Reading input language model.
    models = arpa.loadf(lm)
    # ARPA files may contain several models.
    lm = models[0]
    # For each sentence, find words that have low probability
    # and keep a score of them.
    errors = []
    for sent in sentences:
        words = sent.split()
        scores = dict(zip(words, [0] * len(words)))
        n_grams = list(ngrams(words, n))
        for n_gram in n_grams:
            prop = lm.p(n_gram)
            if prop < threshold:
                for word in n_gram:
                    scores[word] += 1
        sent_errors = [0]
Example 23
#!/usr/bin/env python3

import sys
import arpa
import os
import math
import numpy as np

lmfile = sys.argv[1]

print("lmfile=", lmfile)

lms = arpa.loadf(lmfile)
lm = lms[0]


def log_p(B, ngram):
    # words = B._check_input(ngram)
    # if B._unk:
    #     words = B._replace_unks(words)
    # return log_p_raw(B, words)
    return log_p_raw(B, ngram)


def log_p_raw(B, ngram):
    ret = B._ps.get(ngram, None)
    if ret is not None:
        return ret
    else:
        # if len(ngram) == 1:
        #     raise KeyError
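The snippet above is cut off before log_p_raw finishes. A minimal sketch of how a standard ARPA backoff lookup typically continues, reusing the private _ps and _bos maps that also appear in the load_background example; the helper name log_p_backoff, the recursion, and the 0.0 default are assumptions, not the original code:

def log_p_backoff(B, ngram):
    # Return the stored log10 probability if the n-gram is listed explicitly.
    ret = B._ps.get(ngram, None)
    if ret is not None:
        return ret
    if len(ngram) == 1:
        # Unknown unigram: nothing left to back off to.
        raise KeyError(ngram)
    # Back off: log p(w|h) = log bow(h) + log p(w|h[1:]);
    # a missing backoff weight counts as 0 in log space.
    log_bo = B._bos.get(ngram[:-1], 0.0)
    return float(log_bo) + log_p_backoff(B, ngram[1:])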
Example 24
                    best_sentence = s
            except:
                continue
        if best_sentence is not None:
            sentence_guesses[video_num] = best_sentence

    errors = 0
    for video_num in sentence_guesses:
        correct_sentence = [
            test_set.wordlist[i] for i in test_set.sentences_index[video_num]
        ]
        recognised_sentence = sentence_guesses[video_num]
        for c, r in zip(correct_sentence, list(recognised_sentence)):
            if c != r:
                errors += 1
        # print('Correct {}'.format(correct_sentence))
        # print('Recognised {}'.format(recognised_sentence))
        # print()
    print(float(errors) / float(178))


if __name__ == '__main__':
    # use n-gram
    models = train_all_words(features_custom,
                             all_model_selectors['SelectorBIC'])
    test_set = asl.build_test(features_custom)
    # load 3-gram language model
    lm_models = arpa.loadf(os.path.join('data', 'n-grams', 'ukn.3.lm'))
    lm = lm_models[0]
    recognize_ngram(lm, models, test_set)
Example 25
def recognize_ngram(models: dict, test_set: SinglesData):
    """ Recognize test word sequences from word models set

   :param models: dict of trained models
       {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
   :param test_set: SinglesData object
   :return: (list, list)  as probabilities, guesses
       both lists are ordered by the test set word_id
       probabilities is a list of dictionaries where each key is a word and each value is its log likelihood
           [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            ]
       guesses is a list of the best guess words ordered by the test set word_id
           ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2',...]
   """
    # This import is necessary to be able to read language models in ARPA format;
    # the package can be installed with:
    # pip install arpa
    import arpa
    import itertools
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # TODO implement the recognizer
    probabilities = []
    guesses = []
    probabilities_dict = {}
    guesses_dict = {}

    #load the language model
    lm_models = arpa.loadf('lm3_sri.lm')
    lm = lm_models[0]  # ARPA files may contain several models.

    #for word_id in range(0, len(test_set.get_all_Xlengths())):
    #    probabilities_dict[word_id] = 'None'
    #    guesses_dict[word_id] = 'None'

    for video_index in test_set._load_sentence_word_indices():
        word_ids = test_set._load_sentence_word_indices()[video_index]
        video_probs = collections.OrderedDict()
        for word_id in word_ids:

            current_sequence = test_set.get_item_sequences(word_id)
            current_length = test_set.get_item_Xlengths(word_id)
            probs = {}
            for word, model in models.items():
                try:
                    probs[word] = model.score(current_sequence[0],
                                              current_length[1])
                except:
                    print('failed for word_id {} and word: {}'.format(
                        word_id, word))
                    probs[word] = float('-inf')

            if len(word_ids) > 5:
                top_words = sorted(probs, key=probs.get, reverse=True)[:3]
            elif len(word_ids) == 5:
                top_words = sorted(probs, key=probs.get, reverse=True)[:4]
            elif len(word_ids) < 5:
                top_words = sorted(probs, key=probs.get, reverse=True)[:6]

            probabilities_dict[word_id] = probs
            video_probs[word_id] = {x: probs[x] for x in top_words}

        sentences = list(itertools.product(*video_probs.values()))
        sentences_prob = []

        for sentence_index in range(len(sentences)):
            sentence = sentences[sentence_index]
            visual_prob = 0
            word_index = 0
            for word_id in word_ids:
                word_id_probs = video_probs[word_id]
                visual_prob = visual_prob + word_id_probs[sentence[word_index]]
                word_index = word_index + 1

            sentence_string = ''
            for word in sentence:
                sentence_string = sentence_string + ' ' + word
            try:
                language_prob = lm.log_s(sentence_string.strip())
                alpha = 1
                beta = 25
                sentence_prob = alpha * visual_prob + beta * language_prob
                sentences_prob.append(sentence_prob)
                print(language_prob)
            except:
                print('no language model score for sentence: {}'.format(
                    sentence_string.strip()))
                sentences_prob.append(float('-inf'))

        #find the sentence with the highest prob then extract word_ids
        max_sentence = sentences[sentences_prob.index(max(sentences_prob))]

        word_index = 0
        for word_id in word_ids:
            guesses_dict[word_id] = max_sentence[word_index]
            word_index = word_index + 1

    for key in sorted(guesses_dict):
        probabilities.append(probabilities_dict[key])
        guesses.append(guesses_dict[key])

    return probabilities, guesses
Example 26
def recognize_unigram(models: dict, test_set: SinglesData,
                      lm_scaling_factor: int):
    """ Recognize test word sequences from word models set

   :param models: dict of trained models
       {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
   :param test_set: SinglesData object
   :param lm_scaling_factor: int
        multiply the language model probability by this value so it is on a scale closer to the HMM log likelihood
   :return: (list, list)  as probabilities, guesses
       both lists are ordered by the test set word_id
       probabilities is a list of dictionaries where each key is a word and each value is its log likelihood
           [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            ]
       guesses is a list of the best guess words ordered by the test set word_id
           ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2',...]
   """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []

    # read in the language model from the ukn.1.lm arpa file provided
    # note: lm stands for language model

    try:
        language_models = arpa.loadf('./data/ukn.1.lm')
        lm = language_models[0]  # ARPA files may contain several models.
    except:
        print("Problem reading the language model from the ARPA file")
        raise

    # implement the recognizer

    # print("Total Length: {}".format(test_set.num_items))
    for video_num in test_set.sentences_index:
        for word_id in test_set.sentences_index[video_num]:
            log_ls = {}  # dict of log liklihoods of a word
            best_score = float("-inf")  # best log_l thus far
            best_guess = None  # best guess for what the word from the test set could be
            x, lengths = test_set.get_item_Xlengths(word_id)

            for word, model in models.items():

                try:
                    # Assumes a HMM model
                    log_ls[word] = model.score(x, lengths)
                except:
                    # Unable to process word with this model
                    log_ls[word] = float("-inf")
                else:
                    # Remove a trailing digit from word if it has one before passing to language model
                    word_key = ''.join(
                        word[:-1] if word[-1].isdigit() else word)
                    log_ls[word] = log_ls[word] + lm_scaling_factor * lm.log_p(
                        word_key)

                if log_ls[word] > best_score:
                    best_score = log_ls[word]
                    best_guess = word
                    # print("New Best Guess for {}: {}".format(word_id, best_guess))

            probabilities.append(log_ls)
            guesses.append(best_guess)

    return probabilities, guesses
Example 27
def recognize(models: dict, test_set: SinglesData):
    """ Recognize test word sequences from word models set

   :param models: dict of trained models
       {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
   :param test_set: SinglesData object
   :return: (list, list)  as probabilities, guesses
       both lists are ordered by the test set word_id
       probabilities is a list of dictionaries where each key is a word and each value is its log likelihood
           [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            ]
       guesses is a list of the best guess words ordered by the test set word_id
           ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2',...]
   """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    n_gram = 3
    lsm = arpa.loadf("slm/devel-lm-M{}.sri.lm".format(n_gram))
    lm = lsm[0]
    # TODO implement the recognizer

    # for each sentence:
    #   for each word in the sentence:
    #      probability = {}
    #      for each trained word model:
    #           score the guess word with the model
    #           look up in the SLM the logP of the word given the preceding guesses
    #           total_score = K * logP + logL
    #           probability[train-word] = total_score
    #
    #      # find max total_score
    #      import operator
    #      guess_word = max(probability.items(), key=operator.itemgetter(1))[0]
    #      guesses.append(guess_word)
    #      probabilities.append(probability)

    K = 50
    for test_X, test_Xlength in test_set.get_all_Xlengths().values():

        probability = {}
        for word, model in models.items():
            # calculate the scores for each model(word) and update the 'probabilities' list.
            try:
                logL = model.score(test_X, test_Xlength)
                if not guesses:
                    logP = lm.log_p('<s>')
                    score_start = K * logP + logL
                    logP = lm.log_p('</s>')
                    score_end = K * logP + logL
                    if score_end > score_start:
                        logP = lm.log_p('</s>')
                    else:
                        logP = lm.log_p('<s>')
                else:
                    w = get_adjecent_words(guesses, n_gram)
                    logP = lm.log_p(w)

                probability[word] = K * logP + logL
            except:
                probability[word] = float("-inf")
                pass

        import operator
        guess_word = max(probability.items(), key=operator.itemgetter(1))[0]

        guesses.append(guess_word)
        probabilities.append(probability)

    return probabilities, guesses
Example 28
try:
    import arpa
except:
    pass
import math  # needed for -float(math.inf) below
import os
import pandas


def process_by_sq(models, X, length, probabilities, guesses):
    probability_by_word = {}

    for word, model in models.items():
        try:
            w_score = model.score(X, length)
            probability_by_word[word] = w_score
        except Exception:
            probability_by_word[word] = -float(math.inf)

    best_guess = max(probability_by_word.keys(),
                     key=(lambda w: probability_by_word[w]))
    guesses.append(best_guess)
    probabilities.append(probability_by_word)


if __name__ == '__main__':

    #df_probs = pandas.DataFrame(data={'col1': 4, 'col2': 4}, index=3)
    p = os.path.join('data', 'ukn.3.lm')
    models = arpa.loadf(p)

    print('d')
Example 29
from asl_data import SinglesData
from asl_utils import show_errors

from string import digits
import pickle
import arpa
import itertools

LM_SCALE = 150

lm1 = arpa.loadf("ukn.1.lm")[0]
lm2 = arpa.loadf("ukn.2.lm")[0]
lm3 = arpa.loadf("ukn.3.lm")[0]

exceptional_words = {
    'SAY-P': 'SAY',
    'IX-P': 'IX',
}

with open('probabilities.pickle', 'rb') as file:
    gm = pickle.load(file)  # gesture model

with open('test_set.pickle', 'rb') as file:
    test_set = pickle.load(file)


def clean_word(word):
    w = word[:-1] if word[-1].isdigit() else word
    return exceptional_words.get(w, w)

Example 30
import arpa

models = arpa.loadf(
    "/Users/huangruizhe/Downloads/PycharmProjects/lm_adapt/data/c5-end.arpa")
lm = models[0]  # ARPA files may contain several models.

# n-gram probability of the last token given the preceding context, e.g. p(5 | 4 9)
print(lm.p("4 9 5"))
print(lm.log_p("4 9 5"))

# sentence score w/ sentence markers
print(lm.s("4 9 3 4 7 5 7"))
print(lm.log_s("4 9 3 4 7 5 7"))

# sentence score w/o sentence markers
# print(lm.s("4 9 3 4 7 5 7", sos=False, eos=False))
print(lm.log_s("4 9 3 4 7 5 7", sos=False, eos=False))

# entries of order n, e.g. (-0.4317983, ('3', '4'), 0.3461446)
# ref: python-arpa/arpa/models/simple.py
print([e for e in lm._entries(2)])

# vocabularies
print([v for v in lm.vocabulary()])
Example 31
def test_manual_p():
    lm = arpa.loadf(TEST_ARPA)[0]
    assert round(lm.p('<s>'), 4) == 0
Example 32
def test_manual_log_p_unk():
    lm = arpa.loadf(TEST_ARPA)[0]
    assert lm.log_p('UnladenSwallow') == -1.995635
Example 33
def one_gram(df_prob):
    # TODO 1-gram lm
    lm = arpa.loadf(os.path.join("data", "ukn.1.lm"))
    print(lm[0].s("JOHN WRITE HOMEWORK"))
    print(lm[0].log_s("JOHN WRITE HOMEWORK"))
Example 34
    sm = sum(features_success_rates.values())
    features_weights = [(k, v / sm)
                        for (k, v) in features_success_rates.items()]
    pickle.dump(
        {
            "features_probs": features_probs,
            "features_weights": features_weights
        }, open("data/feature_models_data.pkl", "wb"))

    ensemble_guess(features_probs, features_weights, test_set)

    #[avg_sequences_probs([ground,norm,polar,delta]) for (ground,norm,polar,delta) in list(zip(*model_probs.values()))]


language_model = arpa.loadf("lm/ukn.3.lm")[0]


def ensemble_guess(features_probs=None, features_weights=None, test_set=None):
    if features_probs is None or features_weights is None:
        l = pickle.load(open("data/feature_models_data.pkl", "rb"))
        features_probs = l["features_probs"]
        features_weights = l["features_weights"]
        print("feature models data loaded")

    if test_set is None:
        test_set = asl.build_test(features_ground)

    features_weights = dict(features_weights)
    ensemble_probabilities = [
        merge_seq_dicts(ground, norm, polar, delta, custom, features_weights)
Example 35
def get_viterbi_sentence(scores, alpha_start = 1, alpha_transition = 1):
    logger = logging.getLogger('recognizer')

    top = 5
    min_score = 1e6 * (-1)

    lm_models = arpa.loadf("ukn.3.lm")
    lm = lm_models[0]

    states_num, observations_num = scores.shape
    states = list(scores.index)
    observations = list(scores.columns.values)

    viterbi = pd.DataFrame(index = states, columns = observations)
    backpointers = pd.DataFrame(index = states, columns = observations)

    step_0 = 0
    # Initialization step 0
    for state in states:
        emission_score = scores.get_value(state, observations[step_0])
        sentence = ['<s>']
        sentence.append(state)
        transition_score = get_n_gram_score(sentence, n_gram_model = lm, n_gram = 3)
        viterbi.set_value(state, observations[step_0], emission_score + alpha_start * transition_score)
        backpointers.set_value(state, observations[step_0], 0)

    # Recursion
    for observation in range(1, len(observations)):
        logger.debug("Observation {}".format(observation))
        # Get the last top states from previous step
        top_states = list(scores.sort_values(by = observations[observation - 1], ascending = False)[0:top].index)
        for state in states:
            # Get the emission score for the current step
            emission_score = scores.get_value(state, observations[observation])
            # Track the best previous emission score + transition score over the top states
            best_score = min_score
            best_state = None
            for top_state in top_states:
                sentence = []
                sentence.append(top_state)
                sentence.append(state)
                # Get the transition score
                transition_score = get_n_gram_score(sentence, n_gram_model = lm, n_gram = 3)
                # Get the previous emission score
                emission_score_previous = scores.get_value(top_state, observations[observation - 1])
                # middle_score = alpha_transition * transition_score + emission_score + emission_score_previous
                middle_score = alpha_transition * transition_score + emission_score_previous
                # print("Middle score {}".format(middle_score))
                # Update the max score
                if middle_score > best_score:
                    best_score = middle_score
                    best_state = top_state
                    # print("Best score {}". format(best_score))
            state_score = best_score + emission_score
            viterbi.set_value(state, observations[observation], state_score)
            backpointers.set_value(state, observations[observation], best_state)

    # Termination
    # steps = len(observations)
    # last_state = list(viterbi.sort_values(by = observations[steps - 1], ascending = False)[0:1].index)
    # viterbi_sentence = []
    # viterbi_sentence.extend(last_state)

    # for observation in range(steps - 1, 0, -1):
    #     viterbi_sentence.append(backpointers.get_value(viterbi_sentence[steps - 1 - observation], observations[observation]))
    # return list(reversed(viterbi_sentence))
    #return viterbi

    viterbi_sentence = list(viterbi.idxmax(axis = 0))
    return viterbi_sentence
Example 36
def recognize_ngram(models: dict, test_set: SinglesData, probs, BIC_guesses):
    """ Recognize test word sequences from word models set

   :param models: dict of trained models
       {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
   :param test_set: SinglesData object
   :return: (list, list)  as probabilities, guesses
       both lists are ordered by the test set word_id
       probabilities is a list of dictionaries where each key is a word and each value is its log likelihood
           [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
            ]
       guesses is a list of the best guess words ordered by the test set word_id
           ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2',...]
   """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []

    model = arpa.loadf("devel-lm-M3.sri.lm")
    lm = model[0]  # ARPA files may contain several models.
    # TODO implement the recognizer
    # return probabilities, guesses
    test_sequences = list(test_set.get_all_Xlengths().values())
    word_keys = list(test_set.get_all_Xlengths().keys())
    i = -1
    for sentence in test_set.sentences_index.values():
        f = {}
        maxs = float("-inf")
        prob = []
        words = []

        sentenceLength = 0
        for word_index in sentence:
            i += 1
            word = test_set.wordlist[word_index]
            sentenceLength += 1
            try:
                f[word] = probs[word][i]
            except:
                f[word] = float("-inf")
            prob.append(
                f[word]
            )  ## These are Just the probabilities unchanged from the BIC recognizer.

        # Find the six most probable words and generate the possible permutations
        sixwords = sorted(f, key=f.get, reverse=True)[:6]
        for k in permutations(sixwords, r=sentenceLength):
            l = 0
            for j in range(len(k)):
                l += f[k[j]]
            try:
                sentenceLP = l + 13 * lm.log_s(
                    " ".join(k)
                )  ## According to one student in the forum 13 is the best hyperparameter
                if sentenceLP > maxs:  ## https://discussions.udacity.com/t/slm-data-for-this-asl-dataset/230822/8?u=spiros
                    sentence = " ".join(k)
                    maxs = sentenceLP
                    words = list(k)
            except:
                pass

        if (words == []):
            words = BIC_guesses[len(guesses):len(guesses) +
                                sentenceLength]  ## Fall back to BIC guesses
        probabilities.append(prob)
        guesses += words
    return (probabilities, guesses)
Example 37
            subs][:args.dataset_limit_train]

    model_info.max_dial_len = hcn_utils.get_feature_size_from_data(
        subsets_file_lists[subs][0], feat_sep=True)

if args.states:
    dialogue_settings = {}
    for cond in ['user_plan', 'system_plan']:
        dialogue_settings[cond] = dialogue_utils.dialogue(
            nlg_utils.NLG(args.states, cond), rig_db)
else:
    dialogue_settings = {}

if args.dact_lm:
    logger.debug('Loading DAct language model')
    dact_lm = arpa.loadf(args.dact_lm)[0]
    if DEBUG:
        input(dact_lm)
else:
    dact_lm = None

if model_info.max_dial_len == 0:
    logger.error('No dialogue found')
    sys.exit()

logger.info('Loading actions set')
try:
    with open(os.path.join(task_dir, 'orca_action_set.txt'), 'r') as lfp:
        available_actions = pickle.load(lfp)
except:
    with open(os.path.join(task_dir, 'orca_action_set.txt'), 'r') as lfp:
Example 38
from utils.word_to_characters import lexicon_dic
import arpa
import re

# from pynlpl.lm import lm
with open('mydata/data/local/lm/phones.txt', 'r') as f:
    alphabet = []
    for char in f:
        alphabet.append(char[0])

NEG_INF = -float("inf")
lexicon_dict = lexicon_dic()
print("dict length : {}".format(len(lexicon_dict)))
#load the language models
lm_models = arpa.loadf(
    "/home/emekonnen/mydata/E2E-ASR-pytorch/mydata/data/local/lm/3-gram.pruned.3e-7.arpa"
)
#lm_models = lm.ARPALanguageModel("/home/emekonnen/mydata/E2E-ASR-pytorch/mydata/data/local/lm/3-gram.arpa")

lm = lm_models[0]


def compute_probs(trigrams):
    total_probs = 0
    for tri in trigrams:
        try:
            total_probs += lm.log_p(" ".join(tri))
        except KeyError:
            pass
    return total_probs
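A minimal usage sketch for compute_probs; the symbol sequence below is hypothetical and assumes its tokens occur in the loaded 3-gram LM:

# Hypothetical usage: build trigrams from a decoded symbol sequence and sum their log probabilities.
symbols = "a b c d e".split()
trigrams = [tuple(symbols[i:i + 3]) for i in range(len(symbols) - 2)]
print(compute_probs(trigrams))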