예제 #1
0
def recognize(models: dict, test_set: SinglesData):
    """Recognize test word sequences from a set of trained word models.

    Every test item is scored against every model; the word whose model gives
    the highest log likelihood becomes the guess for that item.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is the log likelihood of the item under that word's model
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             ]
        a model that fails to score an item is recorded as float('-inf')
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
        a guess is None when no model produced a score above -inf
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    for item in range(len(test_set.get_all_sequences())):
        X, lengths = test_set.get_item_Xlengths(item)
        log_likelihoods = {}
        best_word = None
        best_score = float('-inf')

        for word, model in models.items():
            try:
                score = model.score(X, lengths)
            except Exception:
                # A model that cannot score this item must never win the
                # comparison, so record the worst possible log likelihood and
                # skip the comparison instead of reusing a previous score.
                log_likelihoods[word] = float('-inf')
                continue

            log_likelihoods[word] = score
            if score > best_score:
                best_score = score
                best_word = word

        probabilities.append(log_likelihoods)
        guesses.append(best_word)

    return probabilities, guesses
예제 #2
0
def recognize(models: dict, test_set: SinglesData):
    """Guess the word for each test item from a set of trained word models.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is that word's log likelihood for the item
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             ]
        a word whose model fails to score is stored as float('-inf')
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    probabilities, guesses = [], []
    failed_score = float('-inf')
    item_count = len(test_set.get_all_Xlengths())

    for word_id in range(item_count):
        item_sequences = test_set.get_item_sequences(word_id)
        item_xlengths = test_set.get_item_Xlengths(word_id)

        log_likelihoods = {}
        for word in models:
            model = models[word]
            try:
                # First stored sequence is the observation matrix; element 1
                # of the Xlengths pair carries the sequence lengths.
                log_l = model.score(item_sequences[0], item_xlengths[1])
            except:
                print('failed for word_id {} and word: {}'.format(
                    word_id, word))
                log_l = failed_score
            log_likelihoods[word] = log_l

        # The first word reaching the maximum wins, in model order.
        top_word, _ = max(log_likelihoods.items(), key=lambda pair: pair[1])

        probabilities.append(log_likelihoods)
        guesses.append(top_word)

    return probabilities, guesses
예제 #3
0
def recognize(models: dict, test_set: SinglesData):
    """Recognize test word sequences from a set of trained word models.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is that word's log likelihood for the item; words whose
        model could not score the item are left out of the dictionary
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             ]
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
        a guess is None when no model could score the item
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []

    for item in range(test_set.num_items):
        X, lengths = test_set.get_item_Xlengths(item)
        log_likelihoods = {}
        for word, model in models.items():
            try:
                log_likelihoods[word] = model.score(X, lengths)
            except Exception:
                # Models that cannot score this item are simply omitted.
                continue

        # max() raises ValueError on an empty mapping, which happens when
        # every model failed (or no models were given); report "no guess".
        if log_likelihoods:
            guess_word = max(log_likelihoods, key=log_likelihoods.get)
        else:
            guess_word = None

        probabilities.append(log_likelihoods)
        guesses.append(guess_word)

    return probabilities, guesses
        
예제 #4
0
def recognize(models: dict, test_set: SinglesData):
    """Pick the most likely word for each test item using trained word models.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is that word's log likelihood for the item
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             ]
        a word whose model fails to score is stored as -inf
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
        on equal scores the word that comes later in `models` is chosen
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    worst = -float('INF')
    all_scores = []
    chosen_words = []
    total_items = len(test_set.get_all_Xlengths())

    for idx in range(total_items):
        # The sequences are fetched but not otherwise used.
        _item_sequences = test_set.get_item_sequences(idx)
        features, seq_lengths = test_set.get_item_Xlengths(idx)

        # Pass 1: score every model, falling back to -inf on any failure.
        item_scores = {}
        for candidate in models:
            try:
                value = models[candidate].score(features, seq_lengths)
            except:
                value = worst
            item_scores[candidate] = value

        # Pass 2: keep the latest candidate that is at least as good as the
        # current leader, so ties resolve to the later word.
        leader, leader_score = None, worst
        for candidate, value in item_scores.items():
            if value >= leader_score:
                leader, leader_score = candidate, value

        all_scores.append(item_scores)
        chosen_words.append(leader)

    return (all_scores, chosen_words)
예제 #5
0
def recognize(models: dict, test_set: SinglesData):
    """Recognize test word sequences from a set of trained word models.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is that word's log likelihood for the item
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             ]
        a word whose model fails to score is stored as float("-inf")
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
        a guess is the empty string when no model scored above -inf
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []

    for item in range(test_set.num_items):
        # Fetch the item once and hand the model an explicit (X, lengths)
        # pair; unpacking the raw sequence list into score() would pass any
        # second sequence in the `lengths` position.
        X, lengths = test_set.get_item_Xlengths(item)

        probs = {}
        best_prob = float("-inf")
        best_guess = ""
        for model_name, model in models.items():
            try:
                score = model.score(X, lengths)
            except Exception:
                # A model that cannot score the item can never be the guess.
                probs[model_name] = float("-inf")
                continue

            probs[model_name] = score
            if score > best_prob:
                best_prob = score
                best_guess = model_name

        probabilities.append(probs)
        guesses.append(best_guess)

    return (probabilities, guesses)
예제 #6
0
def recognize(models: dict, test_set: SinglesData):
    """Recognize test word sequences from a set of trained word models.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is that word's log likelihood for the item; words whose
        model raised ValueError while scoring are left out of the dictionary
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             ]
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
        a guess is None when no model produced a score above -inf
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    for item in range(test_set.num_items):
        X, lengths = test_set.get_item_Xlengths(item)
        log_likelihoods = {}
        # Start from -inf rather than a large negative constant so that an
        # arbitrarily low (but finite) log likelihood can still be selected.
        best_score = float('-inf')
        best_word = None
        for word, model in models.items():
            try:
                logL = model.score(X, lengths)
            except ValueError:
                # This model cannot score the item; leave it out.
                continue
            log_likelihoods[word] = logL
            if logL > best_score:
                best_score = logL
                best_word = word
        probabilities.append(log_likelihoods)
        guesses.append(best_word)
    return (probabilities, guesses)
예제 #7
0
def recognize_ngram(models: dict, test_set: SinglesData):
    """Recognize test word sequences using word models plus a language model.

    For every sentence of the test set, each word item is scored against all
    word models and only its highest-scoring candidate words are kept. Every
    combination of those candidates is then treated as a candidate sentence
    and ranked by ``alpha * visual_score + beta * language_score``, where the
    visual score is the sum of the kept word log likelihoods and the language
    score comes from the ARPA language model in ``lm3_sri.lm``. The words of
    the best-ranked sentence become the guesses.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is that word's log likelihood for the item
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             ]
        a word whose model fails to score is stored as float('-inf')
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    # This import is necessary to read language models stored in ARPA files.
    # It can be installed with:
    # pip install arpa
    import arpa
    import itertools
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Weights of the visual (word model) and language-model log scores.
    alpha = 1
    beta = 25

    probabilities_dict = {}
    guesses_dict = {}

    # load the language model
    lm_models = arpa.loadf('lm3_sri.lm')
    lm = lm_models[0]  # ARPA files may contain several models.

    # Build the sentence index once instead of rebuilding it per sentence.
    sentence_word_indices = test_set._load_sentence_word_indices()

    for video_index in sentence_word_indices:
        word_ids = sentence_word_indices[video_index]

        # Longer sentences keep fewer candidates per word (presumably to keep
        # the number of candidate sentences manageable).
        if len(word_ids) > 5:
            candidates_per_word = 3
        elif len(word_ids) == 5:
            candidates_per_word = 4
        else:
            candidates_per_word = 6

        video_probs = collections.OrderedDict()
        for word_id in word_ids:
            current_sequence = test_set.get_item_sequences(word_id)
            current_length = test_set.get_item_Xlengths(word_id)
            probs = {}
            for word, model in models.items():
                try:
                    probs[word] = model.score(current_sequence[0],
                                              current_length[1])
                except Exception:
                    print('failed for word_id {} and word: {}'.format(
                        word_id, word))
                    probs[word] = float('-inf')

            top_words = sorted(probs, key=probs.get,
                               reverse=True)[:candidates_per_word]
            probabilities_dict[word_id] = probs
            video_probs[word_id] = {x: probs[x] for x in top_words}

        # Each candidate sentence picks one kept word per word_id, in order.
        sentences = list(itertools.product(*video_probs.values()))
        sentences_prob = []

        for sentence in sentences:
            visual_prob = sum(video_probs[word_id][word]
                              for word_id, word in zip(video_probs, sentence))
            sentence_string = ' '.join(sentence).strip()
            try:
                language_prob = lm.log_s(sentence_string)
            except Exception:
                print('no language for sentence: {}'.format(sentence_string))
                sentences_prob.append(float('-inf'))
            else:
                sentences_prob.append(
                    alpha * visual_prob + beta * language_prob)
                print(language_prob)

        # find the sentence with the highest prob then extract word_ids
        max_sentence = sentences[sentences_prob.index(max(sentences_prob))]
        for word_id, word in zip(video_probs, max_sentence):
            guesses_dict[word_id] = word

    ordered_ids = sorted(guesses_dict)
    probabilities = [probabilities_dict[key] for key in ordered_ids]
    guesses = [guesses_dict[key] for key in ordered_ids]

    return probabilities, guesses
예제 #8
0
def recognize(models: dict, test_set: SinglesData):
    """Recognize test word sequences from a set of trained word models.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is that word's log likelihood for the item
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             ]
        a word whose model fails to score is stored as float('-inf')
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
        a guess is None when no model produced a score above -inf
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    probabilities = []
    guesses = []

    for item in range(test_set.num_items):
        X, lengths = test_set.get_item_Xlengths(item)

        best_word, best_logL = None, float('-inf')
        probs_dict = {}

        for word, model in models.items():
            try:
                logL = model.score(X, lengths)
            except Exception:
                # Scoring failed for this model: keep the word in the result
                # with the worst possible log likelihood so it cannot win.
                logL = float('-inf')

            probs_dict[word] = logL
            if logL > best_logL:
                best_word = word
                best_logL = logL

        probabilities.append(probs_dict)
        guesses.append(best_word)

    return probabilities, guesses