def recognize(models: dict, test_set: SinglesData):
    """Score every test item against every word model and pick the best word.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is the log likelihood of that word for the item; a word
        whose model could not score the item is recorded as float('-inf')
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...}, ...]
        guesses is a list of the best guess words ordered by the test set
        word_id; an entry is None when no model scored above -inf
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    for item in range(len(test_set.get_all_Xlengths())):
        X, lengths = test_set.get_item_Xlengths(item)
        scores = {}
        best_word = None
        best_score = float('-inf')
        for word, model in models.items():
            try:
                score = model.score(X, lengths)
            except Exception:
                # Best effort: a model that cannot score this item must rank
                # last, so record -inf (not 0, which would outrank every real
                # log likelihood) and never consider it as a guess.
                scores[word] = float('-inf')
                continue
            scores[word] = score
            if score > best_score:
                best_score = score
                best_word = word
        probabilities.append(scores)
        guesses.append(best_word)
    return probabilities, guesses
def recognize(models: dict, test_set: SinglesData):
    """Score every test item against every word model and pick the best word.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is the log likelihood of that word for the item; a word
        whose model could not score the item is recorded as float('-inf')
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...}, ...]
        guesses is a list of the best guess words ordered by the test set
        word_id; an entry is None when `models` is empty
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    for word_id in range(len(test_set.get_all_Xlengths())):
        # Use the (X, lengths) pair that belongs together instead of mixing
        # the first raw sequence with the lengths of the whole item.
        X, lengths = test_set.get_item_Xlengths(word_id)
        probs = {}
        for word, model in models.items():
            try:
                probs[word] = model.score(X, lengths)
            except Exception:
                # Best effort: report the failure and rank this word last.
                print('failed for word_id {} and word: {}'.format(
                    word_id, word))
                probs[word] = float('-inf')
        # max() raises ValueError on an empty dict, so guard the no-model case.
        guess = max(probs, key=probs.get) if probs else None
        probabilities.append(probs)
        guesses.append(guess)
    return probabilities, guesses
def recognize(models: dict, test_set: SinglesData):
    """Score every test item against every word model and pick the best word.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is the log likelihood of that word for the item; words
        whose model could not score the item are left out of the dictionary
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...}, ...]
        guesses is a list of the best guess words ordered by the test set
        word_id; an entry is None when no model could score the item
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    for item in range(test_set.num_items):
        X, lengths = test_set.get_item_Xlengths(item)
        logLs = {}
        for word, model in models.items():
            try:
                logLs[word] = model.score(X, lengths)
            except Exception:
                # Best effort: a model that cannot score this item is skipped.
                continue
        # max() raises ValueError on an empty dict, which happens when every
        # model failed for this item (or `models` is empty).
        guess_word = max(logLs, key=logLs.get) if logLs else None
        probabilities.append(logLs)
        guesses.append(guess_word)
    return probabilities, guesses
def recognize(models: dict, test_set: SinglesData):
    """Recognize each test item by scoring it with every word model.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries mapping each word to its log
        likelihood for the item (-inf when the model could not score it)
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...}, ...]
        guesses is a list of the best guess words ordered by the test set
        word_id; on equal scores the word visited last wins
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    lowest = -float('INF')
    probabilities, guesses = [], []
    item_count = len(test_set.get_all_Xlengths())
    for item in range(item_count):
        # The raw sequences are looked up but not needed for scoring.
        test_set.get_item_sequences(item)
        features, seq_lengths = test_set.get_item_Xlengths(item)
        log_likelihoods = {}
        winner, winning_score = None, lowest
        for candidate in models:
            try:
                candidate_score = models[candidate].score(features, seq_lengths)
            except:
                # A model that cannot score the item ranks at the bottom.
                candidate_score = lowest
            log_likelihoods[candidate] = candidate_score
            if candidate_score >= winning_score:
                winner, winning_score = candidate, candidate_score
        probabilities.append(log_likelihoods)
        guesses.append(winner)
    return (probabilities, guesses)
def recognize(models: dict, test_set: SinglesData):
    """Score every test item against every word model and pick the best word.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is the log likelihood of that word for the item; a word
        whose model could not score the item is recorded as float('-inf')
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...}, ...]
        guesses is a list of the best guess words ordered by the test set
        word_id; an entry is "" when no model scored above -inf
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    for item in range(test_set.num_items):
        # Fetch the (X, lengths) pair once per item. Star-unpacking the raw
        # sequence list into score() never passed `lengths` and would hand a
        # second sequence to the `lengths` parameter.
        X, lengths = test_set.get_item_Xlengths(item)
        probs = {}
        best_prob = float("-inf")
        best_guess = ""
        for model_name, model in models.items():
            try:
                score = model.score(X, lengths)
            except Exception:
                # Best effort: a model that cannot score this item ranks last.
                probs[model_name] = float("-inf")
                continue
            probs[model_name] = score
            if score > best_prob:
                best_prob = score
                best_guess = model_name
        probabilities.append(probs)
        guesses.append(best_guess)
    return (probabilities, guesses)
def recognize(models: dict, test_set: SinglesData):
    """Score every test item against every word model and pick the best word.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is the log likelihood of that word for the item; words
        whose model raised ValueError for the item are left out
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...}, ...]
        guesses is a list of the best guess words ordered by the test set
        word_id; an entry is None when no model could score the item
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    probabilities = []
    guesses = []
    for item in range(test_set.num_items):
        X, lengths = test_set.get_item_Xlengths(item)
        scores = {}
        best_score = float('-inf')
        best_word = None
        for word, model in models.items():
            try:
                logL = model.score(X, lengths)
            except ValueError:
                # This model cannot score the item; skip it.
                continue
            scores[word] = logL
            # Accept the first scored word unconditionally so that arbitrarily
            # low log likelihoods still yield a guess (a fixed numeric floor
            # would silently drop them).
            if best_word is None or logL > best_score:
                best_score = logL
                best_word = word
        probabilities.append(scores)
        guesses.append(best_word)
    return (probabilities, guesses)
def recognize_ngram(models: dict, test_set: SinglesData,
                    lm_path: str = 'lm3_sri.lm',
                    alpha: float = 1, beta: float = 25):
    """Recognize test sentences by combining HMM scores with a language model.

    Every word of a sentence is first scored against all word models. The
    best few candidates per word are then combined into candidate sentences,
    and the sentence maximising
    ``alpha * (sum of word log likelihoods) + beta * (language model score)``
    supplies the guesses for its words.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :param lm_path: path of the ARPA language model file; its first model is used
    :param alpha: weight of the summed per-word log likelihoods
    :param beta: weight of the language model sentence score
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and
        each value is the log likelihood of that word for the item; a word
        whose model could not score the item is recorded as float('-inf')
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...}, ...]
        guesses is a list of the best guess words ordered by the test set
        word_id; an entry is None when no candidate sentence could be built
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    # This import is necessary to be able to read language models in arpa
    # files; it can be installed with: pip install arpa
    import arpa
    import itertools

    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Load the language model. ARPA files may contain several models.
    lm = arpa.loadf(lm_path)[0]

    probabilities_dict = {}
    guesses_dict = {}
    sentence_word_indices = test_set._load_sentence_word_indices()
    for video_index in sentence_word_indices:
        word_ids = sentence_word_indices[video_index]

        # Longer sentences keep fewer candidates per word so the number of
        # candidate sentences (their cartesian product) stays manageable.
        if len(word_ids) > 5:
            top_n = 3
        elif len(word_ids) == 5:
            top_n = 4
        else:
            top_n = 6

        # One {word: log likelihood} dict per word position, aligned with
        # word_ids.
        candidates = []
        for word_id in word_ids:
            X, lengths = test_set.get_item_Xlengths(word_id)
            probs = {}
            for word, model in models.items():
                try:
                    probs[word] = model.score(X, lengths)
                except Exception:
                    # Best effort: report the failure and rank this word last.
                    print('failed for word_id {} and word: {}'.format(
                        word_id, word))
                    probs[word] = float('-inf')
            top_words = sorted(probs, key=probs.get, reverse=True)[:top_n]
            probabilities_dict[word_id] = probs
            candidates.append({word: probs[word] for word in top_words})

        best_sentence = None
        best_prob = None
        for sentence in itertools.product(*candidates):
            visual_prob = sum(
                scores[word] for scores, word in zip(candidates, sentence))
            sentence_string = ' '.join(sentence).strip()
            try:
                language_prob = lm.log_s(sentence_string)
            except Exception:
                print('no language for sor sentence: {}'.format(
                    sentence_string))
                sentence_prob = float('-inf')
            else:
                print(language_prob)
                sentence_prob = alpha * visual_prob + beta * language_prob
            # Strict comparison keeps the first sentence among equal scores.
            if best_prob is None or sentence_prob > best_prob:
                best_prob = sentence_prob
                best_sentence = sentence

        if best_sentence is None:
            # No candidate sentence exists (e.g. `models` is empty).
            for word_id in word_ids:
                guesses_dict[word_id] = None
        else:
            for word_id, word in zip(word_ids, best_sentence):
                guesses_dict[word_id] = word

    ordered_ids = sorted(guesses_dict)
    probabilities = [probabilities_dict[key] for key in ordered_ids]
    guesses = [guesses_dict[key] for key in ordered_ids]
    return probabilities, guesses
def recognize(models: dict, test_set: SinglesData):
    """Recognize each test item by scoring it with every word model.

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries mapping each word to its log
        likelihood for the item (-inf when the model could not score it)
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...}, ...]
        guesses is a list of the best guess words ordered by the test set
        word_id; on equal scores the word visited first wins, and the entry
        is None when no model scored above -inf
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    floor = float('-inf')
    all_scores, best_words = [], []
    for item in range(test_set.num_items):
        features, seq_lengths = test_set.get_item_Xlengths(item)
        # The raw sequences are looked up but not needed for scoring.
        test_set.get_item_sequences(item)
        # Every word starts at -inf; a successful score overwrites it.
        item_scores = dict.fromkeys(models, floor)
        champion, champion_logl = None, floor
        for candidate in models:
            try:
                logl = models[candidate].score(features, seq_lengths)
                item_scores[candidate] = logl
                if logl > champion_logl:
                    champion, champion_logl = candidate, logl
            except Exception:
                # A model that cannot score the item keeps its -inf entry.
                continue
        all_scores.append(item_scores)
        best_words.append(champion)
    return all_scores, best_words