def preplexity(LM, test_dir, language, smoothing=False, delta=0):
    """
    Compute the perplexity of a language model on a test corpus.

    Every file in `test_dir` whose last dot-separated name component equals
    `language` is read line by line. Each line is passed through
    `preprocess` and scored with `log_prob`. Sentences whose score is not
    greater than -inf are skipped: they contribute neither to the
    accumulated log-probability nor to the token count.

    INPUT:

    LM        : (dictionary) the language model trained by lm_train
    test_dir  : (string) The top-level directory name containing data
                e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
                (a trailing path separator is optional)
    language  : (string) either 'e' (English) or 'f' (French)
    smoothing : (boolean) True for add-delta smoothing, False for no smoothing
    delta     : (float) smoothing parameter where 0<delta<=1

    OUTPUT:

    2 ** (-total_log_prob / N), where N is the number of whitespace-separated
    tokens in the scored sentences. When N is 0 the accumulated
    log-probability itself is returned (0 for an empty corpus).

    NOTE(review): the 2 ** (...) form presumes `log_prob` returns base-2
    log-probabilities -- confirm against its definition.
    """
    neg_inf = float("-inf")
    vocab_size = len(LM["uni"])
    total_log_prob = 0
    token_count = 0

    for file_name in os.listdir(test_dir):
        if file_name.split(".")[-1] != language:
            continue

        # os.path.join is correct whether or not test_dir ends with a
        # separator; the context manager closes the file even when
        # preprocess/log_prob raise.
        with open(os.path.join(test_dir, file_name), "r") as corpus_file:
            for line in corpus_file:
                sentence = preprocess(line, language)
                sentence_log_prob = log_prob(
                    sentence, LM, smoothing, delta, vocab_size)

                if sentence_log_prob > neg_inf:
                    total_log_prob += sentence_log_prob
                    token_count += len(sentence.split())

    if token_count > 0:
        return 2 ** (-total_log_prob / token_count)
    return total_log_prob
def preplexity(LM, test_dir, language, smoothing=False, delta=0):
    """
    Compute the perplexity of a language model on a test corpus.

    Every file in `test_dir` whose last dot-separated name component equals
    `language` is read line by line. Each line is passed through
    `preprocess` and scored with `log_prob`. Sentences whose score is not
    greater than -inf are skipped: they contribute neither to the
    accumulated log-probability nor to the token count.

    INPUT:

    LM        : (dictionary) the language model trained by lm_train
    test_dir  : (string) The top-level directory name containing data
                e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
                (a trailing path separator is optional)
    language  : (string) either 'e' (English) or 'f' (French)
    smoothing : (boolean) True for add-delta smoothing, False for no smoothing
    delta     : (float) smoothing parameter where 0<delta<=1

    OUTPUT:

    2 ** (-total_log_prob / N), where N is the number of whitespace-separated
    tokens in the scored sentences. When N is 0 the accumulated
    log-probability itself is returned (0 for an empty corpus).

    NOTE(review): the 2 ** (...) form presumes `log_prob` returns base-2
    log-probabilities -- confirm against its definition.
    """
    neg_inf = float("-inf")
    vocab_size = len(LM["uni"])
    total_log_prob = 0
    token_count = 0

    for file_name in os.listdir(test_dir):
        if file_name.split(".")[-1] != language:
            continue

        # os.path.join is correct whether or not test_dir ends with a
        # separator; the context manager closes the file even when
        # preprocess/log_prob raise.
        with open(os.path.join(test_dir, file_name), "r") as corpus_file:
            for line in corpus_file:
                sentence = preprocess(line, language)
                sentence_log_prob = log_prob(
                    sentence, LM, smoothing, delta, vocab_size)

                if sentence_log_prob > neg_inf:
                    total_log_prob += sentence_log_prob
                    token_count += len(sentence.split())

    if token_count > 0:
        return 2 ** (-total_log_prob / token_count)
    return total_log_prob


# test
# (manual check kept for reference; the trailing numbers are outputs that were
#  recorded alongside it)
# test_LM = lm_train("../data/Hansard/Testing/", "e", "e_temp")
# delta_list = [0.05, 0.25, 0.5, 0.75, 1]
# for delta in delta_list:
#     print(preplexity(test_LM, "../data/Hansard/Testing/", "f", True, delta))
# print(preplexity(test_LM, "../data/Hansard/Testing/", "f"))
# print(preplexity(test_LM, "../data/Hansard/Testing/", "e"))
# 38.351063093155126
# 74.51702675962814
# 106.02237248692346
# 132.35249009777215
# 155.79005310855305
# NOTE(review): this excerpt begins mid-script. The part-3 statements below
# presumably sat under an `if 3 in test_parts:` guard that is outside the
# visible source -- confirm the nesting before merging.
print("Testing part 3")
smoothing = True
delta = 0.5
deltas = [0.1, 0.3, 0.5, 0.7, 0.9]
vocabSize = 0
languages = ['e', 'f']
for language in languages:
    # Load the pickled language model for this language. The context manager
    # closes the file handle, which the bare open() call used to leak.
    with open(language + '.pickle', 'rb') as model_file:
        LM = pickle.load(model_file)
    vocabSize = len(LM['uni'])

    # Score every sentence of every data file carrying this language's
    # extension, using the fixed `delta` above.
    for filename in os.listdir(data_dir):
        if filename.endswith('.' + language):
            with open(os.path.join(data_dir, filename), 'r') as f:
                for line in f:
                    line = line.rstrip()
                    sentence = preprocess(line, language)
                    log_p = log_prob(sentence, LM, smoothing, delta, vocabSize)
                    #print("{}\tProb: {}".format(sentence, log_p))

    # Report perplexity on the test directory for each smoothing delta.
    for d in deltas:
        print("Language: {}, delta: {}, perp: {}".format(
            language, d, preplexity(LM, test_dir, language, smoothing, d)))

if 4 in test_parts:
    print("Testing part 4")
    num_sentences = 1000
    max_iter = 10
    AM = align_ibm1(data_dir, num_sentences, max_iter, fn_AM)
    from pprint import pprint
    pprint(AM)

if 5 in test_parts:
    print("Testing part 5")
# to the value unpickled
lm2 = unpickler.load()

# One fixed sample sentence per language, preprocessed the same way as the
# training data.
e_sent = preprocess(
    'It is indeed a great honour to be entrusted with this task.', 'e')
f_sent = preprocess(
    'Chers collegues, vous me faites un grand honneur en me confiant cette tache.',
    'f')

# delta == 0 means "no smoothing"; any other value enables add-delta smoothing.
deltas = [0, 0.1, 0.25, 0.5, 0.75]

# Build one report section per (sentence, model) pair: the sentence, a blank
# line, then one log-probability line per delta. Sections are separated by a
# blank line pair, English first, then French.
sections = []
for sentence, model in ((e_sent, lm1), (f_sent, lm2)):
    section_lines = [sentence + '\n\n']
    for d in deltas:
        score = log_prob(sentence, model, smoothing=(d != 0), delta=d,
                         vocabSize=len(model['uni']))
        section_lines.append(
            'Delta: ' + str(d) + ' log probability = ' + str(score) + '\n')
    sections.append(''.join(section_lines))
lptxt = '\n\n'.join(sections)

f = open("Task3.txt", "w+")