Example #1
import os

# NOTE: preprocess and log_prob are assumed to be provided by the assignment's
# other modules; they are not shown in this excerpt.
def preplexity(LM, test_dir, language, smoothing=False, delta=0):
    """
	Computes the preplexity of language model given a test corpus
	
	INPUT:
	
	LM : 		(dictionary) the language model trained by lm_train
	test_dir : 	(string) The top-level directory name containing data
				e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
	language : `(string) either 'e' (English) or 'f' (French)
	smoothing : (boolean) True for add-delta smoothing, False for no smoothing
	delta : 	(float) smoothing parameter where 0<delta<=1
	"""

    files = os.listdir(test_dir)
    pp = 0
    N = 0
    vocab_size = len(LM["uni"])

    for ffile in files:
        if ffile.split(".")[-1] != language:
            continue

        opened_file = open(test_dir + ffile, "r")
        for line in opened_file:
            processed_line = preprocess(line, language)
            tpp = log_prob(processed_line, LM, smoothing, delta, vocab_size)

            if tpp > float("-inf"):
                pp = pp + tpp
                N += len(processed_line.split())
        opened_file.close()
    if N > 0:
        pp = 2**(-pp / N)
    return pp
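
The loop above sums per-sentence log2 probabilities, and the final line converts that sum to perplexity, i.e. 2 raised to the negative average log-probability per token. A minimal usage sketch, assuming lm_train, preprocess and log_prob are available from the assignment's other modules and that the Hansard paths below exist (both the module layout and the paths are assumptions):

if __name__ == "__main__":
    # Train a toy English LM, then report perplexity on the test files,
    # first unsmoothed and then with a few add-delta settings.
    test_LM = lm_train("../data/Hansard/Training/", "e", "e_temp")
    print(preplexity(test_LM, "../data/Hansard/Testing/", "e"))
    for d in [0.1, 0.5, 1.0]:
        print(preplexity(test_LM, "../data/Hansard/Testing/", "e", True, d))
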
Example #2
import os

def preplexity(LM, test_dir, language, smoothing=False, delta=0):
    """
    Computes the preplexity of language model given a test corpus
    
    INPUT:
    
    LM :        (dictionary) the language model trained by lm_train
    test_dir :  (string) The top-level directory name containing data
                e.g., '/u/cs401/A2_SMT/data/Hansard/Testing/'
    language : `(string) either 'e' (English) or 'f' (French)
    smoothing : (boolean) True for add-delta smoothing, False for no smoothing
    delta :     (float) smoothing parameter where 0<delta<=1
    """

    files = os.listdir(test_dir)
    pp = 0
    N = 0
    vocab_size = len(LM["uni"])

    for ffile in files:
        if ffile.split(".")[-1] != language:
            continue

        opened_file = open(test_dir + ffile, "r")
        for line in opened_file:
            processed_line = preprocess(line, language)
            tpp = log_prob(processed_line, LM, smoothing, delta, vocab_size)

            if tpp > float("-inf"):
                pp = pp + tpp
                N += len(processed_line.split())
        opened_file.close()
    if N > 0:
        pp = 2**(-pp / N)
    return pp


#test
# test_LM = lm_train("../data/Hansard/Testing/", "e", "e_temp")
# delta_list = [0.05, 0.25, 0.5, 0.75, 1]
# for delta in delta_list:
#     print(preplexity(test_LM, "../data/Hansard/Testing/", "f", True, delta))

# print(preplexity(test_LM, "../data/Hansard/Testing/", "f"))
# print(preplexity(test_LM, "../data/Hansard/Testing/", "e"))
# 38.351063093155126
# 74.51702675962814
# 106.02237248692346
# 132.35249009777215
# 155.79005310855305
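
The delta sweep above only changes the result because log_prob applies add-delta (Lidstone) smoothing when smoothing=True. As a point of reference, a minimal sketch of an add-delta smoothed bigram log-probability, assuming the LM dictionary keeps unigram counts under 'uni' and nested bigram counts under 'bi' (the 'bi' layout is an assumption; only LM['uni'] appears in the code above):

import math

def add_delta_bigram_logprob(w1, w2, LM, delta, vocab_size):
    # Add-delta estimate: P(w2 | w1) = (count(w1, w2) + delta)
    #                                  / (count(w1) + delta * vocab_size)
    bigram_count = LM.get("bi", {}).get(w1, {}).get(w2, 0)
    unigram_count = LM.get("uni", {}).get(w1, 0)
    numerator = bigram_count + delta
    denominator = unigram_count + delta * vocab_size
    if numerator == 0 or denominator == 0:
        return float("-inf")
    return math.log(numerator / denominator, 2)  # log base 2, matching 2**(-pp / N)
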
    print("Testing part 3")
    smoothing = True
    delta = 0.5
    deltas = [0.1, 0.3, 0.5, 0.7, 0.9]
    vocabSize = 0
    languages = ['e', 'f']
    for language in languages:
        LM = pickle.load(open(language + '.pickle', 'rb'))
        vocabSize = len(LM['uni'])
        for filename in os.listdir(data_dir):
            if filename.endswith('.' + language):
                with open(os.path.join(data_dir, filename), 'r') as f:
                    for line in f:
                        line = line.rstrip()
                        sentence = preprocess(line, language)
                        log_p = log_prob(sentence, LM, smoothing, delta, vocabSize)
                        #print("{}\tProb: {}".format(sentence, log_p))
        for d in deltas:
            print("Language: {}, delta: {}, perp: {}".format(
                language, d, preplexity(LM, test_dir, language, smoothing, d)))

if 4 in test_parts:
    print("Testing part 4")
    num_sentences = 1000
    max_iter = 10
    AM = align_ibm1(data_dir, num_sentences, max_iter, fn_AM)
    from pprint import pprint
    pprint(AM)

if 5 in test_parts:
    print("Testing part 5")
Example #4
        # to the value unpickled
        lm2 = unpickler.load()

e_sent = preprocess(
    'It is indeed a great honour to be entrusted with this task.', 'e')
f_sent = preprocess(
    'Chers collegues, vous me faites un grand honneur en me confiant cette tache.',
    'f')

deltas = [0, 0.1, 0.25, 0.5, 0.75]

lptxt = e_sent + '\n\n'
for d in deltas:
    lp = log_prob(e_sent,
                  lm1,
                  smoothing=(d != 0),
                  delta=d,
                  vocabSize=len(lm1['uni']))
    lptxt += 'Delta: ' + str(d) + ' log probability = ' + str(lp) + '\n'

lptxt += '\n\n'
lptxt += f_sent + '\n\n'
for d in deltas:
    lp = log_prob(f_sent,
                  lm2,
                  smoothing=(d != 0),
                  delta=d,
                  vocabSize=len(lm2['uni']))
    lptxt += 'Delta: ' + str(d) + ' log probability = ' + str(lp) + '\n'
f = open("Task3.txt", "w+")