def main():
    course_dir = '/usr/local/data/cs465/'

    if len(sys.argv) < 5 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 5):
        print("""
Prints the log-probability of each file under a smoothed n-gram model.

Usage:   {} TRAIN smoother lexicon trainpath
         {} TEST smoother lexicon trainpath files...
Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small
         {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample*

Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1
  (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0
   the \"1\" in loglinear1 can be replaced with any C >= 0 )
lexicon is the location of the word vector file, which is only used in the loglinear model
trainpath is the location of the training corpus
  (the search path for this includes "{}")
""".format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0], course_dir,
           course_dir, Probs.DEFAULT_TRAINING_DIR))
        sys.exit(1)

    mode = sys.argv[1]
    argv = sys.argv[2:]

    smoother = argv.pop(0)
    lexicon = argv.pop(0)
    train_file = argv.pop(0)

    if mode == 'TRAIN':
        lm = Probs.LanguageModel()
        lm.set_smoother(smoother)
        lm.read_vectors(lexicon)
        lm.train(train_file)
        lm.save(get_model_filename(smoother, lexicon, train_file))
    elif mode == 'TEST':
        if not argv:
            print("warning: no input files specified")

        lm = Probs.LanguageModel.load(
            get_model_filename(smoother, lexicon, train_file))

        # We use natural log for our internal computations and that's
        # the kind of log-probability that fileLogProb returns.
        # But we'd like to print a value in bits: so we convert
        # log base e to log base 2 at print time, by dividing by log(2).
        total_cross_entropy = 0.
        for testfile in argv:
            ce = lm.filelogprob(testfile) / math.log(2)
            # print("{:g}\t{}".format(ce, testfile))
            total_cross_entropy -= ce

        print(total_cross_entropy)
        print(sum([lm.num_tokens(testfile) for testfile in argv]))
        # print('Overall cross-entropy:\t{0:.5f}'.format(
        #     total_cross_entropy / sum([lm.num_tokens(testfile) for testfile in argv])))
    else:
        sys.exit(-1)
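# get_model_filename() is called in every TRAIN/TEST variant above and below but is not
# defined in this excerpt.  The sketch here is only a guess at a minimal version: it
# derives a model filename from the smoother, lexicon, and training-corpus names.  The
# naming scheme and the ".model" extension are illustrative assumptions, not the
# original implementation.
from os.path import basename


def get_model_filename(smoother, lexicon, train_file):
    # e.g. ('add0.01', '.../words-10.txt', 'switchboard-small')
    #   -> 'add0.01_words-10.txt_switchboard-small.model'
    return '{}_{}_{}.model'.format(smoother, basename(lexicon), basename(train_file))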
def main(): course_dir = '/usr/local/data/cs465/' argv = sys.argv[1:] if len(argv) < 2: print """ Prints the log-probability of each file under a smoothed n-gram model. Usage: %s smoother lexicon trainpath files... Example: %s add0.01 %shw-lm/lexicons/words-10.txt switchboard-small %shw-lm/speech/sample* Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1 (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0 the \"1\" in loglinear1 can be replaced with any C >= 0 ) lexicon is the location of the word vector file, which is only used in the loglinear model trainpath is the location of the training corpus (the search path for this includes "%s") """ % (sys.argv[0], sys.argv[0], course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR) sys.exit(1) smoother = argv.pop(0) lexicon = argv.pop(0) train_file = argv.pop(0) if not argv: print "warning: no input files specified" lm = Probs.LanguageModel() lm.set_smoother(smoother) lm.read_vectors(lexicon) lm.train(train_file) # We use natural log for our internal computations and that's # the kind of log-probability that fileLogProb returns. # But we'd like to print a value in bits: so we convert # log base e to log base 2 at print time, by dividing by log(2). for testfile in argv: print "%g\t%s" % (lm.filelogprob(testfile) / math.log(2), testfile)
def main(): course_dir = '/usr/local/data/cs465/' if len(sys.argv) < 6 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 6): # print(""" # Prints the log-probability of each file under a smoothed n-gram model. # # Usage: {} TRAIN smoother lexicon trainpath # {} TEST smoother lexicon trainpath files... # Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small # {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample* # # Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1 # (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0 # the \"1\" in loglinear1 can be replaced with any C >= 0 ) # lexicon is the location of the word vector file, which is only used in the loglinear model # trainpath is the location of the training corpus # (the search path for this includes "{}") # """.format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0], course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR)) mode = sys.argv[1] argv = sys.argv[2:] smoother = argv.pop(0) lexicon = argv.pop(0) train_file1 = argv.pop(0) train_file2 = argv.pop(0) epochs = 10 if mode == 'TRAIN': #Train Model1 lm1 = Probs.LanguageModel() #Comment following line when you want cross entropy reading lm1.set_vocab_size(train_file1, train_file2) lm1.set_smoother(smoother) lm1.read_vectors(lexicon) lm1.train(train_file1,epochs) lm1.save(get_model_filename(smoother, lexicon, train_file1)) #Train Model2 lm2 = Probs.LanguageModel() #Comment following line when you want cross entropy reading lm2.set_vocab_size(train_file1, train_file2) lm2.set_smoother(smoother) lm2.read_vectors(lexicon) lm2.train(train_file2, epochs) lm2.save(get_model_filename(smoother, lexicon, train_file2)) elif mode == 'TEST': if not argv: print("warning: no input files specified") priorprob_corpus1 = float(argv.pop(0)) #Load parameters of the trained models lm1 = Probs.LanguageModel.load(get_model_filename(smoother, lexicon, train_file1)) lm2 = Probs.LanguageModel.load(get_model_filename(smoother, lexicon, train_file2)) # We use natural log for our internal computations and that's # the kind of log-probability that fileLogProb returns. # But we'd like to print a value in bits: so we convert # log base e to log base 2 at print time, by dividing by log(2). #Class counters to keep track of number of predictions in each class class1_counter = 0 class2_counter = 0 #Counter of wrong predictions for evaluation wrong_predictions = 0 total_cross_entropy1 = 0. total_cross_entropy2 = 0. total_cross_entropy = 0. 
        files_length_accuracy = defaultdict(list)

        # Loop for predicting each dev/test file
        for testfile in argv:
            ce1 = lm1.filelogprob(testfile) / math.log(2)
            # print("#{:g}\t{}".format(ce1, testfile))

            # Number of tokens in the test file, used for averaging the probability
            token_count = lm1.num_tokens(testfile)

            # Compute posterior probability for class 1
            map1 = ((math.log(priorprob_corpus1) + lm1.filelogprob(testfile)) / math.log(2)) / token_count
            # Compute posterior probability for class 2
            map2 = ((math.log(1 - priorprob_corpus1) + lm2.filelogprob(testfile)) / math.log(2)) / token_count

            ce2 = lm2.filelogprob(testfile) / math.log(2)
            # print("#{:g}\t{}".format(ce2, testfile))

            total_cross_entropy1 -= ce1
            total_cross_entropy2 -= ce2

            # Compare probabilities for prediction
            if map1 > map2:
                print(train_file1, "\t", testfile)
                class1_counter += 1
                prediction, filelength = evaluate(testfile, 'english')
                wrong_predictions += prediction
            else:
                print(train_file2, "\t", testfile)
                class2_counter += 1
                prediction, filelength = evaluate(testfile, 'spanish')
                wrong_predictions += prediction
            # files_length_accuracy[filelength].append(1 - prediction)

        # Print outputs for Class 1
        print(class1_counter, "files were more probably", train_file1,
              "({percent:.2f}%)".format(percent=100 * class1_counter / (class1_counter + class2_counter)))
        # Print outputs for Class 2
        print(class2_counter, "files were more probably", train_file2,
              "({percent:.2f}%)".format(percent=100 * class2_counter / (class1_counter + class2_counter)))

        print("#", wrong_predictions, "Error Rate: ",
              " ({percent:.2f}%)".format(percent=100 * wrong_predictions / (class1_counter + class2_counter)))

        # filename = 'P3_{}_{}_{}_{}_data.txt'.format(smoother, basename(lexicon),
        #                                             basename(train_file1), basename(train_file2))
        # f = open(filename, "w")
        # for key, val in files_length_accuracy.items():
        #     print("#File of length ", key, " were ", 100*sum(val)/len(val), "% accurate.")
        #     f.write(str(key) + " " + str(100*sum(val)/len(val)) + "\n")
        # f.close()

        # for p1, p2 in zip(ce1_list, ce2_list):
        #     if p1 > p2: total_cross_entropy2 -= ce2

        total_cross_entropy = total_cross_entropy1 + total_cross_entropy2
        # print('#Overall cross-entropy:\t{0:.5f}'.format(
        #     total_cross_entropy1 / sum([lm1.num_tokens(testfile) for testfile in argv])))
        # print('#Overall cross-entropy:\t{0:.5f}'.format(
        #     total_cross_entropy2 / sum([lm2.num_tokens(testfile) for testfile in argv])))
        print('#Overall cross-entropy:\t{0:.5f}'.format(
            0.5 * total_cross_entropy / sum([lm1.num_tokens(testfile) for testfile in argv])))

    else:
        sys.exit(-1)


if __name__ == "__main__":
    main()
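# evaluate() is called in the prediction loop above but is not defined in this excerpt.
# Judging from the call sites, it returns (1 if the predicted language is wrong else 0,
# file length), so that wrong_predictions accumulates the error count.  The sketch below
# assumes dev/test filenames look like "<language>.<length>.<id>" (the convention the
# later variants split on); both that convention and this implementation are
# assumptions, not taken from the original code.
from os.path import basename


def evaluate(testfile, predicted_language):
    name_parts = basename(testfile).split('.')
    true_language = name_parts[0]        # e.g. "english" or "spanish"
    file_length = int(name_parts[1])     # e.g. 120 (number of words in the file)
    wrong = 0 if true_language == predicted_language else 1
    return wrong, file_length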
def main(): course_dir = '/usr/local/data/cs465/' if len(sys.argv) < 6 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 6): print(""" Prints the log-probability of each file under a smoothed n-gram model. Usage: {} TRAIN smoother lexicon trainpath {} TEST smoother lexicon trainpath files... Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample* Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1 (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0 the \"1\" in loglinear1 can be replaced with any C >= 0 ) lexicon is the location of the word vector file, which is only used in the loglinear model trainpath is the location of the training corpus (the search path for this includes "{}") """.format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0], course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR)) sys.exit(1) mode = sys.argv[1] argv = sys.argv[2:] smoother = argv.pop(0) lexicon = argv.pop(0) train_file1 = argv.pop(0) train_file2 = argv.pop(0) if mode == 'TRAIN': lm = Probs.LanguageModel() lm.set_vocab_size(train_file1, train_file2) lm.set_smoother(smoother) lm.read_vectors(lexicon) lm.train(train_file1) lm.save(get_model_filename(smoother, lexicon, train_file1)) lm.train(train_file2) lm.save(get_model_filename(smoother, lexicon, train_file2)) elif mode == 'TEST': if len(argv) < 2: print("warning: not enough") lm1 = Probs.LanguageModel.load( get_model_filename(smoother, lexicon, train_file1)) lm2 = Probs.LanguageModel.load( get_model_filename(smoother, lexicon, train_file2)) # We use natural log for our internal computations and that's # the kind of log-probability that fileLogProb returns. # But we'd like to print a value in bits: so we convert # log base e to log base 2 at print time, by dividing by log(2). prior_gen = argv.pop(0) prior_gen = float(prior_gen) #file_len_acc = open("len_acc.txt","w") total_cross_1 = 0. total_cross_2 = 0. sum_acc1 = 0. sum_acc2 = 0. 
        count_1 = 0
        count_2 = 0
        file_count = 0

        for testfile in argv:
            file_count += 1

            log_prior_1 = math.log(prior_gen, 2)
            ce1 = lm1.filelogprob(testfile) / math.log(2)
            log_posterior_1 = ce1 + log_prior_1

            log_prior_2 = math.log(1 - prior_gen, 2)
            ce2 = lm2.filelogprob(testfile) / math.log(2)
            log_posterior_2 = ce2 + log_prior_2

            total_cross_1 -= log_posterior_1
            total_cross_2 -= log_posterior_2

            if log_posterior_1 > log_posterior_2:
                print(train_file1 + "\t" + testfile)
                count_1 += 1
            else:
                print(train_file2 + "\t" + testfile)
                count_2 += 1

            # filename_spt = testfile.split("/")
            # length = filename_spt[2].split(".")[1]

            CON = max(0 - log_posterior_1, 0 - log_posterior_2)
            try:
                p1 = pow(2, log_posterior_1 + CON)
                p2 = pow(2, log_posterior_2 + CON)
                acc1 = p1 / (p1 + p2)
                acc2 = p2 / (p1 + p2)
                # print(acc1)
                # print(acc2)
                sum_acc1 += acc1
                sum_acc2 += acc2
            except Exception as e:
                # print(e)
                if log_posterior_1 > log_posterior_2:
                    sum_acc1 += 1
                else:
                    sum_acc2 += 1

            setname = testfile.split("/")[1]
            # if setname == "english":
            #     print(sum_acc1)
            #     print(total_cross_1)
            # elif setname == "spanish":
            #     print(sum_acc2)
            #     print(total_cross_2)
            #     print(file_count)
            #     print(sum([lm1.num_tokens(testfile) for testfile in argv]))

            # if filename_spt[1] == train_file1:
            #     file_len_acc.write(length + " " + str(log_posterior_1) + "\n")
            # elif filename_spt[1] == train_file2:
            #     file_len_acc.write(length + " " + str(log_posterior_2) + "\n")

        # file_len_acc.close()

        prob1 = round((float(count_1) / float(count_1 + count_2)) * 100, 2)
        prob2 = round((float(count_2) / float(count_1 + count_2)) * 100, 2)
        print(str(count_1) + " files were more probably " + train_file1 + " (" + str(prob1) + "%)")
        print(str(count_2) + " files were more probably " + train_file2 + " (" + str(prob2) + "%)")
    else:
        sys.exit(-1)
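# The CON shift above rescales the two log-posteriors before exponentiating, so the
# normalized posterior p1 / (p1 + p2) can be computed without underflow; the try/except
# is needed because shifting by the *smaller* value can still overflow when the two
# scores are far apart.  Below is a sketch of the standard alternative: shift by the
# maximum (the log-sum-exp trick), which keeps both exponents <= 0 and never overflows.
# The helper name is mine, not from the original code; log-posteriors are in bits
# (base 2), as in the loop above.
def stable_posteriors(log_posterior_1, log_posterior_2):
    m = max(log_posterior_1, log_posterior_2)
    p1 = pow(2, log_posterior_1 - m)   # the larger score becomes 2**0 == 1
    p2 = pow(2, log_posterior_2 - m)   # the other lies in (0, 1], so no overflow
    return p1 / (p1 + p2), p2 / (p1 + p2)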
def main(): course_dir = '/usr/local/data/cs465/' if len(sys.argv) < 5 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 6): print(""" Prints the log-probability of each file under a smoothed n-gram model. Usage: {} TRAIN smoother lexicon trainpath {} TEST smoother lexicon trainpath files... Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample* Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1 (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0 the \"1\" in loglinear1 can be replaced with any C >= 0 ) lexicon is the location of the word vector file, which is only used in the loglinear model trainpath is the location of the training corpus (the search path for this includes "{}") """.format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0], course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR)) sys.exit(1) mode = sys.argv[1] argv = sys.argv[2:] smoother = argv.pop(0) lexicon = argv.pop(0) train_file1 = argv.pop(0) train_file2 = argv.pop(0) if mode == 'TRAIN': lm = Probs.LanguageModel() lm.set_vocab_size(train_file1, train_file2) lm.set_smoother(smoother) lm.read_vectors(lexicon) lm.train(train_file1) lm.save(get_model_filename(smoother, lexicon, train_file1)) lm.train(train_file2) lm.save(get_model_filename(smoother, lexicon, train_file2)) elif mode == 'TEST': if not argv: print("warning: no input files specified") lm1 = Probs.LanguageModel.load(get_model_filename(smoother, lexicon, train_file1)) lm2 = Probs.LanguageModel.load(get_model_filename(smoother, lexicon, train_file2)) prior_lm1 = float(argv.pop(0)) assert prior_lm1 <= 1 and prior_lm1 >= 0 prior_lm2 = 1 - prior_lm1 # We use natural log for our internal computations and that's # the kind of log-probability that fileLogProb returns. # But we'd like to print a value in bits: so we convert # log base e to log base 2 at print time, by dividing by log(2). lm1_type = train_file1.split('/')[-1].split('.')[0] lm2_type = train_file2.split('/')[-1].split('.')[0] file_correct = {} file_total = {} for testfile in argv: file_length = testfile.split("/")[-1].split(".")[1] file_type = testfile.split("/")[-1].split(".")[0] lm1_ce = (math.log(prior_lm1) + lm1.filelogprob(testfile)) / math.log(2) lm2_ce = (math.log(prior_lm2) + lm2.filelogprob(testfile)) / math.log(2) file_total[file_length] = file_total.get(file_length, 0) + 1 if lm1_ce > lm2_ce: if file_type == lm1_type: file_correct[file_length] = file_correct.get(file_length, 0) + 1 else: if file_type == lm2_type: file_correct[file_length] = file_correct.get(file_length, 0) + 1 accuracies = {} for key, value in file_total.items(): accuracies[key] = file_correct[key] / float(value) print(accuracies) else: sys.exit(-1)
def main(): course_dir = '/usr/local/data/cs465/' if len(sys.argv) < 5 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 6): print(""" Prints the log-probability of each file under a smoothed n-gram model. Usage: {} TRAIN smoother lexicon trainpath {} TEST smoother lexicon trainpath files... Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample* Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1 (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0 the \"1\" in loglinear1 can be replaced with any C >= 0 ) lexicon is the location of the word vector file, which is only used in the loglinear model trainpath is the location of the training corpus (the search path for this includes "{}") """.format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0], course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR)) sys.exit(1) mode = sys.argv[1] argv = sys.argv[2:] smoother = argv.pop(0) lexicon = argv.pop(0) train_file1 = argv.pop(0) train_file2 = argv.pop(0) if mode == 'TRAIN': lm = Probs.LanguageModel() lm.set_vocab_size(train_file1, train_file2) lm.set_smoother(smoother) lm.read_vectors(lexicon) lm.train(train_file1) lm.save(get_model_filename(smoother, lexicon, train_file1)) lm.train(train_file2) lm.save(get_model_filename(smoother, lexicon, train_file2)) elif mode == 'TEST': if not argv: print("warning: no input files specified") lm1 = Probs.LanguageModel.load( get_model_filename(smoother, lexicon, train_file1)) lm2 = Probs.LanguageModel.load( get_model_filename(smoother, lexicon, train_file2)) prior_lm1 = float(argv.pop(0)) assert prior_lm1 <= 1 and prior_lm1 >= 0 prior_lm2 = 1 - prior_lm1 # We use natural log for our internal computations and that's # the kind of log-probability that fileLogProb returns. # But we'd like to print a value in bits: so we convert # log base e to log base 2 at print time, by dividing by log(2). lm1_count = 0 lm2_count = 0 train_file1 = basename(train_file1) train_file2 = basename(train_file2) for testfile in argv: lm1_ce = (math.log(prior_lm1) + lm1.filelogprob(testfile)) / math.log(2) lm2_ce = (math.log(prior_lm2) + lm2.filelogprob(testfile)) / math.log(2) if lm1_ce > lm2_ce: lm1_count += 1 print(train_file1 + '\t' + testfile) else: lm2_count += 1 print(train_file2 + '\t' + testfile) print("{0:d} files were more probably {1:s} ({2:.2f}%)".format(lm1_count, train_file1, \ float(100.0 * lm1_count / (lm1_count + lm2_count)))) print("{0:d} files were more probably {1:s} ({2:.2f}%)".format(lm2_count, train_file2, \ float(100.0 * lm2_count / (lm1_count + lm2_count)))) else: sys.exit(-1)
def main(): course_dir = '/usr/local/data/cs465/' if len(sys.argv) < 5 or (sys.argv[1] == 'TRAIN' and len(sys.argv) != 5): print(""" Prints the log-probability of each file under a smoothed n-gram model. Usage: {} TRAIN smoother lexicon trainpath {} TEST smoother lexicon trainpath files... Example: {} TRAIN add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {} TEST add0.01 {}hw-lm/lexicons/words-10.txt switchboard-small {}hw-lm/speech/sample* Possible values for smoother: uniform, add1, backoff_add1, backoff_wb, loglinear1 (the \"1\" in add1/backoff_add1 can be replaced with any real lambda >= 0 the \"1\" in loglinear1 can be replaced with any C >= 0 ) lexicon is the location of the word vector file, which is only used in the loglinear model trainpath is the location of the training corpus (the search path for this includes "{}") """.format(sys.argv[0], sys.argv[0], sys.argv[0], course_dir, sys.argv[0], course_dir, course_dir, Probs.DEFAULT_TRAINING_DIR)) sys.exit(1) argv = sys.argv[1:] smoother = argv.pop(0) lexicon = argv.pop(0) train_file = argv.pop(0) if not argv: print("warning: no input files specified") lm = Probs.LanguageModel() lm.set_smoother(smoother) lm.read_vectors(lexicon) lm.train(train_file) total_words = 0 total_error = 0.0 for testfile in argv: f = open(testfile) line = f.readline() sequences = [] # Read data from file line = f.readline() while line: w_list = [] items = line.split() line = f.readline() error_rate = float(items[0]) log_p_uw = float(items[1]) words = int(items[2]) for i in range(3, words + 5): w_list.append(items[i]) w_list = w_list[1:-1] # log probability computation # trigram model log_prob = 0.0 x, y = Probs.BOS, Probs.BOS for z in w_list: log_prob += math.log(lm.prob(x, y, z)) x = y y = z log_prob += math.log(lm.prob(x, y, Probs.EOS)) # bigram model #y = Probs.BOS #for z in w: # log_prob += math.log(lm.prob_bigram(y, z)) # y = z #log_prob += math.log(lm.prob_bigram(y, Probs.EOS)) # unigram model #for z in w: # log_prob += math.log(lm.prob_unigram(z)) sequences.append( (error_rate, words, log_p_uw + log_prob / math.log(2))) # Pick the best match, the one with highest probability best_match = max(sequences, key=lambda item: item[2]) total_error += best_match[0] * best_match[1] total_words += best_match[1] print('{0}\t{1}'.format(best_match[0], testfile)) print('{0:0.03f}\t{1}'.format(total_error / total_words, "OVERALL"))