def main(args):
    """Run Perceptron + Logistic Regression, or the kNN extra credit path."""
    if args.extra:
        # Extra-credit path: kNN on the extended dataset.
        X_train, y_train, X_dev, y_dev = reader.load_dataset(
            args.dataset_file, extra=True)
        knn_preds = c.classifyEC(X_train, y_train, X_dev, args.k)
        print("kNN, k = {}".format(args.k))
        accuracy, f1, precision, recall = compute_accuracies(
            knn_preds, X_dev, y_dev)
    else:
        X_train, y_train, X_dev, y_dev = reader.load_dataset(args.dataset_file)
        perceptron_preds = c.classifyPerceptron(
            X_train, y_train, X_dev, args.lrate, args.max_iter)
        print("Perceptron")
        accuracy, f1, precision, recall = compute_accuracies(
            perceptron_preds, X_dev, y_dev)
        lr_preds = c.classifyLR(
            X_train, y_train, X_dev, args.lrate, args.max_iter)
        print("\nLogistic Regression")
        accuracy, f1, precision, recall = compute_accuracies(
            lr_preds, X_dev, y_dev)
def main(args):
    """Sweep the Laplace smoothing parameter and rerun Naive Bayes with the best value.

    Evaluates 2000 candidate smoothing values from linspace(0, 0.2, 2001),
    records dev-set accuracy for each, then retrains with the best one and
    prints the final metrics.
    """
    laplace_number_set = np.linspace(0, 0.2, 2001)
    print(laplace_number_set)
    laplace_performance = np.zeros(2000)
    # Load once — the dataset does not change between sweep iterations
    # (the original reloaded it on every iteration).
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        args.training_dir, args.development_dir, args.stemming)
    for number in range(2000):
        if number == 0:
            # A smoothing parameter of 0 is degenerate; skip it.
            laplace_performance[number] = 0
        else:
            predicted_labels = nb.naiveBayes(train_set, train_labels, dev_set,
                                             laplace_number_set[number])
            accuracy, f1, precision, recall = compute_accuracies(
                predicted_labels, dev_set, dev_labels)
            laplace_performance[number] = accuracy
    # BUG FIX: np.argmax returns an *index*; the previous code passed that raw
    # index (an integer in 0..1999) to naiveBayes and printed it as if it were
    # the smoothing value. Map it back through laplace_number_set.
    best_index = np.argmax(laplace_performance)
    best_laplace = laplace_number_set[best_index]
    predicted_labels = nb.naiveBayes(train_set, train_labels, dev_set,
                                     best_laplace)
    accuracy, f1, precision, recall = compute_accuracies(
        predicted_labels, dev_set, dev_labels)
    # (Removed a stray post-loop write into laplace_performance that silently
    # overwrote the last sweep entry with the final accuracy.)
    print("best laplace parameter:", best_laplace)
    print("Accuracy:", accuracy)
    print("F1-Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
def main(args):
    """Train and evaluate the part 1, 2, and 3 networks, saving each model."""

    ### Part 1 ###
    X_train, y_train, X_dev, y_dev = reader.load_dataset(
        args.dataset_file, part=1)
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.int64)
    X_dev = torch.tensor(X_dev, dtype=torch.float32)
    losses, preds, net = p1.fit(X_train, y_train, X_dev, args.max_iter)
    accuracy, f1, precision, recall = compute_accuracies(
        preds, X_dev, y_dev, N_class=3)
    print(' ##### Part 1 results #####')
    print("Accuracy:", accuracy)
    print("F1-Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    print("num_parameters:",
          sum(np.prod(w.shape) for w in net.get_parameters()))
    torch.save(net, "net_p1.model")

    ### Part 2 ###
    X_train, y_train, X_dev, y_dev = reader.load_dataset(
        args.dataset_file, part=2)
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.int64)
    X_dev = torch.tensor(X_dev, dtype=torch.float32)
    _, preds, net = p2.fit(X_train, y_train, X_dev, 10 * args.max_iter)
    accuracy, f1, precision, recall = compute_accuracies(
        preds, X_dev, y_dev, N_class=5)
    print(' ##### Part 2 results #####')
    print("Accuracy:", accuracy)
    print("F1-Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    print("num_parameters:",
          sum(np.prod(w.shape) for w in net.get_parameters()))
    torch.save(net, "net_p2.model")

    ### Part 3 ###
    # Pixel inputs are scaled into 0.0 - 1.0 before fitting the autoencoder.
    X_train, _, X_dev, _ = reader.load_dataset(
        args.dataset_file, part=2)  # use same data as part 2
    X_train = torch.tensor(X_train, dtype=torch.float32) / 255.0
    X_dev = torch.tensor(X_dev, dtype=torch.float32) / 255.0
    _, x_recon, net = p3.fit(X_train, X_dev, args.max_iter * 10)
    mse = ((x_recon - X_dev.numpy()) ** 2).mean()
    print(' ##### Part 3 results #####')
    print("MSE:", mse)
    print("num_parameters:",
          sum(np.prod(w.shape) for w in net.get_parameters()))
    torch.save(net, "net_p3.model")
def main(args):
    """Tag the test data with the chosen algorithm and report accuracy."""
    training_data = load_dataset(args.training_file, args.case_sensitive)
    test_data = load_dataset(args.test_file, args.case_sensitive)
    if args.baseline:
        print("You are running the baseline algorithm!")
        predictions = baseline(training_data, strip_tags(test_data))
    else:
        print("You are running the Viterbi algorithm!")
        predictions = viterbi(training_data, strip_tags(test_data))
    accuracy = compute_accuracies(test_data, predictions)
    print("Accuracy:", accuracy)
def main(args):
    """Run the classifier selected by args.method ('perceptron' or 'knn')."""
    if args.method == 'perceptron':
        X_train, y_train, X_dev, y_dev = reader.load_dataset(args.dataset_file)
        predictions = c.classifyPerceptron(X_train, y_train, X_dev,
                                           args.lrate, args.max_iter)
        print("Perceptron")
        accuracy, f1, precision, recall = compute_accuracies(predictions, y_dev)
    elif args.method == 'knn':
        X_train, y_train, X_dev, y_dev = reader.load_dataset(
            args.dataset_file, extra=True)
        predictions = c.classifyKNN(X_train, y_train, X_dev, args.k)
        print("kNN, k = {}".format(args.k))
        accuracy, f1, precision, recall = compute_accuracies(predictions, y_dev)
    else:
        print("Method must be either perceptron or knn!")
def test_unigram_dev_stem_false_lower_false():
    """Grade the unigram model on the dev set (no stemming, no lowercasing).

    Returns a JSON string with a Gradescope-style result record; 5 points are
    awarded for each accuracy threshold reached, up to a max score of 20.
    """
    print("Running unigram test..." + '\n')
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        "data/spam_data/train", "data/spam_data/dev",
        stemming=False, lower_case=False, use_tqdm=False)
    predicted_labels = nb.naiveBayes(
        train_set, train_labels, dev_set,
        smoothing_parameter=1.0, pos_prior=0.5)
    if len(predicted_labels) != len(dev_labels):
        print("The length of the list of predictions is not equivalent to the length of the list of development labels.")
        errorDict = {
            'name': 'Unigram test on dev set without stemming and without lowercase',
            'score': 0,
            'max_score': 20,
            'visibility': 'visible'
        }
        return json.dumps(errorDict, indent=1)
    accuracy, f1, precision, recall = mp2.compute_accuracies(
        predicted_labels, dev_set, dev_labels)
    print("Accuracy:", accuracy)
    print("F1-Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    total_score = 0
    # Replaces four copy-pasted if/else blocks; the printed output is
    # identical for each threshold.
    for threshold in (0.81, 0.86, 0.91, 0.95):
        if accuracy >= threshold:
            total_score += 5
            print("+ 5 points for accuracy above " + str(threshold))
        else:
            print("Accuracy needs to be above " + str(threshold))
    resultDict = {
        'name': 'Unigram test on dev set without stemming and without lowercase',
        'score': total_score,
        'max_score': 20,
        'visibility': 'visible'
    }
    return json.dumps(resultDict, indent=1)
def main(season):
    """Run the kNN recommender for the given season name and print the result."""
    train_set, train_labels, rec_set, url_to_jpg = reader.load_dataset()
    # Map the season name to the numeric id expected by knn().
    season_ids = {'summer': 0, 'fall': 1, 'winter': 2, 'spring': 3}
    result = knn(train_set, train_labels, rec_set, url_to_jpg,
                 season_ids[season])
    print(result)
def main(args):
    """Train Naive Bayes on the training dir and score it on the dev dir."""
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        args.training_dir, args.development_dir, args.stemming)
    predictions = nb.naiveBayes(train_set, train_labels, dev_set, args.laplace)
    metrics = compute_accuracies(predictions, dev_set, dev_labels)
    for label, value in zip(("Accuracy:", "F1-Score:", "Precision:", "Recall:"),
                            metrics):
        print(label, value)
def main(args):
    """Run compute_tf_idf over the loaded data (no stemming, lowercased)."""
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        args.training_dir, args.development_dir,
        stemming=False, lower_case=True)
    best_tf_idf_words = tf_idf.compute_tf_idf(train_set, train_labels, dev_set)
    print("Finished executing compute_tf_idf()")
def main(args):
    """Fit the part-3 autoencoder and visualize sample reconstructions.

    Parts 1 and 2 were commented out in this revision; only part 3 runs.
    """
    ### Part 3 ###
    # Pixel inputs are scaled into 0.0 - 1.0 before fitting.
    X_train, _, X_dev, _ = reader.load_dataset(
        args.dataset_file, part=2)  # use same data as part 2
    X_train = torch.tensor(X_train, dtype=torch.float32) / 255.0
    X_dev = torch.tensor(X_dev, dtype=torch.float32) / 255.0
    _, x_recon, net = p3.fit(X_train, X_dev, args.max_iter * 10)
    mse = ((x_recon - X_dev.numpy()) ** 2).mean()
    print(' ##### Part 3 results #####')
    print("MSE:", mse)
    print("num_parameters:",
          sum(np.prod(w.shape) for w in net.get_parameters()))
    torch.save(net, "net_p3.model")

    # Debug-only visualization: original images on the top row,
    # reconstructions on the bottom row.
    import matplotlib.pyplot as plt
    sample_idx = np.random.choice(x_recon.shape[0], 10)
    fig, axes = plt.subplots(2, 10)
    for col in range(10):
        axes[0, col].imshow(X_dev[sample_idx[col]].reshape(28, 28), cmap='gray')
        axes[1, col].imshow(x_recon[sample_idx[col]].reshape(28, 28), cmap='gray')
    plt.show()
def main(args):
    """Run Naive Bayes spam classification and print confusion-matrix counts."""
    # Modify stemming and lower case below. Note that our test cases may use
    # both settings of the two parameters.
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        args.training_dir, args.development_dir,
        stemming=False, lower_case=False)
    predicted_labels = nb.naiveBayes(train_set, train_labels, dev_set)
    accuracy, false_positive, false_negative, true_positive, true_negative = \
        compute_accuracies(predicted_labels, dev_labels)
    print("Accuracy:", accuracy)
    print("False Positive", false_positive)
    # BUG FIX: this label previously read "Fale Negative".
    print("False Negative", false_negative)
    print("True Positive", true_positive)
    print("True Negative", true_negative)
def main(args):
    """Train the network, report dev-set metrics, and save the model to disk."""
    features, labels, dev_features, dev_labels = reader.load_dataset(
        args.dataset_file)
    features = torch.tensor(features, dtype=torch.float32)
    labels = torch.tensor(labels, dtype=torch.int64)
    dev_features = torch.tensor(dev_features, dtype=torch.float32)
    _, predictions, net = p.fit(features, labels, dev_features, args.max_iter)
    accuracy, f1, precision, recall = compute_accuracies(
        predictions, dev_features, dev_labels)
    print("Accuracy:", accuracy)
    print("F1-Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    torch.save(net, "net.model")
def main(args):
    """Run baseline or Viterbi tagging and report seen/unseen-split accuracy."""
    train_set = load_dataset(args.training_file, args.case_sensitive)
    test_set = load_dataset(args.test_file, args.case_sensitive)
    if args.baseline:
        print("You are running the baseline algorithm!")
        pred_tags, unseen_idx, seen_idx = baseline(train_set,
                                                   strip_tags(test_set))
        check_seen_accuracy = True
    else:
        print("You are running the Viterbi algorithm!")
        pred_tags, unseen_idx, seen_idx = viterbi(train_set,
                                                  strip_tags(test_set))
        check_seen_accuracy = False  # flip to True to score seen-word sentences
    accuracy = compute_accuracies(test_set, pred_tags, unseen_idx, seen_idx,
                                  check_seen_accuracy)
    if check_seen_accuracy:
        print("Accuracy of sentences with no unseen words")
    else:
        print("accuracy of sentences with unseen words")
    print("Accuracy:", accuracy)
def main(args):
    """Classify the dev set with the standard or extra-credit classifier."""
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        args.dataset_file)
    # Select the classifier once, then call it with the shared argument list.
    classify_fn = p.classifyEC if args.extra else p.classify
    predicted_labels = classify_fn(train_set, train_labels, dev_set,
                                   args.lrate, args.max_iter)
    accuracy, f1, precision, recall = compute_accuracies(
        predicted_labels, dev_set, dev_labels)
    print("Accuracy:", accuracy)
    print("F1-Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
def main(args):
    """Train Naive Bayes and write per-example predictions for the test set.

    Produces answer.txt with one "<id>,<LABEL>" line per entry in
    ./data/test.jsonl, where LABEL is SARCASM for predicted class 0 and
    NOT_SARCASM otherwise.
    """
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        args.training_dir, args.development_dir, args.stemming)
    predicted_labels = nb.naiveBayes(train_set, train_labels, dev_set,
                                     args.laplace)
    # BUG FIX: both file handles were previously opened without ever being
    # closed; context managers guarantee they are released.
    with open('./data/test.jsonl', 'r') as test_file:
        test = test_file.readlines()
    with open('answer.txt', 'w') as answer:
        for i, line in enumerate(test):
            answer.write(json.loads(line)['id'])
            answer.write(",")
            if predicted_labels[i] == 0:
                answer.write("SARCASM")
            else:
                answer.write("NOT_SARCASM")
            answer.write("\n")
def main(args):
    """Evaluate the unigram/bigram mixture Naive Bayes model on the dev set."""
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        args.training_dir, args.development_dir,
        args.stemming, args.lower_case)
    predictions = nb.naiveBayesMixture(
        train_set, train_labels, dev_set,
        args.bigram_lambda, args.unigram_smoothing,
        args.bigram_smoothing, args.pos_prior)
    metrics = compute_accuracies(predictions, dev_set, dev_labels)
    for label, value in zip(("Accuracy:", "F1-Score:", "Precision:", "Recall:"),
                            metrics):
        print(label, value)
def bigram_check():
    """Run the mixture model on a tiny fixture and return a penalty factor.

    Returns BIGRAM_PENALTY when the predictions match the known-bad pattern
    [1, 1], otherwise 1.0 (no penalty).
    """
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        "data/bigram_check/train", "data/bigram_check/dev",
        stemming=False, lower_case=False, use_tqdm=False)
    predictions = nbm.naiveBayesMixture(
        train_set, train_labels, dev_set,
        unigram_smoothing_parameter=1.0,
        bigram_smoothing_parameter=1.0,
        bigram_lambda=1.0,
        pos_prior=0.5)
    # Normalize ndarray output to a flat list for the comparison below.
    if isinstance(predictions, np.ndarray):
        predictions = list(predictions.reshape(-1))
    return BIGRAM_PENALTY if predictions == [1, 1] else 1.0
def _evaluate_bigram_bayes(args, unigram_smoothing, bigram_smoothing, lam):
    """Train/evaluate bigramBayes once; return (accuracy, fp, fn, tp, tn)."""
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        args.training_dir, args.development_dir,
        stemming=False, lower_case=False)
    predicted_labels = nb.bigramBayes(train_set, train_labels, dev_set,
                                      unigram_smoothing, bigram_smoothing, lam)
    return compute_accuracies(predicted_labels, dev_labels)


def main(args):
    """Tune bigram_lambda and both smoothing parameters by repeated bisection.

    Each of the three loops evaluates the current low and high endpoint of one
    parameter's search interval and moves the weaker endpoint toward the
    stronger one, until both endpoint accuracies reach accuracy_limit or
    max_iterations is exceeded. The triplicated load/predict/score code has
    been factored into _evaluate_bigram_bayes; printed output is unchanged.
    """
    # Modify stemming and lower case below. Note that our test cases may use
    # both settings of the two parameters.
    max_iterations = 10
    accuracy_limit = 0.87
    min_accuracy = 0
    max_accuracy = 0
    unigram_smoothing_parameter = 0.0625
    bigram_smoothing_parameter = 0.125
    bigram_lambda = 0.05
    # unigram smoothing parameter tuning domain
    min_unigram_smoothing_parameter = 0.0000001
    max_unigram_smoothing_parameter = 1.0
    # bigram smoothing parameter tuning domain
    min_bigram_smoothing_parameter = 0.0000001
    max_bigram_smoothing_parameter = 1.0
    # bigram_lambda tuning domain
    min_bigram_lambda = 0.0000001
    max_bigram_lambda = 1.0

    # bigram_lambda tuner
    iteration = 0
    while min_accuracy < accuracy_limit or max_accuracy < accuracy_limit:
        if iteration > max_iterations:
            break
        min_accuracy, false_positive, false_negative, true_positive, true_negative = \
            _evaluate_bigram_bayes(args, unigram_smoothing_parameter,
                                   bigram_smoothing_parameter, min_bigram_lambda)
        max_accuracy, false_positive, false_negative, true_positive, true_negative = \
            _evaluate_bigram_bayes(args, unigram_smoothing_parameter,
                                   bigram_smoothing_parameter, max_bigram_lambda)
        print("Iteration:", iteration)
        print("unigram_smoothing_parameter:", unigram_smoothing_parameter)
        print("bigram_smoothing_parameter:", bigram_smoothing_parameter)
        print("min_bigram_lambda:", min_bigram_lambda)
        print("max_bigram_lambda:", max_bigram_lambda)
        print("min_Accuracy:", min_accuracy)
        print("max_Accuracy:", max_accuracy)
        print("False Positive:", false_positive)
        print("False Negative:", false_negative)
        print("True Positive:", true_positive)
        print("True Negative:", true_negative)
        if (min_accuracy < max_accuracy):
            min_bigram_lambda += (max_bigram_lambda - min_bigram_lambda) / 2
            bigram_lambda = max_bigram_lambda
        else:
            max_bigram_lambda -= (max_bigram_lambda - min_bigram_lambda) / 2
            bigram_lambda = min_bigram_lambda
        iteration += 1

    # unigram_smoothing_parameter tuner
    iteration = 0
    while min_accuracy < accuracy_limit or max_accuracy < accuracy_limit:
        if iteration > max_iterations:
            break
        min_accuracy, false_positive, false_negative, true_positive, true_negative = \
            _evaluate_bigram_bayes(args, min_unigram_smoothing_parameter,
                                   bigram_smoothing_parameter, bigram_lambda)
        max_accuracy, false_positive, false_negative, true_positive, true_negative = \
            _evaluate_bigram_bayes(args, max_unigram_smoothing_parameter,
                                   bigram_smoothing_parameter, bigram_lambda)
        print("Iteration:", iteration)
        print("min_unigram_smoothing_parameter:", min_unigram_smoothing_parameter)
        print("max_unigram_smoothing_parameter:", max_unigram_smoothing_parameter)
        print("bigram_smoothing_parameter:", bigram_smoothing_parameter)
        print("bigram_lambda:", bigram_lambda)
        print("min_Accuracy:", min_accuracy)
        print("max_Accuracy:", max_accuracy)
        print("False Positive:", false_positive)
        print("False Negative:", false_negative)
        print("True Positive:", true_positive)
        print("True Negative:", true_negative)
        if (min_accuracy < max_accuracy):
            min_unigram_smoothing_parameter += (
                max_unigram_smoothing_parameter -
                min_unigram_smoothing_parameter) / 2
            unigram_smoothing_parameter = max_unigram_smoothing_parameter
        else:
            max_unigram_smoothing_parameter -= (
                max_unigram_smoothing_parameter -
                min_unigram_smoothing_parameter) / 2
            unigram_smoothing_parameter = min_unigram_smoothing_parameter
        iteration += 1

    # bigram_smoothing_parameter tuner
    iteration = 0
    while min_accuracy < accuracy_limit or max_accuracy < accuracy_limit:
        if iteration > max_iterations:
            break
        min_accuracy, false_positive, false_negative, true_positive, true_negative = \
            _evaluate_bigram_bayes(args, unigram_smoothing_parameter,
                                   min_bigram_smoothing_parameter, bigram_lambda)
        max_accuracy, false_positive, false_negative, true_positive, true_negative = \
            _evaluate_bigram_bayes(args, unigram_smoothing_parameter,
                                   max_bigram_smoothing_parameter, bigram_lambda)
        print("Iteration:", iteration)
        print("unigram_smoothing_parameter:", unigram_smoothing_parameter)
        print("min_bigram_smoothing_parameter:", min_bigram_smoothing_parameter)
        print("max_bigram_smoothing_parameter:", max_bigram_smoothing_parameter)
        print("bigram_lambda:", bigram_lambda)
        print("min_Accuracy:", min_accuracy)
        print("max_Accuracy:", max_accuracy)
        print("False Positive:", false_positive)
        print("False Negative:", false_negative)
        print("True Positive:", true_positive)
        print("True Negative:", true_negative)
        if (min_accuracy < max_accuracy):
            min_bigram_smoothing_parameter += (
                max_bigram_smoothing_parameter -
                min_bigram_smoothing_parameter) / 2
            bigram_smoothing_parameter = max_bigram_smoothing_parameter
        else:
            max_bigram_smoothing_parameter -= (
                max_bigram_smoothing_parameter -
                min_bigram_smoothing_parameter) / 2
            bigram_smoothing_parameter = min_bigram_smoothing_parameter
        iteration += 1
def test_bigram_dev_stem_false_lower_false():
    """Grade the mixture model on the dev set (no stemming, no lowercasing).

    Returns a JSON string with a Gradescope-style result record; 1.25 points
    are awarded per accuracy threshold reached (max 5), and the score is
    multiplied by BIGRAM_PENALTY when bigram_check() flags the implementation.
    """
    print("Running mixture model test..." + '\n')
    train_set, train_labels, dev_set, dev_labels = reader.load_dataset(
        "data/spam_data/train", "data/spam_data/dev",
        stemming=False, lower_case=False, use_tqdm=False)
    predicted_labels = nbm.naiveBayesMixture(
        train_set, train_labels, dev_set,
        bigram_lambda=0.05,
        unigram_smoothing_parameter=1,
        bigram_smoothing_parameter=0.005,
        pos_prior=0.5)
    if len(predicted_labels) != len(dev_labels):
        print("The length of the list of predictions is not equivalent to the length of the list of development labels.")
        errorDict = {
            'name': 'Mixture model test on dev set without stemming and without lowercase',
            'score': 0,
            'max_score': 5,
            'visibility': 'visible'
        }
        return json.dumps(errorDict, indent=1)
    accuracy, f1, precision, recall = mp2.compute_accuracies(
        predicted_labels, dev_set, dev_labels)
    print("Accuracy:", accuracy)
    print("F1-Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    total_score = 0
    # Replaces four copy-pasted if/else blocks; the printed output is
    # identical for each threshold.
    for threshold in (0.80, 0.85, 0.90, 0.95):
        if accuracy >= threshold:
            total_score += 1.25
            print("+ 1.25 points for accuracy above " + str(threshold))
        else:
            print("Accuracy needs to be above " + str(threshold))
    if bigram_check() == BIGRAM_PENALTY:
        print(f"We hypothesize that your implementation of naiveBayesMixture is not correct. "
              f"Therefore, we applied a penalty multiplier of {BIGRAM_PENALTY} to your score.")
        total_score *= BIGRAM_PENALTY
    resultDict = {
        'name': 'Mixture test on dev set without stemming and without lowercase',
        'score': total_score,
        'max_score': 5,
        'visibility': 'visible'
    }
    return json.dumps(resultDict, indent=1)