def testEffects(in_filename, out_filename):
    """Sweep the training-set fraction and record Naive Bayes accuracy.

    For each fraction, the input CSV is re-read (so ``split.split`` sees a
    fresh frame), split into train/test files, and a model is trained with
    smoothing 1.0 and evaluated on both sets.

    Args:
        in_filename: path to the source CSV read each iteration.
        out_filename: unused here; kept for interface compatibility.

    Returns:
        (fracs, trainResults, testResults) — the fractions tried and the
        per-fraction results from ``nb.test`` on the training and test sets.
    """
    fracs = [0.01, 0.1, 0.2, 0.5, 0.6, 0.75, 0.9, 1]
    trainResults = []
    testResults = []
    for frac in fracs:
        # Fresh read every pass — presumably split.split mutates/consumes
        # the frame; TODO confirm before hoisting out of the loop.
        source = pd.read_csv(in_filename)
        _, trainSet = split.split(source, "trainingSet.csv", "testSet.csv", frac, False)
        # Test set is reloaded from the file split.split just wrote.
        testSet = pd.read_csv("testSet.csv")
        probs, headers = nb.trainModel(1.0, trainSet)
        print(f"Fraction size: {frac}")
        trainResults.append(nb.test(trainSet, probs, headers, "Training Data Set"))
        testResults.append(nb.test(testSet, probs, headers, "Test Data Set"))
    return (fracs, trainResults, testResults)
def testEffects(in_filename, out_filename):
    """Sweep the discretization bin count and record Naive Bayes accuracy.

    For each bin count, the input CSV is re-read, discretized into that many
    bins, split 20% into a test set, and a model trained with smoothing 1 is
    evaluated on both the training and test sets.

    Args:
        in_filename: path to the source CSV read each iteration.
        out_filename: path handed to ``discretize.discretize`` for its output.

    Returns:
        (bins, trainResults, testResults) — the bin sizes tried and the
        per-size results from ``nb.test`` on the training and test sets.
    """
    bins = [2, 5, 10, 50, 100, 200]
    trainResults = []
    testResults = []
    for n_bins in bins:
        # Fresh read every pass — presumably discretize/split mutate the
        # frame; TODO confirm before hoisting out of the loop.
        source = pd.read_csv(in_filename)
        binned = discretize.discretize(source, out_filename, n_bins, False)
        trainSet, testSet = split.split(binned, "trainingSet.csv", "testSet.csv", 0.2, False)
        probs, headers = nb.trainModel(1, trainSet)
        print(f"Bin size: {n_bins}")
        trainResults.append(nb.test(trainSet, probs, headers, "Training Data Set"))
        testResults.append(nb.test(testSet, probs, headers, "Test Data Set"))
    return (bins, trainResults, testResults)
# classify the test dataset # read the test dataset label_test_buf = list() test_path = os.path.expanduser('./spam_classification/SPARSE.TEST') with open(test_path, newline='') as test: reader = csv.reader(test, delimiter=' ') for row in reader: label_test_buf.append(int(row[0])) label_test = np.asarray(label_test_buf, dtype=int) nd_test = len(label_test) count_d_w_test = np.zeros([nd_test, nw], dtype=int) with open(test_path, newline='') as test: reader = csv.reader(test, delimiter=' ') for d_id, row in enumerate(reader): current_email = csv.reader(row[2:-1], delimiter=':') for rows in current_email: w_id = int(rows[0]) count = int(rows[1]) count_d_w_test[d_id][w_id - 1] = count df_test = pd.DataFrame(count_d_w_test) nb_model = nb.train(df_train) nb_predictions = nb.test(nb_model, df_test) y = pd.Series(label_test) nb_error = nb.compute_error(y, nb_predictions) print('NB Test error: {}'.format(nb_error)) words = nb.k_most_indicative_words(5, nb_model.to_dataframe().iloc[:, :-1]) print('The {} most spam-worthy words are: {}'.format(len(words), words))
    # --- tail of readFile (its def line is above this chunk) ---
    # Split each data line on single spaces into a token list; labels are
    # kept as bare strings. Assumes dataFile/labelsFile, data, labels were
    # opened/initialized earlier in the function.
    dataLines = dataFile.readlines()
    labelsLines = labelsFile.readlines()
    for line in dataLines:
        data.append(line.rstrip('\n').split(' '))
    for line in labelsLines:
        labels.append(line.rstrip('\n'))
    return data, labels

# Train a Naive Bayes model on the 'train' split, then score 'test'.
train_data, train_labels = readFile('train')
test_data, test_labels = readFile('test')
vocab = vocabulary(train_data)
pi = estimate_pi(train_labels)
theta = estimate_theta(train_data, train_labels, vocab)
# t: per-document list of (score, class) pairs — inferred from the
# score[0]/score[1] accesses below; TODO confirm against test()'s contract.
t = test(theta, pi, vocab, test_data)
# Argmax over classes: pick the class with the highest score per document.
predicted = []
for data in t:
    pred = None
    maxScore = -sys.float_info.max
    for score in data:
        if score[0] > maxScore:
            maxScore = score[0]
            pred = score[1]
    predicted.append(pred)
# Count exact label matches between ground truth and predictions.
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == predicted[i]:
        correct += 1
# Sanity-check each pipeline stage against precomputed ground truth,
# printing True/False per stage. Floats are compared within 1e-5.
pi_gt = {'class2': 0.5, 'class1': 0.5}
tol = 10 ** -5

vocab = vocabulary(train_data)
print('Vocabulary result: {}'.format(vocab_gt == vocab))

# Class priors: every ground-truth class must match within tolerance.
pi = estimate_pi(train_labels)
pi_deltas = [abs(pi_gt[label] - pi[label]) for label in pi_gt]
pi_success = max(pi_deltas) <= tol
print('Pi result: {}'.format(pi_success))

# Per-class word likelihoods: compare every word the model estimated.
theta = estimate_theta(train_data, train_labels, vocab)
theta_deltas = [
    abs(theta[label][word] - theta_gt[label][word])
    for label in theta_gt
    for word in theta[label]
]
theta_success = all(delta <= tol for delta in theta_deltas)
print('Theta result: {}'.format(theta_success))

# Per-document scores: pair up (score, class) lists by class name.
scores = test(theta, pi, vocab, test_data)
scores_success = True
for gt_scores, pred_scores in zip(scores_gt, scores):
    gt_map = {entry[1]: entry[0] for entry in gt_scores}
    pred_map = {entry[1]: entry[0] for entry in pred_scores}
    deltas = [abs(gt_map[label] - pred_map[label]) for label in gt_map]
    if any(delta > tol for delta in deltas):
        scores_success = False
print('Scores result: {}'.format(scores_success))