def case2(indexes=CASE_2_ATTRIBUTE_INDEXES, output=True):
    accuracy_in_each_turn = list()
    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()
    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/normalized_data.csv", "rb"), delimiter=',')
    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in xrange(NUMBER_OF_ROUNDS):
        # we're using cross-validation, so in each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation, but now we take 10 attributes into consideration
        sample_means_word_spam = list()
        sample_means_word_ham = list()
        sample_variances_word_spam = list()
        sample_variances_word_ham = list()

        for attr_index in indexes:
            sample_means_word_spam.append(nb.take_mean_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_means_word_ham.append(nb.take_mean_ham(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_spam.append(nb.take_variance_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(nb.take_variance_ham(train_set, attr_index, SPAM_ATTR_INDEX))

        # sample standard deviations from sample variances
        sample_std_devs_spam = map(lambda x: x ** (1 / 2.0), sample_variances_word_spam)
        sample_std_devs_ham = map(lambda x: x ** (1 / 2.0), sample_variances_word_ham)

        hits = 0.0
        misses = 0.0

        correctly_is_spam = 0.0  # number of instances correctly evaluated as spam
        is_spam = 0.0            # total number of spam instances
        guessed_spam = 0.0       # total number of instances evaluated as spam

        correctly_is_ham = 0.0   # number of instances correctly evaluated as ham
        is_ham = 0.0             # total number of ham instances
        guessed_ham = 0.0        # total number of instances evaluated as ham

        # now we test the hypothesis against the test set
        for row in test_set:
            # i.e. the product of all the conditional probabilities of the words given the class;
            # it may look confusing, but it's neat to do it all in a single expression! =)
            product_of_all_conditional_probs_spam = reduce(
                lambda acc, cur: acc * stats.gumbel_l(sample_means_word_spam[cur], sample_std_devs_spam[cur]).pdf(row[indexes[cur]]),
                xrange(len(indexes)), 1)

            # no need to divide by the normalization term, since we only want to know which posterior is greater
            posterior_spam = prior_spam * product_of_all_conditional_probs_spam

            product_of_all_conditional_probs_ham = reduce(
                lambda acc, cur: acc * stats.gumbel_l(sample_means_word_ham[cur], sample_std_devs_ham[cur]).pdf(row[indexes[cur]]),
                xrange(len(indexes)), 1)

            posterior_ham = prior_ham * product_of_all_conditional_probs_ham

            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1
                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances / number of instances
        accuracy = hits / (hits + misses)

        # precision_spam = number of instances correctly evaluated as spam / number of spam instances
        # (guard against division by zero in case nothing was found)
        if is_spam == 0:
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam / is_spam

        # recall_spam = number of instances correctly evaluated as spam / number of instances evaluated as spam
        if guessed_spam == 0:
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam / guessed_spam

        # precision_ham = number of instances correctly evaluated as ham / number of ham instances
        if is_ham == 0:
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham / is_ham

        # recall_ham = number of instances correctly evaluated as ham / number of instances evaluated as ham
        if guessed_ham == 0:
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham / guessed_ham

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of means for each metric at the end
    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)
    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)
    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)
    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 2 - TEN ATTRIBUTES - USING GUMBEL LEFT MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: ' + str(round(mean_accuracy, 5))
        print 'STD. DEV. OF ACCURACY: ' + str(round(std_dev_accuracy, 5))
        print 'VARIANCE OF ACCURACY: ' + str(round(variance_accuracy, 8))
        print ''
        print 'MEAN PRECISION FOR SPAM: ' + str(round(mean_precision_spam, 5))
        print 'STD. DEV. OF PRECISION FOR SPAM: ' + str(round(std_dev_precision_spam, 5))
        print 'VARIANCE OF PRECISION FOR SPAM: ' + str(round(variance_precision_spam, 8))
        print ''
        print 'MEAN RECALL FOR SPAM: ' + str(round(mean_recall_spam, 5))
        print 'STD. DEV. OF RECALL FOR SPAM: ' + str(round(std_dev_recall_spam, 5))
        print 'VARIANCE OF RECALL FOR SPAM: ' + str(round(variance_recall_spam, 8))
        print ''
        print 'MEAN PRECISION FOR HAM: ' + str(round(mean_precision_ham, 5))
        print 'STD. DEV. OF PRECISION FOR HAM: ' + str(round(std_dev_precision_ham, 5))
        print 'VARIANCE OF PRECISION FOR HAM: ' + str(round(variance_precision_ham, 8))
        print ''
        print 'MEAN RECALL FOR HAM: ' + str(round(mean_recall_ham, 5))
        print 'STD. DEV. OF RECALL FOR HAM: ' + str(round(std_dev_recall_ham, 5))
        print 'VARIANCE OF RECALL FOR HAM: ' + str(round(variance_recall_ham, 8))
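
# The nb.take_mean_spam / nb.take_variance_spam helpers used above (and their
# *_ham counterparts) live in a separate module of this project and are not
# shown here. As an illustration only -- a minimal sketch, not the project's
# actual implementation -- class-conditional estimators like these could be
# written with numpy boolean masks (assumes numpy is imported as np, as in the
# rest of this file; the _sketch names are hypothetical):
def _take_mean_spam_sketch(train_set, attr_index, label_index):
    # mean of the attribute over the rows labelled as spam (label == 1)
    spam_rows = train_set[train_set[:, label_index] == 1]
    return np.mean(spam_rows[:, attr_index])


def _take_variance_spam_sketch(train_set, attr_index, label_index):
    # variance of the attribute over the rows labelled as spam (label == 1)
    spam_rows = train_set[train_set[:, label_index] == 1]
    return np.var(spam_rows[:, attr_index])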
def case1(index=CASE_1_ATTRIBUTE_INDEX, output=True, ret='accuracy'):
    accuracy_in_each_turn = list()
    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()
    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/normalized_data.csv", "rb"), delimiter=',')
    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in xrange(NUMBER_OF_ROUNDS):
        # we're using cross-validation, so in each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation
        sample_mean_word_spam = nb.take_mean_spam(train_set, index, SPAM_ATTR_INDEX)
        sample_mean_word_ham = nb.take_mean_ham(train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_spam = nb.take_variance_spam(train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_ham = nb.take_variance_ham(train_set, index, SPAM_ATTR_INDEX)

        # sample standard deviations from sample variances
        sample_std_dev_spam = sample_variance_word_spam ** (1 / 2.0)
        sample_std_dev_ham = sample_variance_word_ham ** (1 / 2.0)

        hits = 0.0
        misses = 0.0

        correctly_is_spam = 0.0  # number of instances correctly evaluated as spam
        is_spam = 0.0            # total number of spam instances
        guessed_spam = 0.0       # total number of instances evaluated as spam

        correctly_is_ham = 0.0   # number of instances correctly evaluated as ham
        is_ham = 0.0             # total number of ham instances
        guessed_ham = 0.0        # total number of instances evaluated as ham

        # now we test the hypothesis against the test set
        for row in test_set:
            # no need to divide by the normalization term, since we only want to know which posterior is greater
            posterior_spam = prior_spam * stats.norm(sample_mean_word_spam, sample_std_dev_spam).pdf(row[index])
            posterior_ham = prior_ham * stats.norm(sample_mean_word_ham, sample_std_dev_ham).pdf(row[index])

            # whichever is greater - that will be our evaluation
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1
                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances / number of instances
        accuracy = hits / (hits + misses)

        # precision_spam = number of instances correctly evaluated as spam / number of spam instances
        # (guard against division by zero in case nothing was found)
        if is_spam == 0:
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam / is_spam

        # recall_spam = number of instances correctly evaluated as spam / number of instances evaluated as spam
        if guessed_spam == 0:
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam / guessed_spam

        # precision_ham = number of instances correctly evaluated as ham / number of ham instances
        if is_ham == 0:
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham / is_ham

        # recall_ham = number of instances correctly evaluated as ham / number of instances evaluated as ham
        if guessed_ham == 0:
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham / guessed_ham

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of means for each metric at the end
    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)
    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)
    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)
    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 1 - ONE ATTRIBUTE - USING NORMAL MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: ' + str(round(mean_accuracy, 5))
        print 'STD. DEV. OF ACCURACY: ' + str(round(std_dev_accuracy, 5))
        print 'VARIANCE OF ACCURACY: ' + str(round(variance_accuracy, 8))
        print ''
        print 'MEAN PRECISION FOR SPAM: ' + str(round(mean_precision_spam, 5))
        print 'STD. DEV. OF PRECISION FOR SPAM: ' + str(round(std_dev_precision_spam, 5))
        print 'VARIANCE OF PRECISION FOR SPAM: ' + str(round(variance_precision_spam, 8))
        print ''
        print 'MEAN RECALL FOR SPAM: ' + str(round(mean_recall_spam, 5))
        print 'STD. DEV. OF RECALL FOR SPAM: ' + str(round(std_dev_recall_spam, 5))
        print 'VARIANCE OF RECALL FOR SPAM: ' + str(round(variance_recall_spam, 8))
        print ''
        print 'MEAN PRECISION FOR HAM: ' + str(round(mean_precision_ham, 5))
        print 'STD. DEV. OF PRECISION FOR HAM: ' + str(round(std_dev_precision_ham, 5))
        print 'VARIANCE OF PRECISION FOR HAM: ' + str(round(variance_precision_ham, 8))
        print ''
        print 'MEAN RECALL FOR HAM: ' + str(round(mean_recall_ham, 5))
        print 'STD. DEV. OF RECALL FOR HAM: ' + str(round(std_dev_recall_ham, 5))
        print 'VARIANCE OF RECALL FOR HAM: ' + str(round(variance_recall_ham, 8))

    # we'll only use these return values to compute rankings,
    # for example in the script which_attribute_case_1
    if ret == 'utility':
        return mean_accuracy * mean_precision_ham
    elif ret == 'accuracy':
        return mean_accuracy
    else:
        print 'UNKNOWN METRIC: ' + ret
        sys.exit()
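
# A minimal, self-contained illustration of the decision rule used in case1
# above: compare the two unnormalized posteriors (prior times class-conditional
# density) and predict whichever is larger. The numbers here are hypothetical,
# not estimated from the dataset; assumes scipy.stats is imported as stats, as
# in the rest of this file.
def _example_gaussian_decision(x=0.3):
    posterior_spam = 0.5 * stats.norm(0.45, 0.20).pdf(x)  # prior * likelihood under the spam model
    posterior_ham = 0.5 * stats.norm(0.10, 0.15).pdf(x)   # prior * likelihood under the ham model
    return 1 if posterior_spam > posterior_ham else 0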
def case1(index=CASE_1_ATTRIBUTE_INDEX, output=True, ret='accuracy'):
    accuracy_in_each_turn = list()
    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()
    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/normalized_data.csv", "rb"), delimiter=',')
    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in xrange(NUMBER_OF_ROUNDS):
        # we're using cross-validation, so in each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation
        sample_mean_word_spam = nb.take_mean_spam(train_set, index, SPAM_ATTR_INDEX)
        sample_mean_word_ham = nb.take_mean_ham(train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_spam = nb.take_variance_spam(train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_ham = nb.take_variance_ham(train_set, index, SPAM_ATTR_INDEX)

        # sample standard deviations from sample variances
        sample_std_dev_spam = sample_variance_word_spam ** (1 / 2.0)
        sample_std_dev_ham = sample_variance_word_ham ** (1 / 2.0)

        hits = 0.0
        misses = 0.0

        correctly_is_spam = 0.0  # number of instances correctly evaluated as spam
        is_spam = 0.0            # total number of spam instances
        guessed_spam = 0.0       # total number of instances evaluated as spam

        correctly_is_ham = 0.0   # number of instances correctly evaluated as ham
        is_ham = 0.0             # total number of ham instances
        guessed_ham = 0.0        # total number of instances evaluated as ham

        # now we test the hypothesis against the test set
        for row in test_set:
            # no need to divide by the normalization term, since we only want to know which posterior is greater
            posterior_spam = prior_spam * stats.gumbel_l(sample_mean_word_spam, sample_std_dev_spam).pdf(row[index])
            posterior_ham = prior_ham * stats.gumbel_l(sample_mean_word_ham, sample_std_dev_ham).pdf(row[index])

            # whichever is greater - that will be our evaluation
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1
                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances / number of instances
        accuracy = hits / (hits + misses)

        # precision_spam = number of instances correctly evaluated as spam / number of spam instances
        # (guard against division by zero in case nothing was found)
        if is_spam == 0:
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam / is_spam

        # recall_spam = number of instances correctly evaluated as spam / number of instances evaluated as spam
        if guessed_spam == 0:
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam / guessed_spam

        # precision_ham = number of instances correctly evaluated as ham / number of ham instances
        if is_ham == 0:
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham / is_ham

        # recall_ham = number of instances correctly evaluated as ham / number of instances evaluated as ham
        if guessed_ham == 0:
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham / guessed_ham

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of means for each metric at the end
    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)
    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)
    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)
    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 1 - ONE ATTRIBUTE - USING GUMBEL LEFT MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: ' + str(round(mean_accuracy, 5))
        print 'STD. DEV. OF ACCURACY: ' + str(round(std_dev_accuracy, 5))
        print 'VARIANCE OF ACCURACY: ' + str(round(variance_accuracy, 8))
        print ''
        print 'MEAN PRECISION FOR SPAM: ' + str(round(mean_precision_spam, 5))
        print 'STD. DEV. OF PRECISION FOR SPAM: ' + str(round(std_dev_precision_spam, 5))
        print 'VARIANCE OF PRECISION FOR SPAM: ' + str(round(variance_precision_spam, 8))
        print ''
        print 'MEAN RECALL FOR SPAM: ' + str(round(mean_recall_spam, 5))
        print 'STD. DEV. OF RECALL FOR SPAM: ' + str(round(std_dev_recall_spam, 5))
        print 'VARIANCE OF RECALL FOR SPAM: ' + str(round(variance_recall_spam, 8))
        print ''
        print 'MEAN PRECISION FOR HAM: ' + str(round(mean_precision_ham, 5))
        print 'STD. DEV. OF PRECISION FOR HAM: ' + str(round(std_dev_precision_ham, 5))
        print 'VARIANCE OF PRECISION FOR HAM: ' + str(round(variance_precision_ham, 8))
        print ''
        print 'MEAN RECALL FOR HAM: ' + str(round(mean_recall_ham, 5))
        print 'STD. DEV. OF RECALL FOR HAM: ' + str(round(std_dev_recall_ham, 5))
        print 'VARIANCE OF RECALL FOR HAM: ' + str(round(variance_recall_ham, 8))

    # we'll only use these return values to compute rankings,
    # for example in the script which_attribute_case_1
    if ret == 'utility':
        return mean_accuracy * mean_precision_ham
    elif ret == 'accuracy':
        return mean_accuracy
    else:
        print 'UNKNOWN METRIC: ' + ret
        sys.exit()
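
# Note on parameterization: stats.gumbel_l(loc, scale) takes a location and a
# scale, and case1 above passes the sample mean and sample standard deviation
# directly as those two arguments. If one instead wanted Gumbel (left)
# parameters whose implied mean and standard deviation match the sample
# moments, a moment-matching helper could look like the sketch below
# (illustrative only, not what the function above does). For gumbel_l,
# mean = loc - gamma * scale and std = scale * pi / sqrt(6), where gamma is the
# Euler-Mascheroni constant.
def _gumbel_l_params_from_moments(sample_mean, sample_std):
    gamma = 0.5772156649015329  # Euler-Mascheroni constant
    scale = sample_std * (6 ** 0.5) / np.pi
    loc = sample_mean + gamma * scale
    return loc, scale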
def case3(output=True):
    accuracy_in_each_turn = list()
    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()
    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/normalized_data.csv", "rb"), delimiter=',')
    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in xrange(NUMBER_OF_ROUNDS):
        # we're using cross-validation, so in each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation, but now we take ALL attributes into consideration
        sample_means_word_spam = list()
        sample_means_word_ham = list()
        sample_variances_word_spam = list()
        sample_variances_word_ham = list()

        # all attribute columns, i.e. everything but the last (label) column
        for attr_index in xrange(57):
            sample_means_word_spam.append(nb.take_mean_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_means_word_ham.append(nb.take_mean_ham(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_spam.append(nb.take_variance_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(nb.take_variance_ham(train_set, attr_index, SPAM_ATTR_INDEX))

        # sample standard deviations from sample variances
        sample_std_devs_spam = map(lambda x: x ** (1 / 2.0), sample_variances_word_spam)
        sample_std_devs_ham = map(lambda x: x ** (1 / 2.0), sample_variances_word_ham)

        hits = 0.0
        misses = 0.0

        correctly_is_spam = 0.0  # number of instances correctly evaluated as spam
        is_spam = 0.0            # total number of spam instances
        guessed_spam = 0.0       # total number of instances evaluated as spam

        correctly_is_ham = 0.0   # number of instances correctly evaluated as ham
        is_ham = 0.0             # total number of ham instances
        guessed_ham = 0.0        # total number of instances evaluated as ham

        # now we test the hypothesis against the test set
        for row in test_set:
            # i.e. the product of the conditional probabilities of all attributes given the class;
            # it may look confusing, but it's neat to do it all in a single expression! =)
            product_of_all_conditional_probs_spam = reduce(
                lambda acc, cur: acc * stats.norm(sample_means_word_spam[cur], sample_std_devs_spam[cur]).pdf(row[cur]),
                xrange(57), 1)

            # no need to divide by the normalization term, since we only want to know which posterior is greater
            posterior_spam = prior_spam * product_of_all_conditional_probs_spam

            product_of_all_conditional_probs_ham = reduce(
                lambda acc, cur: acc * stats.norm(sample_means_word_ham[cur], sample_std_devs_ham[cur]).pdf(row[cur]),
                xrange(57), 1)

            posterior_ham = prior_ham * product_of_all_conditional_probs_ham

            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1
                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances / number of instances
        accuracy = hits / (hits + misses)

        # precision_spam = number of instances correctly evaluated as spam / number of spam instances
        # (guard against division by zero in case nothing was found)
        if is_spam == 0:
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam / is_spam

        # recall_spam = number of instances correctly evaluated as spam / number of instances evaluated as spam
        if guessed_spam == 0:
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam / guessed_spam

        # precision_ham = number of instances correctly evaluated as ham / number of ham instances
        if is_ham == 0:
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham / is_ham

        # recall_ham = number of instances correctly evaluated as ham / number of instances evaluated as ham
        if guessed_ham == 0:
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham / guessed_ham

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of means for each metric at the end
    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)
    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)
    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)
    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 3 - ALL ATTRIBUTES - USING NORMAL MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: ' + str(round(mean_accuracy, 5))
        print 'STD. DEV. OF ACCURACY: ' + str(round(std_dev_accuracy, 5))
        print 'VARIANCE OF ACCURACY: ' + str(round(variance_accuracy, 8))
        print ''
        print 'MEAN PRECISION FOR SPAM: ' + str(round(mean_precision_spam, 5))
        print 'STD. DEV. OF PRECISION FOR SPAM: ' + str(round(std_dev_precision_spam, 5))
        print 'VARIANCE OF PRECISION FOR SPAM: ' + str(round(variance_precision_spam, 8))
        print ''
        print 'MEAN RECALL FOR SPAM: ' + str(round(mean_recall_spam, 5))
        print 'STD. DEV. OF RECALL FOR SPAM: ' + str(round(std_dev_recall_spam, 5))
        print 'VARIANCE OF RECALL FOR SPAM: ' + str(round(variance_recall_spam, 8))
        print ''
        print 'MEAN PRECISION FOR HAM: ' + str(round(mean_precision_ham, 5))
        print 'STD. DEV. OF PRECISION FOR HAM: ' + str(round(std_dev_precision_ham, 5))
        print 'VARIANCE OF PRECISION FOR HAM: ' + str(round(variance_precision_ham, 8))
        print ''
        print 'MEAN RECALL FOR HAM: ' + str(round(mean_recall_ham, 5))
        print 'STD. DEV. OF RECALL FOR HAM: ' + str(round(std_dev_recall_ham, 5))
        print 'VARIANCE OF RECALL FOR HAM: ' + str(round(variance_recall_ham, 8))
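
# With 57 per-attribute densities multiplied together, the products computed in
# case3 above can underflow to 0.0 in double precision. Since log is monotonic,
# an equivalent comparison can be made in log space. This is an illustrative
# sketch only (the function above multiplies the raw pdf values); it assumes
# numpy as np and scipy.stats as stats, as in the rest of this file.
def _log_posterior(row, prior, means, std_devs, attr_indexes):
    log_post = np.log(prior)
    for k, attr_index in enumerate(attr_indexes):
        log_post += stats.norm(means[k], std_devs[k]).logpdf(row[attr_index])
    return log_post
# predict spam when _log_posterior(row, prior_spam, ...) exceeds the ham version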
def case3(output=True):
    # does not distinguish between emails where an attribute appears more than
    # once and those where said attribute appears only once.
    # note that we're not using equiprobable priors here:
    # we approximate the population priors with the sample priors.
    accuracy_in_each_turn = list()
    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()
    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    # just ones and zeros
    m = np.loadtxt(open("resources/binarized_data.csv", "rb"), delimiter=',')
    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    for i in xrange(NUMBER_OF_ROUNDS):
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        prior_spam = nb.take_p_spam(train_set, SPAM_ATTR_INDEX)
        prior_ham = nb.take_p_ham(train_set, SPAM_ATTR_INDEX)

        hits = 0.0
        misses = 0.0

        correctly_is_spam = 0.0  # number of instances correctly evaluated as spam
        is_spam = 0.0            # total number of spam instances
        guessed_spam = 0.0       # total number of instances evaluated as spam

        correctly_is_ham = 0.0   # number of instances correctly evaluated as ham
        is_ham = 0.0             # total number of ham instances
        guessed_ham = 0.0        # total number of instances evaluated as ham

        # we pre-calculate these in order to save processing
        ps_attribute_spam = map(lambda elem: nb.take_p_attribute_spam(train_set, elem, SPAM_ATTR_INDEX), xrange(54))
        ps_attribute_ham = map(lambda elem: nb.take_p_attribute_ham(train_set, elem, SPAM_ATTR_INDEX), xrange(54))

        # now we test the hypothesis against the test set
        for row in test_set:
            # these are the products of the conditional probabilities for spam and for ham, respectively
            product_p_spam_attributes = reduce(
                lambda acc, elem: acc * ps_attribute_spam[elem] if row[elem] == 1 else acc,
                xrange(54), 1.0)
            product_p_ham_attributes = reduce(
                lambda acc, elem: acc * ps_attribute_ham[elem] if row[elem] == 1 else acc,
                xrange(54), 1.0)

            # the posterior equals the prior times the product of the conditional probabilities
            p_spam_attribute = prior_spam * product_p_spam_attributes
            p_ham_attribute = prior_ham * product_p_ham_attributes

            # whichever is greater - that will be our prediction
            if p_spam_attribute > p_ham_attribute:
                guess = 1
            else:
                guess = 0

            # both values should in theory be divided by the same normalization (evidence)
            # term, but we just want to know which of the two (spam or ham) is more likely,
            # so even though each result is skewed by a constant, both classes are skewed
            # by that same constant and the comparison between the two is still valid
            if (row[SPAM_ATTR_INDEX] == 0) and (guess == 0):
                is_ham += 1
                guessed_ham += 1
                correctly_is_ham += 1
                hits += 1
            elif (row[SPAM_ATTR_INDEX] == 0) and (guess == 1):
                is_ham += 1
                guessed_spam += 1
                misses += 1
            elif (row[SPAM_ATTR_INDEX] == 1) and (guess == 0):
                is_spam += 1
                guessed_ham += 1
                misses += 1
            elif (row[SPAM_ATTR_INDEX] == 1) and (guess == 1):
                is_spam += 1
                guessed_spam += 1
                correctly_is_spam += 1
                hits += 1

        # accuracy = number of correctly evaluated instances / number of instances
        accuracy = hits / (hits + misses)

        # precision_spam = number of instances correctly evaluated as spam / number of spam instances
        # (guard against division by zero in case nothing was found)
        if is_spam == 0:
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam / is_spam

        # recall_spam = number of instances correctly evaluated as spam / number of instances evaluated as spam
        if guessed_spam == 0:
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam / guessed_spam

        # precision_ham = number of instances correctly evaluated as ham / number of ham instances
        if is_ham == 0:
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham / is_ham

        # recall_ham = number of instances correctly evaluated as ham / number of instances evaluated as ham
        if guessed_ham == 0:
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham / guessed_ham

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)
    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)
    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)
    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 3 - ALL ATTRIBUTES - USING BERNOULLI MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: ' + str(round(mean_accuracy, 5))
        print 'STD. DEV. OF ACCURACY: ' + str(round(std_dev_accuracy, 5))
        print 'VARIANCE OF ACCURACY: ' + str(round(variance_accuracy, 8))
        print ''
        print 'MEAN PRECISION FOR SPAM: ' + str(round(mean_precision_spam, 5))
        print 'STD. DEV. OF PRECISION FOR SPAM: ' + str(round(std_dev_precision_spam, 5))
        print 'VARIANCE OF PRECISION FOR SPAM: ' + str(round(variance_precision_spam, 8))
        print ''
        print 'MEAN RECALL FOR SPAM: ' + str(round(mean_recall_spam, 5))
        print 'STD. DEV. OF RECALL FOR SPAM: ' + str(round(std_dev_recall_spam, 5))
        print 'VARIANCE OF RECALL FOR SPAM: ' + str(round(variance_recall_spam, 8))
        print ''
        print 'MEAN PRECISION FOR HAM: ' + str(round(mean_precision_ham, 5))
        print 'STD. DEV. OF PRECISION FOR HAM: ' + str(round(std_dev_precision_ham, 5))
        print 'VARIANCE OF PRECISION FOR HAM: ' + str(round(variance_precision_ham, 8))
        print ''
        print 'MEAN RECALL FOR HAM: ' + str(round(mean_recall_ham, 5))
        print 'STD. DEV. OF RECALL FOR HAM: ' + str(round(std_dev_recall_ham, 5))
        print 'VARIANCE OF RECALL FOR HAM: ' + str(round(variance_recall_ham, 8))
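
# nb.take_p_attribute_spam / nb.take_p_attribute_ham are defined elsewhere in
# the project. As an illustration only (a sketch, not the project's actual
# implementation), P(attribute = 1 | spam) could be estimated from the
# binarized training set as below; the optional add-one (Laplace) smoothing
# keeps a word never seen in spam from zeroing out the whole product above.
def _p_attribute_given_spam_sketch(train_set, attr_index, label_index, smoothing=1.0):
    spam_rows = train_set[train_set[:, label_index] == 1]
    return (np.sum(spam_rows[:, attr_index] == 1) + smoothing) / (len(spam_rows) + 2.0 * smoothing)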
def case1(index=CASE_1_ATTRIBUTE_INDEX, output=True, ret='accuracy'):
    # does not distinguish between emails where an attribute appears more than
    # once and those where said attribute appears only once.
    # note that we're not using equiprobable priors here:
    # we approximate the population priors with the sample priors.
    accuracy_in_each_turn = list()
    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()
    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/binarized_data.csv", "rb"), delimiter=',')
    shuffled = np.random.permutation(m)

    for i in xrange(NUMBER_OF_ROUNDS):
        # we're using cross-validation, so in each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        p_attribute_spam = nb.take_p_attribute_spam(train_set, index, SPAM_ATTR_INDEX)
        p_spam = nb.take_p_spam(train_set, SPAM_ATTR_INDEX)
        p_attribute = nb.take_p_attribute(train_set, index, SPAM_ATTR_INDEX)

        hits = 0.0
        misses = 0.0

        correctly_is_spam = 0.0  # number of instances correctly evaluated as spam
        is_spam = 0.0            # total number of spam instances
        guessed_spam = 0.0       # total number of instances evaluated as spam

        correctly_is_ham = 0.0   # number of instances correctly evaluated as ham
        is_ham = 0.0             # total number of ham instances
        guessed_ham = 0.0        # total number of instances evaluated as ham

        # Bayes' rule; whichever is greater - that will be our prediction
        p_spam_attribute = (p_attribute_spam * p_spam) / p_attribute
        p_ham_attribute = 1 - p_spam_attribute

        if p_spam_attribute > p_ham_attribute:
            guess = 1
        else:
            guess = 0

        # now we test the hypothesis against the test set
        for row in test_set:
            # if the attribute isn't there, then our actual guess is the opposite of the calculated guess
            if (row[index] == 0) and (guess == 1):
                actual_guess = 0
            elif (row[index] == 0) and (guess == 0):
                actual_guess = 1
            else:
                actual_guess = guess

            if (row[SPAM_ATTR_INDEX] == 0) and (actual_guess == 0):
                is_ham += 1
                guessed_ham += 1
                correctly_is_ham += 1
                hits += 1
            elif (row[SPAM_ATTR_INDEX] == 0) and (actual_guess == 1):
                is_ham += 1
                guessed_spam += 1
                misses += 1
            elif (row[SPAM_ATTR_INDEX] == 1) and (actual_guess == 0):
                is_spam += 1
                guessed_ham += 1
                misses += 1
            elif (row[SPAM_ATTR_INDEX] == 1) and (actual_guess == 1):
                is_spam += 1
                guessed_spam += 1
                hits += 1
                correctly_is_spam += 1

        # accuracy = number of correctly evaluated instances / number of instances
        accuracy = hits / (hits + misses)

        # precision_spam = number of instances correctly evaluated as spam / number of spam instances
        # (guard against division by zero in case nothing was found)
        if is_spam == 0:
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam / is_spam

        # recall_spam = number of instances correctly evaluated as spam / number of instances evaluated as spam
        if guessed_spam == 0:
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam / guessed_spam

        # precision_ham = number of instances correctly evaluated as ham / number of ham instances
        if is_ham == 0:
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham / is_ham

        # recall_ham = number of instances correctly evaluated as ham / number of instances evaluated as ham
        if guessed_ham == 0:
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham / guessed_ham

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of means for each metric at the end
    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)
    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)
    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)
    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 1 - ONE ATTRIBUTE - USING BERNOULLI MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: ' + str(round(mean_accuracy, 5))
        print 'STD. DEV. OF ACCURACY: ' + str(round(std_dev_accuracy, 5))
        print 'VARIANCE OF ACCURACY: ' + str(round(variance_accuracy, 8))
        print ''
        print 'MEAN PRECISION FOR SPAM: ' + str(round(mean_precision_spam, 5))
        print 'STD. DEV. OF PRECISION FOR SPAM: ' + str(round(std_dev_precision_spam, 5))
        print 'VARIANCE OF PRECISION FOR SPAM: ' + str(round(variance_precision_spam, 8))
        print ''
        print 'MEAN RECALL FOR SPAM: ' + str(round(mean_recall_spam, 5))
        print 'STD. DEV. OF RECALL FOR SPAM: ' + str(round(std_dev_recall_spam, 5))
        print 'VARIANCE OF RECALL FOR SPAM: ' + str(round(variance_recall_spam, 8))
        print ''
        print 'MEAN PRECISION FOR HAM: ' + str(round(mean_precision_ham, 5))
        print 'STD. DEV. OF PRECISION FOR HAM: ' + str(round(std_dev_precision_ham, 5))
        print 'VARIANCE OF PRECISION FOR HAM: ' + str(round(variance_precision_ham, 8))
        print ''
        print 'MEAN RECALL FOR HAM: ' + str(round(mean_recall_ham, 5))
        print 'STD. DEV. OF RECALL FOR HAM: ' + str(round(std_dev_recall_ham, 5))
        print 'VARIANCE OF RECALL FOR HAM: ' + str(round(variance_recall_ham, 8))

    # we'll only use these return values to compute rankings,
    # for example in the script which_attribute_case_1
    if ret == 'utility':
        return mean_accuracy * mean_precision_ham
    elif ret == 'accuracy':
        return mean_accuracy
    else:
        print 'UNKNOWN METRIC: ' + ret
        sys.exit()
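
# Illustrative numeric check of the Bayes' rule step in case1 above, with
# hypothetical numbers (not estimated from the dataset): given
# P(attr=1|spam) = 0.60, P(spam) = 0.40 and P(attr=1) = 0.30, we get
# P(spam|attr=1) = 0.60 * 0.40 / 0.30 = 0.80, hence P(ham|attr=1) = 0.20 and
# the guess for emails containing the attribute is spam (1); emails without
# the attribute receive the opposite guess, as in the loop above.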