# Imports used across the snippets in this section.
from collections import Counter
import math

import numpy as np
from scipy.special import logsumexp

import util


def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files
    the key occurred in.
    """
    word_counts = Counter()
    for filename in file_list:
        # Deduplicate within each file so a word is counted at most once per
        # file, matching the docstring (file counts, not occurrence counts).
        for word in set(util.get_words_in_file(filename)):
            word_counts[word] += 1
    return word_counts
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files
    the key occurred in.
    """
    word_dict = Counter()
    for file in file_list:
        # set() ignores repeated occurrences of a word within a single file.
        words = set(util.get_words_in_file(file))
        for item in words:
            word_dict[item] += 1
    return word_dict
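
# A minimal, self-contained sketch (made-up token lists, not from the problem
# set) showing why the per-file set() above matters: this document-frequency
# scheme counts each file at most once per word, while raw counting would
# tally every occurrence.
_demo_files = [["free", "money", "free"], ["money", "now"]]  # hypothetical tokenized emails
_doc_freq = Counter()
for _words in _demo_files:
    for _w in set(_words):
        _doc_freq[_w] += 1
# _doc_freq == Counter({'money': 2, 'free': 1, 'now': 1})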
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    pspam = log_prior_by_category[0]
    pYgivenSpam = log_probabilities_by_category[0]
    pham = log_prior_by_category[1]
    pYgivenHam = log_probabilities_by_category[1]

    # Accumulate the log-likelihood of every word occurrence in the email.
    # Assumes the log-probability tables have an entry for each word seen here.
    words = util.get_words_in_file(email_filename)
    for word in words:
        pspam += pYgivenSpam[word]
        pham += pYgivenHam[word]

    if pspam >= pham:
        return 'spam'
    return 'ham'
def get_all_unique_words(files):
    ret = set()
    for f in files:
        words = util.get_words_in_file(f)
        ret = ret.union(set(words))
    return ret
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    spamfiles = file_lists_by_category[0]
    hamfiles = file_lists_by_category[1]

    # Vocabulary: every word seen in either class.
    w = set()
    for spamfile in spamfiles:
        w.update(util.get_words_in_file(spamfile))
    for hamfile in hamfiles:
        w.update(util.get_words_in_file(hamfile))

    # Per-class document frequencies: number of files each word occurs in.
    spam_count = util.get_counts(spamfiles)
    ham_count = util.get_counts(hamfiles)
    n_spam = len(spamfiles)
    n_ham = len(hamfiles)

    # Laplace smoothing for the Bernoulli model: get_counts returns file
    # counts, so the denominator is the number of files in that class + 2
    # (not the total word count, which would mix the two classes).
    dict_spam = {}
    dict_ham = {}
    for key in w:
        dict_spam[key] = (spam_count[key] + 1) / (n_spam + 2)
        dict_ham[key] = (ham_count[key] + 1) / (n_ham + 2)

    probabilities_by_category = (dict_spam, dict_ham)
    return probabilities_by_category
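
# Worked example of the Laplace-smoothed Bernoulli estimate above,
# p_d = (count_d + 1) / (N + 2), with made-up numbers: if a word appears in
# 30 of 100 spam files, the smoothed estimate is (30 + 1) / (100 + 2).
_count_d, _n_files = 30, 100
_p_d = (_count_d + 1) / (_n_files + 2)       # ~0.304
# A word never seen in a class still gets probability 1 / (N + 2) rather
# than 0, which keeps the log-likelihoods finite at classification time.
_p_unseen = (0 + 1) / (_n_files + 2)         # ~0.0098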
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    # Use a set for O(1) membership tests.
    word_list = set(util.get_words_in_file(email_filename))
    len0 = len(log_probabilities_by_category[0])
    len1 = len(log_probabilities_by_category[1])

    P_spam = 0
    P_ham = 0
    for word in np.unique(list(log_probabilities_by_category[0].keys()) +
                          list(log_probabilities_by_category[1].keys())):
        if word in word_list:
            if word in log_probabilities_by_category[0]:
                P_spam += log_probabilities_by_category[0][word]
            else:
                # Back-off for words unseen in this class's training data.
                P_spam += np.log(1 / (len0 + 2))
            if word in log_probabilities_by_category[1]:
                P_ham += log_probabilities_by_category[1][word]
            else:
                P_ham += np.log(1 / (len1 + 2))
        else:
            # Absent words contribute log(1 - p) under the Bernoulli model.
            if word in log_probabilities_by_category[0]:
                P_spam += np.log(1 - np.exp(log_probabilities_by_category[0][word]))
            else:
                P_spam += np.log(1 - 1 / (len0 + 2))
            if word in log_probabilities_by_category[1]:
                P_ham += np.log(1 - np.exp(log_probabilities_by_category[1][word]))
            else:
                P_ham += np.log(1 - 1 / (len1 + 2))

    P_spam += log_prior_by_category[0]
    P_ham += log_prior_by_category[1]
    return 'spam' if P_spam >= P_ham else 'ham'
def classify_new_email(filename, probabilities_by_category, prior_by_category,
                       decisionFactor=1):
    r"""
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the
    parameter in the prior class distribution

    Optional argument decisionFactor is 1 by default. decisionFactor > 1 will
    let the model make decisions in favour of HAM.

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose
    value is either 'spam' or 'ham' depending on the classification result,
    and the second element is a two-element list as
    [log p(y=1|x), log p(y=0|x)], representing the log posterior probabilities
    """
    # Indexing is confusing, so give the distributions intuitive names.
    spamDistribution, hamDistribution = probabilities_by_category

    # Build the feature vector for this mail. `vocab` and `num` are
    # module-level globals set up by learn_distributions: vocab maps every
    # training word (plus "<unk>") to 0, and num is the sentinel key that
    # pools all numeric tokens.
    mailFeatureVec = vocab.copy()
    for word in util.get_words_in_file(filename):
        if word in mailFeatureVec:
            # word is a regular, known word
            mailFeatureVec[word] += 1
        elif word.isnumeric():
            # word represents a numeric value
            mailFeatureVec[num] += 1
        else:
            # word is not recognized
            mailFeatureVec["<unk>"] += 1

    spamProb = 0
    hamProb = 0
    for word in mailFeatureVec:
        # Accumulate log P(mailFeatureVec | y=spam) and log P(mailFeatureVec | y=ham).
        spamProb += mailFeatureVec[word] * math.log(spamDistribution[word])
        hamProb += mailFeatureVec[word] * math.log(hamDistribution[word])

    # Add the log prior for each class.
    spamProb += math.log(prior_by_category[0])
    hamProb += math.log(prior_by_category[1])

    # Note: the log posteriors are negative, so scaling hamProb by a factor
    # greater than 1 makes it smaller.
    hamProb *= decisionFactor

    result = "ham" if (hamProb > spamProb) else "spam"
    return (result, (spamProb, hamProb))  # classify_result
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    email_words = set(util.get_words_in_file(email_filename))

    # Union of all words seen in either training category.
    test_words = set()
    for word_probs in log_probabilities_by_category:
        for word in word_probs:
            test_words.add(word)

    spam = log_prior_by_category[0]
    ham = log_prior_by_category[1]
    spam_data = log_probabilities_by_category[0]
    ham_data = log_probabilities_by_category[1]

    # Bernoulli model: present words contribute log p, absent words log(1 - p).
    for word in test_words:
        if word in email_words:
            spam += spam_data[word]
            ham += ham_data[word]
        else:
            spam += math.log(1 - math.exp(spam_data[word]))
            ham += math.log(1 - math.exp(ham_data[word]))

    if spam > ham:
        label = "spam"
    else:
        label = "ham"
    return label
def train_logistic(file_lists_by_category):
    """
    Extract features and labels for the logistic regression model, and train
    the model.

    Note that you'll need to arbitrarily pick one of spam and ham to be 0, and
    the other to be 1, when creating the labels for logistic regression. The
    choice doesn't matter; just make sure you are consistent about it.

    Inputs
    ------
    A two-element list. The first element is a list of spam files, and the
    second element is a list of ham (non-spam) files.

    Output
    ------
    Tuple of (theta, all_words) where theta is the vector of trained logistic
    regression parameters, and all_words is the list of all words found in the
    dataset (reused later to make sure we extract features in a consistent
    manner)
    """
    # Build the set of all words.
    all_words = set()
    for filelist in file_lists_by_category:
        for f in filelist:
            words_in_f = util.get_words_in_file(f)
            all_words.update(words_in_f)
    all_words = list(all_words)

    num_spam = len(file_lists_by_category[0])
    num_ham = len(file_lists_by_category[1])
    num_examples = num_spam + num_ham
    num_features = len(all_words)

    # Allocate data containers.
    y = np.zeros((num_examples, num_features))

    # Let label spam = 1; spam files come first in the example ordering.
    c = np.zeros(num_examples)
    c[:num_spam] = 1.0

    # Get features for each file.
    j = 0  # Row counter.
    for filelist in file_lists_by_category:
        for f in filelist:
            y[j, :] = extract_features(f, all_words)
            j += 1

    # Optimize parameters.
    theta = optimize_theta(y, c)  # e.g. learning_rate=4.0, convergence_threshold=1e-6
    return theta, all_words
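
# optimize_theta is defined elsewhere in the original code; below is a minimal
# gradient-ascent sketch (an assumption, not the author's implementation) of
# the update such a routine typically performs for logistic regression with
# labels c in {0, 1} and design matrix y.
def _optimize_theta_sketch(y, c, learning_rate=0.1, num_steps=200):
    theta = np.zeros(y.shape[1])
    for _ in range(num_steps):
        preds = 1.0 / (1.0 + np.exp(-(y @ theta)))  # sigmoid of the linear scores
        grad = y.T @ (c - preds)                    # gradient of the log-likelihood
        theta += learning_rate * grad
    return theta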
def classify_new_email(filename, probabilities_by_category, prior_by_category,
                       adjustment):
    r"""
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the
    parameter in the prior class distribution
    adjustment: factor applied to the ham log posterior in the decision rule

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose
    value is either 'spam' or 'ham' depending on the classification result,
    and the second element is a two-element list as
    [log p(y=1|x), log p(y=0|x)], representing the log posterior probabilities
    """
    # Prior distributions (initial log posteriors).
    MAP_spam = math.log(prior_by_category[0])
    MAP_ham = math.log(prior_by_category[1])

    # Use a set for fast membership tests.
    all_words = set(util.get_words_in_file(filename))
    vocab = list(probabilities_by_category[0].keys())

    # Bernoulli model: each vocabulary word contributes log p if present,
    # log(1 - p) if absent.
    for word in vocab:
        if word in all_words:
            MAP_spam += math.log(probabilities_by_category[0][word])
            MAP_ham += math.log(probabilities_by_category[1][word])
        else:
            MAP_spam += math.log(1 - probabilities_by_category[0][word])
            MAP_ham += math.log(1 - probabilities_by_category[1][word])

    # Check the result; adjustment scales the (negative) ham log posterior.
    if MAP_spam > adjustment * MAP_ham:
        result = 'spam'
    else:
        result = 'ham'

    classify_result = (result, [MAP_spam, MAP_ham])
    return classify_result
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    prob_spam = log_prior_by_category[0]
    prob_ham = log_prior_by_category[1]
    spam_probs = log_probabilities_by_category[0]
    ham_probs = log_probabilities_by_category[1]

    spam_probs_cond = 0
    ham_probs_cond = 0

    # For each word in the email, add the log-prob of it appearing in spam or
    # ham. If the word was not in the training dataset, assign it probability
    # 1 / (# of words in the relevant dictionary + 2).
    email_words = util.get_words_in_file(email_filename)
    for word in email_words:
        if word not in spam_probs:
            spam_probs_cond += -np.log(len(spam_probs) + 2)
        else:
            spam_probs_cond += spam_probs[word]
        if word not in ham_probs:
            ham_probs_cond += -np.log(len(ham_probs) + 2)
        else:
            ham_probs_cond += ham_probs[word]

    # Using Bayes' theorem, normalize in log space: subtract the log evidence
    # (logsumexp of the joint log probabilities) rather than dividing.
    log_joint = [prob_spam + spam_probs_cond, prob_ham + ham_probs_cond]
    norm = logsumexp(log_joint)
    spam = log_joint[0] - norm
    ham = log_joint[1] - norm

    # Return whichever class has the higher log posterior.
    if spam > ham:
        return 'spam'
    else:
        return 'ham'
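
# Sketch of the normalization above on hypothetical numbers: the log posterior
# is log p(y|x) = log p(x|y) + log p(y) - logsumexp over both classes.
_log_joint = np.array([-120.0, -118.0])          # assumed log p(x, y) for spam/ham
_log_posterior = _log_joint - logsumexp(_log_joint)
# np.exp(_log_posterior) sums to 1 (about [0.12, 0.88]).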
def classify_message(message_filename, log_probabilities_by_category,
                     log_prior_by_category, names=['spam', 'ham']):
    """
    Uses Naive Bayes classification to classify the message in the given file.

    Inputs
    ------
    message_filename : name of the file containing the message to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions
    names : labels for each class (for this problem set, will always be just
            spam and ham).

    Output
    ------
    One of the labels in names.
    """
    message_words = util.get_words_in_file(message_filename)
    N_categories = len(log_probabilities_by_category)

    log_likelihoods = []
    for i in range(N_categories):
        total = 0
        # Work on a copy so words present in the message can be popped off;
        # what remains are the absent words, which contribute log(1 - p).
        all_word_log_probs = log_probabilities_by_category[i].copy()
        for w in set(message_words):
            if w in all_word_log_probs:
                total += all_word_log_probs.pop(w)
        total += np.sum(np.log(1 - np.exp(list(all_word_log_probs.values()))))
        log_likelihoods.append(total)

    posterior = np.array(log_likelihoods) + np.array(log_prior_by_category)
    winner = np.argmax(posterior)
    return names[winner]
def classify_new_email(filename, probabilities_by_category, prior_by_category,
                       eps=1):
    r"""
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose
    value is either 'spam' or 'ham' depending on the classification result,
    and the second element is a two-element list as
    [log p(y=1|x), log p(y=0|x)], representing the log posterior probabilities
    """
    # eps is our decision adjustment: it scales the ham log posterior in the
    # comparison below.
    # Our priors.
    Pspam = np.log(prior_by_category[0])
    Pham = np.log(prior_by_category[1])

    # All of our vocabulary.
    vocab = probabilities_by_category[0].keys()
    # Words in this email, as a set for fast membership tests.
    words = set(util.get_words_in_file(filename))

    for word in vocab:
        # Depending on whether the word is in the email, add the log
        # probability of it being present or absent.
        if word in words:
            Pspam += np.log(probabilities_by_category[0][word])
            Pham += np.log(probabilities_by_category[1][word])
        else:
            Pspam += np.log(1 - probabilities_by_category[0][word])
            Pham += np.log(1 - probabilities_by_category[1][word])

    if Pspam > eps * Pham:
        classify_result = ('spam', [Pspam, Pham])
    else:
        classify_result = ('ham', [Pspam, Pham])
    return classify_result
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    # Labels used for the return value: 0 is spam, 1 is ham.
    labels = {0: 'spam', 1: 'ham'}

    # Get the unique occurrences of words in the email.
    new_words = set(util.get_words_in_file(email_filename))

    # Union of all words seen in either training category.
    all_words = []
    for i in range(len(labels)):
        all_words += log_probabilities_by_category[i].keys()
    all_words = list(set(all_words))

    # Calculate the log posterior for spam and ham.
    posterior_dist = []
    for label in labels:
        posterior_dist.append(log_prior_by_category[label])
        for word in all_words:
            if word in new_words:
                # p_d or q_d
                posterior_dist[label] += log_probabilities_by_category[label][word]
            else:
                # 1 - p_d or 1 - q_d
                posterior_dist[label] += util.careful_log(
                    1 - np.exp(log_probabilities_by_category[label][word]))

    # Return the MAP label.
    winner = np.argmax(posterior_dist)
    return labels[winner]
def get_counts(file_list):
    '''
    Computes counts for each word that occurs in the files in file_list.

    :param file_list: a list of filenames
    :return: A dictionary whose keys are words, and whose values are the
             number of files the key occurred in.
    '''
    dict_of_words = util.Counter()
    for file in file_list:
        words_in_file = util.get_words_in_file(file)
        # For each unique word in a file, increment the count.
        for word in set(words_in_file):
            dict_of_words[word] += 1
    return dict_of_words
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions

    Output
    ------
    Either "spam" or "ham"
    """
    email_words = set(util.get_words_in_file(email_filename))
    N_categories = len(log_probabilities_by_category)

    # Get the union of all words encountered during training.
    all_words = []
    for i in range(N_categories):
        all_words += log_probabilities_by_category[i].keys()
    all_words = list(set(all_words))

    log_likelihoods = []
    for i in range(N_categories):
        total = 0
        all_word_log_probs = log_probabilities_by_category[i]
        for w in all_words:
            log_prob = all_word_log_probs[w]
            if w in email_words:
                total += log_prob
            else:
                total += np.log(1 - np.exp(log_prob))
        log_likelihoods.append(total)

    posterior = np.array(log_likelihoods) + np.array(log_prior_by_category)
    winner = np.argmax(posterior)
    if winner == 0:
        return "spam"
    else:
        return "ham"
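
# The log(1 - exp(log_prob)) terms above can underflow to log(1.0) == 0.0 when
# log_prob is very negative; a sketch (assumed, not part of the original) of a
# numerically safer form using np.log1p:
_lp = np.log(1e-20)
# np.log(1 - np.exp(_lp)) evaluates to exactly 0.0, losing the tiny penalty,
# while np.log1p(-np.exp(_lp)) correctly returns about -1e-20.
_safe = np.log1p(-np.exp(_lp))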
def classify_message(message_filename, log_probabilities_by_category,
                     log_prior_by_category, names=['spam', 'ham']):
    """
    Uses Naive Bayes classification to classify the message in the given file.

    Inputs
    ------
    message_filename : name of the file containing the message to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions
    names : labels for each class (for this problem set, will always be just
            spam and ham).

    Output
    ------
    One of the labels in names.
    """
    words = util.get_words_in_file(message_filename)
    words_set = set(words)  # Faster lookup.

    # Start with just the prior probability of 'spam' and 'ham', respectively.
    posteriors = np.array(log_prior_by_category)

    # Feature space is all the words from spam and ham.
    all_words = set(log_probabilities_by_category[0].keys())
    all_words.update(list(log_probabilities_by_category[1].keys()))

    # Sum log-probabilities of generating each observed word given the label.
    # Here each word maps to a pair [log P(absent), log P(present)], indexed
    # by has_word.
    for w in all_words:
        has_word = 1 if w in words_set else 0
        posteriors[0] += log_probabilities_by_category[0][w][has_word]
        posteriors[1] += log_probabilities_by_category[1][w][has_word]

    if posteriors[0] >= posteriors[1]:
        return names[0]
    else:
        return names[1]
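
# A tiny sketch (hypothetical numbers) of the per-word representation this
# variant assumes: each word maps to [log P(absent), log P(present)], so the
# loop above can index it directly with has_word in {0, 1}.
_p_present = 0.3                                  # assumed P(word present | class)
_log_pair = [np.log(1 - _p_present), np.log(_p_present)]
_has_word = 1
_contribution = _log_pair[_has_word]              # log 0.3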
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files
    the key occurred in.
    """
    ctr = util.Counter()
    for f in file_list:
        for w in set(util.get_words_in_file(f)):
            ctr[w] += 1
    return ctr
def classify_message(message_filename, log_probabilities_by_category,
                     log_prior_by_category, names=['spam', 'ham']):
    '''
    Classify the message in the given file using the learned parameters.

    :param message_filename: name of the file containing the message to be
                             classified
    :param log_probabilities_by_category:
    :param log_prior_by_category:
    :param names: class labels
    :return: the predicted class
    '''
    try:
        words_in_file = set(util.get_words_in_file(message_filename))
    except Exception:
        return "file cannot be decoded"

    num_of_categories = len(log_prior_by_category)

    # Make a list of all the words seen in training.
    all_words_from_training = []
    for i in range(num_of_categories):
        all_words_from_training += log_probabilities_by_category[i].keys()
    all_words_from_training = list(set(all_words_from_training))

    log_likelihoods = []
    for i in range(num_of_categories):
        total = 0
        all_word_log_probs = log_probabilities_by_category[i]
        for word in all_words_from_training:
            log_prob = all_word_log_probs[word]
            is_in_file = (word in words_in_file)
            # Present words contribute log p; absent words log(1 - p).
            total += (is_in_file * log_prob
                      + (1 - is_in_file) * np.log(1 - np.exp(log_prob)))
        log_likelihoods.append(total)

    posterior = np.array(log_likelihoods) + np.array(log_prior_by_category)
    predicted_category = np.argmax(posterior)
    return names[predicted_category]
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files
    the key occurred in.
    """
    counts = util.Counter()
    for f in file_list:
        words = util.get_words_in_file(f)
        for w in set(words):
            counts[w] += 1
    return counts
def classify_new_email(filename, probabilities_by_category, prior_by_category,
                       b=0):
    r"""
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose
    value is either 'spam' or 'ham' depending on the classification result,
    and the second element is a two-element list as
    [log p(y=1|x), log p(y=0|x)], representing the log posterior probabilities
    """
    # The two classes are handled the same way.
    log_probabilities = [0, 0]
    x = util.get_words_in_file(filename)
    # Count occurrences once up front instead of calling x.count() per word.
    x_counts = Counter(x)
    for i in range(2):
        for word in probabilities_by_category[i]:
            x_d = x_counts[word]
            log_probabilities[i] += x_d * np.log(probabilities_by_category[i][word])
        # Add the log prior. With both priors 0.5 this does not change the
        # decision, but it makes the log posteriors correct.
        log_probabilities[i] += np.log(prior_by_category[i])

    # b biases the decision toward spam when positive.
    if log_probabilities[0] + b >= log_probabilities[1]:
        classify_result = ('spam', log_probabilities)
    else:
        classify_result = ('ham', log_probabilities)
    return classify_result
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files
    the key occurred in.
    """
    res = {}
    for f in file_list:
        # Deduplicate so each file contributes at most 1 to a word's count,
        # as the docstring specifies.
        for w in set(util.get_words_in_file(f)):
            if w not in res:
                res[w] = 1
            else:
                res[w] += 1
    return res
def extract_features(f, all_words):
    """
    Extract features from file for logistic regression.

    Inputs
    ------
    f : Name of file to extract features from.
    all_words : List of all words in the training set of files.

    Output
    ------
    Extracted features.
    """
    words_in_f = set(util.get_words_in_file(f))
    features = np.zeros(len(all_words))
    # Set entry i to 0 or 1 to indicate absence or presence of word i.
    for i, w in enumerate(all_words):
        if w in words_in_f:
            features[i] = 1.0
    return features
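
# A self-contained sketch (made-up vocabulary and message, no files) of the
# same indicator encoding: feature i is 1.0 iff vocabulary word i occurs in
# the message.
_vocab_demo = ["free", "money", "meeting"]         # hypothetical vocabulary
_message_demo = {"money", "tomorrow"}              # hypothetical token set
_features_demo = np.array([1.0 if w in _message_demo else 0.0
                           for w in _vocab_demo])  # -> [0., 1., 0.]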
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files
    the key occurred in.
    """
    counter = Counter()
    for filename in file_list:
        # Make sure multiple occurrences of a word per email are ignored.
        word_set = set(util.get_words_in_file(filename))
        for word in word_set:
            counter[word] += 1
    return counter
def classify_new_email(filename, probabilities_by_category, prior_by_category):
    r"""
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose
    value is either 'spam' or 'ham' depending on the classification result,
    and the second element is a two-element list as
    [log p(y=1|x), log p(y=0|x)], representing the log posterior probabilities
    """
    prior_0 = prior_by_category[0]
    prior_1 = prior_by_category[1]

    words = util.get_words_in_file(filename)

    # get_log_pxy (defined elsewhere) returns log p(x | y) for the given
    # category; p_xy0 uses the p_d estimates and p_xy1 the q_d estimates.
    p_xy0 = get_log_pxy(probabilities_by_category, 0, words) + np.log(prior_0)
    p_xy1 = get_log_pxy(probabilities_by_category, 1, words) + np.log(prior_1)

    res = 'spam' if p_xy0 - p_xy1 > 0 else 'ham'
    classify_result = (res, (p_xy0, p_xy1))
    return classify_result
def classify_new_email(filename, probabilities_by_category, prior_by_category,
                       zeta):
    r"""
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose
    value is either 'spam' or 'ham' depending on the classification result,
    and the second element is a two-element list as
    [log p(y=1|x), log p(y=0|x)], representing the log posterior probabilities
    """
    ### Construct the feature vector x (word -> count in this email).
    x = dict()
    words = util.get_words_in_file(filename)
    for w in words:
        if w in x:
            x[w] += 1
        else:
            x[w] = 1

    ### Calculate multinomial_coef = [(x1+x2+...+xd)!] / [(x1!)(x2!)...(xd!)]
    numerator = 0
    for w in x:
        numerator += x[w]
    numerator = math.factorial(numerator)
    denominator = 1
    for w in x:
        denominator = denominator * math.factorial(x[w])
    log_multinomial_coef = math.log10(numerator) - math.log10(denominator)

    ### Posterior of being spam: log[p(y=1|x)] = log[p(x|y=1) * p(y=1)]
    p_d = probabilities_by_category[0]
    log_of_product_spam = 0
    for w in p_d:
        if w in x:
            log_of_product_spam += x[w] * math.log10(p_d[w])
    p_y1 = (log_of_product_spam + log_multinomial_coef
            + math.log10(prior_by_category[0]))

    ### Posterior of being ham: log[p(y=0|x)] = log[p(x|y=0) * p(y=0)]
    q_d = probabilities_by_category[1]
    log_of_product_ham = 0
    for w in q_d:
        if w in x:
            log_of_product_ham += x[w] * math.log10(q_d[w])
    p_y0 = (log_of_product_ham + log_multinomial_coef
            + math.log10(prior_by_category[1]))

    if p_y1 >= zeta * p_y0:
        classify_result = ("spam", [p_y1, p_y0])
    else:
        classify_result = ("ham", [p_y1, p_y0])
    return classify_result
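
# The factorials above become enormous integers for long emails; an assumed
# alternative (not the author's code) is to stay in log space with
# math.lgamma, since log(n!) = lgamma(n + 1). lgamma returns natural logs, so
# divide by log(10) to match the log10 convention used above.
_counts = [3, 1, 2]                                # hypothetical word counts x_d
_log_coef = (math.lgamma(sum(_counts) + 1)
             - sum(math.lgamma(c + 1) for c in _counts))
_log10_coef = _log_coef / math.log(10)             # log10(6!/(3!*1!*2!)) = log10(60)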
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    email_dict = set(util.get_words_in_file(email_filename))

    spam_prob = 0
    ham_prob = 0
    spam_comp = 0
    ham_comp = 0

    # Union of all words seen in either training category.
    all_words = set(log_probabilities_by_category[0].keys())
    all_words.update(log_probabilities_by_category[1].keys())

    for word in all_words:
        if word in email_dict:
            spam_prob += log_probabilities_by_category[0][word]
            ham_prob += log_probabilities_by_category[1][word]
        else:
            # Absent words contribute log(1 - p) under the Bernoulli model.
            spam_num = np.exp(log_probabilities_by_category[0][word])
            spam_comp += np.log(1 - spam_num)
            ham_num = np.exp(log_probabilities_by_category[1][word])
            ham_comp += np.log(1 - ham_num)

    # Include both the present-word and absent-word terms in the posteriors.
    log_spam_prob = log_prior_by_category[0] + spam_prob + spam_comp
    log_ham_prob = log_prior_by_category[1] + ham_prob + ham_comp

    if log_spam_prob - log_ham_prob > 0:
        return 'spam'
    else:
        return 'ham'
def classify_new_email(filename, probabilities_by_category, prior_by_category):
    r"""
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose
    value is either 'spam' or 'ham' depending on the classification result,
    and the second element is a two-element list as
    [log p(y=1|x), log p(y=0|x)], representing the log posterior probabilities
    """
    spam_prob = probabilities_by_category[0]
    ham_prob = probabilities_by_category[1]
    w = list(spam_prob.keys())

    # Words in the email, as a set for fast membership tests.
    words = set(util.get_words_in_file(filename))

    # Binary feature vector over the vocabulary: x[i] = 1 iff word i appears.
    x = np.zeros(len(w))
    for itr, wi in enumerate(w):
        if wi in words:
            x[itr] = 1

    # Assumes both dicts share the same vocabulary in the same insertion order.
    spam_values = np.array(list(spam_prob.values()))
    ham_values = np.array(list(ham_prob.values()))

    p_spam = np.log(prior_by_category[0])
    p_ham = np.log(prior_by_category[1])
    for i in range(len(x)):
        p_spam += x[i] * np.log(spam_values[i]) + (1 - x[i]) * np.log(1 - spam_values[i])
        p_ham += x[i] * np.log(ham_values[i]) + (1 - x[i]) * np.log(1 - ham_values[i])

    if p_spam > p_ham:
        classification = 'spam'
    else:
        classification = 'ham'

    posterior = [p_spam, p_ham]
    classify_result = (classification, posterior)
    return classify_result
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    # p_d = p(w_d | y_n=1)  (spam)
    # q_d = p(w_d | y_n=0)  (ham)
    # vocab and num are module-level globals: vocab maps words to counts, and
    # num is the sentinel key that pools all numeric tokens.
    vocab[num] = 0
    for file in file_lists_by_category[1]:
        for word in util.get_words_in_file(file):
            # This adds the new word to the vocab dict.
            if word.isnumeric():
                vocab[num] = 0
            else:
                vocab[word] = 0
    for file in file_lists_by_category[0]:
        for word in util.get_words_in_file(file):
            if word.isnumeric():
                vocab[num] = 0
            else:
                vocab[word] = 0

    # Add an entry to the vocab dict to account for any new words in the test
    # set. vocab now contains all the words from the training set, with every
    # value still 0.
    vocab["<unk>"] = 0
    D = len(vocab)

    # HAM distribution
    hamWords = vocab.copy()
    hamTotalWordCount = 0
    for file in file_lists_by_category[1]:
        for word in util.get_words_in_file(file):
            if word.isnumeric():
                hamWords[num] += 1
            else:
                hamWords[word] += 1
            hamTotalWordCount += 1
    # Smooth each element (Laplace smoothing over the vocabulary).
    for element in hamWords:
        old_val = hamWords[element]
        hamWords[element] = (old_val + 1) / (hamTotalWordCount + D)

    # SPAM distribution
    spamWords = vocab.copy()
    spamTotalWordCount = 0
    for file in file_lists_by_category[0]:
        for word in util.get_words_in_file(file):
            if word.isnumeric():
                spamWords[num] += 1
            else:
                spamWords[word] += 1
            spamTotalWordCount += 1
    # Smooth each element.
    for element in spamWords:
        old_val = spamWords[element]
        spamWords[element] = (old_val + 1) / (spamTotalWordCount + D)

    return spamWords, hamWords
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified
    log_probabilities_by_category : See output of learn_distributions
    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    words_in_file = set(util.get_words_in_file(email_filename))

    # Present words contribute their own log probabilities under each class.
    spam_prod = 0
    ham_prod = 0
    for word in words_in_file:
        spam_prod += log_probabilities_by_category[0][word]
        ham_prod += log_probabilities_by_category[1][word]

    # Absent words contribute log(1 - p) under each class.
    for word in log_probabilities_by_category[0]:
        if word not in words_in_file:
            spam_prod += np.log(1 - np.exp(log_probabilities_by_category[0][word]))
    for word in log_probabilities_by_category[1]:
        if word not in words_in_file:
            ham_prod += np.log(1 - np.exp(log_probabilities_by_category[1][word]))

    # Log-odds of spam versus ham.
    log_odds_num = log_prior_by_category[0] + spam_prod
    log_odds_denom = log_prior_by_category[1] + ham_prod
    log_odds = log_odds_num - log_odds_denom

    if log_odds >= 0:
        return 'spam'
    else:
        return 'ham'
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set

    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the
    smoothed estimates of q_d
    """
    spam_files = file_lists_by_category[0]
    ham_files = file_lists_by_category[1]

    ### W is the vocabulary, W = {w1, w2, ..., wd}; generate it by going
    ### through all the files.
    print("Generating vocabulary...")
    W = dict()
    for x in spam_files:
        for w in util.get_words_in_file(x):
            W[w] = 1
    for x in ham_files:
        for w in util.get_words_in_file(x):
            W[w] = 1

    ### Generate the p_d dict and q_d dict; perform Laplace smoothing.
    print("Generating posterior probabilities...")
    p_d = dict()
    q_d = dict()
    laplace_smooth_word_count_spam = util.get_total_word_count(spam_files) + len(W)
    laplace_smooth_word_count_ham = util.get_total_word_count(ham_files) + len(W)

    # Per-class word occurrence counts.
    wc_spam = dict()
    for f in spam_files:
        for w in util.get_words_in_file(f):
            wc_spam[w] = wc_spam.get(w, 0) + 1
    wc_ham = dict()
    for f in ham_files:
        for w in util.get_words_in_file(f):
            wc_ham[w] = wc_ham.get(w, 0) + 1

    for w in W:
        if w in wc_spam:
            p_d[w] = (wc_spam[w] + 1) / laplace_smooth_word_count_spam
        else:
            p_d[w] = 1 / laplace_smooth_word_count_spam
        if w in wc_ham:
            q_d[w] = (wc_ham[w] + 1) / laplace_smooth_word_count_ham
        else:
            q_d[w] = 1 / laplace_smooth_word_count_ham

    probabilities_by_category = [p_d, q_d]
    return probabilities_by_category
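
# Quick sanity check (made-up counts, not from the pset) of the multinomial
# smoothing above: with total word count T and vocabulary size |W|, the
# smoothed probabilities (c_w + 1) / (T + |W|) sum to 1 over the vocabulary
# when sum_w c_w = T.
_T, _W_size = 10, 4
_counts = {"a": 5, "b": 3, "c": 2, "d": 0}       # hypothetical per-word counts
_probs = {w: (c + 1) / (_T + _W_size) for w, c in _counts.items()}
assert abs(sum(_probs.values()) - 1.0) < 1e-12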