Example #1
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or 
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files the
    key occurred in.
    """
    word_counts = Counter()
    for filename in file_list:
        # Use a set so each word is counted at most once per file,
        # matching the "number of files" semantics in the docstring.
        for word in set(util.get_words_in_file(filename)):
            word_counts[word] += 1

    return word_counts
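A quick self-contained check of the per-file deduplication idea, using an in-memory stand-in for util.get_words_in_file (the fake_files dict below is hypothetical):

from collections import Counter

# Hypothetical in-memory "files" standing in for util.get_words_in_file().
fake_files = {
    "mail1.txt": ["buy", "now", "now"],
    "mail2.txt": ["buy", "later"],
}

def get_counts(file_list):
    counts = Counter()
    for f in file_list:
        for w in set(fake_files[f]):  # set(): count each word once per file
            counts[w] += 1
    return counts

print(get_counts(["mail1.txt", "mail2.txt"]))
# Counter({'buy': 2, 'now': 1, 'later': 1})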
Example #2
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or 
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files the
    key occurred in.
    """

    word_dict = Counter()

    for file in file_list:
        words = set(util.get_words_in_file(file))
        for item in words:
            word_dict[item] += 1

    return word_dict
Example #3
def classify_email(email_filename,
                   log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    
    pspam = log_prior_by_category[0]
    pYgivenSpam = log_probabilities_by_category[0]
    pham = log_prior_by_category[1]
    pYgivenHam = log_probabilities_by_category[1]
    words = util.get_words_in_file(email_filename)
    for word in words:
        # Skip words never seen in training (absent from both dictionaries).
        if word in pYgivenSpam:
            pspam += pYgivenSpam[word]
        if word in pYgivenHam:
            pham += pYgivenHam[word]
        
    if pspam >= pham:
        return 'spam'
        
    return 'ham'
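A self-contained mini-run of the same scoring rule, with hypothetical log-probabilities and .get() used to skip words unseen in training:

import math

log_priors = [math.log(0.5), math.log(0.5)]                    # [spam, ham]
log_probs = [{"free": math.log(0.20), "hi": math.log(0.05)},   # spam word log-probs
             {"free": math.log(0.02), "hi": math.log(0.10)}]   # ham word log-probs
email_words = ["free", "free", "hi"]

pspam, pham = log_priors
for w in email_words:
    pspam += log_probs[0].get(w, 0.0)  # .get(): ignore words unseen in training
    pham += log_probs[1].get(w, 0.0)
print("spam" if pspam >= pham else "ham")  # spam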
Example #4
def get_all_unique_words(files):
    ret = set()
    for f in files:
        words = util.get_words_in_file(f)
        ret = ret.union(set(words))
    return ret
Example #5
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """
    spamfiles = file_lists_by_category[0]
    hamfiles = file_lists_by_category[1]

    # Vocabulary: every word seen in any training file.
    vocabulary = set()
    for filename in spamfiles + hamfiles:
        vocabulary.update(util.get_words_in_file(filename))

    # get_counts returns, for each word, the number of files it occurred in
    # (a Counter, so missing words count as 0).
    spam_count = util.get_counts(spamfiles)
    ham_count = util.get_counts(hamfiles)

    # Laplace-smoothed per-file (Bernoulli) estimates:
    # (files containing the word + 1) / (files in the class + 2).
    n_spam = len(spamfiles)
    n_ham = len(hamfiles)
    dict_spam = {wi: (spam_count[wi] + 1) / (n_spam + 2) for wi in vocabulary}
    dict_ham = {wi: (ham_count[wi] + 1) / (n_ham + 2) for wi in vocabulary}

    probabilities_by_category = (dict_spam, dict_ham)

    return probabilities_by_category
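A worked instance of the smoothed estimate above, assuming the per-file (Bernoulli) counting convention:

# 3 spam files, and the word occurs in 2 of them:
p_d = (2 + 1) / (3 + 2)
print(p_d)  # 0.6 — never exactly 0 or 1, so log(p_d) and log(1 - p_d) stay finite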
Example #6
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    word_list = set(util.get_words_in_file(email_filename))
    len0 = len(log_probabilities_by_category[0].keys())
    len1 = len(log_probabilities_by_category[1].keys())

    P_spam = 0
    P_ham = 0

    for word in np.unique( list(log_probabilities_by_category[0].keys()) + \
        list(log_probabilities_by_category[1].keys()) ):

        if word in word_list:
            if word in log_probabilities_by_category[0].keys():
                P_spam += log_probabilities_by_category[0][word]
            else:
                P_spam += np.log(1 / (len0 + 2))
            if word in log_probabilities_by_category[1].keys():
                P_ham += log_probabilities_by_category[1][word]
            else:
                P_ham += np.log(1 / (len1 + 2))
        else:
            if word in log_probabilities_by_category[0].keys():
                P_spam += np.log(
                    1 - np.exp(log_probabilities_by_category[0][word]))
            else:
                P_spam += np.log(1 - 1 / (len0 + 2))
            if word in log_probabilities_by_category[1].keys():
                P_ham += np.log(1 -
                                np.exp(log_probabilities_by_category[1][word]))
            else:
                P_ham += np.log(1 - 1 / (len1 + 2))

    P_spam += log_prior_by_category[0]
    P_ham += log_prior_by_category[1]

    answer = {True: 'spam', False: 'ham'}
    return answer[P_spam >= P_ham]
Example #7
def classify_new_email(filename,
                       probabilities_by_category,
                       prior_by_category,
                       decisionFactor=1):
    """
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the 
    parameter in the prior class distribution
    The optional decisionFactor is 1 by default; because log-probabilities are
    negative, a decisionFactor < 1 biases the decision toward HAM.

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose value
    is either 'spam' or 'ham' depending on the classification result, and the 
    second element is a two-element list as [log p(y=1|x), log p(y=0|x)], 
    representing the log posterior probabilities
    """
    # indexing is confusing, so I give them variable name to be more intuitive
    spamDistribution, hamDistribution = probabilities_by_category

    # build the feature vector for the mail first (vocab and num are
    # module-level names defined outside this snippet: the training
    # vocabulary dict and the shared key used for numeric tokens)
    mailFeatureVec = vocab.copy()
    for word in util.get_words_in_file(filename):
        # word is a regular word
        if word in mailFeatureVec:
            mailFeatureVec[word] += 1
        # word represents a numeric value
        elif word.isnumeric():
            mailFeatureVec[num] += 1
        # word is not recognized
        else:
            mailFeatureVec["<unk>"] += 1

    spamProb = 0
    hamProb = 0
    for word in mailFeatureVec:
        # accumulate log P(x | y=spam)
        spamProb += mailFeatureVec[word] * math.log(spamDistribution[word])
        # accumulate log P(x | y=ham)
        hamProb += mailFeatureVec[word] * math.log(hamDistribution[word])

    # multiply by prior distribution
    spamProb += math.log(prior_by_category[0])
    hamProb += math.log(prior_by_category[1])
    hamProb *= decisionFactor
    result = "ham" if (hamProb > spamProb) else "spam"
    return (result, (spamProb, hamProb))  #classify_result
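An alternative to scaling a negative log-probability is shifting the log-odds threshold, which biases the decision in a direction that does not depend on the sign of the scores; a minimal sketch with hypothetical numbers:

import math

def decide(log_spam, log_ham, threshold=1.0):
    # threshold > 1 demands stronger evidence before declaring spam
    return "spam" if (log_spam - log_ham) >= math.log(threshold) else "ham"

print(decide(-100.0, -103.0))                  # spam
print(decide(-100.0, -103.0, threshold=50.0))  # ham: log(50) ~ 3.9 exceeds the gap of 3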
Example #8
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    email_words = set(util.get_words_in_file(email_filename))
    test_words = set()

    for category_dict in log_probabilities_by_category:
        for word in category_dict:
            test_words.add(word)

    spam = log_prior_by_category[0]
    ham = log_prior_by_category[1]

    #print("spam prior:" + str(spam))
    #print("ham prior:" + str(ham) + "\n")

    spam_data = log_probabilities_by_category[0]
    ham_data = log_probabilities_by_category[1]

    for word in test_words:
        if word in email_words:
            spam += spam_data[word]
            ham += ham_data[word]
        else:
            spam += log(1 - exp(spam_data[word]))
            ham += log(1 - exp(ham_data[word]))

    #print("spam:" + str(spam))
    #print("ham:" + str(ham) + "\n\n\n")

    if spam > ham:
        label = "spam"
    else:
        label = "ham"

    return label
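A small self-contained run of this present/absent scoring, with hypothetical log-probabilities over a shared two-word vocabulary:

import math

spam_data = {"free": math.log(0.6), "hi": math.log(0.1)}
ham_data = {"free": math.log(0.1), "hi": math.log(0.5)}
email_words = {"free"}

spam = ham = math.log(0.5)  # equal log priors
for word in spam_data:      # assumes both dicts share one vocabulary
    if word in email_words:
        spam += spam_data[word]
        ham += ham_data[word]
    else:
        spam += math.log(1 - math.exp(spam_data[word]))
        ham += math.log(1 - math.exp(ham_data[word]))
print("spam" if spam > ham else "ham")  # spam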
Example #9
def train_logistic(file_lists_by_category):
    """
    Extract features and labels for the logistic regression model, and train the model.

    Note that you'll need to arbitrarily pick one of spam and ham to be 0, and the other to be 1, 
    when creating the labels for logistic regression. 
    The choice doesn't matter; just make sure you are consistent about it. 

    Inputs
    ------
    A two-element list. The first element is a list of spam files, 
    and the second element is a list of ham (non-spam) files.

    Output
    ------
    Tuple of (theta, all_words) where theta is the vector of trained logistic regression parameters, 
    and all_words is the list of all words found in the dataset (reused later to make
    sure we extract features in a consistent manner)
    """
    # Build the set of all words.
    all_words = set()
    for filelist in file_lists_by_category:
        for f in filelist:
            words_in_f = util.get_words_in_file(f)
            all_words.update(words_in_f)
    all_words = list(all_words)

    num_spam = len(file_lists_by_category[0])
    num_ham = len(file_lists_by_category[1])

    num_examples = (num_spam + num_ham)
    num_features = len(all_words)

    # Allocate data containers.
    y = np.zeros((num_examples, num_features))

    # Let label spam = 1.
    c = np.zeros(num_examples)
    c[:num_spam] = 1.0

    # Get features for each file.
    j = 0  # Counter.
    for filelist in file_lists_by_category:
        for f in filelist:
            y[j, :] = extract_features(f, all_words)
            j += 1

    # Optimize parameters.
    theta = optimize_theta(y, c)  # learning_rate=4.0, convergence_threshold=1e-6
    return theta, all_words
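optimize_theta is not shown in these examples; a minimal gradient-ascent sketch of what it might look like for the logistic regression log-likelihood (the name, signature, and hyperparameters are assumptions):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def optimize_theta(y, c, learning_rate=0.1, n_iters=1000):
    # y: (num_examples, num_features) binary features; c: (num_examples,) 0/1 labels
    theta = np.zeros(y.shape[1])
    for _ in range(n_iters):
        preds = sigmoid(y @ theta)
        grad = y.T @ (c - preds)  # gradient of the Bernoulli log-likelihood
        theta += learning_rate * grad / len(c)
    return theta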
Example #10
def classify_new_email(filename, probabilities_by_category, prior_by_category, adjustment):
    """
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    
    probabilities_by_category: output of function learn_distributions
    
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the 
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose value
    is either 'spam' or 'ham' depending on the classification result, and the 
    second element is a two-element list as [log p(y=1|x), log p(y=0|x)], 
    representing the log posterior probabilities
    """
    # Log priors (initial scores for each class)
    MAP_spam = math.log(prior_by_category[0])
    MAP_ham = math.log(prior_by_category[1])

    all_words = set(util.get_words_in_file(filename))
    vocab = list(probabilities_by_category[0].keys())

    # Calculate for each subsequent word
    for word in vocab:
        if word in all_words:
            MAP_spam += math.log(probabilities_by_category[0][word])
            MAP_ham += math.log(probabilities_by_category[1][word])

        else:
            MAP_spam += math.log(1-probabilities_by_category[0][word])
            MAP_ham += math.log(1-probabilities_by_category[1][word])

    # Check for the result
    if MAP_spam > adjustment * MAP_ham:
        result = 'spam'
    else:
        result = 'ham'

    classify_result = (result, [MAP_spam, MAP_ham])

    return classify_result
Example #11
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    prob_spam = log_prior_by_category[0]
    prob_ham = log_prior_by_category[1]
    spam_probs = log_probabilities_by_category[0]
    ham_probs = log_probabilities_by_category[1]

    spam_probs_cond = 0
    ham_probs_cond = 0

    # For each word in the email, add the log-probability of it appearing in spam or ham.
    # If the word was not in the training dictionary, fall back to log(1 / (vocabulary size + 2)).
    email_words = util.get_words_in_file(email_filename)
    for word in email_words:
        if word not in spam_probs.keys():
            spam_probs_cond += -np.log(len(spam_probs.keys()) + 2)
        else:
            spam_probs_cond += spam_probs[word]
        if word not in ham_probs.keys():
            ham_probs_cond += -np.log(len(ham_probs.keys()) + 2)
        else:
            ham_probs_cond += ham_probs[word]

    # Normalize with Bayes' rule in log space: subtract the logsumexp of the
    # joint scores to get the log posteriors.
    joint = [prob_spam + spam_probs_cond, prob_ham + ham_probs_cond]
    normalizer = logsumexp(joint)
    log_post_spam = joint[0] - normalizer
    log_post_ham = joint[1] - normalizer

    # Return whichever label has the higher log posterior.
    if log_post_spam >= log_post_ham:
        return 'spam'
    else:
        return 'ham'
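The normalization step in isolation, with hypothetical log joint scores; logsumexp keeps the subtraction stable even when both scores are hugely negative:

import numpy as np
from scipy.special import logsumexp

log_joint = np.array([-1050.0, -1047.0])  # hypothetical [spam, ham] scores
log_post = log_joint - logsumexp(log_joint)
print(np.exp(log_post))  # [0.0474... 0.9525...] — a proper distribution, no underflow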
Example #12
def classify_message(message_filename,
                     log_probabilities_by_category,
                     log_prior_by_category,
                     names = ['spam', 'ham']):
    """
    Uses Naive Bayes classification to classify the message in the given file.

    Inputs
    ------
    message_filename : name of the file containing the message to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    names : labels for each class (for this problem set, will always be just 
            spam and ham).

    Output
    ------
    One of the labels in names.
    """

    message_words = util.get_words_in_file(message_filename)
    N_categories = len(log_probabilities_by_category)

    log_likelihoods = []
    for i in range(N_categories):
        total = 0
        # Copy so entries can be removed as they are consumed.
        all_word_log_probs = log_probabilities_by_category[i].copy()

        # Words present in the message contribute log p; pop them so the
        # remaining vocabulary contributes log(1 - p) below.
        for w in set(message_words):
            if w in all_word_log_probs:
                total += all_word_log_probs.pop(w)

        remaining = np.array(list(all_word_log_probs.values()))
        total += np.sum(np.log(1 - np.exp(remaining)))

        log_likelihoods.append(total)

    posterior = np.array(log_likelihoods) + np.array(log_prior_by_category)
    winner = np.argmax(posterior)
    return names[winner]
Example #13
def classify_new_email(filename,
                       probabilities_by_category,
                       prior_by_category,
                       eps=1):
    """
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    
    probabilities_by_category: output of function learn_distributions
    
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the 
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose value
    is either 'spam' or 'ham' depending on the classification result, and the 
    second element is a two-element list as [log p(y=1|x), log p(y=0|x)], 
    representing the log posterior probabilities
    """
    # eps is our decision adjustment
    # our log priors
    Pspam = np.log(prior_by_category[0])
    Pham = np.log(prior_by_category[1])

    #all our vocabulary
    vocab = probabilities_by_category[0].keys()

    # words in our email (a set, for fast membership tests)
    words = set(util.get_words_in_file(filename))

    for word in vocab:
        #depending on if word is in the email or not, add the log probability of it being in or out
        if word in words:
            Pspam += np.log(probabilities_by_category[0][word])
            Pham += np.log(probabilities_by_category[1][word])
        else:
            Pspam += np.log(1 - probabilities_by_category[0][word])
            Pham += np.log(1 - probabilities_by_category[1][word])
    if Pspam > eps * Pham:
        classify_result = ('spam', [Pspam, Pham])
    else:
        classify_result = ('ham', [Pspam, Pham])
    return classify_result
Example #14
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """

    # labels used for the return value: 0 is spam, 1 is ham
    labels = {0: 'spam', 1: 'ham'}

    # get the unique occurrences of words in the email
    new_words = set(util.get_words_in_file(email_filename))

    all_words = []
    for i in range(len(labels)):
        all_words += log_probabilities_by_category[i].keys()
    all_words = list(set(all_words))

    # calculate the posterior distribution for spam and ham
    posterior_dist = []
    for label in labels:
        posterior_dist.append(log_prior_by_category[label])
        for word in all_words:
            #p_i or q_i
            if word in new_words:
                posterior_dist[label] += log_probabilities_by_category[label][
                    word]
            #1-p_i or q_i
            else:
                posterior_dist[label] += util.careful_log(
                    1 - np.exp(log_probabilities_by_category[label][word]))

    # return the MAP label from the log posteriors
    best = np.argmax(posterior_dist)
    return labels[best]
Example #16
def get_counts(file_list):
    '''
    Computes counts for each word that occurs in the files in file_list.
    :param file_list: a list of filenames
    :return: A dictionary whose keys are words, and whose values are the number of files the
    key occurred in.
    '''

    dict_of_words = util.Counter()
    for file in file_list:
        words_in_file = util.get_words_in_file(file)

        #for each unique word in a file, increment the count
        for word in set(words_in_file):
            dict_of_words[word] += 1

    return dict_of_words
Example #17
def classify_email(email_filename,
                   log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    Either "spam" or "ham"
    """

    email_words = set(util.get_words_in_file(email_filename))
    N_categories  = len(log_probabilities_by_category)

    # get the union of all words encountered during training
    all_words = []
    for i in range(N_categories):
        all_words += log_probabilities_by_category[i].keys()
    all_words = list(set(all_words))

    log_likelihoods = []
    for i in range(N_categories):
        total = 0
        all_word_log_probs = log_probabilities_by_category[i]
        for w in all_words:
            log_prob = all_word_log_probs[w]
            if w in email_words:
                total += log_prob
            else:
                total += np.log(1 - np.exp(log_prob))
        log_likelihoods.append(total)
    posterior = np.array(log_likelihoods) + np.array(log_prior_by_category)
    winner = np.argmax(posterior)
    if winner == 0:
        return "spam"
    else:
        return "ham"
Example #18
def classify_message(message_filename,
                     log_probabilities_by_category,
                     log_prior_by_category,
                     names=['spam', 'ham']):
    """
    Uses Naive Bayes classification to classify the message in the given file.

    Inputs
    ------
    message_filename : name of the file containing the message to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    names : labels for each class (for this problem set, will always be just 
            spam and ham).

    Output
    ------
    One of the labels in names.
    """
    words = util.get_words_in_file(message_filename)
    words_set = set(words)  # Faster lookup.

    # Start with just the prior probability of 'spam' and 'ham', respectively.
    posteriors = np.array(log_prior_by_category)

    # Feature space is all the words from spam and ham.
    all_words = set(log_probabilities_by_category[0].keys())
    all_words.update(list(log_probabilities_by_category[1].keys()))

    # Sum log-probabilities of generating each observed word given the label.
    for w in all_words:
        # Each entry holds a pair [log P(absent | label), log P(present | label)].
        has_word = 1 if w in words_set else 0
        posteriors[0] += log_probabilities_by_category[0][w][has_word]
        posteriors[1] += log_probabilities_by_category[1][w][has_word]

    if posteriors[0] >= posteriors[1]:
        return names[0]
    else:
        return names[1]
Example #19
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or 
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files the
    key occurred in.
    """
    ctr = util.Counter()
    for f in file_list:
        for w in set(util.get_words_in_file(f)):
            ctr[w] += 1
    return ctr
Example #20
def classify_message(message_filename,
                     log_probabilities_by_category,
                     log_prior_by_category,
                     names=['spam', 'ham']):
    '''
    classify the message in the given file using learned parameters

    :param message_filename: name of the file containing the message to be classified
    :param log_probabilities_by_category:
    :param log_prior_by_category:
    :param names: class labels
    :return: the predicted class
    '''

    try:
        words_in_file = set(util.get_words_in_file(message_filename))
    except Exception:
        return "file cannot be decoded"
    num_of_categories = len(log_prior_by_category)

    # make list of all the words seen in training
    all_words_from_training = []
    for i in range(num_of_categories):
        all_words_from_training += log_probabilities_by_category[i].keys()
    all_words_from_training = list(set(all_words_from_training))

    log_likelihoods = []
    for i in range(num_of_categories):
        total = 0
        all_word_log_probs = log_probabilities_by_category[i]

        for word in all_words_from_training:
            log_prob = all_word_log_probs[word]
            is_in_file = (word in words_in_file)
            total += is_in_file * log_prob + (
                1 - is_in_file) * np.log(1 - np.exp(log_prob))
        log_likelihoods.append(total)

    posterior = np.array(log_likelihoods) + np.array(log_prior_by_category)
    predicted_category = np.argmax(posterior)

    return names[predicted_category]
Example #21
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or 
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files the
    key occurred in.
    """
    counts = util.Counter()
    for f in file_list:
        words = util.get_words_in_file(f)
        for w in set(words):
            counts[w] += 1
    return counts
Example #22
def classify_new_email(filename,
                       probabilities_by_category,
                       prior_by_category,
                       b=0):
    """
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the 
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose value
    is either 'spam' or 'ham' depending on the classification result, and the 
    second element is a two-element list as [log p(y=1|x), log p(y=0|x)], 
    representing the log posterior probabilities
    """
    # The two classes are handled the same way.
    log_probabilities = [0, 0]
    x = util.get_words_in_file(filename)
    word_counts = Counter(x)  # tally each word once instead of rescanning the list
    for i in range(2):
        for word in probabilities_by_category[i]:
            x_d = word_counts[word]
            log_probabilities[i] += x_d * np.log(
                probabilities_by_category[i][word])
        # Add the log prior; with equal priors this does not change the argmax.
        log_probabilities[i] += np.log(prior_by_category[i])

    if log_probabilities[0] + b >= log_probabilities[1]:
        classify_result = ('spam', log_probabilities)
    else:
        classify_result = ('ham', log_probabilities)

    return classify_result
Example #23
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or 
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files the
    key occurred in.
    """
    res = {}
    for f in file_list:
        # Deduplicate per file so the value is the number of files, not occurrences.
        for w in set(util.get_words_in_file(f)):
            if w not in res:
                res[w] = 1
            else:
                res[w] += 1
    return res
Example #24
def extract_features(f, all_words):
    """
    Extract features from file for logistic regression. 

    Inputs
    ------
    f: Name of file to extract features from.

    all_words : List of all words in the training set of files. 

    Output
    ------
    Extracted features. 
    """
    words_in_f = set(util.get_words_in_file(f))
    features = np.zeros(len(all_words))

    # Set entry i to 1 to mark the presence of word i (entries default to 0).
    for i, w in enumerate(all_words):
        if w in words_in_f:
            features[i] = 1.0

    return features
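A hypothetical end-to-end use of these features with a trained parameter vector (the numbers below are made up):

import numpy as np

features = np.array([1.0, 0.0, 1.0])  # what extract_features might return
theta = np.array([2.1, 0.7, -1.5])    # hypothetical trained parameters
p_spam = 1.0 / (1.0 + np.exp(-features @ theta))
print(p_spam > 0.5)  # True — labeled spam under the spam = 1 convention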
Example #25
def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files the
    key occurred in.
    """
    counter = Counter()

    for filename in file_list:
        # make sure multiple occurrences of a word per email are ignored
        word_set = set(util.get_words_in_file(filename))

        for word in word_set:
            counter[word] += 1

    return counter
Example #26
def classify_new_email(filename,probabilities_by_category,prior_by_category):
    """
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    
    probabilities_by_category: output of function learn_distributions
    
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the 
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose value
    is either 'spam' or 'ham' depending on the classification result, and the 
    second element is a two-element list as [log p(y=1|x), log p(y=0|x)], 
    representing the log posterior probabilities
    """
    prior_0 = prior_by_category[0]
    prior_1 = prior_by_category[1]

    words = util.get_words_in_file(filename)

    p_xy0 = get_log_pxy(probabilities_by_category, 0, words) + np.log(prior_0)
    p_xy1 = get_log_pxy(probabilities_by_category, 1, words) + np.log(prior_1)

    # p_xy0 is the spam log posterior (uses p_d); p_xy1 the ham log posterior (uses q_d)
    res = 'spam' if p_xy0 - p_xy1 > 0 else 'ham'
    classify_result = (res, (p_xy0, p_xy1))

    return classify_result
Example #27
def classify_new_email(filename, probabilities_by_category, prior_by_category,
                       zeta):
    """
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    probabilities_by_category: output of function learn_distributions
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the 
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose value
    is either 'spam' or 'ham' depending on the classification result, and the 
    second element is a two-element list as [log p(y=1|x), log p(y=0|x)], 
    representing the log posterior probabilities
    """
    ### Construct the feature vector x
    x = dict()
    words = util.get_words_in_file(filename)
    for w in words:
        if w in x:
            x[w] += 1
        else:
            x[w] = 1

    ### Calculate log10 of multinomial_coef = [(x1+x2+...+xd)!]/[(x1!)(x2!)...(xd!)]
    ### via lgamma, since factorials of large word counts are enormous. (The
    ### coefficient is the same for both classes, so it cancels in the comparison.)
    n_total = sum(x.values())
    log_multinomial_coef = math.lgamma(n_total + 1) / math.log(10)
    for w in x:
        log_multinomial_coef -= math.lgamma(x[w] + 1) / math.log(10)

    ### Posterior of being spam: log[ p(y=1|x) ]= log[ p(x|y=1)*p(y=1) ]
    p_d = probabilities_by_category[0]
    log_of_product_spam = 0

    for w in p_d:
        if w in x:
            log_of_product_spam += (x[w] * math.log10(p_d[w]))

    p_y1 = log_of_product_spam + log_multinomial_coef + math.log10(
        prior_by_category[0])

    ### Posterior of being ham: log[ p(y=0|x) ] = log[ p(x|y=0)*p(y=0) ]
    q_d = probabilities_by_category[1]
    log_of_product_ham = 0

    for w in q_d:
        if w in x:
            log_of_product_ham += (x[w] * math.log10(q_d[w]))

    p_y0 = log_of_product_ham + log_multinomial_coef + math.log10(
        prior_by_category[1])

    if (p_y1 >= zeta * p_y0):
        classify_result = ("spam", [p_y1, p_y0])
    else:
        classify_result = ("ham", [p_y1, p_y0])

    return classify_result
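On the factorials above: for long emails math.factorial builds enormous exact integers, while math.lgamma returns log(n!) directly in floating point; a small check where the exact value is known:

import math

print(math.lgamma(5 + 1) / math.log(10))  # log10(5!) = log10(120) ~ 2.0792
print(math.log10(math.factorial(5)))      # same value, via the exact integer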
Example #28
def classify_email(email_filename,
                   log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    email_dict = set(util.get_words_in_file(email_filename))

    spam_prob = 0
    ham_prob = 0
    spam_comp = 0
    ham_comp = 0

    # Assumes both categories share the same training vocabulary.
    all_words = set(log_probabilities_by_category[0].keys())
    all_words.update(log_probabilities_by_category[1].keys())

    for word in all_words:
        if word in email_dict:
            spam_prob += log_probabilities_by_category[0][word]
            ham_prob += log_probabilities_by_category[1][word]
        else:
            spam_num = np.exp(log_probabilities_by_category[0][word])
            spam_comp += np.log(1 - spam_num)
            ham_num = np.exp(log_probabilities_by_category[1][word])
            ham_comp += np.log(1 - ham_num)

    # Include both the present-word and absent-word contributions.
    log_spam_prob = log_prior_by_category[0] + spam_prob + spam_comp
    log_ham_prob = log_prior_by_category[1] + ham_prob + ham_comp

    if log_spam_prob - log_ham_prob > 0:
        return 'spam'
    else:
        return 'ham'
Example #29
def classify_new_email(filename,probabilities_by_category,prior_by_category):
    """
    Use Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    filename: name of the file to be classified
    
    probabilities_by_category: output of function learn_distributions
    
    prior_by_category: A two-element list as [\pi, 1-\pi], where \pi is the 
    parameter in the prior class distribution

    Output
    ------
    classify_result: A two-element tuple. The first element is a string whose value
    is either 'spam' or 'ham' depending on the classification result, and the 
    second element is a two-element list as [log p(y=1|x), log p(y=0|x)], 
    representing the log posterior probabilities
    """
    spam_prob = probabilities_by_category[0]
    ham_prob = probabilities_by_category[1]
    w = list(spam_prob.keys())
    words = set(util.get_words_in_file(filename))

    # Binary feature vector: x[i] = 1 if vocabulary word i appears in the email.
    x = np.zeros(len(w))
    for itr, wi in enumerate(w):
        if wi in words:
            x[itr] = 1

    # dict preserves insertion order, so values() lines up with keys() above.
    spam_values = np.array(list(spam_prob.values()))
    ham_values = np.array(list(ham_prob.values()))

    # Log posterior: log prior plus the Bernoulli log-likelihood of each
    # word's presence or absence.
    p_spam = np.log(prior_by_category[0]) + np.sum(
        x * np.log(spam_values) + (1 - x) * np.log(1 - spam_values))
    p_ham = np.log(prior_by_category[1]) + np.sum(
        x * np.log(ham_values) + (1 - x) * np.log(1 - ham_values))

    if p_spam > p_ham:
        classification = 'spam'
    else:
        classification = 'ham'

    posterior = [p_spam, p_ham]
    classify_result = (classification, posterior)

    return classify_result
Example #30
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """

    # p_d = p(w_d | y_n=1) spam
    # q_d = p(w_d | y_n=0) ham

    # vocab and num are module-level names defined outside this snippet:
    # the shared vocabulary dict and the key that buckets numeric tokens.
    vocab[num] = 0

    for file in file_lists_by_category[1]:
        for word in util.get_words_in_file(file):
            # this adds the new word to the vocab dict
            if word.isnumeric():
                vocab[num] = 0
            else:
                vocab[word] = 0

    for file in file_lists_by_category[0]:
        for word in util.get_words_in_file(file):
            # this adds the new word to the vocab dict
            if word.isnumeric():
                vocab[num] = 0
            else:
                vocab[word] = 0

    # add an entry to vocab dict to account for any new words from test set
    vocab["<unk>"] = 0

    # vocab contains all the words from the training set
    # but all the values are 0
    D = len(vocab)

    # HAM distribution
    hamWords = vocab.copy()
    hamTotalWordCount = 0
    for file in file_lists_by_category[1]:
        for word in util.get_words_in_file(file):
            if word.isnumeric(): hamWords[num] += 1
            else: hamWords[word] += 1
            hamTotalWordCount += 1

    # smooth each element (Laplace)
    for element in hamWords:
        old_val = hamWords[element]
        hamWords[element] = (old_val + 1) / (hamTotalWordCount + D)

    # SPAM distribution #
    spamWords = vocab.copy()
    spamTotalWordCount = 0
    for file in file_lists_by_category[0]:
        for word in util.get_words_in_file(file):
            if word.isnumeric(): spamWords[num] += 1
            else: spamWords[word] += 1
            spamTotalWordCount += 1

    # smooth each element (Laplace)
    for element in spamWords:
        old_val = spamWords[element]
        spamWords[element] = (old_val + 1) / (spamTotalWordCount + D)

    return spamWords, hamWords
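The out-of-vocabulary scheme assumed above, in isolation: numeric tokens collapse into one bucket (the module-level num key) and unknown words into "<unk>" (the helper below is hypothetical):

def bucket(word, vocab, num="<num>"):
    # num stands in for the module-level key used for numeric tokens
    if word.isnumeric():
        return num
    return word if word in vocab else "<unk>"

vocab = {"hello": 0, "<num>": 0, "<unk>": 0}
print([bucket(w, vocab) for w in ["hello", "42", "zzz"]])
# ['hello', '<num>', '<unk>']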
Example #31
def classify_email(email_filename, log_probabilities_by_category,
                   log_prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    """
    words_in_file = set(util.get_words_in_file(email_filename))

    spam_prod = 0
    ham_prod = 0

    # Words in the email contribute log p_d / log q_d (skip words unseen in training).
    for word in words_in_file:
        if word in log_probabilities_by_category[0]:
            spam_prod += log_probabilities_by_category[0][word]
        if word in log_probabilities_by_category[1]:
            ham_prod += log_probabilities_by_category[1][word]

    # Vocabulary words absent from the email contribute log(1 - p_d) / log(1 - q_d).
    for word in log_probabilities_by_category[0]:
        if word not in words_in_file:
            spam_prod += np.log(1 - np.exp(log_probabilities_by_category[0][word]))
    for word in log_probabilities_by_category[1]:
        if word not in words_in_file:
            ham_prod += np.log(1 - np.exp(log_probabilities_by_category[1][word]))

    log_odds = (log_prior_by_category[0] + spam_prod) - (log_prior_by_category[1] + ham_prod)

    if log_odds >= 0:
        return 'spam'
    else:
        return 'ham'
Example #32
def learn_distributions(file_lists_by_category):
    """
    Estimate the parameters p_d, and q_d from the training set
    
    Input
    -----
    file_lists_by_category: A two-element list. The first element is a list of 
    spam files, and the second element is a list of ham files.

    Output
    ------
    probabilities_by_category: A two-element tuple. The first element is a dict 
    whose keys are words, and whose values are the smoothed estimates of p_d;
    the second element is a dict whose keys are words, and whose values are the 
    smoothed estimates of q_d 
    """
    spam_files = file_lists_by_category[0]
    ham_files = file_lists_by_category[1]

    ### W is the vocabulary, W = {w1, w2, ..., wd}, generate this by going through all the files
    print("Generating vocabulary...")
    W = dict()

    for x in spam_files:
        words = util.get_words_in_file(x)
        for w in words:
            W[w] = 1

    for x in ham_files:
        words = util.get_words_in_file(x)
        for w in words:
            W[w] = 1

    ### generate p_d dict and q_d dict, perform laplace smoothing
    print("Generating posterior probabilities...")
    p_d = dict()
    q_d = dict()
    laplace_smooth_word_count_spam = util.get_total_word_count(spam_files) + len(W)
    laplace_smooth_word_count_ham = util.get_total_word_count(ham_files) + len(W)

    wc_spam = dict()
    for f in spam_files:
        words = util.get_words_in_file(f)
        for w in words:
            if w in wc_spam:
                wc_spam[w] += 1
            else:
                wc_spam[w] = 1

    wc_ham = dict()
    for f in ham_files:
        words = util.get_words_in_file(f)
        for w in words:
            if w in wc_ham:
                wc_ham[w] += 1
            else:
                wc_ham[w] = 1

    for w in W:
        if w in wc_spam:
            p_d[w] = (wc_spam[w] + 1) / (laplace_smooth_word_count_spam)
        else:
            p_d[w] = 1 / laplace_smooth_word_count_spam

        if w in wc_ham:
            q_d[w] = (wc_ham[w] + 1) / (laplace_smooth_word_count_ham)
        else:
            q_d[w] = 1 / laplace_smooth_word_count_ham

    probabilities_by_category = (p_d, q_d)

    return probabilities_by_category
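A worked instance of the smoothed multinomial estimate above, with made-up counts:

# One word occurs 4 times across all spam files; the spam files hold 95 word
# tokens in total and the vocabulary W has 5 entries:
p_d = (4 + 1) / (95 + 5)
print(p_d)  # 0.05 — and a word never seen in spam still gets 1/100, not 0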