def main(data_file, vocab_path):
    """Load Federalist Papers features and author labels.

    Parses the essays from ``data_file``, loads the function-word
    vocabulary from ``vocab_path``, builds the feature matrix ``X`` and the
    numeric label vector ``y``, then prints the shape and dtype of ``X``.

    Args:
        data_file: passed to ``parse_federalist_papers``.
        vocab_path: passed to ``load_function_words``.
    """
    # The vocabulary only needs to be loaded once (it was loaded twice before).
    function_words = load_function_words(vocab_path)
    authors, essays, essay_ids = parse_federalist_papers(data_file)

    # load the attributed essays into a feature matrix
    X = load_features(essays, function_words)

    # TODO: load the author names into a vector y, mapped to 0 and 1,
    # using functions from util.py
    labels_map = labels_to_key(authors)
    y = np.asarray(labels_to_y(authors, labels_map))

    print(f"Numpy array has shape {X.shape} and dtype {X.dtype}")
def main(data_file, vocab_path):
    """Train and score Naive Bayes classifiers on the Federalist Papers.

    Builds a feature matrix and a numeric label vector, splits them into
    train and test parts, then prints the accuracy of a multinomial NB
    model, a Bernoulli NB model, and a zero-rule baseline.

    Args:
        data_file: passed to ``parse_federalist_papers``.
        vocab_path: passed to ``load_function_words``.
    """
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    vocabulary = load_function_words(vocab_path)

    # Feature matrix for the attributed essays.
    X = load_features(essays, vocabulary)

    # The label mapping is printed so the author <-> class id pairing can be
    # tracked by eye.
    labels_map = labels_to_key(authors)
    print(labels_map)

    # y is the gold standard (one numeric label per essay); it is used for
    # both training and evaluation.
    y = np.asarray(labels_to_y(authors, labels_map))
    print(f"Numpy array has shape {X.shape} and dtype {X.dtype}")

    # Shuffle/split is delegated to split_data.
    train, test = split_data(X, y, 0.25)
    train_features, train_y = train[0], train[1]
    test_features, test_y = test[0], test[1]

    # NOTE (from the original author): the rows of X and the length of y may
    # differ, so the models are fitted on the labels carried inside the
    # split rather than on y directly.
    multinomial_model = MultinomialNB()
    multinomial_model.fit(train_features, train_y)
    multinomial_accuracy = calculate_accuracy(
        multinomial_model.predict(test_features), test_y
    )
    print(f" the accuracy for multinomial NB model is {multinomial_accuracy}")

    bernoulli_model = BernoulliNB()
    bernoulli_model.fit(train_features, train_y)
    bernoulli_accuracy = calculate_accuracy(
        bernoulli_model.predict(test_features), test_y
    )
    print(f" the accuracy for Bernoulli NB model is {bernoulli_accuracy}")

    # Zero-rule baseline: learn the most frequent class on train, apply it
    # to test.
    most_frequent_class = find_zero_rule_class(train_y)
    print(f"the most frequent class is {most_frequent_class}")
    baseline_predictions = apply_zero_rule(test_features, most_frequent_class)
    baseline_accuracy = calculate_accuracy(baseline_predictions, test_y)
    print(f" the accuracy for the baseline is {baseline_accuracy}")
def main(data_file, vocab_path):
    """Extract function word features from a text file.

    Builds a (review x function word) count matrix, prints summary
    statistics about it (most common word, unseen words, binary / normalised
    / frequency-filtered variants), then splits the data and checks the
    split.

    Args:
        data_file: passed to ``load_reviews``.
        vocab_path: passed to ``load_function_words``.
    """
    # load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    # Map each word to its column once instead of calling list.index() for
    # every token. setdefault keeps the FIRST position of a duplicated word,
    # which is what list.index() returned.
    column_of = {}
    for col, word in enumerate(function_words):
        column_of.setdefault(word, col)

    # row is which review, column is which word.
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    review_features = np.zeros((len(ids), len(function_words)), dtype=int)
    for row, review in enumerate(reviews):
        # tokenize and lowercase all the words
        for token in word_tokenize(review.lower()):
            col = column_of.get(token)
            if col is not None:
                review_features[row, col] += 1

    print(f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}")

    # Total count of each function word across all reviews (column sums).
    words_count = review_features.sum(axis=0)

    # argmax returns the first maximal column, like list.index(max(...)).
    most_common_word_index = int(words_count.argmax())
    most_common_count = words_count[most_common_word_index]
    most_common_word = function_words[most_common_word_index]
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    # Features that never occur in the data (columns that sum to 0).
    zero_inds = [ind for ind, count in enumerate(words_count) if count == 0]
    if zero_inds:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f" {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # Binary feature matrix: 1 where the word occurs at all, else 0.
    word_binary = (review_features > 0).astype(review_features.dtype)
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    # Normalise each row by its total function-word count. Rows without any
    # function word are left as all zeros instead of dividing by zero.
    row_totals = review_features.sum(axis=1, keepdims=True)
    norm_reviews = np.divide(
        review_features,
        row_totals,
        out=np.zeros(review_features.shape, dtype=float),
        where=row_totals != 0,
    )
    # round the decimals
    norm_reviews_sum = round(norm_reviews.sum(), 2)
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    # Remove features that occur fewer than <min_count> times. A feature
    # occurring exactly min_count times is kept (strict "<", matching the
    # message printed below).
    min_count = 100
    min_matrix = review_features[:, words_count >= min_count]
    min_matrix_shape = min_matrix.shape
    print(f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}")

    # TODO: split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below that all your data has been retained in your splits; do not edit.
    # Must all print True
    check_splits(train, val, review_features, ids)
def main(data_file, vocab_path):
    """Extract function word features from a text file.

    Exercise skeleton: every TODO below still holds a placeholder value, so
    the printed statistics describe a 1x1 zero matrix until they are
    filled in.

    Args:
        data_file: passed to ``load_reviews``.
        vocab_path: passed to ``load_function_words``.
    """
    # load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    # TODO: appropriately shape and fill this matrix
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    review_features = np.zeros((1, 1), dtype=int)
    # row is which review
    # column is which word
    print(
        f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )

    # TODO: Calculate these from review_features
    most_common_count = 0
    most_common_word = ""
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    # TODO: Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = []
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f" {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO: make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    # TODO: normalize features for review length (divide rows by number of function words in the review)
    # HINT: each row should sum to 1
    norm_reviews = np.copy(review_features)
    norm_reviews_sum = norm_reviews.sum()
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    # TODO: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    min_matrix_shape = min_matrix.shape
    print(
        f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    # TODO: split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below that all your data has been retained in your splits; do not edit.
    # Must all print True
    check_splits(train, val, review_features, ids)
def main(data_file, vocab_path):
    """Extract function word features from a text file.

    Builds the (review x function word) matrix with ``feature_matrix``,
    prints summary statistics about it (most common word, unseen words,
    binary / normalised / frequency-filtered variants), then splits the
    data and checks the split.

    Args:
        data_file: passed to ``load_reviews``.
        vocab_path: passed to ``load_function_words``.
    """
    ### load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    ### build the feature matrix
    # row is which review, column is which word.
    # (The former np.zeros(..., dtype=np.int) pre-allocation was immediately
    # overwritten and crashed on NumPy >= 1.24, so it is gone.)
    review_features = feature_matrix(reviews, function_words)
    print(
        f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )

    ### most common word: first column with the largest total count
    column_sum = np.sum(review_features, axis=0)
    most_common_index = int(np.argmax(column_sum))
    most_common_count = column_sum[most_common_index]
    most_common_word = function_words[most_common_index]
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    ### Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = np.where(column_sum == 0)[0]
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f" {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    ### binary features: 1 where the word occurs at all, else 0
    word_binary = np.where(review_features > 0, 1, 0)
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    ### normalize each row by its total count
    # The previous cell-by-cell loop wrote the quotients back into a copy of
    # the count matrix (truncating them if that matrix is integer-typed) and
    # divided by a row sum that shrank as cells were overwritten. Here the
    # row totals are computed once and the result is a float matrix; rows
    # with a zero total stay all-zero instead of dividing by zero.
    row_totals = review_features.sum(axis=1, keepdims=True)
    norm_reviews = np.divide(
        review_features,
        row_totals,
        out=np.zeros(review_features.shape, dtype=float),
        where=row_totals != 0,
    )
    norm_reviews_sum = norm_reviews.sum()
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    ### remove features from <review_features> that occur less than <min_count> times
    # Strict "<": a feature occurring exactly min_count times is kept,
    # matching the message printed below. Columns are selected directly
    # rather than re-extracting features for a reduced vocabulary.
    min_count = 100
    min_matrix = review_features[:, column_sum >= min_count]
    min_matrix_shape = min_matrix.shape
    print(
        f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    ### split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below that all your data has been retained in your splits; do not edit.
    # Must all print True
    check_splits(train, val, review_features, ids)
def main(data_file, vocab_path):
    """Build and evaluate Naive Bayes classifiers for the federalist papers.

    Loads features and numeric author labels, splits them into train and
    test parts, and prints the accuracy of a multinomial NB model, a
    Bernoulli NB model (on binarised features) and a zero-rule baseline,
    followed by the author the zero rule predicts.

    Args:
        data_file: passed to ``parse_federalist_papers``.
        vocab_path: passed to ``load_function_words``.
    """
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    function_words = load_function_words(vocab_path)

    # load the attributed essays into a feature matrix
    X = load_features(essays, function_words)

    # author names -> numeric label vector y
    labels_map = labels_to_key(authors)
    y = np.asarray(labels_to_y(authors, labels_map))
    print(f"X has shape {X.shape} and dtype {X.dtype}")
    print(f"y has shape {y.shape} and dtype {y.dtype}")

    # shuffle/split is delegated to split_data
    train, test = split_data(X, y, 0.25)
    # Sanity check: the split must not drop or duplicate datapoints.
    data_size_after = len(train[1]) + len(test[1])
    assert data_size_after == y.size, f"Number of datapoints after split {data_size_after} must match size before {y.size}"
    print(f"{len(train[0])} in train; {len(test[0])} in test")

    # multinomial NB on the raw features
    nb_mul = MultinomialNB()
    nb_mul.fit(train[0], train[1])
    pred_mul = nb_mul.predict(test[0])
    acc_mul = metrics.accuracy_score(test[1], pred_mul)
    print(f"Accuracy of Multinomial NB method: {acc_mul:0.03f}")

    # Bernoulli NB on presence/absence features (1 where the value is > 0)
    train_bi = np.where(train[0] > 0, 1, 0)
    test_bi = np.where(test[0] > 0, 1, 0)
    nb_ber = BernoulliNB()
    nb_ber.fit(train_bi, train[1])
    pred_ber = nb_ber.predict(test_bi)
    acc_ber = metrics.accuracy_score(test[1], pred_ber)
    print(f"Accuracy of Bernoulli NB method: {acc_ber:0.03f}")

    # zero rule: learn the most frequent class on train, apply it to test
    most_frequent_class = find_zero_rule_class(train[1])
    test_predictions = apply_zero_rule(test[0], most_frequent_class)
    test_accuracy = calculate_accuracy(test_predictions, test[1])
    print(f"Accuracy of Zero rule: {test_accuracy:0.03f}")

    # Look up the label string for the predicted class. The mapping built
    # above is reused: the former `author_key` name was never assigned (its
    # assignment appears to have been commented out), which raised NameError.
    # NOTE(review): assumes labels_map is a dict of author name -> class id.
    reverse_author_key = {v: k for k, v in labels_map.items()}
    print(
        f"The author predicted by the Zero rule is {reverse_author_key[most_frequent_class]}"
    )
def main(data_file, vocab_path):
    """Extract function word features from a text file.

    Exercise skeleton: every TODO below still holds a placeholder value, so
    the printed statistics describe a 1x1 zero matrix until they are
    filled in.

    Args:
        data_file: passed to ``load_reviews``.
        vocab_path: passed to ``load_function_words``.
    """
    # load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    # TODO 0: appropriately shape and fill this matrix
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    review_features = np.zeros((1, 1), dtype=int)
    # row is which review
    # column is which word
    print(
        f"0: Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )
    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO 1: Figure out what the most common word (feature) is in review_features. Do not hardcode the answer
    most_common_count = 0
    most_common_word = ""
    print(
        f"1. Most common word: {most_common_word}, count: {most_common_count}")

    # TODO 2: Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = []
    if len(zero_inds) > 0:
        print("2. No instances found for: ")
        for ind in zero_inds:
            print(f" {function_words[ind]}")
    else:
        print("2. All function words found")

    # TODO 3: make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    word_binary_sum = word_binary.sum()
    print(f"3: Sum of binary matrix: {word_binary_sum}")

    # TODO 4: normalize features for review length (divide rows by number of *function words* in the review)
    # HINT: each row should sum to 1
    norm_reviews = np.copy(review_features)
    norm_reviews_sum = norm_reviews.sum()
    print(f"4: Sum of normed matrix: {norm_reviews_sum}")

    # TODO 5: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    min_matrix_shape = min_matrix.shape
    print(
        f"5: Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    # TODO 6: normalize features by each feature's *document frequency*
    # For THIS exercise, divide each count by the number of documents that has that feature at all
    # (be careful not to divide by *total count* of the feature)
    # perform this on the matrix from TODO 5
    df_norm_reviews = np.copy(min_matrix)
    df_norm_reviews_sum = df_norm_reviews.sum()
    print(f"6: Sum of document frequency normed matrix: {df_norm_reviews_sum}")