Example 1
def main(data_file, vocab_path):
    """Build and evaluate Naive Bayes classifiers for the federalist papers"""

    function_words = load_function_words(vocab_path)

    authors, essays, essay_ids = parse_federalist_papers(data_file)

    # load the attributed essays into a feature matrix
    X = load_features(essays, function_words)
    # TODO: load the author names into a vector y, mapped to 0 and 1, using functions from util.py
    labels_map = labels_to_key(authors)
    y = np.asarray(labels_to_y(authors, labels_map))

    print(f"Numpy array has shape {X.shape} and dtype {X.dtype}")
Example 2
def main(data_file, vocab_path):
    """Build and evaluate Naive Bayes classifiers for the federalist papers"""

    authors, essays, essay_ids = parse_federalist_papers(data_file)

    function_words = load_function_words(vocab_path)
    # load the attributed essays into a feature matrix
    # map the two authors to classes 0 and 1; the label mapping
    # records which author became which class, so we can later check
    # which class makes up the larger share of the data

    X = load_features(essays, function_words)
    # TODO: load the author names into a vector y, mapped to 0 and 1, using functions from util.py

    labels_map = labels_to_key(authors)
    print(labels_map)
    # y is a list of 0s and 1s, one per essay, recording which author wrote it
    # y is the gold standard, used for both training and evaluation
    y = np.asarray(labels_to_y(authors, labels_map))
    # X and y are now numeric arrays
    print(f"Numpy array has shape {X.shape} and dtype {X.dtype}")

    # TODO shuffle, then split the data
    # if split_data already shuffles internally, no separate shuffle import is needed
    train, test = split_data(X, y, 0.25)

    # TODO: train a multinomial NB model, evaluate on validation split
    nbm = MultinomialNB()
    # inspect MultinomialNB to see what parameters fit() expects
    # train is a (features, labels) tuple: train[0] is the feature array, train[1] the targets
    # after the split the full y no longer aligns with train[0],
    # so fit on the labels inside train rather than on y directly
    nbm.fit(train[0], train[1])
    preds_nbm = nbm.predict(test[0])
    test_y = test[1]
    accuracy = calculate_accuracy(preds_nbm, test_y)

    print(f" the accuracy for multinomial NB model is {accuracy}")

    # TODO: train a Bernoulli NB model, evaluate on validation split

    nbb = BernoulliNB()
    nbb.fit(train[0], train[1])
    preds_nbb = nbb.predict(test[0])
    accuracy = calculate_accuracy(preds_nbb, test_y)

    print(f" the accuracy for Bernoulli NB model is {accuracy}")

    # TODO: fit the zero rule
    train_y = train[1]
    most_frequent_class = find_zero_rule_class(train_y)
    print(f"the most frequent class is {most_frequent_class}")
    test_predictions = apply_zero_rule(test[0], most_frequent_class)
    test_accuracy = calculate_accuracy(test_predictions, test_y)
    print(f" the accuracy for the baseline is {test_accuracy}")
Example 3
def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)

    reviews, ids = load_reviews(data_file)

    # TODO: appropriately shape and fill this matrix
    # define the shape of this 2d array
    nrows = len(ids)
    ncols = len(function_words)
    # initialize the 2d array
    review_features = np.zeros((nrows, ncols), dtype=int)  # use the builtin int; np.int was removed from NumPy
    # fill in the value of the 2d array
    for i in range(len(reviews)):
        # tokenize and lowercase all the words
        each_review = word_tokenize(reviews[i].lower())
        # loop through each word of each review and fill in the value of the 2d array
        for word in each_review:
            if word in function_words:
                word_index = function_words.index(word)
                review_features[i][word_index] += 1
    # row is which review
    # column is which word

    print(f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}")

    # TODO: Calculate these from review_features
    # sum each column to get per-feature totals
    words_count = review_features.sum(axis=0)
    # get the most common word
    most_common_word_index = int(words_count.argmax())
    most_common_count = words_count[most_common_word_index]
    most_common_word = function_words[most_common_word_index]
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    # TODO: Find any features that weren't in the data (i.e. columns that sum to 0)
    # initialize a list for index whose column sum is zero
    zero_inds = []
    # loop through the list of the sum of columns, append index whose column sum is zero to the list just initialized
    for i in range(len(words_count)):
        if words_count[i] == 0:
            zero_inds.append(i)
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO: make a binary feature vector from your count vector
    # copy the 2d array and convert it to a binary vector
    word_binary = np.copy(review_features)
    # loop through each entry and convert the value whose value is not zero to one
    for i in range(len(word_binary)):
        for j in range(len(word_binary[i])):
            if word_binary[i][j] > 0:
                word_binary[i][j] = 1
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    # TODO: normalize features by review length (divide rows by number of words in the review)
    # divide in floating point so the integer counts aren't truncated
    row_sums = review_features.sum(axis=1, keepdims=True)
    norm_reviews = review_features / row_sums
    # round the decimals
    norm_reviews_sum = round(norm_reviews.sum(), 2)
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    # TODO: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    # collect the indices of columns whose sum is below the minimum count
    remove_column_index = []
    for i in range(len(words_count)):
        if words_count[i] < min_count:
            remove_column_index.append(i)
    # remove columns whose column sum is less than the minimum count by np.delete(array, list of index to remove, axis = 1)
    min_matrix = np.delete(min_matrix, remove_column_index, 1)
    min_matrix_shape = min_matrix.shape
    print(f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}")

    # TODO: split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True

    check_splits(train, val, review_features, ids)
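Several examples call split_data(X, y, fraction) and index the result as train[0]/train[1] and test[0]/test[1]. A sketch of a shuffle-then-split implementation that matches that calling convention (an assumption; the real util.py may differ):

import numpy as np

def split_data(X, y, test_fraction):
    """Shuffle X and y in parallel, then hold out test_fraction of the rows (hypothetical sketch)."""
    X, y = np.asarray(X), np.asarray(y)
    rng = np.random.default_rng(0)  # fixed seed is an assumption, for reproducibility
    order = rng.permutation(len(y))
    cutoff = int(len(y) * (1 - test_fraction))
    train_idx, test_idx = order[:cutoff], order[cutoff:]
    return (X[train_idx], y[train_idx]), (X[test_idx], y[test_idx])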
Example 4
def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)

    reviews, ids = load_reviews(data_file)

    # TODO: appropriately shape and fill this matrix
    review_features = np.zeros((1, 1), dtype=int)
    # row is which review
    # column is which word

    print(
        f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )

    # TODO: Calculate these from review_features
    most_common_count = 0
    most_common_word = ""
    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    # TODO: Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = []
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO: make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    # TODO: normalize features for review length (divide rows by number of function words in the review)
    # HINT: each row should sum to 1
    norm_reviews = np.copy(review_features)
    norm_reviews_sum = norm_reviews.sum()
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    # TODO: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    min_matrix_shape = min_matrix.shape
    print(
        f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    # TODO: split the dataset by updating the function above

    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True

    check_splits(train, val, review_features, ids)
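The TODOs in this scaffold can also be solved with vectorized NumPy instead of explicit loops. One possible set of answers, sketched under the assumption that review_features already holds the raw counts:

column_sums = review_features.sum(axis=0)                     # per-feature totals
most_common_word = function_words[int(column_sums.argmax())]  # most frequent feature
zero_inds = list(np.where(column_sums == 0)[0])               # features never observed
word_binary = (review_features > 0).astype(int)               # presence/absence matrix
norm_reviews = review_features / review_features.sum(axis=1, keepdims=True)  # each row sums to 1
min_matrix = review_features[:, column_sums >= min_count]     # keep features seen at least min_count times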
Example 5
def main(data_file, vocab_path):
    """extract function word features from a text file"""

    ### load resources and text file
    function_words = load_function_words(vocab_path)
    reviews, ids = load_reviews(data_file)

    ### appropriately shape and fill this matrix
    review_features = feature_matrix(reviews, function_words)
    # row is which review
    # column is which word
    print(
        f"Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )

    ### Calculate these from review_features
    column_sum = np.sum(review_features, axis=0)
    most_common_count = max(column_sum)

    index = np.where(column_sum == column_sum.max())
    most_common_word = function_words[index[0][0]]

    print(f"Most common word: {most_common_word}, count: {most_common_count}")

    ### Find any features that weren't in the data (i.e. columns that sum to 0)
    index = np.where(column_sum == 0)
    zero_inds = index[0]
    if len(zero_inds) > 0:
        print("No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("All function words found")

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    ### make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    for i in range(len(reviews)):
        word_binary[i] = np.where(word_binary[i] > 0, 1, 0)

    word_binary_sum = word_binary.sum()
    print(f"Sum of binary matrix: {word_binary_sum}")

    ### normalize features by review length (divide rows by number of words in the review)
    norm_reviews = np.copy(review_features)

    norm_reviews = norm_reviews.astype(float)  # cast so the division isn't truncated back to int
    for i in range(len(reviews)):
        row_sum = norm_reviews[i].sum()  # take the row sum once, before dividing any entries
        norm_reviews[i] = norm_reviews[i] / row_sum

    norm_reviews_sum = norm_reviews.sum()
    print(f"Sum of normed matrix: {norm_reviews_sum}")

    ### remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    index = np.where(column_sum < min_count)
    mincnt_ind = index[0]

    kept_function_words = []
    for i in range(len(function_words)):
        if i not in mincnt_ind:
            kept_function_words.append(function_words[i])

    min_matrix = feature_matrix(reviews, kept_function_words)

    min_matrix_shape = min_matrix.shape
    print(
        f"Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    ### split the dataset by updating the function above
    train, val = split_data(review_features, ids, 0.3)

    # Code below checks that all your data has been retained in your splits; do not edit.
    # Must all print True

    check_splits(train, val, review_features, ids)
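This example relies on a feature_matrix helper that the snippet does not define. A sketch of what it might look like, mirroring the fill loop in Example 3 (the dict lookup replaces the slower list.index; requires NLTK's punkt tokenizer data):

import numpy as np
from nltk.tokenize import word_tokenize

def feature_matrix(reviews, function_words):
    """Count occurrences of each function word in each review (hypothetical sketch)."""
    counts = np.zeros((len(reviews), len(function_words)), dtype=int)
    col = {word: j for j, word in enumerate(function_words)}  # word -> column index
    for i, review in enumerate(reviews):
        for token in word_tokenize(review.lower()):
            if token in col:
                counts[i, col[token]] += 1
    return counts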
Example 6
def main(data_file, vocab_path):
    """Build and evaluate Naive Bayes classifiers for the federalist papers"""
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    function_words = load_function_words(vocab_path)
    # load the attributed essays into a feature matrix
    X = load_features(essays, function_words)

    # TODO: load the author names into a vector y, mapped to 0 and 1, using functions from util.py
    labels_map = labels_to_key(authors)
    y = np.asarray(labels_to_y(authors, labels_map))
    print(f"X has shape {X.shape} and dtype {X.dtype}")
    print(f"y has shape {y.shape} and dtype {y.dtype}")

    # TODO shuffle, then split the data
    train, test = split_data(X, y, 0.25)
    data_size_after = len(train[1]) + len(test[1])

    assert data_size_after == y.size, f"Number of datapoints after split {data_size_after} must match size before {y.size}"
    print(f"{len(train[0])} in train; {len(test[0])} in test")

    # TODO: train a multinomial NB model, evaluate on validation split
    nb_mul = MultinomialNB()
    nb_mul.fit(train[0], train[1])

    pred_mul = nb_mul.predict(test[0])
    acc_mul = metrics.accuracy_score(test[1], pred_mul)
    print(f"Accuracy of Multinomial NB method: {acc_mul:0.03f}")

    # TODO: train a Bernoulli NB model, evaluate on validation split
    ### make a binary feature vector
    train_bi = np.copy(train[0])
    for i in range(len(train_bi)):
        train_bi[i] = np.where(train_bi[i] > 0, 1, 0)
    test_bi = np.copy(test[0])
    for i in range(len(test_bi)):
        test_bi[i] = np.where(test_bi[i] > 0, 1, 0)

    nb_ber = BernoulliNB()
    nb_ber.fit(train_bi, train[1])

    pred_ber = nb_ber.predict(test_bi)
    acc_ber = metrics.accuracy_score(test[1], pred_ber)
    print(f"Accuracy of Bernoulli NB method: {acc_ber:0.03f}")

    # TODO: fit the zero rule
    # learn zero rule on train
    most_frequent_class = find_zero_rule_class(train[1])

    # apply zero rule to test reviews
    test_predictions = apply_zero_rule(test[0], most_frequent_class)

    # score accuracy
    test_accuracy = calculate_accuracy(test_predictions, test[1])
    print(f"Accuracy of Zero rule: {test_accuracy:0.03f}")

    # look up the label string from the class number (reuse the mapping built above)
    reverse_author_key = {v: k for k, v in labels_map.items()}
    print(
        f"The author predicted by the Zero rule is {reverse_author_key[most_frequent_class]}"
    )
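The manual binarization loops above work, but scikit-learn's BernoulliNB can binarize internally: its binarize parameter defaults to 0.0, so any count greater than zero is treated as 1. The Bernoulli step could therefore be written against the raw count matrices (train and test as in the example above):

from sklearn.naive_bayes import BernoulliNB

nb_ber = BernoulliNB()          # binarize=0.0 by default: counts > 0 become 1
nb_ber.fit(train[0], train[1])  # raw counts, binarized internally
pred_ber = nb_ber.predict(test[0])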
Example 7
def main(data_file, vocab_path):
    """extract function word features from a text file"""

    # load resources and text file
    function_words = load_function_words(vocab_path)

    reviews, ids = load_reviews(data_file)

    # TODO 0: appropriately shape and fill this matrix
    review_features = np.zeros((1, 1), dtype=int)
    # row is which review
    # column is which word

    print(
        f"0: Numpy array has shape {review_features.shape} and dtype {review_features.dtype}"
    )

    matrix_sum = review_features.sum()
    print(f"Sum of raw count matrix: {matrix_sum}")

    # TODO 1: Figure out what the most common word (feature) is in review_features. Do not hardcode the answer
    most_common_count = 0
    most_common_word = ""
    print(
        f"1. Most common word: {most_common_word}, count: {most_common_count}")

    # TODO 2: Find any features that weren't in the data (i.e. columns that sum to 0)
    zero_inds = []
    if len(zero_inds) > 0:
        print("2. No instances found for: ")
        for ind in zero_inds:
            print(f"  {function_words[ind]}")
    else:
        print("2. All function words found")

    # TODO 3: make a binary feature vector from your count vector
    word_binary = np.copy(review_features)
    word_binary_sum = word_binary.sum()
    print(f"3: Sum of binary matrix: {word_binary_sum}")

    # TODO 4: normalize features for review length (divide rows by number of *function words* in the review)
    # HINT: each row should sum to 1
    norm_reviews = np.copy(review_features)
    norm_reviews_sum = norm_reviews.sum()
    print(f"4: Sum of normed matrix: {norm_reviews_sum}")

    # TODO 5: remove features from <review_features> that occur less than <min_count> times
    min_count = 100
    min_matrix = np.copy(review_features)
    min_matrix_shape = min_matrix.shape
    print(
        f"5: Shape after removing features that occur < {min_count} times: {min_matrix_shape}"
    )

    # TODO 6: normalize features by each feature's *document frequency*
    # For THIS exercise, divide each count by the number of documents that has that feature at all
    # (be careful not to divide by *total count* of the feature)
    # perform this on the matrix from TODO 5
    df_norm_reviews = np.copy(min_matrix)
    df_norm_reviews_sum = df_norm_reviews.sum()
    print(f"6: Sum of document frequency normed matrix: {df_norm_reviews_sum}")