Example #1
def write_issue(issue):
    title      = issue["title"]
    labels_str = ",".join([l["name"] for l in issue["labels"]])
    body       = issue["body"]

    fmt.h2(title + " -- " + labels_str)
    fmt.paragraph(body)
    fmt.rule()
def write_milestone(milestone):
    due = milestone['due']
    due = due.split('T')[0] if due else 'LATER'
    title = milestone['title'] + ' ' + '<span style="float: right; font-size: 75%">(Due: ' + due + ')</span>'
    description = milestone['description']

    fmt.h2(title)
    fmt.paragraph(description)
    write_issues(milestone['issues'])
    fmt.rule()
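Note: the fmt helper used by write_issue and write_milestone is not shown in this example. A minimal sketch of what such a Markdown writer might look like, assuming it simply appends lines to an open file (only the names h2, paragraph, and rule come from the calls above; the class name and constructor are hypothetical):

class MarkdownFormatter:
    """Hypothetical minimal Markdown writer matching the fmt.h2/paragraph/rule calls above."""

    def __init__(self, out):
        self.out = out  # any writable, text-mode file-like object

    def h2(self, text):
        # level-2 heading followed by a blank line
        self.out.write("## " + text + "\n\n")

    def paragraph(self, text):
        self.out.write((text or "") + "\n\n")

    def rule(self):
        # horizontal rule separating sections
        self.out.write("---\n\n")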
Example #4
def main():
    display_plot_of_temperatures()
    plt.savefig("images/testing_training_graph")
    plt.gcf().clear()

    learn(number_of_iterations=100)

    display_plot_of_temperatures()
    mySepLinePlot = build_sep_line_plot()
    mySepLinePlot.title(
        "Plots of temperatures and learning unit activation function")
    plt.savefig("images/activation_line")
    plt.gcf().clear()

    plt.plot(testing_errors)
    plt.ylabel("Euclidean Distance")
    plt.xlabel("Iteration #")
    plt.title("Testing error over iterations")
    plt.savefig("images/testing_error")
    plt.gcf().clear()

    file = open(reportFileName, "w")

    save_markdown_report(file, [
        md.h1("Project 3 Report"),
        md.h2("CMSC 409 - Artificial Intelligence"),
        md.h2("Steven Hernandez"),
        md.p("""1. There would be two input and one output for our unit.
Inputs would be the hour and a bias input while output would be the estimated
temperature at that hour of the day.
In fact, because we have weights for x (hour of the day) and a bias,
we can create the formula net = ax+b which means our unit can simply return net * 1
or the identity."""),
        md.p("""2. The activation function would be some linear function.
Our unit would not have a threshold, however.
Whatever the outcome from the linear activation function is
would be the exact result from the learning unit.
If we look at the graph of temperatures for our training
(and testing) data, we can see that the values are basically
just a linear function."""),
        md.image("./images/testing_training_graph.png",
                 "Testing training graph"),
        md.p("3. Outcome of training with days 1-3:"),
        md.p("Euclidean distance comes down from %f to %f" %
             (testing_errors[0], testing_errors[len(testing_errors) - 1])),
        md.image("./images/testing_error.png", "Testing Error"),
        md.p("resulting in an activation as so:"),
        md.image("./images/activation_line.png", "Testing Error"),
        md.p("4."),
        md.table([
            [
                "input", "expected output", "actual output",
                "Euclidean distance"
            ],
            [5, 59.5, output(5), -59.5 + output(5)],
            [6, 64, output(6), -64 + output(6)],
            [7, 68.7, output(7), -68.7 + output(7)],
            [8, 73.65, output(8), -73.65 + output(8)],
            [9, 78.43, output(9), -78.43 + output(9)],
            [10, 82, output(10), -82 + output(10)],
            [11, 85.2, output(11), -85.2 + output(11)],
            [12, 87, output(12), -87 + output(12)],
            [13, 90.67, output(13), -90.67 + output(13)],
        ]),
        md.p("5. Learning rate was 0.0005 to keep the learning from going too quickly, "
             "while we went through 100 iterations."),
        md.p("Notice from the graph of Euclidean distances above, the improvement peaks "
             "around the 20th iteration mark."),
        md.p("6. As such, after the 20th iteration, we reach a plateau of improvement "
             "with our current system."),
        md.p("7. Using a more complex network with greater than one unit would allow for "
             "more complex output, which would ultimately help us with this problem."),
        md.p("Currently, we are stuck with a linear output because the single unit can "
             "only learn as such."),
    ])

    file.close()

    print("Markdown Report generated in ./report.md")
    print("Converting Markdown file to PDF with ")
    print(
        "`pandoc --latex-engine=xelatex -V geometry=margin=1in -s -o FINAL_REPORT.pdf "
        + reportFileName + "`")

    os.system(
        "pandoc --latex-engine=xelatex -V geometry=margin=1in -s -o FINAL_REPORT.pdf "
        + reportFileName)
    print("Report created")
def main():
    minimum_occurrences = 2
    encountered_words = get_encountered_words(
        minimum_occurrences=minimum_occurrences)
    feature_vector = create_feature_vector(
        minimum_occurrences=minimum_occurrences)

    table = [get_encountered_words(minimum_occurrences=minimum_occurrences)
             ] + feature_vector

    normalized_feature_vector = normalize_feature_vector(feature_vector)

    result = learn_wta(normalized_feature_vector, cluster_count=20)

    clustered_sentences = split_sentences_into_clusters(
        result, normalized_feature_vector)

    clustered_sentences = list(filter(lambda x: x, clustered_sentences))

    def sentence_tuple_to_str(tuple):
        return str(tuple[0]) + ") " + tuple[1]

    clustered_sentence_strings = list(
        map(lambda cluster: list(map(sentence_tuple_to_str, cluster)),
            clustered_sentences))

    file = open(reportFileName, "w")
    md.save_markdown_report(file, [
        md.meta_data("Project 4 Report - CMSC 409 - Artificial Intelligence",
                     "Steven Hernandez"),
        md.p("In total, there are " + str(len(get_encountered_words())) +
             " unique root words found. "),
        md.p(
            str(len(get_encountered_words(minimum_occurrences=2))) +
            " words that are encountered at least 2 times. "),
        md.p("And then only " +
             str(len(get_encountered_words(minimum_occurrences=3))) +
             " words that are encountered at least 3 times. "),
        md.p("These statistics are calculated based on processing the documents in the "
             "following ways:"),
        md.ol([
            "Tokenizing the sentences, which splits each sentence on the spaces to only produce a list of word/numeric "
            "tokens. This allows us to begin processing each word individually without requiring the context of the "
            "entire sentence. ",
            "Removing punctuation is required because in general, punctuation does not provide us textual context. "
            "Again, we are only looking at the similarity of sentence based on the number of occurrences of common "
            "words between the sentences. We are not trying to decifer the intent or the sentiment behind the "
            "sentence, so we do not require punctuation or even placement of words within the sentence. Just that the "
            "word exists "
            "within the sentence. ",
            "Removing numbers because numbers do not provide context about what the sentence is talking about. "
            "A book might cost $20 as would a basic microcontroller like an Arduino, but they are not related. "
            "Additional since, we removed punctuation in the previous step, we wouldn't be able to differentiate "
            "$20 from 20 miles or 20 participants, etc. ",
            "Converting upper to lower case prevents words appearing at the beginning of a sentence (with a required "
            "capital letter) from being considered a different word if it also appears in the middle of a sentence "
            "(which would be written in all lower case) ",
            "Removing stop words shrinks the total number of words that we find. More importantly though, it removes "
            "overly common words that do not provide us useful insights into the similarity of sentences. The word "
            "'the' is very likely to appear in most sentences, thus is not a useful indicator. ",
            "Stemming takes a word in past tense/future tense or plural/singular and takes the 'stem' or 'root' word. "
            "This further shrinks the overall number of words or dimensions that we must analyze. An example: run and "
            "running have the same root word, thus are very similar. ",
            "Combining stemmed words takes these common stemmed root words and combines them so that we can get a "
            "total count of the occurances of the word throughout all sentence documents."
        ],
              alpha=True),
        md.p("On the following page is a table listing all of these root words along with the number of occurrences "
             "of the word throughout the documents (the feature vector)"),
        md.page_break(),
        md.table(split_table_rows(["Root Word", "\# of instances"],
                                  list(count_encountered_words().items()), 49),
                 width=50),
        md.page_break(),
        md.p("The following lists the root words with greater than " +
             str(minimum_occurrences) + "occurrences:"),
        md.table(split_table_rows(
            ["Root Word", "\# of instances"],
            list(({
                k: v
                for k, v in count_encountered_words().items()
                if v > minimum_occurrences
            }).items()), 49),
                 width=50),
        md.page_break(),
        md.p("The following 2 tables show the distribution of root words which appear at least "
             + str(minimum_occurrences) + " times across each "
             "document (with each row indicating one sentence) (This is the Term Document Matrix **TDM**)"),
        md.table(split_table(table, 0, math.floor(len(table[0]) / 2)),
                 width=20),
        md.page_break(),
        md.table(split_table(table,
                             math.floor(len(table[0]) / 2) + 1, len(table[0])),
                 width=20),
        md.page_break(),
        md.h2("Learning"),
        md.p("We begin learning by using the 'Winner Takes All' (WTA) method which means that we begin with `n` "
             "clusters, then iterating for each document, we find the closest cluster using Euclidean distance. "
             "Depending on which cluster's center (based on weight) is closest to the new document, the cluster's "
             "center's weight is changed by a value to better match the resulting pattern. Code below: "),
        md.code(function=learn_wta),
        md.code(function=get_closest_cluster),
        md.code(function=calculate_change_in_weight),
        md.page_break(),
        md.h3("Learned clusters:"),
    ])

    # Show resulting clusters
    for i in range(len(clustered_sentence_strings)):
        md.save_markdown_report(file, [
            md.p("Cluster " + str(i + 1) + ":"),
            md.li(clustered_sentence_strings[i]),
        ])

    # Show bit representation of sentence vectors
    md.save_markdown_report(file, [
        md.p("If we look at the feature vectors as a bit map showing whether a sentence has or does not have "
             "a specific word, we can begin to see the pattern of the clustering method."),
    ])

    def sentence_tuple_to_bit_string(tuple):
        return str(tuple[0]) + ") " + feature_vector_to_bit_string(
            feature_vector[tuple[0]])

    def feature_vector_to_bit_string(vector):
        return ''.join(map(str, vector))

    for i in range(len(clustered_sentences)):
        md.save_markdown_report(file, [
            md.p("Cluster " + str(i + 1) + ":"),
            md.li(
                list(map(sentence_tuple_to_bit_string,
                         clustered_sentences[i]))),
        ])

    md.save_markdown_report(file, [
        md.p("From these bit maps, we can see that each cluster has relatively distinct columns which match "
             "across the documents of the cluster."),
        md.p("Of course, this clustering does split some groups of documents into more clusters than expected. "
             "Some clusters seem as if they could be combined, from a human's point of view. Having additional "
             "sample documents would very likely help with this issue. With this small number of documents, for "
             "example, sentence 12 'Three parking spaces in back, pets are possible with approval from the owner.' "
             "does not mention being about a 'home' or many other words which are used in other documents that "
             "truly identify it as being about a home. With more documents, we would begin to have more overlap, "
             "which could aid in finding which words provide us the most importance. Sentence 10 as well does not "
             "share enough words to be able to identify it with the provided documents."),
        md.p("Below, we can see which words these sentences share in common."),
    ])

    def sentence_tuple_to_formatted_sentence(tuple):
        formatted_sentence = []

        sentence_vector = feature_vector[tuple[0]]

        for i, v in enumerate(sentence_vector):
            if v:
                formatted_sentence.append(encountered_words[i])
        return str(tuple[0]) + ") " + ", ".join(formatted_sentence)

    for i in range(len(clustered_sentence_strings)):
        md.save_markdown_report(file, [
            md.p("Cluster " + str(i + 1) + ":"),
            md.li(
                list(
                    map(sentence_tuple_to_formatted_sentence,
                        clustered_sentences[i]))),
        ])

    md.save_markdown_report(file, [
        md.p("One problem with this method, compared to a method where clusters are created as needed, is that if "
             "the random initialization of weights for a cluster landed in a bad spot, it is likely the cluster "
             "would never contain any sentences, because (as the name implies) the Winner Takes All method "
             "would often find one cluster taking over most of the documents, while other clusters remained empty."),
        md.p("The solution taken here for this problem was to learn on many randomly placed clusters. Learning "
             "began with 20 clusters. From these 20 clusters however, we only end up with "
             + str(len(clustered_sentences)) +
             " clusters. Additionally, (during testing) it would sometimes "
             "result in clusters with only a single result, when the result would have worked better "
             "in some other already defined cluster."),
        md.p("With fewer clusters (for example 4), we occasionally ended up with good results, but often would end "
             "up with most documents stuck in one single cluster."),
        md.p("In addition to having more documents to sample, having clusters only as needed would likely improve "
             "this situation. With clusters-as-needed, clusters would only be able to contain documents within some "
             "radius of the cluster's center. If a document is found outside of this radius, then a new cluster "
             "would be formed at that location.")
    ])

    file.close()

    print("Markdown Report generated in ./report4.md")
    print("Converting Markdown file to PDF with ")
    print(
        "`pandoc --latex-engine=xelatex -V geometry=margin=1in -s -o FINAL_REPORT.pdf "
        + reportFileName + "`")

    os.system(
        "pandoc --latex-engine=xelatex -V geometry=margin=1in -s -o FINAL_REPORT.pdf "
        + reportFileName)
    print("Report created")
Example #6
def main():
    # Data has been generated, so we don't want to regenerate the data.
    # generate_random_data()

    df = pd.read_csv(dataFileName, header=None)
    sepLineA = pd.read_csv(sepLineAFileName, header=None)
    sepLineB = pd.read_csv(sepLineBFileName, header=None)
    #
    errorMatrix1 = get_confusion_matrix(df, sepLineA)
    errorMatrix2 = get_confusion_matrix(df, sepLineB)

    myPlt = build_height_plot(df, sepLineA)
    myPlt.savefig("images/1d")
    myPlt.gcf().clear()

    myPlt = build_height_weight_plot(df, sepLineB)
    myPlt.savefig("images/2d")
    myPlt.gcf().clear()

    file = open(reportFileName, "w")

    save_markdown_report(file, [
        md.h1("Project 1 Report"),
        md.h2("CMSC 409 - Artificial Intelligence"),
        md.h2("Steven Hernandez"),
        md.p("Fully generated data can be found in `./Project1_data/data.txt"),
        md.h3("*Scenerio 1:* using only height."),
        md.table([["", "Weights"], ["x", sepLineA[0][0]],
                  ["bias", sepLineA[0][1]]]),
        md.p("Assuming the following"),
        md.image("./images/net.png"),
        md.p("Or in this situation: "),
        md.p("1 if 0 <= -a(Height) + bias, otherwise 0"),
        md.p("where *a* is some weight and *1* is male and *0* is female."),
        md.p("In this situation a=" + str(sepLineA[0][0]) + " and bias=" +
             str(sepLineA[0][1])),
        md.image("./images/1d.png"),
        md.table([["", "Predicted Male", "Predicted Female"],
                  ["Actual Male", errorMatrix1[1], errorMatrix1[2]],
                  ["Actual Female", errorMatrix1[3], errorMatrix1[0]]]),
        md.p("**Confusion Matrix**"),
        md.table([
            ["", ""],
            ["Error", 1 - ((errorMatrix1[1] + errorMatrix1[0]) / 4000)],
            ["Accuracy", (errorMatrix1[1] + errorMatrix1[0]) / 4000],
            ["True Positive Rate", errorMatrix1[1] / 2000],
            ["True Negative Rate", errorMatrix1[0] / 2000],
            ["False Positive Rate", errorMatrix1[3] / 2000],
            ["False Negative Rate", errorMatrix1[2] / 2000],
        ]),
        md.h3("*Scenerio 2:* heights and weights."),
        md.table([["", "Weights"], ["x", sepLineB[0][0]],
                  ["y", sepLineB[0][1]], ["bias", sepLineB[0][2]]]),
        md.p("Assuming the following"),
        md.image("./images/net.png"),
        md.p("Or in this situation:"),
        md.p("1 if 0 <= a(Height) - b(Weight) + bias, otherwise 0"),
        md.p("where *a* and *b* are some weights and *1* is male and *0* is female."),
        md.p("In this situation a=" + str(sepLineB[0][0]) + " and b=" +
             str(sepLineB[0][1]) + " and bias=" + str(sepLineB[0][2])),
        md.image("./images/2d.png"),
        md.p("Notice, Male and Female are on slightly different levels in this graph "
             "so that one does not completely cover up the other."),
        md.p("**Confusion Matrix**"),
        md.table([["", "Predicted Male", "Predicted Female"],
                  ["Actual Male", errorMatrix2[1], errorMatrix2[2]],
                  ["Actual Female", errorMatrix2[3], errorMatrix2[0]]]),
        md.table([
            ["", ""],
            ["Error", 1 - ((errorMatrix2[1] + errorMatrix2[0]) / 4000)],
            ["Accuracy", (errorMatrix2[1] + errorMatrix2[0]) / 4000],
            ["True Positive Rate", errorMatrix2[1] / 2000],
            ["True Negative Rate", errorMatrix2[0] / 2000],
            ["False Positive Rate", errorMatrix2[3] / 2000],
            ["False Negative Rate", errorMatrix2[2] / 2000],
        ]),
        md.h3("Libraries Used"),
        md.p("matplotlib, numpy, pandas, pandoc"),
        md.h3("Selected Code Functions"),
        md.p("Functions used to generate this data and calculations."),
        md.p("The full code can be found in `./project1.py`"),
        md.code(function=generate_random_data),
        md.code(function=plot_male_and_females),
        md.code(function=plot_male_and_females),
        md.code(function=get_confusion_matrix),
    ])

    file.close()

    print("Markdown Report generated in ./report.md")
    print("Convert Markdown file to PDF with ")
    print(
        "`pandoc --latex-engine=xelatex -V geometry=margin=1in -s -o FINAL_REPORT.pdf report.md`"
    )
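Note: get_confusion_matrix is called above but not shown. A sketch of how the counts could be produced for the height-and-weight scenario, assuming the index layout implied by how errorMatrix is read in main() ([0] = true negatives, [1] = true positives, [2] = false negatives, [3] = false positives) and assuming df holds height, weight, and gender columns with 1 = male:

def get_confusion_matrix_sketch(df, sep_line):
    # sep_line holds the weights a, b and the bias for: predict male if 0 <= a*height - b*weight + bias
    a, b, bias = sep_line[0][0], sep_line[0][1], sep_line[0][2]
    counts = [0, 0, 0, 0]  # [TN, TP, FN, FP]

    for _, row in df.iterrows():
        height, weight, is_male = row[0], row[1], row[2]
        predicted_male = 1 if 0 <= a * height - b * weight + bias else 0

        if is_male and predicted_male:
            counts[1] += 1  # true positive: actual male, predicted male
        elif is_male and not predicted_male:
            counts[2] += 1  # false negative: actual male, predicted female
        elif not is_male and predicted_male:
            counts[3] += 1  # false positive: actual female, predicted male
        else:
            counts[0] += 1  # true negative: actual female, predicted female
    return counts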