Beispiel #1
0
def collect_and_output_normalized_corpus_term_frequencies(
        corpus, corpus_name, term_frequencies=None):
    """Write log-normalized term frequencies to CSV and chart them.

    Each term's raw count is normalized as 1 + log10(count), written to
    normalized_term_frequencies.csv, and charted (sorted by descending
    normalized frequency) to normalized_term_frequencies.png.

    corpus: the corpus to count terms in (only used when term_frequencies
        is None).
    corpus_name: human-readable name embedded in the chart title.
    term_frequencies: optional precomputed {term: count} mapping; computed
        via collect_term_counts(corpus) when omitted.

    Returns the (raw, unnormalized) term_frequencies mapping.
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    output_csv_file = fs.open_csv_file("normalized_term_frequencies.csv",
                                       ["Term", "Log Normalized TF"])

    unsorted_array = []

    # .items() replaces the Python-2-only .iteritems() so this runs on
    # both Python 2 and 3.
    for term, frequency in term_frequencies.items():
        # Standard log normalization; counts are >= 1, so result is >= 1.
        normalized_term_frequency = 1 + math.log(frequency, 10)
        unsorted_array.append([term, normalized_term_frequency])
        output_csv_file.writerow([term, normalized_term_frequency])

    sorted_array = sorted(unsorted_array,
                          key=lambda term_frequency: term_frequency[1],
                          reverse=True)

    # output a bar chart illustrating the above; the index list presumably
    # selects the three most and three least frequent entries of
    # sorted_array -- confirm against chart_term_frequencies.
    chart_term_frequencies(
        "normalized_term_frequencies.png",
        "Log Normalized Term Frequencies (" + corpus_name + ")",
        "Term Frequencies", sorted_array, [0, 1, 2, -3, -2, -1])

    return term_frequencies
Beispiel #2
0
def collect_and_output_normalized_corpus_term_frequencies(corpus, corpus_name, term_frequencies=None):
    """Write log-normalized term frequencies to CSV and chart them.

    Each term's raw count is normalized as 1 + log10(count), written to
    normalized_term_frequencies.csv, and charted (sorted by descending
    normalized frequency) to normalized_term_frequencies.png.

    corpus: the corpus to count terms in (only used when term_frequencies
        is None).
    corpus_name: human-readable name embedded in the chart title.
    term_frequencies: optional precomputed {term: count} mapping; computed
        via collect_term_counts(corpus) when omitted.

    Returns the (raw, unnormalized) term_frequencies mapping.
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    output_csv_file = fs.open_csv_file("normalized_term_frequencies.csv", ["Term", "Log Normalized TF"])

    unsorted_array = []

    # .items() replaces the Python-2-only .iteritems() so this runs on
    # both Python 2 and 3.
    for term, frequency in term_frequencies.items():
        # Standard log normalization; counts are >= 1, so result is >= 1.
        normalized_term_frequency = 1 + math.log(frequency, 10)
        unsorted_array.append([term, normalized_term_frequency])
        output_csv_file.writerow([term, normalized_term_frequency])

    sorted_array = sorted(unsorted_array, key=lambda term_frequency: term_frequency[1], reverse=True)

    # output a bar chart illustrating the above; the index list presumably
    # selects the three most and three least frequent entries of
    # sorted_array -- confirm against chart_term_frequencies.
    chart_term_frequencies(
        "normalized_term_frequencies.png",
        "Log Normalized Term Frequencies (" + corpus_name + ")",
        "Term Frequencies",
        sorted_array,
        [0, 1, 2, -3, -2, -1],
    )

    return term_frequencies
Beispiel #3
0
def output_corpus_terms(corpus, unique_vocabulary=None):
    """Write every unique corpus term, one per row, to corpus_terms.csv.

    corpus: the corpus to extract terms from (only used when
        unique_vocabulary is None).
    unique_vocabulary: optional precomputed collection of unique terms;
        collected via collect_unique_terms(corpus) when omitted.
    """
    vocabulary = (collect_unique_terms(corpus)
                  if unique_vocabulary is None else unique_vocabulary)

    csv_writer = fs.open_csv_file("corpus_terms.csv", ["Term"])

    for vocabulary_term in vocabulary:
        logging.debug(vocabulary_term)
        csv_writer.writerow([vocabulary_term])
Beispiel #4
0
def output_corpus_terms(corpus, unique_vocabulary=None):
    """Write every unique corpus term, one per row, to corpus_terms.csv.

    corpus: the corpus to extract terms from (only used when
        unique_vocabulary is None).
    unique_vocabulary: optional precomputed collection of unique terms;
        collected via collect_unique_terms(corpus) when omitted.
    """
    vocabulary = (collect_unique_terms(corpus)
                  if unique_vocabulary is None else unique_vocabulary)

    csv_writer = fs.open_csv_file("corpus_terms.csv", ["Term"])

    for vocabulary_term in vocabulary:
        logging.debug(vocabulary_term)
        csv_writer.writerow([vocabulary_term])
Beispiel #5
0
def collect_and_output_frequency_frequencies(corpus, corpus_name,
                                             term_frequencies):
    """Count how many terms share each frequency; write CSV and chart it.

    Tallies a {term_frequency: number_of_terms_with_that_frequency} mapping,
    writes every pair to frequency_frequencies.csv (sorted by descending
    frequency-frequency), and charts the 21 most common frequencies to
    frequency_frequencies.png.

    corpus: the corpus to count terms in (only used when term_frequencies
        is None).
    corpus_name: human-readable name embedded in the chart title.
    term_frequencies: {term: count} mapping, or None to compute it via
        collect_term_counts(corpus).

    Returns the frequency-frequency mapping.
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    # Tally how many distinct terms occur with each frequency.
    # dict.get() replaces the Python-2-only dict.has_key(), and .values()
    # replaces .iteritems() (the keys were unused), so this runs on both
    # Python 2 and 3.
    frequency_frequencies = {}
    for frequency in term_frequencies.values():
        frequency_frequencies[frequency] = (
            frequency_frequencies.get(frequency, 0) + 1)

    unsorted_array = [[key, value]
                      for key, value in frequency_frequencies.items()]
    sorted_array = sorted(
        unsorted_array,
        key=lambda frequency_frequency: frequency_frequency[1],
        reverse=True)

    frequency_frequencies_to_chart = []
    frequencies_to_chart = []
    output_csv_file = fs.open_csv_file(
        "frequency_frequencies.csv", ["Frequency Frequency", "Term Frequency"])

    # we collect frequencies_to_chart and frequency_frequencies_to_chart each
    # into their own single dimensional array.  Then we pass
    # frequency_frequencies_to_chart in an array so that it is 2D as needed
    # by the chart.  This means there is exactly 1 data set and 6 columns of
    # data in the set.  There is no second set to compare it to.
    for index, (term_frequency,
                frequency_frequency) in enumerate(sorted_array):
        output_csv_file.writerow([frequency_frequency, term_frequency])
        # Only the 21 most common frequencies (index 0..20) are charted.
        if index <= 20:
            frequencies_to_chart.append(term_frequency)
            frequency_frequencies_to_chart.append(frequency_frequency)

    charting.bar_chart(
        "frequency_frequencies.png", [frequency_frequencies_to_chart],
        "Frequency Frequencies (" + corpus_name + ")", frequencies_to_chart,
        "Frequency Frequency", None,
        ['#59799e', '#810CE8', '#FF0000', '#12995D', '#FD53FF', '#AA55CC'],
        0.2, 0.0)

    return frequency_frequencies
Beispiel #6
0
def collect_and_output_frequency_frequencies(corpus, corpus_name, term_frequencies):
    """Count how many terms share each frequency; write CSV and chart it.

    Tallies a {term_frequency: number_of_terms_with_that_frequency} mapping,
    writes every pair to frequency_frequencies.csv (sorted by descending
    frequency-frequency), and charts the 21 most common frequencies to
    frequency_frequencies.png.

    corpus: the corpus to count terms in (only used when term_frequencies
        is None).
    corpus_name: human-readable name embedded in the chart title.
    term_frequencies: {term: count} mapping, or None to compute it via
        collect_term_counts(corpus).

    Returns the frequency-frequency mapping.
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    # Tally how many distinct terms occur with each frequency.
    # dict.get() replaces the Python-2-only dict.has_key(), and .values()
    # replaces .iteritems() (the keys were unused), so this runs on both
    # Python 2 and 3.
    frequency_frequencies = {}
    for frequency in term_frequencies.values():
        frequency_frequencies[frequency] = frequency_frequencies.get(frequency, 0) + 1

    unsorted_array = [[key, value] for key, value in frequency_frequencies.items()]
    sorted_array = sorted(unsorted_array, key=lambda frequency_frequency: frequency_frequency[1], reverse=True)

    frequency_frequencies_to_chart = []
    frequencies_to_chart = []
    output_csv_file = fs.open_csv_file("frequency_frequencies.csv", ["Frequency Frequency", "Term Frequency"])

    # we collect frequencies_to_chart and frequency_frequencies_to_chart each
    # into their own single dimensional array.  Then we pass
    # frequency_frequencies_to_chart in an array so that it is 2D as needed
    # by the chart.  This means there is exactly 1 data set and 6 columns of
    # data in the set.  There is no second set to compare it to.
    for index, (term_frequency, frequency_frequency) in enumerate(sorted_array):
        output_csv_file.writerow([frequency_frequency, term_frequency])
        # Only the 21 most common frequencies (index 0..20) are charted.
        if index <= 20:
            frequencies_to_chart.append(term_frequency)
            frequency_frequencies_to_chart.append(frequency_frequency)

    charting.bar_chart(
        "frequency_frequencies.png",
        [frequency_frequencies_to_chart],
        "Frequency Frequencies (" + corpus_name + ")",
        frequencies_to_chart,
        "Frequency Frequency",
        None,
        ["#59799e", "#810CE8", "#FF0000", "#12995D", "#FD53FF", "#AA55CC"],
        0.2,
        0.0,
    )

    return frequency_frequencies
Beispiel #7
0
def train_classifier(args):
    """Build the Naive Bayes training CSV from the configured corpora.

    For each training corpus, counts term occurrences (optionally after
    stemming), filters out English stopwords and non-alphanumeric tokens,
    and writes one (class, term, count) row per surviving term to
    bayes_training.csv.  The corpus name doubles as the class label.

    args: dict of options; must contain one entry per name in
        training_set_names plus a boolean "stemming" flag.
    """
    logging.debug("Training classifier")

    # Use the same corpora that we have used in previous demos
    training_set_names = [
        "abc", "genesis", "gutenberg", "inaugural", "stateUnion", "webtext",
        "custom"
    ]

    # Open a CSV file with 3 columns.  First column is the name of the corpus
    # (which in this example is also the name of the class).  Second is a
    # single term from the corpus.  Third is the count with which the term
    # occurs.  NOTE(review): the header says "probability" but raw counts are
    # written below -- confirm which is intended.
    training = fs.open_csv_file("bayes_training.csv",
                                ["class", "term", "probability"])

    # Ignore stopwords; a set makes the per-term membership test O(1)
    # instead of O(n) over the stopword list.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Iterate through each of the training sets
    for training_set_name in training_set_names:

        # Load the words and corpus name from the requested corpus.
        terms_array, corpus_name = words.load_text_corpus(
            {training_set_name: args[training_set_name]})

        # Stem the terms if stemming is enabled
        if args["stemming"]:
            terms_array = words.stem_words_array(terms_array)

        # Count up the unique terms in the words array
        term_counts = words.collect_term_counts(terms_array)

        # Write the count of each term occurring in the given class out to
        # the CSV.  .items() replaces the Python-2-only .iteritems() so this
        # runs on both Python 2 and 3.
        for term, count in term_counts.items():
            # We ignore stop words and punctuation
            if term not in stopwords and term.isalnum():
                training.writerow([corpus_name, term.lower(), count])
Beispiel #8
0
def collect_and_output_corpus_term_frequencies(corpus, corpus_name):
    """Write raw term frequencies to CSV and chart them.

    Counts every term in the corpus, writes (term, frequency) rows to
    term_frequencies.csv sorted by descending frequency, and charts the
    result to term_frequencies.png.

    corpus: the corpus to count terms in.
    corpus_name: human-readable name embedded in the chart title.

    Returns the {term: count} mapping.
    """
    term_frequencies = collect_term_counts(corpus)

    output_csv_file = fs.open_csv_file("term_frequencies.csv", ["Term", "Frequency"])

    # .items() replaces the Python-2-only .iteritems() so this runs on
    # both Python 2 and 3.
    unsorted_array = [[key, value] for key, value in term_frequencies.items()]
    sorted_array = sorted(unsorted_array, key=lambda term_frequency: term_frequency[1], reverse=True)

    for term, frequency in sorted_array:
        output_csv_file.writerow([term, frequency])

    # output a bar chart illustrating the above; the index list presumably
    # selects the three most and three least frequent entries of
    # sorted_array -- confirm against chart_term_frequencies.
    chart_term_frequencies(
        "term_frequencies.png",
        "Term Frequencies (" + corpus_name + ")",
        "Term Frequencies",
        sorted_array,
        [0, 1, 2, -3, -2, -1],
    )

    return term_frequencies
Beispiel #9
0
def collect_and_output_corpus_term_frequencies(corpus, corpus_name):
    """Write raw term frequencies to CSV and chart them.

    Counts every term in the corpus, writes (term, frequency) rows to
    term_frequencies.csv sorted by descending frequency, and charts the
    result to term_frequencies.png.

    corpus: the corpus to count terms in.
    corpus_name: human-readable name embedded in the chart title.

    Returns the {term: count} mapping.
    """
    term_frequencies = collect_term_counts(corpus)

    output_csv_file = fs.open_csv_file("term_frequencies.csv",
                                       ["Term", "Frequency"])

    # .items() replaces the Python-2-only .iteritems() so this runs on
    # both Python 2 and 3.
    unsorted_array = [[key, value]
                      for key, value in term_frequencies.items()]
    sorted_array = sorted(unsorted_array,
                          key=lambda term_frequency: term_frequency[1],
                          reverse=True)

    for term, frequency in sorted_array:
        output_csv_file.writerow([term, frequency])

    # output a bar chart illustrating the above; the index list presumably
    # selects the three most and three least frequent entries of
    # sorted_array -- confirm against chart_term_frequencies.
    chart_term_frequencies("term_frequencies.png",
                           "Term Frequencies (" + corpus_name + ")",
                           "Term Frequencies", sorted_array,
                           [0, 1, 2, -3, -2, -1])

    return term_frequencies
Beispiel #10
0
def train_classifier(args):
    """Build the Naive Bayes training CSV from the configured corpora.

    For each training corpus, counts term occurrences (optionally after
    stemming), filters out English stopwords and non-alphanumeric tokens,
    and writes one (class, term, count) row per surviving term to
    bayes_training.csv.  The corpus name doubles as the class label.

    args: dict of options; must contain one entry per name in
        training_set_names plus a boolean "stemming" flag.
    """
    logging.debug("Training classifier")

    # Use the same corpora that we have used in previous demos
    training_set_names = ["abc", "genesis", "gutenberg", "inaugural", "stateUnion", "webtext", "custom"]

    # Open a CSV file with 3 columns.  First column is the name of the corpus
    # (which in this example is also the name of the class).  Second is a
    # single term from the corpus.  Third is the count with which the term
    # occurs.  NOTE(review): the header says "probability" but raw counts are
    # written below -- confirm which is intended.
    training = fs.open_csv_file("bayes_training.csv", ["class", "term", "probability"])

    # Ignore stopwords; a set makes the per-term membership test O(1)
    # instead of O(n) over the stopword list.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Iterate through each of the training sets
    for training_set_name in training_set_names:

        # Load the words and corpus name from the requested corpus.
        terms_array, corpus_name = words.load_text_corpus({training_set_name: args[training_set_name]})

        # Stem the terms if stemming is enabled
        if args["stemming"]:
            terms_array = words.stem_words_array(terms_array)

        # Count up the unique terms in the words array
        term_counts = words.collect_term_counts(terms_array)

        # Write the count of each term occurring in the given class out to
        # the CSV.  .items() replaces the Python-2-only .iteritems() so this
        # runs on both Python 2 and 3.
        for term, count in term_counts.items():
            # We ignore stop words and punctuation
            if term not in stopwords and term.isalnum():
                training.writerow([corpus_name, term.lower(), count])