def collect_and_output_normalized_corpus_term_frequencies(corpus, corpus_name, term_frequencies=None):
    """Write and chart the log-normalized term frequencies of a corpus.

    Every raw count ``f`` is mapped to ``1 + log10(f)``. The (term, value)
    pairs are sorted by value in descending order, written to
    ``normalized_term_frequencies.csv`` and handed to
    ``chart_term_frequencies`` for ``normalized_term_frequencies.png``.

    Args:
        corpus: Corpus passed to ``collect_term_counts`` when
            ``term_frequencies`` is not supplied.
        corpus_name: Name appended to the chart title.
        term_frequencies: Optional precomputed mapping of term -> raw count.
            Counts must be positive; ``math.log10`` raises ``ValueError``
            otherwise.

    Returns:
        The raw (un-normalized) ``term_frequencies`` mapping, so callers can
        reuse it without recounting.
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    output_csv_file = fs.open_csv_file("normalized_term_frequencies.csv", ["Term", "Log Normalized TF"])

    # items() exists on both Python 2 and Python 3 mappings (iteritems() is
    # Python 2 only). math.log10 is used because math.log(x, 10) suffers from
    # rounding error on exact powers of ten (e.g. log(1000, 10) != 3.0).
    sorted_array = sorted(
        ([term, 1 + math.log10(frequency)] for term, frequency in term_frequencies.items()),
        key=lambda term_frequency: term_frequency[1],
        reverse=True,
    )

    # Rows are written in the same descending order that is charted, matching
    # what collect_and_output_corpus_term_frequencies does for raw counts.
    for term, normalized_term_frequency in sorted_array:
        output_csv_file.writerow([term, normalized_term_frequency])

    # output a bar chart illustrating the above
    # NOTE(review): the index list presumably selects the three highest and
    # three lowest entries of sorted_array -- confirm in chart_term_frequencies.
    chart_term_frequencies(
        "normalized_term_frequencies.png",
        "Log Normalized Term Frequencies (" + corpus_name + ")",
        "Term Frequencies",
        sorted_array,
        [0, 1, 2, -3, -2, -1],
    )
    return term_frequencies
def collect_and_output_normalized_corpus_term_frequencies(corpus, corpus_name, term_frequencies=None):
    """Write and chart the log-normalized term frequencies of a corpus.

    Every raw count ``f`` is mapped to ``1 + log10(f)``. The (term, value)
    pairs are sorted by value in descending order, written to
    ``normalized_term_frequencies.csv`` and handed to
    ``chart_term_frequencies`` for ``normalized_term_frequencies.png``.

    Args:
        corpus: Corpus passed to ``collect_term_counts`` when
            ``term_frequencies`` is not supplied.
        corpus_name: Name appended to the chart title.
        term_frequencies: Optional precomputed mapping of term -> raw count.
            Counts must be positive; ``math.log10`` raises ``ValueError``
            otherwise.

    Returns:
        The raw (un-normalized) ``term_frequencies`` mapping, so callers can
        reuse it without recounting.
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    output_csv_file = fs.open_csv_file("normalized_term_frequencies.csv", ["Term", "Log Normalized TF"])

    # items() exists on both Python 2 and Python 3 mappings (iteritems() is
    # Python 2 only). math.log10 is used because math.log(x, 10) suffers from
    # rounding error on exact powers of ten (e.g. log(1000, 10) != 3.0).
    sorted_array = sorted(
        ([term, 1 + math.log10(frequency)] for term, frequency in term_frequencies.items()),
        key=lambda term_frequency: term_frequency[1],
        reverse=True,
    )

    # Rows are written in the same descending order that is charted, matching
    # what collect_and_output_corpus_term_frequencies does for raw counts.
    for term, normalized_term_frequency in sorted_array:
        output_csv_file.writerow([term, normalized_term_frequency])

    # output a bar chart illustrating the above
    # NOTE(review): the index list presumably selects the three highest and
    # three lowest entries of sorted_array -- confirm in chart_term_frequencies.
    chart_term_frequencies(
        "normalized_term_frequencies.png",
        "Log Normalized Term Frequencies (" + corpus_name + ")",
        "Term Frequencies",
        sorted_array,
        [0, 1, 2, -3, -2, -1],
    )
    return term_frequencies
def output_corpus_terms(corpus, unique_vocabulary=None):
    """Write each vocabulary term to ``corpus_terms.csv``, one term per row.

    Args:
        corpus: Corpus passed to ``collect_unique_terms`` when no vocabulary
            is supplied.
        unique_vocabulary: Optional iterable of terms to write instead of
            deriving them from ``corpus``.
    """
    vocabulary = collect_unique_terms(corpus) if unique_vocabulary is None else unique_vocabulary

    writer = fs.open_csv_file("corpus_terms.csv", ["Term"])
    for entry in vocabulary:
        # Each term is also echoed to the debug log as it is written.
        logging.debug(entry)
        writer.writerow([entry])
def collect_and_output_frequency_frequencies(corpus, corpus_name, term_frequencies=None):
    """Count how many distinct terms share each term frequency; write and chart it.

    Builds a mapping of ``term frequency -> number of terms with that
    frequency``, writes every pair to ``frequency_frequencies.csv`` (most
    common frequency first) and draws ``frequency_frequencies.png`` from the
    leading entries.

    Args:
        corpus: Corpus passed to ``collect_term_counts`` when
            ``term_frequencies`` is None.
        corpus_name: Name appended to the chart title.
        term_frequencies: Optional precomputed mapping of term -> raw count.
            Defaults to None, consistent with the sibling
            ``collect_and_output_normalized_corpus_term_frequencies``.

    Returns:
        dict mapping each term frequency to the number of terms having it.
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    # Only the counts matter here; the terms themselves are not needed.
    # dict.get replaces the Python 2-only has_key() lookup.
    frequency_frequencies = {}
    for frequency in term_frequencies.values():
        frequency_frequencies[frequency] = frequency_frequencies.get(frequency, 0) + 1

    # Pairs of [term frequency, frequency frequency], most common frequency first.
    sorted_array = sorted(
        ([frequency, occurrences] for frequency, occurrences in frequency_frequencies.items()),
        key=lambda frequency_frequency: frequency_frequency[1],
        reverse=True,
    )

    output_csv_file = fs.open_csv_file("frequency_frequencies.csv", ["Frequency Frequency", "Term Frequency"])
    for term_frequency, frequency_frequency in sorted_array:
        output_csv_file.writerow([frequency_frequency, term_frequency])

    # Only the first 21 entries (indices 0..20) are charted. The labels and the
    # values are each collected into a one-dimensional list; the values are
    # then wrapped in an outer list because the chart takes a 2D array, so
    # there is exactly one data set and no second set to compare it to.
    charted = sorted_array[:21]
    frequencies_to_chart = [pair[0] for pair in charted]
    frequency_frequencies_to_chart = [pair[1] for pair in charted]

    charting.bar_chart(
        "frequency_frequencies.png",
        [frequency_frequencies_to_chart],
        "Frequency Frequencies (" + corpus_name + ")",
        frequencies_to_chart,
        "Frequency Frequency",
        None,
        ['#59799e', '#810CE8', '#FF0000', '#12995D', '#FD53FF', '#AA55CC'],
        0.2,
        0.0,
    )
    return frequency_frequencies
def collect_and_output_frequency_frequencies(corpus, corpus_name, term_frequencies=None):
    """Count how many distinct terms share each term frequency; write and chart it.

    Builds a mapping of ``term frequency -> number of terms with that
    frequency``, writes every pair to ``frequency_frequencies.csv`` (most
    common frequency first) and draws ``frequency_frequencies.png`` from the
    leading entries.

    Args:
        corpus: Corpus passed to ``collect_term_counts`` when
            ``term_frequencies`` is None.
        corpus_name: Name appended to the chart title.
        term_frequencies: Optional precomputed mapping of term -> raw count.
            Defaults to None, consistent with the sibling
            ``collect_and_output_normalized_corpus_term_frequencies``.

    Returns:
        dict mapping each term frequency to the number of terms having it.
    """
    if term_frequencies is None:
        term_frequencies = collect_term_counts(corpus)

    # Only the counts matter here; the terms themselves are not needed.
    # dict.get replaces the Python 2-only has_key() lookup.
    frequency_frequencies = {}
    for frequency in term_frequencies.values():
        frequency_frequencies[frequency] = frequency_frequencies.get(frequency, 0) + 1

    # Pairs of [term frequency, frequency frequency], most common frequency first.
    sorted_array = sorted(
        ([frequency, occurrences] for frequency, occurrences in frequency_frequencies.items()),
        key=lambda frequency_frequency: frequency_frequency[1],
        reverse=True,
    )

    output_csv_file = fs.open_csv_file("frequency_frequencies.csv", ["Frequency Frequency", "Term Frequency"])
    for term_frequency, frequency_frequency in sorted_array:
        output_csv_file.writerow([frequency_frequency, term_frequency])

    # Only the first 21 entries (indices 0..20) are charted. The labels and the
    # values are each collected into a one-dimensional list; the values are
    # then wrapped in an outer list because the chart takes a 2D array, so
    # there is exactly one data set and no second set to compare it to.
    charted = sorted_array[:21]
    frequencies_to_chart = [pair[0] for pair in charted]
    frequency_frequencies_to_chart = [pair[1] for pair in charted]

    charting.bar_chart(
        "frequency_frequencies.png",
        [frequency_frequencies_to_chart],
        "Frequency Frequencies (" + corpus_name + ")",
        frequencies_to_chart,
        "Frequency Frequency",
        None,
        ["#59799e", "#810CE8", "#FF0000", "#12995D", "#FD53FF", "#AA55CC"],
        0.2,
        0.0,
    )
    return frequency_frequencies
def train_classifier(args):
    """Write per-class term probabilities to ``bayes_training.csv``.

    For every training set name, the corpus is loaded through
    ``words.load_text_corpus``, optionally stemmed, and its terms counted.
    One CSV row ``[class, term, probability]`` is written per retained term,
    where the class is the corpus name and the probability is the term's
    count divided by the total number of words loaded for that corpus.

    Args:
        args: Mapping holding one entry per training set name (forwarded to
            ``words.load_text_corpus``) and a ``"stemming"`` flag.
    """
    logging.debug("Training classifier")

    # Use the same corpora that we have used in previous demos
    training_set_names = ["abc", "genesis", "gutenberg", "inaugural", "stateUnion", "webtext", "custom"]

    # Open a CSV file with 3 columns. First column is the name of the corpus (which in this example is also the
    # name of the class). Second is a single term from the corpus. Third is the probability with which the term occurs.
    training = fs.open_csv_file("bayes_training.csv", ["class", "term", "probability"])

    # Ignore stopwords. A set makes the per-term membership test constant time
    # instead of a linear scan of the stopword list.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Iterate through each of the training sets
    for training_set_name in training_set_names:
        # Load the words and corpus name from the requested corpus.
        terms_array, corpus_name = words.load_text_corpus({training_set_name: args[training_set_name]})

        # Stem the terms if stemming is enabled
        if args["stemming"]:
            terms_array = words.stem_words_array(terms_array)

        # Count up the unique terms in the words array
        term_counts = words.collect_term_counts(terms_array)

        # Total number of words in the entire corpus; kept as a float so the
        # division below is a true division on Python 2 as well.
        num_words = float(len(terms_array))

        # Write the probability of each term occurring in the given class out to the CSV
        for term, count in term_counts.items():
            # The term is written lower-cased, so the stopword test uses the
            # same form; otherwise capitalised stop words would slip through.
            normalized_term = term.lower()
            # We ignore stop words and punctuation
            if normalized_term not in stopwords and term.isalnum():
                # The column is declared as "probability": emit the relative
                # frequency rather than the raw count.
                training.writerow([corpus_name, normalized_term, count / num_words])
def collect_and_output_corpus_term_frequencies(corpus, corpus_name):
    """Count the terms of a corpus, then write and chart their frequencies.

    The (term, count) pairs are sorted by count in descending order, written
    to ``term_frequencies.csv`` and handed to ``chart_term_frequencies`` for
    ``term_frequencies.png``.

    Args:
        corpus: Corpus passed to ``collect_term_counts``.
        corpus_name: Name appended to the chart title.

    Returns:
        The term -> count mapping produced by ``collect_term_counts``.
    """
    term_frequencies = collect_term_counts(corpus)
    output_csv_file = fs.open_csv_file("term_frequencies.csv", ["Term", "Frequency"])

    # items() exists on both Python 2 and Python 3 mappings, whereas
    # iteritems() was removed in Python 3.
    sorted_array = sorted(
        ([term, frequency] for term, frequency in term_frequencies.items()),
        key=lambda term_frequency: term_frequency[1],
        reverse=True,
    )

    for term, frequency in sorted_array:
        output_csv_file.writerow([term, frequency])

    # output a bar chart illustrating the above
    # NOTE(review): the index list presumably selects the three highest and
    # three lowest entries of sorted_array -- confirm in chart_term_frequencies.
    chart_term_frequencies(
        "term_frequencies.png",
        "Term Frequencies (" + corpus_name + ")",
        "Term Frequencies",
        sorted_array,
        [0, 1, 2, -3, -2, -1],
    )
    return term_frequencies
def collect_and_output_corpus_term_frequencies(corpus, corpus_name):
    """Count the terms of a corpus, then write and chart their frequencies.

    The (term, count) pairs are sorted by count in descending order, written
    to ``term_frequencies.csv`` and handed to ``chart_term_frequencies`` for
    ``term_frequencies.png``.

    Args:
        corpus: Corpus passed to ``collect_term_counts``.
        corpus_name: Name appended to the chart title.

    Returns:
        The term -> count mapping produced by ``collect_term_counts``.
    """
    term_frequencies = collect_term_counts(corpus)
    output_csv_file = fs.open_csv_file("term_frequencies.csv", ["Term", "Frequency"])

    # items() exists on both Python 2 and Python 3 mappings, whereas
    # iteritems() was removed in Python 3.
    sorted_array = sorted(
        ([term, frequency] for term, frequency in term_frequencies.items()),
        key=lambda term_frequency: term_frequency[1],
        reverse=True,
    )

    for term, frequency in sorted_array:
        output_csv_file.writerow([term, frequency])

    # output a bar chart illustrating the above
    # NOTE(review): the index list presumably selects the three highest and
    # three lowest entries of sorted_array -- confirm in chart_term_frequencies.
    chart_term_frequencies(
        "term_frequencies.png",
        "Term Frequencies (" + corpus_name + ")",
        "Term Frequencies",
        sorted_array,
        [0, 1, 2, -3, -2, -1],
    )
    return term_frequencies
def train_classifier(args):
    """Write per-class term probabilities to ``bayes_training.csv``.

    For every training set name, the corpus is loaded through
    ``words.load_text_corpus``, optionally stemmed, and its terms counted.
    One CSV row ``[class, term, probability]`` is written per retained term,
    where the class is the corpus name and the probability is the term's
    count divided by the total number of words loaded for that corpus.

    Args:
        args: Mapping holding one entry per training set name (forwarded to
            ``words.load_text_corpus``) and a ``"stemming"`` flag.
    """
    logging.debug("Training classifier")

    # Use the same corpora that we have used in previous demos
    training_set_names = ["abc", "genesis", "gutenberg", "inaugural", "stateUnion", "webtext", "custom"]

    # Open a CSV file with 3 columns. First column is the name of the corpus (which in this example is also the
    # name of the class). Second is a single term from the corpus. Third is the probability with which the term occurs.
    training = fs.open_csv_file("bayes_training.csv", ["class", "term", "probability"])

    # Ignore stopwords. A set makes the per-term membership test constant time
    # instead of a linear scan of the stopword list.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Iterate through each of the training sets
    for training_set_name in training_set_names:
        # Load the words and corpus name from the requested corpus.
        terms_array, corpus_name = words.load_text_corpus({training_set_name: args[training_set_name]})

        # Stem the terms if stemming is enabled
        if args["stemming"]:
            terms_array = words.stem_words_array(terms_array)

        # Count up the unique terms in the words array
        term_counts = words.collect_term_counts(terms_array)

        # Total number of words in the entire corpus; kept as a float so the
        # division below is a true division on Python 2 as well.
        num_words = float(len(terms_array))

        # Write the probability of each term occurring in the given class out to the CSV
        for term, count in term_counts.items():
            # The term is written lower-cased, so the stopword test uses the
            # same form; otherwise capitalised stop words would slip through.
            normalized_term = term.lower()
            # We ignore stop words and punctuation
            if normalized_term not in stopwords and term.isalnum():
                # The column is declared as "probability": emit the relative
                # frequency rather than the raw count.
                training.writerow([corpus_name, normalized_term, count / num_words])