Example #1
def train_classifier(args):
    logging.debug("Training classifier")

    training_corpora = {}

    # Use the same corpora that we have used in previous demos
    training_set_names = [
        "abc", "genesis", "gutenberg", "inaugural", "stateUnion", "webtext",
        "custom"
    ]

    # Open a CSV file with 3 columns.  First column is the name of the corpus (which in this example is also the
    # name of the class).  Second is a single term from the corpus.  Third is the number of times the term occurs
    # in that corpus (the raw count; per-class probabilities are derived from these counts at classification time).
    training = fs.open_csv_file("bayes_training.csv",
                                ["class", "term", "probability"])

    # Ignore stopwords
    stopwords = nltk.corpus.stopwords.words('english')

    # Iterate through each of the training sets
    for training_set_name in training_set_names:

        # Load the words and corpus name from the requested corpus.
        terms_array, corpus_name = words.load_text_corpus(
            {training_set_name: args[training_set_name]})

        # Stem the terms if stemming is enabled
        if args["stemming"]:
            terms_array = words.stem_words_array(terms_array)

        # Count up the unique terms in the words array
        term_counts = words.collect_term_counts(terms_array)

        # Get the total number of words in the entire corpus
        num_words = float(len(terms_array))

        # Write the frequency of each term occurring in the given class out to the CSV
        for term, count in term_counts.items():
            # We ignore stop words and punctuation
            if term not in stopwords and term.isalnum():
                training.writerow([corpus_name, term.lower(), count])
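The classify() example below depends on a load_training_data() helper that is not shown on this page. Here is a minimal sketch of what such a loader could look like, assuming the bayes_training.csv layout written above and a plain csv.reader; the project's actual loader may differ.

import csv

def load_training_data():
    # Hypothetical loader: rebuild {class_name: {term: count}} plus the list of
    # class names from the CSV produced by train_classifier() above.  The file
    # name and column order come from the training code; the rest is an assumption.
    training_data = {}
    with open("bayes_training.csv", "r") as csv_file:
        reader = csv.reader(csv_file)
        next(reader)  # skip the ["class", "term", "probability"] header row
        for class_name, term, count in reader:
            training_data.setdefault(class_name, {})[term] = float(count)

    return training_data, list(training_data.keys())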
Example #2
def classify(args):

    file_name = args["classify"]
    logging.debug("Classify " + file_name)

    # Load the training data and class names.
    training_data, class_names = load_training_data()

    # Read in the document to classify
    with codecs.open(args["classify"], "r", "utf-8") as document_file:
        to_classify = document_file.read()

    # Tokenize the document to classify.
    to_classify_terms = nltk.word_tokenize(to_classify)

    # If we have enabled stemming then stem these words
    if args["stemming"]:
        to_classify_terms = words.stem_words_array(to_classify_terms)

    # We are now ready to actually classify the document.  We need to determine
    # the probability that our document (D) is a member of each of our classes (C).
    # We calculate this probability by taking the product of the probability that
    # each word in the document belongs to the class C (this is the Naive aspect of
    # the classifier - we make the assumption that each word probability is independent
    # of all other word probabilities).  This calculates the probability of the
    # words in this document given a class C -> P(w|c)
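    # In log space that product becomes a sum:  log P(w|c) = sum of log P(t|c) over
    # every term t in the document, which is why the loop below accumulates with "+=".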
    class_probabilities = {}

    # In this example, each class consists of just one document.  The probability
    # that a document falls in a class is therefore 1 / the number of classes.  We
    # use the log probability to counteract the effect of the product of many near-0
    # probabilities.  In our example we are actually calling each corpus a single document,
    # so the probability of a given document is 1 / the number of corpora.  If this weren't
    # the case we'd track the number of documents per category.  Categories with lots of
    # documents would have higher probabilities of being picked by the classifier because
    # this term would be relatively high when compared to other categories.
    log_probability_of_class = math.log(1.0 / len(class_names))
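    # For example, with the seven training corpora used above this is log(1.0 / 7) ≈ -1.95.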

    stopwords = nltk.corpus.stopwords.words('english')

    # We need the total vocabulary size in order to do Laplace smoothing
    vocabulary_size = calculate_vocabulary_size(training_data)

    logging.debug("Total vocabulary size " + str(vocabulary_size) + " terms")

    # Calculate the word probability product for each class P(w|c)
    for class_name in class_names:

        logging.debug("Calculating log probability for class " + class_name)

        # keeping everything log probabilities - math.log(1) = 0
        log_probability_of_words_in_class = math.log(1)

        # We need the number of terms in the class (note - NOT unique terms)
        number_of_terms_in_class = calculate_number_of_terms_in_class(training_data[class_name])

        logging.debug("Class contains " + str(number_of_terms_in_class) + " terms")

        # Take the product of all the probabilities of a term appearing in the class as
        # calculated during training
        for term in to_classify_terms:

            # Treat capitalized and lowercase as a single term
            term = term.lower()

            # We ignore stop words and punctuation
            if term not in stopwords and term.isalnum():

                # We have to smooth the probabilities of unknown words so that a term we don't recognize is
                # treated as having a very small probability.  Giving it a probability of 1 would leave the
                # product unchanged, but in truth unrecognized terms should be treated as rare rather than
                # common.  Here we use Laplace smoothing (also called add-one smoothing).
                if term in training_data[class_name]:
                    term_frequency = float(training_data[class_name][term])
                else:
                    term_frequency = 0.0

                # Add-one smoothing gives unseen terms a small but non-zero probability
                term_probability_in_trained_class = (term_frequency + 1) / (number_of_terms_in_class + vocabulary_size)
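                # e.g. an unseen term (frequency 0) in a class of 5,000 terms with a 20,000-term
                # vocabulary gets (0 + 1) / (5000 + 20000) = 4e-05 rather than 0.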

                if args["printProbabilities"]:
                    logging.warning("The word <" + term + "> occurs with frequency " + str(term_frequency) + " and probability " + str(term_probability_in_trained_class))

                # Log probabilities are summed so the running total does not underflow toward 0 as it would if we multiplied many small numbers
                log_probability_of_words_in_class += math.log(term_probability_in_trained_class)

        # We now know P(c) and P(w|c).  We are planning to use Bayes Theorem:
        # P(A|B) = P(B|A) * P(A) / P(B) to learn P(c|w) - the probability of
        # a class given the words in a document.  Plugging into Bayes Theorem:
        # P(c|w) = P(w|c) * P(c) / P(w).  P(w) is only a function of the words
        # in the document we are classifying, and it therefore can be considered
        # constant across classes.  We can therefore drop it.  So now we have
        # P(c|w) = P(w|c) * P(c).
        class_probabilities[class_name] = log_probability_of_words_in_class + log_probability_of_class

    logging.debug("")

    # We now have a bunch of probabilities, one per class.  We simply take the class associated
    # with the highest probability and label the document as belonging to that class.
    max_class = None
    max_probability = -sys.float_info.max
    for class_name, probability in class_probabilities.items():
        logging.debug("Probability of " + class_name + " is " + str(probability))
        if probability > max_probability:
            max_probability = probability
            max_class = class_name

    return max_class, max_probability
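classify() also calls two counting helpers, calculate_vocabulary_size() and calculate_number_of_terms_in_class(), that are not included in these examples. A minimal sketch of how they could be implemented, assuming training_data has the {class_name: {term: count}} shape described above; the real helpers may be defined differently.

def calculate_vocabulary_size(training_data):
    # Hypothetical implementation: the vocabulary is the set of distinct terms
    # observed across all classes during training.
    vocabulary = set()
    for term_counts in training_data.values():
        vocabulary.update(term_counts.keys())
    return len(vocabulary)


def calculate_number_of_terms_in_class(class_term_counts):
    # Hypothetical implementation: the total (non-unique) number of terms in the
    # class, i.e. the sum of the per-term counts written out during training.
    return sum(class_term_counts.values())

With these pieces in place, a call such as classify({"classify": "review.txt", "stemming": False, "printProbabilities": False}) would return the winning class name and its log probability; the file name and flag values here are only illustrative.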