def test_authors(p, bag_of_words=True, alpha=0.05, bag_of_char_ngrams=False,
		ngram_len=5, set_of_words=False, complexity_features=False,
		print_predictions=True):
	""" Tests the classifiers with the given feature sets.

	p is the Preprocessor object holding the path data. It must have been initialized by
	calling p.organize_authors()

	If bag_of_words argument is True, the program uses Multinomial Naive Bayes classifier
	with the given alpha and bag of words as the feature set.

	If bag_of_char_ngrams argument is True, the program uses Multinomial Naive Bayes classifier
	with the given alpha, and a bag of character n-grams as the feature set.

	If set_of_words argument is True, the program uses Binarized Multinomial Naive Bayes clasifier
	with the given alpha, and a set of words as the feature set.

	If complexity_features argument is True, the program uses Normalizing Naive Bayes, which
	is my term for a classifier which simply fits all features for all classes into their own normal
	distributions and calculates probabilities using the pdfs.

	Returns a 4-tuple, each being the score tuple a different feature set, in the order they are written
	above. Any feature sets not used will return a score of None.
	"""
	authors = p.get_authors()
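	# One classifier slot per feature-set flag; unused slots stay None so the
	# same index refers to the same feature set throughout this function.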
	classifiers = (MultinomialNaiveBayes(authors, alpha=alpha) if bag_of_words else None,
		MultinomialNaiveBayes(authors, alpha=alpha) if bag_of_char_ngrams else None,
		BinarizedMultinomialNaiveBayes(authors, alpha=alpha) if set_of_words else None,
		NormalizingNaiveBayes(authors, 8) if complexity_features else None)

	# Train the Bayes classifiers on each author's training data
	for author in authors:
		for clsf in classifiers:
			if clsf is not None:
				clsf.add_documents(author, len(p.training_data(author)))

		for data in p.training_data(author):

			# Featurize and add features to the classifiers
			t = Tokenizer(p.file_path(author, data))
			if classifiers[0] is not None:
				classifiers[0].add_feature_counts(author, t.bag_of_words())
			if classifiers[1] is not None:
				classifiers[1].add_feature_counts(author, t.bag_of_char_ngrams(ngram_len))
			if classifiers[2] is not None:
				classifiers[2].add_feature_counts(author, t.bag_of_words())
			if classifiers[3] is not None:
				classifiers[3].add_features(author, classifiers[3].vectorize(t.features()))

	for clsf in classifiers:
		if clsf is not None:
			clsf.train()

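	# One Tester per active classifier, indexed to match the classifiers tuple.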
	testers = (Tester(classifiers[0].get_classes()) if bag_of_words else None,
		Tester(classifiers[1].get_classes()) if bag_of_char_ngrams else None,
		Tester(classifiers[2].get_classes()) if set_of_words else None,
		Tester(classifiers[3].get_classes()) if complexity_features else None)

	# Check the classifier predictions for each test data
	for author in authors:
		for data in p.test_data(author):

			# Featurize and classify
			t = Tokenizer(p.file_path(author, data, training_data=False))
			class_predicted = [None, None, None, None]
			if classifiers[0] is not None:
				class_predicted[0] = classifiers[0].most_probable_class(classifiers[0].vectorize(t.bag_of_words()))
				testers[0].add_stat(class_predicted[0], author)
			if classifiers[1] is not None:
				class_predicted[1] = classifiers[1].most_probable_class(classifiers[1].vectorize(t.bag_of_char_ngrams(ngram_len)))
				testers[1].add_stat(class_predicted[1], author)
			if classifiers[2] is not None:
				class_predicted[2] = classifiers[2].most_probable_class(classifiers[2].vectorize(t.bag_of_words()))
				testers[2].add_stat(class_predicted[2], author)
			if classifiers[3] is not None:
				class_predicted[3] = classifiers[3].most_probable_class(classifiers[3].vectorize(t.features()))
				testers[3].add_stat(class_predicted[3], author)
			if print_predictions:
				print('predicted:', [pr for pr in class_predicted if pr is not None], 'actual:', author)
		
	return (testers[0].scores() if testers[0] is not None else None, testers[1].scores() if testers[1] is not None else None,
		testers[2].scores() if testers[2] is not None else None, testers[3].scores() if testers[3] is not None else None)
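
# A minimal sketch (hypothetical, not the actual NormalizingNaiveBayes
# implementation used above) of the idea described in the docstring: fit a
# normal distribution to each feature of each class, then score a feature
# vector by summing log-pdfs under the naive independence assumption.
import math

def gaussian_log_pdf(x, mean, var):
	# Log of the normal pdf N(x; mean, var), with a variance floor for stability.
	var = max(var, 1e-9)
	return -0.5 * (math.log(2 * math.pi * var) + (x - mean) ** 2 / var)

def normalizing_log_scores(vector, class_stats):
	# class_stats maps each class to a list of per-feature (mean, var) pairs;
	# returns each class's summed log-pdf for the given feature vector.
	return {cls: sum(gaussian_log_pdf(x, mean, var)
			for x, (mean, var) in zip(vector, stats))
		for cls, stats in class_stats.items()}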
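
# Example usage, a sketch under assumptions: the Preprocessor constructor and
# the corpus path are hypothetical; only the methods test_authors relies on
# (organize_authors, get_authors, training_data, test_data, file_path) come
# from the code above.
if __name__ == '__main__':
	p = Preprocessor('data/authors')  # hypothetical corpus location
	p.organize_authors()
	bow_scores, ngram_scores, sow_scores, cf_scores = test_authors(
		p, bag_of_words=True, bag_of_char_ngrams=True, print_predictions=False)
	print('bag of words scores:', bow_scores)
	print('char n-gram scores:', ngram_scores)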