Example #1
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))  # True: most likely a real message.
print()

# False: people don't talk like this on developer lists...
print(classifier.classify("customer"))
# True: because most likely everyone knows everyone.
print(classifier.classify("guys"))
print()

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv

print(k_fold_cv(NB, documents=m, folds=10))

# This yields 5 scores: (Accuracy, Precision, Recall, F-score, standard deviation).
# Accuracy in itself is not very useful,
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
#    Recall = how accurately false negatives are discarded.
#   F-score = harmonic mean of precision and recall.
#     stdev = folds' variation from average F-score.
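
# The snippet above assumes that `m` already holds labeled training documents.
# A minimal sketch of how such a list might be built with pattern.vector's
# Document(string, type=...) constructor; the strings and labels below are
# made up for illustration, not taken from the example above:
from pattern.vector import Document, NB

m = [
    Document("please review the attached patch", type=True),   # real e-mail
    Document("win money fast, click here", type=False),        # spam
    Document("the unit test fails on Windows", type=True),     # real e-mail
    Document("cheap meds, limited time offer", type=False),    # spam
]

classifier = NB()
for document in m:
    classifier.train(document)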
Example #2
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print classifier.classify("win money") # False: most likely spam.
print classifier.classify("fix bug")   # True: most likely a real message.
print

print classifier.classify("customer")  # False: people don't talk like this on developer lists...
print classifier.classify("guys")      # True: because most likely everyone knows everyone.
print

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed, 
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv
print k_fold_cv(Bayes, documents=m, folds=10)

# This yields 4 scores: (Accuracy, Precision, Recall, F-score).
# Accuracy in itself is not very useful, 
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
#    Recall = how accurately false negatives are discarded.
#   F-score = harmonic mean of precision and recall.
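
# To make the precision / recall definitions above concrete, here is a
# small self-contained sketch that computes them from raw counts.
# The counts are made-up numbers, not output of the classifier above.
tp = 90   # real messages correctly classified as real (true positives)
fp = 10   # spam wrongly classified as real (false positives)
fn = 5    # real messages wrongly classified as spam (false negatives)

precision = float(tp) / (tp + fp)   # of everything labeled "real", how much really is real
recall    = float(tp) / (tp + fn)   # of all real messages, how many were recognized
f_score   = 2.0 * precision * recall / (precision + recall)  # harmonic mean

print precision, recall, f_score    # 0.9, ~0.95, ~0.92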
Example #3
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print classifier.classify("win money")  # False: most likely spam.
print classifier.classify("fix bug")  # True: most likely a real message.
print

print classifier.classify("customer")  # False: people don't talk like this on developer lists...
print classifier.classify("guys")      # True: because most likely everyone knows everyone.
print

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed,
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv
print k_fold_cv(Bayes, documents=m, folds=10)

# This yields 4 scores: (Accuracy, Precision, Recall, F-score).
# Accuracy in itself is not very useful,
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
#    Recall = how accurately false negatives are discarded.
#   F-score = harmonic mean of precision and recall.
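
# A rough sketch of what 10-fold cross validation does under the hood,
# assuming `m` is a list of labeled pattern.vector Documents and that
# classify() also accepts a Document (assumptions for illustration only;
# in real code simply call k_fold_cv() as shown above).
from pattern.vector import NB

def manual_kfoldcv(documents, folds=10):
    accuracy = []
    for i in range(folds):
        # Every folds-th document (offset i) is held out for testing,
        # the remaining ~90% is used for training.
        test  = [d for j, d in enumerate(documents) if j % folds == i]
        train = [d for j, d in enumerate(documents) if j % folds != i]
        nb = NB()
        for d in train:
            nb.train(d)
        correct = sum(1 for d in test if nb.classify(d) == d.type)
        accuracy.append(float(correct) / max(len(test), 1))
    return sum(accuracy) / len(accuracy)  # average accuracy over all folds

print manual_kfoldcv(m, folds=10)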
Example #4
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print(classifier.classify("win money")) # False: most likely spam.
print(classifier.classify("fix bug"))   # True: most likely a real message.
print()

print(classifier.classify("customer"))  # False: people don't talk like this on developer lists...
print(classifier.classify("guys"))      # True: because most likely everyone knows everyone.
print()

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed, 
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv
print(k_fold_cv(NB, documents=m, folds=10))

# This yields 5 scores: (Accuracy, Precision, Recall, F-score, standard deviation).
# Accuracy in itself is not very useful, 
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
#    Recall = how accurately false negatives are discarded.
#   F-score = harmonic mean of precision and recall.
#     stdev = folds' variation from average F-score.
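
# If k_fold_cv() returns the 5-tuple described in the comment above,
# the individual scores can be unpacked and reported separately
# (variable names here are illustrative only):
accuracy, precision, recall, f_score, stdev = k_fold_cv(NB, documents=m, folds=10)
print("accuracy %.2f  precision %.2f  recall %.2f  F-score %.2f  (stdev %.2f)"
      % (accuracy, precision, recall, f_score, stdev))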
Example #5
# Each document has a type: True for real e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = Bayes()
for document in corpus:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:

print classifier.classify("win money") # False: most likely spam.
print classifier.classify("fix bug")   # True: most likely a real message.
print

print classifier.classify("customer")  # False: people don't talk like this on developer lists...
print classifier.classify("guys")      # True: because most likely everyone knows everyone.
print

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
# This means that 10 individual tests are performed, 
# each with 90% of the corpus as training data and 10% as testing data.
from pattern.vector import k_fold_cv
print k_fold_cv(Bayes, folds=10, documents=corpus)

# This yields 4 scores: Accuracy, Precision, Recall and F-score.
# Accuracy in itself is not very useful, 
# since some spam may have been regarded as real messages (false positives),
# and some real messages may have been regarded as spam (false negatives).
# Precision = how accurately false positives are discarded,
# Recall = how accurately false negatives are discarded.
# F-score = harmonic mean of precision and recall.