Example #1
0
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Corpus

# In the previous example we saw that words in a Document have a weight,
# based on how many times the word occurs in the document.
# This is called term frequency (TF).

# A better measure is term frequency - inverse document frequency (TF-IDF).
# If "important" is the most important word in a document,
# but it also occurs frequently in many other documents, then it is not important at all.

# The Corpus object groups a number of documents
# so their words can be compared to calculate TF-IDF.

corpus = Corpus.build(os.path.join("corpus", "*.txt"))
d = corpus.document(name="lion")

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf("food") # TF-IDF is less: "food" is also mentioned with the other animals.
print

# This allows us to compare how similar two documents are,
# based on the "vector" of word tf-idf frequencies of each document.
# This is called cosine-similarity:
d1 = corpus.document(name="lion")
d2 = corpus.document(name="tiger")
d3 = corpus.document(name="dolphin")
d4 = corpus.document(name="shark")
Example #2
0
sys.path.append(os.path.join("..", "..", ".."))

from pattern.vector import Corpus

# In the previous example we saw that words in a Document have a weight,
# based on how many times the word occurs in the document.
# This is called term frequency (TF).

# A better measure is term frequency - inverse document frequency (TF-IDF).
# If "important" is the most important word in a document,
# but it also occurs frequently in many other documents, then it is not important at all.

# The Corpus object groups a number of documents
# so their words can be compared to calculate TF-IDF.

corpus = Corpus.build(os.path.join("corpus", "*.txt"))
d = corpus.document(name="lion")

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf(
    "food")  # TF-IDF is less: "food" is also mentioned with the other animals.
print

# This allows us to compare how similar two documents are,
# based on the "vector" of word tf-idf frequencies of each document.
# This is called cosine-similarity:
d1 = corpus.document(name="lion")
d2 = corpus.document(name="tiger")
d3 = corpus.document(name="dolphin")
Example #3
0
from pattern.vector import Corpus

# In the previous example we saw that words in a Document have a weight
# based on how many times the word occurs in the document.
# This is called term frequency (TF).

# Another interesting measure is term frequency-inverse document frequency (TF-IDF).
# If "important" is the most important word in a document,
# but it also occurs frequently in many other documents, then it is not important at all.

# The Corpus object groups a number of documents
# so that their words can be compared to calculate TF-IDF.

# You can build a corpus from a folder of text files.
# We supply a naming function that names individual documents based on the file path.
corpus = Corpus.build(os.path.join("corpus", "*.txt"), name=lambda path: os.path.basename(path)[:-4])

d = corpus.document(name="lion") # Filename is now used as document name.

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf("food") # TF-IDF is less: "food" is also mentioned with the other animals.
print

# This allows us to compare how similar two documents are.
# The corpus can be represented as a matrix with documents as rows
# and words as columns. Each document row is called a "vector",
# containing the TF-IDF scores for word columns.
# We can compare two vectors by calculating their angle,
# this is called "cosine-similarity":
Example #4
0
from pattern.vector import Corpus

# In the previous example we saw that words in a Document have a weight
# based on how many times the word occurs in the document.
# This is called term frequency (TF).

# Another interesting measure is term frequency-inverse document frequency (TF-IDF).
# If "important" is the most important word in a document,
# but it also occurs frequently in many other documents, then it is not important at all.

# The Corpus object groups a number of documents
# so that their words can be compared to calculate TF-IDF.

# You can build a corpus from a folder of text files.
# We supply a naming function that names individual documents based on the file path.
corpus = Corpus.build(os.path.join("corpus", "*.txt"),
                      name=lambda path: os.path.basename(path)[:-4])

d = corpus.document(name="lion")  # Filename is now used as document name.

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf(
    "food")  # TF-IDF is less: "food" is also mentioned with the other animals.
print

# This allows us to compare how similar two documents are.
# The corpus can be represented as a matrix with documents as rows
# and words as columns. Each document row is called a "vector",
# containing the TF-IDF scores for word columns.
# We can compare two vectors by calculating their angle,