Example #1
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Document, Corpus

# Latent Semantic Analysis (LSA) is a statistical machine learning method
# based on singular value decomposition (SVD).
# It discovers semantically related words across documents.
# The idea is to group the document vectors in a matrix
# (each document is a row, each word in the corpus is a column),
# and then to reduce the number of dimensions, filtering out "noise".
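
# As a rough illustration of that matrix view, here is a minimal sketch of
# the reduction itself, done by hand with numpy (an assumption of this sketch;
# pattern does not need or use this code):
import numpy
M = numpy.array([
    [0.5, 0.5, 0.0, 0.0],  # row = document 1, columns = made-up word weights
    [0.0, 0.0, 0.5, 0.5],  # row = document 2
    [0.5, 0.0, 0.0, 0.5]]) # row = document 3
u, sigma, vt = numpy.linalg.svd(M, full_matrices=False)
sigma[2:] = 0 # Keep the 2 strongest singular values ("concepts"), drop the rest as noise.
print numpy.dot(u, numpy.dot(numpy.diag(sigma), vt))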

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1, D2, D3, D4])

lsa = corpus.lsa()

print lsa.keywords(D4)
print
print lsa.search("curiosity")

# Document D4 now yields kill as a keyword, although this word is not in D4's description.
# However, documents D2 and D4 share curiosity as a keyword,
# so D4 inherits some of D2's keywords.
# Performing a search on curiosity now also yields document D3 as a result,
# even though D3 does not literally contain the word curiosity.
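
# To make this inheritance visible for the whole corpus, we can simply
# list the LSA keywords of each document, reusing lsa.keywords() from above:
for document in (D1, D2, D3, D4):
    print document.name
    print lsa.keywords(document)
    print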
Example #2
# The concept vector for the first document:
print corpus.lsa.vectors[corpus[0].id]
print

# It is a dictionary that links concept ids to scores.
# By itself this is not very helpful,
# but we can look up the words related to a concept id:
#print corpus.lsa.concepts[0]

# That's a lot of words.
# Actually, all words in the corpus have a score in one of the four concepts. 
# This is a little bit abstract.
# We'll do a new reduction with 100 concepts (or semantic "categories"),
# and examine only the salient words for a concept.
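
# First, a quick sanity check using the structures shown above:
# the number of concepts, and the number of scored words in the first concept.
print len(corpus.lsa.concepts)
print len(corpus.lsa.concepts[0])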

corpus.lsa = None
corpus.reduce(100)

for word, weight in corpus.lsa.concepts[1].items():
    if abs(weight) > 0.1:
        print word
        
# Concept  1 = "truman", "ventura", "ace", "carrey", ... It's obviously about Jim Carrey movies.
# Concept 20 = "ripley", "butcher", "aliens", ... the Alien-franchise?
# Concept 40 = "wars", "lucas", "jedi", "phantom", "star", ...
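
# The concept labels above were found by scanning every concept in the same way.
# A small helper makes that less tedious (salient() is an illustrative sketch,
# not part of pattern's API):
def salient(concept, n=5):
    # The n words with the highest absolute weight in a concept dictionary.
    return sorted(concept.items(), key=lambda x: -abs(x[1]))[:n]

for i, concept in enumerate(corpus.lsa.concepts):
    print i, [word for word, weight in salient(concept)]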

# You'll notice that not all concepts are equally easy to interpret,
# and that some of them mingle two or more core ideas.
# Still, this should demonstrate that (with some further massaging)
# LSA can be used not only for faster processing, but also to discover synonym sets.
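
# For example, a hedged sketch of how synonym sets might be derived:
# treat each word's weights across all concepts as a vector,
# and compare two words by cosine similarity.
# (word_vector() and cosine() are illustrative helpers, not pattern functions.)
from math import sqrt

def word_vector(word):
    return [concept.get(word, 0.0) for concept in corpus.lsa.concepts]

def cosine(v1, v2):
    s = sum(x * y for x, y in zip(v1, v2))
    n = sqrt(sum(x * x for x in v1)) * sqrt(sum(x * x for x in v2))
    return s / n if n else 0.0

# Words that often appear in the same concepts score close to 1.0:
print cosine(word_vector("jedi"), word_vector("lucas"))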