import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Document, Corpus, Vectorspace

# Latent Semantic Analysis (LSA) is a statistical machine learning method
# based on singular value decomposition (SVD).
# It discovers semantically related words across documents.
# The idea is to group the document vectors in a matrix
# (each document is a row, each word in the corpus is a column),
# and then to reduce the number of dimensions, filtering out "noise".

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1, D2, D3, D4])

lsa = corpus.lsa()

print lsa.keywords(D4)
print
print lsa.search("curiosity")

# Document D4 now yields "kill" as a keyword, although this word is not in D4's description.
# However, documents D2 and D4 share "curiosity" as a keyword,
# so D4 inherits some of D2's keywords.
# Performing a search on "curiosity" now also yields document D3 as a result.
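# --------------------------------------------------------------------------------
# As an aside, here is a minimal numpy sketch of the SVD reduction described above.
# This is NOT pattern's implementation: the matrix below uses made-up 0/1 counts
# instead of pattern's tf-idf weights, purely to illustrate the idea.

from numpy import array, diag, dot
from numpy.linalg import svd

# Rows = documents D1-D4; columns = dog, tail, curiosity, kill, cat, pet, science.
m = array([[1., 1., 0., 0., 0., 0., 0.],   # D1 "The dog wags his tail."
           [0., 0., 1., 1., 1., 0., 0.],   # D2 "Curiosity killed the cat."
           [1., 0., 0., 0., 1., 1., 0.],   # D3 "Cats and dogs make good pets."
           [0., 0., 1., 0., 0., 0., 1.]])  # D4 "Curiosity drives science."

u, sigma, vt = svd(m, full_matrices=False)  # m == u * diag(sigma) * vt
sigma[2:] = 0.  # keep the two strongest "concepts", zero out the weakest = remove noise
print dot(u, dot(diag(sigma), vt))
# In the reduced matrix, documents that share a concept start sharing weight for
# each other's words, which is how D4 can pick up "kill" from D2 above.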
# The concept vector for the first document:
print corpus.lsa.vectors[corpus[0].id]
print

# It is a dictionary linking concept id's to a score.
# By itself this is not very helpful,
# but we can look up the words related to a concept id:
#print corpus.lsa.concepts[0]

# That's a lot of words: all words in the corpus have a score in one of the four concepts.
# This is a little abstract,
# so we'll do a new reduction with 100 concepts (or semantic "categories")
# and examine only the salient words for a concept.

corpus.lsa = None
corpus.reduce(100)

for word, weight in corpus.lsa.concepts[1].items():
    if abs(weight) > 0.1:
        print word

# Concept 1 = "truman", "ventura", "ace", "carrey", ... It's obviously about Jim Carrey movies.
# Concept 20 = "ripley", "butcher", "aliens", ... the Alien franchise?
# Concept 40 = "wars", "lucas", "jedi", "phantom", "star", ...
# You'll notice that not all concepts are equally easy to interpret,
# and that some of them mingle two or more core ideas.
# Still, it shows that (with further massaging) LSA can be used
# not only for faster processing but also to discover synonym sets.
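# --------------------------------------------------------------------------------
# A short follow-up sketch: if, as above, corpus.lsa.concepts is a list of
# {word: weight} dictionaries, the salient words per concept can be collected
# into rough synonym sets. The salient() helper is hypothetical, not part of pattern.

def salient(concept, threshold=0.1):
    # Words whose absolute weight in this concept exceeds the threshold.
    return [word for word, weight in concept.items() if abs(weight) > threshold]

for i, concept in enumerate(corpus.lsa.concepts[:5]):
    print i, salient(concept)  # e.g., 1 => ["truman", "ventura", "ace", "carrey", ...]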