clusters = clusterer.cluster(vectors, assign_clusters=True)

## k-means clustering
# clusterer = nltk.cluster.KMeansClusterer(2, nltk.cluster.euclidean_distance)
# clusters = clusterer.cluster(U, assign_clusters=True, trace=False)

print "clusterer: ", clusterer
print "clustered: ", vectors
print "As: ", clusters
# print "Means: ", clusterer.means()

# show the dendrogram
clusterer.dendrogram().show(leaf_labels=[str(i) for i in range(1, 28)])
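
# A hedged sketch (not part of the original script): the commented-out k-means lines
# above can be run against the same `vectors`; `U` there presumably refers to an
# SVD-reduced matrix that is not defined in this fragment, so we simply reuse `vectors`.
kmeans = nltk.cluster.KMeansClusterer(2, nltk.cluster.euclidean_distance)
kmeans_assignments = kmeans.cluster(vectors, assign_clusters=True, trace=False)
print "k-means assignments: ", kmeans_assignments
print "k-means means: ", kmeans.means()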

## lsa analysis
corpus = Corpus(documents)
corpus.reduce(10)



# for document in corpus:
# 	print
# 	print document.name
# 	for concept, w1 in corpus.lsa.vectors[document.id].items():
# 		for word, w2 in corpus.lsa.concepts[concept].items():
# 			if w1 !=0 and w2 !=0:
# 				print (word, w1*w2)
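
# A hedged, runnable variant of the commented-out loop above (a sketch): for each
# document, combine its concept weights (w1) with each concept's word weights (w2)
# and keep only the strongest word contributions.
for document in corpus:
	contributions = {}
	for concept, w1 in corpus.lsa.vectors[document.id].items():
		for word, w2 in corpus.lsa.concepts[concept].items():
			if w1 != 0 and w2 != 0:
				contributions[word] = contributions.get(word, 0) + w1 * w2
	top5 = sorted(contributions.items(), key=lambda pair: -abs(pair[1]))[:5]
	print document.name, "=>", top5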

## clustering analysis by pattern's hierarchical
patterncluster = corpus.cluster(method=HIERARCHICAL, k=10, iterations=1000)
print patterncluster, patterncluster.depth
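
# A hedged peek inside the hierarchical result (a sketch): the returned Cluster is a
# nested list whose leaves are Documents, so Cluster.flatten() should give the documents
# and Cluster.depth the height of the tree (API as in pattern.vector; adjust if your
# version differs).
print "documents in cluster tree:", len(patterncluster.flatten())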
Example #2
# This may be too many words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.

# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words).
print "LSA reduction..."
print
corpus.reduce(4)

t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Not bad, accuracy is about the same but performance is 3x faster,
# because each document is now a "4-word summary" of the original review.
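
# A hedged side-note (a sketch, not in the original example): the same corpus can also
# be used to train a single KNN classifier directly. This assumes each review Document
# was created with a type label (e.g., True/False for positive/negative), as in
# pattern's movie-review examples.
knn = KNN()
for document in corpus:
	knn.train(document, type=document.type)
print "predicted label:", knn.classify(Document("a funny, engaging movie"))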

# Let's take a closer look at the concepts.
# The concept vector for the first document:
print corpus.lsa.vectors[corpus[0].id]
print

# It is a dictionary linking concept id's to a score.
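
# Each concept is in turn a dictionary of word -> weight, so the strongest words behind
# the first document's concepts can be listed like this (a hedged sketch, reusing the
# corpus.lsa.vectors / corpus.lsa.concepts access shown elsewhere in these examples):
for concept, w1 in corpus.lsa.vectors[corpus[0].id].items():
	words = sorted(corpus.lsa.concepts[concept].items(), key=lambda pair: -abs(pair[1]))[:5]
	print "concept", concept, "strongest words:", [word for word, w2 in words]
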
# Classify the preprocessed tweets file into positive or negative categories.
# Sample file with 20 tweets.
from pattern.vector import Document, Bayes, LSA, Corpus, PORTER, TFIDF
from numpy import diag, dot
from numpy.linalg import svd

filename = 'sample_20'
f = open(filename, 'r')
lines = f.readlines()
# The first 9 tweets in the file are negative (label 0), the remaining ones positive (label 1).
Positive = lines[9:]
Negative = lines[:9]

docs = []
for i, text in enumerate(lines):
	# Exclude stopwords, stem with the Porter stemmer, and label each tweet by its position.
	vec = Document(text, stopwords=False, stemmer=PORTER, type=0 if i < 9 else 1)
	docs.append(vec)

corpus=Corpus(documents=docs,weight=TFIDF)
print corpus.vectors
corpus.reduce(3)  # LSA reduction to 3 concepts
print "reduced (LSA) concept vectors:"
print corpus.lsa.vectors

f.close()
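
# The Bayes import above is never used; a hedged sketch of finishing the job and
# classifying the labelled tweets with it (assumes the 0/1 type labels assigned in
# the loop above):
bayes = Bayes()
for doc in docs:
	bayes.train(doc, type=doc.type)
print "prediction for the first tweet:", bayes.classify(docs[0])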
Example #4
# LSA (latent semantic analysis) is based on a matrix calculation called "singular value decomposition" (SVD).
# It discovers semantically related words across documents.
# It groups these into different "concepts" 
# and creates a "concept vector" instead of a word vector for each document.
# This reduces the amount of data to work with (for example when clustering),
# and filters out noise, so that semantically related words come out stronger. 
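
# A hedged illustration with plain numpy (not pattern's internal code): decompose a toy
# document-term matrix with SVD, keep only the k largest singular values, and rebuild a
# lower-rank approximation in which related words and documents reinforce each other.
from numpy import array, diag, dot
from numpy.linalg import svd
M = array([[1.0, 1, 0, 0],   # toy document-term matrix: rows = documents,
           [0, 1, 1, 0],     # columns = word counts
           [1, 0, 1, 1],
           [0, 0, 0, 1]])
U, sigma, Vt = svd(M, full_matrices=False)
k = 2  # keep the two strongest "concepts"
Mk = dot(U[:, :k], dot(diag(sigma[:k]), Vt[:k, :]))
print Mk  # rank-k approximation of M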

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1,D2,D3,D4])

print corpus.search("curiosity")
print

corpus.reduce()

# A search on the reduced concept space also yields D3 ("pet") as a result,
# since D2 and D3 are slightly similar, even though D3 does not explicitly contain "curiosity".
# Note how the results also yield stronger similarity scores (noise was filtered out).
print corpus.search("curiosity")
print
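
# As an extra hedged check (a sketch): Corpus.similarity() returns the cosine similarity
# between two documents, and in the reduced concept space the "cat" and "pet" documents
# should score noticeably alike.
print corpus.similarity(D2, D3)
print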

# The concept vector for document D1:
#print corpus.lsa.vectors[D1.id]
#print

# The word scores for each concept:
#print corpus.lsa.concepts