# Agglomerative clustering with NLTK, followed by LSA and hierarchical
# clustering with pattern. The clusterer, vectors and documents are built
# earlier in the script (see the sketch after this block).
from pattern.vector import Corpus, HIERARCHICAL

clusters = clusterer.cluster(vectors, True)

## k-means clustering
# clusterer = nltk.cluster.KMeansClusterer(2, nltk.cluster.euclidean_distance)
# clusters = clusterer.cluster(U, assign_clusters=True, trace=False)

print "clusterer: ", clusterer
print "clustered: ", vectors
print "As: ", clusters
# print "Means: ", clusterer.means()

# Show the dendrogram.
clusterer.dendrogram().show(leaf_labels=[str(i) for i in range(1, 28)])

## lsa analysis
corpus = Corpus(documents)
corpus.reduce(10)
# for document in corpus:
#     print
#     print document.name
#     for concept, w1 in corpus.lsa.vectors[document.id].items():
#         for word, w2 in corpus.lsa.concepts[concept].items():
#             if w1 != 0 and w2 != 0:
#                 print (word, w1 * w2)

## clustering analysis with pattern's hierarchical clustering
patterncluster = corpus.cluster(method=HIERARCHICAL, k=10, iterations=1000)
print patterncluster, patterncluster.depth
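# The script above assumes that `clusterer`, `vectors` and `documents` were
# created earlier. A minimal sketch of what that setup might look like: the
# `.dendrogram()` call implies NLTK's agglomerative (GAAC) clusterer rather
# than k-means, and the example texts and variable names here are
# illustrative assumptions, not part of the original script.
import nltk.cluster
from numpy import array
from pattern.vector import Document

texts = ["The dog wags his tail.",
         "Curiosity killed the cat.",
         "Cats and dogs make good pets."]
documents = [Document(t, name=str(i + 1)) for i, t in enumerate(texts)]

# Build dense numpy vectors over the shared vocabulary, as NLTK expects.
words = sorted(set(w for d in documents for w in d.vector))
vectors = [array([d.vector.get(w, 0.0) for w in words]) for d in documents]

clusterer = nltk.cluster.GAAClusterer(num_clusters=2)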
import time
from pattern.vector import KNN

# (The corpus of documents is assumed to be built above.)
# This may be too many words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.

# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Reduce the documents to vectors of 4 concepts (about 1/7 of +/-30 words per document).
print "LSA reduction..."
print
corpus.reduce(4)

t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Not bad: accuracy stays about the same and the test runs about 3x faster,
# because each document is now a "4-word summary" of the original review.

# Let's take a closer look at the concepts.
# The concept vector for the first document:
print corpus.lsa.vectors[corpus[0].id]
print
# It is a dictionary linking concept ids to scores.
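# A hedged sketch of inspecting the concepts themselves. Judging from the
# code above, corpus.lsa.concepts holds one dictionary of word -> weight
# per concept; the heapq usage and the top-5 cutoff are illustrative
# choices, not part of the example.
import heapq
for i, concept in enumerate(corpus.lsa.concepts):
    top = heapq.nlargest(5, concept.items(), key=lambda item: abs(item[1]))
    print "concept", i, ":", [word for word, weight in top]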
# Classify the preprocessed tweets file into positive or negative categories.
# A sample file with 20 tweets.
from pattern.vector import Document, Corpus, PORTER, TFIDF

filename = 'sample_20'
f = open(filename, 'r')
lines = f.readlines()
f.close()

# The first 9 tweets are negative, the rest are positive.
Negative = lines[:9]
Positive = lines[9:]
types = [0] * len(Negative) + [1] * len(Positive)

docs = []
for text, label in zip(lines, types):
    vec = Document(text, stopwords=True, stemmer=PORTER, type=label)
    docs.append(vec)

corpus = Corpus(documents=docs, weight=TFIDF)
print corpus.vectors

corpus.reduce(3)
print "LSA concept vectors:"
print corpus.lsa.vectors
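# The header comment promises classification, but the script above stops at
# the LSA reduction. A hedged sketch of the missing step, assuming pattern's
# Bayes classifier picks up each document's type as its label; the
# leave-one-out split is an illustrative choice.
from pattern.vector import Bayes
classifier = Bayes()
for document in docs[:-1]:
    classifier.train(document)  # uses document.type (0 or 1) as the label
print "predicted type:", classifier.classify(docs[-1])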
# Latent Semantic Analysis (LSA) is a statistical machine learning method
# based on a matrix calculation called "singular value decomposition" (SVD).
# It discovers semantically related words across documents.
# It groups these into different "concepts"
# and creates a "concept vector" instead of a word vector for each document.
# This reduces the amount of data to work with (for example when clustering),
# and filters out noise, so that semantically related words come out stronger.
from pattern.vector import Document, Corpus

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1, D2, D3, D4])
print corpus.search("curiosity")
print

corpus.reduce()

# A search on the reduced concept space also yields D3 ("pet") as a result,
# since D2 and D3 are slightly similar even though D3 does not explicitly contain "curiosity".
# Note how the results also yield stronger similarity scores (noise was filtered out).
print corpus.search("curiosity")
print

# The concept vector for document D1:
#print corpus.lsa.vectors[D1.id]
#print
# The word scores for each concept:
#print corpus.lsa.concepts
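# A minimal numpy sketch of the SVD step that LSA performs under the hood
# (pattern does this internally); the toy term-document matrix and the
# rank-2 truncation are illustrative assumptions.
from numpy import array, diag, dot
from numpy.linalg import svd

# Rows = documents, columns = word counts over a tiny shared vocabulary.
M = array([[1.0, 1.0, 0.0, 0.0, 0.0],
           [0.0, 0.0, 1.0, 1.0, 0.0],
           [1.0, 0.0, 1.0, 0.0, 1.0],
           [0.0, 0.0, 0.0, 1.0, 1.0]])

U, sigma, Vt = svd(M, full_matrices=False)

# Keep only the 2 strongest singular values: the "concepts".
k = 2
reduced = dot(U[:, :k], dot(diag(sigma[:k]), Vt[:k, :]))
print reduced  # low-rank approximation; related words now share weight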