import numpy as np
from collections import Counter
from pattern.vector import Document, Model, TFIDF, L2
# Sentence tokenizer: pattern.en.tokenize is assumed here; any splitter that
# returns a list of sentence strings will do.
from pattern.en import tokenize


def word_ranking(text, n=L2):
    """
    Rank the most relevant words in a text with LSA.
    Steps:
      1. tokenize the text into sentences
      2. compute the tf-idf matrix
      3. apply SVD to the tf-idf matrix (reduce to n dimensions)
      4. rank terms with the cross method
         (source: http://www.aclweb.org/anthology/C10-1098.pdf)
    - text: a string consisting of a few sentences
    - n: number of dimensions kept by Model.reduce()
         (default L2: the L2-norm of the singular values)
    """
    # tokenize the text into a list of sentences
    sentences = tokenize(text)

    #==========================================================================
    # # syntactic filter: keep only adjectives and nouns
    # exclude_list = []
    # for sent in sentences:
    #     for word, pos in tag(sent):
    #         if pos not in ("JJ", "NN"):
    #             exclude_list.append(word.lower())
    #==========================================================================

    # create the document list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # initialize the model
    m = Model(docs, weight=TFIDF)

    # number of dimensions equal to the Euclidean norm of the singular values
    # U, S, Vt = np.linalg.svd(m.vectors, full_matrices=False)
    # dimensions = int(round(np.linalg.norm(S, 2)))
    m.reduce(dimensions=n)

    # term selection according to the cross method
    # source: http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf

    # Vt: topics (rows) x tokens (columns) of the tf-idf matrix
    V = np.array(m.lsa.vt)

    # average score per concept/topic, i.e. per row of Vt
    avg_score = np.mean(V, axis=1).reshape((-1, 1))

    # cell values less than or equal to the average score are set to zero
    V[V <= avg_score] = 0.0

    # sigma matrix from the SVD
    S = np.array(m.lsa.sigma).reshape((-1, 1))

    # total length of each term vector
    length = np.sum(V * S, axis=0)

    # rank words by their length score
    ranking = Counter(dict(zip(m.lsa.terms, length)))  # .most_common(n)
    # words, score = list(zip(*ranking))
    return ranking
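# Usage sketch: the sample text and n=2 are illustrative, just to show that
# word_ranking() returns a Counter of terms ranked by the cross-method score.
if __name__ == '__main__':
    sample = ('The cat purrs. Curiosity killed the cat. '
              'The dog wags his tail. The dog is happy.')
    for word, score in word_ranking(sample, n=2).most_common(5):
        print(word, score)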
import io
import string
from os import path

from textblob import TextBlob  # assumed: the textblob package
from pattern.vector import Document, Model

# fio, PageParser, cachedStopWords (and the commented-out er / write_cluster)
# are project-specific helpers assumed to be importable in this module.


def getMod():
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
            text = ' '.join([word for word in text.words
                             if word not in cachedStopWords]).lstrip()
            #ent_text = ' '.join(er.recognize_entities(text.sentences))
            #ent_text = PageParser.parse(w.read())
            docs.append(Document(text, name=f, top=40))

    m = Model(docs)
    lsa = m.reduce(5)
    return lsa
    # NOTE: everything below is unreachable because of the early return above.

    # Clustering could be a useful technique, commenting out for now
    #with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
    #    write_cluster(m.cluster(method=HIERARCHICAL, k=4), w, "")

    with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
        for i, concept in enumerate(m.lsa.concepts):
            print("Concept {0}:".format(i)),
            w.write(unicode("Concept {0}:".format(i)))
            count = 0
            # Show only the first 5 features we come across
            for feature, weight in m.lsa.concepts[i].items():
                if abs(weight) > 0.2:
                    print(feature),
                    w.write(feature + " ")
                    count += 1
                if count > 5:
                    break
            w.write(unicode('\n'))
            #print

            # assign each document to the concept on which it loads most heavily
            cat_docs = []
            for d in m.documents:
                cat = (0, 0, {})
                #print d.name.split('\\')[-1]
                for idx, weight in m.lsa.vectors[d.id].items():
                    print "\tCat {0}: {1}".format(idx, weight)
                    if abs(weight) > abs(cat[1]) or cat[1] == 0:
                        cat = (idx, weight, d)
                if cat[0] == i:
                    cat_docs.append(cat)
                    #print "\t{0}".format(d.name.split('\\')[-1])

            cat_docs.sort(key=lambda tup: abs(tup[1]), reverse=True)
            for cat, weight, d in cat_docs:
                f = d.name.split('\\')[-1]
                w.write(unicode("\t{0} - {1}\n").format(
                    filter(lambda x: x in string.printable, f), weight))
# (Excerpt from a longer example: `m` is a Model of labeled movie-review
# Documents built earlier; KNN.test() cross-validates a k-NN classifier on it.)
import time
from pattern.vector import KNN

# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.
# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.

# Let's take a closer look at the concepts.
# The concept vector for the first document:
print(m.lsa.vectors[m[0].id])
print()

# It is a dictionary of concept id's (instead of features).
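# A minimal sketch of how a model like `m` above could be built, assuming
# pattern.vector; the review texts and labels here are made up, while the
# actual example uses a corpus of movie reviews with top=40 features per document.
from pattern.vector import Document, Model, TFIDF

reviews = [
    ('A moving, beautifully acted film.', True),
    ('An instant classic, smart and funny.', True),
    ('A dull plot and wooden performances.', False),
    ('Two hours I will never get back.', False),
]
m = Model([Document(text, type=label, top=40) for text, label in reviews],
          weight=TFIDF)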
from pattern.vector import Document, Model, Cluster, TFIDF, HIERARCHICAL

# The start of this snippet was truncated: the texts of d1 and d2 below are
# reconstructed placeholders; only their types ('tiger', 'lion') and d3 are
# from the original.
d1 = Document('A tiger is a big yellow cat with stripes.',
              type='tiger')
d2 = Document('A lion is a big yellow cat with a mane.',
              type='lion',
              )
d3 = Document('An elephant is a big grey animal with a slurf.',
              type='elephant')

print d1.vector

m = Model(documents=[d1, d2, d3], weight=TFIDF)

print d1.vector
print m.similarity(d1, d2)  # tiger vs. lion
print m.similarity(d1, d3)  # tiger vs. elephant

# lsa concept space
d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')

m = Model([d1, d2, d3, d4])
m.reduce(2)

for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print(feature, w1 * w2)

# clustering
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')

m = Model((d1, d2, d3))
print m.cluster(method=HIERARCHICAL, k=2)

# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
from pattern.vector import Document, Model

d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')

m = Model([d1, d2, d3, d4])
m.reduce(4)

for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        print m.lsa.vectors[d.id].items()            # matrix U
        for feature, w2 in m.lsa.concepts[concept].items():
            #print m.lsa.concepts[concept].items()   # matrix Vt
            if w1 != 0 and w2 != 0:
                print(feature, w1 * w2)
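# A minimal follow-up sketch (reusing `m` from above): list the strongest
# features per LSA concept, ranked by absolute weight. The top-3 cut-off is
# illustrative, not part of the original example.
for i, concept in enumerate(m.lsa.concepts):
    top = sorted(concept.items(), key=lambda kv: abs(kv[1]), reverse=True)[:3]
    print i, [feature for feature, weight in top]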
# -*- coding: utf-8 -*-
from json import load
from pattern.vector import Document, Model, L2

packages = load(open("packages.json"))

docs = [Document(p['description'], name=p['name']) for p in packages]

model = Model(docs)
lsa = model.reduce(L2)
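# Illustrative follow-up: after the reduction, package descriptions can be
# compared in concept space; the choice of the first two documents is arbitrary.
if len(docs) >= 2:
    print(model.similarity(docs[0], docs[1]))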
import cPickle as pickle
import pymongo
from pattern.vector import Document, Model, TFIDF, L2

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto
docs = []

# One-off pickling of the tweets, so the model can be rebuilt without MongoDB:
# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'], name=tweet['id'])
#         pickle.dump(doc, fp)
#     fp.close()

m = Model(documents=[], weight=TFIDF)

# Quick test: load only 1/100 of the pickled documents.
with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in range(tweets.count() / 100):
        print 'Loading model'
        m.append(pickle.load(fp))
    print len(m.documents)

# Full load of all pickled documents.
with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in xrange(tweets.count()):
        print 'Loading model'
        m.append(pickle.load(fp))
    print len(m.documents)

print len(m.documents)
m.reduce(dimensions=L2)
m.save('D:\\data\\tweets_lsa.slp')  # illustrative path; the original left m.save uncalled