Example #1
from collections import Counter

import numpy as np

# tokenize() must split text into a list of sentence strings;
# pattern.en.tokenize() is assumed here, the original may have used another tokenizer.
from pattern.en import tokenize
from pattern.vector import Document, Model, TFIDF


def word_ranking(text, n='L2'):
    """
    Rank the words of a text by relevance using the LSA algorithm.
    Steps:
    1. tokenize the text into sentences
    2. compute the tf-idf matrix
    3. apply SVD to the tf-idf matrix (reduce to n dimensions)
    4. rank words with the cross method (source: http://www.aclweb.org/anthology/C10-1098.pdf)

    - text: string consisting of a few sentences
    - n: number of dimensions (concepts) to keep in the SVD reduction;
         the default 'L2' derives it from the L2 norm of the singular values
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    #==============================================================================
    #     # syntactic filter
    #     exclude_list = []
    #     for sent in sentences:
    #         for word, pos in tag(sent):
    #             if pos not in ("JJ", "NN"):  # keep only adjectives and nouns
    #                 exclude_list.append(word.lower())
    #==============================================================================

    # create the documents list
    # (stop words and punctuation are removed by default)
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # number of dimensions equal to the Euclidean (L2) norm of the singular values:
    # U, S, Vt = np.linalg.svd(m.vectors, full_matrices=False)
    # dimensions=int(round(np.linalg.norm(S, 2)))
    m.reduce(dimensions=n)

    # term selection according to the cross method
    # source: http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    # Vt: topics (rows) x tokens (columns) matrix derived from the tf-idf matrix
    V = np.array(m.lsa.vt)

    # average score for each concept/topic (row-wise mean of Vt)
    avg_score = np.mean(V, axis=1).reshape((-1, 1))

    # cell values which are less than or equal to the average score are set to zero
    V[V <= avg_score] = 0.0

    # sigma (singular values) from the SVD, as a column vector
    S = np.array(m.lsa.sigma).reshape((-1, 1))

    # total length (score) of each term vector
    length = np.sum(V * S, axis=0)

    # rank words by their length score
    ranking = Counter(dict(zip(m.lsa.terms, length)))  #.most_common(n)

    #words, score =  list(zip(*ranking))

    return ranking
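A minimal usage sketch for the function above; the sample text, the choice of n=2 and the top-5 cut-off are illustrative assumptions, not part of the original example:

text = ("Cats are independent pets. "
        "Dogs are loyal pets that love their owners. "
        "Boxes are made of cardboard. "
        "Cats often sleep in cardboard boxes.")
ranking = word_ranking(text, n=2)  # reduce the tf-idf matrix to 2 concepts
print ranking.most_common(5)       # top 5 terms by cross-method score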
Example #2
def getMod():
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
            text = ' '.join([
                word for word in text.words if word not in cachedStopWords
            ]).lstrip()
            #ent_text = ' '.join(er.recognize_entities(text.sentences))
            #ent_text = PageParser.parse(w.read())
            docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa  # NOTE: the code below this return is never reached as written
    # Clustering could be a useful technique, commenting out for now
    #with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
    #	write_cluster(m.cluster(method=HIERARCHICAL, k=4), w, "")

    with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
        for i, concept in enumerate(m.lsa.concepts):
            print("Concept {0}:".format(i)),
            w.write(unicode("Concept {0}:".format(i)))
            count = 0
            # Show only the first 5 features we come across
            for feature, weight in m.lsa.concepts[i].items():
                if abs(weight) > 0.2:
                    print(feature),
                    w.write(feature + " ")
                    count += 1

                if count > 5:
                    break
            w.write(unicode('\n'))
            #print

            cat_docs = []
            for d in m.documents:
                cat = (0, 0, {})
                #print d.name.split('\\')[-1]
                for idx, weight in m.lsa.vectors[d.id].items():
                    print "\tCat {0}: {1}".format(idx, weight)
                    if abs(weight) > abs(cat[1]) or cat[1] == 0:
                        cat = (idx, weight, d)

                if cat[0] == i:
                    cat_docs.append(cat)
                    #print "\t{0}".format(d.name.split('\\')[-1])

            cat_docs.sort(key=lambda tup: abs(tup[1]), reverse=True)
            for cat, weight, d in cat_docs:
                f = d.name.split('\\')[-1]
                w.write(
                    unicode("\t{0} - {1}\n").format(
                        filter(lambda x: x in string.printable, f), weight))
Example #4
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.

# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.

# Let's take a closer look at the concepts.
# The concept vector for the first document:
print(m.lsa.vectors[m[0].id])
print()

# It is a dictionary of concept id's (instead of features).
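To see what those concept ids stand for, each id can be mapped back to its weighted features. A short hedged sketch reusing m from the example above; the 0.2 weight threshold and the 5-feature cut-off are arbitrary choices:

# For every concept the first document loads on, list a few salient features.
for concept_id, doc_weight in m.lsa.vectors[m[0].id].items():
    features = [f for f, w in m.lsa.concepts[concept_id].items() if abs(w) > 0.2]
    print(concept_id, doc_weight, features[:5])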
Example #5
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.

# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

t = time.time()
print "accuracy:", KNN.test(m, folds=10)[-1]
print "time:", time.time() - t
print

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print "LSA reduction..."
print
m.reduce(10)

t = time.time()
print "accuracy:", KNN.test(m, folds=10)[-1]
print "time:", time.time() - t
print

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.

# Let's take a closer look at the concepts.
# The concept vector for the first document:
print m.lsa.vectors[m[0].id]
print

# It is a dictionary of concept id's (instead of features).

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto

docs = []
# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'],name=tweet['id'])
#         pickle.dump(doc, fp)
#     fp.close()
#

m = Model(documents=[], weight=TFIDF)

with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in range(tweets.count()/100):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)
with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in xrange(tweets.count()):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)
    print len(m.documents)
m.reduce(dimensions=L2)
m.save
#     type='lion',
# )
d3 = Document('An elephant is a big grey animal with a slurf.',
              type='elephant')
print d1.vector
m = Model(documents=[d1, d2, d3], weight=TFIDF)
print d1.vector
print m.similarity(d1, d2)  # tiger vs. lion
print m.similarity(d1, d3)  # tiger vs. elephant
# lsa concept space
d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')
m = Model([d1, d2, d3, d4])
m.reduce(2)
for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print feature, w1 * w2
# clustering
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
print m.cluster(method=HIERARCHICAL, k=2)
# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
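The nested Cluster above can be inspected directly. A brief hedged sketch, assuming the Cluster.depth and Cluster.flatten() members documented for pattern.vector:

print cluster.depth       # nesting depth of the hierarchy
print cluster.flatten(1)  # collapse one level of nesting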
Example #8
from pattern.vector import Document, Model

d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')

m = Model([d1, d2, d3, d4])
m.reduce(4)

for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        print m.lsa.vectors[d.id].items()  # matrix U
        for feature, w2 in m.lsa.concepts[concept].items():
            #print m.lsa.concepts[concept].items()  # matrix Vt
            if w1 != 0 and w2 != 0:
                print feature, w1 * w2
Example #9
# -*- coding: utf-8 -*-

from json import load
from pattern.vector import Document, Model, L2

packages = load(file("packages.json"))

docs = [Document(p['description'], name=p['name']) for p in packages]
model = Model(docs)

lsa = model.reduce(L2)
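Once reduced with the L2 heuristic, the model can be queried in concept space. A minimal hedged sketch reusing docs and model from above; the package indices are arbitrary:

# Compare two arbitrary packages by cosine similarity in the reduced space.
if len(docs) >= 2:
    print docs[0].name, 'vs.', docs[1].name
    print model.similarity(docs[0], docs[1])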
import cPickle as pickle
import pymongo

from pattern.vector import TFIDF

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto

docs = []
# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'],name=tweet['id'])
#         pickle.dump(doc, fp)
#     fp.close()
#

m = Model(documents=[], weight=TFIDF)

with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in range(tweets.count() / 100):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)
with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in xrange(tweets.count()):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)
    print len(m.documents)
m.reduce(dimensions=L2)
m.save
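The bare m.save above only references the method without calling it. A hedged sketch of persisting and restoring the model, assuming pattern.vector's Model.save(path) / Model.load(path) and a hypothetical file path:

m.save('D:\\data\\tweets.model')           # hypothetical path, not from the original snippet
m2 = Model.load('D:\\data\\tweets.model')
print len(m2.documents)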