def summarize(query=None, k=4, url=None):
    """Return a k-sentence extractive summary via LSA sentence ranking.

    Parameters:
        query: raw text to summarize (used when ``url`` is None); split on '.'.
        k: number of top-ranked sentences to keep.
        url: optional page URL; its <p> elements are scraped and summarized
             instead of ``query``.

    Returns the k highest-scoring sentences joined with '.'.
    """
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        # Keep word-like tokens, plus any token containing '.' or quotes so
        # that sentence boundaries survive for the split('.') below.
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word)
             or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences
                     if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    # Score each (sentence, topic) pair by the 2-norm of the singular-value
    # weighted topic column.  NOTE(review): the double loop over i and b
    # repeats every score for every sentence -- presumably Vt[:, i] was
    # intended; the published behavior is kept unchanged here.
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences))
               for b in range(len(lsa1.Vt))]
    # BUG FIX: the original called sorted(summary, key=itemgetter(1)) and
    # discarded the result (sorted() is not in-place); the dedup below
    # re-sorts anyway, so that dead statement is removed.
    summary = dict((v[0], v)
                   for v in sorted(summary, key=lambda s: s[1])).values()
    # After the ascending sort, the best sentences sit at the tail.
    return '.'.join([a for a, b in summary][len(summary) - k:])
def summarize(query=None, k=4, url=None):
    """Summarize text with LSA and return the k top-ranked sentences.

    Parameters:
        query: plain text to summarize when no ``url`` is given; '.'-split.
        k: how many sentences the summary keeps.
        url: optional web page whose <p> contents replace ``query``.

    Returns a '.'-joined string of the k best sentences.
    """
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        # Retain alphabetic tokens and tokens holding '.' or quote marks;
        # the '.' tokens preserve sentence boundaries for split('.') below.
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word
            or "'" in word or '"' in word
        ]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [
            sentence for sentence in sentences
            if len(sentence) > 1 and sentence != ''
        ]
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    # Rank sentences by the singular-value-weighted topic vector length.
    # NOTE(review): iterating both i and b duplicates every score across
    # all sentences -- Vt[:, i] looks intended; behavior left as published.
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences))
               for b in range(len(lsa1.Vt))]
    # BUG FIX: a bare `sorted(summary, key=itemgetter(1))` whose return
    # value was thrown away has been deleted -- sorted() never mutates its
    # argument, and the next line sorts again regardless.
    summary = dict(
        (v[0], v)
        for v in sorted(summary, key=lambda item: item[1])).values()
    return '.'.join([a for a, b in summary][len(summary) - (k):])
def summarize_evaluation(query=None, url=None, summary=None):
    """Evaluate a summary against its source text using LSA vector angles.

    Fits one LSA model on the source (the <p> text of ``url``, or the
    sentences in ``query``) and a second on ``summary``, then maps the
    angle between their leading document vectors to a score, returned
    as a string.
    """
    scraped = []
    if url:
        page = URL(url)
        dom = Document(page.download(cached=True))
        for par in dom.get_elements_by_tagname("p"):
            scraped.append(plaintext(par.content).encode("utf-8"))
        # Word-like tokens only, keeping '.' and quoted tokens so that the
        # text can still be split back into sentences.
        words = [w for chunk in scraped for w in chunk.split()
                 if re.match("^[a-zA-Z_-]*$", w)
                 or '.' in w or "'" in w or '"' in w]
        text = ' '.join(words)
        lsa = LSA(stopwords, ignore_characters)
        sents = [s for s in text.split('.') if len(s) > 1 and s != '']
        for s in sents:
            lsa.parse(s)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for s in query:
            lsa.parse(s)
    lsa.build()
    lsa.calc()
    # Second model: the candidate summary on its own.
    lsa2 = LSA(stopwords, ignore_characters)
    for s in summary:
        lsa2.parse(s)
    lsa2.build()
    lsa2.calc()
    # Pair the first document vector of each space with every other one.
    vectors = [(dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :]))
               for i in range(len(lsa.U))]
    vectors2 = [(dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :]))
                for i in range(len(lsa2.U))]
    angles = [arccos(dot(a, b) / (norm(a, 2) * norm(b, 2)))
              for a in vectors for b in vectors2]
    # NOTE(review): another copy of this function in the file returns
    # angles[0] here -- confirm which index is intended.
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
def summarize_evaluation(query=None, url=None, summary=None):
    """Score how closely ``summary`` matches its source text.

    Two LSA models are built -- one over the source sentences (scraped
    from ``url`` or taken from ``query``), one over ``summary`` -- and
    the angle between their first document vectors is converted to a
    score and returned as a string.
    """
    paragraphs = []
    if url:
        remote = URL(url)
        dom = Document(remote.download(cached=True))
        for p in dom.get_elements_by_tagname("p"):
            paragraphs.append(plaintext(p.content).encode("utf-8"))
        # Filter to word-like tokens; '.'-bearing tokens keep sentence
        # boundaries intact for the split below.
        kept = []
        for chunk in paragraphs:
            for w in chunk.split():
                if (re.match("^[a-zA-Z_-]*$", w) or '.' in w
                        or "'" in w or '"' in w):
                    kept.append(w)
        text = ' '.join(kept)
        lsa = LSA(stopwords, ignore_characters)
        sentences = []
        for s in text.split('.'):
            if len(s) > 1 and s != '':
                sentences.append(s)
        for s in sentences:
            lsa.parse(s)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for s in query:
            lsa.parse(s)
    lsa.build()
    lsa.calc()
    # A second, independent model over the summary text.
    lsa2 = LSA(stopwords, ignore_characters)
    for s in summary:
        lsa2.parse(s)
    lsa2.build()
    lsa2.calc()
    # Pair the leading document vector with each document vector in turn.
    vectors = []
    for i in range(len(lsa.U)):
        vectors.append((dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :])))
    vectors2 = []
    for i in range(len(lsa2.U)):
        vectors2.append((dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :])))
    angles = []
    for a in vectors:
        for b in vectors2:
            angles.append(arccos(dot(a, b) / (norm(a, 2) * norm(b, 2))))
    # NOTE(review): a sibling copy of this function returns angles[0] --
    # verify which index is correct.
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
import os, sys sys.path.insert(0, os.path.join("..", "..")) from pattern.web import URL, Document, plaintext from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT # The web module has a number of convenient search engines, # but often you will need to handle the HTML in web pages of your interest manually. # The Document object can be used for this, similar to the Javascript DOM. # For example: url = URL("http://www.reddit.com/top/") dom = Document(url.download(cached=True)) for e in dom.get_elements_by_tagname("div.entry")[:5]: # Top 5 reddit entries. for a in e.get_elements_by_tagname( "a.title")[:1]: # First <a class="title"> in entry. print plaintext(a.content) print a.attributes["href"] print # Some of the links can be relative, for example starting with "../". # We can get the absolute URL by prepending the base URL. # However, this might get messy with anchors, trailing slashes and redirected URL's. # A good way to get absolute URL's is to use the module's abs() function: from pattern.web import abs url = URL("http://nodebox.net") for link in Document(url.download()).by_tag("a"): link = link.attributes.get("href", "") link = abs(link, base=url.redirect or url.string) #print link
import os, sys; sys.path.insert(0, os.path.join("..", "..")) from pattern.web import URL, Document, plaintext from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT # The web module has a number of convenient search engines, # but often you will need to handle the HTML in web pages of your interest manually. # The Document object can be used for this, similar to the Javascript DOM. # For example: url = URL("http://www.reddit.com/top/") dom = Document(url.download(cached=True)) print dom.body.content.__class__ for e in dom.get_elements_by_tagname("div.entry")[:5]: # Top 5 reddit entries. for a in e.get_elements_by_tagname("a.title")[:1]: # First <a class="title"> in entry. print plaintext(a.content) print a.attributes["href"] print # Some of the links can be relative, for example starting with "../". # We can get the absolute URL by prepending the base URL. # However, this might get messy with anchors, trailing slashes and redirected URL's. # A good way to get absolute URL's is to use the module's abs() function: from pattern.web import abs url = URL("http://nodebox.net") for link in Document(url.download()).by_tag("a"): link = link.attributes.get("href","") link = abs(link, base=url.redirect or url.string) #print link # The Document object is a tree of Element and Text objects.
# NOTE(review): this chunk begins mid-function -- the `return` below is the
# tail of a summarize_evaluation variant whose `def` line is outside this
# chunk.  It uses angles[0] where the other copies in this file use
# angles[1]; confirm which index is intended.
return str(abs(1 - float(angles[0])/float(pi/2)))

def graph(query1, query2):
    # Build one LSA model over two Wikipedia articles and plot its SVD.
    lsa = LSA(stopwords, ignore_characters)
    titles = [lsa.search_wiki(query1), lsa.search_wiki(query2)]
    for t in titles:
        lsa.parse(t)
    lsa.build()
    lsa.calc()
    lsa.plotSVD()

## core summarization function.
# NOTE(review): this copy of summarize() is truncated by the chunk
# boundary -- it ends after lsa1.calc() with no ranking or return.
def summarize(query=None, k=4,url=None):
    # Collect candidate text: either the <p> elements of `url`, or the
    # raw `query` string.
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        # Keep word-like tokens plus tokens carrying '.' or quotes so the
        # text can be re-split into sentences on '.'.
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()