# Example 1
def summarize(query=None, k=4, url=None):
    """Return an LSA-based extractive summary as a string of sentences.

    Exactly one source should be supplied:
      query -- raw text to summarize (split into sentences on '.'),
      url   -- a web page whose <p> paragraph text is used instead.
    k is the number of top-ranked sentences kept.  The result is the k
    highest-scoring sentences joined with '.'.

    NOTE(review): relies on module-level names LSA, stopwords,
    ignore_characters, URL, Document, plaintext -- defined elsewhere in
    the project.  Python 2 code (str/bytes handling via .encode).
    """
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        # Collect the plain text of every paragraph element.
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        # Keep alphabetic tokens plus tokens carrying '.', ' or " (so
        # sentence boundaries and quotes survive the filter).
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        # len(sentence) > 1 already excludes '' -- redundant check removed.
        sentences = [sentence for sentence in sentences if len(sentence) > 1]
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    # Score every (sentence, topic) pair by the 2-norm of S * Vt[:, topic].
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    # BUG FIX: the original called sorted(summary, ...) on its own line and
    # discarded the result (sorted() is not in-place) -- dead statement removed.
    # Sorting ascending then building a dict keeps, per sentence, the entry
    # with the highest score (later duplicates overwrite earlier ones).
    summary = dict((v[0], v) for v in sorted(summary, key=itemgetter(1))).values()
    # The last k entries are the k best-scoring sentences (Python 2:
    # dict.values() returns a list, so slicing/len work directly).
    return '.'.join([a for a, b in summary][len(summary) - k:])
# Example 2
def summarize(query=None, k=4, url=None):
    """LSA extractive summarizer.

    Parameters
    ----------
    query : str, optional
        Raw text to summarize; split into sentences on '.'.
    k : int
        Number of top-ranked sentences to keep.
    url : str, optional
        If given, takes precedence: the page is downloaded and the text
        of its <p> elements is summarized instead of `query`.

    Returns
    -------
    str
        The k best sentences joined with '.'.

    NOTE(review): depends on project-level LSA, stopwords,
    ignore_characters and pattern.web's URL/Document/plaintext.
    """
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        # Keep alphabetic words and any token containing '.', ' or ".
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word
            or '"' in word
        ]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        # Drop empty / single-character fragments produced by the split.
        sentences = [
            sentence for sentence in sentences if len(sentence) > 1
        ]
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    # One candidate per (sentence, topic): score = ||S * Vt[:, topic]||_2.
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, b]), 2))
               for i in range(len(sentences)) for b in range(len(lsa1.Vt))]
    # BUG FIX: removed the original bare `sorted(summary, key=itemgetter(1))`
    # whose return value was discarded -- a no-op statement.
    # Ascending sort + dict dedup keeps the max score per sentence.
    summary = dict(
        (v[0], v) for v in sorted(summary, key=itemgetter(1))).values()
    return '.'.join([a for a, b in summary][len(summary) - k:])
# Example 3
def summarize_evaluation(query=None, url=None, summary=None):
    """Score a summary against its source text using LSA vector angles.

    Builds one LSA model from the source (fetched from `url`, or iterated
    from `query` when no url is given) and a second model from `summary`,
    then measures the angle between document vectors of the two spaces.
    Returns the similarity as a string: 1.0 means the compared vectors
    are parallel, 0.0 means orthogonal.

    NOTE(review): uses project-level LSA, stopwords, ignore_characters
    and pattern.web helpers; Python 2 code.
    """
    paragraphs = []
    if url:
        page = URL(url)
        dom = Document(page.download(cached=True))
        for node in dom.get_elements_by_tagname("p"):
            paragraphs.append(plaintext(node.content).encode("utf-8"))
        # Keep alphabetic tokens plus anything carrying '.', ' or ".
        kept = [
            w
            for chunk in paragraphs
            for w in chunk.split()
            if re.match("^[a-zA-Z_-]*$", w) or '.' in w or "'" in w or '"' in w
        ]
        text = ' '.join(kept)
        lsa = LSA(stopwords, ignore_characters)
        sentences = [s for s in text.split('.') if len(s) > 1 and s != '']
        for s in sentences:
            lsa.parse(s)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for s in query:
            lsa.parse(s)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for s in summary:
        lsa2.parse(s)
    lsa2.build()
    lsa2.calc()
    # Pair the first document vector with every document vector, per space.
    vectors = [(dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :]))
               for i in range(len(lsa.U))]
    vectors2 = [(dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :]))
                for i in range(len(lsa2.U))]
    # Angle between each cross-space pair via the cosine formula.
    angles = [arccos(dot(a, b) / (norm(a, 2) * norm(b, 2)))
              for a in vectors for b in vectors2]
    # Normalize the second angle by pi/2 and map to a [0, 1] similarity.
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
# Example 4
def summarize_evaluation(query=None, url=None, summary=None):
    """Evaluate `summary` against the source text with two LSA models.

    The source is either the page at `url` (text of its <p> elements) or,
    failing that, the sentences iterated from `query`.  Both the source
    and the summary are run through separate LSA models; the function
    returns, as a string, 1 minus the (pi/2)-normalized angle between a
    pair of document vectors drawn from the two spaces.

    NOTE(review): relies on module-level LSA, stopwords,
    ignore_characters, URL, Document and plaintext; Python 2 code.
    """
    raw = []
    if url:
        source = URL(url)
        page = Document(source.download(cached=True))
        for p in page.get_elements_by_tagname("p"):
            raw.append(plaintext(p.content).encode("utf-8"))
        # Token filter: plain words, or tokens containing '.', ' or ".
        tokens = []
        for block in raw:
            for tok in block.split():
                if (re.match("^[a-zA-Z_-]*$", tok) or '.' in tok
                        or "'" in tok or '"' in tok):
                    tokens.append(tok)
        joined = ' '.join(tokens)
        lsa = LSA(stopwords, ignore_characters)
        sentences = []
        for frag in joined.split('.'):
            if len(frag) > 1 and frag != '':
                sentences.append(frag)
        for frag in sentences:
            lsa.parse(frag)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for frag in query:
            lsa.parse(frag)
    lsa.build()
    lsa.calc()
    # Second model: the candidate summary.
    lsa2 = LSA(stopwords, ignore_characters)
    for frag in summary:
        lsa2.parse(frag)
    lsa2.build()
    lsa2.calc()
    # (reference vector, i-th document vector) pairs for each space.
    vectors = []
    for i in range(len(lsa.U)):
        vectors.append((dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :])))
    vectors2 = []
    for i in range(len(lsa2.U)):
        vectors2.append((dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :])))
    # Cross-space angles via arccos of the normalized dot product.
    angles = [
        arccos(dot(a, b) / (norm(a, 2) * norm(b, 2)))
        for a in vectors
        for b in vectors2
    ]
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
# Example 5
# Demo script: manual HTML/DOM handling with pattern.web (Python 2 --
# note the print statements below).  Downloads reddit's front page,
# prints the top entry titles and links, then shows how to resolve
# relative links with abs().
import os, sys
sys.path.insert(0, os.path.join("..", ".."))  # make the local pattern package importable

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The Document object can be used for this, similar to the Javascript DOM.

# For example:
# NOTE(review): performs network I/O; the download is cached after the first run.
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname(
            "a.title")[:1]:  # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print

# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in Document(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")
    link = abs(link, base=url.redirect or url.string)  # prefer the redirected URL as base
    #print link
# Example 6
# Demo script (variant of Example 5): manual DOM handling with
# pattern.web, Python 2 print statements.
import os, sys; sys.path.insert(0, os.path.join("..", ".."))  # make the local pattern package importable

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The Document object can be used for this, similar to the Javascript DOM.

# For example:
# NOTE(review): performs network I/O; the download is cached after the first run.
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
# Show the runtime type of the body text (str vs unicode in Python 2).
print dom.body.content.__class__
for e in dom.get_elements_by_tagname("div.entry")[:5]: # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]: # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print
        
# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in Document(url.download()).by_tag("a"):
    link = link.attributes.get("href","")
    link = abs(link, base=url.redirect or url.string)  # prefer the redirected URL as base
    #print link

# The Document object is a tree of Element and Text objects.
return str(abs(1 - float(angles[0])/float(pi/2)))
def graph(query1, query2):
    """Build an LSA model over two Wikipedia articles and plot its SVD.

    query1, query2 -- search terms passed to LSA.search_wiki(); the two
    returned article texts are parsed into one model, whose singular
    value decomposition is then visualized with plotSVD().

    NOTE(review): depends on project-level LSA, stopwords and
    ignore_characters.
    """
    lsa = LSA(stopwords, ignore_characters)
    titles = [lsa.search_wiki(query1), lsa.search_wiki(query2)]
    # BUG FIX: in the original, the `for` line had lost its indentation
    # (it sat at column 0 inside the function), which is a syntax error.
    for t in titles:
        lsa.parse(t)
    lsa.build()
    lsa.calc()
    lsa.plotSVD()
## core summarization function.
def summarize(query=None, k=4,url=None):
    j = []
if url:
        b = URL(url)
        a = Document(b.download(cached=True))
for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split() if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence)>1 and sentence != '']
for sentence in sentences:
            lsa1.parse(sentence)
else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()