Example #1
import re
from operator import itemgetter
from numpy import diag, dot
from numpy.linalg import norm
from pattern.web import URL, Document, plaintext

# LSA, stopwords and ignore_characters are assumed to be defined elsewhere in
# the project this snippet was taken from.
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word
            or '"' in word
        ]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [
            sentence for sentence in sentences
            if len(sentence) > 1 and sentence != ''
        ]
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
    lsa1.calc()
    # Score sentence i by the length of its scaled topic vector dot(diag(S), Vt[:, i]),
    # the usual LSA sentence-ranking heuristic.
    summary = [(sentences[i], norm(dot(diag(lsa1.S), lsa1.Vt[:, i]), 2))
               for i in range(len(sentences))]
    # Return the k highest-scoring sentences, joined back into a text.
    summary = sorted(summary, key=itemgetter(1))[-k:]
    return '.'.join([sentence for sentence, score in summary])
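
# Hypothetical calls (LSA, stopwords and ignore_characters are the project's own
# helpers; the URL is only an illustration):
# print summarize(url="http://en.wikipedia.org/wiki/Automatic_summarization", k=3)
# print summarize(query="First sentence. Second sentence. Third sentence.", k=2)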
Example #2
from pattern.web import URL, Document, URLError, HTTP404NotFound

def get_dom(url):
    # Download the page and return its Document (DOM), or None on failure.
    try:
        s_content = URL(url).download(timeout=120, cached=False)
    except (URLError, HTTP404NotFound):
        print "Error downloading article"
        return None

    #for AJE compatibility
    try:
        s_content = s_content.decode('unicode_escape')
    except UnicodeError:  # covers both encode and decode errors
        pass
    
    return Document(s_content)
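
# A short usage sketch, reusing a URL that appears later on this page; plaintext
# comes from pattern.web:
from pattern.web import plaintext
dom = get_dom("http://www.reddit.com/top/")
if dom is not None:
    print plaintext(dom.by_tag("title")[0].content)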
Example #3
def heuristic_scrape(article):
    from pattern.web import URL, Document, HTTP404NotFound, URLError, plaintext
    try:
        s_content = URL(article).download()
    except (URLError, HTTP404NotFound):
        print "Error downloading article"
        return ("could not download", article)

    dom = Document(s_content)

    text = ''

    for node in dom.by_tag('p'):
        for c in node:
            if c.type == 'text':
                text = text + ' ' + plaintext(c.source())
    return text.strip()
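
# A simpler variant of the same idea, sketched with the calls the other examples
# on this page use (Document.by_tag, Element.content, plaintext); not the
# original project's helper:
from pattern.web import URL, Document, plaintext

def scrape_paragraphs(url):
    # Join the plain text of every <p> element in the page.
    dom = Document(URL(url).download(cached=True))
    return ' '.join(plaintext(p.content) for p in dom.by_tag('p')).strip()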
Example #4
# Assumes the same imports and project helpers (LSA, stopwords, ignore_characters)
# as Example #1, plus arccos and pi from numpy.
def summarize_evaluation(query=None, url=None, summary=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [
            word for sentence in j for word in sentence.split()
            if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word
            or '"' in word
        ]
        j = ' '.join(j)
        lsa = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [
            sentence for sentence in sentences
            if len(sentence) > 1 and sentence != ''
        ]
        for sentence in sentences:
            lsa.parse(sentence)
    else:
        lsa = LSA(stopwords, ignore_characters)
        for sentence in query:
            lsa.parse(sentence)
    lsa.build()
    lsa.calc()
    lsa2 = LSA(stopwords, ignore_characters)
    for sentence in summary:
        lsa2.parse(sentence)
    lsa2.build()
    lsa2.calc()
    vectors = [(dot(lsa.S, lsa.U[0, :]), dot(lsa.S, lsa.U[i, :]))
               for i in range(len(lsa.U))]
    vectors2 = [(dot(lsa2.S, lsa2.U[0, :]), dot(lsa2.S, lsa2.U[i, :]))
                for i in range(len(lsa2.U))]
    angles = [
        arccos(dot(a, b) / (norm(a, 2) * norm(b, 2))) for a in vectors
        for b in vectors2
    ]
    return str(abs(1 - float(angles[1]) / float(pi / 2)))
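
# Hypothetical use (URLs for illustration only): score a candidate summary,
# produced by summarize() from Example #1, against the article itself. Values
# near 1 indicate a small angle between the compared LSA vectors.
# candidate = summarize(url="http://en.wikipedia.org/wiki/Automatic_summarization", k=3)
# print summarize_evaluation(url="http://en.wikipedia.org/wiki/Automatic_summarization",
#                            summary=candidate.split('.'))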
Example #5
# Excerpt from a documentation build script; the base `url` of the online
# documentation pages is defined earlier in the full script.
for p in ("-", "-web", "-db", "-search", "-vector", "-graph", "-canvas", "-metrics",
          "-de", "-en", "-es", "-fr", "-it", "-nl", 
          "-shell", "stop-words", "mbsp-tags", "-dev"):
    # We include some useful pages (Penn Treebank tags, stop words) referenced in the documentation.
    if p.startswith("-"):
        p = "pattern" + p.rstrip("-")
        title = p.replace("-", ".")
    if p == "stop-words":
        title = "Stop words"
    if p == "mbsp-tags":
        title = "Penn Treebank II tag set"
    # Download the online documentation pages.
    print "Retrieving", url + p
    html = URL(url + p).download(cached=False)
    # Parse the actual documentation; we don't need the website header, footer, navigation or search.
    html = Document(html)
    html = html.by_id("content-area")
    html = html.by_class("node-type-page")[0]
    html = html.source
    html = strip_javascript(html)
    html = strip_between('<div id="navbar">', '/#navbar -->', html)
    html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html)
    html = strip_between('<div id="footer">', '/#footer -->', html)
    html = strip_between('<a class="twitter-share-button"', '</a>', html)
    # Link to local pages and images.
    # Link to online media.
    html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url)                   # MBSP docs (online)
    html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html)   # examples (online)
    html = re.sub('href="/pages/(using-.*?)"', 'href="%s\\1"' % url, html)             # examples (online)
    html = re.sub('href="/pages/(modeling-.*?)"', 'href="%s\\1"' % url, html)          # examples (online)
    html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html)              # pages (offline)
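
# For illustration only: a minimal stand-in for the strip_between() helper used
# above (assumed behavior: remove everything between, and including, the two
# markers); the full script defines or imports the real one.
import re

def strip_between(a, b, string):
    # Drop every span from marker a through marker b (dot matches newlines).
    return re.sub(re.escape(a) + ".*?" + re.escape(b), "", string, flags=re.S)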
Example #6
import os, sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML of the pages you are interested in manually.
# The Document object can be used for this, similar to the JavaScript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print

# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URLs.
# A good way to get absolute URLs is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in Document(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    #print link
Example #7
def graph(query1, query2):
    lsa = LSA(stopwords, ignore_characters)
    titles = [lsa.search_wiki(query1), lsa.search_wiki(query2)]
    for t in titles:
        lsa.parse(t)
    lsa.build()
    lsa.calc()
    lsa.plotSVD()

## core summarization function.
def summarize(query=None, k=4, url=None):
    j = []
    if url:
        b = URL(url)
        a = Document(b.download(cached=True))
        for b in a.get_elements_by_tagname("p"):
            j.append(plaintext(b.content).encode("utf-8"))
        j = [word for sentence in j for word in sentence.split()
             if re.match("^[a-zA-Z_-]*$", word) or '.' in word or "'" in word or '"' in word]
        j = ' '.join(j)
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = j.split('.')
        sentences = [sentence for sentence in sentences if len(sentence) > 1 and sentence != '']
        for sentence in sentences:
            lsa1.parse(sentence)
    else:
        lsa1 = LSA(stopwords, ignore_characters)
        sentences = query.split('.')
        for sentence in sentences:
            lsa1.parse(sentence)
    lsa1.build()
Example #8
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML of the pages you are interested in manually.
# The Document object can be used for this, similar to the JavaScript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print

# The Document object is a tree of Element and Text objects.
# All objects inherit from Node; Document also inherits from Element.

# Node.type          => NODE, TEXT, COMMENT, ELEMENT, DOCUMENT
# Node.parent        => Parent Node object.
# Node.children      => List of child Node objects.
# Node.next          => Next Node in Node.parent.children.
# Node.previous      => Previous Node in Node.parent.children.

# Document.head      => Element with tag name "head".
# Document.body      => Element with tag name "body".
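
# The listing above describes the tree API. A small sketch that walks the first
# few levels of the reddit DOM downloaded above and prints what it finds
# (Element.tagname is the element's tag name):
def print_tree(node, depth=0, max_depth=2):
    # Recursively print child elements by tag name and mark text nodes.
    for child in node.children:
        if child.type == ELEMENT:
            print "  " * depth + "<%s>" % child.tagname
            if depth < max_depth:
                print_tree(child, depth + 1, max_depth)
        elif child.type == TEXT:
            print "  " * depth + "#text"

print_tree(dom.body)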