Exemple #1
0
def parseURL(url, force=False):
    """Parse the page at *url* and cache the result as a ``WebResource``.

    Duplicate rows for *url* are deleted first so that a single record is
    used. The page is downloaded and parsed with ``newspaper`` only when the
    record was just created or when *force* is true.

    Args:
        url: Address of the page to parse.
        force: Re-download and re-parse even if a record already exists.

    Returns:
        The ``WebResource`` for *url*. Download/parse/save failures are
        reported on stdout ("  Failed") and the record is still returned.
    """
    # get() raises MultipleObjectsReturned when duplicates exist; remove them
    # so that get_or_create() below starts from a single fresh row.
    try:
        WebResource.objects.get(url=url)
    except MultipleObjectsReturned:
        WebResource.objects.filter(url=url).delete()
    except WebResource.DoesNotExist:
        # Nothing cached yet; get_or_create() below will create the row.
        pass

    wr, created = WebResource.objects.get_or_create(url=url)
    if not (created or force):
        return wr

    try:
        article = newspaper.Article(url)
        article.download()
        article.parse()

        # Pages on books.google are stored with an empty text body.
        text = '' if 'books.google' in url else article.text

        wr.text = str(text.encode('utf-8', 'replace').lower())
        wr.title = article.title
        wr.urls = ",".join(
            find_urls(strip_between("<body*>", "</body", text)))
        wr.save()
        print("  PARSED  %s" % url)
    except Exception:
        # Best effort: keep going with whatever record we have, but do not
        # swallow KeyboardInterrupt/SystemExit the way a bare except would.
        print("  Failed")
    return wr
Exemple #2
0
 def test_strip_between(self):
     # strip_between() should drop every "<p" ... "</p>" span; the expected
     # value shows the upper-case <P></P> pair is removed as well.
     source = " <p><p></p>text</p> <b><P></P></b>"
     expected = " text</p> <b></b>"
     stripped = web.strip_between("<p", "</p>", source)
     self.assertEqual(stripped, expected)
     print("pattern.web.strip_between()")
Exemple #3
0
 def test_strip_between(self):
     # strip_between() should drop every "<p" ... "</p>" span; the expected
     # value shows the upper-case <P></P> pair is removed as well.
     source = " <p><p></p>text</p> <b><P></P></b>"
     expected = " text</p> <b></b>"
     stripped = web.strip_between("<p", "</p>", source)
     self.assertEqual(stripped, expected)
     print("pattern.web.strip_between()")
Exemple #4
0
     p = "pattern" + p.rstrip("-")
     title = p.replace("-", ".")
 # NOTE(review): this is the interior of a larger construct (presumably a loop
 # over page names `p`) whose header is outside this excerpt.
 # Pages whose name is not a module name get a hand-written title.
 if p == "stop-words":
     title = "Stop words"
 if p == "mbsp-tags":
     title = "Penn Treebank II tag set"
 # Download the online documentation pages.
 print "Retrieving", url + p
 # cached=False: presumably bypasses the local download cache - confirm.
 html = URL(url + p).download(cached=False)
 # Parse the actual documentation, we don't need the website header, footer, navigation, search.
 html = Document(html)
 html = html.by_id("content-area")
 html = html.by_class("node-type-page")[0]
 # From here on `html` is a plain string again (the element's source).
 html = html.source
 html = strip_javascript(html)
 # Cut each site-chrome section, from its opening <div> through its closing marker comment.
 html = strip_between('<div id="navbar">', '/#navbar -->', html)
 html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html)
 html = strip_between('<div id="footer">', '/#footer -->', html)
 html = strip_between('<a class="twitter-share-button"', '</a>', html)
 # Link to local pages and images.
 # Link to online media.
 # Order matters: the specific "online" rewrites below drop the /pages/ prefix
 # first, so the generic "offline" rewrite only sees the remaining /pages/ links.
 html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url)                   # MBSP docs (online)
 html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html)   # examples (online)
 html = re.sub('href="/pages/(using-.*?)"', 'href="%s\\1"' % url, html)             # examples (online)
 html = re.sub('href="/pages/(modeling-.*?)"', 'href="%s\\1"' % url, html)          # examples (online)
 # NOTE(review): the class [#|"] also matches a literal "|"; probably only "#" and '"' were meant.
 html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html)              # pages (offline)
 html = html.replace('src="/media/', 'src="../g/')                                  # images (offline)
 html = html.replace('src="/sites/all/themes/clips/g/', 'src="../g/')               # images (offline)
 html = html.replace('href="/media/', 'href="%smedia/' % url.replace("pages/", "")) # downloads (online)
 # Apply the simplified template + set page titles.
 # `template` is defined outside this excerpt; it is filled with five values.
 html = template % (p, url+p, url+p, title, html)
Exemple #5
0
     p = "pattern" + p.rstrip("-")
     title = p.replace("-", ".")
 # NOTE(review): this is the interior of a larger construct (presumably a loop
 # over page names `p`) whose header is outside this excerpt.
 # Pages whose name is not a module name get a hand-written title.
 if p == "stop-words":
     title = "Stop words"
 if p == "mbsp-tags":
     title = "Penn Treebank II tag set"
 # Download the online documentation pages.
 print "Retrieving", url + p
 # cached=False: presumably bypasses the local download cache - confirm.
 html = URL(url + p).download(cached=False)
 # Parse the actual documentation, we don't need the website header, footer, navigation, search.
 html = Document(html)
 html = html.by_id("content-area")
 html = html.by_class("node-type-page")[0]
 # From here on `html` is a plain string again (the element's source).
 html = html.source
 html = strip_javascript(html)
 # Cut each site-chrome section, from its opening <div> through its closing marker comment.
 html = strip_between('<div id="navbar">', '/#navbar -->', html)
 html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->',
                      html)
 html = strip_between('<div id="footer">', '/#footer -->', html)
 html = strip_between('<a href="http://twitter.com/share"', '</a>', html)
 # Link to local pages and images.
 # Link to online media.
 # Order matters: the specific "online" rewrites below drop the /pages/ prefix
 # first, so the generic "offline" rewrite only sees the remaining /pages/ links.
 html = html.replace('href="/pages/MBSP"',
                     'href="%sMBSP"' % url)  # MBSP docs (online)
 html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url,
               html)  # examples (online)
 # NOTE(review): the class [#|"] also matches a literal "|"; probably only "#" and '"' were meant.
 html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2',
               html)  # pages (offline)
 html = html.replace('src="/media/', 'src="../g/')  # images (offline)
 html = html.replace('src="/sites/all/themes/clips/g/',
                     'src="../g/')  # images (offline)
Exemple #6
0
     p = "pattern" + p.rstrip("-")
     title = p.replace("-", ".")
 # NOTE(review): this is the interior of a larger construct (presumably a loop
 # over page names `p`) whose header is outside this excerpt.
 # Pages whose name is not a module name get a hand-written title.
 if p == "stop-words":
     title = "Stop words"
 if p == "mbsp-tags":
     title = "Penn Treebank II tag set"
 # Download the online documentation pages.
 print "Retrieving", url + p
 # cached=False: presumably bypasses the local download cache - confirm.
 html = URL(url + p).download(cached=False)
 # Parse the actual documentation, we don't need the website header, footer, navigation, search.
 html = Document(html)
 html = html.by_id("content-area")
 html = html.by_class("node-type-page")[0]
 # From here on `html` is a plain string again (the element's source).
 html = html.source
 html = strip_javascript(html)
 # Cut each site-chrome section, from its opening <div> through its closing marker comment.
 html = strip_between('<div id="navbar">', '/#navbar -->', html)
 html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html)
 html = strip_between('<div id="footer">', '/#footer -->', html)
 html = strip_between('<a href="http://twitter.com/share"', '</a>', html)
 # Link to local pages and images.
 # Link to online media.
 # Order matters: the specific "online" rewrites below drop the /pages/ prefix
 # first, so the generic "offline" rewrite only sees the remaining /pages/ links.
 html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url)                   # MBSP docs (online)
 html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html)   # examples (online)
 # NOTE(review): the class [#|"] also matches a literal "|"; probably only "#" and '"' were meant.
 html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html)              # pages (offline)
 html = html.replace('src="/media/', 'src="../g/')                                  # images (offline)
 html = html.replace('src="/sites/all/themes/clips/g/', 'src="../g/')               # images (offline)
 html = html.replace('href="/media/', 'href="%smedia/' % url.replace("pages/", "")) # downloads (online)
 # Apply the simplified template + set page titles.
 # `template` is defined outside this excerpt; it is filled with five values.
 html = template % (p, url+p, url+p, title, html)
 # Generate offline HTML file.
 # Opens html/<p>.html for UTF-8 writing.
 # NOTE(review): the write and close are not visible in this excerpt - confirm the handle is closed.
 f = codecs.open(os.path.join("html", "%s.html" % p), "w", encoding="utf-8")
Exemple #7
0
#        Number of Ratings


# Absolute URL of the first link found in each data row of the listing.
page_urls = []

# The rows come from the second <table> inside the element with id "main";
# the first row is skipped (presumably a header row - confirm).
main_table = dom.by_id('main').by_tag('table')[1]
tableRows = main_table.by_tag('tr')
for tr in tableRows[1:]:
	a = tr.by_tag('a')[0]
	href = a.attributes.get('href', '')
	absolute = abs_url(href, url.string)
	page_urls.append(clean_unicode(absolute))

for p in page_urls:
	p_url = URL(p)
	p_dom = DOM(p_url.download(cached=True))
	
	title = clean_unicode(p_dom.by_class('header')[0].content)
	title = plaintext(strip_between('<span', '</span>', title))
	
	runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)

	genres = []
	for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
		genres.append(clean_unicode(genre.content))
 	
 	directors = []
 	writers = []
 	actors = []

 	text_blocks = p_dom.by_class('txt-block')[:3]
 	for t in text_blocks:
 		spans = t.by_tag('span')
 		for s in spans:
def find_name():
    """Return the name extracted from the Graph API response for ``userid``.

    Downloads ``http://graph.facebook.com/<userid>`` and trims the response
    text with ``strip_between`` rather than parsing it as JSON.
    NOTE(review): this relies on the field order of the response - confirm.
    """
    response = URL('http://graph.facebook.com/' + userid).download()
    # Remove everything from the opening brace through '"name":"'.
    tail = strip_between('{', '"name":"', response)
    # Remove from '","user' through the next closing brace.
    return strip_between('","user', '}', tail)
def find_id():
    """Return the id extracted from the Graph API response for ``userid``.

    Downloads ``http://graph.facebook.com/<userid>`` and trims the response
    text with slicing and ``strip_between`` rather than parsing it as JSON.
    """
    response = URL('http://graph.facebook.com/' + userid).download()
    # Skip the first seven characters (presumably the leading '{"id":"' - confirm).
    remainder = response[7:]
    # Remove from the first double quote through the next closing brace.
    return strip_between('"', '}', remainder)