import newspaper
from django.core.exceptions import MultipleObjectsReturned
from pattern.web import find_urls, strip_between
# WebResource is this project's Django model (its import is omitted in the original snippet).


def parseURL(url, force=False):
    """Parse the given URL and save the result to the database."""
    try:
        wr = WebResource.objects.get(url=url)
    except MultipleObjectsReturned:
        # Duplicate rows for this URL: remove them, get_or_create() below recreates one.
        WebResource.objects.filter(url=url).delete()
    except WebResource.DoesNotExist:
        pass
    wr, created = WebResource.objects.get_or_create(url=url)
    if created or force:
        # print "Parsing and Caching {0}".format(url)
        try:
            a = newspaper.Article(url)
            a.download()
            a.parse()
            text = a.text
            if 'books.google' in url:
                text = ''
            wr.text = text.encode('utf-8', 'replace').lower()
            wr.title = a.title
            wr.urls = ",".join(find_urls(strip_between("<body*>", "</body", text)))
            wr.save()
            print " PARSED ", url
        except Exception:
            print " Failed"
    return wr
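The caching logic aside, the interesting step is `find_urls(strip_between("<body*>", "</body", text))`: strip the body element, then collect whatever URLs survive. A minimal standalone sketch of that step, using an invented HTML string (the printed result is the expected behaviour, not output captured from this project):

from pattern.web import find_urls, strip_between

html = '<html><body><a href="http://example.com/inside">in</a></body>' \
       '<p>See http://example.com/outside for more.</p></html>'
# Remove everything from the opening <body> tag up to "</body",
# then collect the URLs that remain.
print find_urls(strip_between("<body*>", "</body", html))
# expected: ['http://example.com/outside']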
def test_strip_between(self):
    # Assert strip <p> elements.
    v = web.strip_between("<p", "</p>", " <p><p></p>text</p> <b><P></P></b>")
    self.assertEqual(v, " text</p> <b></b>")
    print "pattern.web.strip_between()"
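This assertion pins down the semantics of strip_between(): both markers are removed together with everything between them, and matching is case-insensitive and non-greedy. A regex-based equivalent that passes the same assertion (a sketch of those semantics, not pattern.web's actual source):

import re

def strip_between_sketch(a, b, string):
    # Remove every non-greedy a...b span, markers included.
    # DOTALL lets a span cross newlines; IGNORECASE matches <P> as <p>.
    return re.sub(re.compile("%s.*?%s" % (a, b), re.DOTALL | re.IGNORECASE), "", string)

assert strip_between_sketch("<p", "</p>", " <p><p></p>text</p> <b><P></P></b>") == " text</p> <b></b>"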
p = "pattern" + p.rstrip("-") title = p.replace("-", ".") if p == "stop-words": title = "Stop words" if p == "mbsp-tags": title = "Penn Treebank II tag set" # Download the online documentation pages. print "Retrieving", url + p html = URL(url + p).download(cached=False) # Parse the actual documentation, we don't need the website header, footer, navigation, search. html = Document(html) html = html.by_id("content-area") html = html.by_class("node-type-page")[0] html = html.source html = strip_javascript(html) html = strip_between('<div id="navbar">', '/#navbar -->', html) html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html) html = strip_between('<div id="footer">', '/#footer -->', html) html = strip_between('<a class="twitter-share-button"', '</a>', html) # Link to local pages and images. # Link to online media. html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url) # MBSP docs (online) html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(using-.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(modeling-.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html) # pages (offline) html = html.replace('src="/media/', 'src="../g/') # images (offline) html = html.replace('src="/sites/all/themes/clips/g/', 'src="../g/') # images (offline) html = html.replace('href="/media/', 'href="%smedia/' % url.replace("pages/", "")) # downloads (online) # Apply the simplified template + set page titles. html = template % (p, url+p, url+p, title, html)
p = "pattern" + p.rstrip("-") title = p.replace("-", ".") if p == "stop-words": title = "Stop words" if p == "mbsp-tags": title = "Penn Treebank II tag set" # Download the online documentation pages. print "Retrieving", url + p html = URL(url + p).download(cached=False) # Parse the actual documentation, we don't need the website header, footer, navigation, search. html = Document(html) html = html.by_id("content-area") html = html.by_class("node-type-page")[0] html = html.source html = strip_javascript(html) html = strip_between('<div id="navbar">', '/#navbar -->', html) html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html) html = strip_between('<div id="footer">', '/#footer -->', html) html = strip_between('<a href="http://twitter.com/share"', '</a>', html) # Link to local pages and images. # Link to online media. html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url) # MBSP docs (online) html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html) # pages (offline) html = html.replace('src="/media/', 'src="../g/') # images (offline) html = html.replace('src="/sites/all/themes/clips/g/', 'src="../g/') # images (offline)
p = "pattern" + p.rstrip("-") title = p.replace("-", ".") if p == "stop-words": title = "Stop words" if p == "mbsp-tags": title = "Penn Treebank II tag set" # Download the online documentation pages. print "Retrieving", url + p html = URL(url + p).download(cached=False) # Parse the actual documentation, we don't need the website header, footer, navigation, search. html = Document(html) html = html.by_id("content-area") html = html.by_class("node-type-page")[0] html = html.source html = strip_javascript(html) html = strip_between('<div id="navbar">', '/#navbar -->', html) html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html) html = strip_between('<div id="footer">', '/#footer -->', html) html = strip_between('<a href="http://twitter.com/share"', '</a>', html) # Link to local pages and images. # Link to online media. html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url) # MBSP docs (online) html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html) # pages (offline) html = html.replace('src="/media/', 'src="../g/') # images (offline) html = html.replace('src="/sites/all/themes/clips/g/', 'src="../g/') # images (offline) html = html.replace('href="/media/', 'href="%smedia/' % url.replace("pages/", "")) # downloads (online) # Apply the simplified template + set page titles. html = template % (p, url+p, url+p, title, html) # Generate offline HTML file. f = codecs.open(os.path.join("html", "%s.html" % p), "w", encoding="utf-8")
# Number of Ratings
page_urls = []
tableRows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
for tr in tableRows[1:]:
    a = tr.by_tag('a')[0]
    page_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))
for p in page_urls:
    p_url = URL(p)
    p_dom = DOM(p_url.download(cached=True))
    title = clean_unicode(p_dom.by_class('header')[0].content)
    title = plaintext(strip_between('<span', '</span>', title))
    runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)
    genres = []
    for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
        genres.append(clean_unicode(genre.content))
    directors = []
    writers = []
    actors = []
    text_blocks = p_dom.by_class('txt-block')[:3]
    for t in text_blocks:
        spans = t.by_tag('span')
        for s in spans:
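The title handling above first cuts the year <span> out of the page header, then flattens the remaining markup with plaintext(). In isolation, with an invented header string (the printed result is the expected behaviour):

from pattern.web import plaintext, strip_between

header = '<a href="/title/tt0000000/">Some Movie Title</a> <span class="nobr">(1994)</span>'
# Drop the "(1994)" span, then strip the remaining tags and surrounding whitespace.
print plaintext(strip_between('<span', '</span>', header))
# expected: Some Movie Title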
def find_name():
    idlink = URL('http://graph.facebook.com/' + userid).download()
    #idlink = URL('http://graph.facebook.com/lilian.xie?fref=ts').download()
    name = strip_between('{', '"name":"', idlink)
    name = strip_between('","user', '}', name)
    return name
def find_id():
    idlink = URL('http://graph.facebook.com/' + userid).download()
    #idlink = URL('http://graph.facebook.com/lilian.xie?fref=ts').download()
    idlink = idlink[7:]
    idnum = strip_between('"', '}', idlink)
    return idnum
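Both helpers carve fields out of the Graph API's JSON response by string position, which breaks as soon as the field order changes. A more robust sketch of the same two lookups parses the response with the json module instead (assuming the same endpoint and module-level `userid` as above; `find_name_and_id` is a hypothetical replacement, not part of the original script):

import json
from pattern.web import URL

def find_name_and_id():
    # The Graph API returns a JSON object; parse it rather than slicing strings.
    data = json.loads(URL('http://graph.facebook.com/' + userid).download())
    return data.get('name'), data.get('id')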