def test_abs(self): # Assert absolute URL (special attention for anchors). for a, b in (("../page.html", "http://domain.com/path/"), ("page.html", "http://domain.com/home.html")): v = web.abs(a, base=b) self.assertEqual(v, "http://domain.com/page.html") for a, b, c in (("#anchor", "http://domain.com", "/"), ("#anchor", "http://domain.com/", ""), ("#anchor", "http://domain.com/page", "")): v = web.abs(a, base=b) self.assertEqual(v, b + c + a) # http://domain.com/#anchor print "pattern.web.abs()"
def test_abs(self): # Assert absolute URL (special attention for anchors). for a, b in (("../page.html", "http://domain.com/path/"), ("page.html", "http://domain.com/home.html")): v = web.abs(a, base=b) self.assertEqual(v, "http://domain.com/page.html") for a, b, c in ( ("#anchor", "http://domain.com", "/"), ("#anchor", "http://domain.com/", ""), ("#anchor", "http://domain.com/page", ""), ): v = web.abs(a, base=b) self.assertEqual(v, b + c + a) # http://domain.com/#anchor print "pattern.web.abs()"
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL to a
        movie's page on IMDB (scheme + domain + path).
    '''
    # Relative hrefs are resolved against the IMDB home page.
    home = URL("http://www.imdb.com/")
    resolve_base = home.redirect or home.string
    document = DOM(url.download())
    # One absolute URL per <a> inside each <td class="titleColumn">.
    return [abs(anchor.attrs.get("href", ""), base=resolve_base)
            for cell in document.by_tag("td.titleColumn")
            for anchor in cell.by_tag("a")]
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page
        on IMDB; each URL is absolute (http part, domain part, path part).
    """
    movie_urls = []
    # Resolve relative hrefs against the IMDB root.
    imdb_root = URL("http://www.imdb.com")
    resolve_base = imdb_root.redirect or imdb_root.string
    page_dom = DOM(url.download(cached=True))
    for title_cell in page_dom("td.titleColumn"):
        for anchor in title_cell("a"):
            href = anchor.attributes.get("href", "")
            movie_urls.append(abs(href, base=resolve_base))
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL to a
        movie's page on IMDB (scheme + domain + path).
    '''
    from pattern.web import abs
    movie_urls = []
    # BUG FIX: the original rebound `url` to a hard-coded chart address,
    # silently ignoring the URL instance the caller passed in.
    dom = DOM(url.download(cached=True))
    for cell in dom.by_tag("td.titleColumn")[:250]:  # cap at the top 250 rows
        for link in cell.by_tag("a"):
            href = link.attrs.get("href", "")
            # Resolve relative hrefs against the (possibly redirected) page URL.
            movie_urls.append(abs(href, base=url.redirect or url.string))
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL to a
        movie's page on IMDB (scheme + domain + path).
    '''
    from pattern.web import abs
    movie_urls = []
    dom = DOM(url.download())
    # Resolve relative hrefs against the page the DOM was downloaded from,
    # instead of a second hard-coded URL as the original did.
    base = url.redirect or url.string
    for cell in dom.by_tag("td.titleColumn"):
        anchors = cell.by_tag("a")
        if not anchors:
            continue  # robustness: skip title cells without a link
        href = anchors[0].attrs.get("href", "")
        # BUG FIX: the original computed the absolute URL but never
        # appended it, so the function always returned an empty list.
        movie_urls.append(abs(href, base=base))
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page
        on IMDB; all URLs are absolute.
    '''
    from pattern.web import abs
    resolve_base = url.redirect or url.string
    page = DOM(url.download(cached=True))
    movie_urls = []
    # Walk <tbody class="lister-list"> -> <td class="titleColumn"> -> <a>.
    for listing in page.by_tag("tbody.lister-list"):
        for title_cell in listing.by_tag("td.titleColumn"):
            for anchor in title_cell.by_tag("a"):
                href = anchor.attrs.get("href", "")
                movie_urls.append(abs(href, base=resolve_base))
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def getImages(self):
    # Collect the absolute URL of every <img> on the page, resolving
    # relative src attributes against the (possibly redirected) page URL.
    base = self.url.redirect or self.url.string
    return [abs(image.attributes.get('src', ''), base=base)
            for image in self.dom('img')]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL to a
        movie's page on IMDB (scheme + domain + path).
    '''
    movie_urls = []
    url = URL(url)
    dom = DOM(url.download(cached=True))
    for cell in dom('.titleColumn'):
        for link in cell('a'):
            # BUG FIX: abs() was called without base=..., so relative
            # hrefs (e.g. "/title/tt0111161/") were returned unchanged
            # instead of being made absolute as the contract requires.
            # Also default missing hrefs to "" rather than None.
            movie_urls.append(abs(link.attributes.get('href', ''),
                                  base=url.redirect or url.string))
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def convert_to_abs(self, link):
    """Converts a relative URL to an absolute url.

    e.g. '/biz/hyatt-#hrid:123' --> 'http://www.hipadvisor.com/biz/hyatt-#hrid123'
    """
    # Resolve against this object's page URL.
    return abs(link, base=URL(self.url).string)
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL to a
        movie's page on IMDB (scheme + domain + path).
    '''
    movie_urls = []
    url = URL(url)
    dom = DOM(url.download(cached=True))
    for element in dom('.titleColumn'):
        for link in element('a'):
            # BUG FIX: the original called abs() with no base=..., which
            # leaves relative hrefs untouched and so returns non-absolute
            # URLs, breaking the documented contract. Default missing
            # hrefs to "" rather than None.
            movie_urls.append(abs(link.attributes.get('href', ''),
                                  base=url.redirect or url.string))
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def getScripts(self):
    # External scripts are reported by absolute URL; inline scripts
    # (no src attribute) are reported by their full tag text.
    base = self.url.redirect or self.url.string
    collected = []
    for tag in self.dom('script'):
        source = tag.attributes.get('src', '')
        collected.append(abs(source, base=base) if source else str(tag))
    return collected
def scrape_heat_urls(dom):
    # Collect, per day, the absolute URL of every heat page linked from
    # that day's results table.
    # NOTE(review): relies on a module-level `url` (the page this DOM was
    # downloaded from) to resolve relative hrefs -- confirm it is defined.
    days = []
    for i in range(1, 3):  # tables at index 1 and 2 hold the day listings -- TODO confirm
        temp = dom.by_tag("table")[i]
        day = temp.by_tag("th")[1].content  # day label; currently unused
        rows = temp.by_tag("tr")[1:]  # skip the header row
        heat_urls = []
        for row in rows:
            # First link in each row points at the heat page.
            partial_url = row.by_tag("a")[0]
            heat_url = abs(partial_url.attributes.get('href', ''),
                           base=url.redirect or url.string)
            heat_urls.append(heat_url)
        days.append(heat_urls)
    return days
def getLinks(self):
    # Lazily build child WebPage objects, one per link on this page.
    # Without downloaded content there is nothing to parse.
    if self.content is None:
        return self.links
    if not self.links:
        base = self.url.redirect or self.url.string
        found = HTMLLinkParser().parse(self.content, url=self.url.string)
        absolute = [abs(item.url, base=base) for item in found]
        self.links = [WebPage(target, self, depth=self.depth + 1)
                      for target in absolute]
    return self.links
def all_lyrics(artist):
    """Return a JSON string of (song title, lyrics) pairs for *artist*,
    scraped from the artist's song-index page and each song page."""
    # The site's URLs drop whitespace and apostrophes from the artist name.
    clean = re.sub(r"\s+|'", '', artist)
    # Index pages are grouped by the artist's first letter.
    url = URL(BASE_URL + artist[0] + '/' + clean + '.html')
    dom = DOM(url.download())
    titles = [a.content for a in dom('div#listAlbum a')]
    # Absolute hrefs for every album-list link (some point at Amazon ads).
    ew_amazon = [
        abs(link.attributes.get('href', ''), base=url.redirect or url.string)
        for link in dom('div#listAlbum a')
    ]
    songlinks = [l for l in ew_amazon if 'amazon' not in l]
    # NOTE(review): `titles` keeps entries for the amazon links filtered
    # out of `songlinks`, so titles and lyrics may fall out of alignment
    # when such links are present -- verify against the live page.
    lyrics = []
    for link in songlinks:
        song_url = URL(link)
        song_dom = DOM(song_url.download())
        # Assumes the 5th div under div#main holds the lyric text -- TODO confirm.
        lyrics.append(plaintext(song_dom('div#main div')[4:5][0].content))
    zippy_lyrics = zip(titles, lyrics)
    return json.dumps(zippy_lyrics, sort_keys=True)
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL to a
        movie's page on IMDB (scheme + domain + path).
    '''
    dom = DOM(url.download())
    # BUG FIX: pattern.web elements expose attributes via .attrs /
    # .attributes, not .attr, so the original raised AttributeError.
    # Also resolve against the redirected URL when the page redirected.
    return [abs(a.attrs['href'], base=url.redirect or url.string)
            for a in dom('.lister-list > tr > td.titleColumn > a')]
def scrape_top_250(url):
    """
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the URL to a movie's page
        on IMDB, note that these URLS must be absolute (i.e. include the
        http part, the domain part and the path part).
    """
    movie_urls = []
    dom = DOM(url.download(cached=True))
    allurls = dom.get_elements_by_classname("titleColumn")
    for oneurl in allurls:
        # NOTE(review): oneurl[1] indexes the cell's child nodes and
        # assumes the <a> is the second child of each title column --
        # confirm against the current IMDB markup.
        link = abs(oneurl[1].attrs.get("href", ""),
                   base=url.redirect or url.string)
        movie_urls.append(link)
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL to a
        movie's page on IMDB (scheme + domain + path).
    '''
    resolve_base = url.redirect or url.string
    page = DOM(url.download(cached=True))
    # One absolute URL per <a> inside each <td class="titleColumn">.
    return [abs(anchor.attributes.get('href', ''), base=resolve_base)
            for title_cell in page('td.titleColumn')
            for anchor in title_cell.by_tag('a')]
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.

    Args:
        url: pattern.web.URL instance pointing to the top 250 index page

    Returns:
        A list of strings, where each string is the absolute URL to a
        movie's page on IMDB (scheme + domain + path).
    '''
    from pattern.web import abs
    movie_urls = []
    # BUG FIX: the original rebound `url` to a hard-coded chart address,
    # silently ignoring the URL instance the caller passed in.
    for cell in DOM(url.download()).by_tag('td.titleColumn'):
        anchor = cell.by_tag('a')[0]  # first link in the title cell
        href = anchor.attrs.get('href', "")
        # Resolve relative hrefs against the (possibly redirected) page URL.
        movie_urls.append(abs(href, base=url.redirect or url.string))
    # return the list of URLs of each movie's page on IMDB
    return movie_urls
# NOTE(review): tutorial-style script fragment; `dom` is built by code
# outside this view.
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print
# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    #print link
# The DOM object is a tree of Element and Text objects.
# All objects inherit from Node, DOM also inherits from Element.
# Node.type     => NODE, TEXT, COMMENT, ELEMENT, DOM
# Node.parent   => Parent Node object.
# Node.children => List of child Node objects.
# Node.next     => Next Node in Node.parent.children.
# Node.previous => Previous Node in Node.parent.children.
# DOM.head      => Element with tag name "head".
# DOM.body      => Element with tag name "body".
# Element.tag   => Element tag name, e.g. "body".
#With the movie links, scrape each entry #You will get the the following items: #Produce a comma-separated text file (use semicolons to separate the entries) with a header row and the fields: # Title of movie # Runtime # Genre (separated by semicolons if multiple) # Director(s) # Writer(s) # Actors (listed on the page directly only or first three, separated by semicolons) # Ratings # Number of Ratings allElements = dom.by_tag("a") for e in allElements: movieTitleLinks = re.match("http://www.imdb.com/title/.*", abs(e.attributes.get('href',''), base=url.redirect or url.string)) # Follow the links if(movieTitleLinks): movieUrl = URL(movieTitleLinks.group(0)) movieDom = DOM(movieUrl.download(cached=True)) #======================================================================= # Get the title #======================================================================= for movie in movieDom.by_tag("title"): title = re.sub(' \(\d+\) - IMDb','', movie.content.encode('ascii','ignore').strip())
# Get the DOM object to scrape for movie links. [Hint: Use absolute URL's. # Documentation can be found here: http://www.clips.ua.ac.be/pages/pattern-web] url = URL("http://www.opentable.com/promo.aspx?m=7&ref=470&pid=90") dom = DOM(url.download(cached=True)) for restaraunt in dom.by_class("ResultRow"): name = restaraunt.by_class("ReCol")[0].by_class("rinfo")[0].by_tag("a")[0].content.encode( 'ascii', 'ignore' ) neighborhood_cuisine = restaraunt.by_class("ReCol")[0].by_class("rinfo")[0].by_class("d")[0].content.encode( 'ascii', 'ignore' ) neihgborhood_cuisine = neighborhood_cuisine.split('|') neighborhood = neihgborhood_cuisine[0] cuisine = neihgborhood_cuisine[1] meals = restaraunt.by_class("ReCol")[0].by_class("rinfo")[0].by_class("message")[0].content.encode( 'ascii', 'ignore' ) meals = meals.split('<') # need to clean meals = meals[0] restURL = URL(abs(restaraunt.by_class("ReCol")[0].by_class("rinfo")[0].by_tag("a")[0].attributes.get('href',''), base=url.redirect or url.string)) restDOM = DOM(restURL.download(cached=True)) # need to clean address = restDOM.by_id("ProfileOverview_lblAddressText").content price = restDOM.by_id("ProfileOverview_lblPriceText").content try: ratings = restDOM.by_id("RestPopLabel_ReviewsFormat")[0].attributes ratings = ratings['title'] except TypeError: ratings = 'not available' style = restDOM.by_id("ProfileOverview_DiningStyle").by_class("value")[0].content try: website = restDOM.by_id("ProfileOverview_Website").by_tag("a")[0].content except AttributeError: website = "not available" phone = restDOM.by_id("ProfileOverview_Phone").by_class("value")[0].content
# NOTE(review): fragment of a larger scraping loop -- `g`, `ff` (a
# Selenium driver), `writer`, `output` and `url` are defined by
# surrounding code not visible here; indentation is a best-effort
# reconstruction.
name = name.encode('ascii', 'ignore')
# For the multi-match fields below, the last matching element wins.
for j in g.by_class('category infoItem')[0:]:
    category = j.content
    category = plaintext(category)
    category = category.encode('ascii', 'ignore')
if (g.by_class('tags infoItem')):
    tag = g.by_class('tags infoItem')[0].content
    tag = tag.encode('ascii', 'ignore')
else:
    tag = " "
for k in g.by_class('visits')[0:]:
    visits = k.content
    visits = visits[0:-6]  # strip a fixed 6-char suffix -- TODO confirm what it is
    visits = visits.encode('ascii', 'ignore')
for l in g.by_class('description')[0:]:
    description = l.content
    description = description.encode('ascii', 'ignore')
for link in g.by_tag('a')[1:2]:  # second link only
    links = abs(link.attributes.get('href', ''), base=url.redirect or url.string)
    ff.get(links + "/about")
    element = ff.find_element_by_class_name("row_count")
    time.sleep(8)  # give the dynamic page time to populate before reading
    element_text = element.text
## Write each row to the file
writer.writerow([
    name, category, tag, visits, description, links, element_text
])
output.close()
# NOTE(review): tutorial-style script fragment; `dom` is built by code
# outside this view.
for e in dom.by_tag("div._1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah")[:5]:  # Top 5 reddit entries.
    for a in e.by_tag("a.SQnoC3ObvgnGjWt90zD9Z")[:1]:  # First title link in the entry.
        print(plaintext(a.content))
        print(a.attrs["href"])
        print("")
# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attrs.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    print(link)
# The DOM object is a tree of nested Element and Text objects.
# All objects inherit from Node (check the source code).
# Node.type     : NODE, TEXT, COMMENT, ELEMENT or DOM
# Node.parent   : Parent Node object.
# Node.children : List of child Node objects.
# Node.next     : Next Node in Node.parent.children.
# Node.previous : Previous Node in Node.parent.children.
# DOM.head      : Element with tag name "head".
# DOM.body      : Element with tag name "body".
# Element.tag   : Element tag name, e.g. "body".
# NOTE(review): fragment of a larger scraping loop -- `h`, `g`, `ff` (a
# Selenium driver), `writer`, `output` and `url` are defined by
# surrounding code not visible here; indentation is a best-effort
# reconstruction.
name = h.content
name = plaintext(name)
name = name.encode('ascii', 'ignore')
# For the multi-match fields below, the last matching element wins.
for j in g.by_class('category infoItem')[0:]:
    category = j.content
    category = plaintext(category)
    category = category.encode('ascii', 'ignore')
if (g.by_class('tags infoItem')):
    tag = g.by_class('tags infoItem')[0].content
    tag = tag.encode('ascii', 'ignore')
else:
    tag = " "
for k in g.by_class('visits')[0:]:
    visits = k.content
    visits = visits[0:-6]  # strip a fixed 6-char suffix -- TODO confirm what it is
    visits = visits.encode('ascii', 'ignore')
for l in g.by_class('description')[0:]:
    description = l.content
    description = description.encode('ascii', 'ignore')
for link in g.by_tag('a')[1:2]:  # second link only
    links = abs(link.attributes.get('href', ''), base=url.redirect or url.string)
    ff.get(links + "/about")
    element = ff.find_element_by_class_name("row_count")
    time.sleep(8)  # give the dynamic page time to populate before reading
    element_text = element.text
## Write each row to the file
writer.writerow([name, category, tag, visits, description, links, element_text])
output.close()
#!/usr/bin/env python