Beispiel #1
0
def get_links_from_page(number_of_pages):
    """Follow the IMDb search listing's 'next' link and collect the URLs.

    Starting from the hard-coded search URL, the page is downloaded, the
    'next' link is read from the second ``span.pagination`` element, and
    the resulting URL becomes the page to download on the next iteration.

    Args:
        number_of_pages: how many listing pages to download and follow.

    Returns:
        The collected 'next page' URLs with duplicates removed, in the
        order they were first encountered.

    NOTE(review): the initial search URL itself is never added to the
    result, only the pages reached from it -- confirm this is intended.
    """
    # Initial search URL (first page of results).
    url = web.URL('http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1990,2012')
    pages = []
    for page_index in range(number_of_pages):
        dom = web.DOM(url.download(cached=False))
        # To see which part of the DOM to use, right click in Chrome
        # and use 'Inspect Element'.
        entry = dom('span.pagination')[1].by_tag('a')
        # The first page has only a 'next' link (index 0); every later page
        # has 'previous' and 'next', so 'next' is at index 1.
        next_link = entry[0] if page_index == 0 else entry[1]
        href = 'http://www.imdb.com/' + next_link.attributes.get('href')
        pages.append(href)
        print(pages)
        url = web.URL(href)
    # Remove duplicates while keeping first-seen order; a plain set() would
    # return the URLs in arbitrary order.
    seen = set()
    unique_pages = []
    for href in pages:
        if href not in seen:
            seen.add(href)
            unique_pages.append(href)
    return unique_pages
Beispiel #2
0
 def test_element(self):
     """Check Element properties and the by_tag/by_class/by_id/by_attribute lookups.

     Parses ``self.html`` and asserts against its <body>: the tag name and
     attributes, the three navigation links, the <p class="comment">
     element, the <div id="content"> element and an href attribute lookup.
     """
     # Assert Element properties (test <body>).
     v = web.DOM(self.html).body
     self.assertEqual(v.tag, "body")
     self.assertEqual(v.attributes["id"], "front")
     self.assertEqual(v.attributes["class"], "comments")
     self.assertTrue(v.content.startswith("\n<script"))
     # Assert Element.getElementsByTagname() (test navigation links).
     a = v.by_tag("a")
     self.assertEqual(len(a), 3)
     self.assertEqual(a[0].content, "nav1")
     self.assertEqual(a[1].content, "nav2")
     self.assertEqual(a[2].content, "nav3")
     # Assert Element.getElementsByClassname() (test <p class="comment">).
     a = v.by_class("comment")
     self.assertEqual(a[0].tag, "p")
     self.assertEqual(a[0].by_tag("span")[0].attributes["class"], "date")
     self.assertEqual(a[0].by_tag("span")[1].attributes["class"], "author")
     # by_tag() is also called with class selectors; each must find the same <p>.
     for selector in (".comment", "p.comment", "*.comment"):
         self.assertEqual(v.by_tag(selector)[0], a[0])
     # Assert Element.getElementById() (test <div id="content">).
     e = v.by_id("content")
     self.assertEqual(e.tag, "div")
     self.assertEqual(e, a[0].parent)
     # by_tag() is also called with id selectors; each must find the same <div>.
     for selector in ("#content", "div#content", "*#content"):
         self.assertEqual(v.by_tag(selector)[0], e)
     # Assert Element.getElementByAttribute() (test on <a href="">).
     a = v.by_attribute(href="nav1.html")
     self.assertEqual(a[0].content, "nav1")
     # print() with a single string argument produces the same output on
     # Python 2 and Python 3; the bare print statement only parses on Python 2.
     print("pattern.web.Node.Element")
     print("pattern.web.Node.Element.by_tag()")
     print("pattern.web.Node.Element.by_class()")
     print("pattern.web.Node.Element.by_id()")
     print("pattern.web.Node.Element.by_attribute()")
Beispiel #3
0
def _extract_movie(movie):
    """Read one movie record from a ``td.title`` element.

    Returns a tuple ``(title, genres, director, first_actor, second_actor,
    runtime, rating)`` where ``genres`` is a list of strings. Raises (e.g.
    IndexError) when one of the expected child elements is missing.
    """
    title = movie.by_tag('a')[0].content
    genres = [g.content for g in movie.by_tag('span.genre')[0].by_tag('a')]
    # The credit links are read in order: director, first actor, second actor.
    credit_links = movie.by_tag('span.credit')[0].by_tag('a')
    director = credit_links[0].content
    first_actor = credit_links[1].content
    second_actor = credit_links[2].content
    runtime = movie.by_tag('span.runtime')[0].content
    rating = movie.by_tag('span.value')[0].content
    return (title, genres, director, first_actor, second_actor, runtime, rating)


def get_data_from_pages(links):
    """Scrape movie records from each listing page in ``links``.

    Args:
        links: iterable of URL strings (e.g. the list returned by
            get_links_from_page).

    Returns:
        A list of tuples ``(title, genres, director, first_actor,
        second_actor, runtime, rating)``. Movies with missing fields and
        pages that fail to download or parse are skipped. Ctrl+C stops the
        scraping early and returns what was collected so far.
    """
    data = []
    for urltext in links:
        url = web.URL(urltext)
        # Progress output ("matrix"-like effect; comment out to speed up).
        # The two spaces reproduce the output of the original
        # `print "Getting data from: ", url` statement.
        print("Getting data from:  %s" % (url,))
        try:
            dom = web.DOM(url.download(cached=False))
            for movie in dom.by_tag('td.title'):
                try:
                    record = _extract_movie(movie)
                except Exception:
                    # Missing data for this movie: skip only this entry
                    # instead of discarding the rest of the page.
                    continue
                for field in record:
                    print(field)
                data.append(record)
        except KeyboardInterrupt:
            break  # allow Ctrl+C to interrupt without losing the data
        except Exception:
            # Best effort: a page that cannot be downloaded or parsed is
            # skipped. Unlike a bare `except:`, SystemExit still propagates.
            continue
    return data
Beispiel #4
0
    def test_selector(self):
        """Exercise CSS selectors against the <p> that carries two classes."""
        body = web.DOM(self.html).body

        # TODO uncomment these!
        # matches = body("p.class1")
        # self.assertEqual(len(matches), 1)
        # self.assertTrue("class1" in matches[0].attributes["class"])

        # matches = body("p.class2")
        # self.assertEqual(len(matches), 1)
        # self.assertTrue("class2" in matches[0].attributes["class"])

        # The combined class selector must find exactly one element whose
        # class attribute contains both class names.
        matches = body("p.class1.class2")
        self.assertEqual(len(matches), 1)
        for class_name in ("class1", "class2"):
            self.assertTrue(class_name in matches[0].attributes["class"])
        target = matches[0]

        # This was previously incorrect
        self.assertEqual([], body("p[class='class1 class2']"))

        # Prefix, suffix and substring attribute selectors: the first hit of
        # each must be the same element.
        for selector in ("p[class^='class1']",
                         "p[class$='class2']",
                         "p[class*='class']"):
            self.assertEqual(target, body(selector)[0])

        # The element is expected as the second hit of the :contains() query.
        self.assertEqual(target, body("p:contains('blah')")[1])
        self.assertTrue(web.Selector("p[class='class1 class2']").match(target))
        print("pattern.web.Selector()")
Beispiel #5
0
 def test_node_traverse(self):
     """Check that Node.traverse() reaches a <span> element in ``self.html``."""
     # Assert Node.traverse() (must visit all child nodes recursively).
     # The flag lives on the test instance so the nested callback can set it.
     self.b = False

     def visit(node):
         # Record that at least one <span> element was visited.
         if node.type == web.ELEMENT and node.tag == "span":
             self.b = True

     v = web.DOM(self.html)
     v.traverse(visit)
     self.assertEqual(self.b, True)
     # print() with a single string argument produces the same output on
     # Python 2 and Python 3; the bare print statement only parses on Python 2.
     print("pattern.web.Node.traverse()")
Beispiel #6
0
 def test_selector(self):
     """Check class selectors against elements carrying one or two classes."""
     v = web.DOM(self.html).body
     # Single-class selectors restricted to <p>: exactly one match each.
     p1 = v("p.class1")
     self.assertEqual(len(p1), 1)
     self.assertTrue("class1" in p1[0].attributes["class"])
     p2 = v("p.class2")
     self.assertEqual(len(p2), 1)
     self.assertTrue("class2" in p2[0].attributes["class"])
     # Tag-less selector requiring both classes: exactly one match.
     p1andp2 = v(".class1.class2")
     self.assertEqual(len(p1andp2), 1)
     self.assertTrue("class1" in p1andp2[0].attributes["class"])
     self.assertTrue("class2" in p1andp2[0].attributes["class"])
     # print() with a single string argument produces the same output on
     # Python 2 and Python 3; the bare print statement only parses on Python 2.
     print("pattern.web.Node.Element()")
Beispiel #7
0
 def test_selector(self):
     """Check DOM CSS selectors against a <p> with multiple classes."""
     # Assert DOM CSS selectors with multiple classes.
     v = web.DOM(self.html).body
     # Each single-class selector must match exactly one <p>.
     p = v("p.class1")
     self.assertEqual(len(p), 1)
     self.assertTrue("class1" in p[0].attributes["class"])
     p = v("p.class2")
     self.assertEqual(len(p), 1)
     self.assertTrue("class2" in p[0].attributes["class"])
     # The combined selector must match exactly one <p> carrying both classes.
     p = v("p.class1.class2")
     self.assertEqual(len(p), 1)
     self.assertTrue("class1" in p[0].attributes["class"])
     self.assertTrue("class2" in p[0].attributes["class"])
     # print() with a single string argument produces the same output on
     # Python 2 and Python 3; the bare print statement only parses on Python 2.
     print("pattern.web.Node.Element()")
Beispiel #8
0
 def test_selector(self):
     """Exercise class, attribute and :contains() selectors on a two-class <p>."""
     body = web.DOM(self.html).body

     # Each single-class selector must match exactly one <p> whose class
     # attribute contains that class name.
     for class_name in ("class1", "class2"):
         matches = body("p." + class_name)
         self.assertEqual(len(matches), 1)
         self.assertTrue(class_name in matches[0].attributes["class"])

     # The combined selector must match exactly one <p> carrying both classes.
     matches = body("p.class1.class2")
     self.assertEqual(len(matches), 1)
     for class_name in ("class1", "class2"):
         self.assertTrue(class_name in matches[0].attributes["class"])
     target = matches[0]

     # Exact, prefix, suffix and substring attribute selectors: the first
     # hit of each must be the same element.
     for selector in ("p[class='class1 class2']",
                      "p[class^='class1']",
                      "p[class$='class2']",
                      "p[class*='class']"):
         self.assertEqual(target, body(selector)[0])

     # The element is expected as the second hit of the :contains() query.
     self.assertEqual(target, body("p:contains('blah')")[1])
     self.assertTrue(web.Selector("p[class='class1 class2']").match(target))
     print("pattern.web.Selector()")