Example 1
# PipelineStep and CrawlerWriter are assumed to come from the surrounding crawler project
import urllib2


class CrawlPipelineStep(PipelineStep):
    def __init__(self):
        # jobs_url points to the listing page that links out to the individual job pages
        self.jobs_url = None
        # site is used by write() when constructing the CrawlerWriter; expected to be
        # set by the concrete subclass
        self.site = None

    def read_page(self, url):
        # Retry the request up to 5 times before giving up and returning an empty page
        tries = 0
        while tries < 5:
            try:
                response = urllib2.urlopen(url)
                return response.read()
            except urllib2.URLError:
                tries += 1
        return ""

    def get_pages_count(self, html):
        # Subclasses must implement this to extract the total number of result pages
        raise NotImplementedError()
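
    def get_page_url(self, page_number):
        # Assumed counterpart of get_pages_count (not shown in the original snippet):
        # subclasses build the URL of result page `page_number`, as used by read() below
        raise NotImplementedError()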

    def read(self):
        # Read the first listing page, then fetch the remaining result pages
        jobs_page = self.read_page(self.jobs_url)
        pages = [jobs_page]
        num_pages = self.get_pages_count(jobs_page)

        # For testing, limit the crawl to the first three pages
        num_pages = min(num_pages, 3)

        # Pages are 1-based and page 1 is already read, so fetch pages 2..num_pages
        for i in xrange(2, num_pages + 1):
            print "Reading page %d" % i
            url = self.get_page_url(i)
            pages.append(self.read_page(url))
        return pages

    def write(self, items):
        self.writer = CrawlerWriter('localhost', 'root', 'root', self.site, 'job_agregator')
        for item in items:
            # item[0] = html, item[1] = url
            self.writer.write_job(item[0], item[1])
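
For context, a minimal sketch of how a concrete subclass might plug into this step. ExampleCrawlStep, the "Page 1 of N" pattern, and the example.com URLs are illustrative assumptions, not part of the original code.

import re

class ExampleCrawlStep(CrawlPipelineStep):
    def __init__(self):
        CrawlPipelineStep.__init__(self)
        self.site = 'example.com'
        self.jobs_url = 'http://example.com/jobs'

    def get_pages_count(self, html):
        # Hypothetical: assumes the listing page contains text like "Page 1 of 12"
        match = re.search(r'Page \d+ of (\d+)', html)
        return int(match.group(1)) if match else 1

    def get_page_url(self, page_number):
        # Hypothetical URL pattern for the later result pages
        return 'http://example.com/jobs?page=%d' % page_number

step = ExampleCrawlStep()
pages = step.read()
# write() expects (html, url) pairs; here every page is paired with the listing URL
step.write([(html, step.jobs_url) for html in pages])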
Example 2
    def write(self, items):
        self.writer = CrawlerWriter('localhost', 'root', 'root', self.site, 'job_agregator')
        for item in items:
            # item[0] = html, item[1] = url
            self.writer.write_job(item[0], item[1])