Esempio n. 1
0
        def run_task():
            """Crawl the URL carried in ``task.data`` and report the links found.

            ``task`` and ``driver`` are taken from the enclosing scope.  The
            function reports TASK_RUNNING, downloads the page, resolves the
            ``href`` of every ``<a>`` element against the page URL, sends the
            resulting ``results.CrawlResult`` to the framework as a framework
            message and finally reports TASK_FINISHED.

            If the page cannot be downloaded or parsed, TASK_FAILED is
            reported instead, so the task always ends with a terminal status
            update rather than staying in TASK_RUNNING.
            """
            def send_state(state):
                # Build and send a status update for this task.
                update = mesos_pb2.TaskStatus()
                update.task_id.value = task.task_id.value
                update.state = state
                driver.sendStatusUpdate(update)

            print("Running crawl task %s" % task.task_id.value)
            send_state(mesos_pb2.TASK_RUNNING)

            url = task.data

            # Fetching and parsing are the steps that depend on the outside
            # world; any failure here must still yield a terminal status.
            try:
                response = urllib.urlopen(url)
                try:
                    source = response.read()
                finally:
                    response.close()
                anchors = BeautifulSoup(source).find_all('a')
            except Exception as exc:
                print("Could not fetch any links from html: %s" % (exc,))
                send_state(mesos_pb2.TASK_FAILED)
                return

            links = []
            for item in anchors:
                href = item.get('href')
                if not href:
                    # urljoin() returns the base URL for an empty/missing
                    # href, which would record a bogus self-link.
                    continue
                try:
                    links.append(urlparse.urljoin(url, href))
                except Exception:
                    pass  # Not a valid link

            res = results.CrawlResult(task.task_id.value, url, links)
            message = repr(res)
            driver.sendFrameworkMessage(message)

            print("Sending status update...")
            send_state(mesos_pb2.TASK_FINISHED)
            print("Sent status update")
Esempio n. 2
0
    def frameworkMessage(self, driver, executorId, slaveId, message):
        """
          Callback fired when an executor sends a message. Delivery is best
          effort: do not expect a framework message to be retransmitted in
          any reliable fashion.

          The message is decoded as JSON and wrapped in the result type that
          matches the sending executor. The wrapped result is not used yet.
        """
        payload = json.loads(message)
        sender = executorId.value

        if sender == crawlExecutor.executor_id.value:
            result = results.CrawlResult(
                payload['taskId'], payload['url'], payload['links'])
            #
            # TODO (placeholder left in the original; presumably the crawl
            # result is meant to be processed here)
            #
            return

        if sender == renderExecutor.executor_id.value:
            result = results.RenderResult(
                payload['taskId'], payload['url'], payload['imageUrl'])
Esempio n. 3
0
    def frameworkMessage(self, driver, executorId, slaveId, message):
        """
          Record a JSON-encoded result message sent by an executor.

          Crawl executor: every (url, link) edge is added to
          self.crawlResults; a link not yet in self.processedURLs is appended
          to both the crawl queue and the render queue and then marked as
          processed.

          Render executor: the image URL is stored in self.renderResults
          under the URL of the rendered page.

          A message from any other executor is decoded and otherwise ignored.
        """
        payload = json.loads(message)
        sender = executorId.value

        if sender == crawlExecutor.executor_id.value:
            crawl = results.CrawlResult(
                payload['taskId'], payload['url'], payload['links'])
            for target in crawl.links:
                edge = (crawl.url, target)
                print("Appending [%s] to crawl results" % repr(edge))
                self.crawlResults.add(edge)

                # Already seen: the edge is recorded, nothing to enqueue.
                if target in self.processedURLs:
                    continue

                print("Enqueueing [%s]" % target)
                self.crawlQueue.append(target)
                self.renderQueue.append(target)
                self.processedURLs.add(target)
            return

        if sender == renderExecutor.executor_id.value:
            render = results.RenderResult(
                payload['taskId'], payload['url'], payload['imageUrl'])
            rendered = (render.url, render.imageUrl)
            print("Appending [%s] to render results" % repr(rendered))
            self.renderResults[render.url] = render.imageUrl
Esempio n. 4
0
    def frameworkMessage(self, driver, executorId, slaveId, message):
        """
          Record a JSON-encoded result message sent by an executor, honouring
          the render task limit.

          Crawl executor: every (url, link) edge is added to
          self.crawlResults. While handling each link, if self.maxRenderTasks
          is positive and self.processedURLs already holds at least that many
          entries, self.renderLimitReached is set. A link is enqueued for
          crawling and rendering (and marked processed) only if it has not
          been processed before and the limit has not been reached.

          Render executor: the image URL is stored in self.renderResults
          under the URL of the rendered page.

          A message from any other executor is decoded and otherwise ignored.
        """
        payload = json.loads(message)
        sender = executorId.value

        if sender == crawlExecutor.executor_id.value:
            crawl = results.CrawlResult(
                payload['taskId'], payload['url'], payload['links'])
            for target in crawl.links:
                edge = (crawl.url, target)
                print("Appending [%s] to crawl results" % repr(edge))
                self.crawlResults.add(edge)

                # Latch the limit flag the first time the cap is hit; a
                # non-positive maxRenderTasks never trips it.
                if not self.renderLimitReached:
                    limit = self.maxRenderTasks
                    if 0 < limit <= len(self.processedURLs):
                        print("Render task limit (%d) reached" % limit)
                        self.renderLimitReached = True

                # Skip links already handled, and everything once capped.
                if target in self.processedURLs or self.renderLimitReached:
                    continue

                print("Enqueueing [%s]" % target)
                self.crawlQueue.append(target)
                self.renderQueue.append(target)
                self.processedURLs.add(target)
            return

        if sender == renderExecutor.executor_id.value:
            render = results.RenderResult(
                payload['taskId'], payload['url'], payload['imageUrl'])
            rendered = (render.url, render.imageUrl)
            print("Appending [%s] to render results" % repr(rendered))
            self.renderResults[render.url] = render.imageUrl