def run_task():
    """Crawl the task's URL and report the links found on that page.

    ``task`` and ``driver`` are not defined here; they are taken from the
    surrounding scope. The function:

    1. sends a TASK_RUNNING status update for the task,
    2. downloads the page named by ``task.data`` and collects the ``href``
       of every ``<a>`` element, resolved against the page URL,
    3. sends the links to the framework as a ``CrawlResult`` message,
    4. sends a TASK_FINISHED status update.

    If the page cannot be downloaded or parsed, a TASK_FAILED status update
    is sent instead, so the task is never left in TASK_RUNNING.
    """
    def send_status(state):
        # Every status update carries this task's id plus the new state.
        update = mesos_pb2.TaskStatus()
        update.task_id.value = task.task_id.value
        update.state = state
        driver.sendStatusUpdate(update)

    print("Running crawl task %s" % task.task_id.value)
    send_status(mesos_pb2.TASK_RUNNING)

    url = task.data
    links = []
    try:
        response = urllib.urlopen(url)
        try:
            source = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()

        soup = BeautifulSoup(source)
        for item in soup.find_all('a'):
            try:
                links.append(urlparse.urljoin(url, item.get('href')))
            except Exception:
                # Not a valid link: skip it and keep the remaining ones.
                continue
    except Exception:
        print("Could not fetch any links from html")
        # Report a terminal state; returning silently would leave the task
        # in TASK_RUNNING.
        send_status(mesos_pb2.TASK_FAILED)
        return

    res = results.CrawlResult(task.task_id.value, url, links)
    message = repr(res)
    driver.sendFrameworkMessage(message)

    print("Sending status update...")
    send_status(mesos_pb2.TASK_FINISHED)
    print("Sent status update")
    return
def frameworkMessage(self, driver, executorId, slaveId, message):
    """
    Callback invoked when an executor sends a message.

    Delivery of these messages is best effort: do not rely on a framework
    message being retransmitted in any reliable fashion.

    The message is decoded as JSON and turned into a ``CrawlResult`` or a
    ``RenderResult`` depending on which executor sent it. Nothing is done
    with the result yet.
    """
    payload = json.loads(message)
    sender = executorId.value

    if sender == crawlExecutor.executor_id.value:
        result = results.CrawlResult(
            payload['taskId'], payload['url'], payload['links'])
        #
        # TODO
        #
    elif sender == renderExecutor.executor_id.value:
        result = results.RenderResult(
            payload['taskId'], payload['url'], payload['imageUrl'])
def frameworkMessage(self, driver, executorId, slaveId, message):
    """Record the result carried by an executor's framework message.

    The message is decoded as JSON. For a message from the crawl executor,
    every (page URL, link) edge is added to ``self.crawlResults``, and each
    link not yet in ``self.processedURLs`` is appended to both the crawl and
    render queues and marked as processed. For a message from the render
    executor, the image URL is stored in ``self.renderResults`` under the
    page URL. Messages from any other executor are ignored.
    """
    payload = json.loads(message)
    sender = executorId.value

    if sender == crawlExecutor.executor_id.value:
        crawl = results.CrawlResult(
            payload['taskId'], payload['url'], payload['links'])
        for target in crawl.links:
            edge = (crawl.url, target)
            print("Appending [%s] to crawl results" % repr(edge))
            self.crawlResults.add(edge)

            # Each URL is queued at most once.
            if target in self.processedURLs:
                continue
            print("Enqueueing [%s]" % target)
            self.crawlQueue.append(target)
            self.renderQueue.append(target)
            self.processedURLs.add(target)
        return

    if sender == renderExecutor.executor_id.value:
        render = results.RenderResult(
            payload['taskId'], payload['url'], payload['imageUrl'])
        rendered = (render.url, render.imageUrl)
        print("Appending [%s] to render results" % repr(rendered))
        self.renderResults[render.url] = render.imageUrl
def frameworkMessage(self, driver, executorId, slaveId, message):
    """Record an executor's result, enforcing the render task limit.

    The message is decoded as JSON. For a message from the crawl executor,
    every (page URL, link) edge is added to ``self.crawlResults``. Before a
    link is considered for queueing, the limit is checked: when
    ``self.maxRenderTasks`` is positive and no larger than the number of
    processed URLs, ``self.renderLimitReached`` is set. A link is appended
    to the crawl and render queues, and marked as processed, only if it is
    new and the limit has not been reached. For a message from the render
    executor, the image URL is stored in ``self.renderResults`` under the
    page URL. Messages from any other executor are ignored.
    """
    payload = json.loads(message)
    sender = executorId.value

    if sender == crawlExecutor.executor_id.value:
        crawl = results.CrawlResult(
            payload['taskId'], payload['url'], payload['links'])
        for target in crawl.links:
            edge = (crawl.url, target)
            print("Appending [%s] to crawl results" % repr(edge))
            self.crawlResults.add(edge)

            # Set the flag the first time the processed-URL count reaches
            # the configured maximum; nothing in this method clears it.
            if not self.renderLimitReached:
                cap = self.maxRenderTasks
                if cap > 0 and cap <= len(self.processedURLs):
                    print("Render task limit (%d) reached" % self.maxRenderTasks)
                    self.renderLimitReached = True

            # Skip links already seen, and everything once the limit is hit.
            if target in self.processedURLs or self.renderLimitReached:
                continue
            print("Enqueueing [%s]" % target)
            self.crawlQueue.append(target)
            self.renderQueue.append(target)
            self.processedURLs.add(target)
        return

    if sender == renderExecutor.executor_id.value:
        render = results.RenderResult(
            payload['taskId'], payload['url'], payload['imageUrl'])
        rendered = (render.url, render.imageUrl)
        print("Appending [%s] to render results" % repr(rendered))
        self.renderResults[render.url] = render.imageUrl