def spider():
    tasks_execution_log = []

    class LoggingTaskRunner(BaseTaskRunner):
        """Base runner that records every call and reacts to task flags."""

        def __call__(self, task):
            tasks_execution_log.append((self, task.id))
            if task.get('abortme', False):
                raise AbortTask()
            if task.get('skipme', False):
                raise SkipRunner()
            if task.get('retryme', False):
                raise RetryTask()
            if task.get('raise_sth', False):
                raise Exception("This is an exception!")
            return iter([])

    class MyTaskRunner(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyTask)

    class MyTaskRunner2(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyTask)

        def __call__(self, task):
            super(MyTaskRunner2, self).__call__(task)
            yield MyOtherTask(task_id='was:' + task.id, previous=task)

    class MyOtherTaskRunner(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyOtherTask)

    runners = {
        0: MyTaskRunner(),
        1: MyTaskRunner2(),
        2: MyOtherTaskRunner(),
    }

    spider = Spider()
    # Register runners in ascending key order.
    spider.add_runners(x[1] for x in sorted(runners.items()))

    # spider.execution_log = tasks_execution_log

    ## We need to pass some extra stuff..
    spider._testing = {
        'execution_log': tasks_execution_log,
        'runners': runners,
    }
    return spider
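# A minimal sketch of how the fixture above might be exercised in a test.
# Assumptions (not confirmed by the framework's API): MyTask accepts a
# `task_id` keyword like MyOtherTask does, and tasks expose the resulting
# id as an `.id` attribute; the test name and values are illustrative only.
def test_runner_chaining_sketch():
    s = spider()
    log = s._testing['execution_log']
    runner2 = s._testing['runners'][1]  # the MyTaskRunner2 instance

    task = MyTask(task_id='task-0')  # assumed constructor signature
    # MyTaskRunner2.__call__ is a generator: consuming it runs the logging
    # in LoggingTaskRunner and yields the follow-up MyOtherTask.
    follow_ups = list(runner2(task))

    assert log[-1] == (runner2, 'task-0')
    assert follow_ups[0].id == 'was:task-0'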
        return True

    def __call__(self, task):
        assert self.match(task)
        response = task['response']
        # cgi.parse_header() returns the bare value string plus a dict of
        # parameters, e.g. ('text/html', {'charset': 'UTF-8'}), so compare
        # the string directly rather than indexing into it.
        content_type, params = cgi.parse_header(
            response['headers'].get('content-type') or 'text/html')
        # content_type = task['response'].headers['Content-type'].split(';')
        if content_type != 'text/html':
            return  # Nothing to do here..
        tree = lxml.html.fromstring(response.content)
        el = tree.xpath('//h1[@id="firstHeading"]')[0]
        yield WikipediaPage(url=task['url'], title=el.text_content())


spider = Spider()
spider.add_runners([
    WikipediaDownloader(),
    WikipediaScraper(),
    LinkExtractor(max_depth=3),
])


if __name__ == '__main__':
    try:
        ## Prepare the storage
        if len(sys.argv) > 1:
            storage = AnydbmStorage(path=sys.argv[1])
        else:
            storage = DictStorage()
        spider.add_runners([storage])