def crawl_start_requests(self):
    """Download every start request produced by the spider.

    The value returned by ``self.spider.start_requests()`` is normalised
    with ``arg_to_iter`` and each resulting request is passed to
    ``self.download``.

    Errors raised while obtaining or iterating the requests, or while
    handing them to ``self.download``, are logged through ``log.err`` and
    not propagated.  Only ``Exception`` subclasses are handled that way, so
    ``KeyboardInterrupt`` and ``SystemExit`` still reach the caller instead
    of being silently logged.
    """
    # process start requests from spider
    try:
        requests = self.spider.start_requests()
        for req in arg_to_iter(requests):
            self.download(req)
    except Exception:
        # Failure() with no arguments captures the exception being handled.
        log.err(Failure(), 'Error when processing start requests.')
def __init__(self, allow=None, deny=None, allow_domains=None, deny_domains=None,
             tags=['a', 'area', 'link'], attrs=['href'], unique=True,
             deny_extensions=None, filter_mobile=True):
    """Store the link-filtering configuration on the instance.

    Every list-like argument is normalised with ``arg_to_iter`` before use.

    allow, deny: patterns, each passed through ``regex()`` and kept in
        ``self.allow_res`` / ``self.deny_res``.
    allow_domains, deny_domains: stored as sets.
    tags, attrs: copied into private lists; ``self.tag_func`` and
        ``self.attr_func`` are membership predicates over those copies.
    unique, filter_mobile: stored unchanged.
    deny_extensions: extensions without the leading dot; when ``None`` the
        module-level ``IGNORED_EXTENSIONS`` is used.  Each entry is stored
        with a ``'.'`` prefix in ``self.deny_extensions``.
    """
    self.allow_res = list(map(regex, arg_to_iter(allow)))
    self.deny_res = list(map(regex, arg_to_iter(deny)))
    self.allow_domains = set(arg_to_iter(allow_domains))
    self.deny_domains = set(arg_to_iter(deny_domains))
    self.unique = unique

    extensions = IGNORED_EXTENSIONS if deny_extensions is None else deny_extensions
    self.deny_extensions = set('.' + ext for ext in extensions)
    self.filter_mobile = filter_mobile

    # Private copies, so the predicates never alias the caller's (or the
    # shared default) list objects.
    tag_list = list(arg_to_iter(tags))
    self.tag_func = lambda x: x in tag_list
    attr_list = list(arg_to_iter(attrs))
    self.attr_func = lambda x: x in attr_list
def test_arg_to_iter(self):
    """Check that ``arg_to_iter`` always yields an iterable with the expected items.

    ``None`` becomes empty, scalars/strings/dicts are wrapped as a single
    element, and lists and generators are iterated as they are.
    """
    # Whatever goes in, the result must expose ``__iter__``.
    iterable_inputs = (None, 100, 'lala', [1, 2, 3], (ch for ch in 'abcd'))
    for value in iterable_inputs:
        self.assertTrue(hasattr(arg_to_iter(value), '__iter__'))

    # (input, expected list of items) pairs.
    expectations = (
        (None, []),
        ('lala', ['lala']),
        (100, [100]),
        ((ch for ch in 'abc'), ['a', 'b', 'c']),
        ([1, 2, 3], [1, 2, 3]),
        ({'a': 1}, [{'a': 1}]),
    )
    for value, expected in expectations:
        self.assertEqual(list(arg_to_iter(value)), expected)
def _handle_spider_output(self, result, request):
    """Download every request contained in a spider callback's output.

    result: ``None``, a single request or an iterable of requests; it is
        normalised with ``arg_to_iter``.
    request: not read by this method.

    Raises ``AssertionError`` as soon as an item that is not a ``Request``
    is encountered; items before it have already been passed to
    ``self.download``.
    """
    for item in arg_to_iter(result):
        # Explicit check instead of an ``assert`` statement so the validation
        # is not stripped when Python runs with -O.  The exception type and
        # message are the same as before.
        if not isinstance(item, Request):
            raise AssertionError(
                'spider must return None, request or iterable of requests')
        self.download(item)
def _handle_slot_output(self, result):
    """Queue the output of a slot for scheduling.

    ``result`` is normalised with ``arg_to_iter`` and an iterator over it
    is appended to ``self._to_schedule``.
    """
    pending = iter(arg_to_iter(result))
    self._to_schedule.append(pending)
def start(self):
    """Start the slot and queue its start requests.

    Calls the parent ``start()``, appends an iterator over the result of
    ``self.start_requests()`` (normalised with ``arg_to_iter``) to
    ``self._to_schedule``, then calls ``self._schedule_requests()``.
    """
    super(DownloadSlot, self).start()
    initial = iter(arg_to_iter(self.start_requests()))
    self._to_schedule.append(initial)
    self._schedule_requests()