Example #1
0
 def crawl_start_requests(self):
     """Send every start request produced by the spider to ``self.download``.

     Calls ``self.spider.start_requests()``, normalises the result with
     ``arg_to_iter`` and passes each request to ``self.download``. An
     ordinary error raised anywhere in that sequence is reported through
     ``log.err`` and is not re-raised.
     """
     try:
         requests = self.spider.start_requests()
         for req in arg_to_iter(requests):
             self.download(req)
     except Exception:
         # Catch Exception rather than everything: KeyboardInterrupt and
         # SystemExit must propagate so the process can still be stopped.
         log.err(Failure(), 'Error when processing start requests.')
Example #2
0
 def crawl_start_requests(self):
     """Send every start request produced by the spider to ``self.download``.

     Calls ``self.spider.start_requests()``, normalises the result with
     ``arg_to_iter`` and passes each request to ``self.download``. An
     ordinary error raised anywhere in that sequence is reported through
     ``log.err`` and is not re-raised.
     """
     try:
         requests = self.spider.start_requests()
         for req in arg_to_iter(requests):
             self.download(req)
     except Exception:
         # Catch Exception rather than everything: KeyboardInterrupt and
         # SystemExit must propagate so the process can still be stopped.
         log.err(Failure(), 'Error when processing start requests.')
    def __init__(self, allow=None, deny=None,
                 allow_domains=None, deny_domains=None,
                 tags=('a', 'area', 'link'), attrs=('href',), unique=True,
                 deny_extensions=None, filter_mobile=True):
        """Store the filtering options used by this object.

        Every list-like option goes through ``arg_to_iter``, so a single
        value, an iterable of values or ``None`` is accepted.

        :param allow: pattern(s) converted with ``regex()`` into
            ``self.allow_res``.
        :param deny: pattern(s) converted with ``regex()`` into
            ``self.deny_res``.
        :param allow_domains: value(s) stored as the set
            ``self.allow_domains``.
        :param deny_domains: value(s) stored as the set
            ``self.deny_domains``.
        :param tags: value(s) accepted by the ``self.tag_func`` predicate.
        :param attrs: value(s) accepted by the ``self.attr_func`` predicate.
        :param unique: stored unchanged as ``self.unique``.
        :param deny_extensions: extensions without the leading dot; when
            ``None``, ``IGNORED_EXTENSIONS`` is used. Stored dot-prefixed in
            the set ``self.deny_extensions``.
        :param filter_mobile: stored unchanged as ``self.filter_mobile``.
        """
        self.allow_res = [regex(x) for x in arg_to_iter(allow)]
        self.deny_res = [regex(x) for x in arg_to_iter(deny)]
        self.allow_domains = set(arg_to_iter(allow_domains))
        self.deny_domains = set(arg_to_iter(deny_domains))
        self.unique = unique
        if deny_extensions is None:
            deny_extensions = IGNORED_EXTENSIONS
        self.deny_extensions = set('.' + e for e in deny_extensions)
        self.filter_mobile = filter_mobile

        # The defaults are immutable tuples so no list object is shared
        # between calls; a private list copy is taken here so the predicate
        # does not depend on the caller's object afterwards.
        tags = list(arg_to_iter(tags))
        self.tag_func = lambda x: x in tags

        attrs = list(arg_to_iter(attrs))
        self.attr_func = lambda x: x in attrs
    def test_arg_to_iter(self):
        # Whatever is passed in, the result must expose __iter__.
        inputs = (None, 100, 'lala', [1, 2, 3], (l for l in 'abcd'))
        for value in inputs:
            self.assertTrue(hasattr(arg_to_iter(value), '__iter__'))

        # None gives nothing, a string/number/dict comes back as a single
        # item, and a list or generator yields its own elements.
        cases = (
            (None, []),
            ('lala', ['lala']),
            (100, [100]),
            ((l for l in 'abc'), ['a', 'b', 'c']),
            ([1, 2, 3], [1, 2, 3]),
            ({'a': 1}, [{'a': 1}]),
        )
        for value, expected in cases:
            self.assertEqual(list(arg_to_iter(value)), expected)
Example #5
0
 def _handle_spider_output(self, result, request):
     """Send every request contained in a spider result to ``self.download``.

     ``result`` is normalised with ``arg_to_iter`` and each item it yields
     must be a ``Request`` instance. The ``request`` argument is accepted
     but not read by this method.

     Raises ``AssertionError`` when an item is not a ``Request``; items
     preceding the offending one have already been passed to
     ``self.download`` at that point.
     """
     for item in arg_to_iter(result):
         # Raised explicitly instead of using an ``assert`` statement so the
         # check still runs under ``python -O``; the exception type and
         # message are the same as before.
         if not isinstance(item, Request):
             raise AssertionError(
                 'spider must return None, request or iterable of requests')
         self.download(item)
Example #6
0
 def _handle_spider_output(self, result, request):
     """Send every request contained in a spider result to ``self.download``.

     ``result`` is normalised with ``arg_to_iter`` and each item it yields
     must be a ``Request`` instance. The ``request`` argument is accepted
     but not read by this method.

     Raises ``AssertionError`` when an item is not a ``Request``; items
     preceding the offending one have already been passed to
     ``self.download`` at that point.
     """
     for item in arg_to_iter(result):
         # Raised explicitly instead of using an ``assert`` statement so the
         # check still runs under ``python -O``; the exception type and
         # message are the same as before.
         if not isinstance(item, Request):
             raise AssertionError(
                 'spider must return None, request or iterable of requests')
         self.download(item)
Example #7
0
    def test_arg_to_iter(self):
        # Whatever is passed in, the result must expose __iter__.
        inputs = (None, 100, 'lala', [1, 2, 3], (l for l in 'abcd'))
        for value in inputs:
            self.assertTrue(hasattr(arg_to_iter(value), '__iter__'))

        # None gives nothing, a string/number/dict comes back as a single
        # item, and a list or generator yields its own elements.
        cases = (
            (None, []),
            ('lala', ['lala']),
            (100, [100]),
            ((l for l in 'abc'), ['a', 'b', 'c']),
            ([1, 2, 3], [1, 2, 3]),
            ({'a': 1}, [{'a': 1}]),
        )
        for value, expected in cases:
            self.assertEqual(list(arg_to_iter(value)), expected)
Example #8
0
 def _handle_slot_output(self, result):
     """Append an iterator over ``result`` to ``self._to_schedule``.

     ``result`` is first normalised with ``arg_to_iter``.
     """
     pending = iter(arg_to_iter(result))
     self._to_schedule.append(pending)
Example #9
0
 def start(self):
     """Start the slot and queue its start requests.

     Runs the parent class ``start()``, appends an iterator over the output
     of ``self.start_requests()`` (normalised with ``arg_to_iter``) to
     ``self._to_schedule`` and then calls ``self._schedule_requests()``.
     """
     super(DownloadSlot, self).start()
     initial = iter(arg_to_iter(self.start_requests()))
     self._to_schedule.append(initial)
     self._schedule_requests()