def test_request_extractor(self):
        """Extraction without processors must produce the expected requests.

        Runs a single ``SgmlRequestExtractor`` over ``self.response`` and
        compares the generated requests with ``self.requests`` using
        ``self._equal_requests_list``.
        """
        extractors = [SgmlRequestExtractor()]

        # Extract all requests: the processor list is empty, so no
        # processors are applied to what the extractor returns.
        reqgen = RequestGenerator(extractors, [], callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        # assertTrue replaces the deprecated failUnless alias.
        self.assertTrue(self._equal_requests_list(requests, self.requests))
    def test_request_extractor(self):
        """Extraction without processors must produce the expected requests.

        Runs a single ``SgmlRequestExtractor`` over ``self.response`` and
        compares the generated requests with ``self.requests`` using
        ``self._equal_requests_list``.
        """
        extractors = [SgmlRequestExtractor()]

        # Extract all requests: the processor list is empty, so no
        # processors are applied to what the extractor returns.
        reqgen = RequestGenerator(extractors, [], callback=self.deferred)
        requests = reqgen.generate_requests(self.response)
        # assertTrue replaces the deprecated failUnless alias.
        self.assertTrue(self._equal_requests_list(requests, self.requests))
    def test_request_processor(self):
        """Check the generated requests under several processor pipelines.

        Every stage builds a new ``RequestGenerator`` with newly created
        processor instances and runs it over ``self.response``:

        1. ``Canonicalize`` + ``FilterDupes``: the result must match
           ``self.requests`` (per ``self._equal_requests_list``).
        2. Adding ``FilterDomain(deny='example.org')``: no requests.
        3. Adding ``FilterUrl(deny=(r'about', r'othercat'))``: two requests
           are expected.
        4. Adding ``FilterUrl(allow=r'/somepage/')``: one request is
           expected.
        """
        extractors = [SgmlRequestExtractor()]

        def generate(processors):
            # One generator per stage, always fed the same fixture response.
            reqgen = RequestGenerator(extractors, processors,
                                      callback=self.deferred)
            return reqgen.generate_requests(self.response)

        # baseline pipeline
        requests = generate([
            Canonicalize(),
            FilterDupes(),
            ])
        self.assertTrue(self._equal_requests_list(requests, self.requests))

        # filter domain
        requests = generate([
            Canonicalize(),
            FilterDupes(),
            FilterDomain(deny='example.org'),
            ])
        self.assertEqual(list(requests), [])

        # filter url (deny patterns)
        requests = generate([
            Canonicalize(),
            FilterDupes(),
            FilterUrl(deny=(r'about', r'othercat')),
            ])
        self.assertTrue(self._equal_requests_list(requests, [
                Request('http://example.org/somepage/item/12.html',
                        meta={'link_text': 'Item 12'}),
                Request('http://example.org/',
                        meta={'link_text': ''}),
                ]))

        # filter url (allow pattern)
        requests = generate([
            Canonicalize(),
            FilterDupes(),
            FilterUrl(allow=r'/somepage/'),
            ])
        self.assertTrue(self._equal_requests_list(requests, [
                Request('http://example.org/somepage/item/12.html',
                        meta={'link_text': 'Item 12'}),
                ]))
 def test_basic(self):
     """A generator with no extractors and no processors yields nothing."""
     reqgen = RequestGenerator([], [], callback=self.deferred)
     # The result is materialized with list() before comparing; the
     # original comment notes that generate_requests returns a generator.
     requests = reqgen.generate_requests(self.response)
     # assertEqual replaces the deprecated failUnlessEqual alias.
     self.assertEqual(list(requests), [])
    def test_request_processor(self):
        """Check the generated requests under several processor pipelines.

        Every stage builds a new ``RequestGenerator`` with newly created
        processor instances and runs it over ``self.response``:

        1. ``Canonicalize`` + ``FilterDupes``: the result must match
           ``self.requests`` (per ``self._equal_requests_list``).
        2. Adding ``FilterDomain(deny='example.org')``: no requests.
        3. Adding ``FilterUrl(deny=(r'about', r'othercat'))``: two requests
           are expected.
        4. Adding ``FilterUrl(allow=r'/somepage/')``: one request is
           expected.
        """
        extractors = [SgmlRequestExtractor()]

        def generate(processors):
            # One generator per stage, always fed the same fixture response.
            reqgen = RequestGenerator(extractors,
                                      processors,
                                      callback=self.deferred)
            return reqgen.generate_requests(self.response)

        # baseline pipeline
        requests = generate([
            Canonicalize(),
            FilterDupes(),
        ])
        self.assertTrue(self._equal_requests_list(requests, self.requests))

        # filter domain
        requests = generate([
            Canonicalize(),
            FilterDupes(),
            FilterDomain(deny='example.org'),
        ])
        self.assertEqual(list(requests), [])

        # filter url (deny patterns)
        requests = generate([
            Canonicalize(),
            FilterDupes(),
            FilterUrl(deny=(r'about', r'othercat')),
        ])
        self.assertTrue(
            self._equal_requests_list(requests, [
                Request('http://example.org/somepage/item/12.html',
                        meta={'link_text': 'Item 12'}),
                Request('http://example.org/', meta={'link_text': ''}),
            ]))

        # filter url (allow pattern)
        requests = generate([
            Canonicalize(),
            FilterDupes(),
            FilterUrl(allow=r'/somepage/'),
        ])
        self.assertTrue(
            self._equal_requests_list(requests, [
                Request('http://example.org/somepage/item/12.html',
                        meta={'link_text': 'Item 12'}),
            ]))
 def test_basic(self):
     """A generator with no extractors and no processors yields nothing."""
     reqgen = RequestGenerator([], [], callback=self.deferred)
     # The result is materialized with list() before comparing; the
     # original comment notes that generate_requests returns a generator.
     requests = reqgen.generate_requests(self.response)
     # assertEqual replaces the deprecated failUnlessEqual alias.
     self.assertEqual(list(requests), [])