Example #1
def test_extraction(self):
    # Default arguments
    lx = RegexLinkExtractor()
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
    ])
Example #2

def test_extraction(self):
    # Default arguments
    lx = RegexLinkExtractor()
    # RegexLinkExtractor returns links in arbitrary order,
    # so sort them for a stable comparison
    self.assertEqual(sorted(lx.extract_links(self.response), key=lambda x: x.url), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 repetition'),
        Link(url='http://www.google.com/something', text=u''),
    ])
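
Both tests rely on a self.response fixture that the snippets omit. A minimal sketch of how such a fixture could be built with Scrapy's HtmlResponse; the URL and markup below are illustrative, not the fixture the original suite uses:

import unittest

from scrapy.http import HtmlResponse

class RegexLinkExtractorTest(unittest.TestCase):
    def setUp(self):
        # Illustrative page: two relative links with text, one absolute link without
        body = """<html><body>
        <a href="sample2.html">sample 2</a>
        <a href="sample3.html">sample 3 text</a>
        <a href="http://www.google.com/something"></a>
        </body></html>"""
        self.response = HtmlResponse(url='http://example.com/index.html', body=body)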
Example #3
from itertools import chain

# Contrib-era (pre-1.0) Scrapy import paths, matching the extractors used below;
# ScraperItem is the project's own Item subclass (not shown here)
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor
from scrapy.contrib.linkextractors.regex import RegexLinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class BaseSpider(CrawlSpider):
    name = "Base"
    site__id = None
    extractors = None
    allowed_domains = []
    start_urls = []
    session = None
    rules = (
        Rule(RegexLinkExtractor(), callback='parse_item'),
        Rule(SgmlLinkExtractor(), callback='parse_item'),
        Rule(LxmlParserLinkExtractor(), callback='parse_item'),
    )

    def process_results(self, response, results):
        # Append parse_item's output for this response to the callback results
        return chain(results, self.parse_item(response))

    def parse_item(self, response):
        for extractor in self.extractors:
            values = {
                'URL_PROD': response.url,
            }
            extract = {}
            for e in extractor(response):
                extract.update(e)  # TODO: check relevance if overwriting
            # Each extracted value is a list; keep only the first element
            for k, v in extract.iteritems():
                values[k] = v[0]
            name = values.get('NAME_PROD')
            if name:
                yield ScraperItem(name=name, site=self.site__id, values=values.iteritems())
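
BaseSpider leaves extractors unset, and parse_item treats each extractor as a callable that, given a response, yields dicts mapping field names to lists of values (hence the v[0] above). A hypothetical subclass wiring this together; the spider name, XPath, and site id are illustrative only:

def title_extractor(response):
    # Yield dicts of field name -> list of candidate values,
    # the shape parse_item expects
    yield {'NAME_PROD': response.xpath('//h1/text()').extract()}

class ShopSpider(BaseSpider):
    name = 'shop'
    site__id = 1
    allowed_domains = ['shop.example.com']
    start_urls = ['http://shop.example.com/']
    extractors = [title_extractor]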
Example #4
class MySpider(CrawlSpider):
    name = 'example'

    rules = [
        Rule(FallbackLinkExtractor([
            LxmlLinkExtractor(),
            SgmlLinkExtractor(),
            RegexLinkExtractor(),
        ]), callback='parse_page', follow=True),
    ]

    def parse_page(self, response):
        pass

    # Apply the same callback to responses for the start URLs
    parse_start_url = parse_page
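
FallbackLinkExtractor itself is not defined in any of these snippets and is not one of Scrapy's stock link extractors. A minimal sketch of the idea, assuming each wrapped extractor exposes the usual extract_links(response) method: try the extractors in order and return the first non-empty result.

class FallbackLinkExtractor(object):
    """Chain link extractors, using the first one that finds any links."""

    def __init__(self, extractors):
        self.extractors = extractors

    def extract_links(self, response):
        for extractor in self.extractors:
            links = extractor.extract_links(response)
            if links:
                return links
        return []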
Example #5
class MySpider(CrawlSpider):
    name = 'example'
    start_urls = ['http://scrapinghub.com']
    callback_calls = 0

    rules = [Rule(FallbackLinkExtractor([
        LxmlLinkExtractor(),
        RegexLinkExtractor(),
    ]), callback='parse_page', follow=True)]

    def parse_page(self, response):
        self.callback_calls += 1

    def parse_nothing(self, response):
        pass

    # Route start URL responses away from the counting callback
    parse_start_url = parse_nothing
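
This spider reads like a test fixture: only rule-extracted pages increment callback_calls, because the start URL response is routed to parse_nothing. A sketch of how it might be driven, assuming Scrapy's CrawlerProcess; asserting on callback_calls would then happen in whatever harness owns the spider instance:

from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess({'LOG_ENABLED': False})
    process.crawl(MySpider)
    process.start()  # blocks until the crawl finishes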
Example #6
class MySpider(CrawlSpider):
    name = 'recorder'
    start_urls = [
        'http://' + DOMAIN,
    ]
    allowed_domains = [DOMAIN]

    rules = [
        Rule(FallbackLinkExtractor([
            LxmlLinkExtractor(allow=ALLOWED_RE),
            SgmlLinkExtractor(allow=ALLOWED_RE),
            RegexLinkExtractor(allow=ALLOWED_RE),
        ]), callback='parse_page', follow=True),
    ]

    def parse_page(self, response):
        pass
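
Example #6 depends on module-level DOMAIN and ALLOWED_RE constants that the snippet never defines. Plausible placeholder values, purely illustrative:

# Hypothetical values; the original module defines these elsewhere
DOMAIN = 'example.com'
ALLOWED_RE = r'/products/'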