Example 1
# Imports assumed for this example (legacy Scrapy / Python 2 API):
from itertools import chain

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.regex import RegexLinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor


class BaseSpider(CrawlSpider):
    name = "Base"
    site__id = None
    extractors = None
    allowed_domains = []
    start_urls = []
    session = None
    rules = (
        Rule(RegexLinkExtractor(), callback='parse_item'),
        Rule(SgmlLinkExtractor(), callback='parse_item'),
        Rule(LxmlParserLinkExtractor(), callback='parse_item'),
    )

    def process_results(self, response, results):
        # Also run parse_item on the page that produced the rule results.
        return chain(results, self.parse_item(response))

    def parse_item(self, response):
        for extractor in self.extractors:
            values = {
                'URL_PROD': response.url,
            }
            extract = {}
            for e in extractor(response):
                extract.update(e)  # TODO: check relevance if overwriting
            for k, v in extract.iteritems():  # Python 2; use .items() on Python 3
                values[k] = v[0]
            name = values.get('NAME_PROD')
            if name:
                # ScraperItem is assumed to be defined in the project's items module.
                yield ScraperItem(name=name, site=self.site__id, values=values.iteritems())
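
Example 1 never fills in extractors or site__id, so the snippet below is a
hypothetical subclass showing how they could be supplied; the ExampleSpider
name, the example.com URLs and the xpath_extractor helper are illustrative
assumptions, not part of the original project.

def xpath_extractor(response):
    # Each extractor is called with the response and yields {FIELD: [values]}
    # dicts, which BaseSpider.parse_item merges and flattens via values[k] = v[0].
    name = response.xpath('//h1/text()').extract()
    if name:
        yield {'NAME_PROD': name}
    price = response.xpath('//span[@class="price"]/text()').extract()
    if price:
        yield {'PRICE_PROD': price}


class ExampleSpider(BaseSpider):
    name = "Example"
    site__id = 1
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com/"]
    extractors = [xpath_extractor]
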
Example 2

    def __init__(self, skip_text=False):
        # FilteringLinkExtractor and LxmlParserLinkExtractor are assumed to be
        # imported from Scrapy's linkextractors package (scrapy.contrib.linkextractors
        # in older releases); this __init__ belongs to a wrapper class not shown here.
        lx = LxmlParserLinkExtractor()
        self.linkextractor = FilteringLinkExtractor(lx,
                                                    allow=(),
                                                    deny=(),
                                                    allow_domains=(),
                                                    deny_domains=(),
                                                    restrict_xpaths=(),
                                                    canonicalize=True,
                                                    deny_extensions=None)
        self.skip_text = skip_text
Example 3
    # Assumed imports for this example: scrapy, tldextract, and
    # LxmlParserLinkExtractor from Scrapy's linkextractors package; Page is a
    # project-specific item/model populated at the end of the method.
    def parse(self, response):
        ret = []
        title = "".join(
            response.xpath(".//span[@id='sites-page-title']/text()").extract())
        # print "Page Title", title
        url = response.url
        path = url.replace(" ", "").replace("/", "_").replace(":", "")

        content = response.xpath(".//*[@id='sites-canvas-main-content']/table")
        links = []
        domains = []
        # print "Page Content", content

        l = LxmlParserLinkExtractor()  # instantiated here but not used below
        for e in content:
            # Note: the leading '//' makes this XPath absolute, so it collects
            # href attributes from the whole document, not just the table.
            for x in e.xpath('//*//@href'):
                y = x.extract()
                links.append(y)
                domres = tldextract.extract(y)
                sd = domres.subdomain
                dm = domres.domain
                tld = domres.suffix
                if dm:
                    # print domres
                    d = "{0}.{1}.{2}".format(sd, dm, tld)
                    if d not in domains:
                        domains.append(d)

        content = "".join(content.extract())
        p = Page(title=title,
                 path=path,
                 url=url,
                 content=content,
                 links=links,
                 domains=domains)
        ret.append(p)

        # spider the other local pages
        for sel in response.xpath('//ul/li/div'):
            title = sel.xpath('a/text()').extract()
            link = sel.xpath('a/@href').extract()
            desc = sel.xpath('text()').extract()
            if link:
                # self.base is assumed to be the site's base URL, set elsewhere.
                l = self.base + link[0]
                # print title, l, desc
                ret.append(scrapy.http.Request(l))

        return ret
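
The Page model used above is not shown; a minimal sketch of it as a Scrapy item
with the fields parse() populates might look like the following (an assumption,
the original project may use a different class entirely).

import scrapy


class Page(scrapy.Item):
    # Fields match the keyword arguments used in parse() above.
    title = scrapy.Field()
    path = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()
    links = scrapy.Field()
    domains = scrapy.Field()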