class BaseSpider(CrawlSpider):
    """Generic crawl spider: runs a set of field extractors over each crawled
    page and yields a ``ScraperItem`` for every page that produced a product
    name (``NAME_PROD``)."""

    name = "Base"
    site__id = None    # site identifier, set by subclasses
    extractors = None  # iterable of callables: response -> iterable of dicts
    allowed_domains = []
    start_urls = []
    session = None

    # NOTE(review): three overlapping link extractors schedule many duplicate
    # links; scrapy's dupefilter drops repeats, but a single extractor is
    # usually sufficient -- confirm before trimming.
    rules = (
        Rule(RegexLinkExtractor(), callback='parse_item'),
        Rule(SgmlLinkExtractor(), callback='parse_item'),
        Rule(LxmlParserLinkExtractor(), callback='parse_item'),
    )

    def process_results(self, response, results):
        # Also run parse_item on pages the rules matched as link sources,
        # not only on the linked-to targets.
        return chain(results, self.parse_item(response))

    def parse_item(self, response):
        """Apply every configured extractor to *response* and yield items.

        Fix: the class default for ``extractors`` is None, so a subclass
        that forgets to set it would previously raise TypeError here; an
        unset/empty extractor list now simply yields nothing.
        """
        for extractor in (self.extractors or ()):
            values = {
                'URL_PROD': response.url,
            }
            extract = {}
            for e in extractor(response):
                extract.update(e)
            # TODO: check relevance if overwriting
            for k, v in extract.iteritems():
                # Extractors return lists of captured values; keep the first.
                # NOTE(review): raises IndexError on an empty list -- confirm
                # extractors never emit empty value lists.
                values[k] = v[0]
            name = values.get('NAME_PROD')
            if name:
                yield ScraperItem(name=name, site=self.site__id,
                                  values=values.iteritems())
def __init__(self, skip_text=False):
    """Build a filtering link extractor around an lxml-based parser.

    :param skip_text: stored flag; when true, link text is skipped
        downstream.
    """
    inner = LxmlParserLinkExtractor()
    self.linkextractor = FilteringLinkExtractor(
        inner,
        allow=(),
        deny=(),
        allow_domains=(),
        deny_domains=(),
        restrict_xpaths=(),
        canonicalize=True,
        deny_extensions=None,
    )
    self.skip_text = skip_text
def parse(self, response):
    """Build one Page record for this response and schedule local follow-ups.

    Returns a list holding a ``Page`` (title, path, content, links, unique
    domains) followed by ``Request`` objects for links found in the page's
    navigation list.

    Fixes: removed a dead ``LxmlParserLinkExtractor()`` local that was
    created and never used, unused ``title``/``desc`` locals in the
    sidebar loop, and commented-out debug prints.
    """
    ret = []
    title = "".join(
        response.xpath(".//span[@id='sites-page-title']/text()").extract())
    url = response.url
    # Derive a filesystem-safe path from the URL.
    path = url.replace(" ", "").replace("/", "_").replace(":", "")
    content = response.xpath(".//*[@id='sites-canvas-main-content']/table")
    links = []
    domains = []
    for e in content:
        # NOTE(review): '//*//@href' is an absolute XPath, so each table
        # element re-scans the whole document (duplicating links when there
        # are several tables); './/@href' would scope to the table --
        # confirm intended output before changing.
        for x in e.xpath('//*//@href'):
            y = x.extract()
            links.append(y)
            domres = tldextract.extract(y)
            sd = domres.subdomain
            dm = domres.domain
            tld = domres.suffix
            if dm:
                # May yield a leading dot when the subdomain is empty
                # (e.g. ".example.com") -- presumably tolerated downstream.
                d = "{0}.{1}.{2}".format(sd, dm, tld)
                if d not in domains:
                    domains.append(d)
    content = "".join(content.extract())
    p = Page(title=title, path=path, url=url, content=content,
             links=links, domains=domains)
    ret.append(p)
    # Spider the other local pages listed in the navigation sidebar.
    for sel in response.xpath('//ul/li/div'):
        link = sel.xpath('a/@href').extract()
        if link:
            l = self.base + link[0]
            ret.append(scrapy.http.Request(l))
    return ret