def make_page_extractor(self, obj):
    """Build ``self.page_extractor`` from the ``pages`` entry of *obj*.

    Nothing happens (and ``None`` is returned) unless *obj* is a dict that
    holds a truthy ``pages`` entry.  Otherwise the entry's ``regex`` and
    ``css``/``xpath`` values are macro-expanded and used to create a
    ``SgmlLinkExtractor`` that is stored on ``self.page_extractor``.

    :param obj: candidate configuration value; anything that is not a dict
        (for example a list) is ignored.
    """
    # isinstance rather than an exact type comparison, so that dict
    # subclasses (e.g. OrderedDict) are accepted as well.
    if not isinstance(obj, dict):
        return

    pages = obj.get('pages')
    if not pages:
        return

    regex = self.macro.expand(pages.get('regex'))
    css = self.macro.expand(pages.get('css'))
    if css:
        # A CSS selector takes precedence; it is translated to XPath.
        xpath = self.tr.css_to_xpath(css)
    else:
        xpath = self.macro.expand(pages.get('xpath'))

    self.page_extractor = SgmlLinkExtractor(
        allow=regex,
        restrict_xpaths=xpath,
        process_value=utils.first_n_pages(regex, pages)
    )
def load_config(self):
    """Load the configuration and initialise the spider from it.

    Reads the configuration via ``utils.load_cfg(self.config, pretty=True)``
    and then sets, in order: ``debug``, ``site``, ``macro``,
    ``allowed_domains``, ``start_urls``, ``start_method``, the request
    headers (through ``self.make_headers``), ``tr``, ``rules`` and
    ``page_extractor``.  Finally it delegates to ``self.build_item`` and
    ``self.load_settings``.

    :returns: the configuration dict obtained by re-parsing the JSON dump
        of the loaded configuration.
    """
    self.pretty_conf = utils.load_cfg(self.config, pretty=True)
    # Round-trip through JSON: the text form is handed to the macro
    # expander as CONF, and the re-parsed copy is independent of
    # ``self.pretty_conf``.
    conf_dump = json.dumps(self.pretty_conf)
    conf = json.loads(conf_dump)

    ### debug
    # Only fall back to the configuration when debug is still unset.
    if self.debug is None:
        self.debug = conf.get('debug', False)

    ### site
    self.site = conf.get('site', u'未知站点')
    self.macro = utils.MacroExpander({
        'SITE': self.site,
        'CONF': conf_dump
    })

    ### allowed_domains
    self.allowed_domains = conf.get('domains', [])

    ### start_urls
    urls = conf.get('urls', [])
    self.start_urls = utils.generate_urls(urls, self.macro)
    if isinstance(urls, dict):
        # The dict form may carry request options next to the URLs.
        self.start_method = urls.get('method', 'GET')
        self.make_headers(urls.get('headers', {}))
        if urls.get('parse'):
            self.parse_start_url = self.parse_page
    else:
        self.start_method = 'GET'
        self.make_headers({})

    ### rules
    self.tr = HTMLTranslator()
    self.rules = []
    self.page_extractor = None
    # ``or {}`` also covers an explicit null ``rules`` entry in the config.
    for name, spec in (conf.get('rules') or {}).items():
        # ``follow`` is effectively tri-state:
        #   missing/true -> follow links, no callback
        #   false        -> do not follow, parse with parse_page
        #   null         -> follow links AND parse with parse_page
        follow = spec.get('follow', True)
        callback = None if follow else 'parse_page'
        if follow is None:
            follow = True

        match = self.macro.expand(spec.get('match'))
        regex = self.macro.expand(spec.get('regex'))
        css = self.macro.expand(spec.get('css'))
        if css:
            # A CSS selector takes precedence; it is translated to XPath.
            xpath = self.tr.css_to_xpath(css)
        else:
            xpath = self.macro.expand(spec.get('xpath'))
        pages = spec.get('pages')
        sub = spec.get('sub')
        rule_vars = spec.get('vars')

        rule = Rule(
            SgmlLinkExtractor(
                allow=regex,
                restrict_xpaths=xpath,
                process_value=utils.first_n_pages(regex, pages)
            ),
            process_links=self.sub_links(sub),
            process_request=self.set_vars(name, rule_vars),
            callback=callback,
            follow=follow
        )
        # Extra attribute attached to the rule object.
        rule.match = match
        self.rules.append(rule)
    self._compile_rules()

    if not self.rules:
        # Without rules the start pages themselves are parsed, and a page
        # extractor is built from the ``urls`` entry when it defines one.
        self.parse_start_url = self.parse_page
        self.make_page_extractor(urls)

    ### mappings(loop/fields)
    self.build_item(conf)

    ### settings
    self.load_settings(conf)

    return conf