Example #1
0
 def make_page_extractor(self, obj):
     """Build the pagination link extractor from a config entry.

     When ``obj`` is a dict with a truthy ``pages`` entry, a
     ``SgmlLinkExtractor`` is created from the macro-expanded ``regex`` and
     ``xpath`` values of that entry and stored on ``self.page_extractor``.
     In every other case ``self.page_extractor`` is left untouched.

     :param obj: config entry to inspect; anything that is not a dict
         (e.g. a plain list of URLs) is ignored.
     :returns: None
     """
     # isinstance instead of an exact type comparison, so dict subclasses
     # (e.g. OrderedDict) are accepted as well.
     if not isinstance(obj, dict):
         return

     pages = obj.get('pages')
     if not pages:
         return

     regex = self.macro.expand(pages.get('regex'))
     xpath = self.macro.expand(pages.get('xpath'))
     # The whole ``pages`` entry is handed to first_n_pages together with
     # the expanded regex; its return value filters/rewrites link values.
     self.page_extractor = SgmlLinkExtractor(
         allow=regex,
         restrict_xpaths=xpath,
         process_value=utils.first_n_pages(regex, pages))
Example #2
0
 def make_page_extractor(self, obj):
     """Set ``self.page_extractor`` from the ``pages`` entry of ``obj``.

     Nothing happens unless ``obj`` is a dict whose ``pages`` value is
     truthy.  Otherwise the ``regex`` and ``xpath`` values of that entry are
     macro-expanded and used to build a ``SgmlLinkExtractor``.

     :param obj: config entry to inspect; non-dict values are ignored.
     :returns: None
     """
     # Use isinstance so that dict subclasses are not silently skipped, as
     # they were with the exact ``type(obj) != dict`` comparison.
     pages = obj.get('pages') if isinstance(obj, dict) else None
     if not pages:
         return

     regex = self.macro.expand(pages.get('regex'))
     xpath = self.macro.expand(pages.get('xpath'))
     self.page_extractor = SgmlLinkExtractor(
         allow=regex,
         restrict_xpaths=xpath,
         process_value=utils.first_n_pages(regex, pages))
Example #3
0
    def load_config(self, config_path):
        """Load a JSON spider config and initialise this spider from it.

        Attributes are set in this order: ``config``, ``debug``, ``site``,
        ``macro``, ``allowed_domains``, ``start_urls``, ``start_method``,
        request headers (via ``make_headers``), ``rules``,
        ``page_extractor``, the mappings (via ``build_mappings``), ``proxy``,
        the optional ``mongo`` / ``mysql`` / ``zmq`` sections, ``dedup``,
        every key found under ``settings`` and finally ``plugin``.

        :param config_path: path passed to ``utils.load_file``; may be a
            UTF-8 encoded byte string or a unicode string.
        :raises CloseSpider: when ``utils.load_file`` returns nothing.
        :raises ValueError: when the loaded text is not valid JSON.
        """
        txt = utils.load_file(config_path)
        if not txt:
            raise CloseSpider()

        conf = json.loads(txt)

        # unicode(x, encoding=...) raises TypeError when x is already a
        # unicode object, so only decode byte-string paths.
        if isinstance(config_path, unicode):
            shown_path = config_path
        else:
            shown_path = unicode(config_path, encoding='utf-8')
        log.msg(u'loading config from <{}>:\n{}'.format(
            shown_path,
            json.dumps(conf, indent=4, ensure_ascii=False, sort_keys=True)))

        #### config
        self.config = config_path

        #### debug
        self.debug = conf.get('debug', False)

        #### site
        self.site = conf.get('site', u'未知站点')
        self.macro = utils.MacroExpander({
            'SITE': self.site,
            'CONF': json.dumps(conf)
        })

        #### allowed_domains
        self.allowed_domains = conf.get('domains', [])

        #### start_urls
        # ``urls`` may be a dict carrying extra request options ('method',
        # 'headers'); for any other shape the defaults are used.
        urls = conf.get('urls', [])
        self.start_urls = utils.generate_urls(urls, self.macro)
        url_opts = urls if isinstance(urls, dict) else {}
        self.start_method = url_opts.get('method', 'GET')
        self.make_headers(url_opts.get('headers', {}))

        #### rules
        self.rules = []
        self.page_extractor = None
        # Only the rule bodies are needed; the rule names are not used.
        for rule_conf in conf.get('rules', {}).itervalues():
            follow = rule_conf.get('follow', True)
            # Rules that are not followed hand their pages to parse_page.
            callback = None if follow else 'parse_page'
            regex = self.macro.expand(rule_conf.get('regex'))
            xpath = self.macro.expand(rule_conf.get('xpath'))
            pages = rule_conf.get('pages')
            sub = rule_conf.get('sub')

            extractor = SgmlLinkExtractor(
                allow=regex,
                restrict_xpaths=xpath,
                process_value=utils.first_n_pages(regex, pages))
            self.rules.append(Rule(extractor,
                                   process_links=self.sub_links(sub),
                                   callback=callback,
                                   follow=follow))

        if not self.rules:
            # Without rules every response goes straight to parse_page and
            # pagination is driven by the 'pages' entry of ``urls``.
            self.parse = self.parse_page
            self.make_page_extractor(urls)

        ### mappings(loop/fields)
        self.build_mappings(conf)

        ### proxy
        self.proxy = conf.get('proxy', {})

        ### database
        for db in ['mongo', 'mysql', 'zmq']:
            if db in conf:
                setattr(self, db, conf[db])

        ### settings
        # self.logger = settings.DEFAULT_LOGGER
        self.dedup = settings.DEFAULT_DEDUP
        # NOTE(review): every key under 'settings' is copied onto the spider
        # verbatim, so a config can overwrite any attribute (including
        # methods) -- only load trusted config files.
        # NOTE(review): the log template below is a byte string; on Python 2
        # a non-ASCII key/value would raise UnicodeEncodeError in format().
        # Confirm utils.G accepts unicode before changing the template.
        for k, v in conf.get('settings', {}).iteritems():
            log.msg(utils.G('+SET {} = {}'.format(k, v)))
            setattr(self, k, v)

        ### plugin
        # ``plugin`` is not assigned earlier in this method; it can only be
        # present via the settings loop above or as a pre-existing attribute.
        if hasattr(self, 'plugin'):
            self.plugin = utils.load_plugin(self.plugin)
            self.plugin.spider = self
        else:
            self.plugin = None
Example #4
0
    def load_config(self, config_path):
        """Read the JSON config at ``config_path`` and configure the spider.

        The parsed config drives, in order: ``config``, ``debug``, ``site``,
        ``macro``, ``allowed_domains``, ``start_urls``, ``start_method``,
        the request headers (``make_headers``), ``rules``,
        ``page_extractor``, the mappings (``build_mappings``), ``proxy``,
        the optional ``mongo`` / ``mysql`` / ``zmq`` attributes, ``dedup``,
        one attribute per key under ``settings``, and ``plugin``.

        :param config_path: path given to ``utils.load_file``; a UTF-8
            byte string or a unicode string.
        :raises CloseSpider: if ``utils.load_file`` yields an empty result.
        :raises ValueError: if the loaded text cannot be parsed as JSON.
        """
        txt = utils.load_file(config_path)
        if not txt:
            raise CloseSpider()

        conf = json.loads(txt)

        # Decoding an object that is already unicode raises TypeError, so
        # decode only when the path is a byte string.
        path_text = config_path
        if not isinstance(path_text, unicode):
            path_text = unicode(path_text, encoding='utf-8')
        pretty_conf = json.dumps(conf, indent=4, ensure_ascii=False,
                                 sort_keys=True)
        log.msg(u'loading config from <{}>:\n{}'.format(path_text,
                                                        pretty_conf))

        #### config
        self.config = config_path

        #### debug
        self.debug = conf.get('debug', False)

        #### site
        self.site = conf.get('site', u'未知站点')
        self.macro = utils.MacroExpander({
            'SITE': self.site,
            'CONF': json.dumps(conf)
        })

        #### allowed_domains
        self.allowed_domains = conf.get('domains', [])

        #### start_urls
        urls = conf.get('urls', [])
        self.start_urls = utils.generate_urls(urls, self.macro)
        # 'method' and 'headers' are only read when ``urls`` is a dict
        # (isinstance also covers dict subclasses).
        urls_is_dict = isinstance(urls, dict)
        self.start_method = urls.get('method', 'GET') if urls_is_dict else 'GET'
        self.make_headers(urls.get('headers', {}) if urls_is_dict else {})

        #### rules
        self.rules = []
        self.page_extractor = None
        # The rule names (dict keys) are unused, so iterate the values only.
        for spec in conf.get('rules', {}).itervalues():

            follow = spec.get('follow', True)
            # A rule that is not followed gets 'parse_page' as its callback.
            callback = None if follow else 'parse_page'
            regex = self.macro.expand(spec.get('regex'))
            xpath = self.macro.expand(spec.get('xpath'))
            pages = spec.get('pages')
            sub = spec.get('sub')

            rule = Rule(
                SgmlLinkExtractor(
                    allow=regex,
                    restrict_xpaths=xpath,
                    process_value=utils.first_n_pages(regex, pages)),
                process_links=self.sub_links(sub),
                callback=callback,
                follow=follow
            )

            self.rules.append(rule)

        if not self.rules:
            # No rules configured: parse responses directly with parse_page
            # and build the page extractor from the ``urls`` entry.
            self.parse = self.parse_page
            self.make_page_extractor(urls)

        ### mappings(loop/fields)
        self.build_mappings(conf)

        ### proxy
        self.proxy = conf.get('proxy', {})

        ### database
        for db in ['mongo', 'mysql', 'zmq']:
            if db in conf:
                setattr(self, db, conf[db])

        ### settings
        # self.logger = settings.DEFAULT_LOGGER
        self.dedup = settings.DEFAULT_DEDUP
        # NOTE(review): keys under 'settings' become spider attributes
        # without any filtering, so the config must be trusted.
        for k, v in conf.get('settings', {}).iteritems():
            log.msg(utils.G('+SET {} = {}'.format(k, v)))
            setattr(self, k, v)

        ### plugin
        # ``plugin`` exists here only if it was set before this call or by
        # the settings loop above; its value is passed to utils.load_plugin.
        if hasattr(self, 'plugin'):
            self.plugin = utils.load_plugin(self.plugin)
            self.plugin.spider = self
        else:
            self.plugin = None