Beispiel #1
0
 def get_proxy_inst(self):
     """Instantiate the proxy class named in the 'proxy' config section.

     The module path comes from ``proxy_pack_name`` (fallback
     ``'zspider.proxy_base'``) and the attribute name from
     ``proxy_class_name`` (fallback ``'proxy_base'``). The module is
     imported dynamically and the attribute is called with no arguments.

     Returns:
         Whatever calling the resolved attribute with no arguments returns.
     """
     module_path = get_config().get_value(
         'proxy', 'proxy_pack_name', 'zspider.proxy_base')
     class_name = get_config().get_value(
         'proxy', 'proxy_class_name', 'proxy_base')
     # Resolve the class by name so the implementation can be swapped via
     # configuration alone.
     proxy_cls = getattr(importlib.import_module(module_path), class_name)
     return proxy_cls()
Beispiel #2
0
 def __init__(self):
     """Copy the proxy settings from the 'proxy' config table, then call proxy_init().

     Each setting is looked up with ``.get(key)`` and no explicit default,
     and stored on the instance under the same name.
     """
     proxy_conf = get_config().get_table('proxy')
     # Same keys, same order as the attributes they populate.
     for option in ('use_proxy', 'proxy_server', 'proxy_path', 'except_wait'):
         setattr(self, option, proxy_conf.get(option))
     # Hook defined elsewhere; runs only after all settings are in place.
     self.proxy_init()
Beispiel #3
0
def logger(name):
    """Return the module-wide logger, creating it on the first call.

    The instance is cached in this module's globals under ``_log_instance``.
    Once that key exists, it is returned as-is and ``name`` is ignored, so
    only the first caller's ``name`` is ever passed to ``zlogger.logger``.

    Args:
        name: Name handed to ``zlogger.logger`` when the instance is created.

    Returns:
        The cached logger instance.
    """
    namespace = globals()
    if '_log_instance' in namespace:
        return namespace['_log_instance']

    log_conf = get_config().get_table('log')
    # Keyword arguments for zlogger.logger, each with its fallback value.
    options = {
        'write_stream': log_conf.get("write_stream", True),
        'write_file': log_conf.get("write_file", True),
        # An empty/missing path falls back to the current directory.
        'file_dir': log_conf.get("write_path") or './',
        'level': zlogger.logger_level(
            log_conf.get("log_level", 'DEBUG').upper()),
        'interval': log_conf.get("log_interval", 1),
        'backupCount': log_conf.get("log_backupCount", 2),
    }

    instance = zlogger.logger(name, **options)
    namespace['_log_instance'] = instance
    return instance
Beispiel #4
0
    def __init__(self):
        """Set up the spider's config, logger, seed/dup helpers and downloader.

        Reads ``self.spider_name`` and ``self.auto_cookie_enable``, neither of
        which is assigned here (presumably class attributes supplied by the
        concrete spider -- confirm against subclasses). Finishes by calling
        the ``spider_init()`` hook.

        Raises:
            AssertionError: if ``self.spider_name`` is falsy.
        """
        # Message reads: "spider_name must not be empty (the spider_ prefix is
        # not needed)".
        # NOTE(review): assert statements are removed under `python -O`, so
        # this check does not run in optimized mode.
        assert self.spider_name, 'spider_name不能为空(不需要前缀spider_)'
        self.config = get_config()
        self.pid = os.getpid()

        # The logger must exist before any of the log calls below.
        self.log = logger(self.spider_name)
        # Log message: "starting up".
        self.log.info(f'spider=<{self.spider_name}>, 开始启动. ')

        # Both helpers are keyed by the spider name.
        self.seed_handler = seed_handler(self.spider_name)
        self._dup_filter = dup_filter(self.spider_name)

        self.downloader = httpreq(self.auto_cookie_enable)

        self._raw_seed_dict = None
        self._signal_init()

        # Log message: "spider_base initialization finished, about to call the
        # user-defined function".
        self.log.info('spider_base初始化完成, 即将调用用户定义函数')
        # User hook runs last, after all base attributes above are assigned.
        self.spider_init()

        # Log message: "spider <name> initialization finished".
        self.log.info(f'爬虫{self.spider_name}初始化完成')
Beispiel #5
0
class Public_Constant():
    '''Public constants.

    The configurable values are read through ``get_config().get_value(...)``
    while the class body executes, each with the literal fallback shown as
    the third argument. The remaining attributes are fixed strings, plus
    ``except_retry_flag`` which aliases the module-level
    ``_except_retry_flag``.
    '''
    # The configured value is passed through eval(); the fallback is the
    # string form of a list of suffix names.
    # NOTE(review): eval() executes arbitrary expressions taken from the
    # config -- consider ast.literal_eval if the value is always a literal.
    seed_queue_suffix = eval(get_config().get_value(
        'seed_queue', 'suffix', "['vip', 'd1', 'd2', 'd3', 'seed', 'error']"))
    # Fallback 20 (presumably seconds -- TODO confirm unit).
    req_timeout = get_config().get_value('public_constant', 'req_timeout', 20)
    # Fallback 3 (unit not shown here -- TODO confirm).
    spider_err_wait_time = get_config().get_value('public_constant',
                                                  'spider_err_wait_time', 3)
    # Fallback 120 (unit not shown here -- TODO confirm).
    empty_seed_wait_time = get_config().get_value('public_constant',
                                                  'empty_seed_wait_time', 120)
    # Fallback encoding name 'utf8'.
    default_html_encoding = get_config().get_value('public_constant',
                                                   'default_html_encoding',
                                                   'utf8')
    # Fallback 0.5 (unit not shown here -- TODO confirm).
    retry_wait_fixed = get_config().get_value('public_constant',
                                              'retry_wait_fixed', 0.5)
    # Fallback 5.
    max_attempt_count = get_config().get_value('public_constant',
                                               'max_attempt_count', 5)

    # Fixed suffix strings (not configurable).
    seed_collname_suffix = 'seed'
    dup_collname_suffix = 'dup'
    error_seed_suffix = 'error'
    error_seed_parser_suffix = 'error_parser'

    # Alias of the module-level _except_retry_flag, defined outside this class.
    except_retry_flag = _except_retry_flag
Beispiel #6
0
def get_ssdb_inst(**kwargs) -> ssdb_inst:
    """Create an ``ssdb_inst`` from the 'ssdb' config table.

    Args:
        **kwargs: Overrides; any key given here replaces the value of the
            same key from the config table.

    Returns:
        ``ssdb_inst`` constructed with the merged keyword arguments.
    """
    # Copy the table's items so the update below never touches the config.
    params = dict(get_config().get_table('ssdb').items())
    params.update(kwargs)
    return ssdb_inst(**params)