def __init__(self, params=None):
    # These configs are added only to make debugging easier.
    myCfg = {
        # CFG_JOB_BATCH: "split_test20140717",
        # CFG_JOB_NAME: "split",
        "env": "DEV"
    }
    BaseClass.__init__(self, params, myCfg)
def __init__(self, params=""): subCfg = { CFG_ACCOUNT_PROVIDER: "spideraccount.samanager", #ALIYUN_LOCALROOT: PROJECT_ROOT, # 正常这就是本地根 } import oss2 BaseClass.__init__(self, params, subCfg) aliyunCfg = AccountManager().getAccount(ALIYUN) accessKeyId = aliyunCfg["accessKeyId"] accessKeySecret = aliyunCfg["accessKeySecret"] endpoint = aliyunCfg["endPoint"] self.bucket = aliyunCfg["bucket"] self.oss = oss2.Bucket(oss2.Auth(accessKeyId, accessKeySecret), endpoint, aliyunCfg["bucket"]) self.prefix = gConfig.get(ALIYUN_LOCALROOT, self._getDefaultDownRoot()).replace( "\\", "/")
def __init__(self, params, subConfig=None):
    self.basicConfig = {
        # HTTP download related
        CFG_HTTP_INTERVAL: 0.01,        # interval between requests (seconds)
        CFG_HTTP_TIMEOUT: 10,
        CFG_HTTP_OUTFORMAT: 'html',     # or 'json'
        CFG_HTTP_ENCODING: 'utf-8',     # or 'gbk'
        CFG_HTTP_UNESCAPE: 0,           # remove special character quoting
        CFG_HTTP_ENGINE: 'requests',    # or 'selenium'
        CFG_HTTP_UA: 'windows',         # or 'mac', 'ios', 'android'
        CFG_HTTP_BROWSERMODE: 'headless',
        # CFG_HTTP_BROWSER: BROWSER_TPE_PHANTOMJS,
        CFG_HTTP_MAXREQUEST: 0,         # max requests per session; 0 = unlimited, otherwise the session is restarted once the count is exceeded
        CFG_JOB_RUNTIME: 0,             # spider run time in seconds; 0 = unlimited
        CFG_JOB_HEARTBEAT: 60,          # job heartbeat interval in seconds
        CFG_DOWN_MAXNUM: 0,             # max downloads per run; 0 = unlimited
        CFG_DOWN_MAXPAGENUM: 0,         # max pages per run; 0 = unlimited
        CFG_BLOCK_MAXCHECK: 100,        # anti-block element check: after more than 100 failed checks assume we are blocked (may also be a page structure change)
        CFG_ACCOUNT_PROVIDER: "spideraccount.samanager",
    }
    if subConfig:
        # Merge the caller-supplied overrides into the base config.
        self.basicConfig.update(subConfig)
    # Add the base config to the global config and write it to the log.
    BaseClass.__init__(self, params, self.basicConfig)
    # Anything that depends on the config must come right after BaseClass.__init__.
    if not gConfig.get(CFG_DOWN_ROOT, None):
        gConfig.set(
            CFG_DOWN_ROOT,
            "d:/" if gConfig.get("env") in ("ONLINE",) and not isLinux() else PROJECT_ROOT)
    SpiderJobUtil.__init__(self)
    # Use SeleniumAgent when the configured engine is 'selenium'; otherwise default to RequestsAgent.
    self.http = RequestsAgent() if gConfig.get(
        CFG_HTTP_ENGINE, "requests") != "selenium" else SeleniumAgent()
    # Build an lxml HTML parser with the globally configured encoding.
    self.parser = etree.HTMLParser(encoding=gConfig.get(CFG_HTTP_ENCODING))
    # Hand the HTTP agent and the parser to the content extractor.
    self.extractor = Extractor(self.http, self.parser)
    self.antiBlock = None
    # Start the job.
    self.syncPoint = self.jobBegin()
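# Illustrative sketch (not from the original code): a concrete spider would
# typically subclass the base above and pass its site-specific overrides as
# subConfig. The class names and the override values below are hypothetical.
class ExampleSpider(BaseSpider):
    def __init__(self, params=""):
        subCfg = {
            CFG_HTTP_INTERVAL: 0.5,       # slower request rate for this site
            CFG_HTTP_ENCODING: "gbk",     # site-specific page encoding
            CFG_HTTP_ENGINE: "selenium",  # render JavaScript-heavy pages
        }
        BaseSpider.__init__(self, params, subCfg)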
def __init__(self, params="", cfg=None): BaseClass.__init__(self, params, cfg)
def __init__(self,params=""): BaseClass.__init__(self,params)
def __init__(self, params=None, subCfg=None):
    BaseClass.__init__(self, params, subCfg)
    JobUtil.__init__(self)
def __init__(self):
    BaseClass.__init__(self)
    self.sqlite = Sqlite()
    # Create the local database.
    self.createLocalDb()
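# Stand-in sketch (the project's Sqlite helper is not shown here): with the
# standard-library sqlite3 module, createLocalDb might do little more than make
# sure a results table exists. The database path, table name and columns are
# hypothetical.
import sqlite3

def createLocalDb(dbPath="spider_local.db"):
    conn = sqlite3.connect(dbPath)
    # Idempotent: safe to call on every startup.
    conn.execute(
        "CREATE TABLE IF NOT EXISTS pages (url TEXT PRIMARY KEY, status INTEGER, body TEXT)")
    conn.commit()
    conn.close()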
def __init__(self, params=""): BaseClass.__init__(self, params) gConfig.set(CFG_ACCOUNT_PROVIDER, "spideraccount.samanager")