コード例 #1
0
ファイル: logUtil.py プロジェクト: Curiou/redundancy
    def critical(self, msg, *args, **kwargs):
        """
        Log a critical message at most once per distinct exception location.

        A dedup id is built from the last line of ``msg`` that contains both
        ``"!myex!File "`` and ``" line "``, combined with the first 8
        characters of the current timestamp, the job name and the job batch.
        The message is written only when that id has not been seen by this
        instance and ``self.logDB`` does not return ``LOG_EXIST`` (a message
        containing ``LOG_ALWAYS`` is written regardless of the DB answer).

        :param msg: message text; it is split on newlines, so a string is expected
        :param args: forwarded to ``self.logger.critical``
        :param kwargs: forwarded to ``self.logger.critical``
        :return: None
        """
        try:
            from superbase.globalData import gConfig
            from superbase.utility.timeUtil import getTimestamp
            name = gConfig.get(CFG_JOB_NAME)
            batch = gConfig.get(CFG_JOB_BATCH)
            curTime = getTimestamp()

            def _createId():
                # Scan from the bottom so the last matching line wins.
                for line in reversed(msg.split("\n")):
                    if "!myex!File " in line and line.find(" line ") > 0:
                        # NOTE(review): the original comment suggests md5 as an
                        # alternative; builtin hash() of a string is not
                        # guaranteed to be stable across processes on every
                        # interpreter -- confirm this is acceptable.
                        return hash("%s_%s_%s_%s" %
                                    (curTime[:8], line, name, batch))
                return None

            logId = _createId()
            if logId and logId not in self.exceptions:
                # the DB is used to keep the log entry unique across processes
                ret = self.logDB(name, batch, msg, logId)
                if ret != LOG_EXIST or msg.find(LOG_ALWAYS) >= 0:
                    self.logger.critical(
                        "{name} {batch}  {msg}".format(name=name,
                                                       batch=batch,
                                                       msg=msg), *args,
                        **kwargs)
                self.exceptions.append(logId)
            elif gConfig.get("env") == "DEV":
                print("dev-debug:%s" % msg)
        except ImportError as e1:
            # "\n" replaces the original "/n/", which printed literally.
            print("%s\n%s" % (e1, sys.path))
コード例 #2
0
ファイル: loginUtil.py プロジェクト: Curiou/redundancy
    def getProxyFromAPI(self):
        """
        Obtain a proxy according to the configured proxy API provider.

        With provider ``"zhima"`` a proxy that is currently held is given back
        and a new one is taken via ``self.proxyFromDB()``. With provider
        ``"mogu"`` (the default) the proxy server is polled for up to 10
        rounds until a candidate passes ``self.chekIP``.

        :return: result of ``self.proxyFromDB()`` for "zhima"; an
            ``"http://ip:port"`` string for "mogu"; None when no candidate
            passed the check or the provider is neither of the two
        """
        provider = gConfig.get(CFG_SERVER_PROXYAPI, "mogu")
        if provider == "zhima":
            if self.currProxyInfo:
                self.giveBackProxy()
            return self.proxyFromDB()

        if provider != "mogu":
            return None

        for _attempt in range(10):
            if not self.proxyList:
                # local pool is empty: ask the proxy server for a new batch
                response = requests.get(self.proxyServer)
                rawJson = response.content
                logInfo(rawJson)
                payload = json.loads(rawJson)
                if payload["code"] == "3001":
                    time.sleep(5)
                if payload["code"] == "0":
                    self.proxyList = payload["msg"]
            if self.proxyList:
                candidate = self.proxyList.pop()
                ipPort = "http://{}:{}".format(candidate["ip"],
                                               candidate["port"])
                if self.chekIP(ipPort):
                    return ipPort
            time.sleep(2)
        logInfo("try to get proxies is disabled!!!!check out!!!")
コード例 #3
0
    def getResultByUrl(self, url, template, result):
        """
        Request ``url`` and fill ``result`` according to the configured
        output format.

        :param url: address passed to ``self.http.get``
        :param template: extraction template, used only for the "html" format
        :param result: dict that receives the extracted data (updated in place)
        :return: the content returned by ``self.http.get``, or None when that
            content is ``u' '``, ``u''`` or None
        """
        content = self.http.get(url)
        if content in (u' ', u'', None):
            return None

        # optionally dump the downloaded page when CFG_DEBUG_SAVEFILE is set
        if gConfig.get(CFG_DEBUG_SAVEFILE):
            AntiBlock.saveWrongPage(content,
                                    gConfig.get(CFG_DEBUG_SAVEFILENAME))

        outFormat = gConfig.get(CFG_HTTP_OUTFORMAT, "html")
        if outFormat == "html":
            # html needs to be parsed with the template
            self.getResultByContent(content, template, result)
        elif outFormat == "json":
            # json content is merged into result without parsing
            result.update(content)
        elif outFormat == "file":
            result["file"] = content

        return content
コード例 #4
0
def createDb(dbNameKey, dbParams=None):
    """
    Create a database object and register it in ``gTop`` under ``dbNameKey``.

    Nothing is created when CFG_DB_DISABLE is set or when no connection
    parameters are available. Any exception raised while creating the
    database is reported through ``logException`` and not propagated.

    :param dbNameKey: config key of the database name; per the original note
        currently only db.monitor (monitoring) and db.business (business data)
    :param dbParams: connection parameters; when falsy they are looked up via
        ``AccountManager().getAccount(dbNameKey)``
    :return: None
    """
    from superbase.globalData import gConfig, gTop

    # CFG_DB_DISABLE switches the database off (marked deprecated originally)
    if gConfig.get(CFG_DB_DISABLE, 0):
        return

    dbName = gConfig.get(dbNameKey)
    try:
        if dbParams:
            db_params = dbParams
        else:
            db_params = AccountManager().getAccount(dbNameKey)
        if not db_params:
            return

        useDictCursor = gConfig.get(CFG_DB_DICTCURSOR, 0)
        connection = createDb2(dbName, db_params, dictCursor=useDictCursor)
        # make the database object reachable through the global registry
        gTop.set(dbNameKey, connection)
    except Exception:
        logException()
コード例 #5
0
 def __init__(self):
     """
     Set CFG_JOB_ENABLE from the configured job id and create the Job helper
     when the job is enabled; otherwise ``self.job`` is None.
     """
     # the job is enabled only when a job id exists
     gConfig.set(CFG_JOB_ENABLE, gConfig.get(CFG_JOB_ID,0))
     if not gConfig.get(CFG_JOB_ENABLE, 0):
         self.job = None
     else:
         # imported lazily, only when a job is actually enabled
         from jobManager.job import Job
         self.job = Job()
     self.jobError = None
コード例 #6
0
 def saveWrongPage(content2, htmlFile=None):
     """
     Write ``content2`` to an html file and log the file path.

     :param content2: page content to write
     :param htmlFile: target path; when falsy a name is derived from
         CFG_LOG_FILE_NAME by replacing ".txt" with a random three-digit
         number plus ".html", placed under PROJECT_ROOT + "log/"
     :return: tuple ``(file name, full path)``
     """
     import codecs
     import random

     if not htmlFile:
         suffix = "%s.html" % (random.randint(100, 999))
         logName = gConfig.get(CFG_LOG_FILE_NAME)
         htmlFile = os.path.join(PROJECT_ROOT + "log/",
                                 logName.replace(".txt", suffix))

     encoding = gConfig.get(CFG_HTTP_ENCODING, "utf-8")
     with codecs.open(htmlFile, 'wb', encoding) as pageFile:
         pageFile.write(content2)
     logInfo("saveWrongPage:%s" % htmlFile)
     return os.path.basename(htmlFile), htmlFile
コード例 #7
0
ファイル: loginUtil.py プロジェクト: Curiou/redundancy
 def checkError(self):
     """
     Check whether accounts are being requested too frequently.

     Once ``self.history`` holds more than ``account.checkLimit`` entries,
     the first field of the newest entry is compared with the first field of
     the entry ``limit`` positions from the end. A difference smaller than
     ``account.checkInterval`` is reported as an error.

     :return: True when an error was detected, False otherwise
     """
     interval = gConfig.get("account.checkInterval", 300)  # check window, default 300s
     limit = gConfig.get("account.checkLimit", 10)  # maximum count, default 10
     if len(self.history) > limit:
         # Index by the configured limit instead of a hard-coded 10: the
         # guard above only guarantees more than `limit` entries, so a fixed
         # -10 raised IndexError for smaller limits and ignored larger ones.
         diff = self.history[-1][0] - self.history[-limit][0]
         if diff < interval:
             logError("getAccount too frequently!!-%s" % diff)
             return True
     return False
コード例 #8
0
ファイル: logUtil.py プロジェクト: Curiou/redundancy
    def getLogger(cfg, forceNew=False):
        """
        Return the logger stored in ``gTop`` under GD_LOGGER, creating and
        configuring it first when it does not exist or ``forceNew`` is set.

        :param cfg: mapping-like config (must provide ``items()``); keys found
            in IN_PARAMS_KEY are written into BASIC_SETTINGS, and the
            CFG_LOG_FILE_NAME entry selects the log file below
            ``PROJECT_ROOT/log``
        :param forceNew: build a new logger even if one is already registered
        :return: the logger object registered in ``gTop``
        """
        from superbase.globalData import gTop
        if not gTop.get(GD_LOGGER) or forceNew:  # singleton or force a new logger

            from superbase.globalData import gConfig
            from superbase.globalData import PROJECT_ROOT
            from superbase.utility.ioUtil import getPrintDict, mkdir

            logDir = os.path.join(PROJECT_ROOT, "log")
            mkdir(logDir)

            # Stays None when cfg carries no log file name; the original code
            # raised UnboundLocalError in that case.
            logFileName = None
            for key, value in cfg.items():
                if key in IN_PARAMS_KEY:
                    # IN_PARAMS_KEY maps a config key to a 3-level path
                    # inside BASIC_SETTINGS
                    L1, L2, L3 = IN_PARAMS_KEY[key]
                    BASIC_SETTINGS[L1][L2][L3] = value
                elif key == CFG_LOG_FILE_NAME:
                    logFileName = os.path.join(logDir, value)
                    # the configured name may contain sub directories
                    mkdir(os.path.split(logFileName)[0])
            if logFileName is not None:
                BASIC_SETTINGS["handlers"]["file"]["filename"] = logFileName

            logging.config.dictConfig(BASIC_SETTINGS)
            logger = logging.getLogger(SMILE_LOGGER)
            logger2 = logAdaper(logger)
            gTop.set(GD_LOGGER, logger2)
            hint = "current code root %s\n--config is--\n%s" % (
                PROJECT_ROOT, getPrintDict(gConfig.cfg))
            # the config dump is logged at info level only for real jobs
            if gConfig.get(CFG_JOB_ID, 0) > 0:
                logger2.info(hint)
            else:
                logger2.debug(hint)

        return gTop.get(GD_LOGGER)
コード例 #9
0
ファイル: resultUtil.py プロジェクト: Curiou/redundancy
    def saveSyncPoint(self, result,sync2Remote=False):
        """
        Save the sync point carried in ``result``.

        When ``result`` contains CFG_DOWN_SYNCINFO, that entry is removed from
        ``result``, written to ``self.localFile`` and, depending on
        ``sync2Remote`` and the save counter, pushed via ``self.syncToRemote``.
        Exceptions are reported through ``logException`` and not propagated.

        :param result: result dict; updated in place
        :param sync2Remote: force a remote sync for this call (default False);
            otherwise the remote sync happens when
            ``self.saveNum % CFG_DOWN_SYNCINTERVAL == 1`` (interval default 5)
        :return: ``result`` without the sync info; None when ``self.index``
            is not set
        """
        if self.index:
            try:
                index = self.index
                if CFG_DOWN_SYNCINFO in result:
                    # take the sync info out of result in one step
                    syncInfo = result.pop(CFG_DOWN_SYNCINFO)
                    data = {
                        "id": md5(index),
                        "idx": index,
                        "syncInfo": syncInfo,
                        "upTime": getTimestamp(),  # timestamp of this save
                    }
                    json2File(self.localFile,data)
                    self.saveNum += 1
                    if sync2Remote or (self.saveNum%gConfig.get(CFG_DOWN_SYNCINTERVAL,5)==1):
                        self.syncToRemote(data)

            except Exception:
                logException()

            return result
コード例 #10
0
ファイル: fanli.py プロジェクト: Curiou/redundancy
 def __init__(self, params=None):
     """
     Set up the crawler config, open the result file and compile the url
     patterns.

     :param params: input config, forwarded to ``BaseCrawler.__init__``
     """
     # the first two settings are only there to make debugging easier
     myCfg = {
         CFG_JOB_BATCH: "fanli_test20140717",
         CFG_JOB_NAME: "fanli",
         # CFG_HTTP_ENCODING:"gbk",
         CFG_HTTP_ENGINE: "selenium",
         CFG_HTTP_BROWSER: "chrome",  # selenium uses phantomjs as browser by default
         CFG_HTTP_BROWSERMODE: "1",
         CFG_DOWN_INDEX: "www_fanli_com/coupon"
     }
     BaseCrawler.__init__(self, params, myCfg)
     resultFile = gConfig.get(CFG_LOG_FILE_NAME).replace(
         ".txt", "_result.txt")
     self.resultFile = os.path.join(PROJECT_ROOT + "log/", resultFile)
     # kept open on the instance; this block does not close it
     self.file = open(self.resultFile, "w")
     import re
     # NOTE(review): "[url,go]" is a character class (one of the characters
     # u r l , g o before "="), not an alternation of "url"/"go"; likewise
     # "[I,i]" and "[&,%]" also accept a comma -- confirm this is intended.
     self.urlPattern = re.compile(r"[url,go]=(http.*)")
     # The u"..." literals below replace Python-2-only ur"..." literals;
     # the pattern text is unchanged.
     self.patterns = [
         {
             "url": re.compile(r"url=(http.*)"),
             "sellerId": re.compile(r"seller_?[I,i]d%3D(.*?)%"),
             "couponId": re.compile(r"activity_?[I,i]d%3D(.*?)[&,%]"),
             "productId": re.compile(r"Epid-(.*?)%"),
             "discount": re.compile(u"满(.*?)减(.*)"),
         },
         {
             "url": re.compile(r"go=(http.*)"),
             # "sellerId": re.compile(r"[sellerId,seller_id]%3D(.*?)%"),
             "couponId": re.compile(r"activity_?[I,i]d%3D(.*?)[&,%]"),
             "productId": re.compile(r"itemId%3D(.*?)[&,%]"),
             "discount": re.compile(u"(\\d+)"),
         },
     ]
コード例 #11
0
ファイル: baseClass.py プロジェクト: Curiou/redundancy
    def __init__(self, params=None, subConfigDict=None):
        """
        Merge the given configs into the global config and create the logger.

        :param params: highest-priority input config, overrides everything;
            usually passed on the command line
        :param subConfigDict: second-priority config, overrides the parent
            class; when given it is updated in place with the parsed params
        :return:
        """
        if params or subConfigDict:
            # gConfig is the single access point that combines global, class
            # and input config; parseParams turns the input into a dict
            configIn = self.parseParams(params)  # input first
            if not subConfigDict:
                subConfigDict = configIn
            else:
                subConfigDict.update(configIn)  # subClass second
            # record the input config in the global registry
            gTop.get(GD_CFG_IN).update(configIn)
            # push the merged sub config into the global config
            gConfig.update(subConfigDict)
            # make sure the environment name is upper case
            envName = gConfig.get("env").upper()
            gConfig.set("env", envName)

        # create the logger
        createLogger(gConfig)
コード例 #12
0
ファイル: resultUtil.py プロジェクト: Curiou/redundancy
    def getNewSyncInfoByDesc(self, oldSyncInfo, initBegin=None, initEnd=-365 * 10 * 24 * 3600 * 1000):
        """
        Build the sync info for a source whose sync key decreases while
        crawling, e.g. when the post time is used as the sync point.

        :param oldSyncInfo: sync info of the previous run; falsy on the first
            run. When given it is updated in place.
        :param initBegin: begin value; ``time.time() * 1000`` is used when falsy
        :param initEnd: offset added to ``initBegin`` to get the end value of
            the very first run, in milliseconds; the default is ten years back
        :return: result of ``self._getSyncInfoByCfg`` for the prepared info
        """
        if not initBegin:  # default to the current time
            initBegin = time.time() * 1000

        if not oldSyncInfo:  # first crawl
            firstRun = {
                CFG_DOWN_INCPOINT: initBegin,  # start point of the next incremental run
                CFG_DOWN_SYNCBEGIN: initBegin,  # begin
                CFG_DOWN_SYNCCURR: initBegin,  # current
                CFG_DOWN_SYNCEND: initBegin + initEnd  # end
            }
            return self._getSyncInfoByCfg(firstRun)

        syncInfo = oldSyncInfo
        stockDone = syncInfo[CFG_DOWN_SYNCCURR] <= syncInfo[CFG_DOWN_SYNCEND]
        if stockDone or gConfig.get(CFG_DOWN_INCMODE, 0):
            # the backlog is finished (or deliberately abandoned):
            # switch to the normal incremental mode
            syncInfo[CFG_DOWN_SYNCEND] = syncInfo[CFG_DOWN_INCPOINT]
            for key in (CFG_DOWN_INCPOINT, CFG_DOWN_SYNCBEGIN,
                        CFG_DOWN_SYNCCURR):
                syncInfo[key] = initBegin
        else:
            # the previous run did not finish: resume from the break point
            logInfo("use break point mode,go on crawling from the last break point")
            syncInfo[CFG_DOWN_SYNCBEGIN] = syncInfo[CFG_DOWN_SYNCCURR]

        return self._getSyncInfoByCfg(syncInfo)
コード例 #13
0
    def update(self, table, params, condition=""):
        """
        Run an UPDATE statement on ``table``.

        The column values are passed as query parameters; the table name, the
        column names and ``condition`` are inserted into the SQL text as given.

        :param table: table name
        :param params: dict of column name -> new value
        :param condition: text appended after the SET clause
        :return: None
        """
        pairs = list(params.items())
        # one "column=<placeholder>" fragment per entry, values kept in the same order
        assignments = [key + "=" + self.valueHolder for key, _ in pairs]
        values = tuple([value for _, value in pairs])
        sql = "update %s set  %s %s" % (table, ",".join(assignments), condition)

        cur = self.safeExecute(sql, values)
        self.commitTransaction(cur)

        from superbase.globalData import gConfig
        if gConfig.get("debug.sql", None):
            # log a readable version of the statement with the values inlined
            readable = ",".join(
                ["`%s`='%s'" % (key, value) for key, value in params.items()])
            sql = "update {table} set {params} {condition}".format(
                table=table, params=readable, condition=condition)
            logInfo(sql)
コード例 #14
0
ファイル: sample.py プロジェクト: Curiou/redundancy
    def antiBlockTest1(self):
        """
        Anti-block test that matches a block message directly.

        Per the original note: crawling jobui too frequently may bring up a
        login page or an anti-crawling notice.

        :return: None
        """
        from spiderx.common.utility.antiBlockUtil import AntiBlockStrategy

        # changeAccount has to be implemented by the user
        class MyAntiBlockStrategy(AntiBlockStrategy):
            def __init__(self, strategy):
                AntiBlockStrategy.__init__(self, strategy)

            def changeAccount(self):
                logInfo("implement:change account,relogin,...")

        strategy = gConfig.get(CFG_AB_STRATEGY,
                               None) or "postpone 2;changeAccount"

        # Direct-block test; this is only a test, so the blockInfo text is
        # taken from a normal message.
        blockEntry = {
            "info": u"We're sorry but the page",
            "strategy": MyAntiBlockStrategy(strategy)
        }
        antiBlocksConf = [
            {
                # url key, '*' stands for any page
                'key': ['*'],
                # several infos are possible, each with its own strategy
                'blockInfo': [blockEntry],
            },
        ]

        # step 1: register the antiBlock configuration
        self.addAntiBlock(antiBlocksConf)

        # this template is arbitrary
        conf1 = {
            "logo": CssElement("div.company-logo > a > img", "src"),
            "name": CssElement("#companyH1 > a")
        }

        # per the original note the session restarts automatically on error,
        # retrying 10 times by default
        targetUrl = "http://www.jobui.com/cmp?keyword=%E4%B8%8A%E6%B5%B7%E4%BF%A1%E7%A4%BC"
        self.downOne(targetUrl, conf1)
        if self.antiBlock.isNeedExit():
            logInfo(u"重试10次失败 \n 无法继续处理,人工干预")
コード例 #15
0
ファイル: loginUtil.py プロジェクト: Curiou/redundancy
 def __init__(self):
     """
     Prepare the proxy helper: optional business DB connection, proxy server
     address and, for the "zhima" proxy API, the website bookkeeping.

     ``self.db`` and ``self.currProxyInfo`` are always defined (None by
     default) so that later attribute access cannot raise AttributeError when
     the ONLINE / "zhima" branches were not taken.
     """
     self.db = None
     self.currProxyInfo = None
     if gConfig.get("env") == "ONLINE":
         params = AccountManager().getAccount(CFG_DB_BUSINESS)
         if params:
             self.db = createDb2("loginHelperDb", params, dictCursor=1)
     self.proxyServerList = AccountManager().getAccount(CFG_SERVER_PROXY)
     # NOTE(review): the third entry is used as the server address; the
     # layout of this list is not visible here -- confirm.
     self.proxyServer = self.proxyServerList[2]
     self.proxyList = []
     self.usedIP = 0
     if gConfig.get(CFG_SERVER_PROXYAPI, None) == "zhima":
         # maps the configured website to a short code
         self.website = {
             "www_tianyancha_com": "tyc",
             "www_qichacha_com": "qcc",
         }
         self.proxyId = 0
         self.currWeb = self.website[gConfig.get(CFG_DOWN_WEBSITE)]
コード例 #16
0
 def retry(self):
     """
     Register one more retry after a block; once the counter exceeds
     CFG_AB_MAXRETRY (default 2) the counter is reset and ``self.setExit()``
     is called.
     """
     self.blocked = 0
     self.retryNum += 1
     logInfo("blocked!!--retry-%s"%self.retryNum)
     maxRetry = gConfig.get(CFG_AB_MAXRETRY,2)
     if self.retryNum <= maxRetry:
         return
     # retry budget exhausted
     logCritical("retry antiblcok Fail")
     self.retryNum = 0
     self.setExit()
コード例 #17
0
ファイル: resultUtil.py プロジェクト: Curiou/redundancy
 def getSavePath(self,params=None):
     """
     Build the download directory
     ``<CFG_DOWN_ROOT>/downData/<index>/<hour>/<batch>``, create it via
     ``mkdir`` and return it.

     :param params: not used by this method
     :return: the directory path
     """
     hour = getHour(getTimestamp())
     index = self.getDownIndex()
     rootPath = gConfig.get(CFG_DOWN_ROOT)
     batch = gConfig.get(CFG_JOB_BATCH)
     relative = "downData/{index}/{hour}/{batch}".format(
         index=index, batch=batch, hour=hour)
     path = os.path.join(rootPath, relative)
     mkdir(path)
     return path
コード例 #18
0
ファイル: baseCrawler.py プロジェクト: Curiou/redundancy
    def __init__(self, params, subConfig=None):
        """
        Initialise the crawler: default config, download root, job util,
        http agent, parser, extractor, and finally start the job.

        :param params: input config, forwarded to ``BaseClass.__init__``
        :param subConfig: optional dict that overrides the defaults below
        """
        self.basicConfig = {
            # http down related
            CFG_HTTP_INTERVAL: 0.01,  # interval between requests
            CFG_HTTP_TIMEOUT: 10,  #
            CFG_HTTP_OUTFORMAT: 'html',  # json
            CFG_HTTP_ENCODING: 'utf-8',  # gbk
            CFG_HTTP_UNESCAPE: 0,  # remove special character quoting
            CFG_HTTP_ENGINE: 'requests',  # selenium
            CFG_HTTP_UA: 'windows',  # mac,ios,android
            CFG_HTTP_BROWSERMODE: 'headless',  #
            CFG_HTTP_BROWSER: BROWSER_TPE_PHANTOMJS,
            # max requests per session; 0 means unlimited, otherwise the
            # session is restarted once the count is exceeded
            CFG_HTTP_MAXREQUEST: 0,
            CFG_JOB_RUNTIME: 0,  # crawler run time in seconds, 0 means unlimited
            CFG_JOB_HEARTBEAT: 60,  # job heartbeat interval in seconds
            CFG_DOWN_MAXNUM: 0,  # max downloads per run, 0 means unlimited
            CFG_DOWN_MAXPAGENUM: 0,  # max downloaded pages per run, 0 means unlimited
            # anti-block element check: after more than this many failed
            # checks the page counts as blocked (the page structure may also
            # have changed)
            CFG_BLOCK_MAXCHECK: 100,
            CFG_ACCOUNT_PROVIDER: "spideraccount.samanager",
        }

        if subConfig:
            # settings of the sub class override the defaults
            self.basicConfig.update(subConfig)
        # merge the basic config into the global config and set up logging
        BaseClass.__init__(self, params, self.basicConfig)
        # config that depends on the merged result must follow BaseClass directly
        if not gConfig.get(CFG_DOWN_ROOT, None):
            # Compare with == : the original `in ("ONLINE")` tested against
            # the plain string "ONLINE" (no tuple), i.e. it was a substring
            # test that also accepted values such as "" or "ON".
            onlineNonLinux = (gConfig.get("env") == "ONLINE"
                              and not isLinux())
            gConfig.set(CFG_DOWN_ROOT,
                        "d:/" if onlineNonLinux else PROJECT_ROOT)
        SpiderJobUtil.__init__(self)

        # SeleniumAgent when the configured engine is "selenium",
        # RequestsAgent for everything else
        self.http = RequestsAgent() if gConfig.get(
            CFG_HTTP_ENGINE, "requests") != "selenium" else SeleniumAgent()
        # lxml HTML parser using the configured encoding
        self.parser = etree.HTMLParser(encoding=gConfig.get(CFG_HTTP_ENCODING))
        # the extractor works with the http agent and the parser
        self.extractor = Extractor(self.http, self.parser)
        self.antiBlock = None

        # start the job
        self.syncPoint = self.jobBegin()
コード例 #19
0
ファイル: safeUtil.py プロジェクト: Curiou/redundancy
def isOnDevelop():
    """
    Tell whether the configured environment is the local one.

    :return: True when the upper-cased "env" config value equals "LOCAL"
    """
    envName = gConfig.get("env")
    return "LOCAL" == envName.upper()
コード例 #20
0
ファイル: accountUtil.py プロジェクト: Curiou/redundancy
 def __init__(self, provider=None):
     """
     Resolve the account provider.

     :param provider: module or full name of module, eg, xxAccount.provider;
         it is passed as the default to ``gConfig.get(CFG_ACCOUNT_PROVIDER, ...)``.
         A provider given as a string is imported with ``importlib``.
     """
     resolved = gConfig.get(CFG_ACCOUNT_PROVIDER, provider)
     self.provider = resolved
     if not isinstance(resolved, basestring):
         return
     # a module name was given: replace it with the imported module
     import importlib
     self.provider = importlib.import_module(resolved)
コード例 #21
0
ファイル: resultUtil.py プロジェクト: Curiou/redundancy
 def __init__(self, index=None):
     """
     Prepare the local sync file for ``index`` and run ``self.checkSync()``.

     :param index: sync index; when falsy CFG_DOWN_INDEX from the config is
         used. Without any index only an error is logged and
         ``self.syncRemote`` is set to False.
     """
     self.index = index or gConfig.get(CFG_DOWN_INDEX)
     if not self.index:
         logError("no syncIndex!!")
         self.syncRemote = False
         return

     syncDir = os.path.join(PROJECT_ROOT,"syncDir")
     localPath = os.path.join(syncDir,"%s.json"%self.index)
     # make sure the directory of the sync file exists
     mkdir(os.path.dirname(localPath))
     self.localFile = localPath
     self.lastSyncInfo = None
     self.saveNum = 0
     self.syncRemote = gConfig.get(CFG_JOB_ENABLE) or gConfig.get("debug.sync")
     self.checkSync()
コード例 #22
0
    def doCheckBlock(self, url, content, antiBlock):
        """
        Check ``content`` against one anti-block configuration.

        Two checks are made: first the configured block messages
        (``antiBlock["blockInfo"]``), then the expected page elements
        (``antiBlock["blockElement"]``).

        :param url: url of the page, used for logging only
        :param content: page content to inspect
        :param antiBlock: dict with optional "blockInfo" and "blockElement"
        :return: the strategy of the matching block message, or the element
            strategy once the element check failed more often than allowed;
            None otherwise
        """
        blockInfo = antiBlock.get("blockInfo", None)
        if blockInfo:

            for b1 in blockInfo:
                info = b1["info"]  # kept for compatibility (original note)
                # ">= 0" so that a message at the very start of the content
                # (index 0) is detected as well; the original "> 0" missed it.
                self.blocked = BLOCKED_INFO if content.find(info) >= 0 else 0
                if self.blocked:
                    logError("!!!!block by %s,url=%s" % (gConfig.get(CFG_JOB_NAME), url))
                    return b1["strategy"]

        # check the elements

        blocked = False
        element = antiBlock.get("blockElement",None)
        if element:

            strategy = element.get("strategy",None)
            elements = element["elements"]
            for template, value in elements:
                result = {}
                self.extractor.getResultByContent(content, template, result)
                checkName = result.get("name", None)
                # blocked when the element is missing or, if an expected
                # value is given, the element text does not contain it
                if not checkName or (value and checkName.find(value) == -1):
                    blocked = True
                else:
                    blocked = False
                    break  # one non-blocked element is enough, stop here
            if blocked:
                self.blockCheck += 1
                logError("%s:the element not exist,block?%s" % (self.blockCheck,url))
            else:
                self.blockCheck = 0  # reset

            globalCheckNum = gConfig.get(CFG_BLOCK_MAXCHECK,30)
            # a local maxCheckNum takes precedence over the global one
            localCheckNum = element.get("maxCheckNum",globalCheckNum)

            if self.blockCheck > localCheckNum:
                logError("block by element,pls check the content,maybe the structure has changed!")
                self.blocked = BLOCKED_ELEMENT
                self.blockCheck = 0
                return strategy
コード例 #23
0
ファイル: loginUtil.py プロジェクト: Curiou/redundancy
    def __init__(self, delAc=False):
        """
        Connect to the business database and reset the bookkeeping fields.

        :param delAc: stored unchanged as ``self.delAc``
        """
        dbParams = AccountManager().getAccount(CFG_DB_BUSINESS)
        self.db = createDb2("loginHelperDb", dbParams, dictCursor=1)
        self.delAc = delAc
        self.history = []
        self.curMaxId = 0
        self.offset = gConfig.get(CFG_DB_OFFSET, 0)
コード例 #24
0
ファイル: resultUtil.py プロジェクト: Curiou/redundancy
 def getDownIndex(self):
     """
     Return CFG_DOWN_INDEX without one trailing and one leading "/".

     The index is a path and may have several levels.

     :return: the trimmed index
     """
     idx = gConfig.get(CFG_DOWN_INDEX)
     # trailing slash first, then leading slash
     idx = idx[:-1] if idx.endswith("/") else idx
     return idx[1:] if idx.startswith("/") else idx
コード例 #25
0
 def alarmPageError(self, url, content, downInfo):
     """
     Report a page whose elements could not be parsed. Per the original
     note this may be a block or a changed page structure, so a warning mail
     is sent for manual inspection.

     The page is saved with ``AntiBlock.saveWrongPage``; with an enabled job
     the info is also inserted into the "block" table and mailed through
     ``Job().sendEmail``, otherwise ``Mail.sendEmail`` is used.

     :param url: url of the page
     :param content: page content to save
     :param downInfo: downNum,downTime,downInterval etc.; serialised with
         ``json.dumps``
     :return: None
     """
     fname, filePath = AntiBlock.saveWrongPage(content)
     info = {
         'jobName': gConfig.get(CFG_JOB_NAME),
         'batch': gConfig.get(CFG_JOB_BATCH),
         'url': url,
         'filePath': filePath,
         'type': self.blocked,
         'detail': json.dumps(downInfo),
         'inTime': getTimestamp(),
     }
     title = "block-%s" % self.blocked
     mailBody = getPrintDict(info)
     attach = [(fname, filePath)]
     recipient = gConfig.get(CFG_JOB_EMAIL, None)
     emails2 = [recipient] if recipient else []

     if not gConfig.get(CFG_JOB_ENABLE, 0):
         Mail.sendEmail(
             title=title,
             content=mailBody,
             t_address=emails2,
             attaches=attach
         )
     else:
         gTop.get('db').insert("block", info)
         from jobManager.job import Job
         Job().sendEmail(
             title=title,
             content=mailBody,
             attach=attach,
             emails2=emails2
         )
     logError("blocked?check the content\n%s" % getPrintDict(info))
コード例 #26
0
ファイル: urlUtil.py プロジェクト: Curiou/redundancy
 def pageUrls(self, listConf):
     """
     Generator over the urls of the list pages.

     The number of pages comes from ``self.getPageNum`` and is capped by
     CFG_DOWN_MAXPAGENUM when that value is non-zero. Iteration stops early
     when ``self.http.isBlocked()`` reports a block. Exceptions are reported
     through ``logException`` and not propagated.

     :param listConf: list configuration, forwarded to ``getPageNum`` and
         ``getNextPageUrl``
     :return: yields one url per page, pages counted from 1
     """
     try:
         url = self.beginUrl
         totalPage, err = self.getPageNum(url, listConf)
         if err:
             logError("getPageNum error?%s,url=%s" % (err, url))
         logInfo("%s url=%s\ntotalPage=%s" % (getTimestamp(), url, totalPage))
         # read and convert the limit once; 0 means no limit
         maxPage = int(gConfig.get(CFG_DOWN_MAXPAGENUM))
         totalPage = int(totalPage)
         if maxPage:
             totalPage = min(totalPage, maxPage)
         for page in range(totalPage):
             try:
                 url2 = self.getNextPageUrl(url, page + 1, listConf)
                 if self.http.isBlocked():
                     break
                 yield url2
             except Exception:
                 logException()
     except Exception:
         logException()
コード例 #27
0
    def test1(self, val1, val2):
        """
        Demonstrates:
        1, the use of the global config parameter test.size
        2, calling a case from a script
        python superbase/sample1.py "env=DEV,test.size=101"  test1 hello world
        python superbase/sample1.py "env=DEV,test.size=99"  test1 hello world

        :param val1: logged when test.size is greater than 100
        :param val2: logged otherwise
        :return:
        """
        size = gConfig.get("test.size", 0)
        chosen = val1 if size > 100 else val2
        logInfo("size=%s-%s" % (size, chosen))
コード例 #28
0
ファイル: aliyun.py プロジェクト: Curiou/redundancy
 def __init__(self, params=""):
     """
     Create the OSS bucket client from the ALIYUN account settings and
     compute the local path prefix.

     :param params: input config, forwarded to ``BaseClass.__init__``
     """
     subCfg = {
         CFG_ACCOUNT_PROVIDER: "spideraccount.samanager",
         #ALIYUN_LOCALROOT: PROJECT_ROOT,  # normally this is the local root
     }
     import oss2
     BaseClass.__init__(self, params, subCfg)

     aliyunCfg = AccountManager().getAccount(ALIYUN)
     keyId = aliyunCfg["accessKeyId"]
     keySecret = aliyunCfg["accessKeySecret"]
     endpoint = aliyunCfg["endPoint"]
     self.bucket = aliyunCfg["bucket"]
     auth = oss2.Auth(keyId, keySecret)
     self.oss = oss2.Bucket(auth, endpoint, aliyunCfg["bucket"])

     # the prefix always uses forward slashes
     localRoot = gConfig.get(ALIYUN_LOCALROOT, self._getDefaultDownRoot())
     self.prefix = localRoot.replace("\\", "/")
コード例 #29
0
 def exe(conn, sql, values):
     """
     Execute ``sql`` on a new cursor of ``conn`` and put the outcome on the
     queue ``q``: ``("ok", cursor)`` on success, ``("error", traceback text)``
     on any exception.

     :param conn: connection providing ``cursor()``
     :param sql: statement to execute
     :param values: query parameters; omitted from the call when falsy
     """
     try:
         cur = conn.cursor()
         # pass the values only when there are any
         callArgs = (sql, values) if values else (sql,)
         cur.execute(*callArgs)
         if gConfig.get("debug.sql", 0):
             logDebug(cur._last_executed)
         q.put(("ok", cur))
     except Exception:
         q.put(("error", traceback.format_exc()))
コード例 #30
0
ファイル: logUtil.py プロジェクト: Curiou/redundancy
def getLogFileName(batch, jobName):
    """
    Build the log file paths below ``PROJECT_ROOT + "log/"``.

    :param batch: job batch; without it no job log path is produced
    :param jobName: job name, used as the job log file name
    :return: tuple ``(jobLog, nodeLog)``; jobLog is ``<batch>/<jobName>.txt``
        under the log directory or None, nodeLog is the configured
        CFG_LOG_FILE_NAME under the log directory
    """
    from superbase.globalData import PROJECT_ROOT
    from superbase.globalData import gConfig

    logDir = PROJECT_ROOT + "log/"
    if batch:
        jobLog = os.path.join(logDir, "%s/%s.txt" % (batch, jobName))
    else:
        jobLog = None
    nodeLog = os.path.join(logDir, gConfig.get(CFG_LOG_FILE_NAME))
    return jobLog, nodeLog