Example #1
0
def jsonFromFile(path1, encoding="utf-8"):
    """Load a JSON file, preserving the key order of every JSON object.

    :param path1: path of the JSON file to read
    :param encoding: text encoding used to decode the file
    :return: the parsed data (JSON objects become ``OrderedDict``), or
        ``None`` when the file cannot be read or parsed; the failure is
        reported through ``logError``
    """
    try:
        import io
        from collections import OrderedDict
        # Decode at the file layer: io.open honours ``encoding`` on both
        # Python 2 and Python 3, whereas json.load() no longer accepts an
        # ``encoding`` keyword (removed in Python 3.9).
        with io.open(path1, "r", encoding=encoding) as f:
            return json.load(f, object_pairs_hook=OrderedDict)
    except Exception:
        # Best effort by design: any read/parse failure is logged and
        # reported to the caller as None.
        from superbase.utility.logUtil import logError
        logError("no the file -%s" % path1)
        return None
Example #2
0
def assert2(condition, info="assert error"):
    """
    Log and raise when ``condition`` is falsy.

    :param condition: value that is expected to be truthy
    :param info: message that is logged and attached to the raised error
    :return: None when ``condition`` is truthy
    :raises AssertionError: when ``condition`` is falsy
    """

    if not condition:
        # Log the caller's message; the falsy value on its own is not
        # enough to identify which check failed.
        logError("%s (condition=%r)" % (info, condition))
        # Raise explicitly: an ``assert`` statement is removed when Python
        # runs with -O, which would silently disable this check.
        raise AssertionError(info)
Example #3
0
 def checkError(self):
     """
     Check whether accounts are being fetched too frequently.

     Reads ``account.checkInterval`` (default 300, seconds according to the
     original comment) and ``account.checkLimit`` (default 10) from gConfig.
     Once ``self.history`` holds more than ``limit`` entries, the first
     field of the newest entry is compared with the first field of the
     ``limit``-th newest entry.

     :return: True when that difference is below the interval (an error is
         logged), False otherwise
     """
     interval = gConfig.get("account.checkInterval", 300)  # check window
     limit = gConfig.get("account.checkLimit", 10)  # entries per window
     if len(self.history) > limit:
         # Index with the configured limit rather than a literal 10, so a
         # smaller limit cannot run off the front of the history list.
         # NOTE(review): presumably history[i][0] is a timestamp - confirm.
         diff = self.history[-1][0] - self.history[-limit][0]
         if diff < interval:
             logError("getAccount too frequently!!-%s" % diff)
             return True
     return False
Example #4
0
 def __init__(self, index=None):
     """
     Set up the sync state for one download index.

     :param index: sync index; when falsy, the ``CFG_DOWN_INDEX`` value
         from gConfig is used instead
     """
     self.index = index if index else gConfig.get(CFG_DOWN_INDEX)
     # Initialise every attribute up front so an instance created without
     # an index still exposes them instead of raising AttributeError later.
     self.localFile = None
     self.lastSyncInfo = None
     self.saveNum = 0
     if not self.index:
         logError("no syncIndex!!")
         self.syncRemote = False
         return

     # Local state file: <PROJECT_ROOT>/syncDir/<index>.json
     syncDir = os.path.join(PROJECT_ROOT, "syncDir")
     path1 = os.path.join(syncDir, "%s.json" % self.index)
     mkdir(os.path.dirname(path1))
     self.localFile = path1
     # Remote sync is on when the job is enabled or "debug.sync" is set.
     self.syncRemote = gConfig.get(CFG_JOB_ENABLE) or gConfig.get("debug.sync")
     self.checkSync()
Example #5
0
            def handleCoupon(result):
                """Turn one extracted coupon record into a normalized dict.

                Reads ``result["url"]``, ``result["time"]`` (two numbers
                separated by a comma) and ``result["val"]``; the regex
                patterns come from ``self.patterns[pageMode]`` of the
                enclosing scope.

                :param result: dict holding the raw extracted fields
                :return: dict with ``beginTime``, ``endTime``, ``info``,
                    ``url``, ``sellerId``, ``couponId``, ``productId``,
                    ``limit`` and ``discount`` for taobao/tmall URLs;
                    the value of ``logError`` when the URL is empty;
                    ``None`` for other URLs or when an exception was
                    caught (it is logged via ``logException``)
                """
                try:
                    patterns = self.patterns[pageMode]
                    if not result["url"]:
                        return logError("handleCoupon:url is null")
                    url = safeReg1(patterns["url"], result["url"], "url")
                    if "taobao.com" in url or "tmall.com" in url:
                        begin, end = result.get("time").split(",")
                        result2 = {
                            "beginTime": getTimestampBySec(float(begin)),
                            "endTime": getTimestampBySec(float(end)),
                            "info": result["val"],
                            "url": url,
                        }

                        if pageMode == 0:
                            result2["sellerId"] = safeReg1(
                                patterns["sellerId"], result["url"],
                                "sellerId")
                            result2["couponId"] = safeReg1(
                                patterns["couponId"], result["url"],
                                "couponId")
                            result2["productId"] = safeReg1(
                                patterns["productId"], result["url"],
                                "productId")
                            # In this mode limit and discount are both
                            # parsed out of the coupon text.
                            result2["limit"], result2["discount"] = patterns[
                                "discount"].search(result["val"]).groups()
                        else:
                            result2["couponId"] = safeReg1(
                                patterns["couponId"], result["url"],
                                "couponId")
                            result2["productId"] = safeReg1(
                                patterns["productId"], result["url"],
                                "productId")
                            result2["discount"] = safeReg1(
                                patterns["discount"], result["url"],
                                "discount")
                            # Not available in this mode: store empty
                            # strings (a trailing comma here would store
                            # the tuple ("",) instead).
                            result2["sellerId"] = result2["limit"] = ""

                        return result2
                    else:
                        logError("the url is not taobao or tmall!")
                except Exception:
                    logException(result.get("url", "no_url"))
Example #6
0
 def alarmPageError(self, url, content, downInfo):
     """
     Report a page whose elements could not be parsed.

     The cause may be blocking or a change in the page structure, so the
     page is saved and a warning mail is sent for manual inspection.

     :param url: URL of the offending page
     :param content: page content, saved through AntiBlock.saveWrongPage
     :param downInfo: download details (downNum, downTime, downInterval
         etc.), stored JSON-encoded
     :return: None
     """
     fname, filePath = AntiBlock.saveWrongPage(content)

     info = {}
     info['jobName'] = gConfig.get(CFG_JOB_NAME)
     info['batch'] = gConfig.get(CFG_JOB_BATCH)
     info['url'] = url
     info['filePath'] = filePath
     info['type'] = self.blocked
     info['detail'] = json.dumps(downInfo)
     info['inTime'] = getTimestamp()

     subject = "block-%s" % self.blocked
     body = getPrintDict(info)
     attachments = [(fname, filePath)]

     # Mail the configured job address, if there is one.
     recipients = []
     if gConfig.get(CFG_JOB_EMAIL, None):
         recipients.append(gConfig.get(CFG_JOB_EMAIL))

     if not gConfig.get(CFG_JOB_ENABLE, 0):
         # No job context: send the mail directly.
         Mail.sendEmail(
             title=subject,
             content=body,
             t_address=recipients,
             attaches=attachments
         )
     else:
         # Job context: record the block in the db and mail through the job.
         gTop.get('db').insert("block", info)
         from jobManager.job import Job
         Job().sendEmail(
             title=subject,
             content=body,
             attach=attachments,
             emails2=recipients
         )

     # Rendered again on purpose: this runs after the optional db insert.
     logError("blocked?check the content\n%s" % getPrintDict(info))
Example #7
0
    def downOneList2(self, url, content, listConf, listItemConf,
                     resultHandler):
        """
        Concrete implementation of downOneList.

        :param url: only used in the log message
        :param content: page content
        :param listConf: list configuration; ``TAG_LIST_ITEMS`` holds the
            selector for the rows
        :param listItemConf: configuration for one list item
        :param resultHandler: object whose ``handle(result)`` receives
            each extracted row
        :return: -1 when no rows are found, 0 otherwise
        """
        # Parse the content as a file-like object with self.parser and wrap
        # the document root in pq so it can be queried with selectors.
        document = etree.parse(StringIO(content), self.parser)
        root = pq(document.getroot())

        # Select the rows of the list.
        rows = root(listConf[TAG_LIST_ITEMS])

        if not rows or len(rows) <= 0:
            # The list is missing: log the offending url.
            logError("%s !no lists" % url)
            return -1

        for row in rows:
            try:
                # Extract the fields of this row into a dict.
                result = {}
                self.extractor.getResult(pq(row), listItemConf, result)
                logDebug(getPrintDict(result))
                resultHandler.handle(result)
            except Exception:
                # A failing row is logged and skipped; the rest still run.
                logException()
        return 0
Example #8
0
 def updateResume(self):
     """
     Log in to liepin.com and click the update link over and over.

     Usage:
     python spider/sample/fanli.py update.time=600 updateResume

     The pause between clicks is the ``update.time`` gConfig value
     (default 300). When the update link cannot be found, the whole login
     sequence is started again.

     :return: never returns normally (outer loop runs forever)
     """
     # NOTE(review): the account name and password are hard-coded below;
     # consider moving them to configuration.
     loginLinkXpath = '//*[@id="home"]/div[2]/div[1]/div/div/section[2]/form/div[3]/p/a'
     userXpath = '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/div[1]/input'
     passwordXpath = '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/div[2]/input'
     submitXpath = '//*[@id="home"]/div[2]/div[1]/div/div/section[1]/div[1]/form/input[3]'
     updateXpath = '// *[ @ id = "home"] / div[3] / div[2] / div[1] / div[4] / ul / li[1] / a'

     while True:
         # Login sequence.
         self.http.get("https://www.liepin.com/sh/")
         driver = self.http.driver
         driver.find_element_by_xpath(loginLinkXpath).click()
         time.sleep(5)
         driver.find_element_by_xpath(userXpath).send_keys("*****@*****.**")
         driver.find_element_by_xpath(passwordXpath).send_keys("pwdh8f_liepin")
         driver.find_element_by_xpath(submitXpath).click()
         time.sleep(10)

         # Update loop: at most 300000 clicks per login.
         num = 0
         while num < 300000:
             link = findElement(driver, By.XPATH, updateXpath)
             if not link:
                 # Link not found: leave the inner loop and log in again.
                 logError("try again!")
                 break
             link.click()
             time.sleep(gConfig.get("update.time", 300))
             self.http.get("https://c.liepin.com/")
             logDebug("update %s" % num)
             num += 1
Example #9
0
 def pageUrls(self, listConf):
     """
     Generator over the URLs of the list pages.

     Starts from ``self.beginUrl``, asks ``self.getPageNum`` for the page
     count, caps it with ``CFG_DOWN_MAXPAGENUM`` when that is non-zero,
     and yields ``self.getNextPageUrl(url, page, listConf)`` for pages
     1..totalPage. (Original note: the URL is assembled automatically by
     appending ``&str=``.)

     Iteration stops early when ``self.http.isBlocked()`` is true. All
     exceptions are logged through ``logException`` and not propagated.

     :param listConf: list configuration passed to getPageNum and
         getNextPageUrl
     :return: generator of page URLs
     """
     try:
         url = self.beginUrl
         totalPage, err = self.getPageNum(url, listConf)
         if err:
             logError("getPageNum error?%s,url=%s" % (err, url))
         logInfo("%s url=%s\ntotalPage=%s" % (getTimestamp(), url, totalPage))
         # Read the cap once; a missing or empty setting means "no cap"
         # rather than an int(None) failure that would end the generator.
         maxPage = int(gConfig.get(CFG_DOWN_MAXPAGENUM) or 0)
         totalPage = int(totalPage)
         if maxPage:
             totalPage = min(totalPage, maxPage)
         for page in range(totalPage):
             try:
                 # Pages are numbered from 1.
                 url2 = self.getNextPageUrl(url, page + 1, listConf)
                 if self.http.isBlocked():
                     break
                 yield url2
             except Exception:
                 logException()
     except Exception:
         logException()
Example #10
0
    def checkDownStatus(self, num):
        """
        Check the download status after ``num`` downloads.

        :param num: number of items downloaded so far
        :return: ``ERR_BLOCK`` when ``self.http.isBlocked()`` is true,
            ``ERR_MAXNUM`` when ``num`` exceeds ``CFG_DOWN_MAXNUM``,
            ``ERR_TIMEOUT`` when the job has run longer than
            ``CFG_JOB_RUNTIME``; ``None`` when nothing is wrong or when an
            exception was caught (it is logged via ``logException``)
        """
        try:
            # Show progress every CFG_DEBUG_PROGRESS downloads (default
            # 2048). The bit mask only means "every N" when N is a power
            # of two.
            progressStep = gConfig.get(CFG_DEBUG_PROGRESS, 2048)
            if num & (progressStep - 1) == 0:
                logInfo(
                    "%s_%s:down=%s" %
                    (getTimestamp(),
                     gConfig.get(CFG_DOWN_WEBSITE, "undefined website"), num))

            # The job heartbeat is refreshed on every call.
            self.jobHearBeat()

            if self.http.isBlocked():
                return ERR_BLOCK

            # Maximum number of downloads (0 disables the check).
            maxNum = gConfig.get(CFG_DOWN_MAXNUM, 0)
            if maxNum and num > maxNum:
                logError("!!reach the maxNum %s" % maxNum)
                return ERR_MAXNUM

            # Maximum run time of the job (0 disables the check).
            if gConfig.get(CFG_JOB_RUNTIME, 0) > 0:
                beginTime = int(gConfig.get(CFG_JOB_BEGINTIME))
                runTime = int(gConfig.get(CFG_JOB_RUNTIME))
                if time.time() - ts2seconds(beginTime) > runTime:
                    logInfo("begin=%s:exit for runTime=%s out" %
                            (beginTime, runTime))
                    return ERR_TIMEOUT

        except Exception:
            logException()
Example #11
0
    def doCheckBlock(self, url, content, antiBlock):
        """
        Decide whether a downloaded page shows that the crawler is blocked.

        Two checks run in order:

        1. ``antiBlock["blockInfo"]``: each entry has an ``info`` text and
           a ``strategy``. If the text occurs in ``content``,
           ``self.blocked`` becomes ``BLOCKED_INFO`` and that strategy is
           returned.
        2. ``antiBlock["blockElement"]``: ``elements`` is a sequence of
           ``(template, value)`` pairs. Each template is extracted from
           ``content`` and the page counts as suspicious when no pair
           yields a ``name`` (containing ``value``, when a value is
           given). Suspicious pages are counted in ``self.blockCheck``;
           once the count exceeds ``maxCheckNum`` (falling back to
           ``CFG_BLOCK_MAXCHECK``, default 30), ``self.blocked`` becomes
           ``BLOCKED_ELEMENT`` and the element strategy is returned.

        :param url: URL of the page, used in log messages
        :param content: page content
        :param antiBlock: anti-block configuration dict
        :return: the strategy to apply when blocked, otherwise ``None``
        """
        blockInfo = antiBlock.get("blockInfo", None)
        if blockInfo:

            for b1 in blockInfo:
                info = b1["info"]  # kept for compatibility
                # ">= 0" so that a match at the very start of the content
                # counts too; an empty marker never matches.
                found = bool(info) and content.find(info) >= 0
                self.blocked = BLOCKED_INFO if found else 0
                if self.blocked:
                    logError("!!!!block by %s,url=%s" % (gConfig.get(CFG_JOB_NAME), url))
                    return b1["strategy"]

        # check the elements

        blocked = False
        element = antiBlock.get("blockElement", None)
        if element:

            strategy = element.get("strategy", None)
            elements = element["elements"]
            for template, value in elements:
                result = {}
                self.extractor.getResultByContent(content, template, result)
                checkName = result.get("name", None)
                if not checkName or (value and checkName.find(value) == -1):
                    blocked = True
                else:
                    blocked = False
                    break  # one matching element is enough: not blocked
            if blocked:
                self.blockCheck += 1
                logError("%s:the element not exist,block?%s" % (self.blockCheck, url))
            else:
                self.blockCheck = 0  # reset

            globalCheckNum = gConfig.get(CFG_BLOCK_MAXCHECK, 30)
            # A per-element limit overrides the global one.
            localCheckNum = element.get("maxCheckNum", globalCheckNum)

            if self.blockCheck > localCheckNum:
                logError("block by element,pls check the content,maybe the structure has changed!")
                self.blocked = BLOCKED_ELEMENT
                self.blockCheck = 0
                return strategy
Example #12
0
def JobWrap(self):
    """Build a decorator that runs a function inside the job lifecycle.

    The wrapped call does, in order: ``self.jobBegin()``, the function
    itself, and finally either ``self.jobDone()`` or, when
    ``self.jobError`` is set, ``self.jobFail()`` plus an error log.
    An exception raised by ``jobBegin`` or the function is stored in
    ``self.jobError`` and logged, not propagated; the wrapped call then
    returns ``None``.

    :param self: object providing jobBegin/jobDone/jobFail and a
        ``jobError`` attribute
    :return: a decorator; the decorated function accepts the same
        positional and keyword arguments as the original
    """
    def wrap1(func):
        @functools.wraps(func)
        def __decorator(*params, **kwargs):

            try:
                self.jobBegin()
                return func(*params, **kwargs)
            except Exception as e:
                self.jobError = e
                logException()
            finally:
                # Runs on success as well as failure, before the return
                # value above is handed back to the caller.
                if self.jobError:
                    self.jobFail()
                    logError("jobFail because:%s" % self.jobError)
                else:
                    self.jobDone()

        return __decorator
    return wrap1
class JobUtil(object):
    """Holds the ``Job`` instance when the process runs as a managed job."""

    def __init__(self):
        """Enable job mode from the job id and create the Job if enabled.

        ``CFG_JOB_ENABLE`` is set to the value of ``CFG_JOB_ID`` (default
        0), so the job is enabled only when a job id is configured.
        ``self.job`` is a ``Job`` instance in that case, otherwise None.
        """
        jobId = gConfig.get(CFG_JOB_ID, 0)
        gConfig.set(CFG_JOB_ENABLE, jobId)

        self.job = None
        if gConfig.get(CFG_JOB_ENABLE, 0):
            # Imported only when needed: the job manager is required only
            # in job mode.
            from jobManager.job import Job
            self.job = Job()
Example #13
0
        except psutil.NoSuchProcess, e:
            killInfo.append("has killed:%s" % info)
        except Exception, e:
            logException()

    try:
        pro = psutil.Process(pid)
        if not createTime or pro.create_time() == float(createTime):
            for proc in pro.children(recursive=True):
                killone(proc)
            parent = pro.parent()
            killone(pro)
            if killParent:
                killone(parent)
        else:
            logError("error:createTime=%s,proTime=%s" %
                     (createTime, pro.create_time()))
    except psutil.NoSuchProcess, e:
        info = "\nthe process is killed already-%s" % pid
        logInfo(info)
        killInfo.append(info)
    except Exception:
        logException()
    return killInfo


def showProcess(qinfo='python'):
    """
    获取进程的进程号和cmdline
    :param qinfo:python!!job.id=!!
    :return:
    cd /opt/work/spiderman/superbase/;sudo git pull;cd ..;python jobManager/manage/node.py pkill getTYCDetail;exit