Exemple #1
0
 def __enter__(self):
     """
     Enter the ``with`` block: acquire ``self.lock`` and return this object.

     A log line is written before the acquire call and another one after it
     returns, both tagged with ``self.name``.

     :return: self
     """
     beforeMsg = "before--lock %s" % self.name
     logInfo(beforeMsg)
     lock = self.lock
     lock.acquire()
     afterMsg = "lock %s" % self.name
     logInfo(afterMsg)
     return self
Exemple #2
0
 def __exit__(self, exc_type, exc_val, exc_tb):
     """
     Leave the ``with`` block: release ``self.lock`` and log the unlock.

     The exception arguments are ignored and nothing is returned, so an
     exception raised inside the ``with`` body is not suppressed.
     """
     lock = self.lock
     lock.release()
     unlockMsg = "unlock %s" % self.name
     logInfo(unlockMsg)
Exemple #3
0
    try:
        pro = psutil.Process(pid)
        if not createTime or pro.create_time() == float(createTime):
            for proc in pro.children(recursive=True):
                killone(proc)
            parent = pro.parent()
            killone(pro)
            if killParent:
                killone(parent)
        else:
            logError("error:createTime=%s,proTime=%s" %
                     (createTime, pro.create_time()))
    except psutil.NoSuchProcess, e:
        info = "\nthe process is killed already-%s" % pid
        logInfo(info)
        killInfo.append(info)
    except Exception:
        logException()
    return killInfo


def showProcess(qinfo='python'):
    """
    Get the process IDs and command lines of processes.

    :param qinfo: query string, e.g. python!!job.id=!!
        (NOTE(review): the meaning of the "!!" separators is not visible in
        this excerpt -- confirm against the rest of the function.)
    :return:

    Example invocation recorded by the original author:
    cd /opt/work/spiderman/superbase/;sudo git pull;cd ..;python jobManager/manage/node.py pkill getTYCDetail;exit
    """
    import psutil
    # Iterate over the PIDs of all running processes, then use each PID to
    # fetch that process's cmdline.
    # NOTE(review): the code implementing this is not visible in this excerpt.
Exemple #4
0
def runProcess(cmd,
               outInfo=None,
               maxOutInfoNum=1000,
               debug=False,
               redirect=False,
               exitInfo=None):
    """
    Run a shell command in a child process and consume its console output.

    stderr is merged into stdout and the output is read line by line until
    EOF, or until a line containing ``exitInfo`` is seen.

    :param cmd: shell command line. When ``redirect`` is true and the command
        contains ">", the text after the last ">" is taken as the output file
        (converted to an absolute path) and stripped from the command.
    :param outInfo: optional list that receives the console output lines
    :param maxOutInfoNum: once more than this many lines have been appended,
        ``outInfo`` is trimmed down to its last line (0 or less disables it)
    :param debug: debug mode only logs the command line and does not run it
    :param redirect: whether to copy the output into the redirect file
    :param exitInfo: stop reading as soon as a line contains this text
    :return: ``outInfo`` (``None`` in debug mode). Exceptions are logged via
        ``logException`` and not re-raised.
    """
    # cmd += "\n" #what the hell use it?
    from superbase.utility.logUtil import logInfo
    # Bind up front: with redirect=True but no ">" in cmd the name used to be
    # unbound, and the resulting UnboundLocalError silently aborted the run.
    redirectFile = None
    try:
        if redirect:
            idx = cmd.rfind(">")
            if idx > 0:  # a redirect target is present; it is made absolute
                outfile = os.path.abspath(cmd[idx + 1:].strip())
                logInfo("redirect-file=%s" % outfile)
                from superbase.utility.ioUtil import mkdir
                mkdir(os.path.dirname(outfile))
                redirectFile = open(outfile, "w")
                cmd = cmd[:idx]
        logDebug("\n%s the cmd is %s\n" % (timeUtil.getCurTime(), cmd))
        if debug:
            return
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=True)
        lineNum = 0
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if exitInfo and line.find(exitInfo) >= 0:
                break
            if outInfo is not None:
                outInfo.append(line)
                lineNum += 1
                if maxOutInfoNum > 0 and lineNum > maxOutInfoNum:
                    # Cap memory use: keep only the most recent line.
                    del outInfo[:-1]
                    lineNum = 0
                    if redirectFile:
                        redirectFile.flush()
            # Written regardless of outInfo, so redirect mode also works when
            # the caller does not collect the lines in a list.
            if redirectFile:
                redirectFile.write(line)

        logDebug("process-done:%s" % cmd)
    except Exception:
        from superbase.utility.logUtil import logException
        logException()
    finally:
        # Close the redirect file on every path, including the debug early
        # return and failures, so the handle is never leaked.
        if redirectFile:
            redirectFile.close()

    return outInfo
Exemple #5
0
    def post(self, userId='9650668145'):
        """
        Download the posts of one user, e.g. 9650668145 or 1761234358.

        Sample response:
        {"count":20,"statuses":[...],"total":474,"page":2,"maxPage":24}
        Sample URLs:
        https://xueqiu.com/v4/statuses/user_timeline.json?user_id=9650668145&page=12&type=&_=1508802861537
        https://xueqiu.com/v4/statuses/user_timeline.json?user_id=1761234358&page=12&type=&_=1508802861537

        Pages are fetched one after another while the current sync position
        is greater than the sync end position; each post older than the sync
        begin position is passed to the save handler. ``self.jobDone()`` is
        called once the loop finishes.

        :param userId: id of the user whose timeline is downloaded
        :return: None
        """
        # Each user gets an index of their own.
        newIndex = gConfig.get(CFG_DOWN_INDEX) + "/%s" % userId
        gConfig.set(CFG_DOWN_INDEX, newIndex)
        urlTemp = "https://xueqiu.com/v4/statuses/user_timeline.json?user_id={userId}&page={pageIdx}&type=&_={ts}"
        # The first record serves as the sync point because it is the newest
        # post.
        syncPoint = SyncPoint()
        # Fetch the stored sync-point information first.
        sp = syncPoint.getSyncPoint()
        oldSyncInfo = sp.get("syncInfo", None) if sp else None
        syncInfo = syncPoint.getNewSyncInfoByDesc(oldSyncInfo)
        saveHandler = HourlySaveLocalHandler(syncPoint=syncPoint)

        def save(result):
            """Hand the posts of one response page to the save handler.

            :param result: decoded response; its "statuses" list is read
            :return: number of posts passed to the save handler
            """
            posts = result["statuses"]
            if len(posts) == 0:
                # An empty page means there is nothing left: mark the sync as
                # finished so the outer loop stops.
                syncInfo[CFG_DOWN_SYNCEND] = syncInfo[CFG_DOWN_SYNCCURR]
                syncPoint.saveLastSyncInfo({CFG_DOWN_SYNCINFO: syncInfo})
                return 0
            saveNum = 0
            for post in posts:
                retweetId = post["retweet_status_id"]
                record = {
                    "xid": post["id"],
                    "userId": post["user_id"],
                    "commentId": post.get("commentId", 0),
                    "topicId": retweetId,
                    # For a retweet the topic belongs to the original author.
                    "topicUser": (post["retweeted_status"]["user_id"]
                                  if retweetId else post["user_id"]),
                    "replyCount": post["reply_count"],
                    "favCount": post["fav_count"],
                    "likeCount": post["like_count"],
                    "retweetCount": post["retweet_count"],
                    "viewCount": post["view_count"],
                    "inTime": post["created_at"],
                }

                test = True  # test switch: while True, post content is not saved
                if not test:
                    text = dehtml(post["text"])
                    idx = text.find("//@")
                    if idx > 0:
                        text = text[:idx]
                    wordCount = len(text)
                    # Only texts longer than the configured minimum
                    # (default 200) are stored.
                    if wordCount > gConfig.get("xueqiu.postSaveMinWord", 200):
                        # Fixed: a stray trailing comma used to store a
                        # 1-tuple here instead of the text itself.
                        record["info"] = text
                    record["wordCount"] = wordCount
                    if post["reward_count"] > 0:
                        reward = {
                            "count": post["reward_count"],
                            "users": post["reward_user_count"],
                            "amount": post["reward_amount"],
                        }
                        record["reward"] = json.dumps(reward)

                if int(record["inTime"]) < int(syncInfo[CFG_DOWN_SYNCBEGIN]):
                    syncInfo[CFG_DOWN_SYNCCURR] = record["inTime"]
                    record[CFG_DOWN_SYNCINFO] = syncInfo
                    saveHandler.handle(record)
                    saveNum += 1

            return saveNum

        downNum = 0
        # if gConfig.get(CFG_DOWN_INCMODE,1) else hasDowned/20+1
        # (this value could be used as a resume point)
        curPage = 0
        while int(syncInfo[CFG_DOWN_SYNCCURR]) > int(
                syncInfo[CFG_DOWN_SYNCEND]):
            url = urlTemp.format(userId=userId,
                                 pageIdx=curPage,
                                 ts=int(time.time()))
            result = self.downOne(url, None)
            downNum += save(result)
            curPage += 1
            logInfo("user=%s,curPage=%s,downNum=%s" %
                    (userId, curPage, downNum))

        self.jobDone()
Exemple #6
0
 def testAsyncJob(self):
     """
     Create a ``TAsyncJob`` for the ``testJob`` command line with
     ``delay=10``, then log that this method has finished.
     """
     jobCmd = "python spiderx/testx.py testJob"
     TAsyncJob(jobCmd, delay=10)
     doneMsg = "testAsyncJob done!"
     logInfo(doneMsg)
Exemple #7
0
 def testExc(self):
     """
     Log a marker line and then divide one by zero.

     :raises ZeroDivisionError: always, from the division below
     """
     logInfo("testing---")
     divisor = 0
     _ = 1 / divisor
Exemple #8
0
 def testDb(self):
     """
     Set the "env" config value to "ONLINE", read one job name through the
     object registered under ``CFG_DB_MONITOR``, log it, and then upload
     ``log/spiderx.log`` (relative to ``PROJECT_ROOT``) with ``AliYun``.
     """
     gConfig.set("env", "ONLINE")
     monitorDb = gTop.get(CFG_DB_MONITOR)
     jobName = monitorDb.getOne("select name from job limit 1")
     logInfo("testing-name-%s" % jobName)
     from superbase.utility.aliyun import AliYun
     uploader = AliYun()
     logPath = os.path.join(PROJECT_ROOT, "log/spiderx.log")
     uploader.upFile(logPath)
Exemple #9
0
    def downCoupon(self, url):
        """
        Download the coupons listed on one page.

        There are three kinds of pages:
        1. Flagship store / exclusive store:
            Taobao: http://super.fanli.com/brand-5311?spm=super_home.pc.bid-5311
            non-Taobao: http://super.fanli.com/brand-32665?spm=super_home.pc.bid-32665
        2. Collection of ordinary brand shops: http://super.fanli.com/brand-1181?spm=super_home.pc.bid-1181
        3. http://super.fanli.com/brand-63591?pid=13276411402&spm=super_home.pc.pid-13276411402~bid-63591&lc=super_abtest_14071c

        When the page contains group links, each linked page is downloaded
        recursively. Otherwise the page's items are extracted and handed to a
        ``CouponHandler`` through ``self.downOneList2``.

        :param url: address of the page to download
        :return: None
        """
        self._enableScroll(length=8000, num=3)
        result = {}
        content = self.http.get(url)
        self.extractor.getResultByContent(
            content, {
                "groups":
                ListElement(
                    "div.gather-wrap> div.container > div.gather-floor-content",
                    itemCssElement={
                        "url": CssElement("a.gather-link", "href")
                    })
            }, result)
        if result["groups"]:  # mode 2: a collection of brands
            for item in result["groups"]:
                logInfo("group mode:begin download %s" % item["url"])
                self.downCoupon(item["url"])
        else:
            self.extractor.getResultByContent(
                content, {"mode": CssElement("a.coupon-link>p.detail")},
                result)
            # pageMode selects both the regex pattern set and the item
            # selectors used below.
            pageMode = 1 if result["mode"] else 0

            def handleCoupon(result):
                """Convert one extracted list item into a coupon record.

                :param result: item dict with "url", "time" and "val"
                :return: the coupon dict, or None when the url is empty, the
                    url is not a taobao/tmall one, or an exception was logged
                """
                try:
                    patterns = self.patterns[pageMode]
                    if not result["url"]:
                        return logError("handleCoupon:url is null")
                    url = safeReg1(patterns["url"], result["url"], "url")
                    if "taobao.com" in url or "tmall.com" in url:
                        begin, end = result.get("time").split(",")
                        result2 = {
                            "beginTime": getTimestampBySec(float(begin)),
                            "endTime": getTimestampBySec(float(end)),
                        }
                        result2["info"] = result["val"]
                        result2["url"] = url

                        if pageMode == 0:
                            result2["sellerId"] = safeReg1(
                                patterns["sellerId"], result["url"],
                                "sellerId")
                            result2["couponId"] = safeReg1(
                                patterns["couponId"], result["url"],
                                "couponId")
                            result2["productId"] = safeReg1(
                                patterns["productId"], result["url"],
                                "productId")
                            result2["limit"], result2["discount"] = patterns[
                                "discount"].search(result["val"]).groups()
                        else:
                            result2["couponId"] = safeReg1(
                                patterns["couponId"], result["url"],
                                "couponId")
                            result2["productId"] = safeReg1(
                                patterns["productId"], result["url"],
                                "productId")
                            result2["discount"] = safeReg1(
                                patterns["discount"], result["url"],
                                "discount")
                            # Fixed: a stray trailing comma used to store the
                            # tuple ("",) in both fields instead of "".
                            result2["sellerId"] = result2["limit"] = ""

                        return result2
                    else:
                        logError("the url is not taobao or tmall!")
                except Exception:
                    logException(result.get("url", "no_url"))

            class CouponHandler(HourlySaveLocalHandler):
                """Save handler that converts each item with handleCoupon."""

                def __init__(self):
                    HourlySaveLocalHandler.__init__(self)

                def preProcess(self, result):
                    result = HourlySaveLocalHandler.preProcess(self, result)
                    return handleCoupon(result)

            if pageMode == 1:  # non-standard mode
                listItemConf = {
                    "time": CssElement(None, "data-time"),
                    "url": CssElement("a.coupon-link", "href"),
                    "val": CssElement("a.coupon-link>p.coupon"),
                }
            else:
                listItemConf = {
                    "time": CssElement(None, "data-time"),
                    "url": CssElement("a.item-coupon", "href"),
                    "val": CssElement("a.item-coupon"),
                }

            self.downOneList2(
                url,
                content,
                listConf={TAG_LIST_ITEMS: "[class*=J_super_item]"},
                listItemConf=listItemConf,
                resultHandler=CouponHandler())