def __enter__(self):
    logInfo("before--lock %s" % self.name)
    self.lock.acquire()
    logInfo("lock %s" % self.name)
    return self

def __exit__(self, exc_type, exc_val, exc_tb):
    self.lock.release()
    logInfo("unlock %s" % self.name)
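
# Usage sketch (assumption): a hypothetical host class for the __enter__/__exit__
# methods above. The name NamedLock, its constructor, and the use of print instead
# of logInfo are illustrative only.
import threading

class NamedLock(object):
    def __init__(self, name):
        self.name = name
        self.lock = threading.Lock()

    def __enter__(self):
        print("before--lock %s" % self.name)
        self.lock.acquire()
        print("lock %s" % self.name)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.lock.release()
        print("unlock %s" % self.name)

# the with-statement guarantees the lock is released even if the body raises
with NamedLock("job-queue"):
    pass  # critical section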
try:
    pro = psutil.Process(pid)
    if not createTime or pro.create_time() == float(createTime):
        # kill the children first, then the process itself (and optionally its parent)
        for proc in pro.children(recursive=True):
            killone(proc)
        parent = pro.parent()
        killone(pro)
        if killParent:
            killone(parent)
    else:
        logError("error:createTime=%s,proTime=%s" % (createTime, pro.create_time()))
except psutil.NoSuchProcess:
    info = "\nthe process is killed already-%s" % pid
    logInfo(info)
    killInfo.append(info)
except Exception:
    logException()
return killInfo


def showProcess(qinfo='python'):
    """
    Get the pid and cmdline of running processes.
    :param qinfo: python!!job.id=!!
    :return: cd /opt/work/spiderman/superbase/;sudo git pull;cd ..;python jobManager/manage/node.py pkill getTYCDetail;exit
    """
    import psutil
    # walk the pids of all running processes, then fetch each process's cmdline by pid
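
# Sketch (assumption): the body of showProcess is cut off above; this is a minimal
# stand-alone version of what its comment describes -- walk every running process
# and print pid + cmdline for those whose cmdline contains qinfo. The function name,
# filtering, and output format are illustrative only.
import psutil

def _showProcessSketch(qinfo='python'):
    for proc in psutil.process_iter():
        try:
            cmdline = " ".join(proc.cmdline())
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue  # process exited or is not readable; skip it
        if qinfo in cmdline:
            print("pid=%s cmdline=%s" % (proc.pid, cmdline))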
def runProcess(cmd, outInfo=None, maxOutInfoNum=1000, debug=False, redirect=False, exitInfo=None):
    """
    Run a command as a child process and capture its console output.
    :param cmd: command line to execute
    :param outInfo: list that collects the console output lines
    :param maxOutInfoNum: maximum number of console lines to keep in outInfo
    :param debug: debug mode only logs the command line without running it
    :param redirect: whether to redirect output to the file named after the trailing ">" in cmd
    :param exitInfo: stop reading output once this message appears
    :return:
    """
    # cmd += "\n"  # what the hell use it?
    from superbase.utility.logUtil import logInfo
    try:
        redirectFile = None
        if redirect:
            idx = cmd.rfind(">")
            if idx > 0:
                # a trailing ">" means redirection; the target must be an absolute path
                outfile = cmd[idx + 1:].strip()
                outfile = os.path.abspath(outfile)
                logInfo("redirect-file=%s" % outfile)
                dir1 = os.path.dirname(outfile)
                from superbase.utility.ioUtil import mkdir
                mkdir(dir1)
                redirectFile = open(outfile, "w")
                cmd = cmd[:idx]
        logDebug("\n%s the cmd is %s\n" % (timeUtil.getCurTime(), cmd))
        if debug:
            return
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
        lineNum = 0
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if exitInfo and line.find(exitInfo) >= 0:
                break
            # log.debug(line)
            if outInfo is not None:
                outInfo.append(line)
                lineNum += 1
                if maxOutInfoNum > 0 and lineNum > maxOutInfoNum:
                    # cap reached: drop everything except the most recent line
                    del outInfo[:-1]
                    lineNum = 0
                    if redirectFile:
                        redirectFile.flush()
            if redirectFile:
                redirectFile.write(line)
        if redirectFile:
            redirectFile.close()
        logDebug("process-done:%s" % cmd)
    except Exception:
        from superbase.utility.logUtil import logException
        logException()
    return outInfo
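
# Usage sketch (assumption): relies on runProcess above and on the module-level
# names it expects (os, subprocess, logDebug, timeUtil); the echo commands and the
# /tmp output path are illustrative only.
def _exampleRunProcess():
    outInfo = []
    runProcess("echo hello && echo done", outInfo=outInfo)
    for line in outInfo:
        print(line)  # captured console lines, capped at maxOutInfoNum
    # redirect mode: everything after the trailing ">" becomes the output file path
    runProcess("echo hello > /tmp/runProcess.out", redirect=True)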
def post(self, userId='9650668145'):
    """
    Fetch the user's posts.
    9650668145 1761234358
    {"count":20,"statuses":[...],"total":474,"page":2,"maxPage":24}
    https://xueqiu.com/v4/statuses/user_timeline.json?user_id=9650668145&page=12&type=&_=1508802861537
    https://xueqiu.com/v4/statuses/user_timeline.json?user_id=1761234358&page=12&type=&_=1508802861537
    :param userId:
    :return:
    """
    # each user gets its own download index
    newIndex = gConfig.get(CFG_DOWN_INDEX) + "/%s" % userId
    gConfig.set(CFG_DOWN_INDEX, newIndex)
    urlTemp = "https://xueqiu.com/v4/statuses/user_timeline.json?user_id={userId}&page={pageIdx}&type=&_={ts}"
    syncPoint = SyncPoint()
    # the first item serves as the sync point because it is the newest post
    # load the existing sync point info first
    sp = syncPoint.getSyncPoint()
    oldSyncInfo = sp.get("syncInfo", None) if sp else None
    syncInfo = syncPoint.getNewSyncInfoByDesc(oldSyncInfo)
    saveHandler = HourlySaveLocalHandler(syncPoint=syncPoint)

    def save(result):
        posts = result["statuses"]
        if len(posts) == 0:
            # no more posts: close the sync window and persist it
            syncInfo[CFG_DOWN_SYNCEND] = syncInfo[CFG_DOWN_SYNCCURR]
            syncPoint.saveLastSyncInfo({CFG_DOWN_SYNCINFO: syncInfo})
            return 0
        saveNum = 0
        for post in posts:
            result = {
                "xid": post["id"],
                "userId": post["user_id"],
                "commentId": post.get("commentId", 0),
                "topicId": post["retweet_status_id"],
                "topicUser": post["retweeted_status"]["user_id"] if post["retweet_status_id"] else post["user_id"],
                "replyCount": post["reply_count"],
                "favCount": post["fav_count"],
                "likeCount": post["like_count"],
                "retweetCount": post["retweet_count"],
                "viewCount": post["view_count"],
                "inTime": post["created_at"],
            }
            test = True  # testing: do not save the post text
            if not test:
                text = dehtml(post["text"])
                idx = text.find("//@")
                if idx > 0:
                    text = text[:idx]
                wordCount = len(text)
                if wordCount > gConfig.get("xueqiu.postSaveMinWord", 200):
                    # only keep posts longer than the minimum word count (default 200)
                    result["info"] = text
                    result["wordCount"] = wordCount
            if post["reward_count"] > 0:
                reward = {
                    "count": post["reward_count"],
                    "users": post["reward_user_count"],
                    "amount": post["reward_amount"],
                }
                result["reward"] = json.dumps(reward)
            if int(result["inTime"]) < int(syncInfo[CFG_DOWN_SYNCBEGIN]):
                syncInfo[CFG_DOWN_SYNCCURR] = result["inTime"]
            result[CFG_DOWN_SYNCINFO] = syncInfo
            saveHandler.handle(result)
            saveNum += 1
        return saveNum

    downNum = 0
    curPage = 0  # if gConfig.get(CFG_DOWN_INCMODE, 1) else hasDowned / 20 + 1  # can be used as a resume checkpoint
    while int(syncInfo[CFG_DOWN_SYNCCURR]) > int(syncInfo[CFG_DOWN_SYNCEND]):
        url = urlTemp.format(userId=userId, pageIdx=curPage, ts=int(time.time()))
        result = self.downOne(url, None)
        downNum += save(result)
        curPage += 1
        logInfo("user=%s,curPage=%s,downNum=%s" % (userId, curPage, downNum))
    self.jobDone()
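
# Illustration (assumption): a minimal stand-alone sketch of the sync-window loop
# used by post() above -- page backwards through posts until the cursor timestamp
# falls back to the end of the previously crawled window. Plain dicts and a fake
# page generator replace SyncPoint/downOne; all names and values are hypothetical.
def _syncWindowSketch():
    syncInfo = {"syncBegin": 1508900000, "syncCurr": 1508900000, "syncEnd": 1508800000}

    def fakePage(page):
        # stand-in for downOne(): 20 posts per page, each page 20000 seconds older
        base = 1508900000 - page * 20000
        return [{"created_at": base - i * 1000} for i in range(20)]

    page = 0
    while syncInfo["syncCurr"] > syncInfo["syncEnd"]:
        for post in fakePage(page):
            if post["created_at"] < syncInfo["syncBegin"]:
                # move the cursor backwards, as save() does with inTime
                syncInfo["syncCurr"] = post["created_at"]
        page += 1
    print("stopped-at-page=%s" % page)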
def testAsyncJob(self):
    cmd2 = "python spiderx/testx.py testJob"
    TAsyncJob(cmd2, delay=10)
    logInfo("testAsyncJob done!")

def testExc(self):
    logInfo("testing---")
    a = 1 / 0  # deliberately raise ZeroDivisionError to exercise exception logging

def testDb(self):
    gConfig.set("env", "ONLINE")
    name = gTop.get(CFG_DB_MONITOR).getOne("select name from job limit 1")
    logInfo("testing-name-%s" % name)
    from superbase.utility.aliyun import AliYun
    AliYun().upFile(os.path.join(PROJECT_ROOT, "log/spiderx.log"))
def downCoupon(self, url):
    """
    There are three kinds of pages:
    1. Flagship/exclusive stores:
       Taobao: http://super.fanli.com/brand-5311?spm=super_home.pc.bid-5311
       non-Taobao: http://super.fanli.com/brand-32665?spm=super_home.pc.bid-32665
    2. Collections of ordinary brand stores: http://super.fanli.com/brand-1181?spm=super_home.pc.bid-1181
    3. http://super.fanli.com/brand-63591?pid=13276411402&spm=super_home.pc.pid-13276411402~bid-63591&lc=super_abtest_14071c
    :param url:
    :return:
    """
    self._enableScroll(length=8000, num=3)
    result = {}
    content = self.http.get(url)
    self.extractor.getResultByContent(
        content, {
            "groups": ListElement(
                "div.gather-wrap> div.container > div.gather-floor-content",
                itemCssElement={
                    "url": CssElement("a.gather-link", "href")
                })
        }, result)
    if result["groups"]:
        # mode 2, brands: a collection page, recurse into each brand page
        for item in result["groups"]:
            logInfo("group mode:begin download %s" % item["url"])
            self.downCoupon(item["url"])
    else:
        self.extractor.getResultByContent(
            content, {"mode": CssElement("a.coupon-link>p.detail")}, result)
        pageMode = 1 if result["mode"] else 0

        def handleCoupon(result):
            try:
                patterns = self.patterns[pageMode]
                if not result["url"]:
                    return logError("handleCoupon:url is null")
                url = safeReg1(patterns["url"], result["url"], "url")
                if "taobao.com" in url or "tmall.com" in url:
                    begin, end = result.get("time").split(",")
                    result2 = {
                        "beginTime": getTimestampBySec(float(begin)),
                        "endTime": getTimestampBySec(float(end)),
                    }
                    result2["info"] = result["val"]
                    result2["url"] = url
                    if pageMode == 0:
                        result2["sellerId"] = safeReg1(patterns["sellerId"], result["url"], "sellerId")
                        result2["couponId"] = safeReg1(patterns["couponId"], result["url"], "couponId")
                        result2["productId"] = safeReg1(patterns["productId"], result["url"], "productId")
                        result2["limit"], result2["discount"] = patterns["discount"].search(result["val"]).groups()
                    else:
                        result2["couponId"] = safeReg1(patterns["couponId"], result["url"], "couponId")
                        result2["productId"] = safeReg1(patterns["productId"], result["url"], "productId")
                        result2["discount"] = safeReg1(patterns["discount"], result["url"], "discount")
                        result2["sellerId"] = result2["limit"] = ""
                    return result2
                else:
                    logError("the url is not taobao or tmall!")
            except Exception:
                logException(result.get("url", "no_url"))

        class CouponHandler(HourlySaveLocalHandler):
            def __init__(self):
                HourlySaveLocalHandler.__init__(self)

            def preProcess(self, result):
                result = HourlySaveLocalHandler.preProcess(self, result)
                return handleCoupon(result)

        if pageMode == 1:
            # non-standard mode
            listItemConf = {
                "time": CssElement(None, "data-time"),
                "url": CssElement("a.coupon-link", "href"),
                "val": CssElement("a.coupon-link>p.coupon"),
            }
        else:
            listItemConf = {
                "time": CssElement(None, "data-time"),
                "url": CssElement("a.item-coupon", "href"),
                "val": CssElement("a.item-coupon"),
            }
        self.downOneList2(
            url,
            content,
            listConf={TAG_LIST_ITEMS: "[class*=J_super_item]"},
            listItemConf=listItemConf,
            resultHandler=CouponHandler())
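
# Illustration (assumption): the data-time attribute holds "begin,end" in epoch
# seconds; safeReg1, getTimestampBySec, and the entries of self.patterns are not
# shown here, so this sketch uses plain re/time stand-ins with hypothetical values.
import re
import time

def _couponParseSketch():
    rawTime = "1508802000,1509802000"  # hypothetical data-time value: "begin,end"
    begin, end = rawTime.split(",")
    beginTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(begin)))
    endTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(end)))
    print("coupon valid %s -> %s" % (beginTime, endTime))

    # hypothetical sellerId extraction; the real patterns live in self.patterns[pageMode]
    url = "https://shop.m.taobao.com/shop/coupon.htm?sellerId=123456&activityId=abc"
    m = re.search(r"sellerId=(\d+)", url)
    print("sellerId=%s" % (m.group(1) if m else ""))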