# Imports used by this section; project modules (Configs, HttpUtils, WeiXin,
# DBUtils, Account, logger, process_proxy, delIp, removeIp, testStr) are
# assumed to be defined or imported elsewhere in the file.
import re
import time
import random
import requests
from xml.dom.minidom import parseString


def parseAccount(keyword, currentPage):
    # Search Sogou for public accounts matching `keyword` and parse one result page
    link = Configs.WEIXIN_GZH_URL.format(key=keyword, pageNo=str(currentPage))
    try:
        page = HttpUtils(process_proxy()).gotoUrlWithCookie(link, WeiXin.process_cookie())
        if page == "":
            logger.warn("HttpUtils error: %s" % (link))
            return []
        # Detect the anti-spider block page
        pos = page.find(testStr)
        if pos > 1:
            logger.warn("Request rejected by Sogou: %s" % (link))
            return []
        # Parse the page: each result yields (openid, ext, logo, name, account id)
        pre = re.compile(
            '<div.*?class="wx-rb.*?_item".*?href="/gzh\\?openid=(.*?)&ext=(.*?)".*?target="_blank".*?>.*?' +
            '<div.*?class="img-box">.*?<span class="ico-bg"></span>.*?<img.*?src="(.*?)".*?onload=.*?</div>.*?' +
            '<div.*?class="txt-box.*?<h3>(.*?)</h3>.*?<h4>.*?<span>(.*?)</span>.*?</h4>.*?</div>',
            re.S)
        items = re.findall(pre, page)
        logger.info("Page parsed successfully: " + link)
        return items
    except Exception as e:
        logger.warn("Error parsing page: %s ERROR: %s" % (link, e))
        return []  # keep the return type consistent on failure
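
# A minimal usage sketch (hypothetical driver, not project code): walk the
# first few result pages for one keyword; the tuple layout follows the regex
# groups in parseAccount above.
def demo_search(keyword, max_pages=3):
    for pageNo in range(1, max_pages + 1):
        results = parseAccount(keyword, pageNo)
        if not results:
            break  # empty page, HTTP failure, or blocked by Sogou
        for openid, ext, logo, name, account in results:
            print("%s -> openid=%s ext=%s" % (account, openid, ext))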
def parseIndexHotArt(self):
    # Timestamp + random salt (kept from the original; WEIXIN_HOT_ART does not use it)
    timestamp = int(time.time())
    randomNum = random.randint(100, 999)
    t = str(timestamp) + str(randomNum)
    cookie = WeiXin.process_cookie()  # cookie used to fetch articles
    proxy = process_proxy()           # pick a proxy
    # Fetch the hot-article index page for this account's page number
    url = Configs.WEIXIN_HOT_ART.format(pageNo=self.pageNo)
    page = HttpUtils(proxy).gotoUrlWithCookie(url, cookie)
    # Re-interpret the mis-decoded response bytes as UTF-8
    page = page.encode('ISO-8859-1').decode("utf-8")
    pre = re.compile(
        '<li.*?id="(.*?)".*?>.*?' +
        '<div.*?>.*?<img.*?src="(.*?)".*?">.*?</div>.*?' +
        '<div.*?>.*?<a.*?href=".*?openid=(.*?)&ext=(.*?)".*?>.*?<p>.*?<img.*?src="(.*?)".*?</p>.*?<p.*?title="(.*?)">.*?</p>.*?</a>.*?</div>.*?' +
        '<div.*?>.*?<h4>.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?</h4>.*?</div>.*?'
        '</li>', re.S)
    items = re.findall(pre, page)
    for item in items:
        doc_id = item[0]
        article_head = item[1]
        openid = item[2]
        ext = item[3]
        account_logo = item[4]
        account_name = item[5]
        article_url = item[6]
        article_title = item[7]
        article_desc = "--"  # no description on the index page; store a placeholder
        # Insert the article (string-built SQL as in the original; see the
        # parameterized sketch after this function for a safer variant)
        sql = ("insert into wd_article_hot (article_title, article_url, article_head, "
               "article_desc, account_name, account_logo, openid, ext, doc_id, create_time) values (")
        sql = (sql + "\"" + article_title + "\",\"" + article_url + "\",\"" + article_head +
               "\",\"" + article_desc + "\",\"" + account_name + "\",\"" + account_logo +
               "\",\"" + openid + "\",\"" + ext + "\",\"" + doc_id + "\",sysdate())")
        logger.info("[new] index hot article [" + account_name.encode('utf-8') +
                    "], title=" + article_title.encode('utf-8'))
        DBUtils.excute(sql)
        # Register the account if this openid has not been seen before
        account = "testing"  # placeholder account id, as in the original
        accountType = 0
        rows = Account().getRecByOpenId(openid)
        if len(rows) == 0:
            sql = ("insert into wd_public_account (account, name, openid, ext, logo, type, create_time) values(" +
                   "\"" + account + "\",\"" + account_name + "\",\"" + openid + "\",\"" + ext +
                   "\",\"" + account_logo + "\"," + str(accountType) + ",sysdate())")
            logger.info("account update: [insert] " + account + " name: " + account_name.encode("utf-8"))
            DBUtils.excute(sql)
        elif rows[0][4] != ext:  # column 4 holds the stored ext
            sql = ("update wd_public_account set ext=\"" + ext +
                   "\",update_time=sysdate() where account=\"" + account + "\"")
            logger.info("account update: [update] " + account + " name: " + account_name.encode("utf-8"))
            DBUtils.excute(sql)
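
# The inserts above splice values straight into SQL, which breaks on embedded
# quotes and is open to injection. A minimal sketch of a parameterized variant,
# assuming a MySQL backend (sysdate() suggests one) and a plain MySQLdb
# connection rather than the project's DBUtils wrapper:
import MySQLdb

def insert_hot_article(conn, row):
    # `row` is the (doc_id, head, openid, ext, logo, name, url, title) tuple
    # produced by the regex in parseIndexHotArt
    sql = ("insert into wd_article_hot (article_title, article_url, article_head, "
           "article_desc, account_name, account_logo, openid, ext, doc_id, create_time) "
           "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, sysdate())")
    cur = conn.cursor()
    cur.execute(sql, (row[7], row[6], row[1], "--", row[5], row[4], row[2], row[3], row[0]))
    conn.commit()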
def getProxy(self, target):
    # Scrape ip/port pairs from a Kuaidaili-style proxy list page
    logger.info("[KDL] proxy list target: %s", target)
    txt = HttpUtils().gotoUrlWithCookie(target, [])
    txt = txt.encode("utf-8")
    tr_pattern = re.compile('<tr>(.*?)</tr>', re.S)
    td_pattern = re.compile('<td>(.*?)</td>', re.S)
    for item in re.findall(tr_pattern, txt):
        tds = re.findall(td_pattern, item)
        # The first two cells of each data row are IP and port
        if len(tds) > 2:
            self.rawProxyList.append((tds[0], tds[1]))
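
# Raw scraped proxies are often dead. A minimal liveness check, assuming the
# requests library is available; check_proxy is a hypothetical helper, not part
# of this project. It keeps only proxies that answer within a short timeout:
def check_proxy(ip, port, timeout=5):
    proxies = {"http": "http://%s:%s" % (ip, port)}
    try:
        r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False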
def writeSogouCookie():
    # _find is requests' private RequestsCookieJar helper returning a cookie's
    # value; HttpUtils().getCookie is assumed to return a compatible jar
    SUV = HttpUtils().getCookie(Configs.WEIXIN_COOKIE_SUV)._find("SUV")
    OtherCookie = requests.get(
        Configs.WEIXIN_COOKIE_URL.format(q=time.time()),
        cookies={"SUV": SUV}).cookies
    ABTEST = OtherCookie._find("ABTEST")
    IPLOC = OtherCookie._find("IPLOC")
    SNUID = OtherCookie._find("SNUID")
    SUID = OtherCookie._find("SUID")
    cookieStr = ABTEST + "," + IPLOC + "," + SNUID + "," + SUID + "," + SUV + ","
    # Append the refreshed cookie line to the shared cookie file
    with open(Configs.COOKIE_TXT_PATH, 'a') as output:
        output.write("\n")
        output.write(cookieStr)
    logger.info("cookie file updated: " + cookieStr)
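
# A sketch of the reading side, assuming WeiXin.process_cookie picks one line
# from the same file and rebuilds a cookie mapping for HttpUtils; this helper
# is illustrative only, the real implementation lives in the WeiXin module.
def read_random_cookie():
    with open(Configs.COOKIE_TXT_PATH) as f:
        lines = [l.strip() for l in f if l.strip()]
    line = random.choice(lines)
    # writeSogouCookie stores values in ABTEST,IPLOC,SNUID,SUID,SUV order
    names = ["ABTEST", "IPLOC", "SNUID", "SUID", "SUV"]
    values = [v for v in line.split(",") if v]
    return dict(zip(names, values))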
def parseArticleWithAccount(args):
    openid, ext, currentPage, account = args[0], args[1], args[2], args[3]
    t = str(int(time.time())) + str(random.randint(100, 999))
    cookie = WeiXin.process_cookie()  # cookie used to fetch articles
    proxy = process_proxy()           # pick a proxy
    # Fetch this account's article list
    url = Configs.WEIXIN_ART_URL.format(openid=openid, ext=ext, pageno=currentPage, t=t)
    logger.info("fetching account articles, name: %s page %s" % (account.encode("utf-8"), currentPage))
    page = HttpUtils(proxy).gotoUrlWithCookie(url, cookie)
    # Connection failure: drop the proxy and stop (check None before calling len)
    if page is None or len(page) == 0:
        delIp(proxy)
        return "stop"
    # Detect the anti-spider block page
    pos = page.find(testStr)
    if pos > 1:
        logger.info("article fetch rejected by Sogou: IP %s %s %s" % (proxy, account.encode("utf8"), url))
        removeIp(proxy)
        return "stop"
    # Unwrap the JSONP-style payload into an XML document
    page = page.replace("\\", "")
    page, number = re.subn(r"<\?xml version=.*?encoding=.*?>", "", page)
    page, number = re.subn(r'sogou.weixin.gzhcb.*?items":\["', "", page)
    end = page.find('"]})')
    page = page[0:end]
    page = "<wx>" + page + "</wx>"
    page = page.encode("utf-8")
    try:
        doc = parseString(page)
    except Exception as Argument:
        logger.warn("error parsing article data IP: %s %s" % (Argument, page))
        removeIp(proxy)
        return "stop"
logger.warn("解析文章数据错误 IP:%s %s" % (Argument, page)) removeIp(proxy) return "stop" documents = doc.documentElement.getElementsByTagName("DOCUMENT") for document in documents: status = "0" item = document.getElementsByTagName('item')[0] display = item.getElementsByTagName("display")[0] article_title = display.getElementsByTagName( 'title')[0].childNodes[0].data # 文章标题', article_url = display.getElementsByTagName( 'url')[0].childNodes[0].data # 文章地址', proxy = process_proxy() article_url_real = HttpUtils(proxy).getRedirctUrl( Configs.WEIXIN_HOST.format(key=article_url), cookie) if (article_url_real is None or len(article_url_real) == 0 or article_url_real.find("antispider") > 0): logger.warn("换取微信地址异常,Sogou拒绝 %s %s" % (proxy, article_url_real)) removeIp(proxy) continue article_head = display.getElementsByTagName( 'imglink')[0].childNodes[0].data # 图片', account_name = display.getElementsByTagName( 'sourcename')[0].childNodes[0].data # 公众号名称', account_logo = display.getElementsByTagName( 'headimage')[0].childNodes[0].data # 公众号LOGO', openid = display.getElementsByTagName( 'openid')[0].childNodes[0].data # 公众账号', ext = display.getElementsByTagName(