Example #1
def parseAccount(keyword, currentPage):
    link = Configs.WEIXIN_GZH_URL.format(key=keyword, pageNo=str(currentPage))
    try:
        page = HttpUtils(process_proxy()).gotoUrlWithCookie(
            link, WeiXin.process_cookie())
        if page == "":
            logger.warn("HttpUtils异常: %s" % (link))
            return []
        pos = page.find(testStr)
        if pos > 1:
            logger.warn("Sogou拒绝访问: %s" % (link))
            return []

        # Parse the result page
        pre = re.compile(
            r'<div.*?class="wx-rb.*?_item".*?href="/gzh\?openid=(.*?)&amp;ext=(.*?)".*?target="_blank".*?>.*?'
            r'<div.*?class="img-box">.*?<span class="ico-bg"></span>.*?<img.*?src="(.*?)".*?onload=.*?</div>.*?'
            r'<div.*?class="txt-box.*?<h3>(.*?)</h3>.*?<h4>.*?<span>(.*?)</span>.*?</h4>.*?</div>',
            re.S)
        items = re.findall(pre, page)
        logger.info("解析页面成功:" + link)
        return items
    except Exception as e:
        logger.warn("Page parse error: %s ERROR:%s" % (link, e))
        return []
Example #2
    def parseIndexHotArt(self):
        # Timestamp plus random suffix (built here but not used in this method)
        timestamp = int(time.time())
        randomNum = random.randint(100, 999)
        t = str(timestamp) + str(randomNum)
        # Cookie used for fetching articles
        cookie = WeiXin.process_cookie()
        # Obtain a proxy
        proxy = process_proxy()
        # Fetch the hot-article list page
        url = Configs.WEIXIN_HOT_ART.format(pageNo=self.pageNo)
        page = HttpUtils(proxy).gotoUrlWithCookie(url, cookie)
        # Response was decoded with the wrong charset; re-decode the bytes as UTF-8
        page = page.encode('ISO-8859-1').decode("utf-8")
        pre = re.compile(
            '<li.*?id="(.*?)".*?>.*?' +
            '<div.*?>.*?<img.*?src="(.*?)".*?">.*?</div>.*?' +
            '<div.*?>.*?<a.*?href=".*?openid=(.*?)&ext=(.*?)".*?>.*?<p>.*?<img.*?src="(.*?)".*?</p>.*?<p.*?title="(.*?)">.*?</p>.*?</a>.*?</div>.*?' +
            '<div.*?>.*?<h4>.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?</h4>.*?</div>.*?' +
            '</li>', re.S)
        items = re.findall(pre, page)
        for item in items:
            doc_id = item[0]
            account_logo = item[4]
            openid = item[2]
            ext = item[3]
            article_head = item[1]
            account_name = item[5]
            article_url = item[6]
            article_title = item[7]
            article_desc = ""
            # 文章入库
            sql = "insert into wd_article_hot (article_title, article_url, article_head, article_desc, account_name, account_logo, openid, ext,doc_id, create_time ) values ("
            sql = sql + "\"" + article_title + "\",\"" + article_url + "\",\"" + article_head + "\",\"" + "--" + "\",\"" + account_name + "\",\"" + account_logo + "\",\"" + openid + "\",\"" + ext + "\",\"" + doc_id + "\"," + "sysdate())"
            logger.info("【新增】首页热门文章【" + (account_name.encode('utf-8')) +
                        "】 ,【标题】=" + (article_title.encode('utf-8')))
            DBUtils.excute(sql)

            # Add the account if it is not already known
            account = "testing"  # hard-coded placeholder account id
            accountType = 0
            rows = Account().getRecByOpenId(openid)
            count = len(rows)
            if count == 0:
                sql = "insert into wd_public_account (account, name, openid, ext, logo, type, create_time) values("
                sql = sql + "\"" + account + "\",\"" + account_name + "\",\"" + openid + "\",\"" + ext + "\"," + "\"" + account_logo + "\"," + str(
                    accountType) + ",sysdate())"
                logger.info("公众号更新:[新增]" + account + " 名称:" +
                            account_name.encode("utf-8"))
                DBUtils.excute(sql)
            else:
                if rows[0][4] <> ext:
                    sql = "update wd_public_account set ext=" + "\"" + ext + "\",update_time=sysdate() where account=" + "\"" + account + "\""
                    logger.info("公众号更新:[更新]" + account + " 名称:" +
                                account_name.encode("utf-8"))
                    DBUtils.excute(sql)
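
The string-concatenated INSERT/UPDATE statements above break on quotes inside
titles and are open to SQL injection. A hedged sketch of the same article
insert using DB-API parameter binding (assumes a PyMySQL connection and a
hypothetical item dict rather than the repo's DBUtils helper):

import pymysql  # assumption: any DB-API 2.0 driver binds parameters this way

def insert_hot_article(conn, item):
    # Parameterized version of the wd_article_hot insert; the driver escapes
    # every value, so quotes in titles cannot break the statement.
    sql = ("insert into wd_article_hot (article_title, article_url, "
           "article_head, article_desc, account_name, account_logo, "
           "openid, ext, doc_id, create_time) "
           "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, sysdate())")
    with conn.cursor() as cur:
        cur.execute(sql, (item["title"], item["url"], item["head"], "--",
                          item["account_name"], item["logo"],
                          item["openid"], item["ext"], item["doc_id"]))
    conn.commit()
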
Example #3
    def getProxy(self, target):
        logger.info("[KDL] proxy-list target site: %s", target)
        txt = HttpUtils().gotoUrlWithCookie(target, [])
        txt = txt.encode("utf-8")
        pattern = re.compile('<tr>(.*?)</tr>', re.S)
        items = re.findall(pattern, txt)
        for item in items:
            pattern = re.compile('<td>(.*?)</td>', re.S)
            tds = re.findall(pattern, item)
            if len(tds) > 2:
                self.rawProxyList.append((tds[0], tds[1]))  # (ip, port)
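
A self-contained sketch of the nested <tr>/<td> scraping technique above,
run against an invented fragment of the proxy-list table:

import re

sample = ('<tr><td>1.2.3.4</td><td>8080</td><td>HTTP</td></tr>'
          '<tr><td>5.6.7.8</td><td>3128</td><td>HTTPS</td></tr>')

rawProxyList = []
for row in re.findall('<tr>(.*?)</tr>', sample, re.S):
    tds = re.findall('<td>(.*?)</td>', row, re.S)
    if len(tds) > 2:
        rawProxyList.append((tds[0], tds[1]))  # (ip, port)

print(rawProxyList)  # [('1.2.3.4', '8080'), ('5.6.7.8', '3128')]
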
Example #4
def writeSogouCookie():
    # _find is a private RequestsCookieJar helper that raises KeyError when
    # the cookie is missing; jar.get(name) is the public equivalent.
    SUV = HttpUtils().getCookie(Configs.WEIXIN_COOKIE_SUV)._find("SUV")
    OtherCookie = requests.get(Configs.WEIXIN_COOKIE_URL.format(q=time.time()),
                               cookies={
                                   "SUV": SUV
                               }).cookies
    ABTEST = OtherCookie._find("ABTEST")
    IPLOC = OtherCookie._find("IPLOC")
    SNUID = OtherCookie._find("SNUID")
    SUID = OtherCookie._find("SUID")
    cookieStr = ABTEST + "," + IPLOC + "," + SNUID + "," + SUID + "," + SUV + ","
    with open(Configs.COOKIE_TXT_PATH, 'a') as output:
        output.write("\n")
        output.write(cookieStr)
    logger.info("Cookie file updated: " + cookieStr)
Example #5
def parseArticleWithAccount(args):
    openid = args[0]
    ext = args[1]
    currentPage = args[2]
    account = args[3]

    t = str(int(time.time())) + str(random.randint(100, 999))  # cache-busting token
    cookie = WeiXin.process_cookie()  # cookie used for fetching articles
    proxy = process_proxy()  # obtain a proxy

    url = Configs.WEIXIN_ART_URL.format(openid=openid,
                                        ext=ext,
                                        pageno=currentPage,
                                        t=t)  # article-list URL for this account
    logger.info("Fetching account articles, account name: %s page %s" %
                (account.encode("utf-8"), currentPage))
    page = HttpUtils(proxy).gotoUrlWithCookie(url, cookie)
    # Connection failure
    if page is None or len(page) == 0:
        delIp(proxy)
        return "stop"

    # Check whether the request was flagged as abusive
    pos = page.find(testStr)
    if pos > 1:
        logger.info("Sogou denied article fetch: IP %s %s %s" %
                    (proxy, account.encode("utf8"), url))
        removeIp(proxy)
        return "stop"

    # Strip the JSONP wrapper and rebuild the payload as parseable XML
    page = page.replace("\\", "")
    page, number = re.subn(r"<\?xml version=.*?encoding=.*?>", "", page)
    page, number = re.subn(r'sogou.weixin.gzhcb.*?items":\["', "", page)
    end = page.find("\"]})")
    page = page[0:end]
    page = "<wx>" + page + "</wx>"
    page = page.encode("utf-8")

    try:
        doc = parseString(page)
    except Exception as e:
        logger.warn("Failed to parse article data: %s %s" % (e, page))
        removeIp(proxy)
        return "stop"

    documents = doc.documentElement.getElementsByTagName("DOCUMENT")
    for document in documents:
        status = "0"
        item = document.getElementsByTagName('item')[0]
        display = item.getElementsByTagName("display")[0]
        article_title = display.getElementsByTagName(
            'title')[0].childNodes[0].data  # article title
        article_url = display.getElementsByTagName(
            'url')[0].childNodes[0].data  # article URL

        proxy = process_proxy()
        article_url_real = HttpUtils(proxy).getRedirctUrl(
            Configs.WEIXIN_HOST.format(key=article_url), cookie)
        if (article_url_real is None or len(article_url_real) == 0
                or article_url_real.find("antispider") > 0):
            logger.warn("换取微信地址异常,Sogou拒绝 %s %s" % (proxy, article_url_real))
            removeIp(proxy)
            continue

        article_head = display.getElementsByTagName(
            'imglink')[0].childNodes[0].data  # article image
        account_name = display.getElementsByTagName(
            'sourcename')[0].childNodes[0].data  # account name
        account_logo = display.getElementsByTagName(
            'headimage')[0].childNodes[0].data  # account logo
        openid = display.getElementsByTagName(
            'openid')[0].childNodes[0].data  # account openid
        ext = display.getElementsByTagName(
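
The cleanup steps above turn Sogou's JSONP-style response into XML that
minidom can parse. A self-contained sketch of the same unwrap-and-parse
technique, using an invented miniature payload:

import re
from xml.dom.minidom import parseString

raw = ('sogou.weixin.gzhcb({"page":1,"items":["'
       '<?xml version="1.0" encoding="utf-8"?>'
       '<DOCUMENT><item><display><title>Demo</title>'
       '<url>http://mp.weixin.qq.com/s?demo</url></display></item></DOCUMENT>'
       '"]})')

page = raw.replace("\\", "")                                       # drop escape backslashes
page, _ = re.subn(r"<\?xml version=.*?encoding=.*?>", "", page)    # strip XML prologs
page, _ = re.subn(r'sogou\.weixin\.gzhcb.*?items":\["', "", page)  # strip the JSONP head
page = "<wx>" + page[:page.find('"]})')] + "</wx>"                 # drop the tail, add a root

doc = parseString(page)
for document in doc.documentElement.getElementsByTagName("DOCUMENT"):
    display = document.getElementsByTagName("display")[0]
    print(display.getElementsByTagName("title")[0].childNodes[0].data)  # Demo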