Example #1
    def __init__(self):
        self.url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_='
        self.account = ''
        self.name = ''
        self.search_name = ''
        self.tags = ''
        self.s = requests.Session()
        self.s.keep_alive = False  # close redundant connections
        self.s.adapters.DEFAULT_RETRIES = 5  # increase the number of retries
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/68.0.3440.106 Safari/537.36',
        }
        self.cookies = {}

        # use a single driver
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

        # self.driver = driver
        self.driver.set_page_load_timeout(20)
        self.driver.set_script_timeout(20)
        self.wait = WebDriverWait(self.driver, 5)
        self.proxies = abuyun_proxy()
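These snippets rely on helpers that are not shown here: abuyun_proxy(), log, and (in Example #2) a module-level driver. A minimal sketch of the imports and a hypothetical abuyun_proxy() they appear to assume follows; the Abuyun endpoint and credentials are placeholders, and log is sketched as a standard logger even though Example #5 calls it like a plain function, so the original is probably a custom wrapper.

import logging
import re
import time

import requests
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# Assumption: the examples' `log` is something like a module-level logger.
log = logging.getLogger(__name__)


def abuyun_proxy():
    # Hypothetical helper: returns a requests-style proxies dict for the
    # Abuyun dynamic proxy tunnel; host, port and credentials are placeholders.
    proxy_user = 'YOUR_APP_ID'
    proxy_pass = 'YOUR_APP_SECRET'
    proxy_host = 'http-dyn.abuyun.com:9020'
    proxy_auth = 'http://{}:{}@{}'.format(proxy_user, proxy_pass, proxy_host)
    return {'http': proxy_auth, 'https': proxy_auth}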
Example #2
 def __init__(self):
     self.url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_='
     self.account = ''
     self.name = ''
     self.search_name = ''
     self.tags = ''
     self.s = requests.Session()
     self.s.keep_alive = False  # close redundant connections
     self.s.adapters.DEFAULT_RETRIES = 5  # increase the number of retries
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/68.0.3440.106 Safari/537.36',
     }
     self.cookies = {
         'SUID': '4A72170E2613910A000000005BAC759D',
         'ABTEST': '3|1538028956|v1',
         'SUIR': '1538028956',
         'IPLOC': 'CN4401',
         'SNUID': '5960051C121665C656C04D9E13C88607',
         'PHPSESSID': '80l6acdo9sq3uj357t00heqpg1',
         'seccodeRight': 'success',
         'SUV': '00F347B50E17724A5BAC759DBEFB6849',
         'successCount': '1|Thu, 27 Sep 2018 06:20:59 GMT',
         'refresh': '1',
         'JSESSIONID': 'aaa73Xexaf2BmgEL80Bvw'
     }
     self.driver = driver
     self.driver.set_page_load_timeout(15)
     self.driver.set_script_timeout(15)
     self.wait = WebDriverWait(self.driver, 5)
     self.proxies = abuyun_proxy()
Example #3
    def __init__(self):
        self.url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_='
        self.account = ''
        self.name = ''
        self.search_name = ''
        self.tags = ''
        self.s = requests.Session()
        self.s.keep_alive = False  # close redundant connections
        self.s.adapters.DEFAULT_RETRIES = 5  # increase the number of retries
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control':
            'no-cache',
            'Connection':
            'keep-alive',
            'Host':
            'mp.weixin.qq.com',
            'Pragma':
            'no-cache',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
        }
        self.cookies = {}

        # use a single driver
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

        # self.driver = driver
        self.driver.set_page_load_timeout(20)
        self.driver.set_script_timeout(20)
        self.wait = WebDriverWait(self.driver, 5)
        self.proxies = abuyun_proxy()
        self.timeout = 23
Example #4
    def account_homepage(self):
        # search for the account and return the official-account homepage
        count = 0
        while True:
            count += 1
            if count > 3:
                log.info('多次账号异常,跳过账号:{}'.format(self.name))
                return
            log.info('start account {}'.format(self.search_name))
            search_url = self.url.format(self.search_name)
            resp_search = self.s.get(search_url,
                                     headers=self.headers,
                                     cookies=self.cookies)
            e = pq(resp_search.text)
            log.info('当前搜狗标题:{}'.format(e('title').text()))
            if '搜狗' not in e('title').text():
                log.info('初始化session')
                self.s = requests.Session()
            if self.search_name == e(".info").eq(0).text().replace('微信号:', ''):
                account_link = e(".tit").find('a').attr('href')
                self.name = e(".tit").eq(0).text()
                count_proxy = 0
                while True:
                    count_proxy += 1
                    if count_proxy > 10:
                        log.error('未能获取有效代理:{}'.format(self.search_name))
                        return
                    try:
                        log.info(self.proxies)
                        homepage = self.s.get(account_link,
                                              cookies=self.cookies,
                                              proxies=self.proxies)
                        if '<title>请输入验证码 </title>' in homepage.text:
                            log.info('需要输入验证码,重新获取代理')
                            self.proxies = abuyun_proxy()
                            # if self.proxies is False:
                            #     self.crack_sougou(account_link)
                            #     homepage = self.s.get(account_link, cookies=self.cookies)
                            #     return homepage.text
                            continue
                        else:
                            return homepage.text
                    except Exception as _e:
                        log.info('重新获取代理:{}'.format(_e))
                        self.proxies = abuyun_proxy()

                # if '<title>请输入验证码 </title>' in homepage.text:
                #     # self.crack_sougou(account_link)
                #     count_proxy = 0
                #     while True:
                #         count_proxy += 1
                #         if count_proxy > 5:
                #             break
                #         try:
                #             log.info(self.proxies)
                #             homepage = self.s.get(account_link, cookies=self.cookies, proxies=self.proxies)
                #             break
                #         except Exception as e:
                #             log.info('重新获取代理:{}'.format(e))
                #             self.proxies = abuyun_proxy()
            elif len(e(".tit").eq(0).text()) > 1:
                log.info("不能匹配正确的公众号: {}".format(self.search_name))
                return
            if '相关的官方认证订阅号' in resp_search.text:
                log.info("找不到该公众号: {}".format(self.search_name))
                return
            if '搜狗' in e('title').text():
                log.info('{} :搜索结果无文字'.format(self.search_name))
                return
            else:
                # handle the captcha
                log.info(search_url)
                log.info('验证之前的cookie:{}'.format(self.cookies))
                try_count = 0
                while True:
                    try_count += 1
                    self.crack_sougou(search_url)

                    # if lock.acquire():
                    #     try:
                    #         self.crack_sougou(search_url)
                    #     except Exception as e:
                    #         log.info(e)
                    #     finally:
                    #         lock.release()
                    if '搜公众号' in self.driver.page_source:
                        log.info('------开始更新cookies------')
                        cookies = self.driver.get_cookies()
                        new_cookie = {}
                        for items in cookies:
                            new_cookie[items.get('name')] = items.get('value')
                        self.cookies = new_cookie
                        log.info('------cookies已更新------{}'.format(self.cookies))
                        break
                    elif try_count > 4:
                        log.info("浏览器验证失败")
                        break
                log.info("验证完毕")
                time.sleep(2)
                continue
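Example #4 is a method of a crawler class whose name and captcha helper crack_sougou() are not shown in this listing. A hypothetical call site, assuming the surrounding class is called SogouAccountSpider and that search_name holds the WeChat ID to look up (both assumptions), might look like this:

# Hypothetical usage of account_homepage() from Example #4. The class name
# SogouAccountSpider and the WeChat ID below are placeholders, not from the source.
spider = SogouAccountSpider()
spider.search_name = 'rmrbwx'          # WeChat ID to search for (placeholder)
homepage_html = spider.account_homepage()
if homepage_html:
    print(homepage_html[:200])         # start of the official-account homepage HTML
spider.driver.quit()                   # release the headless Chrome instance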
Example #5
    def create(self, url, account_model, proxies):
        self.url = url
        if proxies is False:
            resp = requests.get(self.url)
            for i in range(30):
                if '访问过于频繁,请用微信扫描二维码进行访问' in resp.text:
                    time.sleep(600)
                    # re-request after waiting; otherwise the same response is checked repeatedly
                    resp = requests.get(self.url)
                else:
                    break
        else:
            count_loop = 0
            while True:
                count_loop += 1
                if count_loop >= 10:
                    break
                try:
                    resp = requests.get(self.url, proxies=proxies, timeout=21)
                    proxy_count = 0
                    while True:
                        proxy_count += 1
                        if proxy_count > 10:
                            log('文章页未获取有效代理')
                            # raise RuntimeError('访问过于频繁,请用微信扫描二维码进行访问')
                            break  # give up instead of looping forever on the captcha page
                        if '访问过于频繁,请用微信扫描二维码进行访问' in resp.text:
                            proxies = abuyun_proxy()
                            resp = requests.get(self.url,
                                                proxies=proxies,
                                                timeout=21)
                            log('代理无效:访问过于频繁,请用微信扫描二维码进行访问')
                        else:
                            break
                    break
                except requests.exceptions.ProxyError as e:
                    log('代理请求ProxyError:{}'.format(e))
                except requests.exceptions.ConnectionError as e:
                    log('代理请求ConnectionError:{}'.format(e))
                    # time.sleep(600)
                    # raise RuntimeError('访问过于频繁,请用微信扫描二维码进行访问')
        e = pq(resp.text)
        # match shared articles; this check seems to no longer work
        if 'var ct=' not in resp.text:
            # first seen on the 晚聊伴夜 account
            if '此内容因违规无法查看' in resp.text:
                self.title = '此内容因违规无法查看'
                return
            if '此内容被投诉且经审核涉嫌侵权' in resp.text:
                self.title = '此内容被投诉且经审核涉嫌侵权,无法查看。'
                return
            self.is_share = True
            # self.title = e("title").eq(0).text()
            self.title = e("title").text()
            self.content = e(".share_notice").text()
            time_find = re.search(r'createDate=new Date\("\d*', resp.text)
            self.time = time_find.group() if time_find else ''
            if '视频' == self.title:
                self.set_time(resp, content_type='video')

            # if '用腾讯视频观看' in resp.text:
            #     self.set_time(resp, content_type='video')
            return
        if '分享' in e('.share_notice').text():
            self.is_share = True
            # self.content =
        self.set_time(resp, content_type='article')
        self.account = account_model.account
        # if not self.account:
        #     inner_account = re.search('user_name = ".*?"', resp.text)
        #     self.account = inner_account.group().split('"')[1]
        self.title = e('.rich_media_title').text().replace(' ', '')
        # TODO: handle shared posts and videos
        self.content = e("#js_content").text().replace('\n', '')
        self.author = account_model.name
        img_list = e('img')
        img_str = ''
        for img_div in img_list:
            img = pq(img_div).attr('data-src')
            if img is not None:
                img_str += img + '|'
        self.image_url = img_str[:-1]
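Example #5 populates an article model from an mp.weixin.qq.com article page. A hypothetical call site, assuming the surrounding class is called ArticleModel and that account_model is an object exposing account and name attributes (both assumptions), might be:

# Hypothetical usage of create() from Example #5. ArticleModel, the article URL
# and the account_model object are placeholders; only create() is from the source.
article = ArticleModel()
article.create(
    'https://mp.weixin.qq.com/s/XXXXXXXXXXXX',   # placeholder article URL
    account_model,                               # object exposing .account and .name
    abuyun_proxy(),                              # proxies dict, or False to request directly
)
print(article.title, article.time, article.image_url)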