Example 1
    def has_signined(self):
        proxy = proxy_switcher.random_proxy()
        response = self.session.get(V2EX_INDEX_URL, proxies=proxy)
        if response.status_code != 200:
            # On a failed fetch, assume we are still signed in rather than
            # triggering a fresh sign-in attempt through a flaky proxy.
            return True
        index_page = response.text
        # The account settings link only appears for signed-in users.
        return 'href="/settings"' in index_page
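
These snippets are methods lifted from a larger crawler class; names such as V2EX_INDEX_URL, CRAWLER_HEADERS, proxy_switcher, and self.session are defined elsewhere in that project. A minimal sketch of the assumed scaffolding, with all concrete values hypothetical:

import logging
import re
import time
from typing import Optional

import requests
from bs4 import BeautifulSoup

# Hypothetical values; the real project defines these constants elsewhere.
V2EX_SITE_URL = 'https://www.v2ex.com'
V2EX_INDEX_URL = V2EX_SITE_URL
V2EX_SIGNIN_URL = V2EX_SITE_URL + '/signin'
V2EX_TOPIC_WEB_URL = V2EX_SITE_URL + '/t/{topic_id}'
CRAWLER_HEADERS = {'user-agent': 'Mozilla/5.0'}
API_RATE_LIMIT_ONE_HOUR = 120


class V2exCrawler:
    def __init__(self):
        # One persistent Session keeps cookies across sign-in and fetches.
        self.session = requests.Session()
        self.session.headers.update(CRAWLER_HEADERS)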
Example 2
    def signin(self):
        # JSDATI_* look like credentials for the captcha-solving service
        # used below via dmapi.
        if (not V2EX_USERNAME) or (not V2EX_PASSWORD) or (
                not JSDATI_USERNAME) or (not JSDATI_PASSWORD):
            logging.error('Missing username or password')
            return False

        if self.has_signined():
            return True

        logging.info('Signin to V2EX')
        proxy = proxy_switcher.random_proxy()
        signin_page_response = self.session.get(V2EX_SIGNIN_URL, proxies=proxy)

        if signin_page_response.status_code == 403:
            logging.error('Sign in 403')
            time.sleep(30)
            return False

        time.sleep(5)
        if not signin_page_response.text:
            # Empty body: the page did not load properly, give up for now.
            return False

        soup = BeautifulSoup(signin_page_response.text, 'html5lib')
        login_form = soup.find('form', action='/signin')
        # V2EX randomizes the username/password field names per session, so
        # recover them from the form by placeholder/type instead of
        # hardcoding. '用户名或电子邮箱地址' = "username or email address".
        username_key = login_form.find('input',
                                       placeholder='用户名或电子邮箱地址')['name']
        password_key = login_form.find('input', type='password')['name']
        once_token = login_form.find('input',
                                     type='hidden',
                                     attrs={'name': 'once'})['value']
        # '请输入上图中的验证码' = "enter the captcha shown in the image above".
        captcha_key = login_form.find('input',
                                      placeholder='请输入上图中的验证码')['name']
        captcha_url = V2EX_INDEX_URL + '/_captcha?once=' + once_token

        # Fetch the captcha image through the same proxy so the whole
        # sign-in flow stays on one IP.
        image_bincontent = self.session.get(captcha_url,
                                            proxies=proxy).content
        captcha = dmapi.decode_image_bin_content(image_bincontent, 200)
        if (not isinstance(captcha, str)) or (not captcha):
            # '验证码打码失败' = "captcha decoding failed"
            wechat_notify(once_token + '验证码打码失败')
            logging.warning('Decode captcha failed: ' + str(captcha))
            return False

        headers = CRAWLER_HEADERS.copy()
        headers['referer'] = V2EX_SIGNIN_URL

        payload = {
            username_key: V2EX_USERNAME,
            password_key: V2EX_PASSWORD,
            'once': once_token,
            captcha_key: captcha,
            'next': '/'
        }
        # The payload is passed positionally, so it is sent as form data.
        self.session.post(V2EX_SIGNIN_URL,
                          payload,
                          headers=headers,
                          proxies=proxy)
        if self.has_signined():
            return True
        # '登录失败' = "sign-in failed"
        wechat_notify(once_token + '登录失败')
        return False
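
Since signin() returns False on several transient failures (a 403, an empty page, a missed captcha), callers may want a bounded retry. A minimal sketch, assuming the scaffolding class above; the helper name is hypothetical:

def signin_with_retry(crawler, attempts=3):
    # Hypothetical helper: signin() already sleeps on 403, so a short
    # linear backoff between attempts is usually enough.
    for attempt in range(attempts):
        if crawler.signin():
            return True
        time.sleep(10 * (attempt + 1))
    return False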
Example 3
    def has_signined(self):
        proxy = proxy_switcher.random_proxy()
        response = self.session.get(V2EX_INDEX_URL, proxies=proxy)
        if response.status_code != 200:
            # As in Example 1, treat a failed fetch as "still signed in".
            return True
        index_page = response.text
        # The top bar shows a 登出 ("log out") link only when signed in.
        return 'class="top">登出</a></td>' in index_page
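
Examples 1 and 3 differ only in the signed-in marker: the /settings link versus the 登出 ("log out") link in the top bar. Both raw substring checks are brittle against markup changes; a sketch of a slightly less markup-sensitive variant of the final check inside has_signined, same page fetch as above:

# Inside has_signined, parse the page instead of substring-matching:
soup = BeautifulSoup(index_page, 'html5lib')
return soup.find('a', href='/settings') is not None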
Example 4
    def signin(self):
        if (not V2EX_USERNAME) or (not V2EX_PASSWORD):
            logging.error('Missing v2ex username or password')
            return False

        if self.has_signined():
            return True

        logging.info('Signin to V2EX')
        proxy = proxy_switcher.random_proxy()
        signin_page_response = self.session.get(V2EX_SIGNIN_URL, proxies=proxy)

        if signin_page_response.status_code == 403:
            logging.error('Sign in 403')
            time.sleep(30)
            return False

        time.sleep(5)
        if not signin_page_response.text:
            return False

        soup = BeautifulSoup(signin_page_response.text, 'html5lib')
        login_form = soup.find('form', action='/signin')
        # Same per-session field-name recovery as in Example 2, minus the
        # captcha step.
        username_key = login_form.find('input', placeholder='用户名或电子邮箱地址')['name']
        password_key = login_form.find('input', type='password')['name']
        once_token = login_form.find('input', type='hidden', attrs={'name': 'once'})['value']

        headers = CRAWLER_HEADERS.copy()
        headers['referer'] = V2EX_SIGNIN_URL

        payload = {username_key: V2EX_USERNAME, password_key: V2EX_PASSWORD,
                   'once': once_token, 'next': '/'}
        self.session.post(V2EX_SIGNIN_URL, payload, headers=headers, proxies=proxy)
        return self.has_signined()
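
Example 4 is the same sign-in flow as Example 2 without the captcha and notification steps. The form parsing is the part worth isolating: because V2EX randomizes field names per session, they must be read back from the live form. A self-contained sketch of that technique against a stub of the sign-in form (HTML abridged and hypothetical):

from bs4 import BeautifulSoup

SIGNIN_FORM_HTML = '''
<form method="post" action="/signin">
  <input type="text" name="a1b2c3" placeholder="用户名或电子邮箱地址" />
  <input type="password" name="d4e5f6" />
  <input type="hidden" name="once" value="12345" />
</form>
'''

form = BeautifulSoup(SIGNIN_FORM_HTML, 'html5lib').find('form',
                                                        action='/signin')
username_key = form.find('input', placeholder='用户名或电子邮箱地址')['name']  # 'a1b2c3'
password_key = form.find('input', type='password')['name']  # 'd4e5f6'
once_token = form.find('input', type='hidden',
                       attrs={'name': 'once'})['value']  # '12345'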
Example 5
    def get_topic_extras(self, topic_id) -> Optional[WebExtras]:
        # Returns None when the topic is gone (404); most other failures
        # are retried recursively.
        extras = WebExtras()
        proxy = proxy_switcher.random_proxy()
        logging.info('random proxy: ' + str(proxy))
        topic_page_response = None
        try:
            topic_page_response = self.session.get(
                V2EX_TOPIC_WEB_URL.format(topic_id=topic_id),
                allow_redirects=False,
                timeout=60,
                proxies=proxy)
        except Exception as e:
            logging.error('get_topic_extras error: ' + str(e))
            # Bench the proxy that just failed before retrying.
            proxy_switcher.mute_random_proxy(proxy)

        if topic_page_response is None:
            # Note: this retries recursively with no attempt cap.
            return self.get_topic_extras(topic_id)

        if topic_page_response.status_code == 302:
            # A redirect usually means the topic requires a signed-in
            # session with sufficient privileges.
            if not self.has_signined():
                self.signin()
                return self.get_topic_extras(topic_id)
            else:
                logging.info(
                    'New accounts can\'t access some topics and will get 302,'
                    ' change your V2EX account')
                return extras

        if topic_page_response.status_code == 404:
            # Topic is gone; None (not an empty WebExtras) signals "deleted".
            return None

        if topic_page_response.status_code == 403:
            time.sleep(10)
            proxy_switcher.mute_random_proxy(proxy)
            logging.info('403 Access Denied')
            return self.get_topic_extras(topic_id)

        if topic_page_response.status_code != 200:
            logging.error(
                'Something went wrong when fetching extras, status code:{0} '
                'response:{1}'.format(topic_page_response.status_code,
                                      topic_page_response.text))
            time.sleep(2)
            proxy_switcher.mute_random_proxy(proxy)
            return self.get_topic_extras(topic_id)

        if 'class="top">登出</a></td>' not in topic_page_response.text:
            # The 登出 ("log out") link is missing: the session expired,
            # so sign in again and retry.
            self.signin()
            return self.get_topic_extras(topic_id)

        soup = BeautifulSoup(topic_page_response.text, 'html5lib')
        # "subtle" blocks are the author's appended notes under the topic.
        subtle_divs = soup.find_all('div', attrs={'class': 'subtle'})
        for subtle_div in subtle_divs:
            content_div = subtle_div.find(attrs={'class': 'topic_content'})
            extras.subtle_list.append(content_div.get_text())
        statistics_div = soup.find('div', attrs={'class': 'fr topic_stats'})
        if statistics_div:
            # Sample statistics texts:
            # '2569 次点击  ∙  4 人收藏   ∙  1 人感谢   '
            # '2569 次点击  '
            text = statistics_div.get_text()
            click_regex_match = re.search(self.click_regex, text)
            if click_regex_match:
                extras.click = int(click_regex_match.group())

            favorite_regex_match = re.search(self.favorite_regex, text)
            if favorite_regex_match:
                extras.favorite = int(favorite_regex_match.group())

            thank_regex_match = re.search(self.thank_regex, text)
            if thank_regex_match:
                extras.thank = int(thank_regex_match.group())

        else:
            logging.critical(
                'Something went wrong when parsing statistics_div, status code:{0} '
                'response:{1}'.format(topic_page_response.status_code,
                                      topic_page_response.text))
        return extras
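
Example 5 leans on definitions that live elsewhere in the class: the WebExtras container and the three statistics regexes. Since int(match.group()) is applied directly, each pattern must match bare digits. A plausible reconstruction; these are assumptions, not the project's actual code:

import re
from dataclasses import dataclass, field
from typing import List


@dataclass
class WebExtras:
    # Hypothetical shape, inferred from how Example 5 fills it in.
    subtle_list: List[str] = field(default_factory=list)
    click: int = 0
    favorite: int = 0
    thank: int = 0


# Digits followed by their labels in '2569 次点击  ∙  4 人收藏  ∙  1 人感谢';
# lookaheads keep group() digits-only so int() applies cleanly.
click_regex = re.compile(r'\d+(?=\s*次点击)')     # clicks
favorite_regex = re.compile(r'\d+(?=\s*人收藏)')  # favorites
thank_regex = re.compile(r'\d+(?=\s*人感谢)')     # thanks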
Example 6
    def _send_request(self, path, params=None):
        """Send one rate-limit-aware GET to the V2EX API.

        :param path: API path, appended to V2EX_SITE_URL
        :param params: dict of query-string parameters
        :return: parsed JSON object (raises ValueError on a non-JSON body)
        """
        url = V2EX_SITE_URL + path
        proxy = proxy_switcher.random_proxy()
        try:
            def do_request():
                response = self.session.get(url,
                                            params=params,
                                            timeout=60,
                                            proxies=proxy)
                logging.info(
                    'do request with proxy {proxy}'.format(proxy=proxy))
                # V2EX reports the remaining hourly quota in a response
                # header; fall back to the full quota when it is absent.
                limit_remain = int(
                    response.headers.get('x-rate-limit-remaining',
                                         API_RATE_LIMIT_ONE_HOUR))
                if response.status_code == 403:
                    logging.info('api response 403')
                    limit_remain = 0
                logging.info('limit remain: {0}'.format(str(limit_remain)))
                if limit_remain <= 1:
                    # Quota (nearly) exhausted: bench this IP until the
                    # advertised reset time, defaulting to 30 minutes out.
                    reset_time = int(
                        response.headers.get('x-rate-limit-reset',
                                             int(time.time() + 1800)))
                    proxy_switcher.tag_current_ip_limited(reset_time)
                    # Fully spent (or 403): return None to force a retry;
                    # with exactly one call left, keep the response.
                    return response if limit_remain else None
                else:
                    return response

            valid_response = None
            retry_times = 0
            while not valid_response:
                try:
                    retry_times += 1
                    valid_response = do_request()
                except requests.exceptions.RequestException as e:
                    logging.error(
                        'Error when fetching {url} with proxy {proxy}: {exception}'
                        .format(proxy=proxy, url=url, exception=str(e)))
                    # Rebuild the session to drop stale connections; copy the
                    # headers so the shared default dict is not mutated.
                    self.session = requests.Session()
                    self.session.headers = CRAWLER_HEADERS.copy()
                    if retry_times >= 5:
                        # Note: this tags the IP as limited but keeps
                        # looping; there is no hard cap on retries.
                        logging.info(
                            'Mark ip as limited because of too many retry times'
                        )
                        proxy_switcher.tag_current_ip_limited(
                            int(time.time() + 1800))

            return valid_response.json(strict=False)

        except ValueError:
            logging.critical(
                'Error when parsing JSON for {url} params {params}'.format(
                    url=url, params=params))
            raise
            raise
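
A usage sketch for the wrapper above, assuming the scaffolding class from Example 1's note; /api/topics/show.json is V2EX's public topic endpoint, and error handling is abridged:

crawler = V2exCrawler()
# The wrapper picks a proxy, honors the x-rate-limit-* headers, and
# retries internally; the caller just gets parsed JSON back.
topic = crawler._send_request('/api/topics/show.json', params={'id': 1000})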