Ejemplo n.º 1
0
    def start_requests(self):
        '''
        Override scrapy's entry point: obtain login cookies for lagou.com
        and attach them to the initial requests.

        Tries a previously persisted cookie file first; if it is missing or
        unreadable, drives a Chrome browser through the password-login form,
        solving the click-captcha via the Chaojiying recognition service,
        then persists the fresh cookies for later runs.

        Yields:
            scrapy.Request for each URL in self.start_urls, carrying the
            session cookies and dont_filter=True.
        '''
        # Try to reuse previously persisted cookies.
        cookies = None
        try:
            if os.path.exists('./cookies/lagou.cookie'):
                # Close the handle deterministically instead of leaking it.
                with open('./cookies/lagou.cookie', 'rb') as cookie_file:
                    cookies = pickle.load(cookie_file)
        except Exception:
            # Best effort: a corrupt/unreadable cookie file simply forces
            # a fresh Selenium login below.
            pass

        if not cookies:
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
            options = Options()

            options.add_argument("--disable-extensions")
            # options.add_argument('window-size=1280x800')
            # options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
            # options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

            browser = webdriver.Chrome(executable_path='./chromedriver',
                                       chrome_options=options)

            from selenium.webdriver.common.action_chains import ActionChains
            action = ActionChains(browser)
            browser.get('https://passport.lagou.com/login/login.html')
            browser.find_element_by_css_selector(
                'div[data-view="passwordLogin"] div[data-propertyname="username"] .input.input_white'
            ).send_keys(LAGOU_USERNAME)
            browser.find_element_by_css_selector(
                'div[data-view="passwordLogin"] div[data-propertyname="password"] .input.input_white'
            ).send_keys(LAGOU_PASSWORD)
            browser.find_element_by_css_selector(
                'div[data-view="passwordLogin"] .sense_login_password input[type="submit"]'
            ).click()
            import time
            time.sleep(3)  # give the page time to react to the submit

            is_login = False
            while not is_login:
                # '.unick' only exists once logged in; its absence raises
                # NoSuchElementException, which we treat as "not yet".
                try:
                    if browser.find_element_by_css_selector('.unick'):
                        is_login = True
                except Exception:
                    pass

                try:
                    captcha_element = browser.find_element_by_css_selector(
                        '.geetest_window>.geetest_item:first-child>.geetest_item_wrap>img'
                    )
                except Exception:
                    captcha_element = None

                if captcha_element:
                    # Drop any queued (unperformed) ActionChains steps from
                    # a previous captcha attempt.
                    if action._actions:
                        action._actions = []

                    img_url = captcha_element.get_attribute('src')
                    import ssl
                    # The captcha image host may present an unverifiable
                    # certificate; disable verification for the download.
                    ssl._create_default_https_context = ssl._create_unverified_context
                    # BUG FIX: `import urllib` alone does not make the
                    # `urllib.request` submodule available in Python 3.
                    import urllib.request
                    urllib.request.urlretrieve(img_url,
                                               filename='lagou_validate.jpeg')

                    from tools.chaojiying import Chaojiying_Client
                    from settings import (chaojiying_username,
                                          chaojiying_password,
                                          chaojiying_app_id)
                    chaojiying = Chaojiying_Client(chaojiying_username,
                                                   chaojiying_password,
                                                   chaojiying_app_id)
                    # Read and close the image file instead of leaking the
                    # handle.
                    with open('lagou_validate.jpeg', 'rb') as img_file:
                        im = img_file.read()
                    # 9004: Chaojiying's "click coordinates" captcha type
                    # code — TODO confirm against the Chaojiying price table.
                    pos_obj = chaojiying.PostPic(im, 9004)
                    if pos_obj['err_no'] == 0 or pos_obj['err_str'] == 'OK':
                        pic_str = pos_obj['pic_str']
                        # Recognized points come back as 'x1,y1|x2,y2|...'.
                        positions = pic_str.split('|')
                        from ScrapyDemo.utils.common import get_coordinate
                        for item in positions:
                            point = item.split(',')
                            # Map the pixel coordinate onto the 1-based
                            # index of the geetest tile to click.
                            el_index = get_coordinate(point)
                            browser.find_element_by_css_selector(
                                '.geetest_window>.geetest_item:nth-child({})'.
                                format(el_index)).click()

                        browser.find_element_by_css_selector(
                            '.geetest_commit').click()
                        time.sleep(5)  # wait for captcha validation

            cookies = browser.get_cookies()
            # Persist the cookies for later runs; ensure the directory
            # exists (the dump would otherwise raise FileNotFoundError).
            os.makedirs('./cookies', exist_ok=True)
            with open('./cookies/lagou.cookie', 'wb') as cookie_file:
                pickle.dump(cookies, cookie_file)

        # Selenium returns a list of cookie dicts; scrapy wants name->value.
        cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}

        for url in self.start_urls:
            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
Ejemplo n.º 2
0
    def judge_login(self, response):
        """Log in to zhihu.com with Selenium, solving whichever captcha
        variant appears — the English text captcha (via the Fateadm API)
        or the Chinese upside-down-character captcha (via zheye) — then
        persist the session cookies and hand them to a scrapy Request.

        Args:
            response: the scrapy response that triggered this callback
                (not read by this method).

        Returns:
            A single-element list containing a scrapy.Request for
            ``self.start_urls[0]``; when already logged in the request
            carries no explicit cookies.
        """

        self.browser.get('https://www.zhihu.com/signin')

        # Already logged in (e.g. cookies restored by the browser): skip
        # the form entirely.
        if self.__is_login():
            return [scrapy.Request(url=self.start_urls[0], dont_filter=True)]

        action = ActionChains(self.browser)
        # Switch to the second tab of the sign-in form (password login).
        self.browser.find_element_by_css_selector(
            '.SignFlow-tabs .SignFlow-tab:nth-child(2)').click()

        # NOTE(review): the password is submitted with '1' appended, i.e.
        # deliberately wrong — presumably to force a failed attempt that
        # makes zhihu display its captcha; confirm this intent.
        self.__inputLoginInfo(password=ZHIHU_PASSWORD + '1')

        time.sleep(3)
        login_success = False
        while not login_success:
            # Probe for the logged-in marker; __is_login may raise while
            # the page is mid-transition, which we treat as "not yet".
            try:
                if self.__is_login():
                    login_success = True
            except:
                pass

            # Detect which captcha variant (if any) is currently shown; a
            # missing element raises, leaving the variable as None.
            try:
                english_captcha_element = self.browser.find_element_by_class_name(
                    'Captcha-englishImg')
            except:
                english_captcha_element = None

            try:
                chinese_captcha_element = self.browser.find_element_by_class_name(
                    'Captcha-chineseImg')
            except:
                chinese_captcha_element = None

            if chinese_captcha_element:
                # On a retry: clear queued ActionChains steps and un-click
                # any previously selected captcha points via JS.
                if action._actions:
                    action._actions = []
                    self.browser.execute_script(
                        'document.querySelectorAll(".Button.ChineseCaptchaPoint").forEach(function(el){el.click()});')
                # chinese_captcha_point = self.browser.find_element_by_class_name('')
                # ele_position = chinese_captcha_element.location
                # x_relative = ele_position['x']
                # y_relative = ele_position['y']
                # self.browser_navigation_panel_height = self.browser.execute_script(
                #     'return window.outerHeight - window.innerHeight'
                # )
                # The captcha image is embedded as a base64 data URI; strip
                # the prefix and any URL-encoded newlines, then decode.
                base64_text = chinese_captcha_element.get_attribute('src')
                import base64
                code = base64_text.replace(
                    'data:image/jpg;base64,', '').replace('%0A', '')
                fh = open('yzm_cn.jpeg', 'wb')
                fh.write(base64.b64decode(code))
                fh.close()

                # zheye locates the upside-down Chinese characters in the
                # saved image.
                from zheye import zheye
                z = zheye()
                positions = z.Recognize('yzm_cn.jpeg')
                last_position = []

                # Reorder the inverted-character coordinates into [x, y]
                # pairs sorted left-to-right (zheye returns them as (y, x)).
                if len(positions) == 2:
                    if positions[0][1] > positions[1][1]:
                        last_position.append(
                            [positions[1][1], positions[1][0]])
                        last_position.append(
                            [positions[0][1], positions[0][0]])
                    else:
                        last_position.append(
                            [positions[0][1], positions[0][0]])
                        last_position.append(
                            [positions[1][1], positions[1][0]])

                    # Coordinates are halved — presumably the rendered
                    # captcha is displayed at half the natural image size;
                    # TODO confirm against the page's CSS.
                    first_position = [
                        int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
                    second_position = [
                        int(last_position[1][0] / 2), int(last_position[1][1] / 2)]
                    action.move_to_element_with_offset(
                        chinese_captcha_element, first_position[0], first_position[1]).click()
                    action.move_to_element_with_offset(
                        chinese_captcha_element, second_position[0], second_position[1]).click().perform()
                else:
                    # Single inverted character: one click is enough.
                    last_position.append([positions[0][1], positions[0][0]])

                    first_position = [
                        int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
                    action.move_to_element_with_offset(
                        chinese_captcha_element, first_position[0], first_position[1]).click().perform()
                print(last_position)
                self.__inputLoginInfo()
                time.sleep(3)

            if english_captcha_element:
                # Same data-URI extraction as the Chinese branch.
                base64_text = english_captcha_element.get_attribute('src')
                import base64
                code = base64_text.replace(
                    'data:image/jpg;base64,', '').replace('%0A', '')
                fh = open('yzm_en.jpeg', 'wb')
                fh.write(base64.b64decode(code))
                fh.close()

                from tools.fateadm_api import FateadmApi
                from ScrapyDemo.settings import (
                    ff_app_id, ff_app_key, ff_pd_id, ff_pd_key)

                api = FateadmApi(ff_app_id, ff_app_key, ff_pd_id, ff_pd_key)

                # "30400": Fateadm captcha type code — TODO confirm its
                # meaning against the Fateadm price/type table.
                pred_type = "30400"
                # Request the detailed recognition result.
                rsp = api.PredictFromFile(pred_type, 'yzm_en.jpeg')

                # Retry until the service returns a usable value (non-zero
                # ret_code or an empty prediction means failure).
                while True:
                    if not rsp or rsp.ret_code != 0 or not rsp.pred_rsp or not rsp.pred_rsp.value:
                        rsp = api.PredictFromFile(pred_type, 'yzm_en.jpeg')
                    else:
                        break

                # Clear any stale text before typing the recognized answer.
                self.__clear_input(
                    '.Captcha.SignFlow-captchaContainer input[name="captcha"]')
                self.browser.find_element_by_css_selector(
                    '.Captcha.SignFlow-captchaContainer input[name="captcha"]').send_keys(rsp.pred_rsp.value)
                self.__inputLoginInfo()
                time.sleep(3)

        # Persist the session cookies for later runs, then convert the
        # Selenium cookie dicts into the name->value mapping scrapy wants.
        cookies = self.browser.get_cookies()
        pickle.dump(cookies, open(
            './cookies/zhihu.cookie', 'wb'))
        cookie_dict = {}
        for cookie in cookies:
            cookie_dict[cookie['name']] = cookie['value']

        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]