def start_requests(self):
    """Yield the initial requests, authenticated with Lagou session cookies.

    Overrides the spider entry point. Cookies are taken from
    ``./cookies/lagou.cookie`` when that file holds a usable pickle.
    Otherwise a Chrome session is driven through Selenium to log in,
    solving the Geetest image captcha through the Chaojiying client
    whenever it is shown, and the resulting browser cookies are pickled
    back to the same file.

    Side effects: may launch (and then quit) a Chrome instance, writes
    ``lagou_validate.jpeg`` for every captcha downloaded, and writes the
    cookie cache file.

    Yields:
        scrapy.Request: one request per URL in ``self.start_urls``, sent
        with ``dont_filter=True`` and the session cookies attached.
    """
    cookie_path = './cookies/lagou.cookie'

    # Read the cached cookies from disk.
    # NOTE(review): pickle.load can execute arbitrary code; this file must
    # only ever be the one written by this method.
    cookies = None
    try:
        with open(cookie_path, 'rb') as cookie_file:
            cookies = pickle.load(cookie_file)
    except Exception:
        # Best effort only: a missing, truncated or otherwise unreadable
        # cache simply means a fresh browser login is performed below.
        cookies = None

    if not cookies:
        import ssl
        import time
        import urllib.request

        from selenium import webdriver
        from selenium.common.exceptions import WebDriverException
        from selenium.webdriver.chrome.options import Options

        options = Options()
        options.add_argument("--disable-extensions")
        browser = webdriver.Chrome(executable_path='./chromedriver', chrome_options=options)
        try:
            browser.get('https://passport.lagou.com/login/login.html')
            browser.find_element_by_css_selector(
                'div[data-view="passwordLogin"] div[data-propertyname="username"] .input.input_white'
            ).send_keys(LAGOU_USERNAME)
            browser.find_element_by_css_selector(
                'div[data-view="passwordLogin"] div[data-propertyname="password"] .input.input_white'
            ).send_keys(LAGOU_PASSWORD)
            browser.find_element_by_css_selector(
                'div[data-view="passwordLogin"] .sense_login_password input[type="submit"]'
            ).click()
            time.sleep(3)

            while True:
                # Presence of the '.unick' element is treated as the
                # "logged in" marker.
                try:
                    browser.find_element_by_css_selector('.unick')
                    break
                except WebDriverException:
                    pass

                try:
                    captcha_element = browser.find_element_by_css_selector(
                        '.geetest_window>.geetest_item:first-child>.geetest_item_wrap>img'
                    )
                except WebDriverException:
                    # Neither logged in nor challenged yet: wait a moment
                    # instead of hammering the driver in a tight loop.
                    time.sleep(1)
                    continue

                img_url = captcha_element.get_attribute('src')
                # Certificate verification is skipped for this single
                # download only, rather than being switched off for every
                # HTTPS call in the process.
                unverified_context = ssl._create_unverified_context()
                with urllib.request.urlopen(img_url, context=unverified_context) as image_response:
                    image_bytes = image_response.read()
                with open('lagou_validate.jpeg', 'wb') as image_file:
                    image_file.write(image_bytes)

                # Imported lazily so these modules are only required when
                # a captcha actually has to be solved.
                from tools.chaojiying import Chaojiying_Client
                from settings import (chaojiying_username, chaojiying_password, chaojiying_app_id)
                chaojiying = Chaojiying_Client(
                    chaojiying_username, chaojiying_password, chaojiying_app_id)
                pos_obj = chaojiying.PostPic(image_bytes, 9004)
                if pos_obj['err_no'] == 0 or pos_obj['err_str'] == 'OK':
                    from ScrapyDemo.utils.common import get_coordinate
                    # 'pic_str' is split on '|' into points, each point on
                    # ',' into its components; get_coordinate turns a
                    # point into the nth-child index of the tile to click.
                    for item in pos_obj['pic_str'].split('|'):
                        el_index = get_coordinate(item.split(','))
                        browser.find_element_by_css_selector(
                            '.geetest_window>.geetest_item:nth-child({})'.format(el_index)
                        ).click()
                    browser.find_element_by_css_selector('.geetest_commit').click()
                    time.sleep(5)

            cookies = browser.get_cookies()
        finally:
            # The browser is local to this method; always release it.
            browser.quit()

        # Write the cookies to the cache file, creating its directory on
        # first use.
        os.makedirs(os.path.dirname(cookie_path), exist_ok=True)
        with open(cookie_path, 'wb') as cookie_file:
            pickle.dump(cookies, cookie_file)

    cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
    for url in self.start_urls:
        yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
def judge_login(self, response):
    """Make sure the Selenium session is logged in to Zhihu, then crawl.

    Opens the sign-in page. If ``self.__is_login()`` already reports a
    logged-in session, the first start URL is requested without cookies.
    Otherwise the password form is submitted and the method loops until
    login is detected, solving whichever captcha is displayed: the Chinese
    inverted-character image via ``zheye`` or the English image via the
    Fateadm API. The browser cookies are then pickled to
    ``./cookies/zhihu.cookie`` and attached to the returned request.

    Side effects: drives ``self.browser``, writes ``yzm_cn.jpeg`` /
    ``yzm_en.jpeg`` for the captchas it sees, and writes the cookie file.

    Args:
        response: Not used by this method.

    Returns:
        list: A single ``scrapy.Request`` for ``self.start_urls[0]`` with
        ``dont_filter=True``.
    """
    import base64
    import os

    from selenium.common.exceptions import WebDriverException

    def find_by_class(class_name):
        """Return the element with *class_name*, or None if lookup fails."""
        try:
            return self.browser.find_element_by_class_name(class_name)
        except WebDriverException:
            return None

    def save_data_uri_image(element, path):
        """Decode the base64 data URI in the element's ``src`` into *path*."""
        data_uri = element.get_attribute('src')
        encoded = data_uri.replace(
            'data:image/jpg;base64,', '').replace('%0A', '')
        with open(path, 'wb') as image_file:
            image_file.write(base64.b64decode(encoded))

    self.browser.get('https://www.zhihu.com/signin')
    if self.__is_login():
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True)]

    self.browser.find_element_by_css_selector(
        '.SignFlow-tabs .SignFlow-tab:nth-child(2)').click()
    # NOTE(review): the first submit appends '1' to the password, which
    # looks like a deliberately failing attempt meant to make the site
    # display its captcha -- confirm before changing.
    self.__inputLoginInfo(password=ZHIHU_PASSWORD + '1')
    time.sleep(3)

    captcha_input_selector = '.Captcha.SignFlow-captchaContainer input[name="captcha"]'
    recognizer = None  # zheye instance, built once and reused across attempts
    while True:
        try:
            if self.__is_login():
                break
        except WebDriverException:
            # A failed browser lookup just means "not logged in yet".
            pass

        english_captcha_element = find_by_class('Captcha-englishImg')
        chinese_captcha_element = find_by_class('Captcha-chineseImg')

        if not chinese_captcha_element and not english_captcha_element:
            # Nothing to solve yet: wait briefly instead of busy-spinning.
            time.sleep(1)
            continue

        if chinese_captcha_element:
            # Click every existing '.Button.ChineseCaptchaPoint' element
            # (presumably removes markers left by a previous attempt --
            # confirm).
            self.browser.execute_script(
                'document.querySelectorAll(".Button.ChineseCaptchaPoint").forEach(function(el){el.click()});')
            save_data_uri_image(chinese_captcha_element, 'yzm_cn.jpeg')

            if recognizer is None:
                from zheye import zheye
                recognizer = zheye()
            positions = recognizer.Recognize('yzm_cn.jpeg')

            # Normalise the coordinates of the inverted characters: zheye
            # returns each point as (y, x). With exactly two points they
            # are ordered by ascending x (stable for ties); otherwise only
            # the first point is used, and an empty result yields no click.
            if len(positions) == 2:
                chosen = sorted(positions, key=lambda point: point[1])
            else:
                chosen = positions[:1]
            last_position = [[point[1], point[0]] for point in chosen]

            if last_position:
                # A fresh chain per attempt avoids replaying clicks queued
                # during an earlier attempt without touching the chain's
                # private state.
                action = ActionChains(self.browser)
                for x_pos, y_pos in last_position:
                    # NOTE(review): offsets are halved -- presumably the
                    # image is rendered at half its native size; confirm.
                    action.move_to_element_with_offset(
                        chinese_captcha_element,
                        int(x_pos / 2), int(y_pos / 2)).click()
                action.perform()
            # NOTE(review): when nothing was recognised the form is still
            # resubmitted; presumably the failed attempt makes the site
            # serve a new captcha for the next iteration -- confirm.
            print(last_position)
            self.__inputLoginInfo()
            time.sleep(3)

        if english_captcha_element:
            save_data_uri_image(english_captcha_element, 'yzm_en.jpeg')

            from tools.fateadm_api import FateadmApi
            from ScrapyDemo.settings import (
                ff_app_id, ff_app_key, ff_pd_id, ff_pd_key)
            api = FateadmApi(ff_app_id, ff_app_key, ff_pd_id, ff_pd_key)
            pred_type = "30400"
            # Returns the detailed recognition result; retry until the
            # service reports success with a non-empty value, pausing
            # between attempts so the API is not hit in a tight loop.
            rsp = api.PredictFromFile(pred_type, 'yzm_en.jpeg')
            while (not rsp or rsp.ret_code != 0
                   or not rsp.pred_rsp or not rsp.pred_rsp.value):
                time.sleep(1)
                rsp = api.PredictFromFile(pred_type, 'yzm_en.jpeg')

            self.__clear_input(captcha_input_selector)
            self.browser.find_element_by_css_selector(
                captcha_input_selector).send_keys(rsp.pred_rsp.value)
            self.__inputLoginInfo()
            time.sleep(3)

    cookies = self.browser.get_cookies()
    # Persist the cookies, creating the cache directory on first use.
    os.makedirs('./cookies', exist_ok=True)
    with open('./cookies/zhihu.cookie', 'wb') as cookie_file:
        pickle.dump(cookies, cookie_file)

    cookie_dict = {cookie['name']: cookie['value'] for cookie in cookies}
    return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]