def __init__(self):
    """Start a visible Chrome session and attach a 10-second explicit wait.

    Image loading and proxy use are both switched off for this session.
    """
    chrome_options = dict(
        type=CHROME,
        executable_path=CHROME_DRIVER_PATH,
        headless=False,
        load_images=False,
        driver_use_proxy=False,
    )
    base_driver = BaseDriver(**chrome_options)
    # The wrapper hands out the underlying driver object through _get_driver().
    self.driver = base_driver._get_driver()
    self.wait = WebDriverWait(self.driver, 10)
def __init__(self):
    """Open a visible, image-loading Firefox session and set the search key."""
    firefox_options = dict(
        type=FIREFOX,
        executable_path=FIREFOX_DRIVER_PATH,
        headless=False,
        load_images=True,
    )
    # Only the wrapped driver object is kept; the BaseDriver wrapper itself
    # is not stored on the instance.
    self.driver = BaseDriver(**firefox_options).driver
    self.search_key = '杭州'
def _init_driver(self):
    """(Re)create ``self.driver`` as a visible, proxied Firefox session.

    The session uses a PC user agent, loads images, and takes its proxies
    from ``fz_ip_pool``.
    """
    firefox_options = dict(
        type=FIREFOX,
        executable_path=FIREFOX_DRIVER_PATH,
        user_agent_type=PC,
        load_images=True,
        driver_use_proxy=True,
        headless=False,
        ip_pool_type=fz_ip_pool,
    )
    # Only the wrapped driver object is kept; the BaseDriver wrapper itself
    # is not stored on the instance.
    self.driver = BaseDriver(**firefox_options).driver
def _test(self):
    """Fetch https://httpbin.org/get through a driver and return the echoed JSON.

    :return: the parsed content of the first ``<pre>`` element in the page,
        or ``{}`` when the page contains no ``<pre>`` element.
    """
    driver = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH)
    body = driver.get_url_body(url='https://httpbin.org/get')
    pre_pattern = re.compile('<pre.*?>(.*)</pre>')

    try:
        # The browser wraps the JSON response in a <pre> element.
        pre_text = pre_pattern.findall(body)[0]
        data = json_2_dict(pre_text, default_res={})
    except IndexError:
        return {}

    del driver
    return data
def main():
    """Open the STO home page in Chrome, run the Geetest cracker, take a screenshot.

    The browser is always closed on exit, even when navigation, cracking or
    the screenshot raises.
    """
    base_driver = BaseDriver(
        type=CHROME,
        executable_path=CHROME_DRIVER_PATH,
        headless=False,
        load_images=True,
        driver_use_proxy=False,
    )
    driver = base_driver._get_driver()
    try:
        driver.get('http://www.sto.cn/Home/Index')
        cracker = IndustryAndCommerceGeetestCrack(driver)
        cracker.crack()
        print(driver.get_window_size())
        sleep(3)
        driver.save_screenshot("screen.png")
    finally:
        # Previously quit() was only reached on the success path, leaving the
        # browser process running after any failure above.
        driver.quit()
def get_into_recommend_good_manage(self, driver: BaseDriver):
    """Open the "荐好管理" section by clicking its label and then its menu item.

    :param driver: driver already positioned on the back-office page
    :return: None
    :raises EnterTargetPageFailException: when a Selenium timeout occurs
        while locating or clicking either element
    """
    try:
        section_label = driver.find_element(
            value=self.recommend_good_label_css_selector)
        section_label.click()
        # Give the sub-menu below the label a moment to appear.
        sleep(.5)
        menu_item = driver.find_element(value='a.J_menuItem')
        menu_item.click()
    except SeleniumTimeoutException:
        # Could not reach the target page.
        raise EnterTargetPageFailException
def wait_for_delete_img_appear(self, driver: BaseDriver):
    """Block until the ``div.deletebut`` element shows the text '删除'.

    The delete button only appears once an image has been collected, so
    waiting for it avoids publishing an article without images.

    The loop has no time limit of its own; any timeout has to be imposed
    from outside this function (presumably by a decorator that is not
    visible here -- confirm).

    :param driver: driver positioned on the publishing form
    :return: None
    """
    delete_btn_text = None
    while delete_btn_text != '删除':
        try:
            delete_btn_text = driver.find_element(
                value='div.deletebut').text
        except NoSuchElementException:
            # The button is not in the DOM yet: back off briefly instead of
            # letting the exception end the wait.
            sleep(.3)

    self.lg.info('该url采集完毕!')
def login_bg(self, driver: BaseDriver):
    """Log in to the publishing back office and wait for the landing page.

    Loads ``self.publish_url``, fills in the username and password, submits
    the form, then waits for the "荐好管理" label to appear.

    :param driver: driver used for the whole publishing session
    :return: None
    :raises LoginFailException: when the login page body is empty or the
        login form cannot be filled in and submitted
    :raises EnterTargetPageFailException: when waiting for the landing page
        label times out
    """
    self.lg.info('login ...')
    body = driver.get_url_body(
        url=self.publish_url,
        timeout=30,
    )
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, which would silently disable this validation.
    if body == '':
        raise LoginFailException

    try:
        driver.find_element(value='input#loginName').send_keys(
            self.yx_username)
        driver.find_element(value='input#loginPwd').send_keys(
            self.yx_password)
        driver.find_element(value='button#subbut').click()
    except (
            NoSuchElementException,
            SeleniumTimeoutException,
            AssertionError,
            WebDriverException,
            AttributeError,
    ):
        # Any failure while driving the form counts as a failed login.
        raise LoginFailException

    try:
        self.wait_for_recommend_good_label_appear(driver=driver)
    except FZTimeoutError:
        # The landing page never showed up.
        raise EnterTargetPageFailException
async def _get_html_by_driver(self, url, load_images=False):
    """Fetch a JavaScript-rendered page through a throwaway driver.

    :param url: page address to load
    :param load_images: whether the driver should load images
    :return: whatever ``BaseDriver.get_url_body`` returns for ``url``
    """
    driver = BaseDriver(
        executable_path=self.driver_path,
        ip_pool_type=self.ip_pool_type,
        load_images=load_images)
    try:
        return driver.get_url_body(url=url)
    finally:
        # Drop the driver reference and collect on every exit path. The old
        # code skipped both when get_url_body raised, and wrapped the ``del``
        # in a bare ``except:`` although deleting a bound local cannot fail.
        del driver
        collect()
def test_driver_change_proxy():
    """Check that the driver can switch proxy between requests.

    Requests https://httpbin.org/get five times, asking for a new proxy and
    user agent each time, and prints the origin IP reported by httpbin.

    :return: None
    """
    # PhantomJS is the configuration the original author marked as working;
    # the Chrome variant was marked as having no effect.
    d = BaseDriver(
        type=PHANTOMJS,
        executable_path=PHANTOMJS_DRIVER_PATH,
        headless=True,
        driver_use_proxy=True,
        ip_pool_type=tri_ip_pool,
    )
    origin_ip_sel = {
        'method': 're',
        'selector': '\"origin\": \"(.*?)\",',
    }
    url = 'https://httpbin.org/get'

    try:
        for _ in range(5):
            body = d.get_url_body(
                url=url,
                timeout=20,
                change_proxy=True,
                change_user_agent=True,
            )
            if 'httpbin' in url:
                origin_ip = parse_field(
                    parser=origin_ip_sel,
                    target_obj=body,
                )
                print('origin_ip: {}'.format(origin_ip))
            else:
                print(body)
    finally:
        # Drop the driver reference even when a request raises. This replaces
        # a bare ``except:`` around a ``del`` that cannot fail.
        del d
def wait_for_delete_img_appear(self, driver: BaseDriver):
    """Block until the ``div.deletebut`` element shows the text '删除'.

    The delete button only appears once an image has been collected, so
    waiting for it avoids publishing an article without images.

    The loop has no time limit of its own, and any exception raised by
    ``driver.find_element`` propagates to the caller.

    :param driver: driver positioned on the publishing form
    :return: None
    """
    # Poll by re-querying the element until its text matches.
    while driver.find_element(value='div.deletebut').text != '删除':
        pass

    self.lg.info('该url采集完毕!')
def test_driver(
        _type=CHROME,
        headless=True,
        driver_use_proxy=True,
        url: str = 'https://httpbin.org/get',
) -> str:
    """Load ``url`` with the requested driver type and return the page body.

    :param _type: one of CHROME, FIREFOX or PHANTOMJS
    :param headless: passed through to ``BaseDriver``
    :param driver_use_proxy: passed through to ``BaseDriver``
    :param url: page address to load
    :return: the body returned by ``BaseDriver.get_url_body``
    :raises ValueError: when ``_type`` is not a supported driver type
    """
    if _type == CHROME:
        executable_path = CHROME_DRIVER_PATH
    elif _type == FIREFOX:
        executable_path = FIREFOX_DRIVER_PATH
    elif _type == PHANTOMJS:
        executable_path = PHANTOMJS_DRIVER_PATH
    else:
        raise ValueError('_type value 异常!')

    print('driver_type: {}, executable_path: {}, driver_use_proxy: {}'.format(
        _type,
        executable_path,
        driver_use_proxy))
    print('url: {}'.format(url))

    d = BaseDriver(
        type=_type,
        executable_path=executable_path,
        headless=headless,
        driver_use_proxy=driver_use_proxy,
        ip_pool_type=tri_ip_pool,
    )
    try:
        body = d.get_url_body(
            url=url,
            timeout=30,
        )
        print(body)
    finally:
        # Drop the driver reference even when the request raises. This
        # replaces a bare ``except:`` around a ``del`` that cannot fail.
        del d

    return body
def wait_for_recommend_good_label_appear(self, driver: BaseDriver):
    """Block until the label at ``self.recommend_good_label_css_selector`` reads '荐好管理'.

    The label appearing is treated as proof that the login succeeded.

    The loop has no time limit of its own, and any exception raised by
    ``driver.find_element`` propagates to the caller.

    :param driver: driver that has just submitted the login form
    :return: None
    """
    # Poll by re-querying the element until its text matches.
    while driver.find_element(
            value=self.recommend_good_label_css_selector).text != '荐好管理':
        pass

    self.lg.info('login success!')
async def auto_publish_articles(self):
    """Publish every not-yet-published article through the back office.

    Steps:
      1. refresh the SQL client and make sure it is connected;
      2. when either article-id bound is 0, fetch the latest article list
         and derive ``self.min_article_id`` / ``self.max_article_id``;
      3. gather candidate articles from the zq, hk, lfd and gxg sources and
         reduce them to the list still to be published;
      4. log in with a Chrome driver and publish each article, recording
         its uid in ``self.db_article_id_list`` and in the database.

    :return: None
    :raises SqlServerConnectionException: when the SQL client is not connected
    :raises AssertionError: when a fetched or combined article list is empty
    :raises LoginFailException: login failed or timed out
    :raises EnterTargetPageFailException: the target page could not be reached
    :raises PublishOneArticleFailException: publishing one article failed
        or timed out
    :raises ArticleTitleOverLongException: propagated from publishing
    :raises ArticleTitleContainSensitiveWordsException: propagated from
        publishing
    """
    self.sql_cli = get_new_sql_cli(sql_cli=self.sql_cli)
    if not self.sql_cli.is_connect_success:
        raise SqlServerConnectionException

    if self.min_article_id == 0 or self.max_article_id == 0:
        self.article_parser = ArticleParser(logger=self.lg)
        # NOTE(review): run_until_complete() is called from inside a
        # coroutine. If self.loop is the loop already running this coroutine
        # it raises RuntimeError -- confirm which loop this is before
        # switching to ``await``.
        article_list = self.loop.run_until_complete(
            self.article_parser.get_article_list_by_article_type(
                article_type=self.article_type,
            ))
        assert article_list != []

        self.min_article_id, self.max_article_id = \
            self.get_latest_max_and_min_artcile_id_from_article_list(
                article_list=article_list,
            )
        self.lg.info('最新的min_article_id: {}, max_article_id: {}'.format(
            self.min_article_id,
            self.max_article_id,
        ))

    # Build the candidate collections.
    zq_article_list = self.get_zq_own_create_article_id_list(
        min_article_id=self.min_article_id,
        max_article_id=self.max_article_id,
    )
    hk_article_list = self.get_hk_article_id_list()
    lfd_article_list = self.get_lfd_article_id_list()
    gxg_article_list = self.get_gxg_article_id_list()

    # Publishing order: articles first, videos afterwards, so that videos
    # do not dominate what gets published.
    article_list = zq_article_list + hk_article_list \
        + lfd_article_list + gxg_article_list
    assert article_list != []

    target_article_list = self.get_target_article_list(
        article_list=article_list)
    if target_article_list == []:
        self.lg.info('待发布的target_article_list为空list, pass!')
        return

    # Starting chromedriver in proxy mode on the Raspberry Pi fails some of
    # the time, so this is run on the Mac instead. Firefox was tried and
    # kept failing locally.
    driver = BaseDriver(
        type=CHROME,
        executable_path=CHROME_DRIVER_PATH,
        load_images=True,
        logger=self.lg,
        headless=self.driver_headless,
        driver_use_proxy=self.driver_use_proxy,
        ip_pool_type=self.ip_pool_type,
    )
    try:
        try:
            self.login_bg(driver=driver)
            self.get_into_recommend_good_manage(driver=driver)
        except FZTimeoutError:
            raise LoginFailException

        for item in target_article_list:
            uid = item.get('uid', '')
            title = item.get('title', '')
            article_url = item.get('article_url', '')
            self.lg.info('正在发布文章 title: {}, article_url: {} ...'.format(
                title,
                article_url))
            try:
                self.publish_one_article(
                    driver=driver,
                    article_url=article_url,
                )
            except FZTimeoutError:
                raise PublishOneArticleFailException

            # Remember the article in memory and in the database.
            self.db_article_id_list.append(uid)
            self.sql_cli._insert_into_table_2(
                sql_str=self.insert_sql0,
                params=(
                    uid,
                    get_shanghai_time(),
                ),
                logger=self.lg,
            )
    except (
            ArticleTitleOverLongException,
            LoginFailException,
            ArticleTitleContainSensitiveWordsException,
            PublishOneArticleFailException,
            EnterTargetPageFailException,
    ):
        # Domain failures are the caller's business: let them through
        # unchanged, ahead of the catch-all below.
        raise
    except Exception:
        self.lg.error('遇到错误:', exc_info=True)
    finally:
        # Deleting a bound local cannot fail, so the former nested
        # try/except ladder around this statement was dead code.
        del driver

    return
''' from time import sleep # Alert 父子关系弹窗对象 from selenium.webdriver.common.alert import Alert from fzutils.ip_pools import tri_ip_pool from fzutils.spider.fz_driver import ( BaseDriver, FIREFOX,) FIREFOX_DRIVER_PATH = '/Users/afa/myFiles/tools/geckodriver' target_url = 'https://sso.volkswagen.de/kpmweb/b2bLogin.do' d = BaseDriver( type=FIREFOX, load_images=True, ip_pool_type=tri_ip_pool, executable_path=FIREFOX_DRIVER_PATH,) def action(d): d.get_url_body(url=target_url, timeout=30) # 刷新使出现弹窗 d.refresh() alert = d.switch_to_alert() # alert 确认 or ok # alert.accept() # alert 取消 # alert.dismiss() # alert的文本 # text = alert.text
class ZhiHuLogin(object):
    """Drive a Firefox session through Zhihu's QR-code login and collect cookies."""

    def __init__(self):
        self._init_driver()
        self._set_headers()

    def _init_driver(self):
        """(Re)create ``self.driver`` as a visible, proxied Firefox session."""
        self.driver = BaseDriver(
            type=FIREFOX,
            executable_path=FIREFOX_DRIVER_PATH,
            user_agent_type=PC,
            load_images=True,
            driver_use_proxy=True,
            headless=False,
            ip_pool_type=fz_ip_pool,
        ).driver

    def _quit_driver(self):
        """Best-effort shutdown of the current browser session, if any."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            driver.quit()
        except Exception:
            # The session may already be gone; there is nothing useful to do
            # about a failed quit.
            pass

    def _set_headers(self):
        """Build the request headers used when downloading the QR code."""
        self.headers = {
            'authority': 'www.zhihu.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
        }

    def is_driver_low_version_error(self, body) -> bool:
        """Restart the driver until the "browser too old" banner is gone.

        Retries without limit: each time the banner is found in ``body`` the
        browser is replaced, the signup page reloaded and its source checked
        again.

        :param body: page source to inspect first
        :return: always True, once a page without the banner has been seen
        """
        low_version_pattern = re.compile('你正在使用的浏览器版本过低')
        while low_version_pattern.findall(body) != []:
            print('提示浏览器版本过低!')
            collect()
            # Close the stale browser before replacing it; previously every
            # retry left another browser process behind.
            self._quit_driver()
            self._init_driver()
            self.driver.get("https://www.zhihu.com/signup")
            sleep(3)
            body = self.driver.page_source

        return True

    def _get_login_cookies(self):
        """Open the QR-code login panel, wait for a scan, and return the cookies.

        :return: the list returned by ``self.driver.get_cookies()``
        """
        def _scan_qrcode(qrcode_url):
            """Download and display the QR code, then wait for it to be scanned.

            Currently unused: the call site below is disabled.
            """
            print('download qrcode ...')
            # Save the QR code locally so it can be displayed.
            qrcode_body = get(qrcode_url, headers=self.headers).content
            with open('./images/qrcode.jpg', 'wb') as f:
                f.write(qrcode_body)
            qrcode_img = Image.open('./images/qrcode.jpg')
            qrcode_img.show()

            before_url = self.driver.current_url
            print('wait to scan qrcode ...')
            sleep(15)
            # A changed URL means the scan logged the session in. (This was a
            # ``while`` whose body always ended in ``break``.)
            if self.driver.current_url != before_url:
                print('扫码登陆成功!')
                print('-' * 100)

            return True

        self.driver.get("https://www.zhihu.com/signup")
        sleep(3)
        self.is_driver_low_version_error(body=self.driver.page_source)
        try:
            self.driver.find_element_by_css_selector(
                'div.SignContainer-switch span').click()
            sleep(1)
            self.driver.find_element_by_css_selector(
                'span.Login-qrcode button').click()
            sleep(2)
            qrcode_url = Selector(text=self.driver.page_source).css(
                'div.Qrcode-img img ::attr("src")').extract_first()
            print('获取到的二维码地址为:{}'.format(qrcode_url))
            print('wait to scan qrcode ...')
            # Scanning through _scan_qrcode(qrcode_url=qrcode_url) is
            # disabled; the QR code is scanned from the visible browser.
        except (NoSuchElementException, IndexError) as e:
            print(e)

        # TODO: the site sometimes answers "Missing argument grant_type".
        # Leave time for the QR code to be scanned by hand.
        sleep(20)
        cookies = self.driver.get_cookies()
        pprint(cookies)

        return cookies

    def __del__(self):
        try:
            del self.driver
        except AttributeError:
            # __init__ failed before the driver was attached.
            pass
        collect()
def publish_one_article(self, driver: BaseDriver, article_url: str):
    """Collect one article by URL in the back office and publish it.

    :param driver: logged-in driver positioned on the recommend-management page
    :param article_url: address of the article to collect and publish
    :return: None
    :raises NoSuchFrameException: the target iframe could not be switched to
    :raises AssertionError: the page contains no iframe at all
    :raises PublishOneArticleFailException: collecting the article timed
        out, a required element was missing, or the publish click failed
    :raises ArticleTitleContainSensitiveWordsException: the collected title
        contains a blacklisted string
    :raises ArticleTitleOverLongException: the collected title is longer
        than 30 characters
    """
    try:
        # Switch into the target iframe. Addressing it by index alone proved
        # unreliable, so the iframe elements are looked up first.
        iframe_ele_list = driver.find_elements(by=By.TAG_NAME, value='iframe')
        assert iframe_ele_list != []
        if len(iframe_ele_list) > 1:
            target_iframe_ele = iframe_ele_list[1]
        else:
            target_iframe_ele = iframe_ele_list[0]
        driver.switch_to_frame(frame_reference=target_iframe_ele)
    except NoSuchFrameException:
        # No frame matched (possibly the driver was already inside it, e.g.
        # after an over-long title); leave the decision to the caller.
        raise

    # Replace whatever is in the URL box with the article address.
    url_input = driver.find_element(value='input#SnatchUrl')
    url_input.clear()
    url_input.send_keys(article_url)

    # First button of the group: "collect".
    collect_button = driver.find_elements(
        value='span.input-group-btn button')[0]
    collect_button.click()

    try:
        self.wait_for_delete_img_appear(driver=driver)
    except (FZTimeoutError, NoSuchElementException):
        # Collecting timed out or the expected element never existed.
        raise PublishOneArticleFailException

    # Read back the title the back office extracted.
    title_input = driver.find_element(value='input#RecommendName')
    title = title_input.get_attribute('value')
    self.lg.info('title: {}'.format(title))

    has_sensitive_words = target_str_contain_some_char_check(
        target_str=title,
        check_char_obj=ARTICLE_TITLE_SENSITIVE_STR_TUPLE)
    if has_sensitive_words:
        raise ArticleTitleContainSensitiveWordsException

    if isinstance(title, str) and len(title) > 30:
        self.lg.info('@@@ title 标题过长, 无法发布!! 跳过!')
        # After an over-long title the following articles cannot be
        # processed, so raise instead of merely returning.
        raise ArticleTitleOverLongException

    try:
        # Second button of the group: "publish".
        publish_button = driver.find_elements(
            value='span.input-group-btn button')[1]
        publish_button.click()
    except WebDriverException:
        # Seen as "unknown error: Element <iframe class="J_iframe"
        # name="iframe0" ..." when the click fails.
        raise PublishOneArticleFailException

    # Back to the top-level document for the confirmation dialog.
    driver.switch_to_default_content()

    # Fill in the publisher field with a random phone number and confirm.
    random_phone = self.get_random_phone()
    phone_input = driver.find_element(value='input.layui-layer-input')
    phone_input.send_keys(random_phone)
    confirm_button = driver.find_element(value='a.layui-layer-btn0')
    confirm_button.click()

    self.lg.info('url: {} 发布成功!'.format(article_url))
    # Give the page five seconds to reset its form fields.
    sleep(5.)

    return
def get_home_page_info_by_page_num(self, page_num: int) -> list:
    """Fetch one page of the doutula.com article list and parse its entries.

    :param page_num: page number, sent as the ``page`` query parameter
    :return: list of dicts with keys ``title``, ``create_time`` and
        ``article_img_list`` (a list of ``{'img_name', 'img_url'}`` dicts);
        entries missing any of these fields are skipped
    """
    def parse_page_info(body) -> list:
        """Extract the article entries from the page source ``body``."""
        # One entry per article block.
        li_sel = {
            'method': 'css',
            'selector': 'div.center-wrap a.random_list',
        }
        title_sel = {
            'method': 'css',
            'selector': 'div.random_title ::text',
        }
        create_time_sel = {
            'method': 'css',
            'selector': 'div.date ::text',
        }
        article_img_url_sel = {
            'method': 'css',
            'selector': 'div.random_article img ::attr("data-original")',
        }
        article_img_name_sel = {
            'method': 'css',
            'selector': 'div.random_article img ::attr("alt")',
        }
        li_list = parse_field(
            parser=li_sel,
            target_obj=body,
            is_first=False,
        )

        res = []
        for item in li_list:
            try:
                # Explicit checks instead of ``assert``: assertions are
                # stripped under ``python -O``, which would let incomplete
                # entries through.
                title = parse_field(
                    parser=title_sel,
                    target_obj=item,
                )
                if title == '':
                    continue

                create_time = parse_field(
                    parser=create_time_sel,
                    target_obj=item,
                )
                if create_time == '':
                    continue

                article_img_url_list = parse_field(
                    parser=article_img_url_sel,
                    target_obj=item,
                    is_first=False,
                )
                if article_img_url_list == []:
                    continue

                article_img_name_list = parse_field(
                    parser=article_img_name_sel,
                    target_obj=item,
                    is_first=False,
                )
                if article_img_name_list == []:
                    continue

                article_img_list = [{
                    'img_name': img_name,
                    'img_url': img_url,
                } for img_name, img_url in zip(
                    article_img_name_list, article_img_url_list)]
            except (AssertionError, IndexError):
                # Malformed entry: skip it.
                continue

            res.append({
                'title': title,
                'create_time': create_time,
                'article_img_list': article_img_list,
            })

        return res

    # These headers are only consumed by the plain-requests fetch, which is
    # disabled because it returned garbled text; they are still built so the
    # call sequence is unchanged.
    headers = self.get_random_phone_headers()
    headers.update({
        'authority': 'www.doutula.com',
        'referer': 'https://www.doutula.com/',
    })
    params = (('page', str(page_num)), )
    url = 'https://www.doutula.com/article/list/'

    # Fetch through a driver instead of requests.
    d = BaseDriver(
        ip_pool_type=tri_ip_pool,
        user_agent_type=PHONE)
    try:
        body = d.get_url_body(
            url=_get_url_contain_params(url=url, params=params))
    finally:
        # Drop the driver reference even when the fetch raises. This replaces
        # a bare ``except:`` around a ``del`` that cannot fail.
        del d

    res = parse_page_info(body=body)
    print('[{}] page_num: {}'.format(
        '+' if res != [] else '-',
        page_num,
    ))
    collect()

    return res
def get_stg_search_res2(self, k: str, default_sort_value: int=None) -> dict:
    """Search etkz.cn ("搜题狗") for ``k`` through a driver and parse the results.

    Only the first result page is read.

    :param k: keyword typed into the site's search box
    :param default_sort_value: echoed back unchanged as ``page_num``
    :return: ``{'k': k, 'page_num': default_sort_value, 'res': [...]}``
        where each element of ``res`` is a dict with ``question_desc`` and
        ``answer``
    :raises AssertionError: when the search page, the result page or the
        result list is empty
    """
    # The keyword used to be overwritten here with a hard-coded debug value,
    # so every call searched for the same term whatever the caller passed.
    driver = BaseDriver(
        executable_path=PHANTOMJS_DRIVER_PATH,
        load_images=False,
        logger=self.lg,
        user_agent_type=PHONE,
        ip_pool_type=self.ip_pool_type,
    )
    # Selectors for the search box and its submit button.
    input_css_sel = 'input#scform_srchtxt'
    submit_btn_sel = 'button#scform_submit'
    try:
        body = driver.get_url_body(
            url='http://www.etkz.cn/search.php?mod=forum',
            css_selector=submit_btn_sel,
            timeout=20,)
        assert body != ''
        driver.find_element(value=input_css_sel).send_keys(k)
        driver.find_element(value=submit_btn_sel).click()
        # Let the result page render.
        sleep(5.)
        body = Requests._wash_html(driver.page_source)
    finally:
        # Drop the driver reference on every exit path. Previously a failed
        # assertion or lookup skipped this step.
        del driver

    assert body != ''
    self.lg.info(body)

    question_item_sel = {
        'method': 'css',
        'selector': 'div#threadlist ul li',
    }
    question_desc_div_sel = {
        'method': 're',
        'selector': '问题:(.*?)答案:',
    }
    answer_div_sel = {
        'method': 're',
        'selector': '答案:(.*?)更多相关问题',
    }
    question_item = parse_field(
        parser=question_item_sel,
        target_obj=body,
        is_first=False,
        logger=self.lg,
    )
    assert question_item != []

    res = []
    # Results are kept in page order.
    for item in question_item:
        try:
            question_desc_div = parse_field(
                parser=question_desc_div_sel,
                target_obj=item,
                logger=self.lg,
            )
            # Explicit checks instead of ``assert`` so incomplete items are
            # still skipped under ``python -O``.
            if question_desc_div == '':
                continue

            answer_div = parse_field(
                parser=answer_div_sel,
                target_obj=item,
                logger=self.lg,
            )
            if answer_div == '':
                continue

            # Strip markup from both fields.
            question_desc = fix_text(wash_sensitive_info(
                data=question_desc_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                    '<span .*?>',
                    '</span>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
            answer = fix_text(wash_sensitive_info(
                data=answer_div,
                replace_str_list=[],
                add_sensitive_str_list=[
                    '<strong>',
                    '</strong>',
                    '<font .*?>',
                    '</font>',
                ],
                is_default_filter=False,
                is_lower=False,
            ))
        except Exception:
            # Best effort: one unparsable item must not drop the others.
            continue

        ask_questions_result_item = AskQuestionsResultItem()
        ask_questions_result_item['question_desc'] = question_desc
        ask_questions_result_item['answer'] = answer
        res.append(dict(ask_questions_result_item))

    self.lg.info('[{}] stg2, k: {}'.format(
        '+' if res != [] else '-',
        k,
    ))

    return {
        'k': k,
        'page_num': default_sort_value,
        'res': res,
    }
class TaoWaiMaiSpider(object):
    """Semi-manual crawler for the Taobao take-away (waimai) H5 shop list."""

    def __init__(self):
        self.driver = BaseDriver(
            type=FIREFOX,
            executable_path=FIREFOX_DRIVER_PATH,
            headless=False,
            load_images=True,
        ).driver
        self.search_key = '杭州'

    def _actions(self):
        """Open the page, search for a location, scroll down and print the shops.

        The location itself has to be picked by hand in the visible browser
        during the 10-second pause. Any exception is printed, not raised.

        :return: None
        """
        url = 'https://h5.m.taobao.com/app/waimai/index.html#/'
        self.driver.get(url)
        sleep(3)
        try:
            self.driver.find_element_by_css_selector(
                'div.location span').click()
            sleep(2)
            self.driver.find_element_by_css_selector(
                'div.search input').send_keys(self.search_key)
            sleep(2.5)

            # Picking the first suggested address automatically did not work
            # out, so the operator chooses it by hand.
            print('请点击选择定位处...')
            sleep(10)

            # Scroll down step by step until the bottom of the page is hit.
            scroll_js = r'''
            function scrollToBottom() {
                var Height = document.body.clientHeight,  //文本高度
                    screenHeight = window.innerHeight,  //屏幕高度
                    INTERVAL = 100,  // 滚动动作之间的间隔时间
                    delta = 500,  //每次滚动距离
                    curScrollTop = 0;    //当前window.scrollTop 值
                var scroll = function () {
                    curScrollTop = document.body.scrollTop;
                    window.scrollTo(0,curScrollTop + delta);
                };
                var timer = setInterval(function () {
                    var curHeight = curScrollTop + screenHeight;
                    if (curHeight >= Height){   //滚动到页面底部时,结束滚动
                        clearInterval(timer);
                    }
                    scroll();
                }, INTERVAL)
            }
            scrollToBottom()
            '''
            self.driver.execute_script(script=scroll_js)
            sleep(5)

            body = self.driver.page_source
            shop_list = Selector(text=body).css(
                'div.list div.list-item ::text').extract() or []
            pprint(shop_list)
        except Exception as e:
            print(e)

        # NOTE(review): placed after the try/except so the browser stays
        # open for inspection on both paths -- confirm the original nesting.
        sleep(60)

    def __del__(self):
        try:
            del self.driver
        except AttributeError:
            # __init__ failed before the driver was attached.
            pass
        collect()