def verify_code():
    cop = ChromeOperate('https://www.jsdati.com/login', executable_path=executable_path)
    # sys.exit()  # leftover early exit; if kept, the function aborts before the login flow runs
    ele = cop.find_element_by_id('login-captcha-img')
    code = get_auth_code(cop.driver, ele)
    name_input = cop.find_element_by_name('username')
    psd_input = cop.find_element_by_name('password')
    captcha_input = cop.find_element_by_name('captcha')
    cop.input_words(name_input, 'vobile123')
    cop.input_words(psd_input, 'vobile@123')
    cop.input_words(captcha_input, code)
    time.sleep(3)
    captcha_input.send_keys(Keys.ENTER)
    # No exception handling here; e.g. if captcha recognition fails, it should be retried a few times
    time.sleep(200)
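# --- Hedged sketch (not in the original): a retry loop for the missing captcha error
# handling noted above. It assumes get_auth_code returns a falsy value on failure and
# that clicking the captcha image loads a fresh one; both are assumptions.
def verify_code_with_retry(max_attempts=3):
    cop = ChromeOperate('https://www.jsdati.com/login', executable_path=executable_path)
    for attempt in range(max_attempts):
        ele = cop.find_element_by_id('login-captcha-img')
        code = get_auth_code(cop.driver, ele)  # assumed to return '' or None on failure
        if code:
            return cop, code
        ele.click()       # assumption: clicking refreshes the captcha image
        time.sleep(2)
    raise RuntimeError('captcha recognition failed after %d attempts' % max_attempts)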
def main(needLogin=False):
    arguments = []
    if not needLogin:
        # arguments.append('headless')
        pass
    so = ChromeOperate(
        executable_path=r'C:\Users\Administrator\Desktop\chromedriver.exe',
        User_data_dir=r'D:\User Data',
        arguments=arguments)
    driver = so.driver
    driver.get('https://www.bilibili.com/')
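# --- Hedged sketch (not in the original): the ChromeOperate wrapper presumably forwards
# `arguments` and `User_data_dir` to Chrome options. The equivalent setup in plain
# Selenium 3-style code would look roughly like this; the wrapper's exact behaviour is
# an assumption.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')                      # only when login is not needed
options.add_argument(r'--user-data-dir=D:\User Data')   # reuse an existing Chrome profile
driver = webdriver.Chrome(
    executable_path=r'C:\Users\Administrator\Desktop\chromedriver.exe',
    options=options)
driver.get('https://www.bilibili.com/')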
def main():
    cop = ChromeOperate(
        executable_path=r'F:\work\py3_pj\amazon_craw\chromedriver.exe')
    url = 'https://m.weibo.cn/status/4470730772817031'
    cop.open(url)
    while True:
        time.sleep(1)
        cop.down_page()
        page_buf = cop.open_source()
        comments = ct.getXpath('//h3/text()', page_buf)
        comments = [[text] for text in comments]
        ct.writer_to_csv(comments, 'comments.csv')
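# --- Hedged sketch (not in the original): the loop above scrolls forever and rewrites the
# CSV on every pass. A bounded variant that stops once the page height stops growing,
# assuming cop.driver exposes the underlying Selenium driver as in the other scripts here.
def scroll_until_stable(cop, pause=1, max_rounds=50):
    driver = cop.driver
    last_height = driver.execute_script('return document.body.scrollHeight')
    for _ in range(max_rounds):
        cop.down_page()
        time.sleep(pause)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:   # no new content loaded, stop scrolling
            break
        last_height = new_height
    return cop.open_source()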
# -*- coding: utf-8 -*-
# @Time   : 2019/7/23 10:09
# @Author : meng_zhihao
# @Email  : [email protected]
# @File   : twitter.py
from selenium_operate import ChromeOperate
import re
import time

cop = ChromeOperate(executable_path=r'chromedriver.exe',
                    arguments=['--proxy-server=http://%s' % '127.0.0.1:1080'])
cop.open('https://twitter.com/1984to1776')
page = cop.open_source()
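# --- Hedged sketch (not in the original): a quick way to confirm requests really leave
# through the local proxy before scraping. httpbin.org/ip is just a convenient echo
# service, not something the original script uses.
cop.open('https://httpbin.org/ip')
exit_ip = re.search(r'"origin":\s*"([\d.]+)"', cop.open_source())
if exit_ip:
    print('requests exit via', exit_ip.group(1))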
from selenium.webdriver.common.keys import Keys
import getpass
import time
from tools.db_util import DBUtil
from selenium_operate import ChromeOperate
from tools.md5_convert import *

username = getpass.getuser()

import sys
reload(sys)                      # Python 2 only; Python 3 already defaults to UTF-8
sys.setdefaultencoding('utf8')

# default_path = r'C:\Users\%s\AppData\Local\Google\Chrome\User Data' % username
default_path = r'D:\User Data'
arguments = []
arguments.append('headless')
so = ChromeOperate(
    executable_path=r'C:\Users\Administrator\Desktop\chromedriver.exe',
    User_data_dir=r'D:\User Data',
    arguments=arguments)
driver = so.driver
# driver = webdriver.Chrome()
# driver = webdriver.PhantomJS()
time.sleep(5)

time_range = 60 * 60 * 14   # run window: 14 hours
start_time = time.time()
# url, hash?
print('Opening link')
driver.get('https://www.baidu.com/s?wd=python')
print('Browser opened')
print(driver.title)
conn, cursor = DBUtil.get_cursor('web_detect')
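# --- Hedged sketch (not in the original): time_range and start_time suggest the script is
# meant to keep polling for roughly 14 hours; that loop is not shown in this excerpt. A
# hypothetical outline of how those variables could drive it; check_page is invented
# purely for illustration.
def check_page(driver, cursor):
    # placeholder for whatever detection / DB work the real script performs
    pass

while time.time() - start_time < time_range:
    check_page(driver, cursor)
    time.sleep(60)   # poll once a minute until the 14-hour window elapses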
def jingdong():
    cop = ChromeOperate(
        'http://miao.item.taobao.com/564003208573.htm?spm=5070.7116889.1996665613.3.iQXafH',
        executable_path=executable_path)
def taobao():
    print(executable_path)
    cop = ChromeOperate(
        'https://fuwu.taobao.com/ser/detail.htm?spm=a1z13.8114203.1234-fwlb.5.66c25acaQRsoGC&service_code=ts-1796606&tracelog=category&scm=1215.1.1.51052018%E3%80%81',
        executable_path=executable_path)
def weibo_search():
    start_date = '2019-12-01'
    end_date = '2020-01-01'
    cop = ChromeOperate(
        executable_path=r'F:\work\py3_pj\amazon_craw\chromedriver.exe')
    date_list = dateRange(start_date, end_date)
    print(date_list)
    search_url_template = 'https://s.weibo.com/weibo?q=%E5%8D%8E%E4%B8%BA%20-%E5%96%9C%E9%A9%AC%E6%8B%89%E9%9B%85&scope=ori&typeall=1&suball=1&timescope=custom:{0}-{1}:{2}-{3}&Refer=SWeibo_box'
    # CSV header: time, post id, user, user id, comment count, repost count, like count, post text
    line = ['时间', '博文id', '用户', '用户id', '评论', '转发', '点赞', '博文']
    hours = [[0, 2], [2, 4], [4, 6], [6, 8], [8, 10], [10, 12],
             [12, 14], [14, 16], [16, 18], [18, 20], [20, 22], [22, 23]]
    yield line
    for i in range(len(date_list) - 1):
        for hour in hours:
            start_date = date_list[i]
            end_date = date_list[i + 1]
            print(start_date)
            shour, ehour = hour[0], hour[1]
            search_url = search_url_template.format(start_date, shour, start_date, ehour)
            cop.open(search_url)
            for page_num in range(5):
                try:
                    page_buf = cop.open_source()
                    posts = ct.getXpath('//div[@class="card-wrap"]', page_buf)
                    for post in posts:
                        texts = ct.getXpath(
                            '//p[@node-type="feed_list_content_full"]//text()', post)
                        if not texts:
                            texts = ct.getXpath(
                                '//p[@node-type="feed_list_content"]//text()', post)
                        texts = ''.join(texts)
                        if not texts:
                            continue
                        date = ""
                        from_source = ct.getXpath('//p[@class="from"]', post)
                        if from_source:
                            date = ct.getXpath1('//a/text()', from_source[-1])
                            date = date.strip()
                        nick = ct.getXpath1('//a/@nick-name', post)
                        user_id = ct.getXpath1('//a[@class="name"]/@href', post)
                        user_id = ct.getRegex('weibo.com/(\d+)', user_id)
                        mid = ct.getXpath1('//div/@mid', post)
                        # comment count
                        comments_button = ct.getXpath1(
                            '//a[@action-type="feed_list_comment"]/text()', post)
                        comments_count = ct.getRegex('评论 (\d+.*)', comments_button)
                        if not comments_count:
                            comments_count = 0
                        # repost count
                        feed_list_forward_button = ct.getXpath1(
                            '//a[@action-type="feed_list_forward"]/text()', post)
                        forward_count = ct.getRegex('转发 (\d+.*)', feed_list_forward_button)
                        if not forward_count:
                            forward_count = 0
                        # like count
                        like_button = ct.getXpath(
                            '//a[@action-type="feed_list_like"]', post)
                        if like_button:
                            like_button = like_button[-1]
                            like_button = ct.getXpath1("//em/text()", like_button)
                            like_count = ct.getRegex('(\d+.*)', like_button)
                            if not like_count:
                                like_count = 0
                        else:
                            like_count = 0
                        # Fetching full comment text via https://m.weibo.cn/api/comments/show?id=<mid>
                        # is disabled here: it gets the IP banned, and many comments are hidden
                        # (sensitive words, user settings) even when the comment count is non-zero.
                        line = [
                            date, mid, nick, user_id, comments_count,
                            forward_count, like_count, texts
                        ]
                        yield line
                    next_button = cop.find_elements_by_xpath(
                        '//a[@class="next"]')
                    if next_button:
                        time.sleep(random.randint(2, 4))
                        next_button[0].click()
                    else:
                        break
                except Exception as e:
                    print(e)
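# --- Hedged sketch (not in the original): weibo_search yields one row per post, starting
# with a header row. One way to consume it with the standard csv module; the original
# presumably uses ct.writer_to_csv, whose exact signature is not shown here.
import csv

with open('weibo_posts.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    for row in weibo_search():
        writer.writerow(row)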
def weibo_search():
    # start_date = '2020-01-10'  # 10-12
    start_date = '2020-01-10'
    end_date = '2020-02-12'
    cop = ChromeOperate(
        executable_path=r'F:\work\py3_pj\amazon_craw\chromedriver.exe')
    date_list = dateRange(start_date, end_date)
    print(date_list)
    search_url_template = 'https://s.weibo.com/weibo/%25E6%2594%25BF%25E5%25BA%259C?q=whzf&scope=ori&suball=1&timescope=custom:{0}-0:{1}-0&Refer=g'
    for i in range(len(date_list) - 1):
        start_date = date_list[i]
        end_date = date_list[i + 1]
        search_url = search_url_template.format(start_date, end_date)
        cop.open(search_url)
        for page_num in range(50):
            try:
                page_buf = cop.open_source()
                posts = ct.getXpath('//div[@class="card-wrap"]', page_buf)
                for post in posts:
                    texts = ct.getXpath(
                        '//p[@node-type="feed_list_content_full"]//text()', post)
                    if not texts:
                        texts = ct.getXpath(
                            '//p[@node-type="feed_list_content"]//text()', post)
                    texts = ''.join(texts)
                    if not texts:
                        continue
                    date = ""
                    from_source = ct.getXpath('//p[@class="from"]', post)
                    if from_source:
                        date = ct.getXpath1('//a/text()', from_source[-1])
                        date = date.strip()
                    nick = ct.getXpath1('//a/@nick-name', post)
                    mid = ct.getXpath1('//div/@mid', post)
                    # comment text
                    comments_button = ct.getXpath1(
                        '//a[@action-type="feed_list_comment"]/text()', post)
                    get_comments = []
                    if ct.getRegex('评论 (\d+.*)', comments_button):
                        try:
                            # Fetching comments via this API can get the IP banned, and many
                            # comments are hidden (sensitive words, user settings) even when
                            # the comment count is non-zero.
                            comments_page = ct.get(
                                'https://m.weibo.cn/api/comments/show?id=' + mid)
                            json_data = json.loads(comments_page)
                            comments = json_data['data']['data']
                            for comment in comments:
                                comment_text = comment['text']
                                get_comments.append(comment_text)
                        except Exception as e:
                            print(e, mid)
                    line = [date, mid, nick, texts] + get_comments
                    yield line
                next_button = cop.find_elements_by_xpath('//a[@class="next"]')
                if next_button:
                    time.sleep(random.randint(1, 2) * 0.6)
                    next_button[0].click()
                else:
                    break
            except Exception as e:
                print(e)
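# --- Hedged sketch (not in the original): both weibo_search variants call a dateRange
# helper that is not shown in this excerpt. A plausible implementation, assuming it
# returns an inclusive list of 'YYYY-MM-DD' strings; the real helper may differ.
import datetime

def dateRange(start_date, end_date, fmt='%Y-%m-%d'):
    start = datetime.datetime.strptime(start_date, fmt)
    end = datetime.datetime.strptime(end_date, fmt)
    days = (end - start).days
    return [(start + datetime.timedelta(days=i)).strftime(fmt) for i in range(days + 1)]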
# new_url = get_Complete_url(line)
# if not new_url:
#     url_info = re.search('table=([\w-]+)', line)
#     if not url_info:
#         url_info = re.search('ProjectToken=([\w-]+)', line)
#     if url_info:
#         key_id = url_info.group(1)
#         new_url = 'https://s.cint.com/Survey/Complete?ProjectToken=' + key_id
# if new_url:
#     print(new_url)
#     cop = ChromeOperate(executable_path=r'chromedriver.exe')
#     cop.open('https://www.lifepointspanel.com/')
#     cop_list.append(cop)
#     new_url_list.append(new_url)

cop = ChromeOperate(executable_path=r'chromedriver.exe')
cop.open('https://www.lifepointspanel.com/')
while True:
    time.sleep(1)
    page_source = cop.driver.page_source
    if '很高兴见到您' in page_source:  # "Nice to see you" greeting shown once logged in
        break
page_source = cop.driver.page_source
urls = re.findall('survey-link="(https:.*?)"', page_source)
links = [url for url in urls]
print(links)
for link in links:
    # 'https://router.cint.com/survey/AQAAAL7nugAAAAAAmG-5FA@AQAAAHtb29WspZP0tE42GLad1uZV0jb-oDFne7DJ21JKNo_8'