#!/usr/local/python3/bin/python3 from lxml import etree import os from Tools.tools import debug_log logger = debug_log(os.getcwd(), name='lxml.log') html = ''' <html> <title> This is Title </title> <body> <h1> This is h1 </h1> <div> This is fisrt div </div> <div id="divid"> <img src="1111.png"/> <span id="sp1"> desc 1111.png </span> <img src="2222.png"/> <span id="sp2"> desc 2222.png </span> <p> <a href="http://www.xxxxx.com/"> link-of-xxxxxx </a> </p> <a href="http://www.yyyyyyy.com/"> link-of-yyyyyyyyy </a> <br/> <a href="http://www.zzzzzzz.com/"> link-of-zzzzzzzzz </a> </div>
import os
import time
from selenium import webdriver
import selenium.common
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name="play_tengxun.log")


def play_tengxun_video():
    """Open v.qq.com in headless Chrome and navigate its channels.

    NOTE(review): chunk truncated inside the try block -- the except/retry
    logic and the loop increment are not visible here.
    """
    index_url = "https://v.qq.com/"
    # Headless Chrome configuration.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(chrome_options=options)
    # debug: retry once more on timeout
    n = 1
    while n < 3:
        try:
            driver.get(index_url)
            # Handle of the current (index) window.
            index_win = driver.current_window_handle
            # Enter the animation channel
# encoding:utf-8 import urllib.request import os from Tools.tools import debug_log logger = debug_log(os.getcwd(), name='check_proxy.log') ''' httpproxy=urllib2.ProxyHandler({"http":"10.36.132.41:808"})#代理无需账号 opener=urllib2.build_opener(httpproxy)#创建一个打开器 request=urllib2.Request("http://www.baidu.com") #访问百度 response=opener.open(request)#打开网页,内置代理服务器 print response.read() ''' # 测试代理是否可用 # 测试个多代理 def check_proxys(proxy_dict_list): if not isinstance(proxy_dict_list, list) and not isinstance( proxy_dict_list[0], dict): logger.debug("请输入正确的代理") i = 0 for proxy_dict in proxy_dict_list: proxy = urllib.request.ProxyHandler(proxy_dict) # nohttpproxy=urllib.request.ProxyHandler({}) #空代理 opener = urllib.request.build_opener(proxy) request = urllib.request.Request( "http://www.baidu.com/") #代理访问,URL必须完整, try: response = opener.open(request, timeout=4)
import os
import selenium.webdriver
from selenium.webdriver.chrome.options import Options
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='chrome.log')


def use_chrome_withNonhead():
    """Fetch a blog page with headless Chrome and print its HTML source."""
    # All Chrome flags in one place; order matches the original setup.
    flags = (
        "--no-sandbox",                        # required when running as root
        '--headless',                          # no-UI mode
        'window-size=1920x3000',               # fixed browser resolution
        '--disable-gpu',                       # Google docs suggest this to avoid a bug
        '--hide-scrollbars',                   # hide scrollbars on odd pages
        'blink-settings=imagesEnabled=false',  # skip image loading
    )
    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)
    driver = selenium.webdriver.Chrome(chrome_options=chrome_options)
    driver.get('https://www.cnblogs.com/z-x-y/p/9026226.html')
    print(driver.page_source)


if __name__ == '__main__':
    use_chrome_withNonhead()
import selenium.webdriver
import os
import time
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='qq_screen.log')


def get_qq_screen():
    """Open baidu.com in PhantomJS and save a screenshot into the CWD."""
    browser = selenium.webdriver.PhantomJS()
    browser.get('http://www.baidu.com/')  # https://www.baidu.com/
    # time.sleep(5)
    # Debug aid: list the driver's attributes before capturing the page.
    print(dir(browser))
    browser.save_screenshot(os.getcwd() + '/biadu_screen.png')
    browser.close()


if __name__ == '__main__':
    get_qq_screen()
#!/usr/local/python3/bin/python3 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=1' import selenium import selenium.webdriver import urllib.parse import os import re import urllib.request from Tools.tools import debug_log logger = debug_log(os.getcwd()) def get_pages(): pass def get_url(addr='深圳', search_word='python'): data = {'jl': addr, 'kw': search_word} data = urllib.parse.urlencode(data) url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + data + '&sm=0&p=1' logger.debug(url) driver = selenium.webdriver.PhantomJS() driver.get(url) page_source = driver.page_source # logger.debug(page_source) restr = 'href=(\s\S*?)' src_pattern = re.compile(restr)
import time
import os
import re
from selenium import webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='windows.log')


def handle_windowns():
    """Demonstrate window-handle switching on baike.baidu.com
    (chunk truncated after the login click)."""
    index_url = "https://baike.baidu.com/item/Java/85979?fr=aladdin"
    # Headless Chrome configuration.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(index_url)
    # Main window handle.
    index_win = driver.current_window_handle
    # Click the login link ("登录").
    login_elem = driver.find_element_by_link_text("登录")
    login_elem.click()
    login_win = driver.current_window_handle
    time.sleep(3)
import time
import os
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='solve_some_tags.log')


def solve_some_tags():
    """Hover over baidu's settings menu and open the search-settings page
    (chunk truncated before the <select> handling)."""
    url = "https://www.baidu.com/"
    # Headless Chrome configuration.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    # Hover to reveal the settings dropdown, then click "搜索设置".
    setting_elem = driver.find_element_by_link_text("设置")
    ActionChains(driver).move_to_element(setting_elem).perform()
    search_settring_elem = driver.find_element_by_link_text("搜索设置")
    search_settring_elem.click()
    # Configure the <select> tag
import os
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='lovepython.log')


# str.join example
def test_join():
    # NOTE(review): `str` shadows the builtin of the same name and is unused.
    str = 'hello'
    str2 = 'python'
    # Joins the characters of str2 with spaces -> 'p y t h o n'.
    str3 = ' '.join(str2)
    print(str3)


# Difference between `is` and `==`
def test_is():
    a = 100
    b = 100
    c = 50
    print(id(a))
    print(id(b))
    print(id(c))
    # True in CPython because small ints are interned (implementation detail,
    # not a language guarantee).
    print(a is b)
    # Unlike Java etc., Python variables are references bound to objects.


# Deep copy vs shallow copy (body truncated in this chunk)
def test_deepcopy():
import selenium.webdriver
import os
import time
from Tools.tools import debug_log
from selenium.webdriver.support.select import Select

logger = debug_log(os.getcwd(), name='login_qq_mailbox.log')


def login_qq_mailbox():
    """Open qzone.qq.com in PhantomJS, switch into the login iframe, and
    print the frame's HTML source.

    Fix: `driver.switch_to_frame(...)` has been deprecated for years and was
    removed in Selenium 4; the supported API is `driver.switch_to.frame(...)`
    (which the original already referenced in a comment).
    """
    driver = selenium.webdriver.PhantomJS()
    driver.get('https://qzone.qq.com/')
    # driver.get('https://blog.csdn.net/linlu_home/article/details/78799878')
    time.sleep(3)  # crude wait for the login iframe to load
    # login user u and p
    # Other switch_to targets, for reference:
    #   element = driver.switch_to.active_element
    #   alert = driver.switch_to.alert
    #   driver.switch_to.default_content()
    #   driver.switch_to.frame('frame_name')
    #   driver.switch_to.frame(1)
    #   driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
    #   driver.switch_to.parent_frame()
    #   driver.switch_to.window('main')
    # Enter the login iframe (id="login_frame") before touching its inputs.
    driver.switch_to.frame('login_frame')
    # driver.switch_to.frame(driver.find_element_by_xpath("//div[@id='login_div']/iframe"))
    print(driver.page_source)
import requests
import os
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='requests_session.log')


# Log in using requests.session
def login_withsession():
    """POST credentials to the demo OA site, then fetch the index page with
    the same session (so its cookies are reused)."""
    session = requests.session()
    params = {
        'emp_no': 'admin',
        'password': '******',
    }
    login_url = 'http://demo.smeoa.com/index.php?m=&c=public&a=check_login'
    response = session.post(login_url, params)
    responsenex = session.get('http://demo.smeoa.com/index.php?m=&c=index&a=index')
    # Fetch the cookies from the server response.
    # NOTE(review): RequestsCookieJar.get() requires a cookie name argument;
    # as written both calls raise TypeError -- get_dict() may be what was meant.
    cookies = response.cookies.get()
    cookies2 = response.cookies.get()
    print(responsenex.text)


# Log in using requests.post (body truncated in this chunk)
def login_oswith_request_post():
    data = {
import subprocess
import os
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='identify_picture_code.log')


def identify_picture_code():
    """Run the tesseract OCR CLI on '3.png' and log the first line of the
    recognised text.

    tesseract is invoked as `tesseract <input> <output-base>`; it appends
    '.txt' to the output base, hence reading '3.png.txt' afterwards.

    Fix: the result file is now read inside a `with` block so the handle is
    always closed (the original leaked it).
    """
    p = subprocess.Popen(['tesseract', '3.png', '3.png'],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    p.wait()  # block until OCR finishes writing the output file
    # Read the recognition result and strip the trailing newline.
    with open('3.png.txt', 'r') as test_file:
        line = test_file.readline()
    line = line.replace('\n', '')
    logger.debug(type(line))
    logger.debug(line)


if __name__ == '__main__':
    identify_picture_code()
import http.cookiejar
import urllib.request
import os
import re
import urllib.parse
import random
# Extracting the data with xpath did not work
# from lxml import etree
from Tools.check_proxy import check_proxy
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='login_csdnwithcookie.log')


def login_csdnwithcookie():
    """Log in to CSDN using a cookie jar, optionally through a proxy
    (chunk truncated right after the proxy flag)."""
    # Validate and use a proxy; candidate proxies kept verbatim below.
    ''' HTTP 浙江省温州市 电信 117.87.178.31 HTTP 江苏省徐州市 电信 115.223.234.116 HTTP 浙江省温州市 电信 101.71.226.188 '''
    if_proxy = False  # whether to enable the proxy
import selenium.webdriver
import selenium.webdriver.common.keys
import os
import time
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='selenium_baidu.log')


def click_baidu():
    """Search for 'python' on baidu.com with PhantomJS, saving a screenshot
    before and after clicking the search button.

    Fix: the screenshot paths were built as `os.getcwd() + "baidu1.png"`,
    which drops the path separator and writes e.g. '/some/cwdbaidu1.png'
    into the parent directory; os.path.join puts the files in the CWD
    (matching how the sibling scripts build their screenshot paths).
    """
    driver = selenium.webdriver.PhantomJS()
    driver.get("https://www.baidu.com/")
    time.sleep(2)  # crude wait for the page to render
    # Type the query into the search box (id="kw").
    keyword_elem = driver.find_element_by_id('kw')
    keyword_elem.send_keys("python")
    driver.save_screenshot(os.path.join(os.getcwd(), "baidu1.png"))
    # Click the search button (id="su") and capture the result page.
    click_elem = driver.find_element_by_id("su")
    click_elem.click()
    driver.save_screenshot(os.path.join(os.getcwd(), "baidu2.png"))


if __name__ == '__main__':
    click_baidu()
import os
import re
import time
from selenium import webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='login_jd.log')


# Did not work: the elements were not loaded in time
def login_jd():
    """Log in to JD's mobile login page with headless Chrome
    (chunk truncated after typing the username)."""
    login_url = "https://plogin.m.jd.com/user/login.action?appid=100&kpkey=&returnurl=http%3A%2F%2Fhome.m.jd.com%2FmyJd%2Fhome.action%3Fsid%3D583ee9874b9874ddf1515a4ada050e44"
    aim_url = ""
    # Headless Chrome configuration.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(login_url)
    # Locate the login form fields.
    username_elem = driver.find_element_by_id("username")
    password_elem = driver.find_element_by_id("password")
    login_elem = driver.find_element_by_id("loginBtn")
    time.sleep(4)
    username_elem.send_keys('17688166224')
import os
import time
from selenium import webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name="dowload.log")


def dowload_file():
    """Configure headless Chrome to auto-download clicked files into the CWD
    (chunk truncated before the download click)."""
    index_url = "https://pypi.org/project/selenium/"
    current_dir = os.getcwd()
    # Headless Chrome configuration.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    # Configure click-to-download preferences.
    # This earlier attempt failed:
    # prefs = {"profile.default_content_settings.popups":0, "download.default_directory": r"/workspace/sofeware/spider/pro2/12 selenium设置文件点击自动下载/"}
    prefs = {
        'profile.default_content_settings.popups': 0,
        'download.default_directory': current_dir
    }
    options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(index_url)
    # Go to the download page
import time
import os
import selenium.webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='oa_login.log')


# Check whether we are logged in
def if_logined(username=None, passwd=None):
    """Log in to the demo OA site with PhantomJS and open the personal index
    page (chunk truncated inside the try block; no except clause visible)."""
    try:
        login_url = 'http://demo.smeoa.com/index.php?m=&c=public&a=login'
        driver = selenium.webdriver.PhantomJS()
        driver.get(login_url)
        time.sleep(2)  # crude wait for the form to render
        # Fill in and submit the login form.
        username_elem = driver.find_element_by_id('emp_no')
        passwd_elem = driver.find_element_by_id('password')
        login_elem = driver.find_element_by_id('login_btn')
        username_elem.send_keys(username)
        passwd_elem.send_keys(passwd)
        login_elem.click()
        time.sleep(2)
        # Load the personal home page to verify the session.
        person_index_url = 'http://demo.smeoa.com/index.php?m=&c=index&a=index'
        driver.get(person_index_url)
import urllib.parse
import urllib.request
import os
import json
import ssl
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='debug.log')


# Crawl zhaopin search results with a spoofed browser User-Agent and a
# url-encoded query string.
def get_zhilian(addr='深圳', position='python'):
    """Fetch one zhaopin search-result page, print its body, and return the
    open HTTPResponse.

    :param addr: city name for the `jl` query parameter
    :param position: keyword for the `kw` query parameter
    :return: the http.client.HTTPResponse from urlopen

    Fix: the header value started with a duplicated "User-Agent:" prefix
    ("User-Agent:Mozilla/5.0 ..."), which sends a malformed value -- the
    prefix is the header NAME and must not be repeated inside the value.
    """
    # Example of the final URL shape (kept from the original):
    "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=1"
    addr = {'jl': addr}
    addr = urllib.parse.urlencode(addr)
    position = {'kw': position}
    position = urllib.parse.urlencode(position)
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?' + addr + '&' + position + '&sm=0&p=1'
    headers = {
        'User-Agent': "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"
    }
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    print(response.read().decode())
    return response
import os
import time
from selenium import webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='shucheng.log')


# Book object: a book and its chapters
class Book():
    def __init__(self, booke_num=0, book_name="未知"):
        # NOTE(review): 'booke_num' typo kept -- it is the public attribute name.
        self.booke_num = booke_num
        self.book_name = book_name
        self.capter_url_list = []  # URLs of each chapter page
        self.capters = []  # content storage (list of Capter objects)


# Chapter object
class Capter():
    def __init__(self, capter_num, capter_title, content):
        self.capter_num = capter_num  # int
        self.capter_title = capter_title
        self.content = content


# Create a driver (body truncated in this chunk -- only the comment survives)
def create_driver(url):
    # Headless setup
#!/usr/local/python3/bin/python3
import os
import urllib.request
import http.cookiejar
import urllib.parse
import gzip
import selenium.webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='myprogram2.log')


# Capture a page's cookies by opening it through an opener with a CookieJar.
def read_cookie():
    """Fetch http://www.baidu.com through a proxy-aware opener and return the
    captured cookies joined as repeated "name=value;\\r\\n" segments."""
    # Proxy for https traffic (note: no port is specified on this IP).
    https_proxy = urllib.request.ProxyHandler({'https': '123.57.207.2'})
    jar = http.cookiejar.CookieJar()
    jar_processor = urllib.request.HTTPCookieProcessor(jar)
    opener = urllib.request.build_opener(jar_processor, https_proxy)
    response = opener.open('http://www.baidu.com')
    # The jar is populated as a side effect of opening the URL.
    cookies = ''.join('%s=%s;\r\n' % (item.name, item.value) for item in jar)
    logger.debug(cookies)
    return cookies
import time
import re
import os
from selenium import webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name="proxy.log")


# Storage object for one proxy record
class Proxy():
    def __init__(self, ip=None, port=0, type=None, addr=None):
        # NOTE(review): 'type' shadows the builtin, but it is part of the
        # public signature, so it is kept.
        self.ip = ip
        self.port = port
        self.type = type
        self.addr = addr


def create_driver(url):
    """Create a headless Chrome driver and open `url`.

    NOTE(review): chunk truncated -- the visible body never returns the
    driver (the sibling create_driver at L24 does `return driver`)."""
    # Headless setup
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
import os
import time
from selenium import webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name="play_video.log")


def play_video():
    """Click the big play button on videojs.com in headless Chrome, wait for
    playback to start, save a screenshot, then shut the browser down."""
    url = "http://videojs.com/"
    # Headless Chrome configuration.
    opts = webdriver.ChromeOptions()
    for flag in ("--headless", "--no-sandbox", "--disable-gpu"):
        opts.add_argument(flag)
    driver = webdriver.Chrome(chrome_options=opts)
    driver.get(url)
    # Start playback via the player's big play button.
    # video_elem = driver.find_element_by_id("preview-player")
    play_button = driver.find_element_by_class_name("vjs-big-play-button")
    play_button.click()
    # Give the video a few seconds to start before capturing the frame.
    time.sleep(5)
    driver.save_screenshot(os.getcwd() + "/play_video.png")
    driver.quit()
import urllib.request
import os
import re
from lxml import etree
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='jiaobenzhijia.log')


# Fetch a page's HTML source
def get_page_source(url=None):
    """Download `url` with a spoofed IE6 User-Agent and return the body
    decoded as gb2312; logs and returns None on a bad/None URL.

    NOTE(review): strict 'gb2312' may fail on characters that exist only in
    'gbk'/'gb18030' -- confirm the site's real encoding."""
    try:
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
        }
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request)
        return response.read().decode('gb2312')
    except ValueError as e:
        logger.error(e)
        logger.debug('参数不正确, 路由错误')


# Extract the needed URLs from the page (chunk truncated after parsing)
def get_info_url(url):
    page_source = get_page_source(url)
    # Article links and titles in the listing page.
    xpath = "//div[@class='artlist clearfix']/dl/dt/a/@href"
    xpath2 = "//div[@class='artlist clearfix']/dl/dt/a/@title"
    html = etree.HTML(page_source)
import re
import os
from selenium import webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='04 dowload_meinv.log')


def create_driver(url=None):
    """Create a headless Chrome driver, open `url`, and return the driver."""
    # Headless mode setup.
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get(url)
    return driver


# all_page_urls (kept commented out, as in the original)
# def get_all_page_urls():
#     url = 'http://m.umei.cc/p/gaoqing/rihan/1.htm'
#     # url = 'http://m.umei.cc/p/gaoqing/rihan/'
#     driver = create_driver(url)
#
#     res_page_num = "<strong id='pagelist_all'>(\d+)</strong>"
#     page_num = re.findall(res_page_num, driver.page_source)
import random
import urllib.request
import urllib.parse
import os
import re
import time
from lxml import etree
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='etree.log')


def get_page_source(url=None):
    """Download `url` with a mobile-browser User-Agent and return the body
    decoded as utf-8; logs and returns None when the URL is malformed."""
    headers = {
        "User-Agent": "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
    }
    # Guard against a None/invalid url.
    try:
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request)
        return response.read().decode('utf-8')
    except ValueError as e:
        logger.debug('url格式有误')


# NOTE(review): chunk truncated inside this function's docstring.
def get_page_urls():
    '''
    'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python&isadv=0&sg=39e470246d7e4727944af8c5e9417893&p=4'
import urllib.request
import os
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='get_picture.log')


def get_picture():
    """Fetch a WeChat article with urllib and print its decoded body."""
    response = urllib.request.urlopen(
        "https://mp.weixin.qq.com/s/MevhhvosfM3Q9SRaUtpN4Q")
    print(dir(response))  # debug aid: inspect the response object
    print(response.read().decode('utf-8'))


def get_picture_with_requests():
    """Fetch the same article with requests (TLS verification disabled)."""
    import requests
    response = requests.get(
        'https://mp.weixin.qq.com/s/MevhhvosfM3Q9SRaUtpN4Q', verify=False)
    print(response.text)


# Using selenium
def get_picture_with_selenim():
    """Open the article in PhantomJS (chunk truncated after driver.get)."""
    import selenium.webdriver
    url = 'https://mp.weixin.qq.com/s/MevhhvosfM3Q9SRaUtpN4Q'
    driver = selenium.webdriver.PhantomJS()
    driver.get(url)
    # picture_span_elems = driver.find_elements_by_xpath("//span[@style='font-size: 15px;']//img/")
import os
import time
from selenium import webdriver
from Tools.tools import debug_log

logger = debug_log(os.getcwd(), name='login_taobao.log')


# The web page has no login button; scrape it directly.
def login_taobao():
    """Stub: open `login_url` (currently empty) in headless Chrome and dump
    the page source."""
    login_url = ""
    aim_url = ""
    # Headless Chrome configuration.
    opts = webdriver.ChromeOptions()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        opts.add_argument(flag)
    driver = webdriver.Chrome(chrome_options=opts)
    driver.get(login_url)
    print(driver.page_source)