# -*- coding:utf-8 -*-
# author: 范佳郡
# datetime: 2019-05-04 22:50
# software: python_learn
from urllib import request, parse, error
import chardet
import ssl
import json
import requests
from http import cookiejar
def htmlfanyi(url, kw):
    try:
        # Put the POST fields in a dict so parse.urlencode can serialize them
        data = {
            'kw': kw
        }
        # URL-encode the keyword and convert it to bytes; only an encoded
        # byte string is accepted as a request body
        data = parse.urlencode(data).encode()
        print('Encoded keyword payload: {0}'.format(data))
        # Disable SSL certificate verification (demo only; a verified context
        # from ssl.create_default_context() is the safer default)
        ssl._create_default_https_context = ssl._create_unverified_context
        # context = ssl._create_unverified_context()
        '''
        # Method 1: handle the request with urlopen directly,
        # passing the POST body along with the translation URL
        rsp = request.urlopen(url=url, data=data, context=context)
        '''
        # Method 2: handle the request with a Request object, which takes headers.
        # The User-Agent (UA) header is part of the headers; the server
        # inspects it to identify the client. It can be set in two ways.
        # Way 1: pass it in the headers dict
        headers = {
            'Content-Length': str(len(data)),
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        rsp = request.Request(url=url, data=data, headers=headers)
        # Way 2: call add_header on the Request instance
        rsp.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36')
        # With the Request instance built, all request details are bundled in
        # it, so urlopen can be called on the instance itself
        rsp = request.urlopen(rsp)
        # Read the response returned for the submitted keyword
        json_data = rsp.read().decode()
        print(type(json_data))
        # Parse the JSON string into a dict
        json_data = json.loads(json_data)
        print(json_data)
        # Print each suggested keyword and its translation
        for item in json_data['data']:
            print(item['k'], '----', item['v'])
    # HTTPError corresponds to HTTP status errors; it is raised when the
    # response code is 400 or above
    except error.HTTPError as e:
        print('HttpError:{0}'.format(e))
    # URLError usually means a network problem, including a bad URL
    except error.URLError as e:
        print('UrlError:{0}'.format(e))
    except Exception as e:
        print('Caught error: {0}'.format(e))

# Inheritance, from most general to most specific: OSError > URLError > HTTPError
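
# For comparison, a minimal sketch of the same sug request using the requests
# library imported above. It assumes the same endpoint and JSON shape
# ({'data': [{'k': ..., 'v': ...}]}) that htmlfanyi parses; the function name
# is hypothetical.
def htmlfanyi_requests(url, kw):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = requests.post(url, data={'kw': kw}, headers=headers, timeout=10)
        # Raise for 4xx/5xx status codes, mirroring the HTTPError branch above
        res.raise_for_status()
        for item in res.json().get('data', []):
            print(item['k'], '----', item['v'])
    except requests.RequestException as e:
        print('requests error: {0}'.format(e))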
def htmlfinder(url, wd):
    # Query parameters go in a dict so parse.urlencode can serialize them
    wd = {
        'wd': wd
    }
    # Percent-encode the query string
    wd = parse.urlencode(wd)
    print(wd)
    fullurl = url + wd
    print('Assembled URL: {0}'.format(fullurl))
    # A readable URL with the raw, unencoded parameter would not be accepted:
    # fullurl = 'http://www.baidu.com/s?wd=大熊猫'
    # Fetch the search results directly
    rsp = request.urlopen(fullurl)
    html = rsp.read()
    # Print metadata about the response
    print("URL: {0}".format(rsp.geturl()))
    print("Info: {0}".format(rsp.info()))
    print("Code: {0}".format(rsp.getcode()))
    # Detect the page's encoding automatically so it can be decoded and printed
    cs = chardet.detect(html)
    print('chardet result: {0}'.format(cs))
    # chardet may report the encoding as None, so fall back to utf-8 then
    # (a plain cs.get('encoding', 'utf-8') would still return the None value)
    html = html.decode(cs.get('encoding') or 'utf-8')
    print(html)
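
# A minimal round-trip sketch of the encoding step htmlfinder relies on;
# standard library only, and the sample value '大熊猫' is illustrative.
def demo_urlencode():
    encoded = parse.urlencode({'wd': '大熊猫'})
    print(encoded)  # wd=%E5%A4%A7%E7%86%8A%E7%8C%AB
    # parse.unquote reverses the percent-encoding of the value
    print(parse.unquote(encoded.split('=', 1)[1]))  # 大熊猫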
def htmlproxy(url):
    # Basic proxy flow (4 steps); free proxy lists: www.xicidaili.com, www.goubanjia.com
    # 1. Set the proxy address
    proxy = {'http': '58.253.154.109:9999'}
    # 2. Create a ProxyHandler for it
    proxyhandler = request.ProxyHandler(proxy)
    # 3. Build an opener from the handler
    opener = request.build_opener(proxyhandler)
    # 4. Install the opener globally
    request.install_opener(opener)
    try:
        rsp = request.urlopen(url=url)
        html = rsp.read().decode()
        print(html)
    except Exception as e:
        print(e)
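
# Alternative sketch: use the opener directly instead of installing it
# globally with install_opener, so the proxy applies only to requests made
# through this opener. The default proxy address is the illustrative one
# from htmlproxy and would need to be replaced with a live proxy.
def htmlproxy_local(url, proxy_addr='58.253.154.109:9999'):
    opener = request.build_opener(request.ProxyHandler({'http': proxy_addr}))
    try:
        print(opener.open(url).read().decode())
    except Exception as e:
        print(e)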
def htmllogin_session():
    # res = requests.post('http://nladuo.cn:90/crawler_lesson2/do_login.php', data={'uname':'nladuo','passwd':'nladuo'})
    # res = requests.get('http://nladuo.cn/crawler_lesson2/private.php')
    # res.encoding ='utf-8'
    # Use a Session so the cookies set by the login POST are sent
    # automatically on the follow-up request
    session = requests.Session()
    res = session.post('http://nladuo.cn:90/crawler_lesson2/do_login.php',
                       data={'uname': 'nladuo', 'passwd': 'nladuo'})
    res = session.get('http://nladuo.cn/crawler_lesson2/private.php')
    res.encoding = 'utf-8'
    if res.status_code == 200:
        print(res.text)
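
# A small sketch showing why the Session flow works: the cookies returned by
# do_login.php live in session.cookies and are replayed on the follow-up GET.
# Same nladuo demo endpoints as above; the function name is hypothetical.
def htmllogin_session_cookies():
    session = requests.Session()
    session.post('http://nladuo.cn:90/crawler_lesson2/do_login.php',
                 data={'uname': 'nladuo', 'passwd': 'nladuo'})
    # Inspect whatever Set-Cookie headers the login response carried
    for c in session.cookies:
        print(c.name, '=', c.value)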
def htmllogin_cookie():
    '''
    # Way 1: log in with a cookie captured manually
    url = 'http://www.renren.com/290945236/profile'
    # Copy the logged-in session's cookie from the browser dev tools (F12)
    headers = {
        'Cookie': 'anonymid=jvt2qxshui2rua; depovince=GW; jebecookies=a7626a85-b224-4e30-b3d7-c12e67e2d4d4|||||; _r01_=1; JSESSIONID=abc4xyp-gRZi56QnlBjRw; ick_login=c056991b-81dd-44e0-be54-2aec7fb061f9; _de=CBC1F09DABF094CB727E6F614F21270D; p=91eb1d36eb38e1dac3c292228f0985ed6; first_login_flag=1; ln_uact=18767101020; ln_hurl=http://hdn.xnimg.cn/photos/hdn521/20111005/1700/h_main_Snnk_2095000200b32f75.jpg; t=eeb3b8930db21a2690b5473b65a06ad36; societyguester=eeb3b8930db21a2690b5473b65a06ad36; id=290945236; ver=7.0; jebe_key=c21984c4-e831-41c9-9a55-0cd1f195b183%7Cdbdf9d3356b37bca42cf2586a2a96f51%7C1558157549546%7C1%7C1558157549601; wp_fold=0; xnsid=36927821; loginfrom=null'
    }
    rsp = request.Request(url=url, headers=headers)
    rsp = request.urlopen(rsp)
    html = rsp.read().decode()
    with open('rsp.html', 'w') as f:
        f.write(html)
    '''
    # Way 2: obtain the cookie automatically during login.
    # Create a cookiejar instance
    # Option 1: a plain in-memory CookieJar
    cookie = cookiejar.CookieJar()
    # Option 2: a file-backed cookiejar that can persist to disk
    filename = 'cookie.txt'
    cookie = cookiejar.MozillaCookieJar(filename)
    # Build the cookie processor around the jar
    cookie_handler = request.HTTPCookieProcessor(cookie)
    # Build the HTTP handler
    http_handler = request.HTTPHandler()
    # Build the HTTPS handler
    https_handler = request.HTTPSHandler()
    # Build the opener that routes requests through the handlers
    opener = request.build_opener(http_handler, https_handler, cookie_handler)
    # Open the login page and log in with a username and password
    url = 'http://www.renren.com/PLogin.do'
    data = {
        'email': '18767101020',
        'password': '18767101020'
    }
    data = parse.urlencode(data)
    res = request.Request(url=url, data=data.encode())
    # Send the request through the opener
    res = opener.open(res)
    # Print the cookie if needed
    '''
    Cookie attributes:
    name: cookie name
    value: cookie value
    domain: domain allowed to access this cookie
    path: page path allowed to access this cookie
    expires: expiry time
    size: size
    Http: the HTTP(-only) field
    '''
    # print(cookie)
    # for item in cookie:
    #     # print('item type: {0}'.format(type(item)))
    #     print('item content: {0}'.format(item))
    #     for i in dir(item):
    #         print('item attribute: {0}'.format(i))
    # Save the cookies to the file backing the MozillaCookieJar
    # ignore_discard: save cookies even if they are marked to be discarded
    # ignore_expires: save cookies even if they have already expired
    cookie.save(ignore_discard=True, ignore_expires=True)
    # URL of the private page that requires login
    urlprive = 'http://www.renren.com/290945236/profile'
    # The opener reuses the cookie captured from the login response,
    # so the private page can be fetched directly
    res = opener.open(urlprive)
    htmlprive = res.read().decode()
    with open('rsp.html', 'w') as f:
        f.write(htmlprive)
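
# Note: build_opener() adds the default HTTP/HTTPS handlers on its own, so an
# equivalent opener only needs the cookie processor; a minimal sketch with a
# hypothetical helper name:
def build_cookie_opener(jar):
    return request.build_opener(request.HTTPCookieProcessor(jar))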
def htmlcookietxt():
    # Load the cookies that htmllogin_cookie saved to cookie.txt
    cookie = cookiejar.MozillaCookieJar()
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    # Rebuild an opener around the loaded jar
    cookie_handler = request.HTTPCookieProcessor(cookie)
    http_handler = request.HTTPHandler()
    https_handler = request.HTTPSHandler()
    opener = request.build_opener(http_handler, https_handler, cookie_handler)
    # Revisit the private page using only the cookies loaded from disk
    urlprive2 = 'http://www.renren.com/290945236/profile'
    res = opener.open(urlprive2)
    htmlprive2 = res.read().decode()
    with open('rsp2.html', 'w') as f:
        f.write(htmlprive2)
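
# Hypothetical helper sketch: load the saved jar and drop anything already
# expired before reuse; clear_expired_cookies() is part of http.cookiejar.
def htmlcookiecheck(filename='cookie.txt'):
    cookie = cookiejar.MozillaCookieJar()
    cookie.load(filename, ignore_discard=True, ignore_expires=True)
    cookie.clear_expired_cookies()
    for item in cookie:
        print('{0}={1} (domain={2}, expires={3})'.format(
            item.name, item.value, item.domain, item.expires))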
if __name__ == '__main__':
    pass  # an if-suite of only comments would be a syntax error
    # htmlfanyi(url='https://fanyi.baidu.com/sug', kw='girl')
    # kw = input("Input your keyword:")
    # htmlfinder(url='http://www.baidu.com/s?', wd=kw)
    # htmlproxy(url='http://www.baidu.com')
    # htmllogin_session()
    # htmllogin_cookie()
    # htmlcookietxt()
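    # Sketch helpers added above (hypothetical names):
    # htmlfanyi_requests(url='https://fanyi.baidu.com/sug', kw='girl')
    # demo_urlencode()
    # htmlcookiecheck()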