forked from MagicDu/wpspider
/
wpspider.py
executable file
·213 lines (193 loc) · 5.96 KB
/
wpspider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
from wordpress_xmlrpc import Client,WordPressPost
from wordpress_xmlrpc.compat import xmlrpc_client
from wordpress_xmlrpc.methods import media, posts
from wordpress_xmlrpc.methods.posts import GetPosts,NewPost
from wordpress_xmlrpc.methods.users import GetUserInfo
from email.mime.text import MIMEText
from email.header import Header
import re
import time
import smtplib
import traceback
import os,random
import requests
# Pool of User-Agent strings: filled by load_user_agent() and sampled at
# random for each outgoing page request.
user_agents = []
# News article model
class News(object):
    """A scraped news article, holding everything needed to publish it.

    The constructor stores its arguments unchanged as attributes of the
    same name.
    """

    def __init__(self, title, tags, category, content, image_name):
        self.title = title            # headline
        self.tags = tags              # tags
        self.category = category      # category
        self.content = content        # article body
        self.image_name = image_name  # file name of the article image

    def __repr__(self):
        # Leave the (potentially very long) body out of the debug output.
        return '{}(title={!r}, tags={!r}, category={!r}, image_name={!r})'.format(
            type(self).__name__, self.title, self.tags, self.category,
            self.image_name)
# Derive the host name from a URL
def getHost(url):
    """Return the last two dot-separated labels of the host in *url*.

    Only ``http://`` and ``https://`` URLs are recognised; anything else
    yields an empty string. The scheme and host are matched
    case-insensitively and the result is lower-cased, so
    ``HTTP://WWW.JRZJ.COM/`` gives ``jrzj.com``. A host with fewer than two
    labels (e.g. ``localhost``) is returned whole.

    NOTE: this is a plain "last two labels" rule; it does not know about
    multi-part public suffixes such as ``co.uk``.
    """
    found = re.match(r'^https?:\/\/([a-z0-9\-\.]+)[\/\?]?', url, re.IGNORECASE)
    if not found:
        return ''
    labels = found.group(1).lower().split('.')
    return '.'.join(labels[-2:])
# Fetch the list of newest article links
def get_urls(url, n=1):
    """Return up to *n* article links found on the listing page *url*.

    Args:
        url: address of the page to scrape.
        n: number of links to return, i.e. how many new posts to publish
            per run.

    Returns:
        A list of ``href`` values taken, in document order, from the
        anchors inside the ``div.main2_left_fir_left`` element. The list is
        empty when that element is not present on the page.

    Raises:
        requests.RequestException: on network failure, timeout or a
            non-success HTTP status.
    """
    # NOTE: Referer/Host are fixed to jrzj.com; adjust them when pointing
    # this at another site.
    headers = {
        'Referer': 'http://www.jrzj.com',
        'Host': 'www.jrzj.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }
    # With no user agents loaded, fall back to the default of `requests`
    # instead of failing on an empty pool.
    if user_agents:
        headers['User-Agent'] = random.choice(user_agents)

    # A timeout keeps a stalled server from hanging the whole run.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    container = soup.find('div', {'class': 'main2_left_fir_left'})
    if container is None:
        return []
    # Keep only anchors that actually carry an href, then cut to n, so
    # href-less anchors do not use up the quota.
    hrefs = [anchor['href'] for anchor in container.find_all('a', href=True)]
    return hrefs[:n]
# Load the user_agents configuration file
def load_user_agent(path='user_agents'):
    """Append every non-blank line of the file at *path* to ``user_agents``.

    Args:
        path: file holding one User-Agent string per line; defaults to
            ``user_agents`` in the current working directory.

    Blank lines are skipped rather than treated as end-of-file, so entries
    that follow an empty line are still loaded. Surrounding whitespace is
    stripped from each entry.
    """
    # The with-block closes the file even if reading fails part-way.
    with open(path, 'r') as fp:
        for raw_line in fp:
            agent = raw_line.strip()
            if agent:
                user_agents.append(agent)
# Split an article page into its parts
def get_news(link):
    """Download the article at *link* and return it as a ``News`` object.

    The parsing rules are written for jrzj.com; to scrape another site,
    analyse its article markup and rewrite this function.

    Extracted parts:
        * title: text of the first ``<h1>``;
        * tags: comma-separated ``<meta name="keywords">`` content
          (empty list when the tag is absent);
        * category: second ``_``-separated field of the ``<title>`` text;
        * content: prettified ``div.news_content`` with its first image
          removed;
        * image: the first ``<img>`` inside the content, downloaded locally
          through ``get_image``.

    Raises:
        requests.RequestException: on network failure, timeout or a
            non-success HTTP status.
        ValueError: when the page lacks one of the required elements.
    """
    from urllib.parse import urljoin

    # NOTE: Referer/Host are fixed to jrzj.com, matching the parsing rules.
    headers = {
        'Referer': 'http://www.jrzj.com',
        'Host': 'www.jrzj.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }
    # With no user agents loaded, fall back to the default of `requests`.
    if user_agents:
        headers['User-Agent'] = random.choice(user_agents)

    response = requests.get(link, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    heading = soup.h1
    if heading is None:
        raise ValueError('no <h1> title found at ' + link)
    title = heading.get_text()

    keywords = soup.find('meta', {'name': 'keywords'})
    raw_tags = keywords.attrs.get('content', '') if keywords is not None else ''
    tags = [tag.strip() for tag in raw_tags.split(',') if tag.strip()]

    page_title = soup.title.get_text() if soup.title is not None else ''
    title_parts = page_title.split('_')
    if len(title_parts) < 2:
        raise ValueError('cannot read the category from the page title at ' + link)
    category = title_parts[1]

    content = soup.find('div', {'class': 'news_content'})
    if content is None:
        raise ValueError('no news_content block found at ' + link)

    # Locate the article image.
    img_tag = content.find('img')
    if img_tag is None or not img_tag.attrs.get('src'):
        raise ValueError('no article image found at ' + link)
    # Resolve relative and protocol-relative sources against the page URL.
    image_url = urljoin(link, img_tag.attrs['src'])
    # Drop anything after '!' in the file name
    # (presumably an image-variant suffix -- TODO confirm).
    image_name = os.path.basename(image_url).split('!')[0]

    # Download the image, then remove its tag from the body.
    get_image(image_url, image_name)
    img_tag.extract()

    return News(title, tags, category, content.prettify(), image_name)
# Download an image
def get_image(image_url, image_name):
    """Save the image at *image_url* locally as ``images/<image_name>``.

    The ``images`` directory is created on demand.

    Raises:
        ValueError: if *image_name* is empty or contains a path component.
            The name originates from scraped pages, so it must not be able
            to point outside ``images``.
    """
    if (not image_name or image_name in ('.', '..')
            or os.path.basename(image_name) != image_name):
        raise ValueError('invalid image file name: %r' % (image_name,))
    os.makedirs('images', exist_ok=True)
    urlretrieve(image_url, os.path.join('images', image_name))
# Upload an image
def upload_image(image_name, client):
    """Upload ``images/<image_name>`` to WordPress and return its attachment id.

    Args:
        image_name: file name inside the local ``images`` directory.
        client: object whose ``call`` method executes the upload request.

    Returns:
        The ``id`` field of the upload response.
    """
    import mimetypes

    # Derive the MIME type from the file extension so PNG/GIF files are not
    # labelled as JPEG; keep the former fixed value as the fallback.
    guessed_type = mimetypes.guess_type(image_name)[0]
    if not guessed_type or not guessed_type.startswith('image/'):
        guessed_type = 'image/jpeg'

    with open('images/' + image_name, 'rb') as img:
        payload = xmlrpc_client.Binary(img.read())

    data = {
        'name': image_name,
        'type': guessed_type,
        'bits': payload,
    }
    response = client.call(media.UploadFile(data))
    return response['id']
# Publish a news item to WordPress
def send_news(yourwebsit, username, password, news):
    """Upload the article image and publish *news* as a WordPress post.

    Args:
        yourwebsit: your WordPress address followed by ``xmlrpc.php``.
        username: WordPress login name.
        password: WordPress login password.
        news: the news object to publish.
    """
    client = Client(yourwebsit, username, password)
    # The uploaded image becomes the post thumbnail.
    thumbnail_id = upload_image(news.image_name, client)

    article = WordPressPost()
    article.title = news.title
    article.content = news.content
    article.post_status = 'publish'
    article.thumbnail = thumbnail_id
    article.terms_names = {'post_tag': news.tags, 'category': [news.category]}

    client.call(NewPost(article))
# Record an article title in a file
def write_file(str_title):
    """Append *str_title* to ``title.txt`` in the current directory.

    The file is written as UTF-8 so that non-ASCII (e.g. Chinese) titles
    are stored correctly whatever the platform's default encoding is.
    """
    with open('title.txt', 'a', encoding='utf-8') as f:
        f.write(str_title)
# Send an e-mail
def send_email(mail_user,mail_postfix,sender,receiver,smtpserver,message,subject,username,password):
    """Send a plain-text UTF-8 e-mail; failures are reported, not raised.

    Args:
        mail_user: sender name (local part of the From address).
        mail_postfix: mail domain (part after ``@`` of the From address).
        sender: envelope sender address.
        receiver: recipient address, or a list of addresses (a 139.com
            mailbox can be used here).
        smtpserver: SMTP server address.
        message: message body.
        subject: message subject.
        username: SMTP login name.
        password: SMTP login password.

    Example (Sina mail):
        send_email('user','sina.com','user@sina.com','xxxx@qq.com','smtp.sina.com','您的爬虫出现异常\n'+m,'wpspider','user@sina.com','abc123')

    SMTP and connection errors are caught and printed, so a failed
    notification never raises out of this function.
    NOTE(review): the login is sent without STARTTLS/SSL -- confirm the
    server accepts that before relying on it.
    """
    msg = MIMEText(message, 'plain', 'utf-8')
    me = "Wpspider" + "<" + mail_user + "@" + mail_postfix + ">"
    msg['From'] = Header(me)
    msg['To'] = receiver if isinstance(receiver, str) else ', '.join(receiver)
    msg['Subject'] = Header(subject, 'utf-8')
    try:
        # The context manager closes the connection even when login or
        # sendmail fails.
        with smtplib.SMTP(smtpserver) as smtp:
            smtp.login(username, password)
            smtp.sendmail(sender, receiver, msg.as_string())
        print("邮件发送成功")
    except (smtplib.SMTPException, OSError) as e:
        # OSError also covers DNS, refused-connection and timeout failures.
        print("Error: 无法发送邮件", e)
# Example run against jrzj.com.
# The recipient can be a 139.com mailbox so that failures are also delivered
# as SMS notifications.
try:
    load_user_agent()
    links = get_urls('http://www.jrzj.com', 1)
    for link in links:
        news = get_news(link)
        # Record the title of each scraped article.
        write_file(news.title + '\n')
        time.sleep(5)
        send_news('http://blog.abc.com/xmlrpc.php', 'username', 'password', news)
except Exception:
    # Any failure is reported by e-mail together with the full traceback.
    trace = traceback.format_exc()
    send_email(
        'user', 'sina.com', 'user@sina.com', 'xxxx@139.com', 'smtp.sina.com',
        '您的爬虫出现异常\n' + trace, 'wpspider', 'user@sina.com', 'abc123')