forked from cmjs/copy2bbs
/
spider.py
51 lines (40 loc) · 1.9 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
class Spider(object):
    """Fetch one article page and turn it into a BBCode-style forum post.

    Attributes:
        spiderUrl: URL of the article page to fetch.
        subject: UTF-8 encoded page title; '' until ``spide()`` succeeds.
        content: UTF-8 encoded post body; '' until ``spide()`` succeeds.
    """

    def __init__(self, url):
        """Remember the article URL and start with an empty subject/content."""
        self.spiderUrl = url
        self.subject = ''
        self.content = ''

    def _download(self):
        """Return the page at ``spiderUrl`` decoded as UTF-8, or None.

        On ``urllib2.URLError`` the error's ``code`` and/or ``reason``
        (whichever the exception carries) is printed and None is returned.
        """
        try:
            response = urllib2.urlopen(urllib2.Request(self.spiderUrl))
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read()/decode() fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            return None

    def spide(self):
        """Download the article and fill in ``subject`` and ``content``.

        The post body is the summary div wrapped in [b]...[/b], followed by
        one tab-indented line per body child that has a single string, and a
        centered [img] tag for each child whose first element is an <img>.
        The result is wrapped in a [font] tag, followed by a footer naming
        the source URL, and is printed to stdout.

        Returns:
            True on success. False if the download failed or the page does
            not contain the expected title / summary / body elements; in
            that case ``subject`` and ``content`` are left as ''.
        """
        self.subject = ''
        self.content = ''

        html = self._download()
        if html is None:
            return False

        # NOTE(review): no parser is named here, so bs4 picks whichever one
        # is installed; left unchanged to avoid altering parse results.
        soup = BeautifulSoup(html)

        title = soup.title
        summary = soup.find("div", class_="article-experpt explain")
        main_body = soup.find("div", class_="js-article-body")
        # Any of these being absent used to surface as an AttributeError on
        # None; report it as an ordinary failure instead.
        if (title is None or title.string is None
                or summary is None or summary.string is None
                or main_body is None):
            print('unexpected page layout: ' + self.spiderUrl)
            return False

        # Collect the pieces and join once rather than growing a string.
        parts = ['[b]' + summary.string.output_ready() + '[/b]' + '\n']
        for child in main_body.children:
            text = child.string
            if text:
                parts.append('\t' + text.output_ready() + '\n')
                continue
            # getattr: a node without a ``contents`` attribute is skipped.
            contents = getattr(child, 'contents', None)
            if not contents:
                continue
            first = contents[0]
            if getattr(first, 'name', None) == 'img':
                src = first.get('src')
                if src:  # skip images that carry no src attribute
                    parts.append('[align=center][img=660,440]' + src
                                 + '[/img][/align]' + '\n')

        body = ''.join(parts).encode('utf-8')
        self.subject = title.string.output_ready().encode('utf-8')
        # Footer literals are Chinese for "reposted from <url>" and
        # "automatic repost"; they are emitted verbatim.
        self.content = ('[font=微软雅黑]' + body + '[/font]' + '\n\n\n\n '
                        + '本文转自' + self.spiderUrl + '\n'
                        + '\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t-----自动转贴')
        print(self.content)
        return True
if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the sample article
    # URL that the script was originally hard-wired to.
    target_url = (sys.argv[1] if len(sys.argv) > 1
                  else 'http://iranshao.com/articles/2068-marathon_hundred')
    spider = Spider(target_url)
    # spide() returns True/False; surface that as the process exit status.
    sys.exit(0 if spider.spide() else 1)