-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
119 lines (89 loc) · 3.91 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import datetime
import logging
import re
import unicodedata
import urllib
from xml.dom.minidom import parse, parseString

import lxml
from lxml.html.clean import Cleaner

from model import db, Author, News
class Craww:
    """Crawler that stores TechCrunch articles and their authors.

    ``craw`` reads the FeedBurner RSS feed; every linked article whose URL is
    not stored yet is downloaded, scraped with regular expressions and
    persisted through the ``News`` and ``Author`` models.

    The code targets the Python 2 APIs the module already relies on
    (``urllib.urlopen`` and ``str.decode``).
    """

    FEED_URL = "http://feeds.feedburner.com/TechCrunch/"
    AUTHOR_URL_PREFIX = "http://techcrunch.com"
    PUBLISHED_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Scraping patterns, compiled once instead of on every crawled article.
    # NOTE(review): both ``.*`` in the title pattern are greedy and DOTALL is
    # on, so the capture starts after the last ``>`` that precedes the last
    # ``</h1`` in the page. A headline containing inline tags is therefore
    # truncated. Left unchanged because a fix must be checked against the
    # live page markup.
    _RE_TITLE = re.compile(r'<h1.*>(.*)</h1', re.S)
    _RE_CONTENT = re.compile(r'<!-- Begin: Wordpress Article Content -->(.*)<!-- End: Wordpress Article Content -->', re.S)
    _RE_PUBLISHED = re.compile(r'name="sailthru.date"\scontent="(.*?)"')
    _RE_AUTHOR = re.compile(r'<a\shref="(.*?)"\stitle.*?rel="author">(.*?)<\/a>.*?rel="external">(.*?)<\/a>')
    _RE_TAG = re.compile(r'<[^>]*?>')
    _RE_WHITESPACE = re.compile(r'\s+')

    _log = logging.getLogger(__name__)

    def craw(self):
        """Fetch the RSS feed and crawl every linked article not stored yet."""
        feed = urllib.urlopen(self.FEED_URL)
        try:
            feed_xml = feed.read()
        finally:
            # Release the connection even when reading fails.
            feed.close()

        feed_dom = parseString(feed_xml)
        for link in feed_dom.getElementsByTagName('feedburner:origLink'):
            # An empty <feedburner:origLink/> element has no text node.
            if link.firstChild is None:
                continue
            url = link.firstChild.nodeValue
            if not isinstance(self.findNewsByUrl(url), News):
                self.crawNews(url)

    def crawNews(self, url):
        """Download one article page and persist the article and its author.

        A page that does not contain all the expected markers is logged and
        skipped, so a single unexpected layout does not abort the whole crawl.
        Download errors and a publication date that does not match
        ``PUBLISHED_FORMAT`` still propagate to the caller.

        :param url: address of the article page.
        :returns: ``None``.
        """
        page_source = lxml.html.tostring(lxml.html.parse(url))
        fields = self._extract_article_fields(page_source)
        if fields is None:
            self._log.warning(
                "Skipping %s: page does not match the expected article layout",
                url)
            return

        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.comments = True

        title = self._plain_text(cleaner, fields['title'])
        content = self._plain_text(cleaner, fields['content'])
        published_on = datetime.datetime.strptime(
            fields['published'], self.PUBLISHED_FORMAT)

        news = self.save_news(url, title, content, published_on)

        author_url = self.AUTHOR_URL_PREFIX + fields['author_path']
        author = self.findAuthorByUrl(author_url)
        if not isinstance(author, Author):
            author = self.save_author(
                author_url, fields['author_name'], fields['author_twitter'], '')
        self.newsAuthor(news, author)

    def _extract_article_fields(self, page_source):
        """Pull the raw article fields out of a page's HTML source.

        :param page_source: serialized HTML of the article page.
        :returns: dict with the keys ``title``, ``content``, ``published``,
            ``author_path``, ``author_name`` and ``author_twitter`` holding
            the raw captured strings, or ``None`` when any marker is missing
            or the captured title or body is blank.
        """
        title = self._RE_TITLE.search(page_source)
        content = self._RE_CONTENT.search(page_source)
        published = self._RE_PUBLISHED.search(page_source)
        author = self._RE_AUTHOR.search(page_source)
        if None in (title, content, published, author):
            return None
        # A blank fragment cannot be cleaned later on (lxml rejects empty
        # documents), so treat it like a missing marker.
        if not title.group(1).strip() or not content.group(1).strip():
            return None
        return {
            'title': title.group(1),
            'content': content.group(1),
            'published': published.group(1),
            'author_path': author.group(1),
            'author_name': author.group(2),
            'author_twitter': author.group(3),
        }

    def _plain_text(self, cleaner, fragment):
        """Clean an HTML fragment, drop its tags and collapse whitespace.

        :param cleaner: configured ``lxml.html.clean.Cleaner``.
        :param fragment: HTML fragment as a byte string.
        :returns: the remaining text, decoded from UTF-8 and stripped.
        """
        text = self._RE_TAG.sub('', cleaner.clean_html(fragment))
        text = self._RE_WHITESPACE.sub(' ', text)
        return text.decode('utf-8').strip()

    def findAuthorByUrl(self, url):
        """Return the first ``Author`` whose ``url`` equals *url*, or ``None``."""
        return Author.query.filter(Author.url == url).first()

    def findNewsByUrl(self, url):
        """Return the first ``News`` whose ``url`` equals *url*, or ``None``."""
        return News.query.filter(News.url == url).first()

    def newsAuthor(self, news, author):
        """Attach *author* to *news* unless the article already has an author.

        :returns: *news*.
        """
        # NOTE(review): the filter matches ANY author linked to this article,
        # not specifically *author*; confirm that "at most one author per
        # article" is the intended rule.
        linked = author.query.filter(Author.news.any(id=news.id)).first()
        if not isinstance(linked, Author):
            news.author.append(author)
            db.session.commit()
            db.session.refresh(news)
        return news

    def save_news(self, url, title, content, published_on):
        """Return the ``News`` row titled *title*, creating it when absent.

        An existing row is returned unchanged.

        NOTE(review): the lookup is by title while ``craw`` de-duplicates by
        URL; confirm the mismatch is intended.
        """
        news = News.query.filter(News.title == title).first()
        if not isinstance(news, News):
            news = News()
            news.url = url
            news.title = title
            news.content = content
            news.published_on = published_on
            db.session.add(news)
            db.session.commit()
            db.session.refresh(news)
        return news

    def save_author(self, author_url, author_name, author_twitter, author_bio):
        """Return the ``Author`` row for *author_url*, creating it when absent.

        An existing row is returned unchanged.
        """
        author = Author.query.filter(Author.url == author_url).first()
        if not isinstance(author, Author):
            author = Author()
            author.url = author_url
            author.name = author_name
            author.twitter = author_twitter
            author.bio = author_bio
            db.session.add(author)
            db.session.commit()
            db.session.refresh(author)
        return author
if __name__ == '__main__':
    # Install a default handler on the root logger: without one, log records
    # emitted while the script runs may be discarded instead of reaching the
    # console.
    logging.basicConfig()
    Craww().craw()