def main():
    """Fetch the CSDN blog index page and print the downloaded HTML.

    The page is requested through ``Download.down_html`` with ``save=True``.
    """
    # Disabled full-crawl pipeline, kept for reference:
    # print 'Running.'
    # url = 'https://blog.csdn.net/GitChat'
    # download = Download()
    # articledb = ArticleDB(DB(*config))
    # spider = Spider(url, download, articledb)
    # spider.start()
    # print 'Done.'

    # Only the blog index page is fetched for now.
    index_url = 'https://blog.csdn.net/'
    print(Download().down_html(index_url, save=True))
class CSDN(object):
    """Crawler that gathers blog-user links starting from the CSDN blog home.

    The crawl has three steps: download the home page, collect the category
    links found inside its ``nav_com`` div, then download every category page
    and collect the user links listed in its ``feedlist_mod`` list. Each user
    link is appended to ``blog_user`` and pushed onto ``queue``.
    """

    def __init__(self):
        self.download = Download()
        self.home = 'https://blog.csdn.net'
        # NOTE: the attribute name is a misspelling of "categories"; it is
        # kept unchanged because code outside this class may read it.
        self.catetories = []
        self.blog_user = []
        self.queue = Queue.Queue()

    def _absolute_url(self, href):
        """Resolve ``href`` against ``self.home`` and return the full URL.

        Plain concatenation breaks for hrefs that are already absolute or
        that lack a leading slash; ``urljoin`` handles both.
        """
        try:
            from urlparse import urljoin  # Python 2
        except ImportError:
            from urllib.parse import urljoin  # Python 3
        return urljoin(self.home, href)

    def visit_home(self):
        """Download the home page and return what ``down_html`` returns."""
        return self.download.down_html(self.home)

    def parse_category(self, html):
        """Collect category URLs from the home-page markup.

        Every anchor inside ``<div class="nav_com">`` that carries an
        ``href`` is resolved against ``self.home`` and appended to
        ``self.catetories``. Empty or ``None`` markup yields no categories.

        Args:
            html: Markup of the home page, or a falsy value if nothing was
                downloaded.
        """
        # A failed download may leave nothing to parse; treat that as a page
        # without navigation rather than letting the parser raise.
        nav = None
        if html:
            nav = BeautifulSoup(html, 'lxml').find('div', class_='nav_com')

        if nav is not None:
            a_tags = nav.find_all('a')
            print(len(a_tags))
            for a_tag in a_tags:
                href = a_tag.get('href')
                if href:  # anchors without a target cannot be visited
                    self.catetories.append(self._absolute_url(href))
        print(self.catetories)

    def visit_category(self):
        """Download every collected category page and harvest its user links."""
        for category in self.catetories:
            html = self.download.down_html(category, save=True)
            self.parse_blog_user(html)
        print(self.blog_user)
        print(len(self.blog_user))

    def parse_blog_user(self, html):
        """Collect blog-user links from one category page.

        For each ``<dd class="name">`` inside ``<ul class="feedlist_mod">``
        the ``href`` of its first anchor is appended to ``self.blog_user``
        and put on ``self.queue``. Entries without an anchor or without an
        ``href`` are skipped, as is empty or ``None`` markup.

        Args:
            html: Markup of a category page, or a falsy value if nothing was
                downloaded.
        """
        print('parse blog user')
        if not html:
            return

        feed = BeautifulSoup(html, 'lxml').find('ul', class_='feedlist_mod')
        if feed is None:
            return

        for dd in feed.find_all('dd', class_='name'):
            link = dd.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                continue
            self.blog_user.append(href)
            self.queue.put(href)

    def start(self):
        """Run the crawl and drain the queue of discovered blog links."""
        html = self.visit_home()
        self.parse_category(html)
        self.visit_category()

        processed = 0
        while not self.queue.empty():
            # TODO: download the articles of the blog taken from the queue.
            self.queue.get()
            processed += 1
        # Joined by hand so the output matches ``print 'run times ', i``
        # on Python 2 while remaining valid syntax on Python 3.
        print(' '.join(['run times ', str(processed)]))