def hot_search(self, ids):
    """Crawl every category in *ids*, rank the collected words, then
    fetch and save the hottest article as a blog entry.

    :param ids: iterable of category ids whose crawlers should run.
    Side effects: populates ``self.crawlers``, sets
    ``self.search_core.crawler`` and persists one blog via ``self.save``.
    """
    Log.init_log()
    # NOTE: renamed the loop variable from ``id`` — it shadowed the builtin.
    for category_id in ids:
        crawlers = self.get_crawler_category(category_id)
        for crawler in crawlers:
            logging.debug('name:%s, url:%s' % (crawler.author, crawler.url))
            if not crawler.url:
                # Skip crawlers that have no URL configured.
                continue
            if crawler.charset != 'utf-8':
                # Only utf-8 crawlers are handled.
                continue
            try:
                self.parse(crawler)
            except Exception:
                # Best-effort: log the traceback and keep crawling the rest.
                logging.debug(traceback.format_exc())
            self.crawlers[crawler.id] = crawler
    self.sort_word_list()
    url = self.find_hot_url()
    logging.debug('最热文章url为:')
    logging.debug(url)
    crawler_id = self.get_crawler_id(url)
    real_url = self.get_real_url(url)
    crawler = self.crawlers[int(crawler_id)]
    self.search_core.crawler = crawler
    blog = self.get_blog(self.get_title(url), real_url)
    self.save(blog)
    logging.debug('最热文章更新成功')
def update_website(self):
    """Refresh the whole site: run ``update_category`` on every category.

    Logs a start and a completion message around the update pass.
    """
    Log.init_log()
    logging.debug("网站内容更新开始")
    for category in self.get_categories():
        self.update_category(category)
    logging.debug("网站内容更新完成")
def search(self):
    """Scrape the page: collect tags, pick the first usable template
    (skipping the first entry) and persist the blog it points to.

    Stops after the first template that yields a non-``None`` tag.
    """
    Log.init_log()
    logging.debug('开始抓取')
    self.get_tag_list()
    self.get_html()
    templates = self.search_templates()
    # The first template is skipped on purpose — presumably a header /
    # pinned entry; TODO confirm against search_templates().
    for template in templates[1:]:
        tag = self.get_tag(template)
        if tag is not None:
            blog = self.fetchBlog(tag.text, tag['href'])
            self.save_blog(blog)
            break