コード例 #1
0
ファイル: HotSearchCore.py プロジェクト: hjydzh/crawler
    def hot_search(self, ids):
        """Crawl the given categories, then fetch and save one article.

        For every entry of ``ids`` the crawlers returned by
        ``get_crawler_category`` are visited. A crawler whose url is ``None``
        or the empty string, or whose charset is not ``'utf-8'``, is skipped.
        Every remaining crawler is passed to ``parse`` and stored in
        ``self.crawlers`` under ``crawler.id``; if ``parse`` raises, the
        traceback is logged at debug level and the crawler is still stored.

        Afterwards ``sort_word_list`` is called, the url returned by
        ``find_hot_url`` is resolved to its crawler and real url, and the
        blog built by ``get_blog`` is handed to ``save``.
        """
        Log.init_log()
        for category_id in ids:
            for crawler in self.get_crawler_category(category_id):
                summary = 'name:%s, url:%s' % (crawler.author, crawler.url)
                logging.debug(summary)
                # Skip crawlers that have no url, and only handle utf-8 ones.
                if (crawler.url is None or crawler.url == ''
                        or crawler.charset != 'utf-8'):
                    continue
                try:
                    self.parse(crawler)
                except Exception:
                    # Best effort: record the traceback and carry on.
                    logging.debug(traceback.format_exc())
                self.crawlers[crawler.id] = crawler

        self.sort_word_list()
        hot_url = self.find_hot_url()
        logging.debug('最热文章url为:')
        logging.debug(hot_url)
        owner_id = self.get_crawler_id(hot_url)
        real_url = self.get_real_url(hot_url)
        # The search core needs the owning crawler before the blog is fetched.
        self.search_core.crawler = self.crawlers[int(owner_id)]
        title = self.get_title(hot_url)
        blog = self.get_blog(title, real_url)
        self.save(blog)
        logging.debug('最热文章更新成功')
コード例 #2
0
ファイル: CommonUpdate.py プロジェクト: hjydzh/crawler
 def update_website(self):
     """Update every category returned by ``get_categories``.

     Initialises logging, logs a start message, calls ``update_category``
     once per category in iteration order, then logs a completion message.
     """
     Log.init_log()
     logging.debug("网站内容更新开始")
     for item in self.get_categories():
         self.update_category(item)
     logging.debug("网站内容更新完成")
コード例 #3
0
 def search(self):
     """Fetch and save the blog for the first template that yields a tag.

     Initialises logging, then calls ``get_tag_list`` and ``get_html``
     before asking ``search_templates`` for the candidate templates. The
     templates are tried in order, starting from the second one; for the
     first template where ``get_tag`` returns something other than
     ``None``, the blog is fetched from the tag's text and ``href`` and
     passed to ``save_blog``, and no further templates are tried.
     Nothing is saved when no template yields a tag.
     """
     Log.init_log()
     logging.debug('开始抓取')
     self.get_tag_list()
     self.get_html()
     templates = self.search_templates()
     # NOTE(review): templates[0] is never tried; the reason is not visible
     # in this block -- confirm before changing the slice.
     for template in templates[1:]:
         tag = self.get_tag(template)
         # Identity check: `!= None` would dispatch to the tag's own
         # comparison method, which may not treat None as expected.
         if tag is None:
             continue
         blog = self.fetchBlog(tag.text, tag['href'])
         self.save_blog(blog)
         break