def run(self): print self.city.name, "线程开始抓取" for i in xrange(1, 2): articles = [] url = host + url_args.format(fid=str(self.city.fid), page=str(i)) home_page = Base.get_home(url) article_infos = Base.get_article_infos(home_page) for article_info in article_infos: time_at = do_time.str2datetime(article_info['time_at'], "%Y-%m-%d %H:%M") tid = int(article_info['tid']) if time_at <= self.latest_time_at and article_info['_type'] != 'hot': Article.add_all(articles) print self.city.name, "结束" return else: if tid not in self.tids: print "抓取", self.city.name, i, "页", "帖子", article_info['title'] content = Base.get_content(article_info['url']) articles.append( Article( city_id=self.city.id, tid=tid, type=article_info['_type'], title=article_info['title'], time_at=time_at, content=content, author=article_info['author'], reply_number=article_info['reply_number'], read_number=article_info['read_number'], url=article_info['url'], )) Article.add_all(articles) print self.city.name, "结束"
def add(article_arg):
    """Store one article via ``Article.add``, converting its UTC timestamp.

    Passes title/content/author/user_id through unchanged; ``time`` is
    converted with ``utils.utc2datetime`` first.
    """
    fields = {
        'title': article_arg.title,
        'content': article_arg.content,
        'time': utils.utc2datetime(article_arg.time),
        'author': article_arg.author,
        'user_id': article_arg.user_id,
    }
    return Article.add(**fields)
import time from utils import cg_core from spider.thread import MyThread from analyse.thread import AThread from handler.model import ( City, Article ) thread_limit = 4 # 最大爬虫线程数 thread_limit_time = 60 * 1 # 爬虫限制等待时间等待抓取 if __name__ == '__main__': key_mood_map = cg_core.get_key_mood_map("../doc/", "feel.xlsx") citys = City.mget() articles = Article.mget_latest_time_at() latest_time_at_map = {articles[1]: articles[0] for articles in articles} for city in citys: if threading.activeCount() > thread_limit: print "最大爬虫线程数只能为{}个, 线程休眠{}秒". \ format(thread_limit, thread_limit_time) time.sleep(thread_limit_time) tids = [int(tid[0]) for tid in Article.mget_tids_by_city_id(city.id)] MyThread(city, latest_time_at_map.get(city.id), tids).start() while (True): print "准备分析数据,当前线程数:{},等待爬虫线程抓取完毕".format(threading.activeCount()) if threading.activeCount() != 1: time.sleep(60 * 5) continue
# coding=utf-8
# Entry script: launch one crawler thread per city, unthrottled.
from spider.thread import MyThread
from handler.model import (
    City,
    Article
)

if __name__ == '__main__':
    rows = Article.mget_latest_time_at()
    # Rows look like (latest_time_at, city_id); index by city_id.
    newest_by_city = {row[1]: row[0] for row in rows}
    for city in City.mget():
        known_tids = [int(pair[0]) for pair in Article.mget_tids_by_city_id(city.id)]
        crawler = MyThread(city, newest_by_city.get(city.id), known_tids)
        crawler.start()
def mget(query):
    """Fetch articles matching *query* and return them serialized."""
    matches = Article.mget(query)
    return [item.serialize() for item in matches]