Ejemplo n.º 1
0
	def run(self):
		"""Scrape the forum listing pages for this thread's city.

		Collects new articles page by page into a batch and persists them
		via Article.add_all.  Stops early as soon as a non-"hot" article no
		newer than self.latest_time_at is seen, flushing the batch first.
		"""
		print self.city.name, "线程开始抓取"
		# NOTE(review): xrange(1, 2) visits page 1 only — presumably the upper
		# bound was meant to cover more pages; confirm before widening.
		for i in xrange(1, 2):
			articles = []
			# Listing URL for this city's forum (fid) and page number.
			url = host + url_args.format(fid=str(self.city.fid), page=str(i))
			home_page = Base.get_home(url)
			article_infos = Base.get_article_infos(home_page)
			for article_info in article_infos:
				time_at = do_time.str2datetime(article_info['time_at'], "%Y-%m-%d %H:%M")
				tid = int(article_info['tid'])
				if time_at <= self.latest_time_at and article_info['_type'] != 'hot':
					# Reached an article we already have (and it is not a pinned
					# "hot" post): flush what was collected so far and stop.
					Article.add_all(articles)
					print self.city.name, "结束"
					return
				else:
					if tid not in self.tids:
						# Unseen thread id: fetch the full content and queue it.
						print "抓取", self.city.name, i, "页", "帖子", article_info['title']
						content = Base.get_content(article_info['url'])
						articles.append(
							Article(
								city_id=self.city.id,
								tid=tid,
								type=article_info['_type'],
								title=article_info['title'],
								time_at=time_at,
								content=content,
								author=article_info['author'],
								reply_number=article_info['reply_number'],
								read_number=article_info['read_number'],
								url=article_info['url'],
							))
			# Persist everything collected from this page.
			Article.add_all(articles)
		print self.city.name, "结束"
Ejemplo n.º 2
0
Archivo: article.py Proyecto: hncg/bps
def add(article_arg):
    """Persist a new Article built from *article_arg* and return the result."""
    payload = {
        'title': article_arg.title,
        'content': article_arg.content,
        'time': utils.utc2datetime(article_arg.time),
        'author': article_arg.author,
        'user_id': article_arg.user_id,
    }
    return Article.add(**payload)
Ejemplo n.º 3
0
import time
from utils import cg_core
from spider.thread import MyThread
from analyse.thread import AThread
from handler.model import (
    City,
    Article
)

thread_limit = 4    # maximum number of concurrently running spider threads
thread_limit_time = 60 * 1    # seconds to sleep when the thread limit is reached

if __name__ == '__main__':
    key_mood_map = cg_core.get_key_mood_map("../doc/", "feel.xlsx")
    citys = City.mget()
    articles = Article.mget_latest_time_at()
    latest_time_at_map = {articles[1]: articles[0] for
                          articles in articles}
    for city in citys:
        if threading.activeCount() > thread_limit:
            print "最大爬虫线程数只能为{}个, 线程休眠{}秒". \
                format(thread_limit, thread_limit_time)
            time.sleep(thread_limit_time)
        tids = [int(tid[0]) for tid in Article.mget_tids_by_city_id(city.id)]
        MyThread(city, latest_time_at_map.get(city.id), tids).start()

    while (True):
        print "准备分析数据,当前线程数:{},等待爬虫线程抓取完毕".format(threading.activeCount())
        if threading.activeCount() != 1:
            time.sleep(60 * 5)
            continue
Ejemplo n.º 4
0
# coding=utf-8
from spider.thread import MyThread
from handler.model import (
    City,
    Article
)

# latest_time_ats = Article.get_latest_time_at()
if __name__ == '__main__':
    citys = City.mget()
    # Rows are (latest_time_at, city_id) pairs; build {city_id: latest_time_at}.
    # FIX: the original comprehension's loop variable shadowed the iterable it
    # walked ({articles[1]: articles[0] for articles in articles}), which only
    # worked by accident of dict-comprehension scoping and read as a bug.
    latest_rows = Article.mget_latest_time_at()
    latest_time_at_map = {row[1]: row[0] for row in latest_rows}
    for city in citys:
        # Thread ids already scraped for this city, so the spider skips them.
        tids = [int(tid[0]) for tid in Article.mget_tids_by_city_id(city.id)]
        MyThread(city, latest_time_at_map.get(city.id), tids).start()
Ejemplo n.º 5
0
Archivo: article.py Proyecto: hncg/bps
def mget(query):
    """Return the serialized form of every Article matching *query*."""
    matches = Article.mget(query)
    return [match.serialize() for match in matches]