def start_calculate(self, type_id, time_from, time_to_end):
    """Compute and store average video statistics for each tag relation.

    Every ``video_tag`` document with ``tid == type_id`` and ``pubdate`` in
    ``[time_from, time_to_end)`` is visited in ``pubdate`` order. For each
    one, the statistics named by the values of ``STAT_NAME`` are read from
    the ``videos`` documents listed in its ``aids`` field, averaged, rounded
    up, and written back under ``avg_<stat name>``. Each day that the cursor
    has moved past is recorded with ``RecordStatus.Calculated``.

    Args:
        type_id: Value matched against the ``tid`` field.
        time_from: Inclusive lower bound for ``pubdate``. It is advanced in
            86400-second steps, so it is presumably a Unix timestamp at a
            day boundary -- TODO confirm against callers.
        time_to_end: Exclusive upper bound for ``pubdate``.

    Returns:
        ``HandlerErrcode.NotHandled`` when ``self.judge_record`` is falsy
        for ``RecordStatus.Handled`` over the range, otherwise
        ``HandlerErrcode.Success``.
    """
    if not self.judge_record(type_id, time_from, time_to_end,
                             RecordStatus.Handled.value):
        print('对应时间范围内的标签关系未被全部处理,不进行计算')
        return HandlerErrcode.NotHandled

    day_seconds = 86400
    time_start = time.time()
    video_tags = self._db['video_tag'].find({
        'tid': type_id,
        'pubdate': {
            '$gte': time_from,
            '$lt': time_to_end
        }
    }).sort([('tid', 1), ('pubdate', 1)]).batch_size(500)
    self.calculate_cur = 0
    # NOTE(review): if this is a PyMongo cursor, Cursor.count() is deprecated
    # in newer releases -- confirm the driver version before upgrading.
    self.calculate_total = video_tags.count()
    print("将要计算{count}个标签关系的均值数据".format(count=self.calculate_total))

    stat_names = list(STAT_NAME.values())
    lower_time = time_from
    higher_time = time_from + day_seconds
    for video_tag in video_tags:
        # Collect the raw values of every statistic for this tag relation.
        stats = {stat_name: [] for stat_name in stat_names}
        for aid in video_tag['aids']:
            video = self._db['videos'].find_one({'aid': aid})
            if video is None:
                # A referenced video is missing; skip it instead of failing
                # with a TypeError on the subscript below.
                print("视频{aid}不存在,已跳过".format(aid=aid))
                continue
            for stat_name in stat_names:
                stats[stat_name].append(video[stat_name])

        # Store the rounded-up averages. Empty samples are left out because
        # numpy.mean([]) is NaN and math.ceil cannot convert NaN.
        update_json = {
            'avg_' + stat_name: math.ceil(numpy.mean(values))
            for stat_name, values in stats.items()
            if values
        }
        if update_json:
            self._db['video_tag'].update_one({'_id': video_tag['_id']},
                                             {'$set': update_json})

        # Roll the day window forward until it contains this document. A
        # loop (not a single step) keeps the window correct when one or more
        # days have no documents at all.
        while video_tag['pubdate'] >= higher_time:
            self.save_record(type_id, lower_time,
                             RecordStatus.Calculated.value)
            lower_time += day_seconds
            higher_time += day_seconds

        self.calculate_cur += 1
        if self.calculate_cur % 500 == 0:
            print_time(
                "计算{count}个标签关系所用时间:".format(count=self.calculate_cur),
                time.time() - time_start)

    # Record the last day reached as fully calculated.
    self.save_record(type_id, lower_time, RecordStatus.Calculated.value)
    print_time("计算{count}个标签关系所用时间:".format(count=self.calculate_cur),
               time.time() - time_start)
    return HandlerErrcode.Success
def start_handle(self, type_id, time_from, time_to_end):
    """Build tag relations for the videos of one type in a time range.

    Every ``videos`` document with ``tid == type_id`` and ``pubdate`` in
    ``[time_from, time_to_end)`` is visited in ``pubdate`` order. Its
    comma-separated ``tags`` field is split, each tag is passed through
    ``str_to_clear`` and then handed to ``self.save_tags`` and
    ``self.save_videotag``. Each day that the cursor has moved past is
    recorded with ``RecordStatus.Handled``.

    Args:
        type_id: Value matched against the ``tid`` field.
        time_from: Inclusive lower bound for ``pubdate``. It is advanced in
            86400-second steps, so it is presumably a Unix timestamp at a
            day boundary -- TODO confirm against callers.
        time_to_end: Exclusive upper bound for ``pubdate``.

    Returns:
        ``HandlerErrcode.NotCrawled`` when ``self.judge_record`` is falsy
        for ``RecordStatus.Crawled`` over the range, otherwise
        ``HandlerErrcode.Success``.
    """
    if not self.judge_record(type_id, time_from, time_to_end,
                             RecordStatus.Crawled.value):
        print('对应时间范围内的视频未全部抓取,不进行处理')
        return HandlerErrcode.NotCrawled

    day_seconds = 86400
    time_start = time.time()
    videos = self._db['videos'].find({
        'tid': type_id,
        'pubdate': {
            '$gte': time_from,
            '$lt': time_to_end
        }
    }).sort([('tid', 1), ('pubdate', 1)]).batch_size(500)
    self.handle_cur = 0
    # NOTE(review): if this is a PyMongo cursor, Cursor.count() is deprecated
    # in newer releases -- confirm the driver version before upgrading.
    self.handle_total = videos.count()
    print("将要处理{count}个视频的标签关系".format(count=self.handle_total))

    lower_time = time_from
    higher_time = time_from + day_seconds
    for video in videos:
        # The rounded day is the same for every tag of this video.
        pub_day = timestamp_round_to_day(video['pubdate'])
        # NOTE(review): an empty ``tags`` string still yields one (empty)
        # tag here; whether str_to_clear/save_tags expect that is not
        # visible from this block.
        for tag in video['tags'].split(','):
            tag = str_to_clear(tag)
            self.save_tags(tag)
            self.save_videotag(video['aid'], type_id, pub_day, tag)

        # Roll the day window forward until it contains this video. A loop
        # (not a single step) keeps the window correct when one or more days
        # have no videos at all.
        while video['pubdate'] >= higher_time:
            self.save_record(type_id, lower_time, RecordStatus.Handled.value)
            lower_time += day_seconds
            higher_time += day_seconds

        self.handle_cur += 1
        if self.handle_cur % 500 == 0:
            print_time(
                "处理{count}个视频的标签关系所用时间:".format(count=self.handle_cur),
                time.time() - time_start)

    # Record the last day reached as fully handled.
    self.save_record(type_id, lower_time, RecordStatus.Handled.value)
    print_time("处理{count}个视频的标签关系所用时间:".format(count=self.handle_cur),
               time.time() - time_start)
    return HandlerErrcode.Success
def start(self, type_id, time_from, time_to):
    """Run the handling stage and then the calculation stage for a range.

    ``time_from`` and ``time_to`` are converted with ``date_to_timestamp``;
    one day (24 * 3600 seconds) is added to the converted ``time_to`` to
    form the exclusive end of the range. Instance state is reset via
    ``self.reset_data()`` before the stages run.

    Returns:
        The ``.value`` of the first non-success ``HandlerErrcode`` returned
        by a stage, or ``HandlerErrcode.Success.value`` when both succeed.
    """
    range_start = date_to_timestamp(time_from)
    range_end = date_to_timestamp(time_to) + 24 * 3600
    began_at = time.time()
    self.reset_data()

    # Tag relations are built first; averages are computed only afterwards.
    for stage in (self.start_handle, self.start_calculate):
        outcome = stage(type_id, range_start, range_end)
        if outcome != HandlerErrcode.Success:
            return outcome.value

    print_time('总时间:', time.time() - began_at)
    return HandlerErrcode.Success.value
time_to=time_to) conn.rpush(REDIS_START_URL_KEY, url) # 监控爬虫进度 url_len = conn.llen(REDIS_START_URL_KEY) item_len = conn.llen(REDIS_ITEMS_KEY) while url_len != 0: time.sleep(2) url_len = conn.llen(REDIS_START_URL_KEY) item_len = conn.llen(REDIS_ITEMS_KEY) # print("总爬取进度:({}/{}),{:.1f}%".format( # item_len, # num_results, # item_len / num_results * 100, # )) print("剩余urls:{}".format(url_len)) print( "分区号{type_id}, [{time_from}-{time_to}]共有{num_results}个视频,{pages}页,每页{per_page}个视频" .format( type_id=type_id, time_from=time_from, time_to=time_to, num_results=response['numResults'], pages=pages, per_page=per_page, )) print_time("爬取所用时间:", time.time() - time_start) # TODO 插入记录每日视频爬取完成的record项
import timeit from helpers import print_time template = '"Hello {{ name }}!"' print '\n', template, '\n' print_time('linja2', timeit.repeat("template.render(name='John Doe')", "from jinja2 import Template; template = Template(%s)" % template, repeat=5, number=10000)) print_time('lighty', timeit.repeat("template.execute({'name': 'John Doe'})", "from lighty.templates import Template; template = Template();" + "template.parse(%s)" % template, repeat=5, number=10000)) print_time('django', timeit.repeat("template.render(context)", "import djangohelper; from django.template import Context, " + "Template; template = Template(%s); " % template + "context = Context({'name': 'John Doe'})", repeat=5, number=10000))
def spider_closed(self, spider):
    """Log that ``spider`` has closed and report the time since start.

    The elapsed time is measured from ``self.start_time`` and passed to
    ``print_time``.
    """
    closing_message = "爬虫关闭:{}".format(spider.name)
    logger.info(closing_message)
    elapsed = time.time() - self.start_time
    print_time("爬取所用时间:", elapsed)
<head> <title>If test page</title> </head> <body> {% if user %} <h1>Hello {{ user.name }}!</h1> {% if user.is_authenticated %} <h2>Wellcome back</h2> {% endif %} {% endif %} </body> </html>"""''' print '\n', if_template, '\n' print_time('linja2', timeit.repeat( "template.render(user={'name':'John Doe', 'is_authenticated':False})", "from jinja2 import Template; template = Template(%s)" % if_template, repeat=5, number=10000)) print_time('lighty', timeit.repeat( "template.execute({'user': {'name': 'John Doe', " + "'is_authenticated': False}})", "from lighty.templates import Template; template = Template();" + "template.parse(%s)" % if_template, repeat=5, number=10000)) print_time('django', timeit.repeat("template.render(context)", "import djangohelper; from django.template import Context, Template; " + "template = Template(%s); " % if_template + "context = Context({'user':{'name':'John Doe','is_authenticated':False}})", repeat=5, number=10000))
template = '''"""<!DOCTYPE html> <html> <head> <title>For test page</title> </head> <body> <ul> {% for i in items %} <li>{{ i }}</li> {% endfor %} </ul> </body> </html>"""''' print '\n', template, '\n' print_time('linja2', timeit.repeat( "template.render(items=[1, 2, 3, 4, 5, 6, 7, 8, 9, 0])", "from jinja2 import Template; template = Template(%s)" % template, repeat=5, number=10000)) print_time('lighty', timeit.repeat( "template.execute({'items': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]})", "from lighty.templates import Template; template = Template();" + "template.parse(%s)" % template, repeat=5, number=10000)) print_time('django', timeit.repeat("template.render(context)", "import djangohelper; from django.template import Context, Template; " + "template = Template(%s); " % template + "context = Context({'items': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]})", repeat=5, number=10000))