def get_home_stats(self):
    """
    Get stats for home page.

    Collects the document counts of the tasks, spiders, nodes and deploys
    collections, plus the number of tasks created on each of the 30 days
    ending yesterday.
    :return: dict with 'status', 'overview_stats' and 'daily_tasks'
    """
    # overview stats
    overview_stats = {
        'task_count': db_manager.count('tasks', {}),
        'spider_count': db_manager.count('spiders', {}),
        'node_count': db_manager.count('nodes', {}),
        'deploy_count': db_manager.count('deploys', {}),
    }

    # daily stats: bucket every task by the calendar day of its create_ts
    # NOTE(review): $dateToString is given no timezone while the lookup keys
    # below come from the naive local datetime.now() - confirm both sides
    # agree on the timezone.
    cur = db_manager.aggregate('tasks', [{
        '$project': {
            'date': {
                '$dateToString': {
                    'format': '%Y-%m-%d',
                    'date': '$create_ts'
                }
            }
        }
    }, {
        '$group': {
            '_id': '$date',
            'count': {
                '$sum': 1
            }
        }
    }, {
        '$sort': {
            '_id': 1
        }
    }])
    date_cache = {item['_id']: item['count'] for item in cur}

    # Read the clock exactly once. Deriving the start and the end of the
    # window from two separate datetime.now() calls made the number of rows
    # depend on whether the clock ticked between the calls.
    now = datetime.now()
    daily_tasks = []
    for days_ago in range(30, 0, -1):
        date_str = (now - timedelta(days_ago)).strftime('%Y-%m-%d')
        daily_tasks.append({
            'date': date_str,
            # days without any task are reported as 0
            'count': date_cache.get(date_str) or 0,
        })

    return {
        'status': 'ok',
        'overview_stats': overview_stats,
        'daily_tasks': daily_tasks
    }
def get_results(self, id: str) -> (dict, tuple):
    """
    Return one page of the results crawled by a task.

    The page is selected with the request arguments page_size (default 10)
    and page_num (default 1).
    :param id: task_id
    :return: an empty list when the task's spider has no 'col' set,
        otherwise a dict with the fields, the total count and the page items
    """
    request_args = self.parser.parse_args()
    page_size = request_args.get('page_size') or 10
    page_num = request_args.get('page_num') or 1

    # task -> spider -> name of the collection holding the results
    task_doc = db_manager.get('tasks', id=id)
    spider_doc = db_manager.get('spiders', id=task_doc['spider_id'])
    col_name = spider_doc.get('col')
    if not col_name:
        return []

    fields = get_spider_col_fields(col_name)
    offset = page_size * (page_num - 1)
    page_items = db_manager.list(col_name, {'task_id': id},
                                 skip=offset, limit=page_size)

    response = {'status': 'ok'}
    response['fields'] = jsonify(fields)
    response['total_count'] = db_manager.count(col_name, {'task_id': id})
    response['page_num'] = page_num
    response['page_size'] = page_size
    response['items'] = jsonify(page_items)
    return response
def get_last_n_day_tasks_count(spider_id: ObjectId, n: int) -> int:
    """
    Count the tasks of a spider created within the last n days.
    :param spider_id: value matched against the 'spider_id' field of tasks
    :param n: size of the look-back window in days, counted back from
        datetime.now()
    :return: the count reported by db_manager.count for the matching tasks
    """
    since = datetime.now() - timedelta(n)
    return db_manager.count(col_name='tasks', cond={
        'spider_id': spider_id,
        'create_ts': {
            '$gte': since
        }
    })
def get(self, id=None, action=None):
    """
    GET handler: run an action, fetch one item, or list a page of items.

    - action given: call the attribute of that name on self with id, or
      answer 400 when no such attribute exists;
    - only id given: return the item with that id;
    - neither given: return a page of self.col_name selected by the request
      arguments filter, page (default 1) and page_size (default 10).
    :param id: item id
    :param action: name of the action to run
    """
    args = self.parser.parse_args()

    # action by id
    if action is not None:
        if hasattr(self, action):
            return getattr(self, action)(id)
        return {
            'status': 'ok',
            'code': 400,
            'error': 'action "%s" invalid' % action
        }, 400

    # get item by id
    if id is not None:
        return jsonify(db_manager.get(col_name=self.col_name, id=id))

    # list items: every argument falls back to its default when absent
    cond = args.filter if args.get('filter') is not None else {}
    page = args.page if args.get('page') is not None else 1
    page_size = args.page_size if args.get('page_size') is not None else 10

    # TODO: sort functionality

    total_count = db_manager.count(col_name=self.col_name, cond=cond)
    offset = (page - 1) * page_size
    items = db_manager.list(col_name=self.col_name, cond=cond,
                            skip=offset, limit=page_size)

    # TODO: getting status for node

    payload = {
        'status': 'ok',
        'total_count': total_count,
        'page': page,
        'page_size': page_size,
        'items': items
    }
    return jsonify(payload)
def get(self, id=None, action=None):
    """
    GET handler for tasks.

    - action given: call the attribute of that name on self with id, or
      answer 400 when no such attribute exists;
    - only id given: return that task, enriched with the status/result of
      its 'tasks_celery' record, its spider's name and its log file content;
    - neither given: return a page of tasks sorted by 'finish_ts', selected
      by the request arguments page_size (default 10) and page_num
      (default 1).
    :param id: task id
    :param action: name of the action to run
    """
    # action by id
    # NOTE(review): any attribute name passes the hasattr check; if action
    # comes straight from the URL an explicit allow-list would be safer.
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    if id is not None:
        task = db_manager.get('tasks', id=id)
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            # the status stored on the task itself wins over the celery one
            if not task.get('status'):
                task['status'] = _task['status']
            task['result'] = _task['result']
        # the spider lookup can come back empty just like the celery one;
        # in that case the task is returned without a spider name
        if _spider:
            task['spider_name'] = _spider['name']
        # best effort: a missing or unreadable log yields an empty log
        try:
            with open(task['log_file_path']) as f:
                task['log'] = f.read()
        except Exception:
            task['log'] = ''
        return jsonify(task)

    # list tasks
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1
    tasks = db_manager.list('tasks', {}, limit=page_size,
                            skip=page_size * (page_num - 1),
                            sort_key='finish_ts')
    items = []
    for task in tasks:
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            task['status'] = _task['status']
        else:
            task['status'] = TaskStatus.UNAVAILABLE
        if _spider:
            task['spider_name'] = _spider['name']
        items.append(task)
    return {
        'status': 'ok',
        'total_count': db_manager.count('tasks', {}),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }
def get_results(self, id: str) -> (dict, tuple):
    """
    Get a list of results crawled in a given task.

    Only string-valued fields that are not listed in IGNORE_FIELD are
    returned, and every value longer than 500 characters is cut to 500
    characters followed by '...'.
    :param id: task_id
    :return: an empty list when the task's spider has no 'col' set,
        otherwise a dict in which 'page_num' is the number of pages needed
        to show all items at the requested page_size (at least 1)
    """
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10

    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []

    fields = get_spider_col_fields(col_name)
    fields = list(set(fields) - set(IGNORE_FIELD))
    items = db_manager.list(col_name, {'task_id': id})

    # Limit overly long content and drop the fields that should not be shown
    adjust_items = []
    for item in items:
        adjust_item = {}
        for key, value in item.items():
            if not isinstance(value, str) or key in IGNORE_FIELD:
                continue
            if len(value) > 500:
                value = value[:500] + '...'
            adjust_item[key] = value
        adjust_items.append(adjust_item)

    # Ceiling division. True division always yields a float in Python 3, so
    # the former "int(x) + 1 when float" rule reported one page too many
    # whenever the item count was an exact multiple of page_size.
    page_num = max(1, -(-len(adjust_items) // page_size))

    return {
        'status': 'ok',
        'fields': jsonify(fields),
        'total_count': len(adjust_items),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(adjust_items)
    }
def get_results(self, id):
    """
    Get one page of the results crawled in a given task.

    The page is selected with the request arguments page_size (default 10)
    and page_num (default 1).
    :param id: task_id
    :return: an empty list when the task's spider has no 'col' set,
        otherwise a dict with the fields, the total count and the page items
    """
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1

    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []

    fields = get_spider_col_fields(col_name)
    # Apply the requested page to the query; the arguments used to be parsed
    # and echoed back in the response without restricting the items.
    items = db_manager.list(col_name, {'task_id': id},
                            skip=page_size * (page_num - 1),
                            limit=page_size)
    return {
        'status': 'ok',
        'fields': jsonify(fields),
        'total_count': db_manager.count(col_name, {'task_id': id}),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }
def get(self, id: str = None, action: str = None) -> (dict, tuple):
    """
    GET method for retrieving item information.

    If id is specified and action is not, return the object of the given id;
    If action is specified, execute the given action with the given id;
    If neither id nor action is specified, return the list of items given
    the page_size, page_num and filter request arguments.
    :param id: item id
    :param action: name of the action to run
    :return: the action's result, the item, a dict describing the requested
        page, or an (error dict, 400) tuple for an unknown action
    """
    args = self.parser.parse_args()

    # action by id
    # NOTE(review): any attribute name passes the hasattr check; if action
    # comes straight from the URL an explicit allow-list would be safer.
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # list items
    elif id is None:
        # filter
        cond = {}
        if args.get('filter') is not None:
            cond = args.filter

        # page number: read the same argument that is tested ('page_num');
        # the value used to be taken from a different name, 'page'
        page = 1
        if args.get('page_num') is not None:
            page = args.page_num

        # page size
        page_size = 10
        if args.get('page_size') is not None:
            page_size = args.page_size

        # TODO: sort functionality

        # total count
        total_count = db_manager.count(col_name=self.col_name, cond=cond)

        # items
        items = db_manager.list(col_name=self.col_name, cond=cond,
                                skip=(page - 1) * page_size,
                                limit=page_size)

        # TODO: getting status for node

        return {
            'status': 'ok',
            'total_count': total_count,
            'page_num': page,
            'page_size': page_size,
            'items': jsonify(items)
        }

    # get item by id
    else:
        return jsonify(db_manager.get(col_name=self.col_name, id=id))
def get(self, id: str = None, action: str = None):
    """
    GET method of TaskAPI.

    - action given: call the attribute of that name on self with id, or
      answer 400 when no such attribute exists;
    - only id given: return that task with spider name, result count,
      duration, average results per second and log file content;
    - neither given: return a page of tasks sorted by 'create_ts', selected
      by the request arguments page_size (default 10), page_num (default 1)
      and filter (a JSON string).
    :param id: item id
    :param action: action
    """

    def _attach_stats(task, spider):
        # Add spider name, result count, duration and result rate to a task.
        task['num_results'] = 0
        if spider:
            task['spider_name'] = spider['name']
            col = spider.get('col')
            if col:
                task['num_results'] = db_manager.count(
                    col, {'task_id': task['_id']})

        # duration is only known for tasks that have finished
        if task.get('finish_ts') is not None:
            task['duration'] = (task['finish_ts'] -
                                task['create_ts']).total_seconds()
            # finish_ts == create_ts gives a zero duration; report a rate of
            # 0 instead of raising ZeroDivisionError
            if task['duration']:
                task['avg_num_results'] = round(
                    task['num_results'] / task['duration'], 1)
            else:
                task['avg_num_results'] = 0

    # action by id
    # NOTE(review): any attribute name passes the hasattr check; if action
    # comes straight from the URL an explicit allow-list would be safer.
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    elif id is not None:
        task = db_manager.get(col_name=self.col_name, id=id)
        spider = db_manager.get(col_name='spiders',
                                id=str(task['spider_id']))
        _attach_stats(task, spider)

        # best effort: a missing or unreadable log yields an empty log
        try:
            with open(task['log_file_path']) as f:
                task['log'] = f.read()
        except Exception:
            task['log'] = ''
        return jsonify(task)

    # list tasks
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1
    filter_str = args.get('filter')
    filter_ = {}
    if filter_str is not None:
        filter_ = json.loads(filter_str)
        # the id arrives as a string inside the JSON filter; wrap it in
        # ObjectId before it is used as the query condition
        if filter_.get('spider_id'):
            filter_['spider_id'] = ObjectId(filter_['spider_id'])
    tasks = db_manager.list(col_name=self.col_name, cond=filter_,
                            limit=page_size,
                            skip=page_size * (page_num - 1),
                            sort_key='create_ts')
    items = []
    for task in tasks:
        # get spider
        _spider = db_manager.get(col_name='spiders',
                                 id=str(task['spider_id']))

        # status
        if task.get('status') is None:
            task['status'] = TaskStatus.UNAVAILABLE

        _attach_stats(task, _spider)
        items.append(task)

    return {
        'status': 'ok',
        'total_count': db_manager.count('tasks', filter_),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }
def get_spider_stats(self):
    """
    Get the statistics of one spider.

    The spider is selected by the request argument spider_id. The overview
    and the per-status / per-node counts cover the tasks created in the
    last 30 days; 'daily_stats' has one row for each of the 30 days ending
    yesterday.
    :return: dict with 'status', 'overview', 'task_count_by_status',
        'task_count_by_node' and 'daily_stats'
    """
    args = self.parser.parse_args()
    spider_id = args.get('spider_id')
    spider = db_manager.get('spiders', id=spider_id)

    # Read the clock once so that every window below is derived from the
    # same instant.
    now = datetime.now()

    tasks = db_manager.list(col_name='tasks', cond={
        'spider_id': spider['_id'],
        'create_ts': {
            '$gte': now - timedelta(30)
        }
    }, limit=9999999)

    # task count
    task_count = len(tasks)

    # calculate task count stats
    task_count_by_status = defaultdict(int)
    task_count_by_node = defaultdict(int)
    total_seconds = 0
    for task in tasks:
        task_count_by_status[task['status']] += 1
        task_count_by_node[task.get('node_id')] += 1
        # only successful, finished tasks contribute to the total duration
        if task['status'] == TaskStatus.SUCCESS and task.get('finish_ts'):
            duration = (task['finish_ts'] - task['create_ts']).total_seconds()
            total_seconds += duration

    # task count by node
    task_count_by_node_ = [{'name': node, 'value': value}
                           for node, value in task_count_by_node.items()]

    # task count by status
    task_count_by_status_ = [{'name': status, 'value': value}
                             for status, value in task_count_by_status.items()]

    # success rate and average duration; both are 0 when there is no task
    # in the window (dividing by task_count would raise ZeroDivisionError).
    # The average divides the duration of the successful tasks by the count
    # of all tasks.
    if task_count:
        success_rate = task_count_by_status.get(TaskStatus.SUCCESS, 0) / task_count
        avg_duration = total_seconds / task_count
    else:
        success_rate = 0
        avg_duration = 0

    # calculate task count by date
    # NOTE(review): $dateToString is given no timezone while the lookup keys
    # below come from the naive local datetime.now() - confirm both sides
    # agree on the timezone.
    cur = db_manager.aggregate('tasks', [{
        '$match': {
            'spider_id': spider['_id']
        }
    }, {
        '$project': {
            'date': {
                '$dateToString': {
                    'format': '%Y-%m-%d',
                    'date': '$create_ts'
                }
            },
            'duration': {
                '$subtract': ['$finish_ts', '$create_ts']
            }
        }
    }, {
        '$group': {
            '_id': '$date',
            'count': {
                '$sum': 1
            },
            'duration': {
                '$avg': '$duration'
            }
        }
    }, {
        '$sort': {
            '_id': 1
        }
    }])
    date_cache = {}
    for item in cur:
        date_cache[item['_id']] = {
            # the aggregated duration is divided by 1000 (presumably
            # milliseconds -> seconds); a missing average counts as 0
            'duration': (item['duration'] or 0) / 1000,
            'count': item['count']
        }

    # One row per day. Deriving the start and the end of the window from two
    # separate datetime.now() calls made the number of rows depend on
    # whether the clock ticked between the calls.
    daily_tasks = []
    for days_ago in range(30, 0, -1):
        date_str = (now - timedelta(days_ago)).strftime('%Y-%m-%d')
        d = date_cache.get(date_str)
        if d is None:
            row = {'date': date_str, 'count': 0, 'duration': 0}
        else:
            row = {'date': date_str, 'count': d['count'],
                   'duration': d['duration']}
        daily_tasks.append(row)

    # calculate total results
    result_count = 0
    col_name = spider.get('col')
    if col_name is not None:
        for task in tasks:
            result_count += db_manager.count(col_name, {'task_id': task['_id']})

    return {
        'status': 'ok',
        'overview': {
            'task_count': task_count,
            'result_count': result_count,
            'success_rate': success_rate,
            'avg_duration': avg_duration
        },
        'task_count_by_status': task_count_by_status_,
        'task_count_by_node': task_count_by_node_,
        'daily_stats': daily_tasks,
    }
def get(self, id: str = None, action: str = None):
    """
    GET handler: run an action, fetch one site, or list a page of sites.

    - action given: call the attribute of that name on self with id, or
      answer 400 when no such attribute exists;
    - only id given: return the item with that id;
    - neither given: return a page of self.col_name sorted ascending by
      'rank', selected by the request arguments page_size (default 10),
      page_num (default 1), filter (a JSON string) and keyword (a regex
      matched against description, name and domain). Every returned site
      carries a 'spider_count'.
    :param id: item id
    :param action: name of the action to run
    """
    # action by id
    if action is not None:
        if hasattr(self, action):
            return getattr(self, action)(id)
        return {
            'status': 'ok',
            'code': 400,
            'error': 'action "%s" invalid' % action
        }, 400

    # single item
    if id is not None:
        return jsonify(db_manager.get(col_name=self.col_name, id=id))

    # list items
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1
    raw_filter = args.get('filter')
    keyword = args.get('keyword')

    query = json.loads(raw_filter) if raw_filter is not None else {}
    if keyword is not None:
        # a site matches when any of these fields matches the keyword
        query['$or'] = [{field: {'$regex': keyword}}
                        for field in ('description', 'name', 'domain')]

    offset = page_size * (page_num - 1)
    found = db_manager.list(col_name=self.col_name, cond=query,
                            limit=page_size, skip=offset,
                            sort_key='rank', sort_direction=ASCENDING)

    sites = []
    for entry in found:
        # get spider count
        entry['spider_count'] = db_manager.count('spiders',
                                                 {'site': entry['_id']})
        sites.append(entry)

    response = {'status': 'ok'}
    response['total_count'] = db_manager.count(self.col_name, query)
    response['page_num'] = page_num
    response['page_size'] = page_size
    response['items'] = jsonify(sites)
    return response