def on_crawl(self, id: str) -> (dict, tuple):
    """
    Start a crawl task.
    :param id: spider_id
    :return: response dict containing the created task's id and status
    """
    args = self.parser.parse_args()
    params = args.get('params')

    spider = db_manager.get('spiders', id=ObjectId(id))

    # determine the execute function by spider type
    if spider['type'] == SpiderType.CONFIGURABLE:
        # configurable spider
        exec_func = execute_config_spider
    else:
        # customized spider
        exec_func = execute_spider

    # trigger an asynchronous job
    job = exec_func.delay(id, params)

    # create a new task
    db_manager.save('tasks', {
        '_id': job.id,
        'spider_id': ObjectId(id),
        'cmd': spider.get('cmd'),
        'params': params,
        'create_ts': datetime.utcnow(),
        'status': TaskStatus.PENDING
    })

    return {
        'code': 200,
        'status': 'ok',
        'task': {
            'id': job.id,
            'status': job.status
        }
    }
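# Hedged usage sketch: triggering a crawl through this endpoint and reading
# back the Celery job id. The route and host below are assumptions for
# illustration; they are not defined in this snippet.
import requests

def trigger_crawl_example(spider_id: str):
    res = requests.post(
        'http://localhost:8000/api/spiders/%s/on_crawl' % spider_id,
        data={'params': ''})
    task = res.json()['task']
    # the job id doubles as the task _id saved in the 'tasks' collection
    return task['id'], task['status']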
def update_nodes_status(refresh=False): """ Update all nodes status :param refresh: """ online_node_ids = [] url = '%s/workers?status=1' % FLOWER_API_ENDPOINT if refresh: url += '&refresh=1' res = requests.get(url) if res.status_code != 200: return online_node_ids for k, v in json.loads(res.content.decode('utf-8')).items(): node_name = k node_status = NodeStatus.ONLINE if v else NodeStatus.OFFLINE # node_celery = v node = db_manager.get('nodes', id=node_name) # new node if node is None: node = { '_id': node_name, 'name': node_name, 'status': node_status, 'ip': 'localhost', 'port': '8000' } db_manager.save('nodes', node) # existing node else: node['status'] = node_status db_manager.save('nodes', node) if node_status: online_node_ids.append(node_name) return online_node_ids
def post(self, id: str = None, action: str = None):
    """
    POST method of the given id for performing an action.
    :param id: item id
    :param action: action name
    :return:
    """
    args = self.parser.parse_args()
    name = args.get('name')
    if name is not None:
        spider = db_manager._get('spiders', {'name': name})

        # new spider
        if spider is None:
            item = {}
            for k in args.keys():
                item[k] = args.get(k)
            db_manager.save(col_name='spiders', item=item)
            spider = db_manager._get('spiders', {'name': name})
            id = str(spider.get('_id'))
            return self.update(id)

        # existing spider
        else:
            id = str(spider.get('_id'))

    # perform update action if action is not specified
    if action is None:
        return self.update(id)

    # if the action is not defined as an attribute, return a 400 error
    if not hasattr(self, action):
        return {
            'status': 'ok',
            'code': 400,
            'error': 'action "%s" invalid' % action
        }, 400

    # perform the specified action on the given id
    return getattr(self, action)(id)
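# Minimal sketch of the getattr-based action dispatch used in post() above,
# with a hypothetical stand-in class; it mirrors how an action name in the
# URL is routed to the method of the same name.
class _ExampleAPI:
    def on_crawl(self, id):
        return 'crawl triggered for %s' % id

_api = _ExampleAPI()
_action = 'on_crawl'
if hasattr(_api, _action):
    print(getattr(_api, _action)('5d01f'))  # -> crawl triggered for 5d01f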
def put(self) -> (dict, tuple):
    """
    PUT method for creating a new item.
    :return: the saved item
    """
    args = self.parser.parse_args()

    # build the item from arguments, skipping pagination/control args
    item = {}
    for k in args.keys():
        if k not in DEFAULT_ARGS:
            item[k] = args.get(k)
    item = db_manager.save(col_name=self.col_name, item=item)
    self.after_update()
    return jsonify(item)
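# Illustration of the DEFAULT_ARGS filtering in put() above; the argument
# names here are hypothetical. Pagination/control arguments are stripped so
# they never end up in the stored item.
_DEFAULT_ARGS_EXAMPLE = ['page_num', 'page_size']
_args = {'name': 'my_spider', 'cmd': 'scrapy crawl foo', 'page_num': 1}
_item = {k: v for k, v in _args.items() if k not in _DEFAULT_ARGS_EXAMPLE}
assert _item == {'name': 'my_spider', 'cmd': 'scrapy crawl foo'}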
def get(self, id=None, action=None):
    """
    GET method of SpiderAPI.
    :param id: spider_id
    :param action: action
    """
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one spider
    elif id is not None:
        spider = db_manager.get('spiders', id=id)

        # get the last deploy
        last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
        if last_deploy is not None:
            spider['deploy_ts'] = last_deploy['finish_ts']
        return jsonify(spider)

    # get a list of items
    else:
        items = []

        # get customized spiders from the source folder
        dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
        for _dir in dirs:
            if _dir in IGNORE_DIRS:
                continue

            dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
            dir_name = _dir
            spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

            # new spider
            if spider is None:
                stats = get_file_suffix_stats(dir_path)
                lang = get_lang_by_stats(stats)
                spider = db_manager.save('spiders', {
                    'name': dir_name,
                    'src': dir_path,
                    'lang': lang,
                    'suffix_stats': stats,
                    'type': SpiderType.CUSTOMIZED
                })

            # existing spider
            else:
                # get the last deploy
                last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
                if last_deploy is not None:
                    spider['deploy_ts'] = last_deploy['finish_ts']

                # file stats
                stats = get_file_suffix_stats(dir_path)

                # language
                lang = get_lang_by_stats(stats)

                # update spider data
                db_manager.update_one('spiders', id=str(spider['_id']), values={
                    'lang': lang,
                    'type': SpiderType.CUSTOMIZED,
                    'suffix_stats': stats,
                })

            # append spider
            items.append(spider)

        # get configurable spiders
        for spider in db_manager.list('spiders', {'type': SpiderType.CONFIGURABLE}):
            items.append(spider)

        # get other info
        for i in range(len(items)):
            spider = items[i]

            # get site name
            if spider.get('site') is not None:
                site = db_manager.get('sites', spider['site'])
                if site is not None:
                    items[i]['site_name'] = site['name']

            # get last task timestamp
            last_task = db_manager.get_last_task(spider_id=spider['_id'])
            if last_task is not None:
                items[i]['task_ts'] = last_task['create_ts']

            # ---------
            # stats
            # ---------
            # errors in the last 5 runs
            items[i]['last_5_errors'] = get_last_n_run_errors_count(spider_id=spider['_id'], n=5)
            # tasks in the last 7 days
            items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(spider_id=spider['_id'], n=7)

        return {'status': 'ok', 'items': jsonify(items)}
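# Hedged sketch of the suffix-stats helper referenced above. The real
# get_file_suffix_stats/get_lang_by_stats live elsewhere in the project, so
# this simplified behavior is an assumption for illustration only.
import os

def example_suffix_stats(dir_path: str) -> dict:
    stats = {}
    for _root, _dirs, files in os.walk(dir_path):
        for f in files:
            suffix = os.path.splitext(f)[-1].lstrip('.')
            stats[suffix] = stats.get(suffix, 0) + 1
    return stats  # e.g. {'py': 4, 'cfg': 1} for a typical Scrapy project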
def execute_spider(self, id: str):
    """
    Execute a customized spider as a subprocess and record the task result.
    :param id: spider_id
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')
    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

    # log info
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # create a new task
    db_manager.save('tasks', {
        '_id': task_id,
        'spider_id': ObjectId(id),
        'create_ts': datetime.now(),
        'node_id': 'celery@%s' % hostname,
        'hostname': hostname,
        'log_file_path': log_file_path,
        'status': TaskStatus.PENDING
    })

    # pass the task context to the spider process via environment variables
    env = os.environ.copy()
    env['CRAWLAB_TASK_ID'] = task_id
    if spider.get('col'):
        env['CRAWLAB_COLLECTION'] = spider.get('col')

    # execute the command
    p = subprocess.Popen(command.split(' '),
                         stdout=stdout.fileno(),
                         stderr=stderr.fileno(),
                         cwd=current_working_directory,
                         env=env,
                         bufsize=1)

    # wait for the process to finish
    _stdout, _stderr = p.communicate()

    # get the return code
    code = p.poll()
    if code == 0:
        status = TaskStatus.SUCCESS
    else:
        status = TaskStatus.FAILURE

    # save the task when it is finished
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': datetime.now(),
        'status': status
    })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
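# Sketch of how a spider process might consume the environment variables
# injected by execute_spider above. CRAWLAB_TASK_ID and CRAWLAB_COLLECTION
# are the names set there; the save logic here is an illustrative assumption.
import os

def example_save_item(item: dict):
    item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
    col_name = os.environ.get('CRAWLAB_COLLECTION', 'results')
    # a real spider would insert `item` into the MongoDB collection `col_name`
    return col_name, item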