def on_stop(self, id):
    """
    Stop the task in progress.
    :param id: task_id
    :return:
    """
    task = db_manager.get('tasks', id=id)

    # revoke the celery task
    celery_app.control.revoke(id, terminate=True)

    # mark the task as revoked
    db_manager.update_one('tasks', id=id, values={'status': TaskStatus.REVOKED})

    # kill the spider process if a pid was recorded
    if task.get('pid'):
        pid = task.get('pid')
        if 'win32' in sys.platform:
            os.popen('taskkill /f /pid ' + str(pid))
        else:
            # unix system
            os.kill(pid, SIGKILL)

    return {
        'id': id,
        'status': 'ok',
    }
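# A minimal sketch of a safer cross-platform kill, assuming the same pid
# bookkeeping as on_stop above; kill_process is a hypothetical helper, not
# part of the original codebase. It handles the race where the process has
# already exited between the check and the kill.
import os
import signal
import subprocess
import sys


def kill_process(pid: int):
    try:
        if sys.platform.startswith('win32'):
            # /F forces termination; /PID selects the process by id
            subprocess.run(['taskkill', '/F', '/PID', str(pid)], check=False)
        else:
            os.kill(pid, signal.SIGKILL)
    except (ProcessLookupError, PermissionError) as err:
        # the process may already be gone, or owned by another user
        logger.warning('could not kill pid %s: %s' % (pid, err))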
def execute_spider(self, id: str):
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')

    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

    # log info
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # create a new task
    db_manager.save('tasks', {
        '_id': task_id,
        'spider_id': ObjectId(id),
        'create_ts': datetime.now(),
        'node_id': hostname,
        'hostname': hostname,
        'log_file_path': log_file_path,
    })

    # execute the command
    env = os.environ.copy()
    env['CRAWLAB_TASK_ID'] = task_id
    env['CRAWLAB_COLLECTION'] = spider.get('col')
    p = subprocess.Popen(command.split(' '),
                         stdout=stdout.fileno(),
                         stderr=stderr.fileno(),
                         cwd=current_working_directory,
                         env=env,
                         bufsize=1)

    # wait for the process to finish and collect its output
    _stdout, _stderr = p.communicate()

    # save the finish timestamp when the task is done
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': datetime.now(),
    })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
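# Usage sketch: enqueueing the task above from the web API — the call style
# is an assumption, not taken from this file. As a Celery task, .delay()
# schedules execute_spider on a worker and returns an AsyncResult whose id
# is the task_id recorded in the tasks collection.
spider_id = '<spider-id>'  # hypothetical spider id string
async_result = execute_spider.delay(spider_id)
print(async_result.id)  # equals self.request.id inside the worker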
def update_envs(self, id: str):
    """
    Update environment variables
    :param id: spider_id
    """
    args = self.parser.parse_args()
    envs = json.loads(args.envs)
    db_manager.update_one(col_name='spiders', id=id, values={'envs': envs})
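# Example request for update_envs, inferred from how execute_spider consumes
# spider['envs'] (a list of {'name', 'value'} dicts). The URL and the use of
# POST for action dispatch are assumptions, not taken from this file.
import json
import requests

requests.post(
    'http://localhost:8000/api/spiders/<spider_id>/update_envs',  # hypothetical endpoint
    data={'envs': json.dumps([
        {'name': 'HTTP_PROXY', 'value': 'http://127.0.0.1:1080'},
    ])},
)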
def update_detail_fields(self, id: str):
    """
    Update detail page fields for configurable spiders
    :param id: spider_id
    """
    args = self.parser.parse_args()
    detail_fields = json.loads(args.detail_fields)
    db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})
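# The handlers above rely on self.parser; a minimal sketch of how that parser
# could be declared with flask_restful.reqparse. The exact argument list is an
# assumption — both fields arrive as JSON-encoded strings and are decoded with
# json.loads in the handlers.
from flask_restful import reqparse

parser = reqparse.RequestParser()
parser.add_argument('envs', type=str)
parser.add_argument('detail_fields', type=str)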
def get(self, id=None, action=None):
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one spider
    elif id is not None:
        return jsonify(db_manager.get('spiders', id=id))

    # get a list of items
    else:
        items = []
        dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
        for _dir in dirs:
            if _dir in IGNORE_DIRS:
                continue

            dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
            dir_name = _dir
            spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

            # new spider: keep the saved document so it can be appended below
            if spider is None:
                stats = get_file_suffix_stats(dir_path)
                lang = get_lang_by_stats(stats)
                spider = db_manager.save('spiders', {
                    'name': dir_name,
                    'src': dir_path,
                    'lang': lang,
                    'suffix_stats': stats,
                })

            # existing spider
            else:
                stats = get_file_suffix_stats(dir_path)
                lang = get_lang_by_stats(stats)
                db_manager.update_one('spiders', id=str(spider['_id']), values={
                    'lang': lang,
                    'suffix_stats': stats,
                })

            # append spider
            items.append(spider)

        return jsonify({
            'status': 'ok',
            'items': items
        })
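# Minimal sketches of the two helpers used above, assuming they live in a
# utils module; the real implementations may differ. get_file_suffix_stats
# counts files per extension, and get_lang_by_stats picks the dominant
# extension it recognizes.
import os
from collections import Counter

SUFFIX_LANG = {'.py': 'python', '.js': 'javascript', '.java': 'java', '.go': 'go'}


def get_file_suffix_stats(dir_path: str) -> dict:
    stats = Counter()
    for root, _dirs, files in os.walk(dir_path):
        for file_name in files:
            suffix = os.path.splitext(file_name)[1]
            if suffix:
                stats[suffix] += 1
    return dict(stats)


def get_lang_by_stats(stats: dict) -> str:
    # most frequent recognized suffix wins; unknown suffixes fall through
    for suffix, _count in sorted(stats.items(), key=lambda kv: -kv[1]):
        if suffix in SUFFIX_LANG:
            return SUFFIX_LANG[suffix]
    return None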
def stop(self, id):
    """
    Stop the task in progress.
    :param id: task_id
    :return:
    """
    celery_app.control.revoke(id, terminate=True)
    db_manager.update_one('tasks', id=id, values={'status': TaskStatus.REVOKED})
    return {
        'id': id,
        'status': 'ok',
    }
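# Note on revoke semantics: terminate=True asks the worker to kill the child
# process running the task, with SIGTERM by default. A sketch of a harder
# stop using Celery's documented signal option — an alternative, not what the
# code above does:
task_id = '<celery-task-id>'  # hypothetical id
celery_app.control.revoke(task_id, terminate=True, signal='SIGKILL')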
def update(self, id: str = None) -> (dict, tuple):
    """
    Helper function for the update action given the id.
    :param id: item id
    :return:
    """
    args = self.parser.parse_args()
    item = db_manager.get(col_name=self.col_name, id=id)
    if item is None:
        return {
            'status': 'ok',
            'code': 401,
            'error': 'item does not exist'
        }, 401

    values = {}
    for k in args.keys():
        if k not in DEFAULT_ARGS:
            # skip fields that were not provided in the request
            if args.get(k) is not None:
                values[k] = args.get(k)
    item = db_manager.update_one(col_name=self.col_name, id=id, values=values)

    # execute after_update hook
    self.after_update(id)

    return item
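# after_update is a subclass hook; a minimal sketch of its assumed default
# shape — a no-op on the base API class that resource-specific APIs override,
# e.g. to re-register a schedule after a spider changes.
def after_update(self, id: str = None):
    pass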
def update(self, id=None):
    args = self.parser.parse_args()
    item = db_manager.get(col_name=self.col_name, id=id)
    if item is None:
        return {
            'status': 'ok',
            'code': 401,
            'error': 'item does not exist'
        }, 401

    values = {}
    for k in args.keys():
        if k not in DEFAULT_ARGS:
            # note: unlike the variant above, unset args are written as None here
            values[k] = args.get(k)
    item = db_manager.update_one(col_name=self.col_name, id=id, values=values)
    return item
def execute_spider(self, id: str, params: str = None):
    """
    Execute spider task.
    :param self:
    :param id: spider_id
    :param params: additional command-line params
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')

    # if the command starts with python, use sys.executable to run it in the current virtualenv
    if command.startswith('python '):
        command = command.replace('python ', sys.executable + ' ')
    # if the command starts with scrapy, run scrapy as a module in the current virtualenv
    elif command.startswith('scrapy '):
        command = command.replace('scrapy ', sys.executable + ' -m scrapy ')

    # pass params to the command
    if params is not None:
        command += ' ' + params

    # get task object and return if not found
    task = get_task(task_id)
    if task is None:
        return

    # current working directory
    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

    # log info
    logger.info('task_id: %s' % task_id)
    logger.info('hostname: %s' % hostname)
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # update task status as started
    db_manager.update_one('tasks', id=task_id, values={
        'start_ts': datetime.utcnow(),
        'node_id': hostname,
        'hostname': hostname,
        'log_file_path': log_file_path,
        'status': TaskStatus.STARTED
    })

    # pass params as env variables
    env = os.environ.copy()

    # custom environment variables
    if spider.get('envs'):
        for _env in spider.get('envs'):
            env[_env['name']] = _env['value']

    # task id environment variable
    env['CRAWLAB_TASK_ID'] = task_id

    # collection environment variable
    if spider.get('col'):
        env['CRAWLAB_COLLECTION'] = spider.get('col')
        # create index to speed up results data retrieval
        db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)])

    # start process
    cmd_arr = command.split(' ')
    cmd_arr = list(filter(lambda x: x != '', cmd_arr))
    try:
        p = subprocess.Popen(cmd_arr,
                             stdout=stdout.fileno(),
                             stderr=stderr.fileno(),
                             cwd=current_working_directory,
                             env=env,
                             bufsize=1)

        # update pid
        db_manager.update_one(col_name='tasks', id=task_id, values={'pid': p.pid})

        # wait for the process to finish and collect its output
        _stdout, _stderr = p.communicate()

        # get return code
        code = p.poll()
        if code == 0:
            status = TaskStatus.SUCCESS
        else:
            status = TaskStatus.FAILURE
    except Exception as err:
        logger.error(err)
        stderr.write(str(err))
        status = TaskStatus.FAILURE

    # save task when the task is finished
    finish_ts = datetime.utcnow()
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': finish_ts,
        'duration': (finish_ts - task['create_ts']).total_seconds(),
        'status': status
    })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
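# A sketch of the get_task helper used above, assuming its purpose is to
# absorb the race between the API inserting the task document and the worker
# starting; the retry count and sleep interval are assumptions.
import time


def get_task(task_id: str, retries: int = 5):
    for _ in range(retries):
        task = db_manager.get('tasks', id=task_id)
        if task is not None:
            return task
        time.sleep(1)  # the document may not be written yet
    return None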
def execute_config_spider(self, id: str, params: str = None):
    # note: params is currently unused for configurable spiders
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)

    # get task object and return if not found
    task = get_task(task_id)
    if task is None:
        return

    # current working directory
    current_working_directory = os.path.join(BASE_DIR, 'spiders')

    # log info
    logger.info('task_id: %s' % task_id)
    logger.info('hostname: %s' % hostname)
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # update task status as started
    db_manager.update_one('tasks', id=task_id, values={
        'start_ts': datetime.utcnow(),
        'node_id': hostname,
        'hostname': hostname,
        'log_file_path': log_file_path,
        'status': TaskStatus.STARTED
    })

    # pass params as env variables
    env = os.environ.copy()

    # custom environment variables
    if spider.get('envs'):
        for _env in spider.get('envs'):
            env[_env['name']] = _env['value']

    # task id environment variable
    env['CRAWLAB_TASK_ID'] = task_id

    # collection environment variable
    if spider.get('col'):
        env['CRAWLAB_COLLECTION'] = spider.get('col')
        # create index to speed up results data retrieval
        db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)])

    # mongodb environment variables
    env['MONGO_HOST'] = MONGO_HOST
    env['MONGO_PORT'] = str(MONGO_PORT)
    env['MONGO_DB'] = MONGO_DB
    if MONGO_USERNAME is not None:
        env['MONGO_USERNAME'] = MONGO_USERNAME
    if MONGO_PASSWORD:
        env['MONGO_PASSWORD'] = MONGO_PASSWORD

    cmd_arr = [sys.executable, '-m', 'scrapy', 'crawl', 'config_spider']
    try:
        p = subprocess.Popen(cmd_arr,
                             stdout=stdout.fileno(),
                             stderr=stderr.fileno(),
                             cwd=current_working_directory,
                             env=env,
                             bufsize=1)

        # update pid
        db_manager.update_one(col_name='tasks', id=task_id, values={'pid': p.pid})

        # wait for the process to finish and collect its output
        _stdout, _stderr = p.communicate()

        # get return code
        code = p.poll()
        if code == 0:
            status = TaskStatus.SUCCESS
        else:
            status = TaskStatus.FAILURE
    except Exception as err:
        traceback.print_exc()
        logger.error(err)
        stderr.write(str(err))
        status = TaskStatus.FAILURE

    # save task when the task is finished
    finish_ts = datetime.utcnow()
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': finish_ts,
        'duration': (finish_ts - task['create_ts']).total_seconds(),
        'status': status
    })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
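# A sketch of how the config_spider process might consume the MONGO_* and
# CRAWLAB_* environment variables set above — an assumed Scrapy item
# pipeline, not taken from this file (auth variables omitted for brevity).
import os
from pymongo import MongoClient


class MongoPipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient(
            host=os.environ.get('MONGO_HOST'),
            port=int(os.environ.get('MONGO_PORT', 27017)),
        )
        self.col = self.client[os.environ.get('MONGO_DB')][os.environ.get('CRAWLAB_COLLECTION')]

    def process_item(self, item, spider):
        # tag each result with the task id so the web app can fetch it later
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()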
def get(self, id=None, action=None):
    """
    GET method of SpiderAPI.
    :param id: spider_id
    :param action: action
    """
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one spider
    elif id is not None:
        spider = db_manager.get('spiders', id=id)

        # get last deploy
        last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
        if last_deploy is not None:
            spider['deploy_ts'] = last_deploy['finish_ts']

        return jsonify(spider)

    # get a list of items
    else:
        items = []

        # get customized spiders
        dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
        for _dir in dirs:
            if _dir in IGNORE_DIRS:
                continue

            dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
            dir_name = _dir
            spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

            # new spider
            if spider is None:
                stats = get_file_suffix_stats(dir_path)
                lang = get_lang_by_stats(stats)
                spider = db_manager.save('spiders', {
                    'name': dir_name,
                    'src': dir_path,
                    'lang': lang,
                    'suffix_stats': stats,
                    'type': SpiderType.CUSTOMIZED
                })

            # existing spider
            else:
                # get last deploy
                last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
                if last_deploy is not None:
                    spider['deploy_ts'] = last_deploy['finish_ts']

                # file stats
                stats = get_file_suffix_stats(dir_path)

                # language
                lang = get_lang_by_stats(stats)

                # spider type
                type_ = SpiderType.CUSTOMIZED

                # update spider data
                db_manager.update_one('spiders', id=str(spider['_id']), values={
                    'lang': lang,
                    'type': type_,
                    'suffix_stats': stats,
                })

            # append spider
            items.append(spider)

        # get configurable spiders
        for spider in db_manager.list('spiders', {'type': SpiderType.CONFIGURABLE}):
            items.append(spider)

        # get other info
        for i in range(len(items)):
            spider = items[i]

            # get site
            if spider.get('site') is not None:
                site = db_manager.get('sites', spider['site'])
                if site is not None:
                    items[i]['site_name'] = site['name']

            # get last task
            last_task = db_manager.get_last_task(spider_id=spider['_id'])
            if last_task is not None:
                items[i]['task_ts'] = last_task['create_ts']

            # ---------
            # stats
            # ---------
            # errors in the last 5 runs
            items[i]['last_5_errors'] = get_last_n_run_errors_count(spider_id=spider['_id'], n=5)
            # tasks in the last 7 days
            items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(spider_id=spider['_id'], n=7)

        return jsonify({
            'status': 'ok',
            'items': items
        })
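# A minimal sketch of get_last_n_day_tasks_count, assuming db_manager exposes
# a count(col_name, filter) helper; the real implementation may aggregate
# differently.
from datetime import datetime, timedelta


def get_last_n_day_tasks_count(spider_id, n: int) -> int:
    return db_manager.count('tasks', {
        'spider_id': spider_id,
        'create_ts': {'$gte': datetime.utcnow() - timedelta(days=n)},
    })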
def update_nodes_status(event):
    node_id = event.get('hostname')
    db_manager.update_one('nodes', id=node_id, values={
        'status': NodeStatus.ONLINE
    })
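# update_nodes_status is an event handler; a sketch of how it could be wired
# to Celery's event stream using the standard celery.events Receiver API —
# the worker-heartbeat event carries the hostname used as the node id.
def monitor_nodes(celery_app):
    with celery_app.connection() as conn:
        recv = celery_app.events.Receiver(conn, handlers={
            'worker-heartbeat': update_nodes_status,
        })
        recv.capture(limit=None, timeout=None, wakeup=True)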
def update_envs(self, id: str):
    args = self.parser.parse_args()
    envs = json.loads(args.envs)
    db_manager.update_one(col_name='spiders', id=id, values={'envs': envs})