def get(self, id=None, action=None):
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one deploy
    elif id is not None:
        return jsonify(db_manager.get('deploys', id=id))

    # get a list of items
    else:
        items = db_manager.list('deploys', {})
        deploys = []
        for item in items:
            spider_id = item['spider_id']
            spider = db_manager.get('spiders', id=str(spider_id))
            item['spider_name'] = spider['name']
            deploys.append(item)
        return {
            'status': 'ok',
            'items': jsonify(deploys)
        }
def get(self, id: str = None, action: str = None) -> (dict, tuple):
    """
    GET method of DeployAPI.
    :param id: deploy_id
    :param action: action
    """
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one deploy
    elif id is not None:
        return jsonify(db_manager.get('deploys', id=id))

    # get a list of items
    else:
        items = db_manager.list('deploys', {})
        deploys = []
        for item in items:
            spider_id = item['spider_id']
            spider = db_manager.get('spiders', id=str(spider_id))
            if spider is None:
                # the spider no longer exists: clean up its orphaned deploys and skip
                db_manager.remove('deploys', {'spider_id': spider_id})
                continue
            item['spider_name'] = spider['name']
            deploys.append(item)
        return {
            'status': 'ok',
            'items': jsonify(deploys)
        }
def get_results(self, id: str) -> (dict, tuple):
    """
    Get a list of results crawled in a given task.
    :param id: task_id
    """
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1
    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []
    fields = get_spider_col_fields(col_name)
    items = db_manager.list(col_name, {'task_id': id}, skip=page_size * (page_num - 1), limit=page_size)
    return {
        'status': 'ok',
        'fields': jsonify(fields),
        'total_count': db_manager.count(col_name, {'task_id': id}),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }
def get_log(self, id: (str, ObjectId)) -> (dict, tuple):
    """
    Submit an HTTP request to fetch the log from the node of a given task.
    :param id: task_id
    :return:
    """
    task = db_manager.get(col_name=self.col_name, id=id)
    node = db_manager.get(col_name='nodes', id=task['node_id'])
    r = requests.get('http://%s:%s/api/tasks/%s/on_get_log' % (
        node['ip'],
        node['port'],
        id
    ))
    if r.status_code == 200:
        data = json.loads(r.content.decode('utf-8'))
        return {
            'status': 'ok',
            'log': data.get('log')
        }
    else:
        data = json.loads(r.content)
        return {
            'code': 500,
            'status': 'ok',
            'error': data['error']
        }, 500
def execute_spider(self, id: str):
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')
    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

    # log info
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # create a new task
    db_manager.save('tasks', {
        '_id': task_id,
        'spider_id': ObjectId(id),
        'create_ts': datetime.now(),
        'node_id': hostname,
        'hostname': hostname,
        'log_file_path': log_file_path,
    })

    # execute the command
    env = os.environ.copy()
    env['CRAWLAB_TASK_ID'] = task_id
    env['CRAWLAB_COLLECTION'] = spider.get('col')
    p = subprocess.Popen(command.split(' '),
                         stdout=stdout.fileno(),
                         stderr=stderr.fileno(),
                         cwd=current_working_directory,
                         env=env,
                         bufsize=1)

    # wait for the process to finish (output goes to the log file, not the pipes)
    _stdout, _stderr = p.communicate()

    # save task when the task is finished
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': datetime.now(),
    })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
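# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source): how a spider process
# launched by execute_spider could consume the CRAWLAB_TASK_ID and
# CRAWLAB_COLLECTION environment variables to tag the items it saves, so that
# get_results can later filter the result collection by task_id. The MongoDB
# connection settings, database name, and the save_item helper are assumptions.
import os
from pymongo import MongoClient


def save_item(item: dict):
    """Save one scraped item into the collection assigned to this spider,
    stamped with the id of the task that produced it."""
    task_id = os.environ.get('CRAWLAB_TASK_ID')
    col_name = os.environ.get('CRAWLAB_COLLECTION')
    client = MongoClient('localhost', 27017)  # assumed connection settings
    db = client['crawlab_test']               # assumed database name
    item['task_id'] = task_id
    db[col_name].insert_one(item)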
def get_results(self, id):
    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []
    fields = get_spider_col_fields(col_name)
    items = db_manager.list(col_name, {'task_id': id})
    return jsonify({
        'status': 'ok',
        'fields': fields,
        'items': items
    })
def get_log(self, id):
    task = db_manager.get('tasks', id=id)
    node = db_manager.get('nodes', id=task['node_id'])
    r = requests.get('http://%s:%s/api/tasks/%s/on_get_log' % (node['ip'], node['port'], id))
    if r.status_code == 200:
        data = json.loads(r.content.decode('utf-8'))
        return {
            'status': 'ok',
            'log': data.get('log')
        }
    else:
        data = json.loads(r.content)
        return {
            'code': 500,
            'status': 'ok',
            'error': data['error']
        }, 500
def download_results(self, id: str):
    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return send_csv([], f'results_{col_name}_{round(time())}.csv')
    items = db_manager.list(col_name, {'task_id': id}, limit=999999999)
    fields = get_spider_col_fields(col_name, task_id=id, limit=999999999)
    return send_csv(items,
                    filename=f'results_{col_name}_{round(time())}.csv',
                    fields=fields,
                    encoding='utf-8')
def get(self, id=None, action=None):
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one task
    elif id is not None:
        task = db_manager.get('tasks', id=id)
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            if not task.get('status'):
                task['status'] = _task['status']
            task['result'] = _task['result']
        task['spider_name'] = _spider['name']
        try:
            with open(task['log_file_path']) as f:
                task['log'] = f.read()
        except Exception as err:
            task['log'] = ''
        return jsonify(task)

    # list tasks
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1
    tasks = db_manager.list('tasks', {}, limit=page_size, skip=page_size * (page_num - 1), sort_key='finish_ts')
    items = []
    for task in tasks:
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            task['status'] = _task['status']
        else:
            task['status'] = TaskStatus.UNAVAILABLE
        task['spider_name'] = _spider['name']
        items.append(task)
    return {
        'status': 'ok',
        'total_count': db_manager.count('tasks', {}),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }
def get_tasks(self, id):
    items = db_manager.list('tasks', {'node_id': id}, limit=10, sort_key='create_ts')
    for item in items:
        spider_id = item['spider_id']
        spider = db_manager.get('spiders', id=str(spider_id))
        item['spider_name'] = spider['name']
        _task = db_manager.get('tasks_celery', id=item['_id'])
        if _task:
            item['status'] = _task['status']
        else:
            item['status'] = TaskStatus.UNAVAILABLE
    return {
        'status': 'ok',
        'items': jsonify(items)
    }
def stop(self, id):
    """
    Send stop signal to a specific node
    :param id: task_id
    """
    task = db_manager.get('tasks', id=id)
    node = db_manager.get('nodes', id=task['node_id'])
    r = requests.get('http://%s:%s/api/tasks/%s/on_stop' % (node['ip'], node['port'], id))
    if r.status_code == 200:
        return {
            'status': 'ok',
            'message': 'success'
        }
    else:
        data = json.loads(r.content)
        return {
            'code': 500,
            'status': 'ok',
            'error': data['error']
        }, 500
def get_tasks(self, id):
    items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='finish_ts')
    for item in items:
        spider_id = item['spider_id']
        spider = db_manager.get('spiders', id=str(spider_id))
        item['spider_name'] = spider['name']
        task = db_manager.get('tasks_celery', id=item['_id'])
        if task is not None:
            item['status'] = task['status']
        else:
            item['status'] = TaskStatus.UNAVAILABLE
    return jsonify({
        'status': 'ok',
        'items': items
    })
def deploy(self, id):
    spider = db_manager.get('spiders', id=id)
    nodes = db_manager.list('nodes', {})

    for node in nodes:
        node_id = node['_id']
        output_file_name = '%s_%s.zip' % (
            datetime.now().strftime('%Y%m%d%H%M%S'),
            str(random())[2:12]
        )
        output_file_path = os.path.join(PROJECT_TMP_FOLDER, output_file_name)

        # zip source folder to zip file
        zip_file(source_dir=spider['src'], output_filename=output_file_path)

        # upload to api
        files = {'file': open(output_file_path, 'rb')}
        r = requests.post(
            'http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                node.get('ip'),
                node.get('port'),
                id,
                node_id,
            ),
            files=files)

    return {
        'code': 200,
        'status': 'ok',
        'message': 'deploy success'
    }
def crawl(self, id):
    args = self.parser.parse_args()
    node_id = args.get('node_id')

    if node_id is None:
        return {
            'code': 400,
            'status': 'ok',
            'error': 'node_id cannot be empty'
        }, 400

    # get node from db
    node = db_manager.get('nodes', id=node_id)

    # validate ip and port
    if node.get('ip') is None or node.get('port') is None:
        return {
            'code': 400,
            'status': 'ok',
            'error': 'node ip and port should not be empty'
        }, 400

    # dispatch crawl task
    res = requests.get('http://%s:%s/api/spiders/%s/on_crawl?node_id=%s' % (
        node.get('ip'),
        node.get('port'),
        id,
        node_id
    ))
    data = json.loads(res.content.decode('utf-8'))
    return {
        'code': res.status_code,
        'status': 'ok',
        'error': data.get('error'),
        'task': data.get('task')
    }
def on_crawl(self, id: str) -> (dict, tuple):
    """
    Start a crawl task.
    :param id: spider_id
    :return:
    """
    args = self.parser.parse_args()
    params = args.get('params')

    spider = db_manager.get('spiders', id=ObjectId(id))

    job = execute_spider.delay(id, params)

    # create a new task
    db_manager.save('tasks', {
        '_id': job.id,
        'spider_id': ObjectId(id),
        'cmd': spider.get('cmd'),
        'params': params,
        'create_ts': datetime.utcnow(),
        'status': TaskStatus.PENDING
    })

    return {
        'code': 200,
        'status': 'ok',
        'task': {
            'id': job.id,
            'status': job.status
        }
    }
def get(self, id=None, action=None):
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one node
    elif id is not None:
        return db_manager.get('nodes', id=id)

    # get a list of items
    else:
        # get a list of active nodes from flower and save to db
        update_nodes_status()

        # iterate db nodes to update status
        nodes = db_manager.list('nodes', {})
        return {
            'status': 'ok',
            'items': jsonify(nodes)
        }
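# ---------------------------------------------------------------------------
# Illustrative sketch (assumption, not the original implementation): one way
# update_nodes_status() could work, by asking Flower's HTTP API for the list
# of Celery workers and updating the status of the corresponding node
# documents. The FLOWER_API_ENDPOINT constant and the exact response shape
# returned by Flower are assumptions.
import json
import requests


def update_nodes_status():
    """Fetch worker status from Flower and mark the matching nodes online/offline."""
    res = requests.get('%s/workers?status=1' % FLOWER_API_ENDPOINT)  # assumed endpoint constant
    if res.status_code != 200:
        return
    workers = json.loads(res.content.decode('utf-8'))  # assumed shape: {hostname: is_online}
    for hostname, is_online in workers.items():
        status = NodeStatus.ONLINE if is_online else NodeStatus.OFFLINE
        db_manager.update_one('nodes', id=hostname, values={'status': status})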
def update(self, id: str = None) -> (dict, tuple):
    """
    Helper function for update action given the id.
    :param id:
    :return:
    """
    args = self.parser.parse_args()
    item = db_manager.get(col_name=self.col_name, id=id)
    if item is None:
        return {
            'status': 'ok',
            'code': 401,
            'error': 'item not exists'
        }, 401
    values = {}
    for k in args.keys():
        if k not in DEFAULT_ARGS:
            if args.get(k) is not None:
                values[k] = args.get(k)
    item = db_manager.update_one(col_name=self.col_name, id=id, values=values)

    # execute after_update hook
    self.after_update(id)

    return item
def deploy(self, id: str) -> (dict, tuple):
    """
    Submit HTTP requests to deploy the given spider to all online nodes.
    :param id:
    :return:
    """
    spider = db_manager.get('spiders', id=id)
    nodes = db_manager.list('nodes', {'status': NodeStatus.ONLINE})

    for node in nodes:
        node_id = node['_id']

        output_file_name = '%s_%s.zip' % (
            datetime.now().strftime('%Y%m%d%H%M%S'),
            str(random())[2:12]
        )
        output_file_path = os.path.join(PROJECT_TMP_FOLDER, output_file_name)

        # zip source folder to zip file
        zip_file(source_dir=spider['src'], output_filename=output_file_path)

        # upload to api
        files = {'file': open(output_file_path, 'rb')}
        r = requests.post(
            'http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                node.get('ip'),
                node.get('port'),
                id,
                node_id,
            ),
            files=files)

        # TODO: checkpoint for errors

    return {
        'code': 200,
        'status': 'ok',
        'message': 'deploy success'
    }
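# ---------------------------------------------------------------------------
# Illustrative sketch (assumption, not the original helper): a minimal
# zip_file(source_dir, output_filename) as used by deploy above, packing a
# spider's source folder into the archive that is posted to each node's
# deploy_file endpoint. Paths are stored relative to the parent of source_dir
# so that unzipping yields a single top-level folder, which matches how
# deploy_file picks os.listdir(dir_path)[0] as the copy source.
import os
import zipfile


def zip_file(source_dir: str, output_filename: str):
    """Recursively zip source_dir into output_filename."""
    base_dir = os.path.dirname(source_dir.rstrip(os.sep))
    with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _dirs, files in os.walk(source_dir):
            for name in files:
                file_path = os.path.join(root, name)
                # keep the spider folder itself as the top-level entry in the archive
                zf.write(file_path, arcname=os.path.relpath(file_path, base_dir))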
def on_stop(self, id):
    """
    Stop the task in progress.
    :param id:
    :return:
    """
    task = db_manager.get('tasks', id=id)
    celery_app.control.revoke(id, terminate=True)
    db_manager.update_one('tasks', id=id, values={'status': TaskStatus.REVOKED})

    # kill process
    if task.get('pid'):
        pid = task.get('pid')
        if 'win32' in sys.platform:
            os.popen('taskkill /pid:' + str(pid))
        else:
            # unix system
            os.kill(pid, SIGKILL)

    return {
        'id': id,
        'status': 'ok',
    }
def deploy_file(self, id: str = None) -> (dict, tuple):
    """
    Receive an uploaded deploy package, unzip it, and copy it to the
    destination directory on this node.
    :param id: spider_id
    """
    args = parser.parse_args()
    node_id = request.args.get('node_id')
    f = args.file

    if get_file_suffix(f.filename) != 'zip':
        return {
            'status': 'ok',
            'error': 'file type mismatch'
        }, 400

    # save zip file on temp folder
    file_path = '%s/%s' % (PROJECT_TMP_FOLDER, f.filename)
    with open(file_path, 'wb') as fw:
        fw.write(f.stream.read())

    # unzip zip file
    dir_path = file_path.replace('.zip', '')
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
    unzip_file(file_path, dir_path)

    # get spider and version
    spider = db_manager.get(col_name=self.col_name, id=id)
    if spider is None:
        return None, 400

    # make source / destination
    src = os.path.join(dir_path, os.listdir(dir_path)[0])
    # src = dir_path
    dst = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

    # logging info
    current_app.logger.info('src: %s' % src)
    current_app.logger.info('dst: %s' % dst)

    # remove if the target folder exists
    if os.path.exists(dst):
        shutil.rmtree(dst)

    # copy from source to destination
    shutil.copytree(src=src, dst=dst)

    # save to db
    # TODO: task management for deployment
    db_manager.save('deploys', {
        'spider_id': ObjectId(id),
        'node_id': node_id,
        'finish_ts': datetime.utcnow()
    })

    return {
        'code': 200,
        'status': 'ok',
        'message': 'deploy success'
    }
def on_get_log(self, id):
    try:
        task = db_manager.get('tasks', id=id)
        with open(task['log_file_path']) as f:
            log = f.read()
        return {
            'status': 'ok',
            'log': log
        }
    except Exception as err:
        return {
            'code': 500,
            'status': 'ok',
            'error': str(err)
        }, 500
def get_task(id: str):
    i = 0
    while i < 5:
        task = db_manager.get('tasks', id=id)
        if task is not None:
            return task
        i += 1
        sleep(1)
    return None
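# ---------------------------------------------------------------------------
# Illustrative usage (assumption, not original source): get_task is a small
# polling helper that retries up to 5 times at 1-second intervals, covering
# the window between a Celery job being queued and the worker writing the
# task document. A caller that has just dispatched a crawl might use it like
# this; the spider_id value below is hypothetical.
job = execute_spider.delay('5c9b2f3a1d41c8000a7e9f01')  # hypothetical spider_id
task = get_task(job.id)
if task is None:
    print('task document was not created within 5 seconds')
else:
    print('task %s created with status %s' % (task['_id'], task.get('status')))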
def get(self, id=None, action=None):
    args = self.parser.parse_args()

    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # list items
    elif id is None:
        # filter
        cond = {}
        if args.get('filter') is not None:
            cond = args.filter
            # cond = json.loads(args.filter)

        # page number
        page = 1
        if args.get('page') is not None:
            page = args.page
            # page = int(args.page)

        # page size
        page_size = 10
        if args.get('page_size') is not None:
            page_size = args.page_size
            # page_size = int(args.page_size)

        # TODO: sort functionality

        # total count
        total_count = db_manager.count(col_name=self.col_name, cond=cond)

        # items
        items = db_manager.list(col_name=self.col_name, cond=cond, skip=(page - 1) * page_size, limit=page_size)

        # TODO: getting status for node
        return jsonify({
            'status': 'ok',
            'total_count': total_count,
            'page': page,
            'page_size': page_size,
            'items': items
        })

    # get item by id
    else:
        return jsonify(db_manager.get(col_name=self.col_name, id=id))
def get_results(self, id: str) -> (dict, tuple):
    """
    Get a list of results crawled in a given task.
    :param id: task_id
    """
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []
    fields = get_spider_col_fields(col_name)
    fields = list(set(fields) - set(IGNORE_FIELD))
    items = db_manager.list(col_name, {'task_id': id})

    # truncate long values and drop ignored fields to keep the payload small
    adjust_items = []
    for item in items:
        adjust_item = {}
        for key, value in item.items():
            if not isinstance(value, str):
                continue
            if key in IGNORE_FIELD:
                continue
            if len(value) > 500:
                value = value[:500] + '...'
            adjust_item[key] = value
        adjust_items.append(adjust_item)

    total_count = db_manager.count(col_name, {'task_id': id})

    # number of pages, rounded up
    page_num = (len(adjust_items) + page_size - 1) // page_size

    return {
        'status': 'ok',
        'fields': jsonify(fields),
        'total_count': len(adjust_items),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(adjust_items)
    }
def get(self, id=None, action=None):
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one task
    elif id is not None:
        task = db_manager.get('tasks', id=id)
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            task['status'] = _task['status']
            task['result'] = _task['result']
        else:
            task['status'] = TaskStatus.UNAVAILABLE
        task['spider_name'] = _spider['name']
        try:
            with open(task['log_file_path']) as f:
                task['log'] = f.read()
        except Exception as err:
            task['log'] = ''
        return jsonify(task)

    # get a list of tasks
    tasks = db_manager.list('tasks', {}, limit=1000, sort_key='finish_ts')
    items = []
    for task in tasks:
        _task = db_manager.get('tasks_celery', id=task['_id'])
        _spider = db_manager.get('spiders', id=str(task['spider_id']))
        if _task:
            task['status'] = _task['status']
        else:
            task['status'] = TaskStatus.UNAVAILABLE
        task['spider_name'] = _spider['name']
        items.append(task)
    return jsonify({
        'status': 'ok',
        'items': items
    })
def get_tasks(self, id):
    items = db_manager.list('tasks', {'node_id': id}, limit=10, sort_key='create_ts')
    for item in items:
        spider_id = item['spider_id']
        spider = db_manager.get('spiders', id=str(spider_id))
        item['spider_name'] = spider['name']
    return {
        'status': 'ok',
        'items': jsonify(items)
    }
def get(self, id=None, action=None):
    # action by id
    if action is not None:
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400
        return getattr(self, action)(id)

    # get one spider
    elif id is not None:
        return jsonify(db_manager.get('spiders', id=id))

    # get a list of items
    else:
        items = []
        dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
        for _dir in dirs:
            if _dir in IGNORE_DIRS:
                continue

            dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
            dir_name = _dir
            spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

            # new spider
            if spider is None:
                stats = get_file_suffix_stats(dir_path)
                lang = get_lang_by_stats(stats)
                db_manager.save('spiders', {
                    'name': dir_name,
                    'src': dir_path,
                    'lang': lang,
                    'suffix_stats': stats,
                })
                # re-fetch the newly created spider so it is included in the response
                spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

            # existing spider
            else:
                stats = get_file_suffix_stats(dir_path)
                lang = get_lang_by_stats(stats)
                db_manager.update_one('spiders', id=str(spider['_id']), values={
                    'lang': lang,
                    'suffix_stats': stats,
                })

            # append spider
            items.append(spider)

        return jsonify({
            'status': 'ok',
            'items': items
        })
def get_results(self, id):
    args = self.parser.parse_args()
    page_size = args.get('page_size') or 10
    page_num = args.get('page_num') or 1
    task = db_manager.get('tasks', id=id)
    spider = db_manager.get('spiders', id=task['spider_id'])
    col_name = spider.get('col')
    if not col_name:
        return []
    fields = get_spider_col_fields(col_name)
    items = db_manager.list(col_name, {'task_id': id})
    return {
        'status': 'ok',
        'fields': jsonify(fields),
        'total_count': db_manager.count(col_name, {'task_id': id}),
        'page_num': page_num,
        'page_size': page_size,
        'items': jsonify(items)
    }
def get_deploys(self, id):
    items = db_manager.list('deploys', {'node_id': id}, limit=10, sort_key='finish_ts')
    deploys = []
    for item in items:
        spider_id = item['spider_id']
        spider = db_manager.get('spiders', id=str(spider_id))
        item['spider_name'] = spider['name']
        deploys.append(item)
    return {
        'status': 'ok',
        'items': jsonify(deploys)
    }
def on_get_log(self, id: (str, ObjectId)) -> (dict, tuple):
    """
    Get the log of given task_id
    :param id: task_id
    """
    try:
        task = db_manager.get(col_name=self.col_name, id=id)
        with open(task['log_file_path']) as f:
            log = f.read()
        return {
            'status': 'ok',
            'log': log
        }
    except Exception as err:
        return {
            'code': 500,
            'status': 'ok',
            'error': str(err)
        }, 500