def unzip_file(zip_src, dst_dir):
    """
    Unzip file
    :param zip_src: source zip file
    :param dst_dir: destination directory
    """
    if zipfile.is_zipfile(zip_src):
        fz = zipfile.ZipFile(zip_src, 'r')
        for file in fz.namelist():
            fz.extract(file, dst_dir)
    else:
        other.info('This is not a zip file')
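# A minimal usage sketch (paths are hypothetical) showing how unzip_file might be
# called when a deployed spider package arrives as a zip archive; if the source is
# not a valid zip archive, the function only logs a message and extracts nothing:
#
#   unzip_file('/tmp/spider_package.zip', '/opt/crawlab/spiders/my_spider')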
def execute_spider(self, id: str, params: str = None):
    """
    Execute spider task.
    :param self: celery task instance
    :param id: spider_id
    :param params: additional command-line params
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')

    # if the command starts with python, use sys.executable to execute in the virtualenv
    if command.startswith('python '):
        command = command.replace('python ', sys.executable + ' ')

    # if the command starts with scrapy, use sys.executable to execute scrapy as a module in the virtualenv
    elif command.startswith('scrapy '):
        command = command.replace('scrapy ', sys.executable + ' -m scrapy ')

    # pass params to the command
    if params is not None:
        command += ' ' + params

    # get task object and return if not found
    task = get_task(task_id)
    if task is None:
        return

    # current working directory
    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

    # log info
    logger.info('task_id: %s' % task_id)
    logger.info('hostname: %s' % hostname)
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(
        log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # update task status as started
    db_manager.update_one('tasks', id=task_id, values={
        'start_ts': datetime.utcnow(),
        'node_id': hostname,
        'hostname': hostname,
        'log_file_path': log_file_path,
        'status': TaskStatus.STARTED
    })

    # pass params as env variables
    env = os.environ.copy()

    # custom environment variables
    if spider.get('envs'):
        for _env in spider.get('envs'):
            env[_env['name']] = _env['value']

    # task id environment variable
    env['CRAWLAB_TASK_ID'] = task_id

    # collection environment variable
    if spider.get('col'):
        env['CRAWLAB_COLLECTION'] = spider.get('col')

        # create index to speed up results data retrieval
        db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)])

    # start process
    cmd_arr = command.split(' ')
    cmd_arr = list(filter(lambda x: x != '', cmd_arr))
    try:
        p = subprocess.Popen(cmd_arr,
                             stdout=stdout.fileno(),
                             stderr=stderr.fileno(),
                             cwd=current_working_directory,
                             env=env,
                             bufsize=1)

        # update pid
        db_manager.update_one(col_name='tasks', id=task_id, values={'pid': p.pid})

        # wait for the process to finish
        _stdout, _stderr = p.communicate()

        # get return code
        code = p.poll()
        if code == 0:
            status = TaskStatus.SUCCESS
        else:
            status = TaskStatus.FAILURE
    except Exception as err:
        logger.error(err)
        stderr.write(str(err))
        status = TaskStatus.FAILURE

    # save task when the task is finished
    finish_ts = datetime.utcnow()
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': finish_ts,
        'duration': (finish_ts - task['create_ts']).total_seconds(),
        'status': status
    })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
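# Illustration of the command rewriting performed above (the interpreter path is a
# hypothetical example): commands that start with "python" or "scrapy" are bound to
# the worker's own interpreter, so the spider runs inside the same virtualenv as the
# worker process.
#
#   'python spider.py'        ->  '/opt/venv/bin/python spider.py'
#   'scrapy crawl my_spider'  ->  '/opt/venv/bin/python -m scrapy crawl my_spider'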
def execute_config_spider(self, id: str, params: str = None):
    """
    Execute configurable spider task.
    :param self: celery task instance
    :param id: spider_id
    :param params: additional command-line params
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)

    # get task object and return if not found
    task = get_task(task_id)
    if task is None:
        return

    # current working directory
    current_working_directory = os.path.join(BASE_DIR, 'spiders')

    # log info
    logger.info('task_id: %s' % task_id)
    logger.info('hostname: %s' % hostname)
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(
        log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # update task status as started
    db_manager.update_one('tasks', id=task_id, values={
        'start_ts': datetime.utcnow(),
        'node_id': hostname,
        'hostname': hostname,
        'log_file_path': log_file_path,
        'status': TaskStatus.STARTED
    })

    # pass params as env variables
    env = os.environ.copy()

    # custom environment variables
    if spider.get('envs'):
        for _env in spider.get('envs'):
            env[_env['name']] = _env['value']

    # task id environment variable
    env['CRAWLAB_TASK_ID'] = task_id

    # collection environment variable
    if spider.get('col'):
        env['CRAWLAB_COLLECTION'] = spider.get('col')

        # create index to speed up results data retrieval
        db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)])

    # mongodb environment variables
    env['MONGO_HOST'] = MONGO_HOST
    env['MONGO_PORT'] = str(MONGO_PORT)
    env['MONGO_DB'] = MONGO_DB
    if MONGO_USERNAME is not None:
        env['MONGO_USERNAME'] = MONGO_USERNAME
    if MONGO_PASSWORD:
        env['MONGO_PASSWORD'] = MONGO_PASSWORD

    # start process
    cmd_arr = [sys.executable, '-m', 'scrapy', 'crawl', 'config_spider']
    try:
        p = subprocess.Popen(cmd_arr,
                             stdout=stdout.fileno(),
                             stderr=stderr.fileno(),
                             cwd=current_working_directory,
                             env=env,
                             bufsize=1)

        # update pid
        db_manager.update_one(col_name='tasks', id=task_id, values={'pid': p.pid})

        # wait for the process to finish
        _stdout, _stderr = p.communicate()

        # get return code
        code = p.poll()
        if code == 0:
            status = TaskStatus.SUCCESS
        else:
            status = TaskStatus.FAILURE
    except Exception as err:
        traceback.print_exc()
        logger.error(err)
        stderr.write(str(err))
        status = TaskStatus.FAILURE

    # save task when the task is finished
    finish_ts = datetime.utcnow()
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': finish_ts,
        'duration': (finish_ts - task['create_ts']).total_seconds(),
        'status': status
    })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
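# Environment variables the configurable spider process receives from the task above
# (names are taken from the code; values are filled in at runtime):
#
#   CRAWLAB_TASK_ID     - id of the current task, used to tag result rows
#   CRAWLAB_COLLECTION  - MongoDB collection the results are written to (if set)
#   MONGO_HOST / MONGO_PORT / MONGO_DB / MONGO_USERNAME / MONGO_PASSWORD
#                       - connection details for the results database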
def update_nodes_status_online(event):
    other.info(f"{event}")
def run_flower():
    p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    # relay flower's output to the application log; the sentinel must be the empty
    # bytes object b'' because stdout is a binary stream
    for line in iter(p.stdout.readline, b''):
        if line.decode('utf-8') != '':
            other.info(line.decode('utf-8'))
import os
import sys
import subprocess

# make sure the working directory is in system path
FILE_DIR = os.path.dirname(os.path.realpath(__file__))
ROOT_PATH = os.path.abspath(os.path.join(FILE_DIR, '..'))
sys.path.append(ROOT_PATH)

from utils.log import other
from config import BROKER_URL

if __name__ == '__main__':
    p = subprocess.Popen(
        [sys.executable, '-m', 'celery', 'flower', '-b', BROKER_URL],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=ROOT_PATH)
    # relay flower's output to the application log; the sentinel must be the empty
    # bytes object b'' because stdout is a binary stream
    for line in iter(p.stdout.readline, b''):
        if line.decode('utf-8') != '':
            other.info(line.decode('utf-8'))
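# This module is intended to be launched as its own process from the project root
# (for example `python bin/run_flower.py`; the file path is an assumption). It starts
# Celery Flower against the configured broker and relays its console output into the
# application log via the `other` logger.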