Example No. 1
import zipfile

from utils.log import other


def unzip_file(zip_src, dst_dir):
    """
    Unzip file
    :param zip_src: source zip file
    :param dst_dir: destination directory
    """
    if zipfile.is_zipfile(zip_src):
        # extract every member of the archive into the destination directory
        with zipfile.ZipFile(zip_src, 'r') as fz:
            for file in fz.namelist():
                fz.extract(file, dst_dir)
    else:
        other.info('This is not a zip file')
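A minimal usage sketch (the paths here are hypothetical placeholders):

# hypothetical paths, for illustration only
unzip_file('/tmp/spider_project.zip', '/tmp/spider_project')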
Example No. 2
def execute_spider(self, id: str, params: str = None):
    """
    Execute spider task.
    :param self: bound celery task instance
    :param id: spider id
    :param params: additional command-line parameters (optional)
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')

    # if the command starts with python, use sys.executable so it runs in the virtualenv
    if command.startswith('python '):
        command = command.replace('python ', sys.executable + ' ')

    # if the command starts with scrapy, run scrapy as a module in the virtualenv
    elif command.startswith('scrapy '):
        command = command.replace('scrapy ', sys.executable + ' -m scrapy ')

    # pass params to the command
    if params is not None:
        command += ' ' + params

    # get task object and return if not found
    task = get_task(task_id)
    if task is None:
        return

    # current working directory
    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER,
                                             str(spider.get('_id')))

    # log info
    logger.info('task_id: %s' % task_id)
    logger.info('hostname: %s' % hostname)
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams (stdout and stderr append to the same log file)
    log_file_path = os.path.join(
        log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # update task status as started
    db_manager.update_one('tasks',
                          id=task_id,
                          values={
                              'start_ts': datetime.utcnow(),
                              'node_id': hostname,
                              'hostname': hostname,
                              'log_file_path': log_file_path,
                              'status': TaskStatus.STARTED
                          })

    # pass params as env variables
    env = os.environ.copy()

    # custom environment variables
    if spider.get('envs'):
        for _env in spider.get('envs'):
            env[_env['name']] = _env['value']

    # task id environment variable
    env['CRAWLAB_TASK_ID'] = task_id

    # collection environment variable
    if spider.get('col'):
        env['CRAWLAB_COLLECTION'] = spider.get('col')

        # create an index to speed up results retrieval
        db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)])

    # start process
    cmd_arr = command.split(' ')
    cmd_arr = list(filter(lambda x: x != '', cmd_arr))
    try:
        p = subprocess.Popen(cmd_arr,
                             stdout=stdout.fileno(),
                             stderr=stderr.fileno(),
                             cwd=current_working_directory,
                             env=env,
                             bufsize=1)

        # update pid
        db_manager.update_one(col_name='tasks',
                              id=task_id,
                              values={'pid': p.pid})

        # get output from the process
        _stdout, _stderr = p.communicate()

        # get return code
        code = p.poll()
        if code == 0:
            status = TaskStatus.SUCCESS
        else:
            status = TaskStatus.FAILURE
    except Exception as err:
        logger.error(err)
        stderr.write(str(err))
        status = TaskStatus.FAILURE

    # save task when the task is finished
    finish_ts = datetime.utcnow()
    db_manager.update_one('tasks',
                          id=task_id,
                          values={
                              'finish_ts': finish_ts,
                              'duration': (finish_ts - task['create_ts']).total_seconds(),
                              'status': status
                          })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
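The get_task helper is not shown in these examples. A minimal sketch of what it might look like, assuming it simply wraps db_manager (the actual implementation in the project may differ):

def get_task(task_id):
    # look the task up in the 'tasks' collection; None means the record
    # was never created or has been removed
    task = db_manager.get('tasks', id=task_id)
    if task is None:
        logger.warning('task not found: %s' % task_id)
    return task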
Example No. 3
def execute_config_spider(self, id: str, params: str = None):
    """
    Execute configurable spider task.
    :param self: bound celery task instance
    :param id: spider id
    :param params: additional command-line parameters (unused here)
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)

    # get task object and return if not found
    task = get_task(task_id)
    if task is None:
        return

    # current working directory
    current_working_directory = os.path.join(BASE_DIR, 'spiders')

    # log info
    logger.info('task_id: %s' % task_id)
    logger.info('hostname: %s' % hostname)
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(
        log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # update task status as started
    db_manager.update_one('tasks',
                          id=task_id,
                          values={
                              'start_ts': datetime.utcnow(),
                              'node_id': hostname,
                              'hostname': hostname,
                              'log_file_path': log_file_path,
                              'status': TaskStatus.STARTED
                          })

    # pass params as env variables
    env = os.environ.copy()

    # custom environment variables
    if spider.get('envs'):
        for _env in spider.get('envs'):
            env[_env['name']] = _env['value']

    # task id environment variable
    env['CRAWLAB_TASK_ID'] = task_id

    # collection environment variable
    if spider.get('col'):
        env['CRAWLAB_COLLECTION'] = spider.get('col')

        # create an index to speed up results retrieval
        db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)])

    # mongodb environment variables
    env['MONGO_HOST'] = MONGO_HOST
    env['MONGO_PORT'] = str(MONGO_PORT)
    env['MONGO_DB'] = MONGO_DB
    if MONGO_USERNAME is not None:
        env['MONGO_USERNAME'] = MONGO_USERNAME
    if MONGO_PASSWORD:
        env['MONGO_PASSWORD'] = MONGO_PASSWORD

    cmd_arr = [sys.executable, '-m', 'scrapy', 'crawl', 'config_spider']
    try:
        p = subprocess.Popen(cmd_arr,
                             stdout=stdout.fileno(),
                             stderr=stderr.fileno(),
                             cwd=current_working_directory,
                             env=env,
                             bufsize=1)

        # update pid
        db_manager.update_one(col_name='tasks',
                              id=task_id,
                              values={'pid': p.pid})

        # get output from the process
        _stdout, _stderr = p.communicate()

        # get return code
        code = p.poll()
        if code == 0:
            status = TaskStatus.SUCCESS
        else:
            status = TaskStatus.FAILURE
    except Exception as err:
        traceback.print_exc()
        logger.error(err)
        stderr.write(str(err))
        status = TaskStatus.FAILURE

    # save task when the task is finished
    finish_ts = datetime.utcnow()
    db_manager.update_one('tasks',
                          id=task_id,
                          values={
                              'finish_ts': finish_ts,
                              'duration': (finish_ts - task['create_ts']).total_seconds(),
                              'status': status
                          })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
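Unlike execute_spider, this task exports the MongoDB connection settings through environment variables so that the generic config_spider process can write results back on its own. A sketch of how the spider side might read them (the default values are assumptions for illustration):

import os

MONGO_HOST = os.environ.get('MONGO_HOST', 'localhost')
MONGO_PORT = int(os.environ.get('MONGO_PORT', '27017'))
MONGO_DB = os.environ.get('MONGO_DB', 'test')
# the collection name and task id let the spider tag every scraped item
CRAWLAB_COLLECTION = os.environ.get('CRAWLAB_COLLECTION')
CRAWLAB_TASK_ID = os.environ.get('CRAWLAB_TASK_ID')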
Example No. 4
def update_nodes_status_online(event):
    # log the raw event payload as it arrives
    other.info(f"{event}")
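Judging by its name, this callback is meant to be wired into Celery's event machinery. A minimal sketch of how such a handler is typically registered, assuming a Celery app instance named celery_app (not shown in the example):

def monitor_events(celery_app):
    # open a broker connection and route 'worker-online' events to the
    # handler above; capture() blocks and dispatches incoming events
    with celery_app.connection() as connection:
        recv = celery_app.events.Receiver(
            connection, handlers={'worker-online': update_nodes_status_online})
        recv.capture(limit=None, timeout=None, wakeup=True)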
Example No. 5
def run_flower():
    p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    # iter() with a b'' sentinel stops cleanly at EOF when flower exits
    for line in iter(p.stdout.readline, b''):
        if line.decode('utf-8') != '':
            other.info(line.decode('utf-8'))
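Since run_flower blocks while reading the child process's output, it is typically launched off the main thread; a sketch (the daemon-thread approach is an assumption, not shown in the source):

import threading

# run the Flower monitor in the background so the main process can continue
threading.Thread(target=run_flower, daemon=True).start()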
Example No. 6
import os
import sys
import subprocess

# make sure the project root is on the module search path
FILE_DIR = os.path.dirname(os.path.realpath(__file__))
ROOT_PATH = os.path.abspath(os.path.join(FILE_DIR, '..'))
sys.path.append(ROOT_PATH)

from utils.log import other
from config import BROKER_URL

if __name__ == '__main__':
    p = subprocess.Popen(
        [sys.executable, '-m', 'celery', 'flower', '-b', BROKER_URL],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=ROOT_PATH)
    # iter() with a b'' sentinel stops cleanly at EOF when flower exits
    for line in iter(p.stdout.readline, b''):
        if line.decode('utf-8') != '':
            other.info(line.decode('utf-8'))
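Compared with Example No. 5, this variant starts Flower via sys.executable -m celery, which guarantees the subprocess uses the same interpreter (and therefore the same virtualenv) as the parent, and it pins cwd to the project root so configuration files and imports resolve predictably.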