Esempio n. 1
0
    def on_stop(self, id):
        """
        Stop the task in progress.

        Revokes the celery task, marks it as REVOKED in the database and,
        if a pid was recorded on the task, kills the worker process.
        :param id: task_id
        :return: dict with the task id and an 'ok' status
        """
        task = db_manager.get('tasks', id=id)

        # revoke the celery task and persist the new status
        celery_app.control.revoke(id, terminate=True)
        db_manager.update_one('tasks',
                              id=id,
                              values={'status': TaskStatus.REVOKED})

        # kill the spawned process, if any
        pid = task.get('pid')
        if pid:
            if 'win32' in sys.platform:
                # taskkill expects "/PID <pid>" (space-separated, not
                # "/pid:<pid>"); /F forces termination
                os.popen('taskkill /F /PID %s' % pid)
            else:
                # unix system
                os.kill(pid, SIGKILL)

        return {
            'id': id,
            'status': 'ok',
        }
Esempio n. 2
0
def execute_spider(self, id: str):
    """
    Execute a spider task.
    :param self: bound celery task instance
    :param id: spider_id
    :return: the finished task document
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')

    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

    # log info
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open a single log file stream shared by stdout and stderr
    # (previously the same file was opened twice in append mode)
    log_file_path = os.path.join(log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    log_file = open(log_file_path, 'a')

    # create a new task
    db_manager.save('tasks', {
        '_id': task_id,
        'spider_id': ObjectId(id),
        'create_ts': datetime.now(),
        'node_id': hostname,
        'hostname': hostname,
        'log_file_path': log_file_path,
    })

    try:
        # execute the command, exposing the task id and result
        # collection to the spider via environment variables
        env = os.environ.copy()
        env['CRAWLAB_TASK_ID'] = task_id
        env['CRAWLAB_COLLECTION'] = spider.get('col')
        p = subprocess.Popen(command.split(' '),
                             stdout=log_file.fileno(),
                             stderr=log_file.fileno(),
                             cwd=current_working_directory,
                             env=env,
                             bufsize=1)

        # wait for the process to finish
        p.communicate()
    finally:
        # always release the log file stream, even if Popen fails
        log_file.flush()
        log_file.close()

    # save task when the task is finished
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': datetime.now(),
    })
    # fetch the task by its task id (was id=id, the spider id, which
    # does not match the '_id' the task was saved under)
    task = db_manager.get('tasks', id=task_id)

    return task
Esempio n. 3
0
 def update_envs(self, id: str):
     """
     Update environment variables
     :param id: spider_id
     """
     parsed = self.parser.parse_args()
     new_envs = json.loads(parsed.envs)
     db_manager.update_one(col_name='spiders', id=id, values={'envs': new_envs})
Esempio n. 4
0
 def update_detail_fields(self, id: str):
     """
     Update detail page fields variables for configurable spiders
     :param id: spider_id
     """
     parsed = self.parser.parse_args()
     fields = json.loads(parsed.detail_fields)
     db_manager.update_one(col_name='spiders', id=id,
                           values={'detail_fields': fields})
Esempio n. 5
0
    def get(self, id=None, action=None):
        """
        GET handler: dispatch an action, fetch one spider, or list every
        spider found under the project source folder.
        :param id: spider_id
        :param action: name of a method on self to invoke with id
        """
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                           'status': 'ok',
                           'code': 400,
                           'error': 'action "%s" invalid' % action
                       }, 400
            return getattr(self, action)(id)

        # get one node
        elif id is not None:
            return jsonify(db_manager.get('spiders', id=id))

        # get a list of items
        else:
            items = []
            dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
            for _dir in dirs:
                if _dir in IGNORE_DIRS:
                    continue

                dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
                dir_name = _dir
                spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

                # new spider: capture the saved document so it can be
                # appended below (previously None was appended for new spiders)
                if spider is None:
                    stats = get_file_suffix_stats(dir_path)
                    lang = get_lang_by_stats(stats)
                    spider = db_manager.save('spiders', {
                        'name': dir_name,
                        'src': dir_path,
                        'lang': lang,
                        'suffix_stats': stats,
                    })

                # existing spider
                else:
                    stats = get_file_suffix_stats(dir_path)
                    lang = get_lang_by_stats(stats)
                    db_manager.update_one('spiders', id=str(spider['_id']), values={
                        'lang': lang,
                        'suffix_stats': stats,
                    })

                # append spider
                items.append(spider)

            return jsonify({
                'status': 'ok',
                'items': items
            })
Esempio n. 6
0
 def stop(self, id):
     """
     Stop the task in progress.
     :param id:
     :return:
     """
     # revoke the celery task, then flag it as revoked in the database
     celery_app.control.revoke(id, terminate=True)
     db_manager.update_one('tasks', id=id,
                           values={'status': TaskStatus.REVOKED})
     return {'id': id, 'status': 'ok'}
Esempio n. 7
0
    def update(self, id: str = None) -> (dict, tuple):
        """
        Helper function for update action given the id.
        :param id:
        :return:
        """
        args = self.parser.parse_args()
        item = db_manager.get(col_name=self.col_name, id=id)
        if item is None:
            return {
                'status': 'ok',
                'code': 401,
                'error': 'item not exists'
            }, 401

        # keep only user-supplied fields: skip default args and None values
        values = {
            k: args.get(k)
            for k in args.keys()
            if k not in DEFAULT_ARGS and args.get(k) is not None
        }
        item = db_manager.update_one(col_name=self.col_name,
                                     id=id,
                                     values=values)

        # execute after_update hook
        self.after_update(id)

        return item
Esempio n. 8
0
 def update(self, id=None):
     """
     Update the item identified by id with the parsed request arguments.
     :param id: item id
     :return: the updated item, or a 401 payload when it does not exist
     """
     args = self.parser.parse_args()
     item = db_manager.get(col_name=self.col_name, id=id)
     if item is None:
         return {
                    'status': 'ok',
                    'code': 401,
                    'error': 'item not exists'
                }, 401
     # take every parsed argument except the framework defaults
     values = {k: args.get(k) for k in args.keys() if k not in DEFAULT_ARGS}
     return db_manager.update_one(col_name=self.col_name, id=id, values=values)
Esempio n. 9
0
def execute_spider(self, id: str, params: str = None):
    """
    Execute spider task.
    :param self: bound celery task instance
    :param id: spider_id
    :param params: extra command-line parameters appended to the spider cmd
    :return: the finished task document, or None if the task is not found
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')

    # if start with python, then use sys.executable to execute in the virtualenv
    if command.startswith('python '):
        command = command.replace('python ', sys.executable + ' ')

    # if start with scrapy, then use sys.executable to execute scrapy as module in the virtualenv
    elif command.startswith('scrapy '):
        command = command.replace('scrapy ', sys.executable + ' -m scrapy ')

    # pass params to the command
    if params is not None:
        command += ' ' + params

    # get task object and return if not found
    task = get_task(task_id)
    if task is None:
        return

    # current working directory
    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER,
                                             str(spider.get('_id')))

    # log info
    logger.info('task_id: %s' % task_id)
    logger.info('hostname: %s' % hostname)
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(
        log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    try:
        # update task status as started
        db_manager.update_one('tasks',
                              id=task_id,
                              values={
                                  'start_ts': datetime.utcnow(),
                                  'node_id': hostname,
                                  'hostname': hostname,
                                  'log_file_path': log_file_path,
                                  'status': TaskStatus.STARTED
                              })

        # pass params as env variables
        env = os.environ.copy()

        # custom environment variables
        if spider.get('envs'):
            for _env in spider.get('envs'):
                env[_env['name']] = _env['value']

        # task id environment variable
        env['CRAWLAB_TASK_ID'] = task_id

        # collection environment variable
        if spider.get('col'):
            env['CRAWLAB_COLLECTION'] = spider.get('col')

            # create index to speed results data retrieval
            db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)])

        # start process; drop empty tokens left by repeated spaces
        cmd_arr = list(filter(lambda x: x != '', command.split(' ')))
        try:
            p = subprocess.Popen(cmd_arr,
                                 stdout=stdout.fileno(),
                                 stderr=stderr.fileno(),
                                 cwd=current_working_directory,
                                 env=env,
                                 bufsize=1)

            # update pid
            db_manager.update_one(col_name='tasks',
                                  id=task_id,
                                  values={'pid': p.pid})

            # wait for the process to finish
            _stdout, _stderr = p.communicate()

            # get return code
            code = p.poll()
            if code == 0:
                status = TaskStatus.SUCCESS
            else:
                status = TaskStatus.FAILURE
        except Exception as err:
            logger.error(err)
            stderr.write(str(err))
            status = TaskStatus.FAILURE

        # save task when the task is finished
        finish_ts = datetime.utcnow()
        db_manager.update_one('tasks',
                              id=task_id,
                              values={
                                  'finish_ts': finish_ts,
                                  'duration':
                                  (finish_ts - task['create_ts']).total_seconds(),
                                  'status': status
                              })
        # re-fetch the task by its task id (was id=id, the spider id,
        # which does not match the task's '_id')
        task = db_manager.get('tasks', id=task_id)
    finally:
        # close log file streams even if a db call above raised
        stdout.flush()
        stderr.flush()
        stdout.close()
        stderr.close()

    return task
Esempio n. 10
0
def execute_config_spider(self, id: str, params: str = None):
    """
    Execute a configurable spider task by running the generic
    'config_spider' scrapy crawler.
    :param self: bound celery task instance
    :param id: spider_id
    :param params: extra parameters (not used by this task; kept for
        signature parity with execute_spider)
    :return: the finished task document, or None if the task is not found
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)

    # get task object and return if not found
    task = get_task(task_id)
    if task is None:
        return

    # current working directory
    current_working_directory = os.path.join(BASE_DIR, 'spiders')

    # log info
    logger.info('task_id: %s' % task_id)
    logger.info('hostname: %s' % hostname)
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(
        log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    try:
        # update task status as started
        db_manager.update_one('tasks',
                              id=task_id,
                              values={
                                  'start_ts': datetime.utcnow(),
                                  'node_id': hostname,
                                  'hostname': hostname,
                                  'log_file_path': log_file_path,
                                  'status': TaskStatus.STARTED
                              })

        # pass params as env variables
        env = os.environ.copy()

        # custom environment variables
        if spider.get('envs'):
            for _env in spider.get('envs'):
                env[_env['name']] = _env['value']

        # task id environment variable
        env['CRAWLAB_TASK_ID'] = task_id

        # collection environment variable
        if spider.get('col'):
            env['CRAWLAB_COLLECTION'] = spider.get('col')

            # create index to speed results data retrieval
            db_manager.create_index(spider.get('col'), [('task_id', ASCENDING)])

        # mongodb environment variables for the config spider
        env['MONGO_HOST'] = MONGO_HOST
        env['MONGO_PORT'] = str(MONGO_PORT)
        env['MONGO_DB'] = MONGO_DB
        if MONGO_USERNAME is not None:
            env['MONGO_USERNAME'] = MONGO_USERNAME
        if MONGO_PASSWORD:
            env['MONGO_PASSWORD'] = MONGO_PASSWORD

        # run the generic crawler as a module in the current interpreter
        cmd_arr = [sys.executable, '-m', 'scrapy', 'crawl', 'config_spider']
        try:
            p = subprocess.Popen(cmd_arr,
                                 stdout=stdout.fileno(),
                                 stderr=stderr.fileno(),
                                 cwd=current_working_directory,
                                 env=env,
                                 bufsize=1)

            # update pid
            db_manager.update_one(col_name='tasks',
                                  id=task_id,
                                  values={'pid': p.pid})

            # wait for the process to finish
            _stdout, _stderr = p.communicate()

            # get return code
            code = p.poll()
            if code == 0:
                status = TaskStatus.SUCCESS
            else:
                status = TaskStatus.FAILURE
        except Exception as err:
            traceback.print_exc()
            logger.error(err)
            stderr.write(str(err))
            status = TaskStatus.FAILURE

        # save task when the task is finished
        finish_ts = datetime.utcnow()
        db_manager.update_one('tasks',
                              id=task_id,
                              values={
                                  'finish_ts': finish_ts,
                                  'duration':
                                  (finish_ts - task['create_ts']).total_seconds(),
                                  'status': status
                              })
        # re-fetch the task by its task id (was id=id, the spider id,
        # which does not match the task's '_id')
        task = db_manager.get('tasks', id=task_id)
    finally:
        # close log file streams even if a db call above raised
        stdout.flush()
        stderr.flush()
        stdout.close()
        stderr.close()

    return task
Esempio n. 11
0
    def get(self, id=None, action=None):
        """
        GET method of SpiderAPI.
        :param id: spider_id
        :param action: action
        """
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return getattr(self, action)(id)

        # get one node
        elif id is not None:
            spider = db_manager.get('spiders', id=id)

            # get deploy
            last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
            if last_deploy is not None:
                spider['deploy_ts'] = last_deploy['finish_ts']

            return jsonify(spider)

        # get a list of items
        else:
            items = []

            # get customized spiders
            dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
            for _dir in dirs:
                if _dir in IGNORE_DIRS:
                    continue

                dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
                dir_name = _dir
                spider = db_manager.get_one_by_key('spiders',
                                                   key='src',
                                                   value=dir_path)

                # new spider
                if spider is None:
                    stats = get_file_suffix_stats(dir_path)
                    lang = get_lang_by_stats(stats)
                    spider = db_manager.save(
                        'spiders', {
                            'name': dir_name,
                            'src': dir_path,
                            'lang': lang,
                            'suffix_stats': stats,
                            'type': SpiderType.CUSTOMIZED
                        })

                # existing spider
                else:
                    # get last deploy
                    last_deploy = db_manager.get_last_deploy(
                        spider_id=spider['_id'])
                    if last_deploy is not None:
                        spider['deploy_ts'] = last_deploy['finish_ts']

                    # file stats
                    stats = get_file_suffix_stats(dir_path)

                    # language
                    lang = get_lang_by_stats(stats)

                    # spider type
                    type_ = SpiderType.CUSTOMIZED

                    # update spider data
                    db_manager.update_one('spiders',
                                          id=str(spider['_id']),
                                          values={
                                              'lang': lang,
                                              'type': type_,
                                              'suffix_stats': stats,
                                          })

                # append spider
                items.append(spider)

            # get configurable spiders
            for spider in db_manager.list('spiders',
                                          {'type': SpiderType.CONFIGURABLE}):
                # append spider
                items.append(spider)

            # get other info
            for i in range(len(items)):
                spider = items[i]

                # get site
                if spider.get('site') is not None:
                    site = db_manager.get('sites', spider['site'])
                    if site is not None:
                        items[i]['site_name'] = site['name']

                # get last task
                last_task = db_manager.get_last_task(spider_id=spider['_id'])
                if last_task is not None:
                    items[i]['task_ts'] = last_task['create_ts']

                # ---------
                # stats
                # ---------
                # last 5-run errors
                items[i]['last_5_errors'] = get_last_n_run_errors_count(
                    spider_id=spider['_id'], n=5)
                # tasks in the last 7 days (was n=5, a copy-paste slip)
                items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(
                    spider_id=spider['_id'], n=7)

            # serialize the whole payload: a jsonify() Response must not be
            # nested inside a plain dict return value
            return jsonify({'status': 'ok', 'items': items})
Esempio n. 12
0
 def update_nodes_status(event):
     """
     Mark the node that emitted the event as online.
     :param event: celery event payload; 'hostname' identifies the node
     """
     node_id = event.get('hostname')
     db_manager.update_one('nodes',
                           id=node_id,
                           values={'status': NodeStatus.ONLINE})
Esempio n. 13
0
 def update_envs(self, id: str):
     """
     Update a spider's environment variables from the request arguments.
     :param id: spider_id
     """
     parsed = self.parser.parse_args()
     db_manager.update_one(col_name='spiders', id=id,
                           values={'envs': json.loads(parsed.envs)})