Code example #1
    def on_crawl(self, id: str) -> (dict, tuple):
        """
        Start a crawl task.
        :param id: spider_id
        :return:
        """
        args = self.parser.parse_args()
        params = args.get('params')

        spider = db_manager.get('spiders', id=ObjectId(id))

        # determine execute function
        if spider['type'] == SpiderType.CONFIGURABLE:
            # configurable spider
            exec_func = execute_config_spider
        else:
            # customized spider
            exec_func = execute_spider

        # trigger an asynchronous job
        job = exec_func.delay(id, params)

        # create a new task
        db_manager.save('tasks', {
            '_id': job.id,
            'spider_id': ObjectId(id),
            'cmd': spider.get('cmd'),
            'params': params,
            'create_ts': datetime.utcnow(),
            'status': TaskStatus.PENDING
        })

        return {
            'code': 200,
            'status': 'ok',
            'task': {
                'id': job.id,
                'status': job.status
            }
        }
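
Since `exec_func.delay()` dispatches the job through Celery, the returned `job.id` can be polled later with Celery's standard `AsyncResult`. A minimal sketch, assuming the project's Celery instance is importable as `celery_app` (the import path is an assumption):

from celery.result import AsyncResult
from tasks.celery import celery_app  # assumed import path for the Celery app

def get_task_status(task_id: str) -> str:
    # Look up the dispatched job by the id that on_crawl stored in 'tasks'.
    return AsyncResult(task_id, app=celery_app).status  # e.g. PENDING, SUCCESS
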
Code example #2
File: node.py Project: zzy0302/crawlab
def update_nodes_status(refresh=False):
    """
    Update all nodes status
    :param refresh:
    """
    online_node_ids = []
    url = '%s/workers?status=1' % FLOWER_API_ENDPOINT
    if refresh:
        url += '&refresh=1'

    res = requests.get(url)
    if res.status_code != 200:
        return online_node_ids

    for k, v in json.loads(res.content.decode('utf-8')).items():
        node_name = k
        node_status = NodeStatus.ONLINE if v else NodeStatus.OFFLINE
        # node_celery = v
        node = db_manager.get('nodes', id=node_name)

        # new node
        if node is None:
            node = {
                '_id': node_name,
                'name': node_name,
                'status': node_status,
                'ip': 'localhost',
                'port': '8000'
            }
            db_manager.save('nodes', node)

        # existing node
        else:
            node['status'] = node_status
            db_manager.save('nodes', node)

        # compare explicitly; a bare truthiness check would treat any
        # non-empty status value (including OFFLINE) as online
        if node_status == NodeStatus.ONLINE:
            online_node_ids.append(node_name)
    return online_node_ids
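
For context, Flower's `/workers?status=1` endpoint returns a JSON object mapping each worker name to a boolean online flag, which is exactly what the `k, v` loop above consumes. A hypothetical decoded payload (worker names are illustrative):

# Hypothetical shape of the decoded Flower response.
workers = {
    'celery@node1': True,   # truthy -> NodeStatus.ONLINE
    'celery@node2': False,  # falsy  -> NodeStatus.OFFLINE
}
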
Code example #3
File: spiders.py Project: tom2jack/crawlab
    def post(self, id: str = None, action: str = None):
        """
        POST method of the given id for performing an action.
        :param id:
        :param action:
        :return:
        """
        args = self.parser.parse_args()
        name = args.get('name')
        if name is not None:
            spider = db_manager._get('spiders', {'name': name})
            # new spider
            if spider is None:
                item = {}
                for k in args.keys():
                    item[k] = args.get(k)
                db_manager.save(col_name='spiders', item=item)
                spider = db_manager._get('spiders', {'name': name})
                id = str(spider.get('_id'))
                return self.update(id)
            else:
                id = str(spider.get('_id'))

        # perform the default update action if no action is specified
        if action is None:
            return self.update(id)

        # if action is not defined in the attributes, return 400 error
        if not hasattr(self, action):
            return {
                'status': 'error',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400

        # perform the specified action for the given id
        return getattr(self, action)(id)
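
The `hasattr`/`getattr` pair above is a small dynamic-dispatch pattern: any method whose name matches `action` is invoked on the resource. A standalone sketch of the same pattern, with illustrative names:

class SpiderResource:
    def stop(self, id: str) -> dict:
        # Illustrative action handler.
        return {'status': 'ok', 'stopped': id}

resource = SpiderResource()
action = 'stop'
if hasattr(resource, action):
    print(getattr(resource, action)('abc123'))  # {'status': 'ok', 'stopped': 'abc123'}
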
Code example #4
def update_nodes_status(refresh=False):
    """
    Update all nodes' status.
    :param refresh: force Flower to refresh worker status
    :return: list of online node ids
    """
    online_node_ids = []
    url = '%s/workers?status=1' % FLOWER_API_ENDPOINT
    if refresh:
        url += '&refresh=1'
    res = requests.get(url)
    for k, v in json.loads(res.content.decode('utf-8')).items():
        node_name = k
        node_status = NodeStatus.ONLINE if v else NodeStatus.OFFLINE
        # node_celery = v
        node = db_manager.get('nodes', id=node_name)

        # new node
        if node is None:
            node = {'_id': node_name, 'name': node_name, 'status': node_status}
            db_manager.save('nodes', node)

        else:
            node['status'] = node_status
            db_manager.save('nodes', node)

        if node_status == NodeStatus.ONLINE:
            online_node_ids.append(node_name)
    return online_node_ids
Code example #5
File: base.py Project: tom2jack/crawlab
    def put(self) -> (dict, tuple):
        """
        PUT method for creating a new item.
        :return:
        """
        args = self.parser.parse_args()
        item = {}
        for k in args.keys():
            if k not in DEFAULT_ARGS:
                item[k] = args.get(k)
        item = db_manager.save(col_name=self.col_name, item=item)

        self.after_update()

        return jsonify(item)
Code example #6
    def get(self, id=None, action=None):
        """
        GET method of SpiderAPI.
        :param id: spider_id
        :param action: action
        """
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                    'status': 'error',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return getattr(self, action)(id)

        # get a single spider
        elif id is not None:
            spider = db_manager.get('spiders', id=id)

            # get deploy
            last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
            if last_deploy is not None:
                spider['deploy_ts'] = last_deploy['finish_ts']

            return jsonify(spider)

        # get a list of items
        else:
            items = []

            # get customized spiders
            dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
            for _dir in dirs:
                if _dir in IGNORE_DIRS:
                    continue

                dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
                dir_name = _dir
                spider = db_manager.get_one_by_key('spiders',
                                                   key='src',
                                                   value=dir_path)

                # new spider
                if spider is None:
                    stats = get_file_suffix_stats(dir_path)
                    lang = get_lang_by_stats(stats)
                    spider = db_manager.save(
                        'spiders', {
                            'name': dir_name,
                            'src': dir_path,
                            'lang': lang,
                            'suffix_stats': stats,
                            'type': SpiderType.CUSTOMIZED
                        })

                # existing spider
                else:
                    # get last deploy
                    last_deploy = db_manager.get_last_deploy(
                        spider_id=spider['_id'])
                    if last_deploy is not None:
                        spider['deploy_ts'] = last_deploy['finish_ts']

                    # file stats
                    stats = get_file_suffix_stats(dir_path)

                    # language
                    lang = get_lang_by_stats(stats)

                    # spider type
                    type_ = SpiderType.CUSTOMIZED

                    # update spider data
                    db_manager.update_one('spiders',
                                          id=str(spider['_id']),
                                          values={
                                              'lang': lang,
                                              'type': type_,
                                              'suffix_stats': stats,
                                          })

                # append spider
                items.append(spider)

            # get configurable spiders
            for spider in db_manager.list('spiders',
                                          {'type': SpiderType.CONFIGURABLE}):
                # append spider
                items.append(spider)

            # get other info
            for i in range(len(items)):
                spider = items[i]

                # get site
                if spider.get('site') is not None:
                    site = db_manager.get('sites', spider['site'])
                    if site is not None:
                        items[i]['site_name'] = site['name']

                # get last task
                last_task = db_manager.get_last_task(spider_id=spider['_id'])
                if last_task is not None:
                    items[i]['task_ts'] = last_task['create_ts']

                # ---------
                # stats
                # ---------
                # errors in the last 5 runs
                items[i]['last_5_errors'] = get_last_n_run_errors_count(
                    spider_id=spider['_id'], n=5)
                # tasks in the last 7 days (n=7 to match the key)
                items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(
                    spider_id=spider['_id'], n=7)

            return {'status': 'ok', 'items': jsonify(items)}
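
`get_file_suffix_stats` and `get_lang_by_stats` are crawlab utilities; judging from how the handler above uses them, the former counts file extensions under a directory. A hypothetical re-implementation, for illustration only (not crawlab's actual code):

import os
from collections import Counter

def get_file_suffix_stats(dir_path: str) -> dict:
    # Hypothetical sketch: walk dir_path and count file suffixes,
    # mirroring the {suffix: count} mapping stored as 'suffix_stats'.
    stats = Counter()
    for _root, _dirs, files in os.walk(dir_path):
        for name in files:
            suffix = os.path.splitext(name)[1].lstrip('.') or 'other'
            stats[suffix] += 1
    return dict(stats)
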
Code example #7
File: spider.py Project: dujun31/crawlab
@celery_app.task(bind=True)  # bound Celery task: self.request carries task metadata
def execute_spider(self, id: str):
    """
    Execute a customized spider as a Celery task.
    :param id: spider_id
    :return: the finished task document
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')

    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER,
                                             str(spider.get('_id')))

    # log info
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    # open log file streams
    log_file_path = os.path.join(
        log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))
    stdout = open(log_file_path, 'a')
    stderr = open(log_file_path, 'a')

    # create a new task
    db_manager.save(
        'tasks', {
            '_id': task_id,
            'spider_id': ObjectId(id),
            'create_ts': datetime.now(),
            'node_id': 'celery@%s' % hostname,
            'hostname': hostname,
            'log_file_path': log_file_path,
            'status': TaskStatus.PENDING
        })

    # execute the command
    env = os.environ.copy()
    env['CRAWLAB_TASK_ID'] = task_id
    if spider.get('col'):
        env['CRAWLAB_COLLECTION'] = spider.get('col')
    p = subprocess.Popen(command.split(' '),
                         stdout=stdout.fileno(),
                         stderr=stderr.fileno(),
                         cwd=current_working_directory,
                         env=env,
                         bufsize=1)

    # wait for the process to finish; stdout/stderr were redirected to the
    # log file, so both returned values are None
    _stdout, _stderr = p.communicate()

    # get return code
    code = p.poll()
    if code == 0:
        status = TaskStatus.SUCCESS
    else:
        status = TaskStatus.FAILURE

    # save task when the task is finished
    db_manager.update_one('tasks',
                          id=task_id,
                          values={
                              'finish_ts': datetime.now(),
                              'status': status
                          })
    task = db_manager.get('tasks', id=task_id)

    # close log file streams
    stdout.flush()
    stderr.flush()
    stdout.close()
    stderr.close()

    return task
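
Note that because `stdout` and `stderr` are redirected to file descriptors rather than `subprocess.PIPE`, `communicate()` above only waits for the process to exit. A minimal standalone demonstration of the same redirect-to-file pattern:

import subprocess

with open('/tmp/demo.log', 'a') as log:
    p = subprocess.Popen(['echo', 'hello'],
                         stdout=log.fileno(),
                         stderr=log.fileno())
    out, err = p.communicate()
    assert out is None and err is None  # output went to the file instead
    print('exit code:', p.poll())       # 0 on success
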