Exemple #1
0
    def get(self, id=None, action=None):
        """
        GET handler for deploys.

        A named action is forwarded to the attribute of the same name on this
        resource; an id alone returns that single deploy; no arguments returns
        every deploy annotated with the name of its spider.
        :param id: deploy_id (optional)
        :param action: name of the method to invoke with ``id`` (optional)
        :return: response payload, or a ``(payload, 400)`` tuple when the
                 action name does not match any attribute of this resource
        """
        # named action: forward to the attribute of the same name
        if action is not None:
            if hasattr(self, action):
                handler = getattr(self, action)
                return handler(id)
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400

        # single deploy
        if id is not None:
            deploy = db_manager.get('deploys', id=id)
            return jsonify(deploy)

        # every deploy, tagged with the name of the spider it belongs to
        deploys = []
        for deploy in db_manager.list('deploys', {}):
            spider = db_manager.get('spiders', id=str(deploy['spider_id']))
            deploy['spider_name'] = spider['name']
            deploys.append(deploy)
        return {'status': 'ok', 'items': jsonify(deploys)}
Exemple #2
0
    def get(self, id: str = None, action: str = None) -> (dict, tuple):
        """
        GET method of DeployAPI.

        With ``action`` set, the call is forwarded to the method of that name;
        with only ``id`` set, that deploy is returned; otherwise all deploys
        are listed, each annotated with the name of its spider.
        :param id: deploy_id
        :param action: action
        :return: response payload, or a ``(payload, 400)`` tuple when the
                 action name does not match any attribute of this resource
        """
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                           'status': 'ok',
                           'code': 400,
                           'error': 'action "%s" invalid' % action
                       }, 400
            return getattr(self, action)(id)

        # get one deploy
        elif id is not None:
            return jsonify(db_manager.get('deploys', id=id))

        # get a list of items
        else:
            items = db_manager.list('deploys', {})
            deploys = []
            for item in items:
                spider_id = item['spider_id']
                spider = db_manager.get('spiders', id=str(spider_id))
                if spider is None:
                    # The spider is gone: purge its deploys and leave this
                    # one out of the listing. Without the ``continue`` the
                    # lookup below would subscript None and fail the request.
                    db_manager.remove('deploys', {'spider_id': spider_id})
                    continue
                item['spider_name'] = spider['name']
                deploys.append(item)
            return {
                'status': 'ok',
                'items': jsonify(deploys)
            }
Exemple #3
0
    def get_results(self, id: str) -> (dict, tuple):
        """
        Return one page of the results crawled by a given task.

        Paging comes from the parsed request arguments ``page_size``
        (default 10) and ``page_num`` (default 1).
        :param id: task_id
        :return: dict with fields, total count, paging info and items, or an
                 empty list when the task's spider has no ``col`` configured
        """
        query_args = self.parser.parse_args()
        page_size = query_args.get('page_size') or 10
        page_num = query_args.get('page_num') or 1

        # resolve the collection that holds this spider's results
        task = db_manager.get('tasks', id=id)
        spider = db_manager.get('spiders', id=task['spider_id'])
        col_name = spider.get('col')
        if not col_name:
            return []

        fields = get_spider_col_fields(col_name)
        offset = page_size * (page_num - 1)
        items = db_manager.list(col_name, {'task_id': id},
                                skip=offset,
                                limit=page_size)

        response = {'status': 'ok'}
        response['fields'] = jsonify(fields)
        response['total_count'] = db_manager.count(col_name, {'task_id': id})
        response['page_num'] = page_num
        response['page_size'] = page_size
        response['items'] = jsonify(items)
        return response
Exemple #4
0
 def get_log(self, id: (str, ObjectId)) -> (dict, tuple):
     """
     Ask the node recorded on a task for that task's log over HTTP.
     :param id: task_id
     :return: ``{'status', 'log'}`` when the node answers 200, otherwise a
              ``(payload, 500)`` tuple carrying the node's error message
     """
     task = db_manager.get(col_name=self.col_name, id=id)
     node = db_manager.get(col_name='nodes', id=task['node_id'])

     url = 'http://%s:%s/api/tasks/%s/on_get_log' % (node['ip'], node['port'], id)
     r = requests.get(url)

     # anything but 200 is relayed as a 500 with the node's error text
     if r.status_code != 200:
         data = json.loads(r.content)
         return {
                    'code': 500,
                    'status': 'ok',
                    'error': data['error']
                }, 500

     data = json.loads(r.content.decode('utf-8'))
     return {
         'status': 'ok',
         'log': data.get('log')
     }
Exemple #5
0
def execute_spider(self, id: str):
    """
    Run a spider's configured command as a subprocess and record it as a task.

    The task id and hostname are read from ``self.request``. The command's
    stdout and stderr are appended to a timestamped log file under
    ``PROJECT_LOGS_FOLDER/<spider id>``.
    :param id: spider_id
    :return: the task record looked up by task id once the command finished
    """
    task_id = self.request.id
    hostname = self.request.hostname
    spider = db_manager.get('spiders', id=id)
    command = spider.get('cmd')

    current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

    # log info
    logger.info('current_working_directory: %s' % current_working_directory)
    logger.info('spider_id: %s' % id)
    logger.info(command)

    # make sure the log folder exists (exist_ok avoids a check-then-create race)
    log_path = os.path.join(PROJECT_LOGS_FOLDER, id)
    os.makedirs(log_path, exist_ok=True)

    log_file_path = os.path.join(log_path, '%s.log' % datetime.now().strftime('%Y%m%d%H%M%S'))

    # environment handed to the spider process
    env = os.environ.copy()
    env['CRAWLAB_TASK_ID'] = task_id
    collection = spider.get('col')
    if collection:
        # Popen rejects non-string env values, so only export the collection
        # when the spider actually has one configured
        env['CRAWLAB_COLLECTION'] = collection

    # The context manager closes both log handles even if saving the task or
    # spawning the process raises.
    with open(log_file_path, 'a') as stdout, open(log_file_path, 'a') as stderr:
        # create a new task (the log file already exists at this point)
        db_manager.save('tasks', {
            '_id': task_id,
            'spider_id': ObjectId(id),
            'create_ts': datetime.now(),
            'node_id': hostname,
            'hostname': hostname,
            'log_file_path': log_file_path,
        })

        # execute the command and wait for it to finish
        p = subprocess.Popen(command.split(' '),
                             stdout=stdout.fileno(),
                             stderr=stderr.fileno(),
                             cwd=current_working_directory,
                             env=env,
                             bufsize=1)
        p.communicate()

    # save task when the task is finished
    db_manager.update_one('tasks', id=task_id, values={
        'finish_ts': datetime.now(),
    })

    # look the task up by its own id; ``id`` is the spider id and does not
    # match the ``_id`` the task was saved under
    return db_manager.get('tasks', id=task_id)
Exemple #6
0
 def get_results(self, id):
     """
     Return the result fields and items stored for a task.
     :param id: task_id
     :return: jsonified ``{'status', 'fields', 'items'}``, or an empty list
              when the task's spider has no ``col`` configured
     """
     task = db_manager.get('tasks', id=id)
     spider = db_manager.get('spiders', id=task['spider_id'])
     col_name = spider.get('col')
     if col_name:
         payload = {
             'status': 'ok',
             'fields': get_spider_col_fields(col_name),
             'items': db_manager.list(col_name, {'task_id': id}),
         }
         return jsonify(payload)
     return []
Exemple #7
0
 def get_log(self, id):
     """
     Ask the node recorded on a task for that task's log over HTTP.
     :param id: task_id
     :return: ``{'status', 'log'}`` when the node answers 200, otherwise a
              ``(payload, 500)`` tuple carrying the node's error message
     """
     task = db_manager.get('tasks', id=id)
     node = db_manager.get('nodes', id=task['node_id'])

     url = 'http://%s:%s/api/tasks/%s/on_get_log' % (node['ip'], node['port'], id)
     r = requests.get(url)

     # anything but 200 is relayed as a 500 with the node's error text
     if r.status_code != 200:
         failure = json.loads(r.content)
         return {'code': 500, 'status': 'ok', 'error': failure['error']}, 500

     body = json.loads(r.content.decode('utf-8'))
     return {'status': 'ok', 'log': body.get('log')}
Exemple #8
0
 def download_results(self, id: str):
     """
     Send the results of a task as a CSV download.

     The file name embeds the collection name and the current time rounded
     to whole seconds.
     :param id: task_id
     :return: whatever ``send_csv`` returns; an empty CSV when the task's
              spider has no ``col`` configured
     """
     task = db_manager.get('tasks', id=id)
     spider = db_manager.get('spiders', id=task['spider_id'])
     col_name = spider.get('col')

     if col_name:
         # very large limit, effectively "all rows"
         rows = db_manager.list(col_name, {'task_id': id}, limit=999999999)
         columns = get_spider_col_fields(col_name, task_id=id, limit=999999999)
         return send_csv(rows,
                         filename=f'results_{col_name}_{round(time())}.csv',
                         fields=columns,
                         encoding='utf-8')

     return send_csv([], f'results_{col_name}_{round(time())}.csv')
Exemple #9
0
    def get(self, id=None, action=None):
        """
        GET handler for tasks.

        A named action is forwarded to the method of that name; an id alone
        returns that task enriched with status, result, spider name and log;
        no arguments returns one page of tasks.
        :param id: task_id (optional)
        :param action: name of the method to invoke with ``id`` (optional)
        :return: response payload, or a ``(payload, 400)`` tuple when the
                 action name does not match any attribute of this resource
        """
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return getattr(self, action)(id)

        # get one task
        elif id is not None:
            task = db_manager.get('tasks', id=id)
            _task = db_manager.get('tasks_celery', id=task['_id'])
            _spider = db_manager.get('spiders', id=str(task['spider_id']))
            if _task:
                # a status already stored on the task wins over celery's
                if not task.get('status'):
                    task['status'] = _task['status']
                task['result'] = _task['result']
            else:
                # no celery record for this task: report an empty result
                # instead of failing on a None lookup
                task['result'] = None
            task['spider_name'] = _spider['name']
            try:
                with open(task['log_file_path']) as f:
                    task['log'] = f.read()
            except Exception:
                # best effort: a missing or unreadable log yields an empty log
                task['log'] = ''
            return jsonify(task)

        # list tasks
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10
        page_num = args.get('page_num') or 1
        tasks = db_manager.list('tasks', {},
                                limit=page_size,
                                skip=page_size * (page_num - 1),
                                sort_key='finish_ts')
        items = []
        for task in tasks:
            _task = db_manager.get('tasks_celery', id=task['_id'])
            _spider = db_manager.get('spiders', id=str(task['spider_id']))
            if _task:
                task['status'] = _task['status']
            else:
                task['status'] = TaskStatus.UNAVAILABLE
            task['spider_name'] = _spider['name']
            items.append(task)
        return {
            'status': 'ok',
            'total_count': db_manager.count('tasks', {}),
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(items)
        }
Exemple #10
0
 def get_tasks(self, id):
     """
     List up to 10 tasks of a node, sorted on ``create_ts``.

     Each task is annotated with its spider's name and with the status of
     the matching ``tasks_celery`` record (``TaskStatus.UNAVAILABLE`` when
     there is none).
     :param id: node_id
     :return: ``{'status': 'ok', 'items': ...}``
     """
     tasks = db_manager.list('tasks', {'node_id': id},
                             limit=10,
                             sort_key='create_ts')
     for task in tasks:
         spider = db_manager.get('spiders', id=str(task['spider_id']))
         task['spider_name'] = spider['name']
         celery_task = db_manager.get('tasks_celery', id=task['_id'])
         task['status'] = celery_task['status'] if celery_task else TaskStatus.UNAVAILABLE
     return {'status': 'ok', 'items': jsonify(tasks)}
Exemple #11
0
 def stop(self, id):
     """
     Forward a stop request for a task to the node recorded on it.
     :param id: task_id
     :return: a success payload when the node answers 200, otherwise a
              ``(payload, 500)`` tuple carrying the node's error message
     """
     task = db_manager.get('tasks', id=id)
     node = db_manager.get('nodes', id=task['node_id'])

     url = 'http://%s:%s/api/tasks/%s/on_stop' % (node['ip'], node['port'], id)
     r = requests.get(url)

     if r.status_code != 200:
         failure = json.loads(r.content)
         return {'code': 500, 'status': 'ok', 'error': failure['error']}, 500
     return {'status': 'ok', 'message': 'success'}
Exemple #12
0
 def get_tasks(self, id):
     """
     List up to 10 tasks of a spider, sorted on ``finish_ts``.

     Each task is annotated with its spider's name and with the status of
     the matching ``tasks_celery`` record (``TaskStatus.UNAVAILABLE`` when
     there is none).
     :param id: spider_id
     :return: jsonified ``{'status': 'ok', 'items': ...}``
     """
     tasks = db_manager.list('tasks',
                             cond={'spider_id': ObjectId(id)},
                             limit=10,
                             sort_key='finish_ts')
     for entry in tasks:
         spider = db_manager.get('spiders', id=str(entry['spider_id']))
         entry['spider_name'] = spider['name']
         celery_task = db_manager.get('tasks_celery', id=entry['_id'])
         if celery_task is None:
             entry['status'] = TaskStatus.UNAVAILABLE
         else:
             entry['status'] = celery_task['status']
     return jsonify({'status': 'ok', 'items': tasks})
Exemple #13
0
    def deploy(self, id):
        """
        Zip the spider's source folder and upload the archive to every node.

        A fresh archive is written to ``PROJECT_TMP_FOLDER`` for each node
        and posted to that node's ``deploy_file`` endpoint. The responses are
        not inspected.
        :param id: spider_id
        :return: a fixed success payload
        """
        spider = db_manager.get('spiders', id=id)
        nodes = db_manager.list('nodes', {})

        for node in nodes:
            node_id = node['_id']

            # timestamp plus random digits keeps archive names unique
            output_file_name = '%s_%s.zip' % (
                datetime.now().strftime('%Y%m%d%H%M%S'), str(random())[2:12])
            output_file_path = os.path.join(PROJECT_TMP_FOLDER,
                                            output_file_name)

            # zip source folder to zip file
            zip_file(source_dir=spider['src'],
                     output_filename=output_file_path)

            # upload to api; the context manager closes the archive handle,
            # which previously leaked once per node
            with open(output_file_path, 'rb') as archive:
                requests.post(
                    'http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                        node.get('ip'),
                        node.get('port'),
                        id,
                        node_id,
                    ),
                    files={'file': archive})

        return {'code': 200, 'status': 'ok', 'message': 'deploy success'}
Exemple #14
0
    def crawl(self, id):
        """
        Ask a node to start crawling with the given spider.

        The target node comes from the parsed request argument ``node_id``;
        the request is relayed to that node's ``on_crawl`` endpoint.
        :param id: spider_id
        :return: payload relaying the node's status code, error and task, or
                 a ``(payload, 400)`` tuple when ``node_id`` is missing or the
                 node has no ip/port
        """
        node_id = self.parser.parse_args().get('node_id')
        if node_id is None:
            return {
                'code': 400,
                'status': 400,
                'error': 'node_id cannot be empty'
            }, 400

        # get node from db and make sure it is reachable
        node = db_manager.get('nodes', id=node_id)
        ip, port = node.get('ip'), node.get('port')
        if ip is None or port is None:
            return {
                'code': 400,
                'status': 'ok',
                'error': 'node ip and port should not be empty'
            }, 400

        # dispatch crawl task
        url = 'http://%s:%s/api/spiders/%s/on_crawl?node_id=%s' % (ip, port, id, node_id)
        res = requests.get(url)
        data = json.loads(res.content.decode('utf-8'))
        return {
            'code': res.status_code,
            'status': 'ok',
            'error': data.get('error'),
            'task': data.get('task')
        }
Exemple #15
0
    def on_crawl(self, id: str) -> (dict, tuple):
        """
        Start a crawl task.

        Queues ``execute_spider`` for the spider and stores a pending task
        record under the id of the queued job.
        :param id: spider_id
        :return: payload with the job's id and status
        """
        params = self.parser.parse_args().get('params')
        spider = db_manager.get('spiders', id=ObjectId(id))

        # queue first: the job id becomes the task's _id
        job = execute_spider.delay(id, params)

        task_record = {
            '_id': job.id,
            'spider_id': ObjectId(id),
            'cmd': spider.get('cmd'),
            'params': params,
            'create_ts': datetime.utcnow(),
            'status': TaskStatus.PENDING
        }
        db_manager.save('tasks', task_record)

        job_info = {'id': job.id, 'status': job.status}
        return {'code': 200, 'status': 'ok', 'task': job_info}
Exemple #16
0
    def get(self, id=None, action=None):
        """
        GET handler for nodes.

        A named action is forwarded to the method of that name; an id alone
        returns that node record; no arguments refreshes node status and
        returns every node.
        :param id: node_id (optional)
        :param action: name of the method to invoke with ``id`` (optional)
        :return: response payload, or a ``(payload, 400)`` tuple when the
                 action name does not match any attribute of this resource
        """
        # named action: forward to the attribute of the same name
        if action is not None:
            if hasattr(self, action):
                handler = getattr(self, action)
                return handler(id)
            return {
                       'status': 'ok',
                       'code': 400,
                       'error': 'action "%s" invalid' % action
                   }, 400

        # single node (returned as stored, not jsonified)
        if id is not None:
            return db_manager.get('nodes', id=id)

        # refresh node status first, then read all nodes back from the db
        update_nodes_status()
        all_nodes = db_manager.list('nodes', {})
        return {'status': 'ok', 'items': jsonify(all_nodes)}
Exemple #17
0
    def update(self, id: str = None) -> (dict, tuple):
        """
        Apply the parsed request arguments to the item with the given id.

        Arguments listed in ``DEFAULT_ARGS`` and arguments whose value is
        None are left out of the update. ``after_update`` runs once the
        update has been written.
        :param id: id of the item in this resource's collection
        :return: the value returned by ``db_manager.update_one``, or a
                 ``(payload, 401)`` tuple when the item does not exist
        """
        args = self.parser.parse_args()

        existing = db_manager.get(col_name=self.col_name, id=id)
        if existing is None:
            return {
                'status': 'ok',
                'code': 401,
                'error': 'item not exists'
            }, 401

        # keep only caller-supplied, non-default arguments
        values = {
            key: args.get(key)
            for key in args.keys()
            if key not in DEFAULT_ARGS and args.get(key) is not None
        }
        updated = db_manager.update_one(col_name=self.col_name,
                                        id=id,
                                        values=values)

        # execute after_update hook
        self.after_update(id)

        return updated
Exemple #18
0
    def deploy(self, id: str) -> (dict, tuple):
        """
        Submit HTTP requests to deploy the given spider to all online nodes.

        A fresh archive of the spider's source folder is written to
        ``PROJECT_TMP_FOLDER`` for each node and posted to that node's
        ``deploy_file`` endpoint. The responses are not inspected yet.
        :param id: spider_id
        :return: a fixed success payload
        """
        spider = db_manager.get('spiders', id=id)
        nodes = db_manager.list('nodes', {'status': NodeStatus.ONLINE})

        for node in nodes:
            node_id = node['_id']

            # timestamp plus random digits keeps archive names unique
            output_file_name = '%s_%s.zip' % (
                datetime.now().strftime('%Y%m%d%H%M%S'), str(random())[2:12])
            output_file_path = os.path.join(PROJECT_TMP_FOLDER,
                                            output_file_name)

            # zip source folder to zip file
            zip_file(source_dir=spider['src'],
                     output_filename=output_file_path)

            # upload to api; the context manager closes the archive handle,
            # which previously leaked once per node
            with open(output_file_path, 'rb') as archive:
                r = requests.post(
                    'http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                        node.get('ip'),
                        node.get('port'),
                        id,
                        node_id,
                    ),
                    files={'file': archive})

            # TODO: checkpoint for errors (inspect ``r``)

        return {'code': 200, 'status': 'ok', 'message': 'deploy success'}
Exemple #19
0
    def on_stop(self, id):
        """
        Stop the task in progress.

        Revokes the celery task, marks the task record as revoked and, when
        the record carries a ``pid``, kills that process.
        :param id: task_id
        :return: ``{'id': id, 'status': 'ok'}``
        """
        task = db_manager.get('tasks', id=id)

        celery_app.control.revoke(id, terminate=True)
        db_manager.update_one('tasks',
                              id=id,
                              values={'status': TaskStatus.REVOKED})

        # kill process
        pid = task.get('pid')
        if pid:
            on_windows = 'win32' in sys.platform
            if on_windows:
                os.popen('taskkill /pid:' + str(pid))
            else:
                # unix system
                os.kill(pid, SIGKILL)

        return {'id': id, 'status': 'ok'}
Exemple #20
0
    def deploy_file(self, id: str = None) -> (dict, tuple):
        """
        Receive HTTP request of deploys and unzip zip files and copy to the destination directories.

        The uploaded zip is stored in ``PROJECT_TMP_FOLDER``, extracted next
        to itself, and the first entry of the extracted folder replaces the
        spider's folder under ``PROJECT_DEPLOY_FILE_FOLDER``. A deploy record
        is saved afterwards.
        :param id: spider_id
        :return: success payload, or a ``(payload, 400)`` tuple when the file
                 is not a zip, the archive is empty or the spider is unknown
        """
        args = parser.parse_args()
        node_id = request.args.get('node_id')
        f = args.file

        if get_file_suffix(f.filename) != 'zip':
            return {
                       'status': 'ok',
                       'error': 'file type mismatch'
                   }, 400

        # save zip file on temp folder; the name is client-supplied, so drop
        # any directory components to keep the write inside the temp folder
        file_name = os.path.basename(f.filename)
        file_path = os.path.join(PROJECT_TMP_FOLDER, file_name)
        with open(file_path, 'wb') as fw:
            fw.write(f.stream.read())

        # unzip zip file; strip only the trailing extension (str.replace
        # would also rewrite '.zip' occurring elsewhere in the path)
        dir_path = os.path.splitext(file_path)[0]
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
        unzip_file(file_path, dir_path)

        # get spider and version
        spider = db_manager.get(col_name=self.col_name, id=id)
        if spider is None:
            return None, 400

        # make source / destination
        entries = os.listdir(dir_path)
        if not entries:
            # nothing was extracted, so there is nothing to deploy
            return {
                       'status': 'ok',
                       'error': 'empty archive'
                   }, 400
        src = os.path.join(dir_path, entries[0])
        dst = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

        # logging info
        current_app.logger.info('src: %s' % src)
        current_app.logger.info('dst: %s' % dst)

        # remove if the target folder exists
        if os.path.exists(dst):
            shutil.rmtree(dst)

        # copy from source to destination
        shutil.copytree(src=src, dst=dst)

        # save to db
        # TODO: task management for deployment
        db_manager.save('deploys', {
            'spider_id': ObjectId(id),
            'node_id': node_id,
            'finish_ts': datetime.utcnow()
        })

        return {
            'code': 200,
            'status': 'ok',
            'message': 'deploy success'
        }
Exemple #21
0
 def on_get_log(self, id):
     """
     Read the log file recorded on a task.
     :param id: task_id
     :return: ``{'status', 'log'}`` on success, otherwise a
              ``(payload, 500)`` tuple carrying the exception text
     """
     try:
         task = db_manager.get('tasks', id=id)
         with open(task['log_file_path']) as log_file:
             content = log_file.read()
             return {'status': 'ok', 'log': content}
     except Exception as err:
         # any failure (lookup, missing path, unreadable file) is reported
         failure = {'code': 500, 'status': 'ok', 'error': str(err)}
         return failure, 500
Exemple #22
0
def get_task(id: str):
    """
    Poll for a task record, giving it a few seconds to appear.

    Makes up to 5 lookups and sleeps one second after each miss.
    :param id: task_id
    :return: the task record, or None when every lookup came back empty
    """
    for _attempt in range(5):
        task = db_manager.get('tasks', id=id)
        if task is not None:
            return task
        sleep(1)
    return None
Exemple #23
0
    def get(self, id=None, action=None):
        """
        Generic GET handler for this resource's collection.

        A named action is forwarded to the method of that name; without an
        id one page of items is returned; with an id that single item is
        returned. Request arguments are parsed before any dispatch.
        :param id: item id (optional)
        :param action: name of the method to invoke with ``id`` (optional)
        :return: response payload, or a ``(payload, 400)`` tuple when the
                 action name does not match any attribute of this resource
        """
        args = self.parser.parse_args()

        # named action: forward to the attribute of the same name
        if action is not None:
            if hasattr(self, action):
                return getattr(self, action)(id)
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400

        # get item by id
        if id is not None:
            return jsonify(db_manager.get(col_name=self.col_name, id=id))

        # list items: filter / page / page size, each with a default
        cond = args.filter if args.get('filter') is not None else {}
        page = args.page if args.get('page') is not None else 1
        page_size = args.page_size if args.get('page_size') is not None else 10

        # TODO: sort functionality

        total_count = db_manager.count(col_name=self.col_name, cond=cond)
        items = db_manager.list(col_name=self.col_name,
                                cond=cond,
                                skip=(page - 1) * page_size,
                                limit=page_size)

        # TODO: getting status for node

        listing = {
            'status': 'ok',
            'total_count': total_count,
            'page': page,
            'page_size': page_size,
            'items': items
        }
        return jsonify(listing)
Exemple #24
0
    def get_results(self, id: str) -> (dict, tuple):
        """
        Get a list of results crawled in a given task.

        Only string values are returned; fields in ``IGNORE_FIELD`` are
        dropped and values longer than 500 characters are truncated.
        ``page_num`` in the response is the number of pages the returned
        items span at the requested ``page_size`` (at least 1).
        :param id: task_id
        :return: dict with fields, counts and items, or an empty list when
                 the task's spider has no ``col`` configured
        """
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10

        task = db_manager.get('tasks', id=id)
        spider = db_manager.get('spiders', id=task['spider_id'])
        col_name = spider.get('col')
        if not col_name:
            return []
        fields = get_spider_col_fields(col_name)
        fields = list(set(fields) - set(IGNORE_FIELD))
        items = db_manager.list(col_name, {'task_id': id})

        # Cap over-long content and drop fields that should not be shown.
        adjust_items = []
        for item in items:
            adjust_item = {}
            for key, value in item.items():
                if not isinstance(value, str) or key in IGNORE_FIELD:
                    continue
                if len(value) > 500:
                    value = value[:500] + '...'
                adjust_item[key] = value
            adjust_items.append(adjust_item)

        # Ceiling division. True division always yields a float in Python 3,
        # so the previous "float means partial page" check added an extra
        # page even for exact multiples of page_size.
        item_count = len(adjust_items)
        page_num = max(1, (item_count + page_size - 1) // page_size)

        return {
            'status': 'ok',
            'fields': jsonify(fields),
            'total_count': item_count,
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(adjust_items)
        }
Exemple #25
0
    def get(self, id=None, action=None):
        """
        GET handler for tasks.

        A named action is forwarded to the method of that name; an id alone
        returns that task enriched with status, result, spider name and log;
        no arguments returns up to 1000 tasks sorted on ``finish_ts``.
        :param id: task_id (optional)
        :param action: name of the method to invoke with ``id`` (optional)
        :return: response payload, or a ``(payload, 400)`` tuple when the
                 action name does not match any attribute of this resource
        """
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                           'status': 'ok',
                           'code': 400,
                           'error': 'action "%s" invalid' % action
                       }, 400
            return getattr(self, action)(id)

        # get one task
        elif id is not None:
            task = db_manager.get('tasks', id=id)
            _task = db_manager.get('tasks_celery', id=task['_id'])
            _spider = db_manager.get('spiders', id=str(task['spider_id']))
            if _task:
                task['status'] = _task['status']
                task['result'] = _task['result']
            else:
                # no celery record: previously the result lookup ran anyway
                # and failed on None
                task['status'] = TaskStatus.UNAVAILABLE
                task['result'] = None
            task['spider_name'] = _spider['name']
            try:
                with open(task['log_file_path']) as f:
                    task['log'] = f.read()
            except Exception:
                # best effort: a missing or unreadable log yields an empty log
                task['log'] = ''
            return jsonify(task)

        # list tasks
        tasks = db_manager.list('tasks', {}, limit=1000, sort_key='finish_ts')
        items = []
        for task in tasks:
            _task = db_manager.get('tasks_celery', id=task['_id'])
            _spider = db_manager.get('spiders', id=str(task['spider_id']))
            if _task:
                task['status'] = _task['status']
            else:
                task['status'] = TaskStatus.UNAVAILABLE
            task['spider_name'] = _spider['name']
            items.append(task)
        return jsonify({
            'status': 'ok',
            'items': items
        })
Exemple #26
0
 def get_tasks(self, id):
     """
     List up to 10 tasks of a node, sorted on ``create_ts``, each annotated
     with the name of its spider.
     :param id: node_id
     :return: ``{'status': 'ok', 'items': ...}``
     """
     tasks = db_manager.list('tasks', {'node_id': id}, limit=10, sort_key='create_ts')
     for task in tasks:
         owner = db_manager.get('spiders', id=str(task['spider_id']))
         task['spider_name'] = owner['name']
     return {'status': 'ok', 'items': jsonify(tasks)}
Exemple #27
0
    def get(self, id=None, action=None):
        """
        GET handler for spiders.

        A named action is forwarded to the method of that name; an id alone
        returns that spider; no arguments scans ``PROJECT_SOURCE_FILE_FOLDER``,
        registers or refreshes one spider per directory and returns them.
        :param id: spider_id (optional)
        :param action: name of the method to invoke with ``id`` (optional)
        :return: response payload, or a ``(payload, 400)`` tuple when the
                 action name does not match any attribute of this resource
        """
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                           'status': 'ok',
                           'code': 400,
                           'error': 'action "%s" invalid' % action
                       }, 400
            return getattr(self, action)(id)

        # get one spider
        elif id is not None:
            return jsonify(db_manager.get('spiders', id=id))

        # get a list of items
        else:
            items = []
            dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
            for _dir in dirs:
                if _dir in IGNORE_DIRS:
                    continue

                dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
                spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

                # language statistics are needed for new and known spiders alike
                stats = get_file_suffix_stats(dir_path)
                lang = get_lang_by_stats(stats)

                # new spider
                if spider is None:
                    db_manager.save('spiders', {
                        'name': _dir,
                        'src': dir_path,
                        'lang': lang,
                        'suffix_stats': stats,
                    })
                    # read the record back so the response carries the new
                    # spider instead of the None from the first lookup
                    spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

                # existing spider
                else:
                    db_manager.update_one('spiders', id=str(spider['_id']), values={
                        'lang': lang,
                        'suffix_stats': stats,
                    })

                # append spider
                items.append(spider)

            return jsonify({
                'status': 'ok',
                'items': items
            })
Exemple #28
0
    def get_results(self, id):
        """
        Return one page of the results crawled by a given task.

        Paging comes from the parsed request arguments ``page_size``
        (default 10) and ``page_num`` (default 1). They are applied to the
        query, so the returned items match the paging info in the response.
        :param id: task_id
        :return: dict with fields, total count, paging info and items, or an
                 empty list when the task's spider has no ``col`` configured
        """
        args = self.parser.parse_args()
        page_size = args.get('page_size') or 10
        page_num = args.get('page_num') or 1

        task = db_manager.get('tasks', id=id)
        spider = db_manager.get('spiders', id=task['spider_id'])
        col_name = spider.get('col')
        if not col_name:
            return []
        fields = get_spider_col_fields(col_name)
        # the page arguments were previously parsed and echoed back but never
        # used, so every request received the unpaged listing
        items = db_manager.list(col_name, {'task_id': id},
                                skip=page_size * (page_num - 1),
                                limit=page_size)
        return {
            'status': 'ok',
            'fields': jsonify(fields),
            'total_count': db_manager.count(col_name, {'task_id': id}),
            'page_num': page_num,
            'page_size': page_size,
            'items': jsonify(items)
        }
Exemple #29
0
 def get_deploys(self, id):
     """
     List up to 10 deploys of a node, sorted on ``finish_ts``, each
     annotated with the name of its spider.
     :param id: node_id
     :return: ``{'status': 'ok', 'items': ...}``
     """
     deploys = []
     for deploy in db_manager.list('deploys', {'node_id': id},
                                   limit=10,
                                   sort_key='finish_ts'):
         owner = db_manager.get('spiders', id=str(deploy['spider_id']))
         deploy['spider_name'] = owner['name']
         deploys.append(deploy)
     return {'status': 'ok', 'items': jsonify(deploys)}
Exemple #30
0
 def on_get_log(self, id: (str, ObjectId)) -> (dict, tuple):
     """
     Read the log file recorded on the given task.
     :param id: task_id
     :return: ``{'status', 'log'}`` on success, otherwise a
              ``(payload, 500)`` tuple carrying the exception text
     """
     try:
         task = db_manager.get(col_name=self.col_name, id=id)
         with open(task['log_file_path']) as log_file:
             content = log_file.read()
             return {'status': 'ok', 'log': content}
     except Exception as err:
         # any failure (lookup, missing path, unreadable file) is reported
         failure = {'code': 500, 'status': 'ok', 'error': str(err)}
         return failure, 500