Example #1
0
 def get(self):
     """Delete the stored configuration whose ``config.name`` equals ``name``.

     Reads the ``name`` request argument (default ``''``). Every branch
     answers with a JSON object holding ``state`` (bool) and ``message``
     (str), sent with a JSON ``Content-Type`` header.
     """

     def reply(state, message):
         # All three outcomes share the same header and JSON envelope.
         self.set_header('Content-Type', 'application/json; charset=UTF-8')
         return self.write(json_encode({'state': state, 'message': message}))

     mongo = MongodbLink()
     name = self.get_argument('name', '')
     print(name)
     # Reject an empty or missing name.
     if not name:
         return reply(False, '无法获取参数, 请检查.')
     # Delete the data: remove documents matching the name from whatever
     # mongo.get_config() returns.
     query = {'config.name': name}
     target = mongo.get_config()
     outcome = mongo.exe_remove(target, query)
     if outcome['state']:
         return reply(True, 'success')
     return reply(False, '删除配置失败.')
Example #2
0
 def post(self):
     """Validate and store a new configuration document.

     Request arguments:
         selectedField: JSON text; must decode to a non-empty value.
         config: JSON text; must decode to a non-empty object with a
             non-empty ``name``.
         requestInterval: integer delay between requests, default 0.
         pageLoadDelay: integer delay between page loads, default 0.

     Every branch answers with a JSON object holding ``state`` (bool) and
     ``message`` (str). ``json.loads`` / ``int`` errors on malformed
     arguments propagate unchanged.
     """

     def reply(state, message):
         # All outcomes share the same header and JSON envelope.
         self.set_header('Content-Type', 'application/json; charset=UTF-8')
         return self.write(json_encode({'state': state, 'message': message}))

     mongo = MongodbLink()
     selectedField = json.loads(self.get_argument('selectedField', '{}'))
     config = json.loads(self.get_argument('config', '{}'))
     # Delay between requests.
     requestInterval = int(self.get_argument('requestInterval', 0))
     # Delay between page loads.
     pageLoadDelay = int(self.get_argument('pageLoadDelay', 0))
     # Reject missing input.
     if not selectedField or not config:
         return reply(False, '无法获取配置, 请检查.')
     # The name is required for the duplicate check; a missing name used to
     # surface as an unhandled KeyError.
     name = config.get('name') if isinstance(config, dict) else None
     if not name:
         return reply(False, '无法获取配置, 请检查.')
     # Duplicate check. It must run against the object returned by
     # mongo.get_config() (the same target used for the insert below), not
     # against the request's own ``config`` dict.
     conf = mongo.get_config()
     count = mongo.exe_count(conf, {'config.name': name})
     # NOTE(review): exe_count's return shape is not visible here; a sibling
     # handler reads ``['data']`` from it, so accept both a result dict and
     # a bare number -- confirm against MongodbLink.
     existing = count.get('data') if isinstance(count, dict) else count
     if existing:
         return reply(False, '配置名称已存在, 请修改.')
     # Save the data.
     request = {
         'selectedField': selectedField,
         'config': config,
         'requestInterval': requestInterval,
         "pageLoadDelay": pageLoadDelay
     }
     result = mongo.exe_insert(conf, request)
     if not result['state']:
         return reply(False, '保存配置失败.')
     return reply(True, 'success')
Example #3
0
 def get(self):
     """Return all documents of a collection together with a total count.

     Reads the ``collection_name`` request argument. When it is empty, or
     when the search reports a falsy ``state``, the response body is the
     JSON text ``[]``. Otherwise the body is a JSON object with ``total``
     (the ``data`` entry of the count result) and ``data`` (the found
     documents with their ``_id`` entry removed).
     """
     mongo = MongodbLink()
     # Every branch answers with JSON, so the header is set once up front.
     self.set_header('Content-Type', 'application/json; charset=UTF-8')
     collection_name = self.get_argument('collection_name', '')
     if not collection_name:
         # RequestHandler.write() rejects list objects, so the empty result
         # has to be serialized before it is written.
         return self.write(json_encode([]))
     # Query the data.
     db = mongo.get_db()
     result = mongo.exe_search(db[collection_name], {})
     if not result['state']:
         return self.write(json_encode([]))
     rows = result['data']
     for row in rows:
         # Drop the "_id" entry before encoding; tolerate rows that do not
         # carry one.
         row.pop("_id", None)
     # Query the total count.
     total = mongo.exe_count(db[collection_name], {})
     self.write(
         json_encode({
             'total': total['data'],
             'data': rows
         }))
Example #4
0
 def get(self):
     """Describe the fields of one document from a collection.

     Reads the ``collection_name`` request argument and fetches a single
     document via ``exe_search_one``. The response is a JSON list with one
     ``{'field': key, 'title': key}`` entry per key of that document except
     ``"_id"``. An empty JSON list is returned when the argument is empty,
     the lookup reports a falsy ``state``, or no data came back.
     """

     def reply(payload):
         # Every outcome is a JSON body with the same header.
         self.set_header('Content-Type', 'application/json; charset=UTF-8')
         return self.write(json_encode(payload))

     mongo = MongodbLink()
     collection_name = self.get_argument('collection_name', '')
     if not collection_name:
         return reply([])
     db = mongo.get_db()
     sample = mongo.exe_search_one(db[collection_name], {})
     if not sample['state'] or not sample['data']:
         return reply([])
     columns = []
     for key in sample['data']:
         if key != "_id":
             columns.append({'field': key, 'title': key})
     return reply(columns)
Example #5
0
 def get(self):
     """Return the stored configuration whose ``config.name`` equals ``name``.

     Reads the ``name`` request argument. The response is always a JSON
     object with ``state`` and ``message``; on success it also carries
     ``data``, the found document without its ``_id`` entry. A missing
     name, a failed lookup, or no matching document yields ``state: False``
     instead of an unhandled exception.
     """

     def reply(payload):
         # Every outcome is a JSON body with the same header.
         self.set_header('Content-Type', 'application/json; charset=UTF-8')
         return self.write(json_encode(payload))

     name = self.get_argument('name', '')
     if not name:
         return reply({'state': False, 'message': '字段值为空'})
     # Query the configuration data.
     mongo = MongodbLink()
     conf = mongo.get_config()
     conf_data = mongo.exe_search_one(conf, {'config.name': name})
     # Only trust ``data`` when the lookup reports success; deleting "_id"
     # from a failed or empty result used to raise.
     data = conf_data.get('data') if conf_data['state'] else None
     if not data:
         return reply({'state': False, 'message': '配置不存在.'})
     data.pop('_id', None)
     return reply({'state': True, 'message': 'success', 'data': data})
Example #6
0
 def get(self):
     crawl_ids_arg = self.get_argument('crawl_ids', '')
     if crawl_ids_arg == '':
         jobs = self.crawler_process.get_jobs()
     else:
         crawl_ids = set(map(int, crawl_ids_arg.split(',')))
         jobs = [
             job for job in self.crawler_process.get_jobs()
             if job['id'] in crawl_ids
         ]
     return self.write(json_encode({'jobs': jobs}))
Example #7
0
 def post(self):
     """Start an entrance crawl for the submitted URL.

     Request arguments: ``name``, ``url``, ``method`` (double quotes are
     stripped), ``proxy`` and ``dynamic`` (integers, default 0), and
     ``header``, ``cookie``, ``form`` (JSON text, default ``'{}'``).

     Renders ``config.html`` when no URL was given. Otherwise calls
     ``thread_crawl`` and renders ``work.html`` with the JSON-encoded
     request configuration when it returns a truthy value, or writes
     ``'waiting'`` when it does not.
     """

     def text_arg(key):
         # String arguments arrive with stray double quotes; drop them.
         return self.get_argument(key, '').replace('"', '')

     name = text_arg('name')
     url = text_arg('url')
     method = text_arg('method')
     proxy = int(self.get_argument('proxy', 0))
     dynamic = int(self.get_argument('dynamic', 0))
     header = json.loads(self.get_argument('header', '{}'))
     cookie = json.loads(self.get_argument('cookie', '{}'))
     form = json.loads(self.get_argument('form', '{}'))
     config_data = json_encode({
         'name': name,
         'url': url,
         'method': method,
         'proxy': proxy,
         'dynamic': dynamic,
         'header': header,
         'cookie': cookie,
         'form': form
     })
     middlewares = {}
     # Proxy server.
     if proxy:
         middlewares[
             'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = 542
         middlewares['server.spider.middlewares.ProxyIPMiddleware'] = 100
     # Dynamic loading.
     if dynamic:
         # The default user-agent middleware entry is mapped to None.
         # NOTE(review): this key uses the old "scrapy.contrib" path while
         # the proxy entry uses "scrapy.downloadermiddlewares" -- confirm
         # it matches the installed Scrapy version.
         middlewares[
             'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware'] = None
         middlewares['server.spider.middlewares.PhantomJSMiddleware'] = 543
     settings = {'DOWNLOADER_MIDDLEWARES': middlewares}
     if not url:
         return self.render('config.html')
     # Start the spider.
     started = thread_crawl(url, CrawlEntranceSpider, method, header, cookie,
                            form, settings)
     if not started:
         return self.write('waiting')
     return self.render('work.html', config_data=config_data)
Example #8
0
 def render(self, *args, **kwargs):
     """Render a template with recent process statistics added.

     The result of ``self.crawler_process.procmon.get_recent()`` is
     JSON-encoded and passed to the parent ``render`` as the
     ``initial_process_stats_json`` keyword, replacing any value the caller
     supplied under that name. All other arguments are forwarded as given.
     """
     recent_stats = self.crawler_process.procmon.get_recent()
     kwargs.update(initial_process_stats_json=json_encode(recent_stats))
     parent = super(BaseRequestHandler, self)
     return parent.render(*args, **kwargs)
Example #9
0
    def post(self):
        """Validate a JSON crawl request and start the website spider.

        When ``self.is_json`` is falsy, a JSON failure object is written and
        the request is finished. Otherwise the request is read from
        ``self.json_args`` (keys ``config``, ``selectedField``,
        ``requestInterval``, ``pageLoadDelay``) and checked:

        * the configuration name must be non-empty and must not be
          ``'config'``;
        * if the collection named after the configuration already holds a
          document, every selected data field id must be one of that
          document's keys.

        On success ``self.crawl`` is called; a truthy result writes
        ``{'state': True, 'message': 'success'}``, a falsy one raises
        ``HTTPError(400)``.
        """
        json_header = 'application/json; charset=UTF-8'

        if not self.is_json:
            self.set_header('Content-Type', json_header)
            self.write(
                json.dumps({
                    'state': False,
                    'message': '程序启动失败<br>字段选择器格式错误,或缺失字段,请检查.'
                }))
            self.finish()
            return

        def reject(message):
            # Shared failure reply: JSON header plus a state/message object.
            self.set_header('Content-Type', json_header)
            return self.write(
                json_encode({
                    'state': False,
                    'message': message
                }))

        conf = self.json_args['config']
        name = conf.get('name', '')
        url = conf.get('url', '')
        method = conf.get('method', '')
        proxy = conf.get('proxy', 0)
        dynamic = conf.get('dynamic', 0)
        # NOTE(review): these three default to the string '{}' rather than
        # an empty dict -- confirm self.crawl expects that.
        header = conf.get('header', '{}')
        cookie = conf.get('cookie', '{}')
        form = conf.get('form', '{}')
        selectors = self.json_args['selectedField'].get('selectors', [])
        requestInterval = self.json_args['requestInterval']
        pageLoadDelay = self.json_args['pageLoadDelay']

        # Field validation: collect the ids of selectors of these types.
        data_selector_types = [
            'SelectorText', 'SelectorElementAttribute', 'SelectorHTML',
            'SelectorImage'
        ]
        fields = []
        for selector in selectors:
            if selector['type'] in data_selector_types:
                fields.append(selector['id'])

        mongo = MongodbLink()
        if not name:
            return reject('配置名称为空')
        db = mongo.get_db()
        result = mongo.exe_search_one(db[name], {})
        if name == 'config':
            return reject('配置名称不能为config')
        if not result['state']:
            self.set_header('Content-Type', json_header)
            return self.write(json_encode([]))
        if result['data']:
            # Keys of the document already stored under this name.
            db_fields = [key for key in result['data'] if key != "_id"]
            if any(field not in db_fields for field in fields):
                return reject('采集字段和数据库已有字段不符, 请修改配置名或修改采集字段.')

        # Build the settings passed to self.crawl.
        middlewares = {}
        # Proxy server.
        if proxy:
            middlewares[
                'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = 542
            middlewares['server.spider.middlewares.ProxyIPMiddleware'] = 100
        # Dynamic loading.
        if dynamic:
            # The default user-agent middleware entry is mapped to None.
            middlewares[
                'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware'] = None
            middlewares['server.spider.middlewares.PhantomJSMiddleware'] = 543
        settings = {'DOWNLOADER_MIDDLEWARES': middlewares}
        # Download delay.
        if requestInterval:
            settings['DOWNLOAD_DELAY'] = requestInterval

        started = self.crawl(url, name, CrawlWebsiteSpider, method, header,
                             cookie, form, settings, selectors, pageLoadDelay)
        if not started:
            raise HTTPError(400)
        return self.write({'state': True, 'message': 'success'})
Example #10
0
 def get(self):
     """Render ``index.html`` with the crawler's jobs embedded as JSON.

     ``self.crawler_process.jobs`` is wrapped as ``{'jobs': ...}``,
     JSON-encoded and handed to the template as ``initial_data_json``.
     """
     # TODO
     payload = {'jobs': self.crawler_process.jobs}
     return self.render("index.html",
                        initial_data_json=json_encode(payload))
Example #11
0
 def write_event(self, event, data):
     """Send a message to the client.

     The message is the JSON encoding of ``{'event': event, 'data': data}``
     and is delivered through ``self.write_message``.
     """
     payload = {'event': event, 'data': data}
     self.write_message(json_encode(payload))