def get(self):
    """Delete a saved crawl configuration by name.

    Query args:
        name: the configuration name to delete (matched on ``config.name``).

    Writes a JSON object ``{'state': bool, 'message': str}``.
    """
    mongo = MongodbLink()
    name = self.get_argument('name', '')
    # Every branch responds with JSON, so set the header once up front.
    # (A stray debug print(name) was removed here — it leaked request
    # data to stdout on every call.)
    self.set_header('Content-Type', 'application/json; charset=UTF-8')
    # Reject an empty name instead of issuing a remove with an empty filter.
    if not name:
        return self.write(
            json_encode({
                'state': False,
                'message': '无法获取参数, 请检查.'
            }))
    # Remove the matching configuration document from the config collection.
    request = {'config.name': name}
    conf = mongo.get_config()
    result = mongo.exe_remove(conf, request)
    if not result['state']:
        return self.write(
            json_encode({
                'state': False,
                'message': '删除配置失败.'
            }))
    return self.write(json_encode({'state': True, 'message': 'success'}))
def post(self):
    """Save a new crawl configuration.

    Body args:
        selectedField: JSON — the field-selector tree.
        config: JSON — the crawl config; must contain a unique ``name``.
        requestInterval: int — delay between requests (seconds).
        pageLoadDelay: int — delay for page loads / pagination (seconds).

    Writes a JSON object ``{'state': bool, 'message': str}``.
    """
    mongo = MongodbLink()
    selectedField = json.loads(self.get_argument('selectedField', '{}'))
    config = json.loads(self.get_argument('config', '{}'))
    requestInterval = int(self.get_argument('requestInterval', 0))  # request interval
    pageLoadDelay = int(self.get_argument('pageLoadDelay', 0))  # page-load delay
    # Every branch responds with JSON, so set the header once up front.
    self.set_header('Content-Type', 'application/json; charset=UTF-8')
    # Reject empty payloads.
    if not selectedField or not config:
        return self.write(
            json_encode({
                'state': False,
                'message': '无法获取配置, 请检查.'
            }))
    conf = mongo.get_config()
    # Duplicate-name check against the config *collection*.  The original
    # passed the request's ``config`` dict as the collection and tested
    # ``if not count:`` on exe_count's {'state','data'} result dict (always
    # truthy), so the "name exists" branch could never fire.
    count = mongo.exe_count(conf, {'config.name': config['name']})
    if count.get('data'):
        return self.write(
            json_encode({
                'state': False,
                'message': '配置名称已存在, 请修改.'
            }))
    # Persist the configuration document.
    request = {
        'selectedField': selectedField,
        'config': config,
        'requestInterval': requestInterval,
        'pageLoadDelay': pageLoadDelay
    }
    result = mongo.exe_insert(conf, request)
    if not result['state']:
        return self.write(
            json_encode({
                'state': False,
                'message': '保存配置失败.'
            }))
    return self.write(json_encode({'state': True, 'message': 'success'}))
def get(self):
    """Return every document of a collection plus the total count.

    Query args:
        collection_name: name of the MongoDB collection to dump.

    Writes ``{'total': int, 'data': [docs...]}`` as JSON.
    """
    mongo = MongodbLink()
    collection_name = self.get_argument('collection_name', '')
    # Every branch responds with JSON.  Tornado's write() raises TypeError
    # for bare Python lists (JSON-hijacking protection), so the original
    # error branches — self.write([]) — crashed instead of responding; all
    # branches now write an encoded JSON string.
    self.set_header('Content-Type', 'application/json; charset=UTF-8')
    if not collection_name:
        return self.write(json_encode([]))
    db = mongo.get_db()
    result = mongo.exe_search(db[collection_name], {})
    if not result['state']:
        return self.write(json_encode([]))
    # Strip Mongo's ObjectId, which is not JSON-serializable.
    for row in result['data']:
        del row["_id"]
    # Total document count for the client-side table.
    total = mongo.exe_count(db[collection_name], {})
    self.write(
        json_encode({
            'total': total['data'],
            'data': result['data']
        }))
def get(self):
    """List the column names of one collection as table metadata.

    Samples a single document and responds with
    ``[{'field': name, 'title': name}, ...]`` (``_id`` excluded); any
    failure or empty result yields an empty JSON list.
    """
    # All responses are JSON, so the header is set once here.
    self.set_header('Content-Type', 'application/json; charset=UTF-8')
    mongo = MongodbLink()
    collection_name = self.get_argument('collection_name', '')
    if not collection_name:
        return self.write(json_encode([]))
    db = mongo.get_db()
    sample = mongo.exe_search_one(db[collection_name], {})
    if not sample['state']:
        return self.write(json_encode([]))
    fields = sample['data']
    if not fields:
        return self.write(json_encode([]))
    # Build one column descriptor per field, skipping Mongo's ObjectId.
    columns = []
    for field in fields:
        if field != "_id":
            columns.append({'field': field, 'title': field})
    return self.write(json_encode(columns))
def get(self):
    """Return the saved configuration whose ``config.name`` equals ``name``.

    Query args:
        name: the configuration name to look up.

    Writes ``{'state': bool, 'message': str[, 'data': doc]}`` as JSON.
    """
    # Every branch responds with JSON.
    self.set_header('Content-Type', 'application/json; charset=UTF-8')
    name = self.get_argument('name', '')
    if not name:
        return self.write(json_encode({
            'state': False,
            'message': '字段值为空'
        }))
    # Look up the configuration document.
    mongo = MongodbLink()
    conf = mongo.get_config()
    conf_data = mongo.exe_search_one(conf, {'config.name': name})
    # Guard against lookup failure or an unknown name: the original
    # dereferenced conf_data['data'] unconditionally and crashed with a
    # TypeError when no document matched.
    if not conf_data['state'] or not conf_data['data']:
        return self.write(json_encode({
            'state': False,
            'message': '未找到配置.'
        }))
    # Drop the non-serializable ObjectId before encoding.
    del conf_data['data']['_id']
    return self.write(
        json_encode({
            'state': True,
            'message': 'success',
            'data': conf_data['data']
        }))
def get(self):
    """List crawl jobs, optionally filtered by a comma-separated id list.

    Query args:
        crawl_ids: optional, e.g. ``"1,2,5"``; empty means all jobs.

    Writes ``{'jobs': [...]}`` as JSON.
    """
    raw_ids = self.get_argument('crawl_ids', '')
    if raw_ids == '':
        jobs = self.crawler_process.get_jobs()
    else:
        wanted = {int(token) for token in raw_ids.split(',')}
        jobs = [
            job for job in self.crawler_process.get_jobs()
            if job['id'] in wanted
        ]
    return self.write(json_encode({'jobs': jobs}))
def post(self):
    """Start the entrance spider for the submitted URL.

    Reads the crawl parameters from the form, assembles Scrapy settings
    (proxy / dynamic-rendering middlewares), launches the spider in a
    thread and renders the work page; with no URL it re-renders the
    config page, and while the spider is still starting it writes
    ``'waiting'``.
    """
    name = self.get_argument('name', '').replace('"', '')
    url = self.get_argument('url', '').replace('"', '')
    method = self.get_argument('method', '').replace('"', '')
    proxy = int(self.get_argument('proxy', 0))
    dynamic = int(self.get_argument('dynamic', 0))
    header = json.loads(self.get_argument('header', '{}'))
    cookie = json.loads(self.get_argument('cookie', '{}'))
    form = json.loads(self.get_argument('form', '{}'))
    # Snapshot of the submitted config, embedded into work.html.
    config_data = json_encode({
        'name': name,
        'url': url,
        'method': method,
        'proxy': proxy,
        'dynamic': dynamic,
        'header': header,
        'cookie': cookie,
        'form': form
    })
    # Scrapy downloader middlewares: optional proxy rotation and optional
    # PhantomJS rendering for dynamically loaded pages.
    middlewares = {}
    if proxy:
        middlewares = {
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware':
            542,
            'server.spider.middlewares.ProxyIPMiddleware': 100
        }
    if dynamic:
        # Disable the stock user-agent middleware and let PhantomJS drive.
        middlewares[
            'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware'] = None
        middlewares['server.spider.middlewares.PhantomJSMiddleware'] = 543
    settings = {'DOWNLOADER_MIDDLEWARES': middlewares}
    if not url:
        return self.render('config.html')
    # Launch the spider in its own thread.
    started = thread_crawl(url, CrawlEntranceSpider, method, header, cookie,
                           form, settings)
    if started:
        return self.render('work.html', config_data=config_data)
    return self.write('waiting')
def render(self, *args, **kwargs):
    """Render a template, injecting recent process stats as JSON.

    Adds ``initial_process_stats_json`` to the template namespace and
    delegates to the parent handler's render().
    """
    recent_stats = self.crawler_process.procmon.get_recent()
    kwargs['initial_process_stats_json'] = json_encode(recent_stats)
    return super(BaseRequestHandler, self).render(*args, **kwargs)
def post(self):
    """Validate a JSON crawl request and start the website spider.

    Expects a JSON body with ``config`` (name/url/method/proxy/dynamic/
    header/cookie/form), ``selectedField.selectors``, ``requestInterval``
    and ``pageLoadDelay``.  Validates the config name and checks the
    selected field ids against the fields already stored in the target
    collection, then builds Scrapy settings and launches the crawl.
    Non-JSON requests get a JSON error response.
    """
    if self.is_json:
        name = self.json_args['config'].get('name', '')
        url = self.json_args['config'].get('url', '')
        method = self.json_args['config'].get('method', '')
        proxy = self.json_args['config'].get('proxy', 0)
        dynamic = self.json_args['config'].get('dynamic', 0)
        header = self.json_args['config'].get('header', '{}')
        cookie = self.json_args['config'].get('cookie', '{}')
        form = self.json_args['config'].get('form', '{}')
        selectors = self.json_args['selectedField'].get('selectors', [])
        requestInterval = self.json_args['requestInterval']
        pageLoadDelay = self.json_args['pageLoadDelay']
        # Field validation: collect the ids of selectors that produce
        # stored data columns (text / attribute / HTML / image).
        fields = [
            i['id'] for i in selectors if i['type'] in [
                'SelectorText', 'SelectorElementAttribute', 'SelectorHTML',
                'SelectorImage'
            ]
        ]
        mongo = MongodbLink()
        if not name:
            self.set_header('Content-Type',
                            'application/json; charset=UTF-8')
            return self.write(
                json_encode({
                    'state': False,
                    'message': '配置名称为空'
                }))
        # The crawl name doubles as the target collection name; sample one
        # existing document to learn the collection's current schema.
        db = mongo.get_db()
        result = mongo.exe_search_one(db[name], {})
        # 'config' is reserved for the configuration collection itself.
        if name == 'config':
            self.set_header('Content-Type',
                            'application/json; charset=UTF-8')
            return self.write(
                json_encode({
                    'state': False,
                    'message': '配置名称不能为config'
                }))
        if not result['state']:
            self.set_header('Content-Type',
                            'application/json; charset=UTF-8')
            return self.write(json_encode([]))
        if result['data']:
            # Existing collection: every selected field must match a field
            # already stored there (excluding Mongo's _id).
            db_fields = [i for i in result['data'] if i != "_id"]
            for i in fields:
                if i not in db_fields:
                    self.set_header('Content-Type',
                                    'application/json; charset=UTF-8')
                    return self.write(
                        json_encode({
                            'state':
                            False,
                            'message':
                            '采集字段和数据库已有字段不符, 请修改配置名或修改采集字段.'
                        }))
        # Scrapy settings.
        # Proxy middlewares.
        settings = {}
        settings['DOWNLOADER_MIDDLEWARES'] = {}
        if proxy:
            settings = {
                'DOWNLOADER_MIDDLEWARES': {
                    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware':
                    542,
                    'server.spider.middlewares.ProxyIPMiddleware': 100
                }
            }
        # Dynamic page loading via PhantomJS.
        if dynamic:
            settings['DOWNLOADER_MIDDLEWARES'].update({
                # Disable the default user-agent middleware.
                'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware':
                None,
                'server.spider.middlewares.PhantomJSMiddleware': 543
            })
        # Download delay between requests.
        if requestInterval:
            settings['DOWNLOAD_DELAY'] = requestInterval
        if self.crawl(url, name, CrawlWebsiteSpider, method, header, cookie,
                      form, settings, selectors, pageLoadDelay):
            return self.write({'state': True, 'message': 'success'})
        else:
            raise HTTPError(400)
    else:
        # Non-JSON body: the selector payload could not be parsed.
        self.set_header('Content-Type', 'application/json; charset=UTF-8')
        self.write(
            json.dumps({
                'state': False,
                'message': '程序启动失败<br>字段选择器格式错误,或缺失字段,请检查.'
            }))
        self.finish()
def get(self):
    """Render the dashboard with the current job list embedded as JSON."""
    # TODO
    payload = json_encode({'jobs': self.crawler_process.jobs})
    return self.render("index.html", initial_data_json=payload)
def write_event(self, event, data):
    """Push a JSON-encoded ``{'event': ..., 'data': ...}`` message to the client."""
    payload = {'event': event, 'data': data}
    self.write_message(json_encode(payload))