def _spawn_process(self, message, slot):
    """Launch a crawl subprocess for *message* in launcher *slot*.

    Builds the ``python -m <runner> crawl`` argument list, resolves the
    child environment via the ``IEnvironment`` component and, when the
    message carries ``file_settings``, materialises them into a temporary
    settings module advertised to the child through ``PYTHONPATH`` and
    ``SCRAPY_SETTINGS_MODULE_TO_OVERRIDE``.
    """
    msg = native_stringify_dict(message, keys_only=False)
    file_settings = msg.pop('file_settings', None)
    project = msg['_project']
    args = [sys.executable, '-m', self.runner, 'crawl']
    args += get_crawl_args(msg)
    e = self.app.getComponent(IEnvironment)
    env = e.get_environment(msg, slot)
    env = native_stringify_dict(env, keys_only=False)
    tmpfile = None
    if file_settings:
        # delete=False: the child process must be able to import this
        # module after the context manager closes the handle.
        with NamedTemporaryFile('w', encoding='utf-8', suffix='.py',
                                delete=False) as tmp:
            tmp.write(file_settings)
        path, name_file = os.path.split(tmp.name)
        module = os.path.splitext(name_file)[0]
        # BUG FIX: join with os.pathsep instead of a hard-coded ':' so
        # the PYTHONPATH entry also works on Windows (';'-separated).
        existing = os.environ.get('PYTHONPATH')
        env['PYTHONPATH'] = (
            '{}{}{}'.format(path, os.pathsep, existing) if existing else path
        )
        env['SCRAPY_SETTINGS_MODULE_TO_OVERRIDE'] = module
        tmpfile = tmp.name
    # The protocol receives the temp-file path (or None) so it can clean
    # it up when the process finishes.
    pp = ScrapyProcessProtocol(slot, project, msg['_spider'],
                               msg['_job'], env, tmpfile)
    pp.deferred.addBoth(self._process_finished, slot)
    reactor.spawnProcess(pp, sys.executable, args=args, env=env)
    self.processes[slot] = pp
def _spawn_process(self, message, slot):
    """Start a crawl subprocess for *message*, tracking it under *slot*."""
    payload = native_stringify_dict(message, keys_only=False)
    project = payload['_project']
    # Command line: run the configured runner module's "crawl" command.
    cmd = [sys.executable, '-m', self.runner, 'crawl']
    cmd.extend(get_crawl_args(payload))
    # Resolve the child's environment through the IEnvironment component.
    env_component = self.app.getComponent(IEnvironment)
    child_env = native_stringify_dict(
        env_component.get_environment(payload, slot), keys_only=False)
    protocol = ScrapyProcessProtocol(
        slot, project, payload['_spider'], payload['_job'], child_env)
    protocol.deferred.addBoth(self._process_finished, slot)
    reactor.spawnProcess(protocol, sys.executable, args=cmd, env=child_env)
    self.processes[slot] = protocol
def _spawn_process(self, message, slot):
    """Spawn a crawl subprocess; the whole message dict is handed to the protocol."""
    msg = native_stringify_dict(message, keys_only=False)
    cmd = [sys.executable, '-m', self.runner, 'crawl', *get_crawl_args(msg)]
    env = native_stringify_dict(
        self.app.getComponent(IEnvironment).get_environment(msg, slot),
        keys_only=False)
    proto = ScrapyProcessProtocol(
        slot, msg['_project'], msg['_spider'], msg['_job'], env, msg)
    proto.deferred.addBoth(self._process_finished, slot)
    reactor.spawnProcess(proto, sys.executable, args=cmd, env=env)
    self.processes[slot] = proto
def _spawn_process(self, message, slot):
    """Spawn a crawl subprocess, logging the environment it starts with."""
    msg = native_stringify_dict(message, keys_only=False)
    project = msg['_project']
    args = [sys.executable, '-m', self.runner, 'crawl']
    args += get_crawl_args(msg)
    e = self.app.getComponent(IEnvironment)
    env = e.get_environment(msg, slot)
    env = native_stringify_dict(env, keys_only=False)
    # BUG FIX: the format string referenced %(_name)r but the keyword
    # passed to log.msg is 'name', so Twisted's lazy %-interpolation
    # raised a KeyError when the event was rendered. The placeholder now
    # matches the supplied keyword.
    log.msg(format='Scrapyd %(version)s started: name=%(name)r, env=%(env)r',
            version=__version__, name=env.get('_name', ''), env=env)
    pp = ScrapyProcessProtocol(slot, project, msg['_spider'],
                               msg['_job'], env)
    pp.deferred.addBoth(self._process_finished, slot)
    reactor.spawnProcess(pp, sys.executable, args=args, env=env)
    self.processes[slot] = pp
def render_POST(self, txrequest):
    """Schedule a spider run.

    Expects form fields ``project`` and ``spider`` (required), plus
    optional ``setting`` (repeatable ``KEY=VALUE`` pairs), ``_version``,
    ``priority`` and ``jobid``; all remaining fields are forwarded to
    the scheduler as spider arguments. Returns an error payload when
    the spider is not known for the project.
    """
    args = native_stringify_dict(copy(txrequest.args), keys_only=False)
    settings = args.pop('setting', [])
    settings = dict(x.split('=', 1) for x in settings)
    # Collapse the one-element lists Twisted gives us to plain scalars.
    args = dict((k, v[0]) for k, v in args.items())
    project = args.pop('project')
    spider = args.pop('spider')
    version = args.get('_version', '')
    priority = float(args.pop('priority', 0))
    # CONSISTENCY FIX: pass the configured runner like the other
    # endpoints that enumerate spiders do (see the listspiders handler).
    spiders = get_spider_list(project, runner=self.root.runner,
                              version=version)
    if spider not in spiders:
        return {"status": "error",
                "message": "spider '%s' not found" % spider}
    args['settings'] = settings
    jobid = args.pop('jobid', uuid.uuid1().hex)
    args['_job'] = jobid
    self.root.scheduler.schedule(project, spider, priority=priority, **args)
    return {"node_name": self.root.nodename, "status": "ok", "jobid": jobid}
def render_POST(self, txrequest):
    """Delete one version of a project and invalidate its cached data."""
    params = native_stringify_dict(copy(txrequest.args), keys_only=False)
    project = params['project'][0]
    version = params['version'][0]
    self._delete_version(project, version)
    UtilsCache.invalid_cache(project)
    return {"node_name": self.root.nodename, "status": "ok"}
def render_GET(self, txrequest):
    """Report pending, running and finished jobs, optionally filtered by project."""
    params = native_stringify_dict(copy(txrequest.args), keys_only=False)
    project = params.get('project', [None])[0]
    queues = self.root.poller.queues
    # With no project given, walk every queue; otherwise just the one.
    selected_queues = queues if project is None else [project]
    pending = []
    for qname in selected_queues:
        for entry in queues[qname].list():
            pending.append({"project": qname,
                            "spider": entry["name"],
                            "id": entry["_job"]})
    running = []
    for proc in self.root.launcher.processes.values():
        if project is not None and proc.project != project:
            continue
        running.append({"project": proc.project,
                        "spider": proc.spider,
                        "id": proc.job,
                        "pid": proc.pid,
                        "start_time": str(proc.start_time)})
    finished = []
    for job in self.root.launcher.finished:
        if project is not None and job.project != project:
            continue
        finished.append({"project": job.project,
                         "spider": job.spider,
                         "id": job.job,
                         "start_time": str(job.start_time),
                         "end_time": str(job.end_time)})
    return {"node_name": self.root.nodename, "status": "ok",
            "pending": pending, "running": running, "finished": finished}
def render_GET(self, txrequest):
    """List the stored egg versions for the requested project."""
    params = native_stringify_dict(copy(txrequest.args), keys_only=False)
    project = params['project'][0]
    return {"node_name": self.root.nodename,
            "status": "ok",
            "versions": self.root.eggstorage.list(project)}
def render_GET(self, txrequest):
    """Enumerate the spiders of a project, optionally at a given version."""
    params = native_stringify_dict(copy(txrequest.args), keys_only=False)
    project = params['project'][0]
    version = params.get('_version', [''])[0]
    found = get_spider_list(project, runner=self.root.runner,
                            version=version)
    return {"node_name": self.root.nodename, "status": "ok",
            "spiders": found}
def spider_stats():
    """Return a JSON snapshot of crawl statistics pulled from redis."""
    raw = native_stringify_dict(redis_cli.get_all_stats(), keys_only=False)
    snapshot = {'start_time': start_time}
    # Counters default to 0 when the crawl has not produced them yet.
    for counter in ('dupefilter/filtered',
                    'item_scraped_count',
                    'response_received_count'):
        snapshot[counter] = raw.get(counter, 0)
    snapshot['Page Crawled Speed'] = "%s pages/min" % logstats.prate
    snapshot['Item Scraped Speed'] = "%s items/min" % logstats.irate
    snapshot['Avg Page Crawled Speed'] = "%.2f pages/min" % logstats.avg_prate
    snapshot['Avg Item Scraped Speed'] = "%.2f items/min" % logstats.avg_irate
    return jsonify(snapshot)
def render_POST(self, txrequest):
    """Upload a project egg, register it and report its spider count.

    Expects a multipart upload with ``egg`` (binary), ``project`` and
    ``version`` fields.
    """
    eggf = BytesIO(txrequest.args.pop(b'egg')[0])
    args = native_stringify_dict(copy(txrequest.args), keys_only=False)
    project = args['project'][0]
    version = args['version'][0]
    self.root.eggstorage.put(eggf, project, version)
    # CONSISTENCY FIX: pass the configured runner like the listspiders
    # endpoint does, so spiders are enumerated the same way everywhere.
    spiders = get_spider_list(project, runner=self.root.runner,
                              version=version)
    self.root.update_projects()
    UtilsCache.invalid_cache(project)
    return {"node_name": self.root.nodename, "status": "ok",
            "project": project, "version": version,
            "spiders": len(spiders)}
def render_POST(self, txrequest):
    """Cancel a job: drop it from the queue if pending, signal it if running."""
    raw = native_stringify_dict(copy(txrequest.args), keys_only=False)
    params = {k: v[0] for k, v in raw.items()}
    project = params['project']
    jobid = params['job']
    sig = params.get('signal', 'TERM')
    prevstate = None
    queue = self.root.poller.queues[project]
    # remove() returns the number of dropped entries; nonzero means the
    # job was still pending.
    if queue.remove(lambda entry: entry["_job"] == jobid):
        prevstate = "pending"
    for proc in self.root.launcher.processes.values():
        if proc.project == project and proc.job == jobid:
            proc.transport.signalProcess(sig)
            prevstate = "running"
    return {"node_name": self.root.nodename, "status": "ok",
            "prevstate": prevstate}
def index():
    """Serve the current tree of children as JSON."""
    payload = native_stringify_dict(zw.children, keys_only=False)
    return jsonify(payload)
def allstats():
    """Serve every stat stored in redis as JSON."""
    stats = native_stringify_dict(redis_cli.get_all_stats(), keys_only=False)
    return jsonify(stats)