def done(process):
    self.processes.remove(process)
    retcode = process.returncode
    if retcode is not None:
        if retcode == 0:
            logger.info('Create virtualenv done.')
            future.set_result(self)
        else:
            future.set_exception(
                ProcessFailed('Error when initializing workspace virtualenv'))
def done(process):
    self.processes.remove(process)
    retcode = process.returncode
    if retcode is not None:
        if retcode == 0:
            future.set_result(self)
        else:
            stdout_stream.seek(0)
            stderr_stream.seek(0)
            std_out = stdout_stream.read().decode(PROCESS_ENCODING)
            err_out = stderr_stream.read().decode(PROCESS_ENCODING)
            future.set_exception(ProcessFailed(std_output=std_out,
                                               err_output=err_out))
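# A minimal sketch (assumptions, not the project's actual wiring) of
# how the stdout_stream/stderr_stream captured above could be set up:
# seekable temp files are handed to Popen, so done() can rewind them
# with seek(0) and read the complete output once the child has exited.
import subprocess
import tempfile

PROCESS_ENCODING = 'utf8'

stdout_stream = tempfile.TemporaryFile()
stderr_stream = tempfile.TemporaryFile()
process = subprocess.Popen(['python', '-m', 'this'],
                           stdout=stdout_stream,
                           stderr=stderr_stream)
process.wait()
stdout_stream.seek(0)
print(stdout_stream.read().decode(PROCESS_ENCODING))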
def check_process():
    logger.debug('poll')
    retcode = process.poll()
    if retcode is not None:
        if retcode == 0:
            future.set_result(self)
        else:
            std_out = process.stdout.read()
            err_out = process.stderr.read()
            future.set_exception(
                ProcessFailed(std_output=std_out, err_output=err_out))
        return
    IOLoop.current().call_later(1, check_process)
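# The callback above polls instead of blocking in process.wait(), so
# the IOLoop stays responsive. A self-contained sketch of the pattern,
# assuming Tornado; run_process() and the plain Exception below are
# hypothetical stand-ins for the surrounding class and ProcessFailed.
import subprocess

from tornado.concurrent import Future
from tornado.ioloop import IOLoop


def run_process(args):
    """Start `args`, return a Future that resolves when it exits."""
    future = Future()
    process = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    def check_process():
        retcode = process.poll()  # None while still running
        if retcode is not None:
            if retcode == 0:
                future.set_result(process.stdout.read())
            else:
                future.set_exception(Exception(process.stderr.read()))
            return
        # Not finished yet: check again in one second.
        IOLoop.current().call_later(1, check_process)

    IOLoop.current().add_callback(check_process)
    return future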
def crawl(self, spider_settings):
    """
    Parameters:
        spider_settings (SpiderSetting): spider settings object.

    Returns:
        Future: future

    Because the settings values are force-written in the
    %(key)s = '%(value)s' format, the original value types are lost:
    every settings value is a string. This is usually not a problem,
    since scrapy settings provide a set of strongly-typed getters.
    Always use a strongly-typed retrieval method such as
    settings.getbool, settings.getfloat or settings.getint to avoid
    TypeError.
    see: https://docs.scrapy.org/en/latest/topics/api.html#scrapy.settings.BaseSettings.get
    """
    spider_json_buffer = BytesIO()
    spider_json_buffer.write(spider_settings.to_json().encode('utf8'))
    spider_json_buffer.seek(0)
    items_file_path = path.join(self._work_dir, 'items.jl')
    log_file_path = path.join(self._work_dir, 'crawl.log')
    pargs = ["python", "-m", "scrapydd.utils.runner2"]
    env = {
        'SCRAPY_FEED_URI': 'items.jl',
        'SCRAPY_EGG': 'spider.egg',
    }
    container = self._client.containers.create(self.image, pargs,
                                               detach=True,
                                               working_dir='/spider_run',
                                               environment=env)
    self._put_file(container, 'spider.json', spider_json_buffer)
    self._put_egg(container)
    self._start_container(container)
    ret_code = self._wait_container(container)
    while ret_code is None:
        yield gen.moment
        ret_code = self._wait_container(container)
    process_output = ensure_str(container.logs())
    if ret_code == 0:
        with open(log_file_path, 'w') as f:
            f.write(process_output)
        self._collect_files(container)
        result = CrawlResult(0, items_file=items_file_path,
                             crawl_logfile=log_file_path)
        self._remove_container(container)
        raise gen.Return(result)
    else:
        self._remove_container(container)
        raise ProcessFailed(err_output=process_output)
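# Illustration of the docstring's advice: a value written through the
# %(key)s = '%(value)s' template always comes back as a string, and a
# non-empty string is truthy, so plain get() is a trap for booleans.
# Uses only the documented scrapy.settings API.
from scrapy.settings import Settings

settings = Settings()
settings.set('RETRY_ENABLED', 'False')      # stored as the string 'False'

settings.get('RETRY_ENABLED')               # 'False' -- a truthy string
settings.getbool('RETRY_ENABLED')           # False   -- a real boolean
settings.getint('DOWNLOAD_TIMEOUT', 180)    # typed getter with a default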
def check_process():
    logger.debug('create virtualenv process poll.')
    retcode = process.poll()
    if retcode is not None:
        if retcode == 0:
            future.set_result(self)
        else:
            std_output = process.stdout.read()
            err_output = process.stderr.read()
            future.set_exception(
                ProcessFailed('Error when initializing workspace virtualenv',
                              std_output=std_output,
                              err_output=err_output))
        return
    IOLoop.current().call_later(1, check_process)
def settings_module(self):
    pargs = ["python", "-m", "scrapydd.utils.extract_settings_module",
             'spider.egg']
    container = self._client.containers.create(self.image, pargs,
                                               detach=True,
                                               working_dir='/spider_run')
    self._put_egg(container)
    self._start_container(container)
    ret_code = self._wait_container(container)
    while ret_code is None:
        yield gen.moment
        ret_code = self._wait_container(container)
    output = ensure_str(container.logs()).strip()
    self._remove_container(container)
    if ret_code == 0:
        raise gen.Return(output)
    else:
        raise ProcessFailed(err_output=output)
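# Helpers such as _put_egg()/_put_file() above have to upload files
# into a created container. docker-py's put_archive() only accepts tar
# data, so the usual technique is to wrap the file in an in-memory tar
# first. A sketch of that technique; put_file() is a hypothetical name,
# not the project's actual helper.
import tarfile
import time
from io import BytesIO


def put_file(container, name, fileobj, dest='/spider_run'):
    data = fileobj.read()
    tar_buffer = BytesIO()
    with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
        info = tarfile.TarInfo(name=name)
        info.size = len(data)
        info.mtime = time.time()
        tar.addfile(info, BytesIO(data))
    tar_buffer.seek(0)
    container.put_archive(dest, tar_buffer.read())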
def list(self):
    env = {'SCRAPY_EGG': 'spider.egg'}
    container = self._client.containers.create(
        self.image,
        ["python", "-m", "scrapydd.utils.runner", 'list'],
        detach=True,
        working_dir='/spider_run',
        environment=env)
    self._put_egg(container)
    self._start_container(container)
    ret_code = self._wait_container(container)
    while ret_code is None:
        yield gen.moment
        ret_code = self._wait_container(container)
    logs = container.logs()
    if ret_code == 0:
        self._collect_files(container)
        self._remove_container(container)
        raise gen.Return(ensure_str(logs).split())
    else:
        self._remove_container(container)
        # Decode to str so the exception carries readable output,
        # matching the other ProcessFailed call sites.
        raise ProcessFailed(err_output=ensure_str(logs))
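# Hypothetical usage of the coroutine above: run_sync() drives it to
# completion on the current IOLoop and returns the spider names that
# `scrapy list` printed, one per whitespace-separated token. `runner`
# stands in for an instance of the surrounding class.
from tornado.ioloop import IOLoop

spider_names = IOLoop.current().run_sync(runner.list)
print(spider_names)  # e.g. ['spider1', 'spider2']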
def crawl(self, spider_settings):
    """
    Parameters:
        spider_settings (SpiderSetting): spider settings object.
    """
    self._spider_settings = spider_settings
    yield self._prepare()
    crawl_log_path = path.join(self._work_dir, 'crawl.log')
    f_crawl_log = open(crawl_log_path, 'w')
    try:
        items_file = yield self._project_workspace.run_spider(
            spider_settings.spider_name,
            spider_settings.spider_parameters,
            f_output=f_crawl_log,
            project=spider_settings.project_name)
        f_crawl_log.close()
        result = CrawlResult(0, items_file=items_file,
                             crawl_logfile=crawl_log_path)
        raise gen.Return(result)
    except ProcessFailed:
        # Re-raise with the full crawl log attached as error output.
        f_crawl_log.close()
        with open(crawl_log_path, 'r') as f_log:
            error_log = f_log.read()
        raise ProcessFailed(err_output=error_log)
def done(process):
    self.processes.remove(process)
    if process.returncode:
        ret_future.set_exception(ProcessFailed())
        return
    ret_future.set_result(items_file)
def list(self):
    future = Future()
    future.set_exception(ProcessFailed())
    return future