Example 1
 def done(process):
     # Process-exit callback: `self`, `future` and `logger` come from
     # the enclosing scope; resolve the future once the process ends.
     self.processes.remove(process)
     retcode = process.returncode
     if retcode is not None:
         if retcode == 0:
             logger.info('Create virtualenv done.')
             future.set_result(self)
         else:
             future.set_exception(
                 ProcessFailed('Error when initializing workspace virtualenv'))
Example 2
 def done(process):
     # Same exit-callback pattern, but the process output was captured
     # into seekable streams: rewind them before reading the output back.
     self.processes.remove(process)
     retcode = process.returncode
     if retcode is not None:
         if retcode == 0:
             future.set_result(self)
         else:
             stdout_stream.seek(0)
             stderr_stream.seek(0)
             std_out = stdout_stream.read().decode(PROCESS_ENCODING)
             err_out = stderr_stream.read().decode(PROCESS_ENCODING)
             future.set_exception(ProcessFailed(std_output=std_out,
                                                err_output=err_out))
Example 3
 def check_process():
     # Poll the child process from the IOLoop; reschedule this check
     # every second until the process exits.
     logger.debug('poll')
     retcode = process.poll()
     if retcode is not None:
         if retcode == 0:
             future.set_result(self)
         else:
             std_out = process.stdout.read()
             err_out = process.stderr.read()
             future.set_exception(
                 ProcessFailed(std_output=std_out, err_output=err_out))
         return
     IOLoop.current().call_later(1, check_process)
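
Example 3 above (and Example 5 below) share the same technique: poll a child process from the Tornado IOLoop and complete a future when it exits. Below is a minimal self-contained sketch of that pattern assuming a plain subprocess; the ProcessFailed class here is a simplified stand-in for scrapydd's own exception, and run_process is a hypothetical entry point.

 import subprocess
 from tornado.concurrent import Future
 from tornado.ioloop import IOLoop

 class ProcessFailed(Exception):
     # Simplified stand-in for scrapydd's ProcessFailed exception.
     def __init__(self, message=None, std_output=None, err_output=None):
         super(ProcessFailed, self).__init__(message)
         self.std_output = std_output
         self.err_output = err_output

 def run_process(args):
     # Hypothetical helper: start a process and return a future that
     # resolves with its stdout, or fails with ProcessFailed.
     process = subprocess.Popen(args, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
     future = Future()

     def check_process():
         retcode = process.poll()
         if retcode is not None:
             if retcode == 0:
                 future.set_result(process.stdout.read())
             else:
                 future.set_exception(ProcessFailed(
                     std_output=process.stdout.read(),
                     err_output=process.stderr.read()))
             return
         # Still running: check again in one second.
         IOLoop.current().call_later(1, check_process)

     IOLoop.current().add_callback(check_process)
     return future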
Example 4
    def crawl(self, spider_settings):
        """
        Parameters:
            spider_settings (SpiderSetting): spider settings object.

        Returns:
            Future: future

        Because the settings values are written out in %(key)s = '%(value)s' format, the
        original value types are lost and every setting value becomes a string. This is not
        a problem in practice, since Scrapy settings expose a set of strongly-typed accessors.
        Always retrieving values through strongly-typed methods such as settings.getbool,
        settings.getfloat and settings.getint protects you from TypeError.

        see: https://docs.scrapy.org/en/latest/topics/api.html#scrapy.settings.BaseSettings.get

        """
        spider_json_buffer = BytesIO()
        spider_json_buffer.write(spider_settings.to_json().encode('utf8'))
        spider_json_buffer.seek(0)
        items_file_path = path.join(self._work_dir, 'items.jl')
        log_file_path = path.join(self._work_dir, 'crawl.log')


        pargs = ["python", "-m", "scrapydd.utils.runner2"]
        env = {'SCRAPY_FEED_URI': 'items.jl',
               'SCRAPY_EGG': 'spider.egg'}
        container = self._client.containers.create(self.image, pargs,
                                                   detach=True,
                                                   working_dir='/spider_run',
                                                   environment=env)
        self._put_file(container, 'spider.json', spider_json_buffer)
        self._put_egg(container)
        self._start_container(container)
        ret_code = self._wait_container(container)
        while ret_code is None:
            # Yield to the IOLoop between checks so other coroutines can
            # run while the container is still busy.
            yield gen.moment
            ret_code = self._wait_container(container)

        process_output = ensure_str(container.logs())
        if ret_code == 0:
            with open(log_file_path, 'w') as f:
                f.write(process_output)
            self._collect_files(container)
            result = CrawlResult(0, items_file=items_file_path, crawl_logfile=log_file_path)
            self._remove_container(container)
            raise gen.Return(result)
        else:
            self._remove_container(container)
            raise ProcessFailed(err_output=process_output)
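
The docstring above recommends Scrapy's strongly-typed settings accessors. Here is a short illustration of why they matter once every value has been flattened to a string; getbool, getint and getfloat are part of Scrapy's documented BaseSettings API:

 from scrapy.settings import Settings

 settings = Settings()
 settings.set('RETRY_ENABLED', 'False')      # every value arrives as a string
 settings.set('CONCURRENT_REQUESTS', '16')

 bool(settings.get('RETRY_ENABLED'))         # True: any non-empty string is truthy
 settings.getbool('RETRY_ENABLED')           # False: parsed as a boolean
 settings.getint('CONCURRENT_REQUESTS')      # 16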
Example 5
 def check_process():
     logger.debug('create virtualenv process poll.')
     retcode = process.poll()
     if retcode is not None:
         if retcode == 0:
             future.set_result(self)
         else:
             std_output = process.stdout.read()
             err_output = process.stderr.read()
             future.set_exception(
                 ProcessFailed('Error when initializing workspace virtualenv',
                               std_output=std_output,
                               err_output=err_output))
         return
     IOLoop.current().call_later(1, check_process)
Example 6
    def settings_module(self):
        pargs = ["python", "-m", "scrapydd.utils.extract_settings_module", 'spider.egg']
        container = self._client.containers.create(self.image, pargs,
                                                   detach=True, working_dir='/spider_run')
        self._put_egg(container)
        self._start_container(container)
        ret_code = self._wait_container(container)
        while ret_code is None:
            yield gen.moment
            ret_code = self._wait_container(container)

        output = ensure_str(container.logs()).strip()
        self._remove_container(container)
        if ret_code == 0:
            raise gen.Return(output)
        else:
            raise ProcessFailed(err_output=output)
Example 7
 def list(self):
     env = {'SCRAPY_EGG': 'spider.egg'}
     container = self._client.containers.create(self.image, ["python", "-m", "scrapydd.utils.runner", 'list'],
                                                detach=True, working_dir='/spider_run',
                                                environment=env)
     self._put_egg(container)
     self._start_container(container)
     ret_code = self._wait_container(container)
     while ret_code is None:
         yield gen.moment
         ret_code = self._wait_container(container)
     logs = container.logs()
     if ret_code == 0:
         self._collect_files(container)
         self._remove_container(container)
         raise gen.Return(ensure_str(logs).split())
     else:
         self._remove_container(container)
         # Decode the byte logs for consistency with the success path.
         raise ProcessFailed(err_output=ensure_str(logs))
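
Examples 4, 6 and 7 all drive the same container lifecycle: create, copy files in, start, poll until exit, read the logs, remove. The _put_egg, _wait_container and related helpers are internal to scrapydd, but the sketch below shows roughly what they would wrap in the public docker-py API. Note that docker-py's container.wait() blocks until exit, whereas _wait_container above appears to return None while the container is still running; the image name and file contents here are placeholders.

 import io
 import tarfile
 import docker

 client = docker.from_env()
 container = client.containers.create('python:3.8',
                                      ['python', '-m', 'this'],
                                      detach=True,
                                      working_dir='/spider_run')

 # put_archive expects a tar stream, so a single file must be wrapped
 # the way _put_file/_put_egg presumably do.
 data = b'fake egg contents'
 buf = io.BytesIO()
 with tarfile.open(fileobj=buf, mode='w') as tar:
     info = tarfile.TarInfo(name='spider.egg')
     info.size = len(data)
     tar.addfile(info, io.BytesIO(data))
 buf.seek(0)
 container.put_archive('/spider_run', buf.read())

 container.start()
 ret_code = container.wait()['StatusCode']   # docker-py >= 3 returns a dict
 output = container.logs().decode('utf8')
 container.remove()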
Example 8
 def crawl(self, spider_settings):
     """
     Parameters:
         spider_settings (SpiderSetting): spider settings object.
     """
     self._spider_settings = spider_settings
     yield self._prepare()
     crawl_log_path = path.join(self._work_dir, 'crawl.log')
     f_crawl_log = open(crawl_log_path, 'w')
     try:
         ret = yield self._project_workspace.run_spider(spider_settings.spider_name,
                                                        spider_settings.spider_parameters,
                                                        f_output=f_crawl_log,
                                                        project=spider_settings.project_name)
         f_crawl_log.close()
         result = CrawlResult(0, items_file=ret, crawl_logfile=crawl_log_path)
         raise gen.Return(result)
     except ProcessFailed:
         # Close the log first so its contents are flushed to disk, then
         # re-raise with the captured log output attached.
         f_crawl_log.close()
         with open(crawl_log_path, 'r') as f_log:
             error_log = f_log.read()
         raise ProcessFailed(err_output=error_log)
Example 9
        def done(process):
            self.processes.remove(process)
            # Note: a None returncode (process still running) would also be
            # falsy here; this callback assumes the process has exited.
            if process.returncode:
                return ret_future.set_exception(ProcessFailed())

            return ret_future.set_result(items_file)
Example 10
 def list(self):
     # Return a future that has already failed; callers yielding or
     # awaiting it will see the ProcessFailed exception.
     future = Future()
     future.set_exception(ProcessFailed())
     return future
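
Consuming such a pre-failed future inside a Tornado coroutine surfaces the exception at the yield point. A small usage sketch, where runner is a hypothetical object exposing the list method above:

 from tornado import gen

 @gen.coroutine
 def show_spiders(runner):
     try:
         spiders = yield runner.list()
         print(spiders)
     except ProcessFailed as e:
         print('listing spiders failed:', e)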