Example #1
 def index(self):
     """ Create new environment, fill it, and create an IndexAPI
     """
     from docido_sdk.index.config import YamlPullCrawlersIndexingConfig
     config_yaml = osp.splitext(__file__)[0] + '.yml'
     with docido_config:
         docido_config.clear()
         docido_config.update(Configuration.from_file(config_yaml))
         env = Environment()
         env.temp_dir = tempfile.mkdtemp()
         test_components = self._setup_test_components(env)
         pipeline = env[IndexPipelineProvider]
         # component lookup instantiates and registers Elasticsearch
         env[Elasticsearch]
         try:
             # build and provide an IndexAPI
             env[YamlPullCrawlersIndexingConfig]
             yield pipeline.get_index_api(None, None, None, None)
         finally:
             # Hide from Environment the Component classes defined
             # for this test only.
             for test_component in test_components:
                 test_component.unregister()
             # Remove the temporary directory created during setup
             if osp.isdir(env.temp_dir):
                 shutil.rmtree(env.temp_dir)
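
The bare yield in the middle of index() is the giveaway that this method is meant to be wrapped as a context manager (e.g. with contextlib.contextmanager) or used as a pytest fixture: everything before the yield is setup, and the finally block is teardown that runs whether or not the test body raised. A minimal, self-contained sketch of the same build/yield/cleanup shape (the names below are illustrative, not part of docido_sdk):

import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def temp_workspace():
    # setup: create the resource before handing control to the caller
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        # teardown runs even if the with-block raised
        shutil.rmtree(path)

with temp_workspace() as workspace:
    print('working in', workspace)
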
 def run_crawl(self, cls, *args, **kwargs):
     with restore_dict_kv(os.environ, 'DOCIDO_CC_RUNS'), \
             docido_config, \
             self.crawler(cls, *args, **kwargs), \
             self.check_crawl(*args, **kwargs):
         config_prefix = osp.splitext(__file__)[0]
         os.environ['DOCIDO_CC_RUNS'] = config_prefix + '-runs.yml'
         config_settings = config_prefix + '-settings.yml'
         docido_config.update(Configuration.from_file(config_settings))
         # Run every configured crawl, then discard its dump directory
         for c in dcc_run.run([], environment=Environment()):
             shutil.rmtree(c['crawl_path'])
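
restore_dict_kv saves the current value of os.environ['DOCIDO_CC_RUNS'] on entry and puts it back on exit, so the test cannot leak the variable into whatever runs next. The real helper ships with docido_sdk; a sketch of what it presumably does, assuming plain save/restore semantics:

from contextlib import contextmanager

@contextmanager
def restore_dict_kv(d, key):
    # Hypothetical re-implementation: remember whether the key existed
    # and what it held, then restore that state on exit.
    sentinel = object()
    saved = d.get(key, sentinel)
    try:
        yield
    finally:
        if saved is sentinel:
            d.pop(key, None)
        else:
            d[key] = saved
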
 def index(self):
     from docido_sdk.index.config import YamlPullCrawlersIndexingConfig
     config_yaml = osp.splitext(__file__)[0] + '.yml'
     with docido_config:
         docido_config.clear()
         docido_config.update(Configuration.from_file(config_yaml))
         env = Environment()
         test_components = self._setup_test_components(env)
         env[IndexPipelineProvider]
         env[LocalDumbIndex]
         env[processor.CheckProcessor]
         try:
             env[YamlPullCrawlersIndexingConfig]
             index_builder = env[IndexPipelineProvider]
             yield index_builder.get_index_api('check-processor-test',
                                               'user2', 'account3', None)
         finally:
             for test_component in test_components:
                 test_component.unregister()
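
The bare env[IndexPipelineProvider], env[LocalDumbIndex] and env[processor.CheckProcessor] statements look like no-ops but are not: docido_sdk's Environment follows the Trac component model, where indexing the environment with a component class lazily instantiates that component and registers it. A minimal sketch of the lookup pattern (illustrative only, not docido_sdk's actual implementation):

class Environment(object):
    def __init__(self):
        self._components = {}

    def __getitem__(self, cls):
        # components are singletons keyed by class, created on first access
        if cls not in self._components:
            self._components[cls] = cls()
        return self._components[cls]
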
 def index(self):
     from docido_sdk.index.config import YamlPullCrawlersIndexingConfig
     config_yaml = osp.splitext(__file__)[0] + '.yml'
     with docido_config:
         docido_config.clear()
         docido_config.update(Configuration.from_file(config_yaml))
         env = Environment()
         test_components = self._setup_test_components(env)
         env[IndexPipelineProvider]
         env[LocalDumbIndex]
         env[processor.CheckProcessor]
         try:
             env[YamlPullCrawlersIndexingConfig]
             index_builder = env[IndexPipelineProvider]
             yield index_builder.get_index_api(
                 'check-processor-test', 'user2', 'account3'
             )
         finally:
             for test_component in test_components:
                 test_component.unregister()
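
The two index() variants above differ only in the last argument to get_index_api: one passes an explicit None, the other omits it, which suggests the trailing parameter is optional. A hypothetical signature reconstructed from the call sites in these examples (parameter names are guesses; only the arity comes from the calls):

def get_index_api(self, service=None, docido_user_id=None,
                  account_login=None, config=None):
    ...
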
Example #7
 def run(self, logger, config, crawler):
     logger.info("starting crawl")
     self.prepare_crawl_path()
     logger.info('pushed data will be stored in {}'.format(self.crawl_path))
     # `env` is a module-level Environment imported by the surrounding code
     index_provider = env[IndexPipelineProvider]
     with docido_config:
         if config.environment is not None:
             docido_config.clear()
             new_config = Configuration.from_file(config.environment)
             docido_config.update(new_config)
         index_api = index_provider.get_index_api(
             self.service, None, None, config.get('config') or {}
         )
         runner = TasksRunner(crawler, index_api, config, logger)
         self._check_pickle(runner.tasks)
         runner.execute()
     return {
         'service': self.service,
         'name': self.launch,
         'crawl_path': self.crawl_path,
     }
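
with docido_config: makes the global configuration transactional: entering the block snapshots the current settings and leaving it restores them, so the clear()/update() calls inside cannot affect anything that runs afterwards. A minimal sketch of a dict with that behavior (docido_sdk's real Configuration may differ):

class SnapshotDict(dict):
    # Illustrative stand-in for a configuration whose with-blocks undo
    # themselves on exit.
    def __init__(self, *args, **kwargs):
        super(SnapshotDict, self).__init__(*args, **kwargs)
        self._snapshots = []

    def __enter__(self):
        self._snapshots.append(dict(self))
        return self

    def __exit__(self, *exc_info):
        self.clear()
        self.update(self._snapshots.pop())
        return False  # never swallow exceptions

config = SnapshotDict(a=1)
with config:
    config.clear()
    config.update(b=2)
assert config == {'a': 1}
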
Example #8
 def run(self, logger, config, crawler):
     logger.info("starting crawl")
     self.prepare_crawl_path()
     logger.info('pushed data will be stored in {}'.format(self.crawl_path))
     index_provider = env[IndexPipelineProvider]
     with docido_config:
         if config.environment is not None:
             docido_config.clear()
             new_config = Configuration.from_file(config.environment)
             docido_config.update(new_config)
         index_api = index_provider.get_index_api(
             self.service, None, None,
             config.get('config') or {})
         runner = TasksRunner(crawler, index_api, config, logger)
         self._check_pickle(runner.tasks)
         runner.execute()
     return {
         'service': self.service,
         'name': self.launch,
         'crawl_path': self.crawl_path,
     }
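
Apart from how the get_index_api call is wrapped, this is the same code as Example #7. Note the config.get('config') or {} idiom: unlike config.get('config', {}), it also yields an empty dict when the key is present but explicitly null in the settings file:

settings = {'config': None}
assert settings.get('config', {}) is None     # default is not used
assert (settings.get('config') or {}) == {}   # null coerced to {}
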
Example #9
    def run(self, logger, config, crawler):
        logger.info("starting crawl")
        self.prepare_crawl_path()
        logger.info('pushed data will be stored in {}'.format(self.crawl_path))
        index_provider = env[IndexPipelineProvider]
        with docido_config:
            if config.config is not None:
                docido_config.clear()
                new_config = Configuration.from_file(config.config)
                docido_config.update(new_config)
            index_api = index_provider.get_index_api(
                self.service, None, None
            )
            attempt = 1
            while True:
                try:
                    tasks = crawler.iter_crawl_tasks(
                        index_api, config.token,
                        logger, config.get('full', False)
                    )
                    break
                except Retry as e:
                    try:
                        wait_or_raise(logger, e, attempt)
                    except Exception:
                        logger.exception('Max retries reached')
                        raise
                    else:
                        attempt += 1
                except Exception:
                    logger.exception('Unexpected exception was raised')
                    raise

            self._check_pickle(tasks)
            tasks, epilogue, concurrency = reorg_crawl_tasks(
                tasks,
                int(config.get('max_concurrent_tasks', 2))
            )
            tasks = split_crawl_tasks(tasks, concurrency)

            def _runtask(task, prev_result):
                attempt = 1
                result = None
                kwargs = dict()
                while True:
                    try:
                        result = task(index_api, config.token,
                                      prev_result, logger, **kwargs)
                        break
                    except Retry as e:
                        try:
                            wait_or_raise(logger, e, attempt)
                        except Exception:
                            logger.exception('Max retries reached')
                            result = e
                            break
                        else:
                            attempt += 1
                            kwargs = e.kwargs
                    except Exception as e:
                        logger.exception('Unexpected exception was raised')
                        result = e
                        break
                return result

            results = []
            for seq in tasks:
                previous_result = None
                for task in seq:
                    previous_result = _runtask(task, previous_result)
                results.append(previous_result)
            if epilogue is not None:
                _runtask(epilogue, results)
        return {
            'service': self.service,
            'name': self.launch,
            'crawl_path': self.crawl_path,
        }
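
Both retry loops in this example follow the same protocol: a task signals a transient failure by raising Retry, and wait_or_raise either sleeps before the next attempt or re-raises once the attempt count exceeds the budget; any other exception aborts the crawl immediately. A sketch of what the two helpers presumably look like (the real ones live in docido_sdk and may differ):

import time

class Retry(Exception):
    # Hypothetical stand-in: raised by a task to request another attempt,
    # optionally forwarding keyword arguments to the next run via kwargs.
    def __init__(self, countdown=1, max_retries=3, kwargs=None):
        super(Retry, self).__init__()
        self.countdown = countdown
        self.max_retries = max_retries
        self.kwargs = kwargs or {}

def wait_or_raise(logger, retry, attempt):
    # Re-raise once the budget is spent, otherwise back off and return.
    if retry.max_retries is not None and attempt >= retry.max_retries:
        raise retry
    logger.warning('attempt %d failed, retrying in %s second(s)',
                   attempt, retry.countdown)
    time.sleep(retry.countdown)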