def index(self): """ Create new environment, fill it, and create an IndexAPI """ from docido_sdk.index.config import YamlPullCrawlersIndexingConfig config_yaml = osp.splitext(__file__)[0] + '.yml' with docido_config: docido_config.clear() docido_config.update(Configuration.from_file(config_yaml)) env = Environment() env.temp_dir = tempfile.mkdtemp() test_components = self._setup_test_components(env) pipeline = env[IndexPipelineProvider] env[Elasticsearch] try: # build and provide an IndexAPI env[YamlPullCrawlersIndexingConfig] yield pipeline.get_index_api(None, None, None, None) finally: # Hide from Environment the Component classes defined # for this test only. for test_component in test_components: test_component.unregister() # Remove temporary directory previously created if osp.isdir(env.temp_dir): shutil.rmtree(env.temp_dir)
def run_crawl(self, cls, *args, **kwargs):
    with restore_dict_kv(os.environ, 'DOCIDO_CC_RUNS'), \
            docido_config, \
            self.crawler(cls, *args, **kwargs), \
            self.check_crawl(*args, **kwargs):
        config_prefix = osp.splitext(__file__)[0]
        os.environ['DOCIDO_CC_RUNS'] = config_prefix + '-runs.yml'
        config_settings = config_prefix + '-settings.yml'
        docido_config.update(Configuration.from_file(config_settings))
        for c in dcc_run.run([], environment=Environment()):
            shutil.rmtree(c['crawl_path'])
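
# Sketch of the `restore_dict_kv` helper used above (assumption: the real
# docido_sdk implementation may differ). The idea is to remember one key of
# a dict (here os.environ) on entry and restore or remove it on exit, so
# the test can set DOCIDO_CC_RUNS without leaking state to other tests.
import contextlib


@contextlib.contextmanager
def restore_dict_kv_sketch(a_dict, key):
    sentinel = object()
    saved = a_dict.get(key, sentinel)  # remember the previous value, if any
    try:
        yield
    finally:
        if saved is sentinel:
            a_dict.pop(key, None)  # key did not exist before: remove it
        else:
            a_dict[key] = saved  # put the original value back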
def index(self):
    from docido_sdk.index.config import YamlPullCrawlersIndexingConfig
    config_yaml = osp.splitext(__file__)[0] + '.yml'
    with docido_config:
        docido_config.clear()
        docido_config.update(Configuration.from_file(config_yaml))
        env = Environment()
        test_components = self._setup_test_components(env)
        env[IndexPipelineProvider]
        env[LocalDumbIndex]
        env[processor.CheckProcessor]
        try:
            env[YamlPullCrawlersIndexingConfig]
            index_builder = env[IndexPipelineProvider]
            yield index_builder.get_index_api(
                'check-processor-test', 'user2', 'account3', None
            )
        finally:
            for test_component in test_components:
                test_component.unregister()
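
# Note on the bare `env[SomeComponent]` expressions above: in the
# Trac-style component system used by docido_sdk, indexing an Environment
# with a Component subclass instantiates (and caches) that component, so
# the statement's only purpose is its side effect of activating the
# component. A minimal sketch of that lookup pattern, with hypothetical
# names:
class EnvironmentSketch(object):
    def __init__(self):
        self._components = {}

    def __getitem__(self, cls):
        # instantiate on first access, then reuse the cached instance
        if cls not in self._components:
            self._components[cls] = cls()
        return self._components[cls]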
def run(self, logger, config, crawler):
    logger.info("starting crawl")
    self.prepare_crawl_path()
    logger.info('pushed data will be stored in {}'.format(self.crawl_path))
    index_provider = env[IndexPipelineProvider]
    with docido_config:
        if config.environment is not None:
            docido_config.clear()
            new_config = Configuration.from_file(config.environment)
            docido_config.update(new_config)
        index_api = index_provider.get_index_api(
            self.service, None, None, config.get('config') or {}
        )
        runner = TasksRunner(crawler, index_api, config, logger)
        self._check_pickle(runner.tasks)
        runner.execute()
    return {
        'service': self.service,
        'name': self.launch,
        'crawl_path': self.crawl_path,
    }
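
# Note on `with docido_config:` (used here and in the fixtures above): the
# global configuration object is used as a context manager that snapshots
# its state on entry and restores it on exit, which is why the code may
# freely `clear()` and `update()` it inside the block. A minimal sketch of
# that save/restore pattern, assuming dict-like semantics (the real
# docido_sdk Configuration may differ):
import copy


class ConfigSnapshotSketch(dict):
    def __enter__(self):
        self._saved = copy.deepcopy(dict(self))  # snapshot current state
        return self

    def __exit__(self, *exc_info):
        self.clear()
        self.update(self._saved)  # restore the snapshot
        return False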
def run(self, logger, config, crawler):
    logger.info("starting crawl")
    self.prepare_crawl_path()
    logger.info('pushed data will be stored in {}'.format(self.crawl_path))
    index_provider = env[IndexPipelineProvider]
    with docido_config:
        if config.config is not None:
            docido_config.clear()
            new_config = Configuration.from_file(config.config)
            docido_config.update(new_config)
        index_api = index_provider.get_index_api(self.service, None, None)
        # Ask the crawler for its task list, honoring `Retry` requests.
        attempt = 1
        while True:
            try:
                tasks = crawler.iter_crawl_tasks(
                    index_api, config.token, logger,
                    config.get('full', False)
                )
                break
            except Retry as e:
                try:
                    wait_or_raise(logger, e, attempt)
                except Exception:
                    logger.exception('Max retries reached')
                    raise
                else:
                    attempt += 1
            except Exception:
                logger.exception('Unexpected exception was raised')
                raise
        self._check_pickle(tasks)
        tasks, epilogue, concurrency = reorg_crawl_tasks(
            tasks, int(config.get('max_concurrent_tasks', 2))
        )
        tasks = split_crawl_tasks(tasks, concurrency)

        def _runtask(task, prev_result):
            # Run one task, retrying as requested. Unlike the loop above,
            # a final failure is recorded as the result, not re-raised.
            attempt = 1
            result = None
            kwargs = dict()
            while True:
                try:
                    result = task(index_api, config.token, prev_result,
                                  logger, **kwargs)
                    break
                except Retry as e:
                    try:
                        wait_or_raise(logger, e, attempt)
                    except Exception:
                        logger.exception('Max retries reached')
                        result = e
                        break
                    else:
                        attempt += 1
                        kwargs = e.kwargs
                except Exception as e:
                    logger.exception('Unexpected exception was raised')
                    result = e
                    break
            return result

        # Run each task sequence in order, feeding every task the previous
        # task's result, then hand the collected results to the epilogue.
        results = []
        for seq in tasks:
            previous_result = None
            for task in seq:
                previous_result = _runtask(task, previous_result)
            results.append(previous_result)
        if epilogue is not None:
            _runtask(epilogue, results)
    return {
        'service': self.service,
        'name': self.launch,
        'crawl_path': self.crawl_path,
    }
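
# Sketch of the `wait_or_raise` helper driving both retry loops above
# (assumption: the real docido_sdk version may differ). It sleeps for the
# delay requested by the Retry exception, or re-raises it once the attempt
# counter exceeds the allowed number of retries. The `countdown` and
# `max_retries` attribute names are assumptions for illustration.
import time


def wait_or_raise_sketch(logger, retry_exc, attempt):
    max_retries = getattr(retry_exc, 'max_retries', 3)
    if max_retries is not None and attempt > max_retries:
        raise retry_exc  # give up; the caller logs and handles the failure
    delay = getattr(retry_exc, 'countdown', 0)
    logger.info('task asked to retry in %s seconds', delay)
    time.sleep(delay)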