import multiprocessing as mp

import sqlalchemy as sa

from ckan.cli import error_shout
from ckan.common import config


def rebuild_fast():
    from ckan.lib.search import commit

    db_url = config['sqlalchemy.url']
    engine = sa.create_engine(db_url)
    package_ids = []
    result = engine.execute(u"select id from package where state = 'active';")
    for row in result:
        package_ids.append(row[0])

    def start(ids: list[str]):
        from ckan.lib.search import rebuild
        rebuild(package_ids=ids)

    def chunks(list_: list[str], n: int):
        u"""Yield n successive chunks from list_."""
        newn = int(len(list_) / n)
        for i in range(0, n - 1):
            yield list_[i * newn:i * newn + newn]
        # the last chunk absorbs the remainder when len(list_) % n != 0
        yield list_[n * newn - newn:]

    processes = []
    try:
        # index each chunk of package IDs in its own worker process
        for chunk in chunks(package_ids, mp.cpu_count()):
            process = mp.Process(target=start, args=(chunk,))
            processes.append(process)
            process.daemon = True
            process.start()

        for process in processes:
            process.join()
        commit()
    except Exception as e:
        error_shout(e)
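A quick standalone check of the `chunks` helper above, since the slicing is easy to get wrong: because `newn` is the floor of `len(list_) / n`, the final `yield` hands the remainder to the last chunk, so no package IDs are dropped. A minimal sketch using only the stdlib; the sample IDs are placeholders:

def chunks(list_, n):
    newn = int(len(list_) / n)
    for i in range(0, n - 1):
        yield list_[i * newn:i * newn + newn]
    yield list_[n * newn - newn:]

ids = ['pkg-%d' % i for i in range(10)]
parts = list(chunks(ids, 4))
# four chunks of sizes 2, 2, 2 and 4 -- the last one absorbs the remainder
assert sum(len(p) for p in parts) == len(ids)
assert [x for p in parts for x in p] == ids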
import ckan.plugins.toolkit as tk


def rebuild(verbose, force, refresh, only_missing, quiet, commit_each):
    u'''Rebuild search index'''
    from ckan.lib.search import rebuild, commit
    try:
        rebuild(only_missing=only_missing, force=force, refresh=refresh,
                defer_commit=(not commit_each), quiet=quiet)
    except Exception as e:
        tk.error_shout(e)
    # a single commit at the end is much cheaper than committing per dataset
    if not commit_each:
        commit()
import datetime
import sys
import time

from ckan.lib.search import rebuild, commit


def publish_ogc_worker(self):
    '''
    Publish dataset wms/wfs to geoserver by popping an element
    (dataset id) from the publish_ogc_queue (redis)
    '''
    print(str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: Started the worker process')
    # flush stdout, see https://github.com/Supervisor/supervisor/issues/13
    sys.stdout.flush()
    try:
        r = self._redis_connection()
    except Exception:
        print(str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: ERROR, could not connect to Redis')
        sys.stdout.flush()
        # without a Redis connection the loop below cannot run
        return

    # Lovely infinite loop ;P, we do need them from time to time
    while True:
        # POP an element (package_id) from publish_ogc_queue and publish it to ogc
        package_id = None
        try:
            # slow down this loop by setting the blpop timeout to 5 seconds
            # for when publish_ogc_queue is empty
            queue_task = r.blpop('publish_ogc_queue', 5)
            if queue_task is not None:
                package_id = queue_task[1]
                print(str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: Start publishing dataset: ' + package_id)
                sys.stdout.flush()
                self.publish_ogc(package_id)
                print(str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: finished publishing, now index: ' + package_id)
                sys.stdout.flush()
                # rebuild solr index for this dataset to avoid duplicate
                # datasets in search results
                rebuild(package_id)
                commit()
        except Exception:
            print(str(datetime.datetime.now()) + ' PUBLISH_OGC_WORKER: An error has occurred while publishing dataset: ' + str(package_id) + ' to GeoServer')
            sys.stdout.flush()
            # retry in 30 seconds if something went south
            time.sleep(30)
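For context, the worker above consumes with `blpop`, so a matching producer pushes dataset IDs onto the other end of the same Redis list. A minimal sketch using redis-py; the connection settings and dataset ID are placeholders, only the queue name comes from the worker:

import redis

# The worker's blpop pops from the left, so producers push onto the right.
r = redis.StrictRedis(host='localhost', port=6379, db=0)  # assumed connection settings
r.rpush('publish_ogc_queue', 'my-dataset-id')  # 'my-dataset-id' is a placeholder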
from ckan.cli import error_shout


def rebuild(ctx, verbose, force, refresh, only_missing, quiet, commit_each):
    u'''Rebuild search index'''
    flask_app = ctx.obj.app.apps['flask_app']._wsgi_app
    from ckan.lib.search import rebuild, commit
    try:
        # the indexing code relies on a Flask request context being active
        with flask_app.test_request_context():
            rebuild(only_missing=only_missing, force=force, refresh=refresh,
                    defer_commit=(not commit_each), quiet=quiet)
    except Exception as e:
        error_shout(e)
    if not commit_each:
        commit()
from ckan.cli import error_shout


def rebuild(verbose: bool, force: bool, only_missing: bool, quiet: bool,
            commit_each: bool, package_id: str, clear: bool):
    u'''Rebuild search index'''
    from ckan.lib.search import rebuild, commit
    try:
        rebuild(package_id, only_missing=only_missing, force=force,
                defer_commit=(not commit_each), quiet=quiet, clear=clear)
    except Exception as e:
        error_shout(e)
    if not commit_each:
        commit()
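A sketch of how a function with this signature might be wired up as a click command. The flag spellings below are assumptions modeled on CKAN's `ckan search-index rebuild` CLI, not a verbatim copy of it:

import click

@click.command(name='rebuild')
@click.option('-v', '--verbose', is_flag=True)
@click.option('-f', '--force', is_flag=True,
              help='Ignore exceptions when rebuilding the index')
@click.option('-o', '--only-missing', is_flag=True,
              help='Index only datasets missing from the search index')
@click.option('-q', '--quiet', is_flag=True)
@click.option('-e', '--commit-each', is_flag=True)
@click.option('-c', '--clear', is_flag=True,
              help='Clear the index before rebuilding')
@click.argument('package_id', required=False)
def rebuild_command(verbose, force, only_missing, quiet, commit_each,
                    package_id, clear):
    # delegate to the rebuild() shown above
    rebuild(verbose, force, only_missing, quiet, commit_each, package_id, clear)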
def rebuild(self):
    from ckan.lib.search import rebuild, commit

    # By default we don't commit after each request to Solr, as it is
    # a really heavy operation and slows things a lot
    if len(self.args) > 1:
        # a specific package id/name was given on the command line
        rebuild(self.args[1])
    else:
        rebuild(only_missing=self.options.only_missing,
                force=self.options.force,
                refresh=self.options.refresh,
                defer_commit=(not self.options.commit_each))
    if not self.options.commit_each:
        commit()
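For context, `self.options` in this paster-style command comes from an optparse parser declared on the command class. A sketch of the wiring such a command would need; the flag spellings are assumptions derived from the attribute names used above:

from optparse import OptionParser

# hypothetical parser matching the self.options attributes used above
parser = OptionParser()
parser.add_option('-o', '--only-missing', dest='only_missing',
                  action='store_true', default=False,
                  help='Index only datasets missing from the search index')
parser.add_option('-f', '--force', dest='force',
                  action='store_true', default=False,
                  help='Ignore exceptions when rebuilding the index')
parser.add_option('-r', '--refresh', dest='refresh',
                  action='store_true', default=False,
                  help='Refresh the index without clearing it first')
parser.add_option('-e', '--commit-each', dest='commit_each',
                  action='store_true', default=False,
                  help='Commit to Solr after each dataset (slow)')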
def start(ids):
    from ckan.lib.search import rebuild, commit
    rebuild(package_ids=ids)
    commit()
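A usage sketch for this variant, following the same multiprocessing pattern as `rebuild_fast` in the first snippet; note that because `start()` commits itself, the parent process needs no final `commit()`. The sample IDs and two-way split are placeholders:

import multiprocessing as mp

ids = ['pkg-1', 'pkg-2', 'pkg-3', 'pkg-4']  # placeholder package IDs
processes = []
for chunk in (ids[:2], ids[2:]):
    p = mp.Process(target=start, args=(chunk,))
    p.start()
    processes.append(p)
for p in processes:
    p.join()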
import ckan.lib.helpers as h
import ckan.model as model
import ckan.plugins.toolkit as tk
from ckan.common import g, request
from ckan.lib.search import clear, commit, rebuild
from sqlalchemy import func


def format_mapping(self):
    try:
        tk.check_access('sysadmin', {'user': g.user, 'model': model})
    except tk.NotAuthorized:
        return tk.abort(403)

    if request.method == 'POST':
        old = request.POST.get('from')
        new = request.POST.get('to')
        if old and new:
            ids = set()
            res_query = model.Session.query(model.Resource).filter_by(
                format=old, state='active'
            )
            for res in res_query:
                ids.add(res.package_id)
            res_query.update({'format': new})
            model.Session.commit()
            # reindex every affected package, committing to Solr once at the end
            for pkg_id in ids:
                clear(pkg_id)
                rebuild(pkg_id, defer_commit=True)
            commit()
            tk.h.flash_success(
                'Updated. Records changed: {}'.format(len(ids))
            )
        return tk.redirect_to('format_mapping')

    defined = set(
        map(lambda (_1, fmt, _3): fmt, h.resource_formats().values())
    )
    db_formats = model.Session.query(
        model.Resource.format,
        func.count(model.Resource.id),
        func.count(model.PackageExtra.value)
    ).outerjoin(
        model.PackageExtra,
        (model.Resource.package_id == model.PackageExtra.package_id)
        & ((model.PackageExtra.key == 'harvest_portal')
           | (model.PackageExtra.key.is_(None)))
    ).group_by(model.Resource.format).filter(
        model.Resource.format != '',
        model.Resource.state == 'active'
    )
    db_formats = db_formats.all()
    # dict-literal trick: later keys overwrite earlier ones, so the [True]
    # lookup yields 'Local' when no resources are external, 'External' when
    # all of them are, and the default 'Partially external' otherwise
    format_types = {
        f: {
            True: 'Partially external',
            e == 0: 'Local',
            t - e == 0: 'External'
        }[True]
        for (f, t, e) in db_formats
    }
    used = set(format_types)
    undefined = used - defined
    extra_vars = {
        'undefined': undefined,
        'defined': defined,
        'format_types': format_types
    }
    return tk.render('admin/format_mapping.html', extra_vars)
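The `format_types` comprehension above leans on dict-literal evaluation order: later keys overwrite earlier ones, and the `[True]` lookup then selects the last condition that held. An equivalent, more explicit helper (standalone sketch; the function name is hypothetical):

def classify_format(total, external):
    # total: number of active resources with this format
    # external: how many of those belong to harvested ("external") datasets
    if external == 0:
        return 'Local'
    if total - external == 0:
        return 'External'
    return 'Partially external'

assert classify_format(5, 0) == 'Local'
assert classify_format(5, 5) == 'External'
assert classify_format(5, 2) == 'Partially external'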