def deploy_tree(tree, es, index_name):
    """Point the ES aliases and catalog records to a newly built tree, and
    delete any obsoleted index.

    """
    config = tree.config

    # Flip the alias over to the freshly built index, making it live:
    alias = config.es_alias.format(format=FORMAT, tree=tree.name)
    swap_alias(alias, index_name, es)

    # Settings and mappings for the catalog index, built up front:
    catalog_settings = {
        'settings': {
            'index': {
                # A single shard should be fastest:
                'number_of_shards': 1,
                # Crank this up until it's on all nodes, so reads are
                # always local and fast:
                'number_of_replicas': config.es_catalog_replicas
            },
        },
        'mappings': {
            TREE: {
                '_all': {
                    'enabled': False
                },
                'properties': {
                    'name': UNANALYZED_STRING,
                    'format': UNANALYZED_STRING,
                    # Kept here in case es_alias changes in the conf file:
                    'es_alias': UNINDEXED_STRING,
                    # Stored so new trees or edited descriptions can show
                    # up without a WSGI restart:
                    'description': UNINDEXED_STRING,
                    # e.g. ["clang", "pygmentize"]:
                    'enabled_plugins': UNINDEXED_STRING,
                    'generated_date': UNINDEXED_STRING
                    # Someday we may also need to serialize some plugin
                    # configuration here.
                }
            }
        }
    }

    # Create the catalog index unless it already exists:
    try:
        create_index_and_wait(es,
                              config.es_catalog_index,
                              settings=catalog_settings)
    except IndexAlreadyExistsError:
        pass

    # Insert or update the doc representing this tree. There'll be a little
    # race between this and the alias swap. We'll live.
    tree_doc = {
        'name': tree.name,
        'format': FORMAT,
        'es_alias': alias,
        'description': tree.description,
        'enabled_plugins': [p.name for p in tree.enabled_plugins],
        'generated_date': config.generated_date
    }
    es.index(config.es_catalog_index,
             doc_type=TREE,
             doc=tree_doc,
             id='%s/%s' % (FORMAT, tree.name))
def index_tree(tree, es, verbose=False): """Index a single tree into ES and the filesystem, and return the name of the new ES index. """ config = tree.config def new_pool(): return ProcessPoolExecutor(max_workers=config.workers) def farm_out(method_name): """Farm out a call to all tree indexers across a process pool. Return the tree indexers, including anything mutations the method call might have made. Show progress while doing it. """ if not config.workers: return [save_scribbles(ti, method_name) for ti in tree_indexers] else: futures = [pool.submit(full_traceback, save_scribbles, ti, method_name) for ti in tree_indexers] return [future.result() for future in show_progress(futures, 'Running %s' % method_name)] def delete_index_quietly(es, index): """Delete an index, and ignore any error. This cannot be done inline in the except clause below, because, even if we catch this exception, it spoils the exception info in that scope, making the bare ``raise`` raise the not-found error rather than whatever went wrong earlier. """ try: es.delete_index(index) except Exception: pass print "Starting tree '%s'." % tree.name # Note starting time start_time = datetime.now() skip_indexing = 'index' in config.skip_stages skip_build = 'build' in config.skip_stages skip_cleanup = skip_indexing or skip_build or 'clean' in config.skip_stages # Create and/or clear out folders: ensure_folder(tree.object_folder, tree.source_folder != tree.object_folder) ensure_folder(tree.temp_folder, not skip_cleanup) ensure_folder(tree.log_folder, not skip_cleanup) ensure_folder(join(tree.temp_folder, 'plugins'), not skip_cleanup) for plugin in tree.enabled_plugins: ensure_folder(join(tree.temp_folder, 'plugins', plugin.name), not skip_cleanup) vcs_cache = VcsCache(tree) tree_indexers = [p.tree_to_index(p.name, tree, vcs_cache) for p in tree.enabled_plugins if p.tree_to_index] try: if not skip_indexing: # Substitute the format, tree name, and uuid into the index identifier. 
index = tree.es_index.format(format=FORMAT, tree=tree.name, unique=uuid1()) create_index_and_wait( es, index, settings={ 'settings': { 'index': { 'number_of_shards': tree.es_shards, # Fewer should be faster, assuming enough RAM. 'number_of_replicas': 0 # for speed }, # Default analyzers and mappings are in the core plugin. 'analysis': reduce( deep_update, (p.analyzers for p in tree.enabled_plugins), {}), # DXR indices are immutable once built. Turn the # refresh interval down to keep the segment count low # while indexing. It will make for less merging later. # We could also simply call "optimize" after we're # done indexing, but it is unthrottled; we'd have to # use shard allocation to do the indexing on one box # and then move it elsewhere for actual use. 'refresh_interval': '%is' % config.es_refresh_interval }, 'mappings': reduce(deep_update, (p.mappings for p in tree.enabled_plugins), {}) }) else: index = None print "Skipping indexing (due to 'index' in 'skip_stages')" # Run pre-build hooks: with new_pool() as pool: tree_indexers = farm_out('pre_build') # Tear down pool to let the build process use more RAM. if not skip_build: # Set up env vars, and build: build_tree(tree, tree_indexers, verbose) else: print "Skipping rebuild (due to 'build' in 'skip_stages')" # Post-build, and index files: if not skip_indexing: with new_pool() as pool: tree_indexers = farm_out('post_build') index_files(tree, tree_indexers, index, pool, es) # refresh() times out in prod. Wait until it doesn't. That # probably means things are ready to rock again. with aligned_progressbar(repeat(None), label='Refreshing index') as bar: for _ in bar: try: es.refresh(index=index) except (ConnectionError, Timeout) as exc: pass else: break es.update_settings( index, { 'settings': { 'index': { 'number_of_replicas': 1 # fairly arbitrary } } }) except Exception as exc: # If anything went wrong, delete the index, because we're not # going to have a way of returning its name if we raise an # exception. 
if not skip_indexing: delete_index_quietly(es, index) raise print "Finished '%s' in %s." % (tree.name, datetime.now() - start_time) if not skip_cleanup: # By default, we remove the temp files, because they're huge. rmtree(tree.temp_folder) return index
def index_tree(tree, es, verbose=False): """Index a single tree into ES and the filesystem, and return the name of the new ES index. """ def new_pool(): return ProcessPoolExecutor(max_workers=tree.workers) def farm_out(method_name): """Farm out a call to all tree indexers across a process pool. Return the tree indexers, including anything mutations the method call might have made. Show progress while doing it. """ if not tree.workers: return [save_scribbles(ti, method_name) for ti in tree_indexers] else: futures = [ pool.submit(full_traceback, save_scribbles, ti, method_name) for ti in tree_indexers ] return [ future.result() for future in show_progress(futures, 'Running %s' % method_name) ] def delete_index_quietly(es, index): """Delete an index, and ignore any error. This cannot be done inline in the except clause below, because, even if we catch this exception, it spoils the exception info in that scope, making the bare ``raise`` raise the not-found error rather than whatever went wrong earlier. """ try: es.delete_index(index) except Exception: pass print "Starting tree '%s'." % tree.name # Note starting time start_time = datetime.now() config = tree.config skip_indexing = 'index' in config.skip_stages skip_build = 'build' in config.skip_stages skip_cleanup = skip_indexing or skip_build or 'clean' in config.skip_stages # Create and/or clear out folders: ensure_folder(tree.object_folder, tree.source_folder != tree.object_folder) ensure_folder(tree.temp_folder, not skip_cleanup) ensure_folder(tree.log_folder, not skip_cleanup) ensure_folder(join(tree.temp_folder, 'plugins'), not skip_cleanup) for plugin in tree.enabled_plugins: ensure_folder(join(tree.temp_folder, 'plugins', plugin.name), not skip_cleanup) vcs_cache = VcsCache(tree) tree_indexers = [ p.tree_to_index(p.name, tree, vcs_cache) for p in tree.enabled_plugins if p.tree_to_index ] try: if not skip_indexing: # Substitute the format, tree name, and uuid into the index identifier. 
index = tree.es_index.format(format=FORMAT, tree=tree.name, unique=uuid1()) create_index_and_wait( es, index, settings={ 'settings': { 'index': { 'number_of_shards': tree. es_shards, # Fewer should be faster, assuming enough RAM. 'number_of_replicas': 0 # for speed }, # Default analyzers and mappings are in the core plugin. 'analysis': reduce(deep_update, (p.analyzers for p in tree.enabled_plugins), {}), # DXR indices are immutable once built. Turn the # refresh interval down to keep the segment count low # while indexing. It will make for less merging later. # We could also simply call "optimize" after we're # done indexing, but it is unthrottled; we'd have to # use shard allocation to do the indexing on one box # and then move it elsewhere for actual use. 'refresh_interval': '%is' % config.es_refresh_interval }, 'mappings': reduce(deep_update, (p.mappings for p in tree.enabled_plugins), {}) }) else: index = None print "Skipping indexing (due to 'index' in 'skip_stages')" # Run pre-build hooks: with new_pool() as pool: tree_indexers = farm_out('pre_build') # Tear down pool to let the build process use more RAM. if not skip_build: # Set up env vars, and build: build_tree(tree, tree_indexers, verbose) else: print "Skipping rebuild (due to 'build' in 'skip_stages')" # Post-build, and index files: if not skip_indexing: with new_pool() as pool: tree_indexers = farm_out('post_build') index_files(tree, tree_indexers, index, pool, es) # refresh() times out in prod. Wait until it doesn't. That # probably means things are ready to rock again. with aligned_progressbar(repeat(None), label='Refreshing index') as bar: for _ in bar: try: es.refresh(index=index) except (ConnectionError, Timeout) as exc: pass else: break es.update_settings( index, { 'settings': { 'index': { 'number_of_replicas': 1 # fairly arbitrary } } }) except Exception as exc: # If anything went wrong, delete the index, because we're not # going to have a way of returning its name if we raise an # exception. 
if not skip_indexing: delete_index_quietly(es, index) raise print "Finished '%s' in %s." % (tree.name, datetime.now() - start_time) if not skip_cleanup: # By default, we remove the temp files, because they're huge. rmtree(tree.temp_folder) return index