def test_diff(self):
    branch_props = elasticsearch.Cluster(
        host="http://localhost"
    ).get_index("debug_active_data", "active_data").get_properties()
    debug_props = elasticsearch.Cluster(
        host="http://localhost"
    ).get_index("debug", "bz_etl").get_properties()
    elasticsearch.diff_schema(branch_props, debug_props)
def __init__(
    self,
    rollover_field,     # the FIELD with a timestamp to use for determining which index to push to
    rollover_interval,  # duration between roll-over to new index
    rollover_max,       # remove old indexes, do not add old records
    schema,             # es schema
    queue_size=10000,   # number of documents to queue in memory
    batch_size=5000,    # number of documents to push at once
    typed=None,         # indicate if we are expecting typed json
    kwargs=None         # plus additional ES settings
):
    if kwargs.tjson != None:
        Log.error("not expected")
    if typed == None:
        Log.error("not expected")

    schema.settings.index.max_result_window = 100000  # REQUIRED FOR ACTIVEDATA NESTED QUERIES
    schema.settings.index.max_inner_result_window = 100000  # REQUIRED FOR ACTIVEDATA NESTED QUERIES

    self.settings = kwargs
    self.locker = Lock("lock for rollover_index")
    self.rollover_field = jx.get(rollover_field)
    self.rollover_interval = self.settings.rollover_interval = Duration(rollover_interval)
    self.rollover_max = self.settings.rollover_max = Duration(rollover_max)
    self.known_queues = {}  # MAP DATE TO INDEX
    self.cluster = elasticsearch.Cluster(self.settings)
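# A minimal construction sketch, assuming the enclosing class is the rollover
# index wrapper (called RolloverIndex here for illustration) and that
# `es_settings` holds the usual Cluster connection settings (host, port,
# index); all values are illustrative:
#
#     index = RolloverIndex(
#         rollover_field="etl.timestamp",  # route each document by this timestamp
#         rollover_interval="week",        # start a new index every week
#         rollover_max="year",             # drop indexes older than a year
#         schema=schema,
#         kwargs=es_settings,
#     )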
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({
            "from": branches.index,
            "format": "list",
            "limit": 10000
        }).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend([
                {"id": b.name + " " + b.locale, "value": b}
                for b in found_branches
            ])
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            branches.schema = branches_schema
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(hg, branches, kwargs=kwargs)  # RETRY, NOW THAT THE INDEX EXISTS
        Log.error("problem getting branches", cause=e)
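# An illustrative call, assuming `hg` and `branches` are the usual settings
# objects (hg.url for the mercurial host, branches.* for the ES cache index);
# names and values here are hypothetical:
#
#     branches = get_branches(
#         hg=Data(url="https://hg.mozilla.org"),
#         branches=Data(host="http://localhost", port=9200, index="branches"),
#     )
#
# The result is a UniqueIndex keyed on ("name", "locale").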
def __init__(
    self,
    host,
    index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
    alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
    type=None,
    name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
    port=9200,
    read_only=True,
    timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
    wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
    typed=None,
    kwargs=None
):
    Container.__init__(self)
    if not container.config.default:
        container.config.default = {
            "type": "elasticsearch",
            "settings": unwrap(kwargs)
        }
    self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
    self.worker = None
    self.settings = kwargs
    self._namespace = ElasticsearchMetadata(kwargs=kwargs)
    self.name = name = self._namespace._find_alias(coalesce(alias, index, name))
    if read_only:
        self.es = elasticsearch.Alias(alias=name, index=None, kwargs=kwargs)
    else:
        self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

    self._ensure_max_result_window_set(name)
    self.settings.type = self.es.settings.type
    self.stats = QueryStats(self.es.cluster)

    columns = self.snowflake.columns  # ABSOLUTE COLUMNS
    is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

    if typed == None:
        # SWITCH ON TYPED MODE
        self.typed = is_typed
    else:
        if is_typed != typed:
            Log.error(
                "Expecting given typed {{typed}} to match {{is_typed}}",
                typed=typed,
                is_typed=is_typed
            )
        self.typed = typed

    if not typed:
        # ADD EXISTENCE COLUMNS
        all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

        def nested_path_of(v):
            if v == '.':
                return ('.',)
            return (v,) + nested_path_of(all_paths[v])

        query_paths = sort_using_key(
            set(step for path in self.snowflake.query_paths for step in path),
            key=lambda p: len(split_field(p))
        )
        for step in query_paths:
            if step in all_paths:
                continue
            else:
                best = '.'
                for candidate in all_paths.keys():
                    if startswith_field(step, candidate):
                        if startswith_field(candidate, best):
                            best = candidate
                all_paths[step] = best
        for p in all_paths.keys():
            if p == ".":
                nested_path = ('.',)
            else:
                nested_path = nested_path_of(p)[1:]
            jx_type = OBJECT if p == "." else NESTED
            self.namespace.meta.columns.add(Column(
                name=p,
                es_column=p,
                es_index=self.name,
                es_type=jx_type,
                jx_type=jx_type,
                cardinality=1,
                nested_path=nested_path,
                multi=1001 if jx_type is NESTED else 1,
                last_updated=Date.now()
            ))
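# A minimal construction sketch, assuming this __init__ belongs to the
# jx_elasticsearch container class (called ES here for illustration); the
# index name is hypothetical:
#
#     es = ES(
#         host="http://localhost",
#         port=9200,
#         index="unittest",
#         read_only=True,  # queries go through an Alias; no writes
#     )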
def __init__(
    self,
    hg=None,          # CONNECT TO hg
    repo=None,        # CONNECTION INFO FOR ES CACHE
    use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
    timeout=30 * SECOND,
    kwargs=None,
):
    if not _hg_branches:
        _late_imports()

    if not is_text(repo.index):
        Log.error("Expecting 'index' parameter")
    self.repo_locker = Lock()
    self.moves_locker = Lock()
    self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)
    self.settings = kwargs
    self.timeout = Duration(timeout)
    self.last_cache_miss = Date.now()

    # VERIFY CONNECTIVITY
    with Explanation("Test connect with hg"):
        http.head(self.settings.hg.url)

    set_default(repo, {
        "type": "revision",
        "schema": revision_schema,
    })
    kwargs.branches = set_default(
        {
            "index": repo.index + "-branches",
            "type": "branch",
        },
        repo,
    )
    moves = set_default(
        {
            "index": repo.index + "-moves",
        },
        repo,
    )

    self.branches = _hg_branches.get_branches(kwargs=kwargs)
    cluster = elasticsearch.Cluster(kwargs=repo)
    self.repo = cluster.get_or_create_index(kwargs=repo)
    self.moves = cluster.get_or_create_index(kwargs=moves)

    def setup_es(please_stop):
        with suppress_exception:
            self.repo.add_alias()
        with suppress_exception:
            self.moves.add_alias()
        with suppress_exception:
            self.repo.set_refresh_interval(seconds=1)
        with suppress_exception:
            self.moves.set_refresh_interval(seconds=1)

    Thread.run("setup_es", setup_es)
    Thread.run("hg daemon", self._daemon)
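# An illustrative construction, assuming the enclosing class is the hg cache
# (HgMozillaOrg here for illustration); `repo.index` is required, per the
# check above, and all values are hypothetical:
#
#     hg_cache = HgMozillaOrg(
#         hg={"url": "https://hg.mozilla.org"},
#         repo={"host": "http://localhost", "port": 9200, "index": "repo"},
#         use_cache=True,
#     )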
def setup():
    global config

    config = startup.read_settings(
        default_filename=os.environ.get('ACTIVEDATA_CONFIG'),
        defs=[{
            "name": ["--process_num", "--process"],
            "help": "Additional port offset (for multiple Flask processes)",
            "type": int,
            "dest": "process_num",
            "default": 0,
            "required": False
        }]
    )
    constants.set(config.constants)
    Log.start(config.debug)

    agg_bulk.S3_CONFIG = config.bulk.s3

    File.new_instance("activedata.pid").write(text(machine_metadata.pid))

    # PIPE REQUEST LOGS TO ES DEBUG
    if config.request_logs:
        cluster = elasticsearch.Cluster(config.request_logs)
        request_logger = cluster.get_or_create_index(config.request_logs)
        active_data.request_log_queue = request_logger.threaded_queue(max_size=2000, period=1)

    if config.dockerflow:
        def backend_check():
            http.get_json(config.elasticsearch.host + ":" + text(config.elasticsearch.port))

        dockerflow(flask_app, backend_check)
    else:
        # IF NOT USING DOCKERFLOW, THEN RESPOND WITH A SIMPLER __version__
        add_version(flask_app)

    # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
    container.config.default = {
        "type": "elasticsearch",
        "settings": config.elasticsearch.copy()
    }

    # TRIGGER FIRST INSTANCE
    if config.saved_queries:
        setattr(save_query, "query_finder", SaveQueries(config.saved_queries))

    # STARTUP QUERY STATS
    QueryStats(elasticsearch.Cluster(config.elasticsearch))

    if config.flask.port and config.args.process_num:
        config.flask.port += config.args.process_num

    # TURN ON /exit FOR WINDOWS DEBUGGING
    if config.flask.debug or config.flask.allow_exit:
        config.flask.allow_exit = None
        Log.warning("ActiveData is in debug mode")
        flask_app.add_url_rule('/exit', 'exit', _exit)

    if config.flask.ssl_context:
        if config.args.process_num:
            Log.error("can not serve ssl and multiple Flask instances at once")
        setup_flask_ssl()

    # ENSURE MAIN THREAD SHUTDOWN TRIGGERS Flask SHUTDOWN
    MAIN_THREAD.stopped.then(exit)
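# A sketch of the JSON config this setup() expects to find at
# ACTIVEDATA_CONFIG; the keys mirror the attributes read above
# (config.elasticsearch, config.flask, config.request_logs, ...), but the
# values are illustrative, not authoritative:
#
#     {
#         "elasticsearch": {"host": "http://localhost", "port": 9200},
#         "flask": {"host": "0.0.0.0", "port": 8080, "debug": false},
#         "request_logs": {"host": "http://localhost", "index": "requests"},
#         "constants": {},
#         "debug": {}
#     }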