def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({"from": "branches", "format": "list", "limit": 10000}).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            set_default(branches, {"schema": branches_schema})
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(kwargs)
        Log.error("problem getting branches", cause=e)
def append_query(self, es_query, start):
    self.start = start

    parts = self.edge.domain.partitions
    filters = []
    notty = []

    for p in parts:
        w = p.where
        filters.append(AndOp("and", [w] + notty).to_esfilter(self.schema))
        notty.append(NotOp("not", w))

    missing_filter = None
    if self.edge.allowNulls:  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": AndOp("and", notty).to_esfilter(self.schema)},
            es_query
        )

    return wrap({"aggs": {
        "_match": set_default(
            {"filters": {"filters": filters}},
            es_query
        ),
        "_missing": missing_filter
    }})
def _delayed_imports():
    global _ListContainer
    global _meta
    global _containers

    from pyLibrary.queries import meta as _meta
    from pyLibrary.queries.containers.list_usingPythonList import ListContainer as _ListContainer
    from pyLibrary.queries import containers as _containers

    _ = _ListContainer
    _ = _meta
    _ = _containers

    try:
        from pyLibrary.queries.jx_usingMySQL import MySQL
    except Exception:
        MySQL = None

    from pyLibrary.queries.jx_usingES import FromES
    from pyLibrary.queries.meta import FromESMetadata

    set_default(_containers.type2container, {
        "elasticsearch": FromES,
        "mysql": MySQL,
        "memory": None,
        "meta": FromESMetadata
    })
def _upsert_column(self, c):
    # ASSUMING THE self.meta.columns.locker IS HAD
    existing_columns = self.meta.columns.find(c.es_index, c.names["."])

    if not existing_columns:
        self.meta.columns.add(c)
        self.todo.add(c)

        if ENABLE_META_SCAN:
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
            # MARK meta.columns AS DIRTY TOO
            cols = self.meta.columns.find("meta.columns", None)
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
    else:
        canonical = existing_columns[0]
        if canonical is not c:
            set_default(c.names, canonical.names)
            for key in Column.__slots__:
                canonical[key] = c[key]
        if DEBUG:
            Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
        self.todo.add(canonical)
def _range_composer(edge, domain, es_query, to_float, schema): # USE RANGES _min = coalesce(domain.min, MIN(domain.partitions.min)) _max = coalesce(domain.max, MAX(domain.partitions.max)) if edge.allowNulls: missing_filter = set_default( { "filter": NotOp("not", AndOp("and", [ edge.value.exists(), InequalityOp("gte", [edge.value, Literal(None, to_float(_min))]), InequalityOp("lt", [edge.value, Literal(None, to_float(_max))]) ]).partial_eval()).to_esfilter(schema) }, es_query ) else: missing_filter = None if isinstance(edge.value, Variable): calc = {"field": schema.leaves(edge.value.var)[0].es_column} else: calc = {"script": edge.value.to_es_script(schema).script(schema)} return wrap({"aggs": { "_match": set_default( {"range": calc}, {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}}, es_query ), "_missing": missing_filter }})
def es_setop(es, query):
    es_query, filters = es14.util.es_query_template(query.frum.name)
    set_default(filters[0], simplify_esfilter(query.where.to_esfilter()))
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = FlatList()

    return extract_rows(es, es_query, query)
def __init__(self, **desc):
    desc = wrap(desc)
    self._set_slots_to_null(self.__class__)
    set_default(self, desc)
    self.name = coalesce(desc.name, desc.type)
    self.isFacet = coalesce(desc.isFacet, False)
    self.dimension = Null
    self.limit = desc.limit
def _convert_clause(self, clause):
    """
    JSON QUERY EXPRESSIONS HAVE MANY CLAUSES WITH SIMILAR COLUMN DECLARATIONS
    """
    if clause == None:
        return None
    elif isinstance(clause, Mapping):
        return set_default({"value": self.convert(clause["value"])}, clause)
    else:
        return [set_default({"value": self.convert(c.value)}, c) for c in clause]
def append_query(self, es_query, start):
    self.start = start

    if not isinstance(self.edge.value, Variable):
        if self.exists is TRUE:
            # IF True THEN WE DO NOT NEED THE _filter OR THE _missing (THIS RARELY HAPPENS THOUGH)
            output = wrap({"aggs": {
                "_match": set_default(
                    {"terms": {
                        "script": self.script.expr,
                        "size": self.domain.limit,
                        "order": self.es_order
                    }},
                    es_query
                )
            }})
        else:
            output = wrap({"aggs": {
                "_match": {  # _match AND _filter REVERSED SO _match LINES UP WITH _missing
                    "filter": self.exists.to_esfilter(self.schema),
                    "aggs": {
                        "_filter": set_default(
                            {"terms": {
                                "script": self.script.expr,
                                "size": self.domain.limit,
                                "order": self.es_order
                            }},
                            es_query
                        )
                    }
                },
                "_missing": set_default(
                    {"filter": self.missing.to_esfilter(self.schema)},
                    es_query
                )
            }})
        return output
    else:
        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "field": self.schema.leaves(self.edge.value.var)[0].es_column,
                    "size": self.domain.limit,
                    "order": self.es_order
                }},
                es_query
            ),
            "_missing": set_default(
                {"filter": self.missing.to_esfilter(self.schema)},
                es_query
            )
        }})
        return output
def _convert_clause(self, clause):
    """
    JSON QUERY EXPRESSIONS HAVE MANY CLAUSES WITH SIMILAR COLUMN DECLARATIONS
    """
    clause = wrap(clause)

    if clause == None:
        return None
    elif is_data(clause):
        return set_default({"value": self.convert(clause.value)}, clause)
    else:
        return [set_default({"value": self.convert(c.value)}, c) for c in clause]
def append_query(self, es_query, start):
    self.start = start
    domain = self.domain
    domain_key = domain.key
    include, text_include = transpose(*(
        (
            float(v) if isinstance(v, (int, float)) else v,
            text_type(float(v)) if isinstance(v, (int, float)) else v
        )
        for v in (p[domain_key] for p in domain.partitions)
    ))
    value = self.edge.value
    exists = AndOp("and", [
        value.exists(),
        InOp("in", [value, Literal("literal", include)])
    ]).partial_eval()

    limit = coalesce(self.limit, len(domain.partitions))

    if isinstance(value, Variable):
        es_field = self.query.frum.schema.leaves(value.var)[0].es_column  # ALREADY CHECKED THERE IS ONLY ONE
        terms = set_default({"terms": {
            "field": es_field,
            "size": limit,
            "order": {"_term": self.sorted} if self.sorted else None
        }}, es_query)
    else:
        terms = set_default({"terms": {
            "script": {
                "lang": "painless",
                "inline": value.to_es_script(self.schema).script(self.schema)
            },
            "size": limit
        }}, es_query)

    if self.edge.allowNulls:
        missing = set_default(
            {"filter": NotOp("not", exists).to_esfilter(self.schema)},
            es_query
        )
    else:
        missing = None

    return wrap({"aggs": {
        "_match": {
            "filter": exists.to_esfilter(self.schema),
            "aggs": {
                "_filter": terms
            }
        },
        "_missing": missing
    }})
def append_query(self, es_query, start): self.start = start for i, v in enumerate(self.fields): nest = wrap({"aggs": { "_match": set_default({"terms": { "field": v, "size": self.domain.limit }}, es_query), "_missing": set_default( {"filter": {"missing": {"field": v}}}, es_query ) }}) es_query = nest return es_query
def __init__(self, name, params, cwd=None, env=None, debug=False, shell=False, bufsize=-1):
    self.name = name
    self.service_stopped = Signal("stopped signal for " + strings.quote(name))
    self.stdin = Queue("stdin for process " + strings.quote(name), silent=True)
    self.stdout = Queue("stdout for process " + strings.quote(name), silent=True)
    self.stderr = Queue("stderr for process " + strings.quote(name), silent=True)

    try:
        self.debug = debug or DEBUG
        self.service = service = subprocess.Popen(
            params,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=bufsize,
            cwd=cwd if isinstance(cwd, (basestring, NullType, NoneType)) else cwd.abspath,
            env=unwrap(set_default(env, os.environ)),
            shell=shell
        )

        self.please_stop = Signal()
        self.please_stop.on_go(self._kill)
        self.thread_locker = Lock()
        self.children = [
            Thread.run(self.name + " stdin", self._writer, service.stdin, self.stdin, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stdout", self._reader, "stdout", service.stdout, self.stdout, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " stderr", self._reader, "stderr", service.stderr, self.stderr, please_stop=self.service_stopped, parent_thread=self),
            Thread.run(self.name + " waiter", self._monitor, parent_thread=self),
        ]
    except Exception as e:
        Log.error("Can not call", e)

    if self.debug:
        Log.note("{{process}} START: {{command}}", process=self.name, command=" ".join(map(strings.quote, params)))
def alarm(
    cls,
    template,
    default_params={},
    stack_depth=0,
    log_context=None,
    **more_params
):
    """
    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    # USE replace() AS POOR MAN'S CHILD TEMPLATE
    template = ("*" * 80) + "\n" + indent(template, prefix="** ").strip() + "\n" + ("*" * 80)
    Log.note(
        template,
        default_params=default_params,
        stack_depth=stack_depth + 1,
        log_context=set_default({"context": exceptions.ALARM}, log_context),
        **more_params
    )
def send(self, topic, message):
    """Publishes a pulse message to the proper exchange."""
    if not message:
        Log.error("Expecting a message")

    message._prepare()

    if not self.connection:
        self.connect()

    producer = Producer(
        channel=self.connection,
        exchange=Exchange(self.settings.exchange, type='topic'),
        routing_key=topic
    )

    # The message is actually a simple envelope format with a payload and
    # some metadata.
    final_data = Data(
        payload=message.data,
        _meta=set_default({
            'exchange': self.settings.exchange,
            'routing_key': message.routing_key,
            'serializer': self.settings.serializer,
            'sent': time_to_string(datetime.datetime.now(timezone(self.settings.broker_timezone))),
            'count': self.count
        }, message.metadata)
    )

    producer.publish(jsons.scrub(final_data), serializer=self.settings.serializer)
    self.count += 1
def unexpected(
    cls,
    template,
    default_params={},
    cause=None,
    stack_depth=0,
    log_context=None,
    **more_params
):
    """
    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if isinstance(default_params, BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)

    if cause and not isinstance(cause, Except):
        cause = Except(exceptions.UNEXPECTED, text_type(cause), trace=exceptions._extract_traceback(0))

    trace = exceptions.extract_stack(1)
    e = Except(type=exceptions.UNEXPECTED, template=template, params=params, cause=cause, trace=trace)
    Log.note(
        "{{error}}",
        error=e,
        log_context=set_default({"context": exceptions.WARNING}, log_context),
        stack_depth=stack_depth + 1
    )
def new_instance(settings):
    """
    MAKE A PYTHON INSTANCE

    `settings` HAS ALL THE `kwargs`, PLUS `class` ATTRIBUTE TO INDICATE THE CLASS TO CREATE
    """
    settings = set_default({}, settings)

    if not settings["class"]:
        Log.error("Expecting 'class' attribute with fully qualified class name")

    # IMPORT MODULE FOR HANDLER
    path = settings["class"].split(".")
    class_name = path[-1]
    path = ".".join(path[:-1])
    constructor = None
    try:
        temp = __import__(path, globals(), locals(), [class_name], 0)
        constructor = object.__getattribute__(temp, class_name)
    except Exception as e:
        Log.error("Can not find class {{class}}", {"class": path}, cause=e)

    settings['class'] = None
    try:
        return constructor(kwargs=settings)  # MAYBE IT TAKES A KWARGS OBJECT
    except Exception as e:
        pass
    try:
        return constructor(**settings)
    except Exception as e:
        Log.error("Can not create instance of {{name}}", name=".".join(path), cause=e)
def map_to_es(self):
    """
    RETURN A MAP FROM THE NAMESPACE TO THE es_column NAME
    """
    output = {}
    for path in self.query_path:
        set_default(
            output,
            {
                k: c.es_column
                for c in self.snowflake.columns
                if c.jx_type not in STRUCT
                for rel_name in [c.names[path]]
                for k in [rel_name, untype_path(rel_name), unnest_path(rel_name)]
            }
        )
    return output
def query(self, _query):
    return self.meta.columns.query(QueryOp(set_default(
        {
            "from": self.meta.columns,
            "sort": ["table", "name"]
        },
        _query.__data__()
    )))
def es_query_proto(path, selects, wheres, schema):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE
    :param path: THE NESTED PATH (NOT INCLUDING TABLE NAME)
    :param wheres: MAP FROM path TO LIST OF WHERE CONDITIONS
    :return: (es_query, filters_map) TUPLE
    """
    output = None
    last_where = MATCH_ALL
    for p in reversed(sorted(wheres.keys() | set(selects.keys()))):
        where = wheres.get(p)
        select = selects.get(p)

        if where:
            where = AndOp(where).partial_eval().to_esfilter(schema)
            if output:
                where = es_or([es_and([output, where]), where])
        else:
            if output:
                if last_where is MATCH_ALL:
                    where = es_or([output, MATCH_ALL])
                else:
                    where = output
            else:
                where = MATCH_ALL

        if p == ".":
            output = set_default(
                {
                    "from": 0,
                    "size": 0,
                    "sort": [],
                    "query": where
                },
                select.to_es()
            )
        else:
            output = {"nested": {
                "path": p,
                "inner_hits": set_default({"size": 100000}, select.to_es()) if select else None,
                "query": where
            }}
        last_where = where
    return output
def _get_branches_from_hg(kwarg):
    # GET MAIN PAGE
    response = http.get(kwarg.url)
    doc = BeautifulSoup(response.all_content, "html.parser")

    all_repos = doc("table")[1]
    branches = UniqueIndex(["name", "locale"], fail_on_dup=False)
    for i, r in enumerate(all_repos("tr")):
        dir, name = [v.text.strip() for v in r("td")]
        b = _get_single_branch_from_hg(kwarg, name, dir.lstrip("/"))
        branches.extend(b)

    # branches.add(set_default({"name": "release-mozilla-beta"}, branches["mozilla-beta", DEFAULT_LOCALE]))
    for b in list(branches["mozilla-beta", ]):
        branches.add(set_default({"name": "release-mozilla-beta"}, b))  # THIS IS THE l10n "name"
        b.url = "https://hg.mozilla.org/releases/mozilla-beta"  # THIS IS THE

    for b in list(branches["mozilla-release", ]):
        branches.add(set_default({"name": "release-mozilla-release"}, b))

    for b in list(branches["mozilla-aurora", ]):
        if b.locale == "en-US":
            continue
        branches.add(set_default({"name": "comm-aurora"}, b))
        # b.url = "https://hg.mozilla.org/releases/mozilla-aurora"

    for b in list(branches):
        if b.name.startswith("mozilla-esr"):
            branches.add(set_default({"name": "release-" + b.name}, b))  # THIS IS THE l10n "name"
            b.url = "https://hg.mozilla.org/releases/" + b.name

    # CHECKS
    for b in branches:
        if b.name != b.name.lower():
            Log.error("Expecting lowercase name")
        if not b.locale:
            Log.error("Not expected")
        if not b.url.startswith("http"):
            Log.error("Expecting a valid url")
        if not b.etl.timestamp:
            Log.error("Expecting a timestamp")

    return branches
def _range_composer(edge, domain, es_query, to_float, schema): # USE RANGES _min = coalesce(domain.min, MIN(domain.partitions.min)) _max = coalesce(domain.max, MAX(domain.partitions.max)) if edge.allowNulls: missing_filter = set_default( { "filter": NotOp( "not", AndOp("and", [ edge.value.exists(), InequalityOp( "gte", [edge.value, Literal(None, to_float(_min))]), InequalityOp( "lt", [edge.value, Literal(None, to_float(_max))]) ]).partial_eval()).to_esfilter(schema) }, es_query) else: missing_filter = None if isinstance(edge.value, Variable): calc = {"field": schema.leaves(edge.value.var)[0].es_column} else: calc = {"script": edge.value.to_painless(schema).script(schema)} return wrap({ "aggs": { "_match": set_default({"range": calc}, { "range": { "ranges": [{ "from": to_float(p.min), "to": to_float(p.max) } for p in domain.partitions] } }, es_query), "_missing": missing_filter } })
def __init__(
    self,
    hg=None,          # CONNECT TO hg
    repo=None,        # CONNECTION INFO FOR ES CACHE
    branches=None,    # CONNECTION INFO FOR ES CACHE
    use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
    timeout=30 * SECOND,
    kwargs=None
):
    if not _hg_branches:
        _late_imports()

    self.es_locker = Lock()
    self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)

    self.settings = kwargs
    self.timeout = Duration(timeout)

    # VERIFY CONNECTIVITY
    with Explanation("Test connect with hg"):
        response = http.head(self.settings.hg.url)

    if branches == None:
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.es = None
        return

    self.last_cache_miss = Date.now()

    set_default(repo, {"schema": revision_schema})
    self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(kwargs=repo)

    def setup_es(please_stop):
        with suppress_exception:
            self.es.add_alias()

        with suppress_exception:
            self.es.set_refresh_interval(seconds=1)

    Thread.run("setup_es", setup_es)
    self.branches = _hg_branches.get_branches(kwargs=kwargs)
    self.timeout = timeout

    Thread.run("hg daemon", self._daemon)
def _replace_ref(node, url): if url.path.endswith("/"): url.path = url.path[:-1] if is_data(node): ref = None output = {} for k, v in node.items(): if k == "$ref": ref = URL(v) else: output[k] = _replace_ref(v, url) if not ref: return output node = output if not ref.scheme and not ref.path: # DO NOT TOUCH LOCAL REF YET output["$ref"] = ref return output if not ref.scheme: # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH # REQUIRES THE CURRENT DOCUMENT'S SCHEME ref.scheme = url.scheme # FIND THE SCHEME AND LOAD IT if ref.scheme in scheme_loaders: new_value = scheme_loaders[ref.scheme](ref, url) else: raise Log.error("unknown protocol {{scheme}}", scheme=ref.scheme) if ref.fragment: new_value = mo_dots.get_attr(new_value, ref.fragment) DEBUG and Log.note("Replace {{ref}} with {{new_value}}", ref=ref, new_value=new_value) if not output: output = new_value elif is_text(output): Log.error("Can not handle set_default({{output}},{{new_value}})", output=output, new_value=new_value) else: output = unwrap(set_default(output, new_value)) DEBUG and Log.note("Return {{output}}", output=output) return output elif is_list(node): output = [_replace_ref(n, url) for n in node] # if all(p[0] is p[1] for p in zip(output, node)): # return node return output return node
def _replace_ref(node, url): if url.path.endswith("/"): url.path = url.path[:-1] if isinstance(node, Mapping): ref = None output = {} for k, v in node.items(): if k == "$ref": ref = URL(v) else: output[k] = _replace_ref(v, url) if not ref: return output node = output if not ref.scheme and not ref.path: # DO NOT TOUCH LOCAL REF YET output["$ref"] = ref return output if not ref.scheme: # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH # REQUIRES THE CURRENT DOCUMENT'S SCHEME ref.scheme = url.scheme # FIND THE SCHEME AND LOAD IT if ref.scheme in scheme_loaders: new_value = scheme_loaders[ref.scheme](ref, url) else: raise Log.error("unknown protocol {{scheme}}", scheme=ref.scheme) if ref.fragment: new_value = mo_dots.get_attr(new_value, ref.fragment) DEBUG and Log.note("Replace {{ref}} with {{new_value}}", ref=ref, new_value=new_value) if not output: output = new_value elif isinstance(output, text_type): Log.error("Can not handle set_default({{output}},{{new_value}})", output=output, new_value=new_value) else: output = unwrap(set_default(output, new_value)) DEBUG and Log.note("Return {{output}}", output=output) return output elif isinstance(node, list): output = [_replace_ref(n, url) for n in node] # if all(p[0] is p[1] for p in zip(output, node)): # return node return output return node
def start():
    try:
        config = json2value(STDIN.readline().decode('utf8'))
        constants.set(config.constants)
        Log.start(set_default(config.debug, {"logs": [{"type": "raw"}]}))
        command_loop({"config": config})
    except Exception as e:
        Log.error("problem starting worker", cause=e)
    finally:
        Log.stop()
def _status_update(self, mail, status, more=Null):
    """
    UPDATE MESSAGE ID STATUS FOR OTHER PROCESSES TO KNOW
    :param mail:
    :param status:
    :return:
    """
    mail.status = status
    if mail.sender.track_started:
        self.response_queue.add(value2json(set_default(
            {"request": {"id": mail.request.id}, "status": status},
            more
        )))
def es_query_proto(path, selects, wheres, schema):
    """
    RETURN TEMPLATE AND PATH-TO-FILTER AS A 2-TUPLE
    :param path: THE NESTED PATH (NOT INCLUDING TABLE NAME)
    :param wheres: MAP FROM path TO LIST OF WHERE CONDITIONS
    :return: (es_query, filters_map) TUPLE
    """
    output = None
    last_where = MATCH_ALL
    for p in reversed(sorted(set(wheres.keys()) | set(selects.keys()))):
        where = wheres.get(p)
        select = selects.get(p)

        if where:
            where = AndOp(where).partial_eval().to_esfilter(schema)
            if output:
                where = es_or([es_and([output, where]), where])
        else:
            if output:
                if last_where is MATCH_ALL:
                    where = es_or([output, MATCH_ALL])
                else:
                    where = output
            else:
                where = MATCH_ALL

        if p == ".":
            output = set_default(
                {"from": 0, "size": 0, "sort": [], "query": where},
                select.to_es()
            )
        else:
            output = {
                "nested": {
                    "path": p,
                    "inner_hits": set_default({"size": 100000}, select.to_es()) if select else None,
                    "query": where,
                }
            }
        last_where = where
    return output
def __init__(
    self,
    hg=None,          # CONNECT TO hg
    repo=None,        # CONNECTION INFO FOR ES CACHE
    branches=None,    # CONNECTION INFO FOR ES CACHE
    use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
    timeout=30 * SECOND,
    kwargs=None
):
    if not _hg_branches:
        _late_imports()

    self.es_locker = Lock()
    self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)

    self.settings = kwargs
    self.timeout = Duration(timeout)

    # VERIFY CONNECTIVITY
    with Explanation("Test connect with hg"):
        response = http.head(self.settings.hg.url)

    if branches == None:
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.es = None
        return

    self.last_cache_miss = Date.now()

    set_default(repo, {"schema": revision_schema})
    self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(kwargs=repo)

    def setup_es(please_stop):
        with suppress_exception:
            self.es.add_alias()

        with suppress_exception:
            self.es.set_refresh_interval(seconds=1)

    Thread.run("setup_es", setup_es)
    self.branches = _hg_branches.get_branches(kwargs=kwargs)
    self.timeout = timeout

    Thread.run("hg daemon", self._daemon)
def __init__(
    self,
    host,
    user=None,
    port=None,
    config=None,
    gateway=None,
    forward_agent=None,
    connect_timeout=None,
    connect_kwargs=None,
    inline_ssh_env=None,
    key_filename=None,  # part of connect_kwargs
    kwargs=None,
):
    connect_kwargs = set_default(
        {}, connect_kwargs, {"key_filename": File(key_filename).abspath}
    )

    self.stdout = LogStream(host, "stdout")
    self.stderr = LogStream(host, "stderr")
    config = Config(**unwrap(set_default(
        {},
        config,
        {"overrides": {"run": {
            # "hide": True,
            "out_stream": self.stdout,
            "err_stream": self.stderr,
        }}},
    )))

    self.warn = False
    self.conn = _Connection(
        host,
        user,
        port,
        config,
        gateway,
        forward_agent,
        connect_timeout,
        connect_kwargs,
        inline_ssh_env,
    )
def test_set_default(self):
    a = {"x": {"y": 1}}
    b = {"x": {"z": 2}}
    c = {}
    d = set_default(c, a, b)

    self.assertTrue(from_data(d) is c, "expecting first parameter to be returned")
    self.assertEqual(d.x.y, 1, "expecting d to have attributes of a")
    self.assertEqual(d.x.z, 2, "expecting d to have attributes of b")
    self.assertEqual(to_data(a).x.z, None, "a should not have been altered")
def append_query(self, es_query, start): self.start = start es_field = self.query.frum.schema.leaves(self.var)[0].es_column es_query = wrap({"aggs": { "_match": set_default({"terms": { "script": expand_template(LIST_TO_PIPE, {"expr": 'doc[' + quote(es_field) + '].values'}) }}, es_query) }}) return es_query
def __init__(self, type=ERROR, template=Null, params=Null, cause=Null, trace=Null, **kwargs):
    Exception.__init__(self)
    self.type = type
    self.template = template
    self.params = set_default(kwargs, params)
    self.cause = cause

    if not trace:
        self.trace = extract_stack(2)
    else:
        self.trace = trace
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
    kwargs.host = Random.sample(listwrap(host), 1)[0]

    rollover_interval = coalesce(kwargs.rollover.interval, kwargs.rollover.max, "year")
    rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval, "year")

    schema = set_default(
        kwargs.schema,
        {"mappings": {kwargs.type: {"properties": {"~N~": {"type": "nested"}}}}},
        json2value(value2json(SCHEMA), leaves=True)
    )

    self.es = RolloverIndex(
        rollover_field={"get": [{"first": "."}, {"literal": "timestamp"}]},
        rollover_interval=rollover_interval,
        rollover_max=rollover_max,
        schema=schema,
        limit_replicas=True,
        typed=True,
        read_only=False,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)

    self.worker = Thread.run("add debug logs to es", self._insert_loop)
def append_query(self, es_query, start): #TODO: USE "reverse_nested" QUERY TO PULL THESE self.start = start for i, v in enumerate(self.fields): nest = wrap({"aggs": { "_match": set_default({"terms": { "field": v, "size": self.domain.limit }}, es_query) }}) if self.edge.allowNulls: nest.aggs._missing = set_default({"missing": {"field": v}}, es_query) # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER es_query = nest if self.domain.where: filter = simplify_esfilter(self.domain.where) es_query = {"aggs": {"_filter": set_default({"filter": filter}, es_query)}} return es_query
def __init__(self, type=ERROR, template=Null, params=Null, cause=Null, trace=Null, **kwargs):
    Exception.__init__(self)
    self.type = type
    self.template = template
    self.params = set_default(kwargs, params)
    self.cause = Except.wrap(cause)

    if not trace:
        self.trace = extract_stack(2)
    else:
        self.trace = trace
def get_schema(self, retry=True):
    if self.settings.explore_metadata:
        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            # PARTIALLY DEFINED settings
            candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
            # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE

            index = "dummy value"
            schema = wrap({"_routing": {}, "properties": {}})
            for _, ind in jx.sort(candidates, {"value": 0, "sort": -1}):
                mapping = ind.mappings[self.settings.type]
                set_default(schema._routing, mapping._routing)
                schema.properties = _merge_mapping(schema.properties, mapping.properties)
        else:
            # FULLY DEFINED settings
            index = indices[self.settings.index]
            schema = index.mappings[self.settings.type]

        if index == None and retry:
            # TRY AGAIN, JUST IN CASE
            self.cluster.cluster_state = None
            return self.get_schema(retry=False)

        # TODO: REMOVE THIS BUG CORRECTION
        if not schema and self.settings.type == "test_result":
            schema = index.mappings["test_results"]
        # DONE BUG CORRECTION

        if not schema:
            Log.error(
                "ElasticSearch index ({{index}}) does not have type ({{type}})",
                index=self.settings.index,
                type=self.settings.type
            )
        return schema
    else:
        mapping = self.cluster.get(self.path + "/_mapping")
        if not mapping[self.settings.type]:
            Log.error("{{index}} does not have type {{type}}", self.settings)
        return wrap({"mappings": mapping[self.settings.type]})
def append_query(self, es_query, start):
    self.start = start

    es_field = self.query.frum.schema.leaves(self.var)[0].es_column

    for i, v in enumerate(self.values):
        es_query = wrap({"aggs": {
            "_match": set_default({"terms": {
                "script": 'doc[' + quote(es_field) + '].values.contains(' + value2json(v) + ') ? 1 : 0'
            }}, es_query)
        }})

    return es_query
def append_query(self, es_query, start): self.start = start for i, v in enumerate(self.fields): nest = wrap({ "aggs": { "_match": set_default( {"terms": { "field": v, "size": self.domain.limit }}, es_query), "_missing": set_default({"filter": { "missing": { "field": v } }}, es_query) } }) es_query = nest return es_query
def find_container(frum, schema=None):
    """
    :param frum:
    :param schema:
    :return:
    """
    if not _meta:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, text_type):
        if not container.config.default.settings:
            Log.error(
                "expecting jx_base.container.config.default.settings to contain default elasticsearch connection info"
            )

        type_ = None
        if frum.startswith("meta."):
            if frum == "meta.columns":
                return _meta.singlton.meta.columns.denormalized()
            elif frum == "meta.tables":
                return _meta.singlton.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)

        type_ = container.config.default.type
        fact_table_name = split_field(frum)[0]

        settings = set_default(
            {
                "index": fact_table_name,
                "name": frum,
                "exists": True,
            },
            container.config.default.settings
        )
        settings.type = None
        return container.type2container[type_](settings)
    elif isinstance(frum, Mapping) and frum.type and container.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return container.type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from jx_base.query import QueryOp

        return QueryOp.wrap(frum, namespace=schema)
    elif isinstance(frum, (list, set)):
        return _ListContainer("test_list", frum)
    else:
        return frum
def append_query(self, es_query, start):
    self.start = start

    domain = self.domain
    field = self.edge.value

    if isinstance(field, Variable):
        key = domain.key
        if isinstance(key, (tuple, list)) and len(key) == 1:
            key = key[0]
        include = [p[key] for p in domain.partitions]

        if self.edge.allowNulls:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "field": field.var,
                    "size": self.limit,
                    "include": include
                }}, es_query),
                "_missing": set_default(
                    {"filter": {"or": [
                        field.missing().to_esfilter(),
                        {"not": {"terms": {field.var: include}}}
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "field": field.var,
                    "size": self.limit,
                    "include": include
                }}, es_query)
            }})
    else:
        include = [p[domain.key] for p in domain.partitions]

        if self.edge.allowNulls:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "script_field": field.to_ruby(),
                    "size": self.limit,
                    "include": include
                }}, es_query),
                "_missing": set_default(
                    {"filter": {"or": [
                        field.missing().to_esfilter(),
                        NotOp("not", InOp("in", [field, Literal("literal", include)])).to_esfilter()
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "script_field": field.to_ruby(),
                    "size": self.limit,
                    "include": include
                }}, es_query)
            }})
def append_query(self, es_query, start):
    self.start = start

    value = self.edge.value.partial_eval()
    script = value.to_ruby(self.schema)
    exists = NotOp("not", script.miss).partial_eval()

    if not isinstance(self.edge.value, Variable):
        output = wrap({"aggs": {
            "_match": {
                "filter": exists.to_esfilter(self.schema),
                "aggs": {
                    "_filter": set_default(
                        {"terms": {
                            "script": script.expr,
                            "size": self.domain.limit,
                            "order": {"_term": self.sorted} if self.sorted else None
                        }},
                        es_query
                    )
                }
            },
            "_missing": set_default(
                {"filter": NotOp("not", exists).to_esfilter(self.schema)},
                es_query
            )
        }})
        return output
    elif self.edge.value.var in [s.value.var for s in self.query.sort]:
        sort_dir = [s.sort for s in self.query.sort if s.value.var == self.edge.value.var][0]
        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "field": self.schema.leaves(self.edge.value.var)[0].es_column,
                    "size": self.domain.limit,
                    "order": {"_term": "asc" if sort_dir == 1 else "desc"}
                }},
                es_query
            ),
            "_missing": set_default(
                {"filter": NotOp("not", exists).to_esfilter(self.schema)},
                es_query
            )
        }})
        return output
    else:
        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "field": self.schema.leaves(self.edge.value.var)[0].es_column,
                    "size": self.domain.limit
                }},
                es_query
            ),
            "_missing": set_default(
                {"filter": NotOp("not", exists).to_esfilter(self.schema)},
                es_query
            )
        }})
        return output
def _replace_locals(node, doc_path):
    if is_data(node):
        # RECURS, DEEP COPY
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = v
            elif k == "$concat":
                if not is_sequence(v):
                    Log.error("$concat expects an array of strings")
                return coalesce(node.get("separator"), "").join(v)
            elif v == None:
                continue
            else:
                output[k] = _replace_locals(v, [v] + doc_path)

        if not ref:
            return output

        # REFER TO SELF
        frag = ref.fragment
        if frag[0] == ".":
            # RELATIVE
            for i, p in enumerate(frag):
                if p != ".":
                    if i > len(doc_path):
                        Log.error("{{frag|quote}} reaches up past the root document", frag=frag)
                    new_value = get_attr(doc_path[i - 1], frag[i::])
                    break
            else:
                new_value = doc_path[len(frag) - 1]
        else:
            # ABSOLUTE
            new_value = get_attr(doc_path[-1], frag)

        new_value = _replace_locals(new_value, [new_value] + doc_path)

        if not output:
            return new_value  # OPTIMIZATION FOR CASE WHEN node IS {}
        else:
            return unwrap(set_default(output, new_value))

    elif is_list(node):
        candidate = [_replace_locals(n, [n] + doc_path) for n in node]
        # if all(p[0] is p[1] for p in zip(candidate, node)):
        #     return node
        return candidate

    return node
def append_query(self, es_query, start): # TODO: USE "reverse_nested" QUERY TO PULL THESE self.start = start for i, v in enumerate(self.fields): exists = v.exists().partial_eval() nest = wrap({ "aggs": { "_match": { "filter": exists.to_es14_filter(self.schema), "aggs": { "_filter": set_default( { "terms": { "field": first( self.schema.leaves( v.var)).es_column, "size": self.domain.limit } }, es_query) } } } }) nest.aggs._missing = set_default( {"filter": NotOp("not", exists).to_es14_filter(self.schema)}, es_query) es_query = nest if self.domain.where: filter_ = self.domain.where.partial_eval().to_es14_filter( self.schema) es_query = { "aggs": { "_filter": set_default({"filter": filter_}, es_query) } } return es_query
def _replace_locals(node, doc_path):
    if isinstance(node, Mapping):
        # RECURS, DEEP COPY
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = v
            elif v == None:
                continue
            else:
                output[k] = _replace_locals(v, [v] + doc_path)

        if not ref:
            return output

        # REFER TO SELF
        frag = ref.fragment
        if frag[0] == ".":
            # RELATIVE
            for i, p in enumerate(frag):
                if p != ".":
                    if i > len(doc_path):
                        Log.error("{{frag|quote}} reaches up past the root document", frag=frag)
                    new_value = mo_dots.get_attr(doc_path[i - 1], frag[i::])
                    break
            else:
                new_value = doc_path[len(frag) - 1]
        else:
            # ABSOLUTE
            new_value = mo_dots.get_attr(doc_path[-1], frag)

        if new_value in doc_path:
            Log.error("encountered referential loop {{path|json}}", path=[new_value] + doc_path)

        new_value = _replace_locals(new_value, [new_value] + doc_path)

        if not output:
            return new_value  # OPTIMIZATION FOR CASE WHEN node IS {}
        else:
            return unwrap(set_default(output, new_value))

    elif isinstance(node, list):
        candidate = [_replace_locals(n, [n] + doc_path) for n in node]
        # if all(p[0] is p[1] for p in zip(candidate, node)):
        #     return node
        return candidate

    return node
def find_container(frum):
    """
    :param frum:
    :return:
    """
    global namespace
    if not namespace:
        if not container.config.default.settings:
            Log.error(
                "expecting jx_base.container.config.default.settings to contain default elasticsearch connection info"
            )
        namespace = ElasticsearchMetadata(container.config.default.settings)

    if is_text(frum):
        if frum in container_cache:
            return container_cache[frum]

        path = split_field(frum)
        if path[0] == "meta":
            if path[1] == "columns":
                return namespace.meta.columns.denormalized()
            elif path[1] == "tables":
                return namespace.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)

        type_ = container.config.default.type
        fact_table_name = path[0]

        settings = set_default(
            {
                "index": fact_table_name,
                "name": frum,
                "exists": True,
            },
            container.config.default.settings
        )
        settings.type = None
        output = container.type2container[type_](settings)
        container_cache[frum] = output
        return output
    elif is_data(frum) and frum.type and container.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return container.type2container[frum.type](frum.settings)
    elif is_data(frum) and (frum["from"] or is_container(frum["from"])):
        from jx_base.query import QueryOp

        return QueryOp.wrap(frum)
    elif is_container(frum):
        return ListContainer("test_list", frum)
    else:
        return frum
def append_query(self, es_query, start): # TODO: USE "reverse_nested" QUERY TO PULL THESE self.start = start for i, v in enumerate(self.fields): exists = v.exists().partial_eval() nest = wrap({"aggs": {"_match": { "filter": exists.to_esfilter(self.schema), "aggs": {"_filter": set_default({"terms": { "field": self.schema.leaves(v.var)[0].es_column, "size": self.domain.limit }}, es_query)} }}}) nest.aggs._missing = set_default( {"filter": NotOp("not", exists).to_esfilter(self.schema)}, es_query ) es_query = nest if self.domain.where: filter_ = self.domain.where.partial_eval().to_esfilter(self.schema) es_query = {"aggs": {"_filter": set_default({"filter": filter_}, es_query)}} return es_query
def wrap_from(frum, schema=None):
    """
    :param frum:
    :param schema:
    :return:
    """
    if not _containers:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not _containers.config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")

        type_ = None
        index = frum
        if frum.startswith("meta."):
            if frum == "meta.columns":
                return _meta.singlton.meta.columns.denormalized()
            elif frum == "meta.tables":
                return _meta.singlton.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)
        else:
            type_ = _containers.config.default.type
            index = split_field(frum)[0]

        settings = set_default(
            {
                "index": index,
                "name": frum,
                "exists": True,
            },
            _containers.config.default.settings
        )
        settings.type = None
        return _containers.type2container[type_](settings)
    elif isinstance(frum, Mapping) and frum.type and _containers.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return _containers.type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from pyLibrary.queries.query import QueryOp

        return QueryOp.wrap(frum, schema=schema)
    elif isinstance(frum, (list, set)):
        return _ListContainer("test_list", frum)
    else:
        return frum
def wrap_from(frum, schema=None):
    """
    :param frum:
    :param schema:
    :return:
    """
    if not _containers:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not _containers.config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")

        type_ = None
        index = frum
        if frum.startswith("meta."):
            if frum == "meta.columns":
                return _meta.singlton.meta.columns
            elif frum == "meta.tables":
                return _meta.singlton.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)
        else:
            type_ = _containers.config.default.type
            index = join_field(split_field(frum)[:1:])

        settings = set_default(
            {
                "index": index,
                "name": frum,
                "exists": True,
            },
            _containers.config.default.settings
        )
        settings.type = None
        return _containers.type2container[type_](settings)
    elif isinstance(frum, Mapping) and frum.type and _containers.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return _containers.type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from pyLibrary.queries.query import QueryOp

        return QueryOp.wrap(frum, schema=schema)
    elif isinstance(frum, (list, set)):
        return _ListContainer("test_list", frum)
    else:
        return frum
def async(args, kwargs=None, *_args, **_kwargs):
    kwargs = set_default(kwargs, dict(zip(arg_names, args)))
    with self.id_lock:
        id = self.next_id
        self.next_id += 1

    mail = deepcopy(Data(
        status=states.PENDING,
        caller={
            # "stack": extract_stack(1)
        },
        sender=set_default(_kwargs, opts),
        message=kwargs,
        request=set_default({"id": id})
    ))

    output = AsyncResult(id, mail=mail, app=self)
    with self.responses_lock:
        self.responses[id] = output
    self.request_queue.add(value2json(mail))

    Log.note("Added {{id}} ({{name}}) to request queue\n{{request}}", id=id, name=opts.name, request=mail)
    return output
def note(cls, template, default_params={}, stack_depth=0, log_context=None, **more_params):
    """
    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if not isinstance(template, text_type):
        Log.error("Log.note was expecting a unicode template")

    if len(template) > 10000:
        template = template[:10000]

    params = dict(unwrap(default_params), **more_params)

    log_params = set_default(
        {
            "template": template,
            "params": params,
            "timestamp": datetime.utcnow(),
            "machine": machine_metadata
        },
        log_context,
        {"context": exceptions.NOTE}
    )

    if not template.startswith("\n") and template.find("\n") > -1:
        template = "\n" + template

    if cls.trace:
        log_template = "{{machine.name}} (pid {{machine.pid}}) - {{timestamp|datetime}} - {{thread.name}} - \"{{location.file}}:{{location.line}}\" ({{location.method}}) - " + template.replace("{{", "{{params.")
        f = sys._getframe(stack_depth + 1)
        log_params.location = {
            "line": f.f_lineno,
            "file": text_type(f.f_code.co_filename.split(os.sep)[-1]),
            "method": text_type(f.f_code.co_name)
        }
        thread = _Thread.current()
        log_params.thread = {"name": thread.name, "id": thread.id}
    else:
        log_template = "{{timestamp|datetime}} - " + template.replace("{{", "{{params.")

    cls.main_log.write(log_template, log_params)
def _convert_edge(self, edge):
    dim = self.dimensions[edge.value]
    if not dim:
        return edge

    if len(listwrap(dim.fields)) == 1:
        # TODO: CHECK IF EDGE DOMAIN AND DIMENSION DOMAIN CONFLICT
        new_edge = set_default({"value": unwraplist(dim.fields)}, edge)
        return new_edge
        new_edge.domain = dim.getDomain()

    edge = copy(edge)
    edge.value = None
    edge.domain = dim.getDomain()
    return edge
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({
            "from": "branches",
            "format": "list",
            "limit": 10000
        }).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({
                "id": b.name + " " + b.locale,
                "value": b
            } for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            set_default(branches, {"schema": branches_schema})
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(kwargs)
        Log.error("problem getting branches", cause=e)
def __init__(self, name, config):
    config = wrap(config)

    if config.debug.logs:
        Log.error("not allowed to configure logging on other process")

    self.process = Process(name, [PYTHON, "mo_threads" + os.sep + "python_worker.py"], shell=True)
    self.process.stdin.add(value2json(set_default({"debug": {"trace": True}}, config)))

    self.lock = Lock("wait for response from " + name)
    self.current_task = None
    self.current_response = None
    self.current_error = None

    self.daemon = Thread.run("", self._daemon)
    self.errors = Thread.run("", self._stderr)
def append_query(self, es_query, start): self.start = start parts = self.edge.domain.partitions filters = [] notty = [] for p in parts: filters.append(AndOp("and", [p.where] + notty).to_esfilter()) notty.append(NotOp("not", p.where)) missing_filter = None if self.edge.allowNulls: # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER missing_filter = set_default( {"filter": AndOp("and", notty).to_esfilter()}, es_query) return wrap({ "aggs": { "_match": set_default({"filters": { "filters": filters }}, es_query), "_missing": missing_filter } })