def __init__(self, **desc):
    desc = wrap(desc)
    self._set_slots_to_null(self.__class__)
    set_default(self, desc)
    self.name = coalesce(desc.name, desc.type)
    self.isFacet = coalesce(desc.isFacet, False)
    self.dimension = Null
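# A minimal, self-contained sketch of the set_default() semantics the
# constructor above relies on (an assumption based on the pyLibrary.dot
# conventions of this era, not a verbatim API doc): missing leaves of the
# first argument are filled from the defaults that follow, existing values
# win, and the first argument is returned.
from pyLibrary.dot import set_default, wrap

_example = wrap({"name": "edge1"})
set_default(_example, {"name": "ignored", "isFacet": False})
assert _example.name == "edge1"     # EXISTING VALUE WINS
assert _example.isFacet == False    # MISSING LEAF FILLED FROM DEFAULTS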
def append_query(self, es_query, start):
    self.start = start
    if not isinstance(self.edge.value, Variable):
        script_field = self.edge.value.to_ruby()
        missing = self.edge.value.missing().to_esfilter()

        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {"script_field": script_field, "size": self.domain.limit}},
                es_query
            ),
            "_missing": set_default({"filter": missing}, es_query),
        }})
        return output

    output = wrap({"aggs": {
        "_match": set_default(
            {"terms": {"field": self.edge.value.var, "size": self.domain.limit}},
            es_query
        ),
        "_missing": set_default({"missing": {"field": self.edge.value}}, es_query),
    }})
    return output
def _delayed_imports():
    global _ListContainer
    global _meta
    global _containers

    from pyLibrary.queries import meta as _meta
    from pyLibrary.queries.containers.list_usingPythonList import ListContainer as _ListContainer
    from pyLibrary.queries import containers as _containers

    _ = _ListContainer
    _ = _meta
    _ = _containers

    try:
        from pyLibrary.queries.jx_usingMySQL import MySQL
    except Exception:
        MySQL = None

    from pyLibrary.queries.jx_usingES import FromES
    from pyLibrary.queries.meta import FromESMetadata

    set_default(_containers.type2container, {
        "elasticsearch": FromES,
        "mysql": MySQL,
        "memory": None,
        "meta": FromESMetadata
    })
def _delayed_imports():
    global type2container
    global _ListContainer
    global _Cube
    global _run
    global _Query
    global _Normal

    try:
        from pyLibrary.queries.jx_usingMySQL import MySQL as _MySQL
    except Exception:
        _MySQL = None

    from pyLibrary.queries.jx_usingES import FromES as _FromES
    from pyLibrary.queries.containers.list_usingPythonList import ListContainer as _ListContainer
    from pyLibrary.queries.containers.cube import Cube as _Cube
    from pyLibrary.queries.jx import run as _run
    from pyLibrary.queries.query import QueryOp as _Query
    from pyLibrary.queries.containers.list_usingSQLite import Table_usingSQLite

    set_default(type2container, {
        "elasticsearch": _FromES,
        "mysql": _MySQL,
        "sqlite": Table_usingSQLite,
        "memory": None
    })

    _ = _run
    _ = _Query
    _ = _Normal
def append_query(self, es_query, start):
    self.start = start

    parts = self.edge.domain.partitions
    filters = []
    notty = []

    for p in parts:
        filters.append(AndOp("and", [p.where] + notty).to_esfilter())
        notty.append(NotOp("not", p.where))

    missing_filter = None
    if self.edge.allowNulls:    # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": AndOp("and", notty).to_esfilter()},
            es_query
        )

    return wrap({"aggs": {
        "_match": set_default(
            {"filters": {"filters": filters}},
            es_query
        ),
        "_missing": missing_filter
    }})
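# For intuition on the exclusion loop above, a hedged pure-Python sketch
# (hypothetical predicates, not from the source): each partition filter is
# ANDed with the negation of every earlier one, so the ES "filters" buckets
# end up mutually exclusive even when the raw partition predicates overlap.
def _disjoint(preds):
    out, notty = [], []
    for p in preds:
        out.append(lambda x, p=p, seen=list(notty): p(x) and not any(n(x) for n in seen))
        notty.append(p)
    return out

_fs = _disjoint([lambda x: x < 10, lambda x: x < 20])
assert [f(5) for f in _fs] == [True, False]   # 5 LANDS ONLY IN THE FIRST BUCKET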
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if isinstance(edge.value, Variable):
        calc = {"field": edge.value.var}
    else:
        calc = {"script_field": edge.value.to_ruby()}

    if edge.allowNulls:    # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": {"or": [
                OrOp("or", [
                    BinaryOp("lt", [edge.value, Literal(None, to_float(_min))]),
                    BinaryOp("gte", [edge.value, Literal(None, to_float(_max))]),
                ]).to_esfilter(),
                edge.value.missing().to_esfilter()
            ]}},
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
def query(self, _query):
    if not self.columns:
        self.columns = []
        alias_done = set()
        metadata = self._es.get_metadata()
        for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
            for _, properties in meta.mappings.items():
                columns = _parse_properties(index, properties.properties)
                for c in columns:
                    c.cube = index
                    c.property = c.name
                    c.name = None
                    c.useSource = None

                self.columns.extend(columns)
                for a in meta.aliases:
                    # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                    if a in alias_done:
                        continue
                    alias_done.add(a)
                    for c in columns:
                        self.columns.append(set_default({"cube": a}, c))  # ENSURE WE COPY

    return qb.run(set_default(
        {
            "from": self.columns,
            "sort": ["cube", "property"]
        },
        _query.as_dict()
    ))
def _get_branches_from_hg(settings):
    # GET MAIN PAGE
    response = http.get(settings.url)
    doc = BeautifulSoup(response.all_content)

    all_repos = doc("table")[1]
    branches = UniqueIndex(["name", "locale"], fail_on_dup=False)
    for i, r in enumerate(all_repos("tr")):
        dir, name = [v.text.strip() for v in r("td")]
        b = _get_single_branch_from_hg(settings, name, dir.lstrip("/"))
        branches.extend(b)

    # branches.add(set_default({"name": "release-mozilla-beta"}, branches["mozilla-beta", DEFAULT_LOCALE]))
    for b in list(branches["mozilla-beta", ]):
        branches.add(set_default({"name": "release-mozilla-beta"}, b))  # THIS IS THE l10n "name"
        b.url = "https://hg.mozilla.org/releases/mozilla-beta"

    for b in list(branches["mozilla-release", ]):
        branches.add(set_default({"name": "release-mozilla-release"}, b))

    for b in list(branches["mozilla-aurora", ]):
        if b.locale == "en-US":
            continue
        branches.add(set_default({"name": "comm-aurora"}, b))
        # b.url = "https://hg.mozilla.org/releases/mozilla-aurora"

    return branches
def append_query(self, es_query, start):
    self.start = start
    if not isinstance(self.edge.value, Variable):
        script_field = self.edge.value.to_ruby()
        missing = self.edge.value.missing()

        output = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "script_field": script_field,
                    "size": self.domain.limit
                }},
                es_query
            ),
            "_missing": set_default({"filter": missing.to_esfilter()}, es_query) if missing else None
        }})
        return output

    output = wrap({"aggs": {
        "_match": set_default(
            {"terms": {
                "field": self.edge.value.var,
                "size": self.domain.limit
            }},
            es_query
        ),
        "_missing": set_default({"missing": {"field": self.edge.value}}, es_query)  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
    }})
    return output
def get_branches(settings):
    # GET MAIN PAGE
    response = http.get(settings.url)
    doc = BeautifulSoup(response.all_content)

    all_repos = doc("table")[1]
    branches = UniqueIndex(["name", "locale"], fail_on_dup=False)
    for i, r in enumerate(all_repos("tr")):
        dir, name = [v.text.strip() for v in r("td")]
        b = get_branch(settings, name, dir.lstrip("/"))
        branches.extend(b)

    # branches.add(set_default({"name": "release-mozilla-beta"}, branches["mozilla-beta", DEFAULT_LOCALE]))
    for b in list(branches["mozilla-beta", ]):
        branches.add(set_default({"name": "release-mozilla-beta"}, b))

    for b in list(branches["mozilla-release", ]):
        branches.add(set_default({"name": "release-mozilla-release"}, b))

    for b in list(branches["mozilla-aurora", ]):
        if b.locale == "en-US":
            continue
        branches.add(set_default({"name": "comm-aurora"}, b))

    return branches
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if isinstance(edge.value, Variable):
        calc = {"field": edge.value.var}
    else:
        calc = {"script_field": edge.value.to_ruby()}

    if edge.allowNulls:    # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": {"or": [
                OrOp("or", [
                    InequalityOp("lt", [edge.value, Literal(None, to_float(_min))]),
                    InequalityOp("gte", [edge.value, Literal(None, to_float(_max))]),
                ]).to_esfilter(),
                edge.value.missing().to_esfilter()
            ]}},
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
def append_query(self, es_query, start):
    # TODO: USE "reverse_nested" QUERY TO PULL THESE
    self.start = start
    for i, v in enumerate(self.fields):
        nest = wrap({"aggs": {
            "_match": set_default(
                {"terms": {
                    "field": v,
                    "size": self.domain.limit
                }},
                es_query
            )
        }})
        if self.edge.allowNulls:
            nest.aggs._missing = set_default({"missing": {"field": v}}, es_query)  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        es_query = nest

    if self.domain.where:
        filter = simplify_esfilter(self.domain.where)
        es_query = {"aggs": {"_filter": set_default({"filter": filter}, es_query)}}

    return es_query
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if is_keyword(edge.value):
        calc = {"field": edge.value}
    else:
        calc = {"script": qb_expression_to_ruby(edge.value)}

    if is_keyword(edge.value):
        missing_range = {"or": [
            {"range": {edge.value: {"lt": to_float(_min)}}},
            {"range": {edge.value: {"gte": to_float(_max)}}}
        ]}
    else:
        missing_range = {"script": {"script": qb_expression_to_ruby({"or": [
            {"lt": [edge.value, to_float(_min)]},
            {"gt": [edge.value, to_float(_max)]},
        ]})}}

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": set_default(
            {"filter": {"or": [
                missing_range,
                {"missing": {"field": get_all_vars(edge.value)}}
            ]}},
            es_query
        ),
    }})
def es_setop(es, query):
    es_query, filters = es14.util.es_query_template(query.frum.name)
    set_default(filters[0], simplify_esfilter(query.where.to_esfilter()))
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = DictList()
    return extract_rows(es, es_query, query)
def _delayed_imports():
    global type2container

    from pyLibrary.queries.qb_usingMySQL import MySQL
    from pyLibrary.queries.qb_usingES import FromES

    set_default(type2container, {
        "elasticsearch": FromES,
        "mysql": MySQL,
        "memory": None
    })
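# A hedged, self-contained sketch of the registry pattern used by the
# _delayed_imports() variants above (hypothetical names): type2container maps
# type strings to container classes, and set_default() only fills entries not
# already registered, so prior registrations win.
_registry = {}

def _register_defaults(defaults):
    for k, v in defaults.items():
        _registry.setdefault(k, v)   # MIRRORS set_default: EXISTING ENTRIES WIN

_registry["mysql"] = "custom_mysql_container"
_register_defaults({"mysql": "default_mysql", "memory": None})
assert _registry["mysql"] == "custom_mysql_container"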
def _convert_clause(self, clause):
    """
    JSON QUERY EXPRESSIONS HAVE MANY CLAUSES WITH SIMILAR COLUMN DECLARATIONS
    """
    if clause == None:
        return None
    elif isinstance(clause, Mapping):
        return set_default({"value": self.convert(clause["value"])}, clause)
    else:
        return [set_default({"value": self.convert(c.value)}, c) for c in clause]
def append_query(self, es_query, start):
    self.start = start
    return wrap({"aggs": {
        "_match": set_default(
            {"terms": {
                "field": self.edge.value,
                "size": self.edge.domain.limit
            }},
            es_query
        ),
        "_missing": set_default({"missing": {"field": self.edge.value}}, es_query),
    }})
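# A hedged example of the aggregation body the method above produces for a
# hypothetical edge on field "status" with limit 10 (shape inferred from the
# code, not captured from a live query); es_query's own properties are merged
# into each bucket by set_default():
#
#   {"aggs": {
#       "_match":   {"terms": {"field": "status", "size": 10}, ...es_query...},
#       "_missing": {"missing": {"field": "status"}, ...es_query...}
#   }}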
def _convert_clause(self, clause):
    """
    Qb QUERIES HAVE MANY CLAUSES WITH SIMILAR COLUMN DECLARATIONS
    """
    clause = wrap(clause)

    if clause == None:
        return None
    elif isinstance(clause, Mapping):
        return set_default({"value": self.convert(clause.value)}, clause)
    else:
        return [set_default({"value": self.convert(c.value)}, c) for c in clause]
def parse_comment(bug, comment):
    bug = bug.copy()
    subtests = []
    lines = comment.comment.split('\n')
    for line in lines:
        if not line.strip():
            continue
        elif line.startswith('log: https://treeherder.mozilla.org'):
            bug.treeherder = line.split('log: ')[1]
            continue
        elif line.startswith('buildname'):
            bug.build.name = line.split('buildname: ')[1]
            continue
        elif line.startswith('repository: '):
            bug.branch.name = line.split('repository: ')[1]
            continue
        elif line.startswith('machine: '):
            bug.machine.name = line.split('machine: ')[1]
            continue
        elif line.startswith('who: '):
            continue
        elif line.startswith('revision'):
            try:
                bug.build.revision = line.split('revision: ')[1]
                continue
            except:
                Log.error("exception splitting bug {{bug_id}} line on 'revision: ', {{line}}", bug_id=bug.id, line=line)
        elif line.startswith('start_time'):
            bug.timestamp = Date(line.split('start_time: ')[1])
            continue
        elif line.startswith('submit_timestamp'):
            bug.timestamp = line.split('submit_timestamp: ')[1]
            continue

        # ANY LINE NOT MATCHED ABOVE IS EXPECTED TO BE A "status | name | message" SUBTEST
        parts = line.split("|")
        if len(parts) == 3:
            try:
                subtest = Dict()
                subtest.subtest = parse_status(parts[0])
                subtest.subtest.name = parts[1].strip()
                subtest.subtest.message = parts[2].strip()
                subtest.subtest.in_ad = any(subtest.subtest.message.find(t) >= 0 for t in timeouts)
                set_default(subtest, bug)
                subtest.subtest.comment_line = line
                subtest.subtest.report_ts = Date(comment.modified_ts)
                subtests.append(subtest)
            except Exception, e:
                Log.note("IGNORED LINE {{bug_id}} {{line}}", line=line, bug_id=bug.bug_id)
        else:
            Log.note("IGNORED LINE {{bug_id}} {{line}}", line=line, bug_id=bug.bug_id)

    return subtests  # ASSUMPTION: the accumulated subtests are the result (the flattened source ended without a return)
def append_query(self, es_query, start):
    self.start = start
    for i, (k, v) in enumerate(self.fields):
        es_query = wrap({"aggs": {
            "_match": set_default({"terms": {"field": v}}, es_query),
            "_missing": set_default({"missing": {"field": v}}, es_query),
        }})

    if self.edge.domain.where:
        filter = simplify_esfilter(self.edge.domain.where)
        es_query = {"aggs": {"_filter": set_default({"filter": filter}, es_query)}}

    return es_query
def append_query(self, es_query, start):
    self.start = start
    return wrap({"aggs": {
        "_match": set_default({"terms": {"field": self.edge.value}}, es_query),
        "_missing": set_default({"missing": {"field": self.edge.value}}, es_query),
    }})
def wrap_from(frum, schema=None):
    """
    :param frum:
    :param schema:
    :return:
    """
    if not type2container:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")

        settings = set_default(
            {
                "index": split_field(frum)[0],
                "name": frum,
            },
            config.default.settings
        )
        settings.type = None  # WE DO NOT WANT TO INFLUENCE THE TYPE BECAUSE NONE IS IN THE frum STRING ANYWAY
        return type2container["elasticsearch"](settings)
    elif isinstance(frum, Mapping) and frum.type and type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from pyLibrary.queries.query import Query
        return Query(frum, schema=schema)
    else:
        return frum
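# A hedged usage sketch for wrap_from() (the index name and settings below are
# hypothetical): a bare string is treated as an elasticsearch index resolved
# through the default settings, while a Mapping carrying a "type" is
# dispatched through the type2container registry.
#
#     container = wrap_from("unittest.test_results")
#     container = wrap_from({"type": "mysql", "settings": {"host": "..."}})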
def unexpected(
    cls,
    template,
    default_params={},
    cause=None,
    stack_depth=0,
    log_context=None,
    **more_params
):
    if isinstance(default_params, BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)

    if cause and not isinstance(cause, Except):
        cause = Except(UNEXPECTED, unicode(cause), trace=_extract_traceback(0))

    trace = extract_stack(1)
    e = Except(UNEXPECTED, template, params, cause, trace)
    Log.note(
        "{{error}}",
        error=e,
        log_context=set_default({"context": WARNING}, log_context),
        stack_depth=stack_depth + 1
    )
def warning(cls, template, default_params={}, cause=None, stack_depth=0, log_context=None, **more_params):
    """
    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if isinstance(default_params, BaseException):
        cause = default_params
        default_params = {}

    if "values" in more_params.keys():
        Log.error("Can not handle a logging parameter by name `values`")

    params = dict(unwrap(default_params), **more_params)
    cause = unwraplist([Except.wrap(c) for c in listwrap(cause)])
    trace = exceptions.extract_stack(stack_depth + 1)

    e = Except(exceptions.WARNING, template, params, cause, trace)
    Log.note(
        "{{error|unicode}}",
        error=e,
        log_context=set_default({"context": exceptions.WARNING}, log_context),
        stack_depth=stack_depth + 1
    )
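# A hedged usage sketch of the warning() classmethod above (the record and its
# fields are invented): the template uses moustache-style placeholders filled
# from default_params/more_params, cause chains the underlying exception, and
# log_context rides along for downstream log structures.
#
#     try:
#         process(record)
#     except Exception, e:
#         Log.warning("failed to process {{id}}", id=record.id, cause=e)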
def unexpected(cls, template, default_params={}, cause=None, stack_depth=0, log_context=None, **more_params):
    """
    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param cause: *Exception* for chaining
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    if isinstance(default_params, BaseException):
        cause = default_params
        default_params = {}

    params = dict(unwrap(default_params), **more_params)

    if cause and not isinstance(cause, Except):
        cause = Except(exceptions.UNEXPECTED, unicode(cause), trace=exceptions._extract_traceback(0))

    trace = exceptions.extract_stack(1)
    e = Except(exceptions.UNEXPECTED, template, params, cause, trace)
    Log.note(
        "{{error}}",
        error=e,
        log_context=set_default({"context": exceptions.WARNING}, log_context),
        stack_depth=stack_depth + 1
    )
def alarm(
    cls,
    template,
    default_params={},
    stack_depth=0,
    log_context=None,
    **more_params
):
    """
    :param template: *string* human readable string with placeholders for parameters
    :param default_params: *dict* parameters to fill in template
    :param stack_depth: *int* how many calls you want popped off the stack to report the *true* caller
    :param log_context: *dict* extra key:value pairs for your convenience
    :param more_params: *any more parameters (which will overwrite default_params)
    :return:
    """
    # USE replace() AS POOR MAN'S CHILD TEMPLATE
    template = ("*" * 80) + "\n" + indent(template, prefix="** ").strip() + "\n" + ("*" * 80)
    Log.note(
        template,
        default_params=default_params,
        stack_depth=stack_depth + 1,
        log_context=set_default({"context": exceptions.ALARM}, log_context),
        **more_params
    )
def send(self, topic, message):
    """Publishes a pulse message to the proper exchange."""
    if not message:
        Log.error("Expecting a message")

    message._prepare()

    if not self.connection:
        self.connect()

    producer = Producer(
        channel=self.connection,
        exchange=Exchange(self.settings.exchange, type='topic'),
        routing_key=topic
    )

    # The message is actually a simple envelope format with a payload and
    # some metadata.
    final_data = Dict(
        payload=message.data,
        _meta=set_default({
            'exchange': self.settings.exchange,
            'routing_key': message.routing_key,
            'serializer': self.settings.serializer,
            'sent': time_to_string(datetime.datetime.now(timezone(self.settings.broker_timezone))),
            'count': self.count
        }, message.metadata)
    )

    producer.publish(jsons.scrub(final_data), serializer=self.settings.serializer)
    self.count += 1
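# A hedged usage sketch of send() (publisher, topic, and message are
# hypothetical): the envelope written to the exchange nests the payload under
# "payload" and the merged metadata under "_meta", as built above; the
# message's own metadata wins over the defaults via set_default().
#
#     publisher.send("build.success", message)   # message MUST BE NON-EMPTY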
def __new__(cls, e=None, query=None, *args, **kwargs):
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        if query.groupby:
            return object.__new__(DefaultDecoder, e)

        if isinstance(e.value, basestring):
            Log.error("Expecting Variable or Expression, not plain string")

        if isinstance(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(isinstance(t, Variable) for t in e.value.terms):
                Log.error("Can only handle variables in tuples")
            e.domain = Dict(dimension={"fields": e.value.terms})
            return object.__new__(DimFieldListDecoder, e)
        elif isinstance(e.value, Variable):
            cols = query.frum.get_columns()
            col = cols.filter(lambda c: c.name == e.value.var)[0]
            if not col:
                return object.__new__(DefaultDecoder, e)
            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)
            if col.partitions != None:
                e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
            else:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
            return object.__new__(DefaultDecoder, e)
        else:
            return object.__new__(DefaultDecoder, e)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder, e)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder, e)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder, e)
    if e.range:
        return object.__new__(GeneralRangeDecoder, e)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder, e)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder, e)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if isinstance(fields, Mapping):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder, e)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder, e)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def _delayed_imports():
    global type2container
    global _ListContainer

    from pyLibrary.queries.containers.lists import ListContainer as _ListContainer
    _ = _ListContainer

    from pyLibrary.queries.qb_usingMySQL import MySQL
    from pyLibrary.queries.qb_usingES import FromES
    from pyLibrary.queries.meta import FromESMetadata

    set_default(type2container, {
        "elasticsearch": FromES,
        "mysql": MySQL,
        "memory": None,
        "meta": FromESMetadata
    })
def upsert_column(self, c):
    existing_columns = filter(lambda r: r.table == c.table and r.abs_name == c.abs_name, self.columns.data)
    if not existing_columns:
        self.columns.add(c)
        cols = filter(lambda r: r.table == "meta.columns", self.columns.data)
        for cc in cols:
            cc.partitions = cc.cardinality = cc.last_updated = None
        self.todo.add(c)
        self.todo.extend(cols)
    else:
        set_default(existing_columns[0], c)
        self.todo.add(existing_columns[0])

    # TEST CONSISTENCY
    for c, d in product(list(self.todo.queue), list(self.todo.queue)):
        if c.abs_name == d.abs_name and c.table == d.table and c != d:
            Log.error("")
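# The update path above merges the new column info into the existing record
# with set_default(existing, c): fields already known are kept, missing ones
# are filled in. A hedged sketch with invented column fields (assuming
# set_default() returns the merged, wrapped first argument):
_merged = set_default({"type": "string"}, {"type": "long", "cardinality": 3})
assert _merged.type == "string"       # KNOWN FIELDS KEPT
assert _merged.cardinality == 3       # MISSING FIELDS FILLED IN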
def _get_branches_from_hg(settings):
    # GET MAIN PAGE
    response = http.get(settings.url)
    doc = BeautifulSoup(response.all_content)

    all_repos = doc("table")[1]
    branches = UniqueIndex(["name", "locale"], fail_on_dup=False)
    for i, r in enumerate(all_repos("tr")):
        dir, name = [v.text.strip() for v in r("td")]
        b = _get_single_branch_from_hg(settings, name, dir.lstrip("/"))
        branches.extend(b)

    # branches.add(set_default({"name": "release-mozilla-beta"}, branches["mozilla-beta", DEFAULT_LOCALE]))
    for b in list(branches["mozilla-beta", ]):
        branches.add(set_default({"name": "release-mozilla-beta"}, b))  # THIS IS THE l10n "name"
        b.url = "https://hg.mozilla.org/releases/mozilla-beta"

    for b in list(branches["mozilla-release", ]):
        branches.add(set_default({"name": "release-mozilla-release"}, b))

    for b in list(branches["mozilla-aurora", ]):
        if b.locale == "en-US":
            continue
        branches.add(set_default({"name": "comm-aurora"}, b))
        # b.url = "https://hg.mozilla.org/releases/mozilla-aurora"

    for b in list(branches):
        if b.name.startswith("mozilla-esr"):
            branches.add(set_default({"name": "release-" + b.name}, b))  # THIS IS THE l10n "name"
            b.url = "https://hg.mozilla.org/releases/" + b.name

    # CHECKS
    for b in branches:
        if b.name != b.name.lower():
            Log.error("Expecting lowercase name")
        if not b.locale:
            Log.error("Not expected")
        if not b.url.startswith("http"):
            Log.error("Expecting a valid url")
        if not b.etl.timestamp:
            Log.error("Expecting a timestamp")

    return branches
def query(self, _query):
    return self.meta.columns.query(QueryOp(set_default(
        {
            "from": self.meta.columns,
            "sort": ["table", "name"]
        },
        _query.as_dict()
    )))
def __new__(cls, e=None, query=None, *args, **kwargs):
    if query.groupby:
        # GROUPBY ASSUMES WE IGNORE THE DOMAIN RANGE
        e.allowNulls = False
    else:
        e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        if query.groupby:
            return object.__new__(DefaultDecoder, e)

        if isinstance(e.value, basestring):
            Log.error("Expecting Variable or Expression, not plain string")

        if isinstance(e.value, Variable):
            cols = query.frum.get_columns()
            col = cols.filter(lambda c: c.name == e.value.var)[0]
            if not col:
                return object.__new__(DefaultDecoder, e)
            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)
            if col.partitions != None:
                e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
            else:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
            return object.__new__(DefaultDecoder, e)
        else:
            return object.__new__(DefaultDecoder, e)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder, e)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder, e)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder, e)
    if e.range:
        return object.__new__(GeneralRangeDecoder, e)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder, e)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder, e)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if isinstance(fields, Mapping):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder, e)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder, e)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def _replace_ref(node, url):
    if url.path.endswith("/"):
        url.path = url.path[:-1]

    if isinstance(node, Mapping):
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = URL(v)
            else:
                output[k] = _replace_ref(v, url)

        if not ref:
            return output

        node = output

        if not ref.scheme and not ref.path:
            # DO NOT TOUCH LOCAL REF YET
            output["$ref"] = ref
            return output

        if not ref.scheme:
            # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH
            # REQUIRES THE CURRENT DOCUMENT'S SCHEME
            ref.scheme = url.scheme

        # FIND THE SCHEME AND LOAD IT
        if ref.scheme in scheme_loaders:
            new_value = scheme_loaders[ref.scheme](ref, url)
        else:
            raise _Log.error("unknown protocol {{scheme}}", scheme=ref.scheme)

        if ref.fragment:
            new_value = dot.get_attr(new_value, ref.fragment)

        if DEBUG:
            _Log.note("Replace {{ref}} with {{new_value}}", ref=ref, new_value=new_value)

        if not output:
            output = new_value
        else:
            output = unwrap(set_default(output, new_value))

        if DEBUG:
            _Log.note("Return {{output}}", output=output)

        return output
    elif isinstance(node, list):
        output = [_replace_ref(n, url) for n in node]
        # if all(p[0] is p[1] for p in zip(output, node)):
        #     return node
        return output

    return node
def log_loop(settings, synch, queue, bucket, please_stop):
    with aws.Queue(settings.work_queue) as work_queue:
        for i, g in qb.groupby(queue, size=settings.param.size):
            Log.note(
                "Preparing {{num}} pulse messages to bucket={{bucket}}",
                num=len(g),
                bucket=bucket.name
            )

            full_key = unicode(synch.next_key) + ":" + unicode(MIN(g.select("_meta.count")))
            try:
                output = [
                    set_default(
                        d,
                        {"etl": {
                            "name": "Pulse block",
                            "bucket": settings.destination.bucket,
                            "timestamp": Date.now().unix,
                            "id": synch.next_key,
                            "source": {
                                "name": "pulse.mozilla.org",
                                "id": d._meta.count,
                                "count": d._meta.count,
                                "message_id": d._meta.message_id,
                                "sent": Date(d._meta.sent),
                            },
                            "type": "aggregation"
                        }}
                    )
                    for i, d in enumerate(g)
                    if d != None  # HAPPENS WHEN PERSISTENT QUEUE FAILS TO LOG start
                ]
                bucket.write(full_key, "\n".join(convert.value2json(d) for d in output))
                synch.advance()
                synch.source_key = MAX(g.select("_meta.count")) + 1

                now = Date.now()
                work_queue.add({
                    "bucket": bucket.name,
                    "key": full_key,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

                synch.ping()
                queue.commit()
                Log.note(
                    "Wrote {{num}} pulse messages to bucket={{bucket}}, key={{key}}",
                    num=len(g),
                    bucket=bucket.name,
                    key=full_key
                )
            except Exception, e:
                queue.rollback()
                if not queue.closed:
                    Log.warning("Problem writing {{key}} to S3", key=full_key, cause=e)

            if please_stop:
                break
def _get_and_retry(self, url, branch, **kwargs):
    """
    requests 2.5.0 HTTPS IS A LITTLE UNSTABLE
    """
    kwargs = set_default(kwargs, {"timeout": self.timeout.seconds})
    try:
        return _get_url(url, branch, **kwargs)
    except Exception, e:
        pass  # NOTE: FAILURES ARE SWALLOWED; THE CALLER GETS None
def test_meta(self):
    test = wrap({
        "query": {"from": "meta.columns"},
        "data": [
            {"a": "b"}
        ]
    })

    settings = self.utils.fill_container(test, tjson=False)

    table_name = settings.index

    # WE REQUIRE A QUERY TO FORCE LOADING OF METADATA
    pre_test = {
        "query": {
            "from": table_name
        },
        "expecting_list": {
            "meta": {"format": "list"},
            "data": [{"a": "b"}]
        }
    }
    self.utils.send_queries(pre_test)

    test = set_default(test, {
        "query": {
            "select": ["name", "table", "type", "nested_path"],
            "from": "meta.columns",
            "where": {"eq": {"table": table_name}}
        },
        "expecting_list": {
            "meta": {"format": "list"},
            "data": [
                {"table": table_name, "name": "a", "type": "string", "nested_path": "."}
            ]
        },
        "expecting_table": {
            "meta": {"format": "table"},
            "header": ["table", "name", "type", "nested_path"],
            "data": [[table_name, "a", "string", "."]]
        },
        "expecting_cube": {
            "meta": {"format": "cube"},
            "edges": [
                {
                    "name": "rownum",
                    "domain": {"type": "rownum", "min": 0, "max": 1, "interval": 1}
                }
            ],
            "data": {
                "table": [table_name],
                "name": ["a"],
                "type": ["string"],
                "nested_path": ["."]
            }
        }
    })
    self.utils.send_queries(test)
def get_schema(self, retry=True):
    if self.settings.explore_metadata:
        indices = self.cluster.get_metadata().indices
        if not self.settings.alias or self.settings.alias == self.settings.index:
            # PARTIALLY DEFINED settings
            candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
            # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE

            index = "dummy value"
            schema = wrap({"_routing": {}, "properties": {}})
            for _, ind in jx.sort(candidates, {"value": 0, "sort": -1}):
                mapping = ind.mappings[self.settings.type]
                set_default(schema._routing, mapping._routing)
                schema.properties = _merge_mapping(schema.properties, mapping.properties)
        else:
            # FULLY DEFINED settings
            index = indices[self.settings.index]
            schema = index.mappings[self.settings.type]

        if index == None and retry:
            # TRY AGAIN, JUST IN CASE
            self.cluster.cluster_state = None
            return self.get_schema(retry=False)

        # TODO: REMOVE THIS BUG CORRECTION
        if not schema and self.settings.type == "test_result":
            schema = index.mappings["test_results"]
        # DONE BUG CORRECTION

        if not schema:
            Log.error(
                "ElasticSearch index ({{index}}) does not have type ({{type}})",
                index=self.settings.index,
                type=self.settings.type
            )
        return schema
    else:
        mapping = self.cluster.get(self.path + "/_mapping")
        if not mapping[self.settings.type]:
            Log.error("{{index}} does not have type {{type}}", self.settings)
        return wrap({"mappings": mapping[self.settings.type]})
def _replace_ref(node, url):
    if url.path.endswith("/"):
        url.path = url.path[:-1]

    if isinstance(node, Mapping):
        ref, raw_ref, node["$ref"] = URL(node["$ref"]), node["$ref"], None

        # RECURS
        return_value = node
        candidate = {}
        for k, v in node.items():
            new_v = _replace_ref(v, url)
            candidate[k] = new_v
            if new_v is not v:
                return_value = candidate

        if not ref:
            return return_value
        else:
            node = return_value

        if not ref.scheme and not ref.path:
            # DO NOT TOUCH LOCAL REF YET
            node["$ref"] = ref
            return node

        if not ref.scheme:
            # SCHEME RELATIVE IMPLIES SAME PROTOCOL AS LAST TIME, WHICH
            # REQUIRES THE CURRENT DOCUMENT'S SCHEME
            ref.scheme = url.scheme

        # FIND THE SCHEME AND LOAD IT
        if ref.scheme in scheme_loaders:
            new_value = scheme_loaders[ref.scheme](ref, url)
        else:
            raise Log.error("unknown protocol {{scheme}}", scheme=ref.scheme)

        if ref.fragment:
            new_value = new_value[ref.fragment]

        if isinstance(new_value, Mapping):
            return set_default({}, node, new_value)
        elif node.keys() and new_value == None:
            return node
        else:
            return wrap(new_value)
    elif isinstance(node, list):
        candidate = [_replace_ref(n, url) for n in node]
        if all(p[0] is p[1] for p in zip(candidate, node)):
            return node
        return candidate

    return node
def append_query(self, es_query, start):
    self.start = start

    domain = self.domain
    field = self.edge.value

    if isinstance(field, Variable):
        key = domain.key
        if isinstance(key, (tuple, list)) and len(key) == 1:
            key = key[0]
        include = [p[key] for p in domain.partitions]
        if self.edge.allowNulls:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "field": field.var,
                    "size": self.limit,
                    "include": include
                }}, es_query),
                "_missing": set_default(
                    {"filter": {"or": [
                        field.missing().to_esfilter(),
                        {"not": {"terms": {field.var: include}}}
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "field": field.var,
                    "size": self.limit,
                    "include": include
                }}, es_query)
            }})
    else:
        include = [p[domain.key] for p in domain.partitions]
        if self.edge.allowNulls:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "script_field": field.to_ruby(),
                    "size": self.limit,
                    "include": include
                }}, es_query),
                "_missing": set_default(
                    {"filter": {"or": [
                        field.missing().to_esfilter(),
                        NotOp("not", InOp("in", [field, Literal("literal", include)])).to_esfilter()
                    ]}},
                    es_query
                ),
            }})
        else:
            return wrap({"aggs": {
                "_match": set_default({"terms": {
                    "script_field": field.to_ruby(),
                    "size": self.limit,
                    "include": include
                }}, es_query)
            }})
def wrap_from(frum, schema=None):
    """
    :param frum:
    :param schema:
    :return:
    """
    if not _containers:
        _delayed_imports()

    frum = wrap(frum)

    if isinstance(frum, basestring):
        if not _containers.config.default.settings:
            Log.error("expecting pyLibrary.queries.query.config.default.settings to contain default elasticsearch connection info")

        type_ = None
        index = frum

        if frum.startswith("meta."):
            if frum == "meta.columns":
                return _meta.singlton.meta.columns
            elif frum == "meta.tables":
                return _meta.singlton.meta.tables
            else:
                Log.error("{{name}} not a recognized table", name=frum)
        else:
            type_ = _containers.config.default.type
            index = join_field(split_field(frum)[:1:])

        settings = set_default(
            {
                "index": index,
                "name": frum
            },
            _containers.config.default.settings
        )
        settings.type = None
        return _containers.type2container[type_](settings)
    elif isinstance(frum, Mapping) and frum.type and _containers.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        if not frum.type:
            Log.error("Expecting from clause to have a 'type' property")
        return _containers.type2container[frum.type](frum.settings)
    elif isinstance(frum, Mapping) and (frum["from"] or isinstance(frum["from"], (list, set))):
        from pyLibrary.queries.query import QueryOp
        return QueryOp.wrap(frum, schema=schema)
    elif isinstance(frum, (list, set)):
        return _ListContainer("test_list", frum)
    else:
        return frum
def _get_and_retry(self, url, **kwargs):
    """
    requests 2.5.0 HTTPS IS A LITTLE UNSTABLE
    """
    kwargs = set_default(kwargs, {"timeout": self.timeout.seconds})
    try:
        return http.get(url, **kwargs)
    except Exception, e:
        try:
            Thread.sleep(seconds=5)
            return http.get(url.replace("https://", "http://"), **kwargs)
        except Exception, f:
            Log.error("Tried {{url}} twice. Both failed.", {"url": url}, cause=[e, f])
def _convert_edge(self, edge):
    dim = self.dimensions[edge.value]
    if not dim:
        return edge

    if len(listwrap(dim.fields)) == 1:
        # TODO: CHECK IF EDGE DOMAIN AND DIMENSION DOMAIN CONFLICT
        new_edge = set_default({"value": unwraplist(dim.fields)}, edge)
        return new_edge
        new_edge.domain = dim.getDomain()  # NOTE: UNREACHABLE IN THE SOURCE (FOLLOWS THE return)

    edge = copy(edge)
    edge.value = None
    edge.domain = dim.getDomain()
    return edge
def _replace_locals(node, doc_path):
    if isinstance(node, Mapping):
        # RECURS, DEEP COPY
        ref = None
        output = {}
        for k, v in node.items():
            if k == "$ref":
                ref = v
            elif v == None:
                continue
            else:
                output[k] = _replace_locals(v, [v] + doc_path)

        if not ref:
            return output

        # REFER TO SELF
        frag = ref.fragment
        if frag[0] == ".":
            # RELATIVE
            for i, p in enumerate(frag):
                if p != ".":
                    if i > len(doc_path):
                        _Log.error("{{frag|quote}} reaches up past the root document", frag=frag)
                    new_value = dot.get_attr(doc_path[i - 1], frag[i::])
                    break
            else:
                new_value = doc_path[len(frag) - 1]
        else:
            # ABSOLUTE
            new_value = dot.get_attr(doc_path[-1], frag)

        new_value = _replace_locals(new_value, [new_value] + doc_path)

        if not output:
            return new_value  # OPTIMIZATION FOR CASE WHEN node IS {}
        else:
            return unwrap(set_default(output, new_value))

    elif isinstance(node, list):
        candidate = [_replace_locals(n, [n] + doc_path) for n in node]
        # if all(p[0] is p[1] for p in zip(candidate, node)):
        #     return node
        return candidate

    return node
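# A hedged example of the fragment convention handled above (the document and
# values are invented, not from the source): an absolute fragment like "a.b"
# is looked up from the root document, while a leading run of dots walks up
# doc_path one level per dot before the lookup.
#
#     doc = {"a": {"b": 1}, "c": {"$ref": "#a.b"}}   # c RESOLVES TO 1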