def _aggs_iterator(agg, d):
    # DEPTH-FIRST WALK OF NESTED AGGREGATION RESULTS
    # (EXPECTS parts TO BE DEFINED IN THE ENCLOSING SCOPE)
    deeper = coalesce(agg._filter, agg._nested)
    while deeper:
        agg = deeper
        deeper = coalesce(agg._filter, agg._nested)

    if d > 0:
        for b in agg._match.buckets:
            parts[d] = b
            for a in _aggs_iterator(b, d - 1):
                yield a
        parts[d] = Null
        for b in agg._other.buckets:
            for a in _aggs_iterator(b, d - 1):
                yield a
        b = agg._missing
        if b.doc_count:
            for a in _aggs_iterator(b, d - 1):
                yield a
    else:
        for b in agg._match.buckets:
            parts[d] = b
            if b.doc_count:
                yield b
        parts[d] = Null
        for b in agg._other.buckets:
            if b.doc_count:
                yield b
        b = agg._missing
        if b.doc_count:
            yield b
def __init__(self, **desc):
    desc = wrap(desc)
    self._set_slots_to_null(self.__class__)
    set_default(self, desc)
    self.name = coalesce(desc.name, desc.type)
    self.isFacet = coalesce(desc.isFacet, False)
    self.dimension = Null
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if is_keyword(edge.value):
        calc = {"field": edge.value}
    else:
        calc = {"script": qb_expression_to_ruby(edge.value)}

    if is_keyword(edge.value):
        missing_range = {"or": [
            {"range": {edge.value: {"lt": to_float(_min)}}},
            {"range": {edge.value: {"gte": to_float(_max)}}}
        ]}
    else:
        missing_range = {"script": {"script": qb_expression_to_ruby({"or": [
            {"lt": [edge.value, to_float(_min)]},
            {"gt": [edge.value, to_float(_max)]},
        ]})}}

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": set_default(
            {"filter": {"or": [
                missing_range,
                {"missing": {"field": get_all_vars(edge.value)}}
            ]}},
            es_query
        ),
    }})
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if sort == None:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring):
            output.append({"value": jx_expression(s), "sort": 1})
        elif isinstance(s, Expression):
            output.append({"value": s, "sort": 1})
        elif Math.is_integer(s):
            output.append({"value": OffsetOp("offset", s), "sort": 1})
        elif all(d in sort_direction for d in s.values()) and not s.sort and not s.value:
            # A MAPPING OF {variable: direction} PAIRS; USE THE GIVEN DIRECTION
            for v, d in s.items():
                output.append({"value": jx_expression(v), "sort": sort_direction[d]})
        else:
            output.append({
                "value": jx_expression(coalesce(s.value, s.field)),
                "sort": coalesce(sort_direction[s.sort], 1)
            })
    return output
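# A quick illustration of the normal form _normalize_sort produces. These are
# hypothetical inputs, and they assume sort_direction maps "desc"/-1 to -1 and
# "asc"/1/None to 1 (that mapping is defined elsewhere in this codebase):
#
#   "a"                          -> [{"value": jx_expression("a"), "sort": 1}]
#   {"a": "desc"}                -> [{"value": jx_expression("a"), "sort": -1}]
#   {"value": "b", "sort": -1}   -> [{"value": jx_expression("b"), "sort": -1}]
#   ["a", {"field": "b", "sort": -1}]
#                                -> ascending a, then descending b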
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if isinstance(edge.value, Variable):
        calc = {"field": edge.value.var}
    else:
        calc = {"script_field": edge.value.to_ruby()}

    if edge.allowNulls:  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": {"or": [
                OrOp("or", [
                    InequalityOp("lt", [edge.value, Literal(None, to_float(_min))]),
                    InequalityOp("gte", [edge.value, Literal(None, to_float(_max))]),
                ]).to_esfilter(),
                edge.value.missing().to_esfilter()
            ]}},
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
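# For orientation, a sketch of the aggregation clause _range_composer builds.
# The field name and partition bounds are illustrative only, not captured
# output; the shape follows directly from the wrap({"aggs": ...}) above:
#
#   {"aggs": {
#       "_match": {"range": {
#           "field": "timestamp",
#           "ranges": [{"from": 0.0, "to": 10.0}, {"from": 10.0, "to": 20.0}]
#       }},
#       "_missing": {"filter": {"or": [
#           <out-of-range filter from OrOp(...).to_esfilter()>,
#           <missing-value filter from edge.value.missing().to_esfilter()>
#       ]}}
#   }}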
def __init__(
    self,
    host,
    index,
    type=None,
    alias=None,
    name=None,
    port=9200,
    read_only=True,
    typed=None,
    settings=None
):
    Container.__init__(self, None)
    if not containers.config.default:
        containers.config.default.settings = settings
    self.settings = settings
    self.name = coalesce(name, alias, index)
    if read_only:
        self._es = elasticsearch.Alias(alias=coalesce(alias, index), settings=settings)
    else:
        self._es = elasticsearch.Cluster(settings=settings).get_index(read_only=read_only, settings=settings)

    self.meta = FromESMetadata(settings=settings)
    self.settings.type = self._es.settings.type
    self.edges = Dict()
    self.worker = None

    columns = self.get_columns(table_name=name)
    self._schema = Schema(columns)

    if typed == None:
        # SWITCH ON TYPED MODE
        self.typed = any(c.name in ("$value", "$object") for c in columns)
    else:
        self.typed = typed
def __init__(self, host, index, type=None, alias=None, name=None, port=9200, read_only=True, typed=None, settings=None):
    Container.__init__(self, None)
    if not containers.config.default:
        containers.config.default.settings = settings
    self.settings = settings
    self.name = coalesce(name, alias, index)
    if read_only:
        self._es = elasticsearch.Alias(alias=coalesce(alias, index), settings=settings)
    else:
        self._es = elasticsearch.Cluster(settings=settings).get_index(read_only=read_only, settings=settings)

    self.meta = FromESMetadata(settings=settings)
    self.settings.type = self._es.settings.type
    self.edges = Dict()
    self.worker = None
    if typed == None:
        self._columns = self.get_columns(table_name=index)
        # SWITCH ON TYPED MODE
        self.typed = any(c.name in ("$value", "$object") for c in self._columns)
    else:
        self.typed = typed
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if isinstance(edge.value, Variable):
        calc = {"field": edge.value.var}
    else:
        calc = {"script_field": edge.value.to_ruby()}

    if edge.allowNulls:  # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": {"or": [
                OrOp("or", [
                    BinaryOp("lt", [edge.value, Literal(None, to_float(_min))]),
                    BinaryOp("gte", [edge.value, Literal(None, to_float(_max))]),
                ]).to_esfilter(),
                edge.value.missing().to_esfilter()
            ]}},
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
def _convert_select(self, select):
    if isinstance(select, basestring):
        return Dict(
            name=select.rstrip("."),  # TRAILING DOT INDICATES THE VALUE, BUT IS INVALID FOR THE NAME
            value=select,
            aggregate="none"
        )
    else:
        select = wrap(select)
        output = copy(select)
        if not select.value or isinstance(select.value, basestring):
            if select.value == ".":
                output.name = coalesce(select.name, select.aggregate)
            else:
                output.name = coalesce(select.name, select.value, select.aggregate)
        elif not output.name:
            Log.error("Must give name to each column in select clause")

        if not output.name:
            Log.error("expecting select to have a name: {{select}}", select=select)

        output.aggregate = coalesce(canonical_aggregates.get(select.aggregate), select.aggregate, "none")
        return output
def Stats2ZeroMoment(stats):
    # MODIFIED FROM http://statsmodels.sourceforge.net/devel/_modules/statsmodels/stats/moment_helpers.html
    # ADDED count
    mc0, mc1, mc2, skew, kurt = stats.count, coalesce(stats.mean, 0), coalesce(stats.variance, 0), coalesce(stats.skew, 0), coalesce(stats.kurtosis, 0)

    mz0 = mc0
    mz1 = mc1 * mc0
    mz2 = (mc2 + mc1 * mc1) * mc0
    mc3 = coalesce(skew, 0) * (mc2 ** 1.5)  # 3rd central moment
    mz3 = (mc3 + 3 * mc1 * mc2 + mc1 ** 3) * mc0  # 3rd non-central moment
    mc4 = (coalesce(kurt, 0) + 3.0) * (mc2 ** 2.0)  # 4th central moment
    mz4 = (mc4 + 4 * mc1 * mc3 + 6 * mc1 * mc1 * mc2 + mc1 ** 4) * mc0

    m = ZeroMoment(mz0, mz1, mz2, mz3, mz4)
    if DEBUG:
        from pyLibrary.testing.fuzzytestcase import assertAlmostEqualValue

        globals()["DEBUG"] = False
        try:
            v = ZeroMoment2Stats(m)
            assertAlmostEqualValue(v.count, stats.count, places=10)
            assertAlmostEqualValue(v.mean, stats.mean, places=10)
            assertAlmostEqualValue(v.variance, stats.variance, places=10)
            assertAlmostEqualValue(v.skew, stats.skew, places=10)
            assertAlmostEqualValue(v.kurtosis, stats.kurtosis, places=10)
        except Exception, e:
            v = ZeroMoment2Stats(m)
            Log.error("programmer error")
        globals()["DEBUG"] = True
    return m
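# The identities implemented above are the standard central-to-raw moment
# relations (mu = mean, mu_k = k-th central moment, gamma_1 = skew,
# gamma_2 = excess kurtosis); each mz_k is then n * E[X^k]:
#
#   E[X]   = mu
#   E[X^2] = mu_2 + mu^2
#   E[X^3] = mu_3 + 3*mu*mu_2 + mu^3,                where mu_3 = gamma_1 * mu_2**1.5
#   E[X^4] = mu_4 + 4*mu*mu_3 + 6*mu^2*mu_2 + mu^4,  where mu_4 = (gamma_2 + 3) * mu_2**2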
def __init__(
    self,
    exchange,                  # name of the Pulse exchange
    topic,                     # message name pattern to subscribe to ('#' is wildcard)
    target=None,               # WILL BE CALLED WITH PULSE PAYLOADS AND ack() IF COMPLETED WITHOUT EXCEPTION
    target_queue=None,         # (aka self.queue) WILL BE FILLED WITH PULSE PAYLOADS
    host='pulse.mozilla.org',  # url to connect
    port=5671,                 # tcp port
    user=None,
    password=None,
    vhost="/",
    start=0,                   # USED AS STARTING POINT FOR ASSIGNING THE _meta.count ATTRIBUTE
    ssl=True,
    applabel=None,
    heartbeat=False,           # True to also get the Pulse heartbeat message
    durable=False,             # True to keep queue after shutdown
    serializer='json',
    broker_timezone='GMT',
    settings=None
):
    self.target_queue = target_queue
    self.pulse_target = target
    if (target_queue == None and target == None) or (target_queue != None and target != None):
        Log.error("Expecting a queue (for fast digesters) or a target (for slow digesters)")

    Thread.__init__(self, name="Pulse consumer for " + settings.exchange, target=self._worker)
    self.settings = settings
    settings.callback = self._got_result
    settings.user = coalesce(settings.user, settings.username)
    settings.applabel = coalesce(settings.applabel, settings.queue, settings.queue_name)
    settings.topic = topic

    self.pulse = ModifiedGenericConsumer(settings, connect=True, **settings)
    self.count = coalesce(start, 0)
    self.start()
def add_alias(self, alias=None):
    if alias:
        self.cluster_state = None
        self.cluster._post(
            "/_aliases",
            data=convert.unicode2utf8(convert.value2json({
                "actions": [{"add": {"index": self.settings.index, "alias": alias}}]
            })),
            timeout=coalesce(self.settings.timeout, 30)
        )
    else:
        # SET ALIAS ACCORDING TO LIFECYCLE RULES
        self.cluster_state = None
        self.cluster._post(
            "/_aliases",
            data=convert.unicode2utf8(convert.value2json({
                "actions": [{"add": {"index": self.settings.index, "alias": self.settings.alias}}]
            })),
            timeout=coalesce(self.settings.timeout, 30)
        )
def window(data, param):
    """
    MAYBE WE CAN DO THIS WITH NUMPY (no, the edges of windows are not graceful with numpy)
    data - list of records
    """
    name = param.name            # column to assign window function result
    edges = param.edges          # columns to group by
    where = param.where          # DO NOT CONSIDER THESE VALUES
    sortColumns = param.sort     # columns to sort by
    calc_value = wrap_function(jx_expression_to_function(param.value))  # function that takes a record and returns a value (for aggregation)
    aggregate = param.aggregate  # WindowFunction to apply
    _range = param.range         # of form {"min":-10, "max":0} to specify the size and relative position of window

    data = filter(data, where)

    if not aggregate and not edges:
        if sortColumns:
            data = sort(data, sortColumns, already_normalized=True)
        # SIMPLE CALCULATED VALUE
        for rownum, r in enumerate(data):
            r[name] = calc_value(r, rownum, data)
        return

    if not aggregate or aggregate == "none":
        for _, values in groupby(data, edges.value):
            if not values:
                continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE
            sequence = sort(values, sortColumns, already_normalized=True)
            for rownum, r in enumerate(sequence):
                r[name] = calc_value(r, rownum, sequence)
        return

    for keys, values in groupby(data, edges.value):
        if not values:
            continue  # CAN DO NOTHING WITH THIS ZERO-SAMPLE

        sequence = sort(values, sortColumns)

        for rownum, r in enumerate(sequence):
            r["__temp__"] = calc_value(r, rownum, sequence)

        head = coalesce(_range.max, _range.stop)
        tail = coalesce(_range.min, _range.start)

        # PRELOAD total
        total = aggregate()
        for i in range(tail, head):
            total.add(sequence[i].__temp__)

        # WINDOW FUNCTION APPLICATION
        for i, r in enumerate(sequence):
            r[name] = total.end()
            total.add(sequence[i + head].__temp__)
            total.sub(sequence[i + tail].__temp__)

    for r in data:
        r["__temp__"] = None  # CLEANUP
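# The head/tail bookkeeping in window() is easier to see in isolation. Below is
# a minimal, self-contained sketch of the same rolling-aggregate pattern (plain
# Python with hypothetical names, not part of pyLibrary): preload the window
# for row 0, then slide it one row at a time by adding the entering value and
# subtracting the leaving one. Out-of-range indexes contribute zero here; the
# version above relies on out-of-range list access yielding Null, which the
# aggregates ignore (an assumption about the surrounding library).

def rolling_sum(values, tail, head):
    # ROLLING SUM OVER WINDOW [i + tail, i + head) FOR EACH ROW i
    n = len(values)
    get = lambda i: values[i] if 0 <= i < n else 0

    total = sum(get(i) for i in range(tail, head))  # PRELOAD total FOR ROW 0
    out = []
    for i in range(n):
        out.append(total)
        total += get(i + head)  # VALUE ENTERING THE WINDOW
        total -= get(i + tail)  # VALUE LEAVING THE WINDOW
    return out

# EXAMPLE: {"min": -3, "max": 0} IS THE THREE PREVIOUS ROWS, EXCLUDING CURRENT:
# rolling_sum([1, 2, 3, 4], tail=-3, head=0) == [0, 1, 3, 6]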
def __new__(cls, e=None, query=None, *args, **kwargs):
    e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        if query.groupby:
            return object.__new__(DefaultDecoder, e)

        if isinstance(e.value, basestring):
            Log.error("Expecting Variable or Expression, not plain string")

        if isinstance(e.value, TupleOp):
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            if not all(isinstance(t, Variable) for t in e.value.terms):
                Log.error("Can only handle variables in tuples")
            e.domain = Dict(dimension={"fields": e.value.terms})
            return object.__new__(DimFieldListDecoder, e)
        elif isinstance(e.value, Variable):
            cols = query.frum.get_columns()
            col = cols.filter(lambda c: c.name == e.value.var)[0]
            if not col:
                return object.__new__(DefaultDecoder, e)

            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)
            if col.partitions != None:
                e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
            else:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
                return object.__new__(DefaultDecoder, e)
        else:
            return object.__new__(DefaultDecoder, e)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder, e)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder, e)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder, e)
    if e.range:
        return object.__new__(GeneralRangeDecoder, e)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder, e)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder, e)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if isinstance(fields, Mapping):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder, e)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder, e)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def __new__(cls, e=None, query=None, *args, **kwargs):
    if query.groupby:
        # GROUPBY ASSUMES WE IGNORE THE DOMAIN RANGE
        e.allowNulls = False
    else:
        e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        if query.groupby:
            return object.__new__(DefaultDecoder, e)

        if isinstance(e.value, basestring):
            Log.error("Expecting Variable or Expression, not plain string")

        if isinstance(e.value, Variable):
            cols = query.frum.get_columns()
            col = cols.filter(lambda c: c.name == e.value.var)[0]
            if not col:
                return object.__new__(DefaultDecoder, e)

            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)
            if col.partitions != None:
                e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
            else:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
                return object.__new__(DefaultDecoder, e)
        else:
            return object.__new__(DefaultDecoder, e)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder, e)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder, e)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder, e)
    if e.range:
        return object.__new__(GeneralRangeDecoder, e)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder, e)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder, e)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if isinstance(fields, Mapping):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder, e)
    elif not e.value and all(e.domain.partitions.where):
        return object.__new__(GeneralSetDecoder, e)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def _normalize_edge(edge, schema=None):
    if not _Column:
        _late_import()

    if isinstance(edge, basestring):
        if schema:
            e = schema[edge]
            if e:
                if isinstance(e, _Column):
                    return Dict(
                        name=edge,
                        value=edge,
                        allowNulls=True,
                        domain=_normalize_domain(schema=schema)
                    )
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(
                        name=e.name,
                        value=e.fields[0],
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Dict(
                        name=e.name,
                        allowNulls=True,
                        domain=e.getDomain()
                    )
        return Dict(
            name=edge,
            value=edge,
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (list, set)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Dict(fields=edge.value)
            return Dict(
                name=edge.name,
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                domain=domain
            )

        domain = _normalize_domain(edge.domain, schema=schema)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            domain=domain
        )
def full_etl(settings):
    schema = convert.json2value(convert.value2json(SCHEMA), leaves=True)
    Cluster(settings.destination).get_or_create_index(settings=settings.destination, schema=schema, limit_replicas=True)
    destq = FromES(settings.destination)
    if settings.incremental:
        min_bug_id = destq.query({
            "from": coalesce(settings.destination.alias, settings.destination.index),
            "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
        })
        min_bug_id = int(MAX(min_bug_id - 1000, 0))
    else:
        min_bug_id = 0

    sourceq = FromES(settings.source)
    max_bug_id = sourceq.query({
        "from": coalesce(settings.source.alias, settings.source.index),
        "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
    }) + 1
    max_bug_id = int(coalesce(max_bug_id, 0))

    # FIRST, GET ALL MISSING BUGS
    for s, e in qb.reverse(list(qb.intervals(min_bug_id, max_bug_id, 10000))):
        with Timer("pull {{start}}..{{end}} from ES", {"start": s, "end": e}):
            children = sourceq.query({
                "from": settings.source.alias,
                "select": ["bug_id", "dependson", "blocked", "modified_ts", "expires_on"],
                "where": {"and": [
                    {"range": {"bug_id": {"gte": s, "lt": e}}},
                    {"or": [
                        {"exists": "dependson"},
                        {"exists": "blocked"}
                    ]}
                ]},
                "limit": 10000
            })

        with Timer("fixpoint work"):
            to_fix_point(settings, destq, children.data)

    # PROCESS RECENT CHANGES
    with Timer("pull recent dependencies from ES"):
        children = sourceq.query({
            "from": settings.source.alias,
            "select": ["bug_id", "dependson", "blocked"],
            "where": {"and": [
                {"range": {"modified_ts": {"gte": convert.datetime2milli(datetime.utcnow() - timedelta(days=7))}}},
                {"or": [
                    {"exists": "dependson"},
                    {"exists": "blocked"}
                ]}
            ]},
            "limit": 100000
        })

    to_fix_point(settings, destq, children.data)
def comparer(left, right):
    left = coalesce(left)
    right = coalesce(right)
    for f in formal:
        try:
            result = value_compare(f.func(left), f.func(right), f.sort)
            if result != 0:
                return result
        except Exception, e:
            Log.error("problem with compare", e)
    return 0  # ALL SORT VALUES EQUAL
def single(col, r):
    min = coalesce(r["gte"], r[">="])
    max = coalesce(r["lte"], r["<="])
    if min and max:
        # SPECIAL CASE (BETWEEN)
        return db.quote_column(col) + " BETWEEN " + db.quote_value(min) + " AND " + db.quote_value(max)
    else:
        return " AND ".join(
            db.quote_column(col) + name2sign[sign] + db.quote_value(value)
            for sign, value in r.items()
        )
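# Example output (hypothetical column and values; assumes db.quote_column and
# db.quote_value produce standard SQL quoting):
#
#   single("age", {"gte": 18, "lte": 65})  ->  age BETWEEN 18 AND 65
#   single("age", {"gt": 18})              ->  age > 18   (via the name2sign branch)
#
# NOTE: `min and max` is falsy when a bound is 0, so {"gte": 0, "lte": 9} skips
# the BETWEEN special case and falls through to name2sign; the generated SQL is
# still equivalent.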
def comparer(left, right):
    left = coalesce(left)
    right = coalesce(right)
    for f in formal:
        try:
            result = value_compare(left[f.field], right[f.field], f.sort)
            if result != 0:
                return result
        except Exception, e:
            Log.error("problem with compare", e)
    return 0  # ALL SORT VALUES EQUAL
def main():
    settings = startup.read_settings(defs={
        "name": ["--restart", "--reset", "--redo"],
        "help": "force a reprocessing of all data",
        "action": "store_true",
        "dest": "restart"
    })
    Log.start(settings.debug)

    try:
        with startup.SingleInstance(flavor_id=settings.args.filename):
            if settings.args.restart:
                reviews = Cluster(settings.destination).create_index(settings.destination)
            else:
                reviews = Cluster(settings.destination).get_proto(settings.destination)

            bugs = Cluster(settings.source).get_index(settings.source)

            with FromES(bugs) as esq:
                es_max_bug = esq.query({
                    "from": "private_bugs",
                    "select": {"name": "max_bug", "value": "bug_id", "aggregate": "maximum"}
                })

            # PROBE WHAT RANGE OF BUGS IS LEFT TO DO (IN EVENT OF FAILURE)
            with FromES(reviews) as esq:
                es_min_bug = esq.query({
                    "from": "reviews",
                    "select": {"name": "min_bug", "value": "bug_id", "aggregate": "minimum"}
                })

            batch_size = coalesce(bugs.settings.batch_size, settings.size, 1000)
            threads = coalesce(settings.threads, 4)
            Log.note(str(settings.min_bug))
            min_bug = int(coalesce(settings.min_bug, 0))
            max_bug = int(coalesce(settings.max_bug, Math.min(es_min_bug + batch_size * threads, es_max_bug)))

            with ThreadedQueue(reviews, batch_size=coalesce(reviews.settings.batch_size, 100)) as sink:
                func = functools.partial(full_etl, settings, sink)
                with Multithread(func, threads=threads) as m:
                    m.inbound.silent = True
                    Log.note("bugs from {{min}} to {{max}}, step {{step}}", {
                        "min": min_bug,
                        "max": max_bug,
                        "step": batch_size
                    })
                    m.execute(reversed([{"bugs": range(s, e)} for s, e in qb.intervals(min_bug, max_bug, size=1000)]))

            if settings.args.restart:
                reviews.add_alias()
                reviews.delete_all_but_self()
    finally:
        Log.stop()
def format_list_from_aggop(decoders, aggs, start, query, select):
    agg = aggs
    b = coalesce(agg._filter, agg._nested)
    while b:
        agg = b
        b = coalesce(agg._filter, agg._nested)

    item = Dict()
    for s in select:
        item[s.name] = agg[s.pull]

    return wrap({"meta": {"format": "list"}, "data": [item]})
def format_cube_from_aggop(decoders, aggs, start, query, select):
    agg = aggs
    b = coalesce(agg._filter, agg._nested)
    while b:
        agg = b
        b = coalesce(agg._filter, agg._nested)

    matricies = [(s, Matrix(dims=[], zeros=(s.aggregate == "count"))) for s in select]
    for s, m in matricies:
        m[tuple()] = agg[s.pull]

    cube = Cube(query.select, [], {s.name: m for s, m in matricies})
    cube.frum = query
    return cube
def __new__(cls, e=None, query=None, *args, **kwargs):
    if query.groupby:
        # GROUPBY ASSUMES WE IGNORE THE DOMAIN RANGE
        e.allowNulls = False
    else:
        e.allowNulls = coalesce(e.allowNulls, True)

    if e.value and e.domain.type == "default":
        if query.groupby:
            return object.__new__(DefaultDecoder, e)

        if isinstance(e.value, basestring):
            Log.error("Not expected anymore")

        if isinstance(e.value, Variable):
            cols = query.frum.get_columns()
            col = cols.filter(lambda c: c.name == e.value.var)[0]
            if not col:
                return object.__new__(DefaultDecoder, e)

            limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)
            if col.partitions != None:
                e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
            else:
                e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
                return object.__new__(DefaultDecoder, e)
        else:
            return object.__new__(DefaultDecoder, e)

    if e.value and e.domain.type in PARTITION:
        return object.__new__(SetDecoder, e)
    if isinstance(e.domain.dimension, Dimension):
        e.domain = e.domain.dimension.getDomain()
        return object.__new__(SetDecoder, e)
    if e.value and e.domain.type == "time":
        return object.__new__(TimeDecoder, e)
    if e.value and e.domain.type == "duration":
        return object.__new__(DurationDecoder, e)
    elif e.value and e.domain.type == "range":
        return object.__new__(RangeDecoder, e)
    elif not e.value and e.domain.dimension.fields:
        # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
        # JUST PULL THE FIELDS
        fields = e.domain.dimension.fields
        if isinstance(fields, Mapping):
            Log.error("No longer allowed: All objects are expressions")
        else:
            return object.__new__(DimFieldListDecoder, e)
    else:
        Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
def format_table_from_aggop(decoders, aggs, start, query, select):
    header = select.name
    agg = aggs
    b = coalesce(agg._filter, agg._nested)
    while b:
        agg = b
        b = coalesce(agg._filter, agg._nested)

    row = []
    for s in select:
        row.append(agg[s.pull])

    return Dict(meta={"format": "table"}, header=header, data=[row])
def percent(value, decimal=None, digits=None, places=None):
    value = float(value)
    if value == 0.0:
        return "0%"

    digits = coalesce(digits, places)
    if digits != None:
        left_of_decimal = int(math.ceil(math.log10(abs(value)))) + 2
        decimal = digits - left_of_decimal

    decimal = coalesce(decimal, 0)
    right_of_decimal = max(decimal, 0)
    format = "{:." + _unicode(right_of_decimal) + "%}"
    return format.format(__builtin__.round(value, decimal + 2))
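# A few worked calls, checked by hand against the arithmetic above (assumes
# standard math.log10 and round behavior):
#
#   percent(0.123, digits=2)
#       log10(0.123) ~ -0.91, ceil -> 0, so left_of_decimal = 2
#       decimal = 2 - 2 = 0, round(0.123, 2) = 0.12  ->  "12%"
#
#   percent(0.0123, digits=2)
#       log10(0.0123) ~ -1.91, ceil -> -1, so left_of_decimal = 1
#       decimal = 2 - 1 = 1, round(0.0123, 3) = 0.012  ->  "1.2%"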
def send_email(self,
    from_address=None,
    to_address=None,
    subject=None,
    text_data=None,
    html_data=None
):
    """Sends an email.

    from_addr is an email address; to_addrs is a list of email addresses.
    Addresses can be plain (e.g. "*****@*****.**") or with real names
    (e.g. "John Smith <*****@*****.**>").

    text_data and html_data are both strings.  You can specify one or both.
    If you specify both, the email will be sent as a MIME multipart
    alternative, i.e., the recipient will see the HTML content if his
    viewer supports it; otherwise he'll see the text content.
    """
    settings = self.settings

    from_address = coalesce(from_address, settings["from"], settings.from_address)
    to_address = listwrap(coalesce(to_address, settings.to_address, settings.to_addrs))

    if not from_address or not to_address:
        raise Exception("Both from_addr and to_addrs must be specified")
    if not text_data and not html_data:
        raise Exception("Must specify either text_data or html_data")

    if not html_data:
        msg = MIMEText(text_data)
    elif not text_data:
        msg = MIMEText(html_data, 'html')
    else:
        msg = MIMEMultipart('alternative')
        msg.attach(MIMEText(text_data, 'plain'))
        msg.attach(MIMEText(html_data, 'html'))

    msg['Subject'] = coalesce(subject, settings.subject)
    msg['From'] = from_address
    msg['To'] = ', '.join(to_address)

    if self.server:
        # CALL AS PART OF A SMTP SESSION
        self.server.sendmail(from_address, to_address, msg.as_string())
    else:
        # CALL AS STAND-ALONE
        with self:
            self.server.sendmail(from_address, to_address, msg.as_string())
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """
    if not sort:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            output.append({"field": s, "sort": 1})
        else:
            output.append({"field": coalesce(s.field, s.value), "sort": coalesce(sort_direction[s.sort], 1)})
    return wrap(output)
def _normalize_edge(edge, schema=None):
    if not _Column:
        _late_import()

    if isinstance(edge, basestring):
        if schema:
            e = schema[edge]
            if e:
                if isinstance(e, _Column):
                    return Dict(
                        name=edge,
                        value=jx_expression(edge),
                        allowNulls=True,
                        domain=_normalize_domain(schema=schema)
                    )
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(
                        name=e.name,
                        value=jx_expression(e.fields[0]),
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Dict(
                        name=e.name,
                        allowNulls=True,
                        domain=e.getDomain()
                    )
        return Dict(
            name=edge,
            value=jx_expression(edge),
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound and complex edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (list, set)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Dict(fields=edge.value)
            return Dict(
                name=edge.name,
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                domain=domain
            )

        domain = _normalize_domain(edge.domain, schema=schema)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=jx_expression(edge.value),
            range=_normalize_range(edge.range),
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            domain=domain
        )
def _convert_edge(self, edge):
    if isinstance(edge, basestring):
        return Dict(
            name=edge,
            value=edge,
            domain=self._convert_domain()
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (Mapping, list)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = self._convert_domain()
            domain.dimension = Dict(fields=edge.value)
            return Dict(
                name=edge.name,
                allowNulls=False if edge.allowNulls is False else True,
                domain=domain
            )

        domain = self._convert_domain(edge.domain)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=False if edge.allowNulls is False else True,
            domain=domain
        )
def query(self, sql, param=None):
    """
    RETURN LIST OF dicts
    """
    self._execute_backlog()
    try:
        old_cursor = self.cursor
        if not old_cursor:
            # ALLOW NON-TRANSACTIONAL READS
            self.cursor = self.db.cursor()
            self.cursor.execute("SET TIME_ZONE='+00:00'")
            self.cursor.close()
            self.cursor = self.db.cursor()

        if param:
            sql = expand_template(sql, self.quote_param(param))
        sql = self.preamble + outdent(sql)
        if self.debug:
            Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))

        self.cursor.execute(sql)
        columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])]
        fixed = [[utf8_to_unicode(c) for c in row] for row in self.cursor]
        result = convert.table2list(columns, fixed)

        if not old_cursor:
            # CLEANUP AFTER NON-TRANSACTIONAL READS
            self.cursor.close()
            self.cursor = None

        return result
    except Exception, e:
        if isinstance(e, InterfaceError) or e.message.find("InterfaceError") >= 0:
            Log.error("Did you close the db connection?", e)
        Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def __init__(self, host, index, type=None, alias=None, name=None, port=9200, settings=None):
    self.settings = settings
    self.name = coalesce(name, alias, index)
    self._es = elasticsearch.Alias(alias=coalesce(alias, index), settings=settings)
    self.settings.type = self._es.settings.type  # Alias() WILL ASSIGN A TYPE IF IT WAS MISSING
    self.edges = Dict()
    self.worker = None
    self.ready = False
def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id(s) to process. Use \"..\" for a range.",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.id:
            etl_one(settings)
            return

        hg = HgMozillaOrg(settings=settings.hg)
        resources = Dict(hg=dictwrap(hg))

        stopper = Signal()
        for i in range(coalesce(settings.param.threads, 1)):
            ETL(
                name="ETL Loop " + unicode(i),
                work_queue=settings.work_queue,
                resources=resources,
                workers=settings.workers,
                settings=settings.param,
                please_stop=stopper
            )

        Thread.wait_for_shutdown_signal(stopper, allow_exit=True)
    except Exception, e:
        Log.error("Problem with etl", e)
def convert(self, expr):
    """
    EXPAND INSTANCES OF name TO value
    """
    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_keyword(expr):
        return coalesce(self.dimensions[expr], expr)
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, QueryOp):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.leaves()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return converter_map.get(k, self._convert_bop)(self, k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
    else:
        return expr
def get_meta(self, key, conforming=True):
    try:
        metas = list(self.bucket.list(prefix=key))
        metas = wrap([m for m in metas if m.name.find(".json") != -1])

        perfect = Null
        favorite = Null
        too_many = False
        error = None
        for m in metas:
            try:
                simple = strip_extension(m.key)
                if conforming:
                    self._verify_key_format(simple)
                if simple == key:
                    perfect = m
                    too_many = False
                if simple.startswith(key + ".") or simple.startswith(key + ":"):
                    if favorite and not perfect:
                        too_many = True
                    favorite = m
            except Exception, e:
                error = e

        if too_many:
            Log.error(
                "multiple keys in {{bucket}} with prefix={{prefix|quote}}: {{list}}",
                bucket=self.name,
                prefix=key,
                list=[k.name for k in metas]
            )
        if not perfect and error:
            Log.error("Problem with key request", error)
        return coalesce(perfect, favorite)
    except Exception, e:
        # ASSUMPTION: THE ORIGINAL HANDLER WAS LOST WHEN THIS SNIPPET WAS FLATTENED;
        # RE-RAISE THROUGH Log.error LIKE THE REST OF THIS CODEBASE
        Log.error("Problem getting metadata for {{key}}", key=key, cause=e)
def es_setop(es, query):
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)
    source = "fields"
    for s in select:
        if s.value == "*":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif s.value == ".":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            es_query.fields.append(s.value)
        elif isinstance(s.value, list) and es_query.fields is not None:
            es_query.fields.extend(s.value)
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}

    return extract_rows(es, es_query, source, select, query)
def compileDuration2Term(edge):
    if edge.esscript:
        Log.error("edge script not supported yet")

    # IS THERE A LIMIT ON THE DOMAIN?
    numPartitions = len(edge.domain.partitions)

    value = edge.value
    if isKeyword(value):
        value = "doc[\"" + value + "\"].value"

    ref = coalesce(edge.domain.min, edge.domain.max, durations.ZERO)
    nullTest = compileNullTest(edge)
    ms = edge.domain.interval.milli
    if edge.domain.interval.month > 0:
        ms = durations.YEAR.milli / 12 * edge.domain.interval.month

    # COERCE NUMBERS TO STRINGS FOR CONCATENATION
    partition2int = "Math.floor((" + value + "-" + value2MVEL(ref) + ")/" + unicode(ms) + ")"
    partition2int = "((" + nullTest + ") ? " + unicode(numPartitions) + " : " + partition2int + ")"

    def int2Partition(value):
        if Math.round(value) == numPartitions:
            return edge.domain.NULL
        return edge.domain.getPartByKey(ref.add(edge.domain.interval.multiply(value)))

    return Dict(toTerm={"head": "", "body": partition2int}, fromTerm=int2Partition)
def __init__(self, edge, query, limit):
    AggsDecoder.__init__(self, edge, query, limit)
    self.domain = edge.domain
    self.domain.limit = Math.min(coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT)
    self.parts = list()
    self.key2index = {}
    self.computed_domain = False
def __init__(self, host, index, type="log", max_size=1000, batch_size=100, settings=None):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    self.es = Cluster(settings).get_or_create_index(
        schema=convert.json2value(convert.value2json(SCHEMA), leaves=True),
        limit_replicas=True,
        tjson=True,
        settings=settings
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(settings.alias, settings.index))
    self.queue = Queue("debug logs to es", max=max_size, silent=True)

    self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
    self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))

    Thread.run("add debug logs to es", self._insert_loop)
def __init__(self, edge, query):
    AggsDecoder.__init__(self, edge, query)
    self.fields = edge.domain.dimension.fields
    self.domain = self.edge.domain
    self.domain.limit = Math.min(coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT)
    self.parts = list()
def _normalize_group(edge, schema=None):
    if isinstance(edge, basestring):
        return wrap({
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "domain": {"type": "default"}
        })
    else:
        edge = wrap(edge)
        if (edge.domain and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return wrap({
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value),
            "allowNulls": True,
            "domain": {"type": "default"}
        })
def execute_sql(host, username, password, sql, schema=None, param=None, settings=None):
    """EXECUTE MANY LINES OF SQL (FROM SQLDUMP FILE, MAYBE?)"""
    settings.schema = coalesce(settings.schema, settings.database)

    if param:
        with MySQL(settings) as temp:
            sql = expand_template(sql, temp.quote_param(param))

    # We have no way to execute an entire SQL file in bulk, so we
    # have to shell out to the commandline client.
    args = [
        "mysql",
        "-h{0}".format(settings.host),
        "-u{0}".format(settings.username),
        "-p{0}".format(settings.password)
    ]
    if settings.schema:
        args.append("{0}".format(settings.schema))

    try:
        proc = subprocess.Popen(
            args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=-1
        )
        if isinstance(sql, unicode):
            sql = sql.encode("utf8")
        (output, _) = proc.communicate(sql)
    except Exception, e:
        Log.error("Can not call \"mysql\"", e)
def _convert_group(self, column):
    if isinstance(column, basestring):
        return wrap({
            "name": column,
            "value": column,
            "domain": {"type": "default"}
        })
    else:
        column = wrap(column)
        if (column.domain and column.domain.type != "default") or column.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not column.name and not isinstance(column.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=column)

        return wrap({
            "name": coalesce(column.name, column.value),
            "value": column.value,
            "domain": {"type": "default"}
        })
def _get_from_elasticsearch(self, revision, locale=None):
    rev = revision.changeset.id
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"prefix": {"changeset.id": rev[0:12]}},
                {"term": {"branch.name": revision.branch.name}},
                {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}}
            ]}
        }},
        "size": 2000,
    }

    try:
        docs = self.es.search(query, timeout=120).hits.hits
        if len(docs) > 1:
            for d in docs:
                if d._id.endswith(d._source.branch.locale):
                    return d._source
            Log.warning("expecting no more than one document")
        return docs[0]._source
    except Exception, e:
        Log.warning("Bad ES call", e)
        return None
def convert(self, expr):
    """
    EXPAND INSTANCES OF name TO value
    """
    if expr is True or expr == None or expr is False:
        return expr
    elif Math.is_number(expr):
        return expr
    elif expr == ".":
        return "."
    elif is_keyword(expr):
        return coalesce(self.dimensions[expr], expr)
    elif isinstance(expr, basestring):
        Log.error("{{name|quote}} is not a valid variable name", name=expr)
    elif isinstance(expr, Date):
        return expr
    elif isinstance(expr, Query):
        return self._convert_query(expr)
    elif isinstance(expr, Mapping):
        if expr["from"]:
            return self._convert_query(expr)
        elif len(expr) >= 2:
            # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
            return wrap({name: self.convert(value) for name, value in expr.leaves()})
        else:
            # ASSUME SINGLE-CLAUSE EXPRESSION
            k, v = expr.items()[0]
            return converter_map.get(k, self._convert_bop)(self, k, v)
    elif isinstance(expr, (list, set, tuple)):
        return wrap([self.convert(value) for value in expr])
    else:
        return expr
def _normalize_edge(self, edge):
    """
    CONVERT AN EDGE DEFINITION INTO A SIMPLE ARRAY OF PATH-LEAF DEFINITIONS
    [{"name": <pathA>, "value": <pathB>}, ...]

    USEFUL FOR DECLARING HIGH-LEVEL DIMENSIONS, AND RELIEVING LOW LEVEL PATH PAIRS
    """
    if isinstance(edge, basestring):
        e = self[edge]
        if e:
            domain = e.getDomain()
            fields = domain.dimension.fields

            if isinstance(fields, list):
                if len(fields) == 1:
                    return [{"value": fields[0]}]
                else:
                    return [{"name": (edge + "[" + str(i) + "]"), "value": v} for i, v in enumerate(fields)]
            elif isinstance(fields, Mapping):
                return [{"name": (edge + "." + k), "value": v} for k, v in fields.items()]
            else:
                Log.error("do not know how to handle")

        return [{"name": edge, "value": edge}]
    else:
        return [{"name": coalesce(edge.name, edge.value), "value": edge.value}]
def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
    global _elasticsearch
    if hasattr(self, "settings"):
        return

    from pyLibrary.queries.containers.lists import ListContainer
    from pyLibrary.env import elasticsearch as _elasticsearch

    self.settings = settings
    self.default_name = coalesce(name, alias, index)
    self.default_es = _elasticsearch.Cluster(settings=settings)
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.meta = Dict()
    table_columns = metadata_tables()
    column_columns = metadata_columns()
    self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
    self.meta.columns = ListContainer("meta.columns", [], wrap({c.name: c for c in column_columns}))
    self.meta.columns.insert(column_columns)
    self.meta.columns.insert(table_columns)

    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        self.worker = Thread.run("refresh metadata", self.monitor)
    else:
        self.worker = Thread.run("refresh metadata", self.not_monitor)
    return
def backup_name(self, timestamp=None):
    """
    RETURN A FILENAME THAT CAN SERVE AS A BACKUP FOR THIS FILE
    """
    suffix = convert.datetime2string(coalesce(timestamp, datetime.now()), "%Y%m%d_%H%M%S")
    return File.add_suffix(self._filename, suffix)
def _get_from_elasticsearch(self, revision, locale=None):
    rev = revision.changeset.id
    query = {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"prefix": {"changeset.id": rev[0:12]}},
                {"term": {"branch.name": revision.branch.name}},
                {"term": {"branch.locale": coalesce(locale, DEFAULT_LOCALE)}}
            ]}
        }},
        "size": 2000,
    }
    docs = self.es.search(query).hits.hits
    if len(docs) > 1:
        Log.error("expecting no more than one document")
    return docs[0]._source
def es_fieldop(es, query):
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {"match_all": {}},
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()

    source = "fields"
    for s in select.value:
        if s == "*":
            es_query.fields = None
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s):
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
def get_meta(self, key, conforming=True):
    try:
        # key_prefix("2")
        metas = list(self.bucket.list(prefix=key))
        metas = wrap([m for m in metas if m.name.find(".json") != -1])

        perfect = Null
        favorite = Null
        too_many = False
        error = None
        for m in metas:
            try:
                simple = strip_extension(m.key)
                if conforming:
                    self._verify_key_format(simple)
                if simple == key:
                    perfect = m
                    too_many = False
                if simple.startswith(key + ".") or simple.startswith(key + ":"):
                    if favorite and not perfect:
                        too_many = True
                    favorite = m
            except Exception, e:
                error = e

        if too_many:
            Log.error(
                "multiple keys in {{bucket}} with prefix={{prefix|quote}}: {{list}}",
                bucket=self.name,
                prefix=key,
                list=[k.name for k in metas]
            )
        if not perfect and error:
            Log.error("Problem with key request", error)
        return coalesce(perfect, favorite)
    except Exception, e:
        # ASSUMPTION: THE ORIGINAL HANDLER WAS LOST WHEN THIS SNIPPET WAS FLATTENED;
        # RE-RAISE THROUGH Log.error LIKE THE REST OF THIS CODEBASE
        Log.error("Problem getting metadata for {{key}}", key=key, cause=e)