def median(values, simple=True, mean_weight=0.0):
    """
    RETURN MEDIAN VALUE

    IF simple=False THEN IN THE EVENT MULTIPLE INSTANCES OF THE
    MEDIAN VALUE, THE MEDIAN IS INTERPOLATED BASED ON ITS POSITION
    IN THE MEDIAN RANGE

    mean_weight IS TO PICK A MEDIAN VALUE IN THE ODD CASE THAT IS
    CLOSER TO THE MEAN (PICK A MEDIAN BETWEEN TWO MODES IN BIMODAL CASE)
    """
    if OR(v == None for v in values):
        Log.error("median is not ready to handle None")

    try:
        if not values:
            return Null

        l = len(values)
        _sorted = sorted(values)

        middle = int(l / 2)
        _median = float(_sorted[middle])

        if len(_sorted) == 1:
            return _median

        if simple:
            if l % 2 == 0:
                return (_sorted[middle - 1] + _median) / 2
            return _median

        # FIND RANGE OF THE median
        # FIX: scan by looking at the element to the LEFT so the run can
        # legitimately start at index 0; the old loop unconditionally did
        # start_index += 1 after the scan, excluding index 0 when the
        # median run reached the start of the list
        start_index = middle
        while start_index > 0 and _sorted[start_index - 1] == _median:
            start_index -= 1

        stop_index = middle + 1
        while stop_index < l and _sorted[stop_index] == _median:
            stop_index += 1

        num_middle = stop_index - start_index

        if l % 2 == 0:
            if num_middle == 1:
                return (_sorted[middle - 1] + _median) / 2
            else:
                # FIX: force float division; under Python 2 both operands are
                # ints and the interpolated fraction truncated to 0
                return (_median - 0.5) + float(middle - start_index) / num_middle
        else:
            if num_middle == 1:
                return (1 - mean_weight) * _median + mean_weight * (_sorted[middle - 1] + _sorted[middle + 1]) / 2
            else:
                # 0.5 LITERAL ALREADY FORCES FLOAT DIVISION HERE
                return (_median - 0.5) + (middle + 0.5 - start_index) / num_middle
    except Exception as e:
        Log.error("problem with median of {{values}}", values=values, cause=e)
def _select(self, select):
    """
    RETURN A NEW Cube HOLDING ONLY THE REQUESTED select COLUMNS;
    WHEN ANY COLUMN REQUESTS AN AGGREGATE, EVERY MATRIX IS COLLAPSED
    TO A SINGLE VALUE AND THE RESULTING Cube HAS NO EDGES
    """
    wanted = listwrap(select)

    has_aggregate = OR(w.aggregate != None and w.aggregate != "none" for w in wanted)
    if has_aggregate:
        # COLLAPSE EACH MATRIX TO ONE VALUE USING ITS REQUESTED AGGREGATE
        collapsed = {}
        for w in wanted:
            collapsed[w.name] = Matrix(value=self.data[w.value].aggregate(w.aggregate))
        return Cube(select, [], collapsed)

    # NO AGGREGATION: KEEP THE EXISTING MATRICES AND EDGES
    kept = {w.name: self.data[w.value] for w in wanted}
    return Cube(select, self.edges, kept)
def __init__(self, select, edges, data, frum=None):
    """
    data IS EXPECTED TO BE A dict TO MATRICES, BUT OTHER COLLECTIONS ARE
    ALLOWED, USING THE select AND edges TO DESCRIBE THE data

    :param select: SINGLE select CLAUSE, OR LIST OF THEM
    :param edges: EDGE DESCRIPTIONS; FALSY MEANS DERIVE EDGES FROM data
    :param data: dict OF Matrix (OR list/Matrix/scalar, WHICH GET NORMALIZED)
    :param frum: UNUSED HERE; KEPT FOR CALLER COMPATIBILITY
    """
    self.is_value = False if isinstance(select, list) else True
    self.select = select
    self.meta = Data(format="cube")  # PUT EXTRA MARKUP HERE
    self.is_none = False

    # NOTE(review): this assumes data is dict-like on entry — verify callers
    if not all(data.values()):
        # FIX: WAS `is_none = True`, WHICH ONLY BOUND A LOCAL AND LEFT
        # self.is_none PERMANENTLY False
        self.is_none = True

    # ENSURE frum IS PROPER FORM
    if isinstance(select, list):
        if edges and OR(not isinstance(v, Matrix) for v in data.values()):
            Log.error("Expecting data to be a dict with Matrix values")

    if not edges:
        if not data:
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix.ZERO}
            self.edges = FlatList.EMPTY
        elif isinstance(data, Mapping):
            # EXPECTING NO MORE THAN ONE rownum EDGE IN THE DATA
            length = MAX([len(v) for v in data.values()])
            if length >= 1:
                self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum"}}])
            else:
                self.edges = FlatList.EMPTY
        elif isinstance(data, list):
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            # FIX: CAPTURE THE ROW COUNT BEFORE data IS REPLACED BY A
            # ONE-ENTRY dict; PREVIOUSLY "max" WAS ALWAYS len({...}) == 1
            num_rows = len(data)
            data = {select.name: Matrix.wrap(data)}
            self.edges = wrap([{"name": "rownum", "domain": {"type": "rownum", "min": 0, "max": num_rows, "interval": 1}}])
        elif isinstance(data, Matrix):
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: data}
            # NOTE(review): original left self.edges unset on this branch —
            # preserved; confirm no caller reaches here without edges
        else:
            if isinstance(select, list):
                Log.error("not expecting a list of records")

            data = {select.name: Matrix(value=data)}
            self.edges = FlatList.EMPTY
    else:
        self.edges = wrap(edges)

    self.data = data
def pretty_json(value):
    """
    RETURN A HUMAN-READABLE, MULTI-LINE JSON ENCODING OF value.

    Falls back through several strategies: Mapping -> sorted key/value pairs;
    strings -> quote(), then char-by-char escaping if that fails; lists ->
    column-packed layout for short items; objects with __data__/__json__ ->
    recurse on their serialized form; otherwise numeric coercion, then
    pypy_json_encode.  Any failure is routed to problem_serializing.
    """
    try:
        if value is False:
            return "false"
        elif value is True:
            return "true"
        elif isinstance(value, Mapping):
            try:
                # SORT KEYS FOR STABLE OUTPUT; DROP None VALUES
                items = sort_using_key(list(value.items()), lambda r: r[0])
                values = [encode_basestring(k) + PRETTY_COLON + indent(pretty_json(v)).strip() for k, v in items if v != None]
                if not values:
                    return "{}"
                elif len(values) == 1:
                    return "{" + values[0] + "}"
                else:
                    return "{\n" + INDENT + (",\n" + INDENT).join(values) + "\n}"
            except Exception as e:
                from mo_logs import Log
                from mo_math import OR

                # NON-STRING KEYS ARE THE MOST LIKELY CAUSE; REPORT THEM FIRST
                if OR(not isinstance(k, text_type) for k in value.keys()):
                    Log.error("JSON must have string keys: {{keys}}:", keys=[k for k in value.keys()], cause=e)

                Log.error("problem making dict pretty: keys={{keys}}:", keys=[k for k in value.keys()], cause=e)
        elif value in (None, Null):
            return "null"
        elif isinstance(value, (text_type, binary_type)):
            if isinstance(value, binary_type):
                value = utf82unicode(value)
            try:
                return quote(value)
            except Exception as e:
                from mo_logs import Log

                try:
                    # LAST RESORT: ESCAPE CHARACTER-BY-CHARACTER, SKIPPING ANY
                    # CHARACTER THAT CANNOT BE CONVERTED
                    Log.note("try explicit convert of string with length {{length}}", length=len(value))
                    acc = [QUOTE]
                    for c in value:
                        try:
                            try:
                                c2 = ESCAPE_DCT[c]
                            except Exception:
                                c2 = c
                            c3 = text_type(c2)
                            acc.append(c3)
                        except BaseException:
                            pass
                            # Log.warning("odd character {{ord}} found in string.  Ignored.", ord= ord(c)}, cause=g)
                    acc.append(QUOTE)
                    output = u"".join(acc)
                    Log.note("return value of length {{length}}", length=len(output))
                    return output
                except BaseException as f:
                    Log.warning("can not even explicit convert {{type}}", type=f.__class__.__name__, cause=f)
                    return "null"
        elif isinstance(value, list):
            if not value:
                return "[]"

            if ARRAY_MAX_COLUMNS == 1:
                return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

            if len(value) == 1:
                j = pretty_json(value[0])
                if j.find("\n") >= 0:
                    return "[\n" + indent(j) + "\n]"
                else:
                    return "[" + j + "]"

            js = [pretty_json(v) for v in value]
            max_len = max(*[len(j) for j in js])
            if max_len <= ARRAY_ITEM_MAX_LENGTH and max(*[j.find("\n") for j in js]) == -1:
                # ALL TINY VALUES
                num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2)))))  # +2 TO COMPENSATE FOR COMMAS
                if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                    return "[" + PRETTY_COMMA.join(js) + "]"
                if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                    return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

                # PACK RIGHT-JUSTIFIED ITEMS INTO ROWS OF num_columns
                content = ",\n".join(PRETTY_COMMA.join(j.rjust(max_len) for j in js[r:r + num_columns]) for r in xrange(0, len(js), num_columns))
                return "[\n" + indent(content) + "\n]"

            pretty_list = js

            # ONE ITEM PER LINE; KEEP GOING ON CONCATENATION FAILURES
            output = ["[\n"]
            for i, p in enumerate(pretty_list):
                try:
                    if i > 0:
                        output.append(",\n")
                    output.append(indent(p))
                except Exception:
                    from mo_logs import Log

                    Log.warning("problem concatenating string of length {{len1}} and {{len2}}", len1=len("".join(output)), len2=len(p))
            output.append("\n]")
            try:
                return "".join(output)
            except Exception as e:
                from mo_logs import Log

                Log.error("not expected", cause=e)
        elif hasattr(value, '__data__'):
            d = value.__data__()
            return pretty_json(d)
        elif hasattr(value, '__json__'):
            j = value.__json__()
            if j == None:
                return " null "  # TODO: FIND OUT WHAT CAUSES THIS
            return pretty_json(json_decoder(j))
        elif scrub(value) is None:
            return "null"
        elif hasattr(value, '__iter__'):
            return pretty_json(list(value))
        elif hasattr(value, '__call__'):
            return "null"
        else:
            # NUMERIC FALLBACK: TRY EXACT int, THEN float, THEN RAW ENCODER
            try:
                if int(value) == value:
                    return text_type(int(value))
            except Exception:
                pass

            try:
                if float(value) == value:
                    return text_type(float(value))
            except Exception:
                pass

            return pypy_json_encode(value)
    except Exception as e:
        problem_serializing(value, e)
def _normalize(esfilter):
    """
    TODO: DO NOT USE Data, WE ARE SPENDING TOO MUCH TIME WRAPPING/UNWRAPPING

    REALLY, WE JUST COLLAPSE CASCADING `and` AND `or` FILTERS

    Repeatedly simplifies an ES filter until a fixpoint: removes redundant
    must terms, flattens nested must/should, and rewrites term/terms/not
    filters; marks results with isNormal so they are not re-processed.
    """
    if esfilter == MATCH_ALL or esfilter == MATCH_NONE or esfilter.isNormal:
        return esfilter

    # Log.note("from: " + convert.value2json(esfilter))
    isDiff = True

    while isDiff:
        isDiff = False

        if esfilter.bool.must:
            terms = esfilter.bool.must
            # PAIRWISE COMPARISON OF must CLAUSES; suppress_exception GUARDS
            # THE SPECULATIVE ATTRIBUTE ACCESS ON EACH FILTER SHAPE
            for (i0, t0), (i1, t1) in itertools.product(enumerate(terms), enumerate(terms)):
                if i0 == i1:
                    continue  # SAME, IGNORE

                # TERM FILTER ALREADY ASSUMES EXISTENCE
                with suppress_exception:
                    if t0.exists.field != None and t0.exists.field == t1.term.items()[0][0]:
                        terms[i0] = MATCH_ALL
                        continue

                # IDENTICAL CAN BE REMOVED
                with suppress_exception:
                    if t0 == t1:
                        terms[i0] = MATCH_ALL
                        continue

                # MERGE range FILTER WITH SAME FIELD
                if i0 > i1:
                    continue  # SAME, IGNORE
                with suppress_exception:
                    f0, tt0 = t0.range.items()[0]
                    f1, tt1 = t1.range.items()[0]
                    if f0 == f1:
                        set_default(terms[i0].range[literal_field(f1)], tt1)
                        terms[i1] = MATCH_ALL

            # REBUILD must LIST: DROP MATCH_ALL, SHORT-CIRCUIT ON MATCH_NONE,
            # AND FLATTEN NESTED must CLAUSES
            output = []
            for a in terms:
                if isinstance(a, (list, set)):
                    from mo_logs import Log

                    Log.error("and clause is not allowed a list inside a list")

                a_ = _normalize(a)
                if a_ is not a:
                    isDiff = True
                    a = a_
                if a == MATCH_ALL:
                    isDiff = True
                    continue
                if a == MATCH_NONE:
                    return MATCH_NONE
                if a.bool.must:
                    isDiff = True
                    a.isNormal = None
                    output.extend(a.bool.must)
                else:
                    a.isNormal = None
                    output.append(a)
            if not output:
                return MATCH_ALL
            elif len(output) == 1:
                # output[0].isNormal = True
                esfilter = output[0]
                break
            elif isDiff:
                esfilter = wrap({"and": output})
            continue

        if esfilter.bool.should:
            # REBUILD should LIST, FLATTENING NESTED should CLAUSES
            output = []
            for a in esfilter.bool.should:
                a_ = _normalize(a)
                if a_ is not a:
                    isDiff = True
                    a = a_

                if a.bool.should:
                    a.isNormal = None
                    isDiff = True
                    output.extend(a.bool.should)
                else:
                    a.isNormal = None
                    output.append(a)
            if not output:
                return MATCH_NONE
            elif len(output) == 1:
                esfilter = output[0]
                break
            elif isDiff:
                esfilter = wrap({"bool": {"should": output}})
            continue

    if esfilter.term != None:
        if esfilter.term.keys():
            esfilter.isNormal = True
            return esfilter
        else:
            return MATCH_ALL

    if esfilter.terms:
        for k, v in esfilter.terms.items():
            if len(v) > 0:
                if OR(vv == None for vv in v):
                    # None VALUES BECOME A missing FILTER, OR'd WITH THE REST
                    rest = [vv for vv in v if vv != None]
                    if len(rest) > 0:
                        return {
                            "or": [{"missing": {"field": k}}, {"terms": {k: rest}}],
                            "isNormal": True
                        }
                    else:
                        return {"missing": {"field": k}, "isNormal": True}
                else:
                    esfilter.isNormal = True
                    return esfilter
        return MATCH_NONE

    if esfilter['not']:
        _sub = esfilter['not']
        sub = _normalize(_sub)
        if sub == MATCH_NONE:
            return MATCH_ALL
        elif sub == MATCH_ALL:
            return MATCH_NONE
        elif sub is not _sub:
            sub.isNormal = None
            return wrap({"not": sub, "isNormal": True})
        else:
            sub.isNormal = None

    esfilter.isNormal = True
    return esfilter
# NOTE(review): PYTHON-2-ONLY VARIANT (`except BaseException, f:` syntax,
# cmp-style sorted, basestring/unicode).  The visible text ends inside the
# outer `try:` — the remainder of this definition is outside this chunk.
def pretty_json(value):
    """
    RETURN A HUMAN-READABLE, MULTI-LINE JSON ENCODING OF value (PYTHON 2).
    """
    try:
        if value is False:
            return "false"
        elif value is True:
            return "true"
        elif isinstance(value, Mapping):
            try:
                if not value:
                    return "{}"
                items = list(value.items())
                if len(items) == 1:
                    return "{" + unicode_key(items[0][0]) + ": " + pretty_json(items[0][1]).strip() + "}"

                # PY2 cmp-STYLE SORT FOR STABLE KEY ORDER; DROP None VALUES
                items = sorted(items, lambda a, b: value_compare(a[0], b[0]))
                values = [unicode_key(k) + ": " + indent(pretty_json(v)).strip() for k, v in items if v != None]
                return "{\n" + INDENT + (",\n" + INDENT).join(values) + "\n}"
            except Exception as e:
                from mo_logs import Log
                from mo_math import OR

                # NON-STRING KEYS ARE THE MOST LIKELY CAUSE; REPORT THEM FIRST
                if OR(not isinstance(k, basestring) for k in value.keys()):
                    Log.error("JSON must have string keys: {{keys}}:", keys=[k for k in value.keys()], cause=e)

                Log.error("problem making dict pretty: keys={{keys}}:", keys=[k for k in value.keys()], cause=e)
        elif value in (None, Null):
            return "null"
        elif isinstance(value, basestring):
            if isinstance(value, str):
                value = utf82unicode(value)
            try:
                return quote(value)
            except Exception as e:
                from mo_logs import Log

                try:
                    # LAST RESORT: ESCAPE CHARACTER-BY-CHARACTER, SKIPPING ANY
                    # CHARACTER THAT CANNOT BE CONVERTED
                    Log.note("try explicit convert of string with length {{length}}", length=len(value))
                    acc = [u"\""]
                    for c in value:
                        try:
                            try:
                                c2 = ESCAPE_DCT[c]
                            except Exception:
                                c2 = c
                            c3 = unicode(c2)
                            acc.append(c3)
                        except BaseException:
                            pass
                            # Log.warning("odd character {{ord}} found in string.  Ignored.", ord= ord(c)}, cause=g)
                    acc.append(u"\"")
                    output = u"".join(acc)
                    Log.note("return value of length {{length}}", length=len(output))
                    return output
                except BaseException, f:
                    Log.warning("can not even explicit convert {{type}}", type=f.__class__.__name__, cause=f)
                    return "null"
        elif isinstance(value, list):
            if not value:
                return "[]"

            if ARRAY_MAX_COLUMNS == 1:
                return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

            if len(value) == 1:
                j = pretty_json(value[0])
                if j.find("\n") >= 0:
                    return "[\n" + indent(j) + "\n]"
                else:
                    return "[" + j + "]"

            js = [pretty_json(v) for v in value]
            max_len = max(*[len(j) for j in js])
            if max_len <= ARRAY_ITEM_MAX_LENGTH and max(*[j.find("\n") for j in js]) == -1:
                # ALL TINY VALUES
                num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2)))))  # +2 TO COMPENSATE FOR COMMAS
                if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                    return "[" + ", ".join(js) + "]"
                if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                    return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

                # PACK RIGHT-JUSTIFIED ITEMS INTO ROWS OF num_columns
                content = ",\n".join(", ".join(j.rjust(max_len) for j in js[r:r + num_columns]) for r in xrange(0, len(js), num_columns))
                return "[\n" + indent(content) + "\n]"

            pretty_list = js

            # ONE ITEM PER LINE; KEEP GOING ON CONCATENATION FAILURES
            output = ["[\n"]
            for i, p in enumerate(pretty_list):
                try:
                    if i > 0:
                        output.append(",\n")
                    output.append(indent(p))
                except Exception:
                    from mo_logs import Log

                    Log.warning("problem concatenating string of length {{len1}} and {{len2}}", len1=len("".join(output)), len2=len(p))
            output.append("\n]")
            try:
                return "".join(output)
            except Exception as e:
                from mo_logs import Log

                Log.error("not expected", cause=e)
def es_setop(es, mvel, query):
    """
    EXECUTE A SET-OPERATION (NO GROUPING) query AGAINST ES AND RETURN A Data
    WITH THE RESULT CUBE AND THE ES QUERY USED.

    Chooses a query strategy: plain filtered query when the select is simple
    and shallow, otherwise an mvel script facet (deep or computed selects).
    """
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([s.value == None and s.aggregate not in ("count", "none") for s in select])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        if len(select) == 1 and isinstance(select[0].value, LeavesOp):
            # WHOLE-DOCUMENT SELECT: ONLY THE FILTER MATTERS
            FromES = wrap({
                "query": {"bool": {"query": {"match_all": {}}, "filter": query.where.to_esfilter()}},
                "sort": query.sort,
                "size": 0
            })
        elif all(isinstance(v, Variable) for v in select.value):
            # SIMPLE FIELD LIST: ASK ES FOR THE FIELDS DIRECTLY
            FromES = wrap({
                "query": {"bool": {"query": {"match_all": {}}, "filter": query.where.to_esfilter()}},
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        # SHALLOW BUT COMPLEX: SCRIPT FACET WITH where AS A FACET FILTER
        simple_query = query.copy()
        simple_query.where = TRUE  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": jx_expression(query.where).to_esfilter()
        }
    else:
        # DEEP (NESTED) SELECT: SCRIPT FACET OVER THE FULL QUERY
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": jx_expression(query.where).to_esfilter()
        }

    data = es_post(es, FromES, query.limit)

    if len(select) == 1 and isinstance(select[0].value, LeavesOp):
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        # UNPACK FACET RESULTS INTO ONE MATRIX PER select COLUMN
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = transpose(*data_list)
            cube = Cube(select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)})

    return Data(meta={"esquery": FromES}, data=cube)