def __init__(self, **kwargs):
    """
    Statistics holder.

    Accepts either samples= (raw values, from which all moments are derived)
    or any prefix of (count, mean, variance|std, skew, kurtosis).  Moments not
    supplied default to None, except variance, which defaults to 0 once a
    mean is given.
    """
    Dict.__init__(self)

    # start from the all-default state
    self.count = 0
    self.mean = None
    self.variance = None
    self.skew = None
    self.kurtosis = None

    if "samples" in kwargs:
        # derive every moment from the raw samples
        s = ZeroMoment2Stats(ZeroMoment.new_instance(kwargs["samples"]))
        self.count = s.count
        self.mean = s.mean
        self.variance = s.variance
        self.skew = s.skew
        self.kurtosis = s.kurtosis
        return

    # each moment is only meaningful when all lower moments were supplied;
    # stop filling in at the first missing one (matching the original cascade)
    if "count" in kwargs:
        self.count = kwargs["count"]
        if "mean" in kwargs:
            self.mean = kwargs["mean"]
            if "variance" in kwargs or "std" in kwargs:
                self.variance = kwargs["variance"] if "variance" in kwargs else kwargs["std"] ** 2
                if "skew" in kwargs:
                    self.skew = kwargs["skew"]
                    if "kurtosis" in kwargs:
                        self.kurtosis = kwargs["kurtosis"]
            else:
                # a mean without any spread information implies zero variance
                self.variance = 0
def query(self, command):
    """
    WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
    :param command: COMMAND FOR SQLITE
    :return: list OF RESULTS
    """
    done = Signal()
    response = Dict()
    # the worker thread fills `response` and raises `done` when finished
    self.queue.add((command, response, done, None))
    done.wait_for_go()

    if response.exception:
        Log.error("Problem with Sqlite call", cause=response.exception)
    return response
def format_list(T, select, query=None):
    """
    Format query rows T as a list of documents, placing each selected value
    at its put.name/put.child destination (or just put.child for a single
    simple select).
    """
    data = []
    # list-select and leaves-select build identical nested documents
    if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
        for row in T:
            doc = Dict()
            for s in select:
                doc[s.put.name][s.put.child] = unwraplist(row[s.pull])
            data.append(doc if doc else None)
    else:
        for row in T:
            doc = Dict()
            for s in select:
                doc[s.put.child] = unwraplist(row[s.pull])
            data.append(doc if doc else None)
    return Dict(meta={"format": "list"}, data=data)
def format_table_from_aggop(decoders, aggs, start, query, select):
    """
    Format a no-edge aggregation result as a one-row table.
    """
    header = select.name

    # drill down to the innermost _filter/_nested aggregation
    agg = aggs
    deeper = coalesce(agg._filter, agg._nested)
    while deeper:
        agg, deeper = deeper, coalesce(deeper._filter, deeper._nested)

    row = [agg[s.pull] for s in select]
    return Dict(meta={"format": "table"}, header=header, data=[row])
def __iter__(self):
    """
    Yield every row of this table as a Dict, skipping object/nested columns.
    """
    # NOTE(review): the comprehension rebinds `c` — the outer `c` (the column
    # name from items()) is immediately shadowed by each member of `cs`
    columns = [
        c
        for c, cs in self.columns.items()
        for c in cs
        if c.type not in ["object", "nested"]
    ]
    command = "SELECT " + \
        ",\n".join(_quote_column(c) for c in columns) + \
        " FROM " + quote_table(self.name)
    rows = self.db.query(command)
    for r in rows:
        output = Dict()
        # NOTE(review): this unpack expects each entry of `columns` to be a
        # (name, type) pair, but the comprehension above collects column
        # objects — confirm the column type is an iterable pair
        for (k, t), v in zip(columns, r):
            output[k] = v
        yield output
def format_list_from_aggop(decoders, aggs, start, query, select):
    """
    Format a no-edge aggregation as a single value (or single-item list when
    the query has edges/groupby).
    """
    agg = drill(aggs)

    if isinstance(query.select, list):
        item = Dict()
        for s in select:
            item[s.name] = _pull(s, agg)
    else:
        item = _pull(select[0], agg)

    # a bare aggregate (no edges, no groupby) is reported as a scalar
    if not (query.edges or query.groupby):
        return wrap({"meta": {"format": "value"}, "data": item})
    return wrap({"meta": {"format": "list"}, "data": [item]})
def etl_one(settings):
    """
    Run a single ETL pass: queue the keys named on the command line
    (settings.args.id) for each configured worker's source bucket.

    :param settings: full application settings (workers, args.id, param)
    """
    queue = Queue("temp work queue")
    # neutralize transactional hooks on this throwaway in-memory queue
    queue.__setattr__(b"commit", Null)
    queue.__setattr__(b"rollback", Null)

    settings.param.wait_forever = False
    already_in_queue = set()
    for w in settings.workers:
        source = get_container(w.source)
        # source.settings.fast_forward = True
        if id(source) in already_in_queue:
            # this source was already handled for an earlier worker
            continue
        try:
            for i in parse_id_argument(settings.args.id):
                data = source.get_key(i)
                if data != None:
                    already_in_queue.add(id(source))
                    queue.add(Dict(bucket=w.source.bucket, key=i))
        except Exception as e:  # FIX: was py2-only `except Exception, e` syntax
            # missing key is expected for some sources: queue the raw id anyway
            if "Key {{key}} does not exist" in e:
                already_in_queue.add(id(source))
                queue.add(Dict(bucket=w.source.bucket, key=settings.args.id))
            Log.warning("Problem", cause=e)
def data():
    # Generator over aggregation results: one Dict per (coordinate, agg) with
    # the edge values and the selected aggregates.
    # NOTE: closes over new_edges, aggs, decoders, query, select from the
    # enclosing scope.
    dims = tuple(
        len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
        for e in new_edges)
    # NOTE(review): is_sent is written but never read in this fragment —
    # presumably used elsewhere (e.g. to emit missing combinations); confirm
    is_sent = Matrix(dims=dims, zeros=0)
    for row, coord, agg in aggs_iterator(aggs, decoders):
        is_sent[coord] = 1
        output = Dict()
        for e, c, d in zip(query.edges, coord, decoders):
            output[e.name] = d.get_value(c)
        for s in select:
            output[s.name] = _pull(s, agg)
        yield output
def test_end(self, log):
    """
    Record a test-end structured-log line: close out (or synthesize) the
    per-test summary and compute its final ok/status/duration.
    """
    key = literal_field(log.test)
    self.logs[key] += [log]

    test = self.tests[key]
    if not test:
        # end arrived without a matching start; synthesize the record
        self.tests[key] = test = Dict(
            test=log.test,
            start_time=log.time,
            missing_test_start=True
        )

    # test is ok when the status matches expectation (or nothing was expected)
    test.ok = True if log.expected == None or log.expected == log.status else False
    if not all(test.subtests.ok):
        # any failing subtest makes the whole test not ok
        test.ok = False
    test.result = log.status  #TODO: REMOVE ME AFTER November 2015
    test.status = log.status
    test.expected = coalesce(log.expected, log.status)
    test.end_time = log.time
    test.duration = coalesce(test.end_time - test.start_time, log.extra.runtime)
    # NOTE(review): self-assignment — possibly meant log.extra; preserved as-is
    test.extra = test.extra
def dictwrap(v):
    """
    Wrap a value in the project's Dict/DictList types so attribute/dot access
    works uniformly; primitives and already-wrapped values pass through.
    """
    type_ = _get(v, "__class__")

    if type_ is NoneType:
        return None  # So we allow `is None`
    if type_ is dict:
        m = Dict()
        _set(m, "_dict", v)  # INJECT m.__dict__=v SO THERE IS NO COPY
        return m
    if type_ is list:
        return DictList(v)
    if type_ is GeneratorType:
        # wrap lazily, element by element
        return (wrap(vv) for vv in v)
    if isinstance(v, (basestring, int, float, Decimal, datetime, date, Dict, DictList, NullType, NoneType)):
        return v
    # everything else is treated as a plain object facade
    return DictObject(v)
def crash(self, log):
    """
    Record a crash structured-log line: count it, attach it to the test's
    log history, and mark the test as crashed/not-ok.
    """
    self.stats.action.crash += 1
    if not log.test:
        # crash not attributable to any test
        return

    key = literal_field(log.test)
    self.logs[key] += [log]

    test = self.tests[key]
    if not test:
        # crash arrived without a matching start; synthesize the record
        self.tests[key] = test = Dict(
            test=log.test,
            start_time=log.time,
            crash=True,
            missing_test_start=True
        )
    test.ok = False
    test.result = log.status  #TODO: REMOVE ME AFTER November 2015
    test.status = log.status
    test.last_log_time = log.time
    test.missing_test_end = True
def all_content(self):
    # Return the full response body, cached; a file-like cache is rewound so
    # every caller reads from the start.
    # response.content WILL LEAK MEMORY (?BECAUSE OF PYPY"S POOR HANDLING OF GENERATORS?)
    # THE TIGHT, SIMPLE, LOOP TO FILL blocks PREVENTS THAT LEAK
    if self._content is not False:
        # body already materialized; use it directly
        self._cached_content = self._content
    elif self._cached_content is None:
        def read(size):
            # pull decoded bytes straight off the underlying raw stream
            if self.raw._fp.fp is not None:
                return self.raw.read(amt=size, decode_content=True)
            else:
                # stream exhausted; release the connection
                self.close()
                return None

        # safe_size buffers the stream (presumably spilling large bodies to
        # disk — confirm against its implementation)
        self._cached_content = safe_size(Dict(read=read))

    if hasattr(self._cached_content, "read"):
        self._cached_content.seek(0)

    return self._cached_content
def select_one(record, selection):
    """
    APPLY THE selection TO A SINGLE record
    """
    record = wrap(record)
    selection = wrap(selection)

    if isinstance(selection, Mapping):
        selection = wrap(selection)
        return record[selection.value]
    elif isinstance(selection, basestring):
        # a bare field name selects that one value
        return record[selection]
    elif isinstance(selection, list):
        # multi-select: one output attribute per normalized select clause
        result = Dict()
        for clause in selection:
            clause = _normalize_select(clause)
            result[clause.name] = record[clause.value]
        return result
    else:
        Log.error("Do not know how to handle")
def __init__(self, stream, length, _shared=None):
    """
    :param stream: THE STREAM WE WILL GET THE BYTES FROM
    :param length: THE MAX NUMBER OF BYTES WE ARE EXPECTING
    :param _shared: FOR INTERNAL USE TO SHARE THE BUFFER
    :return:
    """
    self.position = 0
    if not _shared:
        # first reader: own the backing temp file and the mmap view over it
        # FIX: previously the TemporaryFile was created unconditionally, so a
        # temp file was leaked every time an existing _shared buffer was passed
        file_ = TemporaryFile()
        self.shared = Dict(
            length=length,
            locker=Lock(),
            stream=stream,
            done_read=0,
            file=file_,
            buffer=mmap(file_.fileno(), length)
        )
    else:
        # share the existing buffer; count this additional reader
        self.shared = _shared
        self.shared.ref_count += 1
def test_51586(self):
    # End-to-end ETL of one captured unittest log (gzipped S3 object fixture),
    # with cProfile tracing enabled for the run.
    debug_settings = {
        "trace": True,
        "cprofile": {
            "enabled": True,
            "filename": "tests/results/test_51586_profile.tab"
        }
    }
    Log.start(debug_settings)

    source_key = "51586_5124145.52"
    content = File("tests/resources/51586_5124145.52.json.gz").read_bytes()
    # fake S3 source: hand the gzipped lines straight to the ETL
    source = Dict(read_lines=lambda: GzipLines(content))

    with Accumulator(File("tests/results/51586_5124145.52.json")) as destination:
        with Timer("ETL file"):
            process_unittest_in_s3(source_key, source, destination, please_stop=None)

    Log.stop()
def format_list(decoders, aggs, start, query, select):
    """
    Format an edged aggregation result as a list of documents, one per
    aggregation coordinate, carrying the edge values and selected aggregates.
    """
    new_edges = count_dim(aggs, decoders)

    def rows():
        shape = tuple(
            len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
            for e in new_edges
        )
        # track which coordinates were emitted
        is_sent = Matrix(dims=shape, zeros=0)
        for row, coord, agg in aggs_iterator(aggs, decoders):
            is_sent[coord] = 1
            rec = Dict()
            for e, c, d in zip(query.edges, coord, decoders):
                rec[e.name] = d.get_value(c)
            for s in select:
                rec[s.name] = _pull(s, agg)
            yield rec

    return Dict(meta={"format": "list"}, data=list(rows()))
def format_table(T, select, source):
    """
    Format rows T as a table: a header of select names plus one value row per
    input row, pulling each value from the given source field.
    """
    header = [s.name for s in select]
    col_index = {s.name: i for i, s in enumerate(select)}  # MAP FROM name TO COLUMN INDEX
    data = []
    for row in T:
        r = [None] * len(header)
        for s in select:
            i = col_index[s.name]
            if s.value == ".":
                # whole-document select
                r[i] = row[source]
            elif source == "_source":
                r[i] = unwraplist(row[source][s.value])
            elif isinstance(s.value, basestring):  # fields
                r[i] = unwraplist(row[source][literal_field(s.value)])
            else:
                r[i] = unwraplist(row[source][literal_field(s.name)])
        data.append(r)
    return Dict(meta={"format": "table"}, header=header, data=data)
def send(self, topic, message):
    """Publishes a pulse message to the proper exchange."""
    if not message:
        Log.error("Expecting a message")

    message._prepare()

    # lazily (re)establish the broker connection
    if not self.connection:
        self.connect()

    producer = Producer(
        channel=self.connection,
        exchange=Exchange(self.settings.exchange, type='topic'),
        routing_key=topic
    )

    # The message is actually a simple envelope format with a payload and
    # some metadata.
    final_data = Dict(
        payload=message.data,
        _meta=set_default(
            {
                'exchange': self.settings.exchange,
                'routing_key': message.routing_key,
                'serializer': self.settings.serializer,
                # timestamp in the broker's configured timezone
                'sent': time_to_string(
                    datetime.datetime.now(
                        timezone(self.settings.broker_timezone))),
                'count': self.count
            },
            message.metadata)
    )

    producer.publish(jsons.scrub(final_data), serializer=self.settings.serializer)
    # running count of messages sent on this connection
    self.count += 1
def write(profile_settings):
    # Write two timestamped profile reports: a summary table per profile, and
    # a per-sample series table (one column per profile, "index" column first).
    from pyLibrary import convert
    from pyLibrary.env.files import File

    profs = list(profiles.values())
    for p in profs:
        # finalize each profile's accumulated statistics
        p.stats = p.stats.end()

    stats = [{
        "description": p.description,
        "num_calls": p.stats.count,
        "total_time": p.stats.count * p.stats.mean,
        "total_time_per_call": p.stats.mean
    } for p in profs if p.stats.count > 0]
    stats_file = File(profile_settings.filename, suffix=convert.datetime2string(datetime.now(), "_%Y%m%d_%H%M%S"))
    if stats:
        stats_file.write(convert.list2tab(stats))
    else:
        stats_file.write("<no profiles>")

    stats_file2 = File(profile_settings.filename, suffix=convert.datetime2string(datetime.now(), "_series_%Y%m%d_%H%M%S"))
    if not profs:
        return

    max_samples = MAX([len(p.samples) for p in profs if p.samples])
    if not max_samples:
        return

    # synthetic "index" profile provides the row numbers for the series table
    r = range(max_samples)
    profs.insert(0, Dict(description="index", samples=r))
    # one row per sample index; profiles with fewer samples yield nulls there
    stats = [{p.description: wrap(p.samples)[i] for p in profs if p.samples} for i in r]
    if stats:
        stats_file2.write(convert.list2tab(stats))
def window(self, window):
    # Annotate this cube with a new column computed by the window expression,
    # evaluated at every coordinate of the existing data.
    if window.edges or window.sort:
        Log.error("not implemented")

    from pyLibrary.queries import jx

    # SET OP
    canonical = self.data.values()[0]
    accessor = jx.get(window.value)
    cnames = self.data.keys()

    # ANNOTATE EXISTING CUBE WITH NEW COLUMN
    m = self.data[window.name] = Matrix(dims=canonical.dims)
    for coord in canonical._all_combos():
        row = Dict()  # IT IS SAD WE MUST HAVE A Dict(), THERE ARE {"script": expression} USING THE DOT NOTATION
        # existing column values at this coordinate
        for k in cnames:
            row[k] = self.data[k][coord]
        # edge partition values at this coordinate
        for c, e in zip(coord, self.edges):
            row[e.name] = e.domain.partitions[c]

        m[coord] = accessor(row, Null, Null)  # DUMMY Null VALUES BECAUSE I DO NOT KNOW WHAT TO DO

    self.select.append(window)
    return self
def stats(values):
    """
    RETURN LOTS OF AGGREGATES
    """
    if values == None:
        return None

    values = values.map(float, includeNone=False)
    z = ZeroMoment.new_instance(values)

    summary = Dict()
    # raw zero-moments, then the derived statistics, merged into one Dict
    for name, value in z.dict.items():
        summary[name] = value
    for name, value in ZeroMoment2Stats(z).items():
        summary[name] = value

    summary.max = MAX(values)
    summary.min = MIN(values)
    summary.median = pyLibrary.maths.stats.median(values, simple=False)
    summary.last = values.last()
    summary.first = values[0]
    # std only when variance is a real (non-NaN) number
    if Math.is_number(summary.variance) and not Math.is_nan(summary.variance):
        summary.std = sqrt(summary.variance)
    return summary
def _output():
    # yield one (group, rows) pair per distinct key in `data`
    # (closes over data, get_key, keys from the enclosing scope)
    for key_values, members in itertools.groupby(data, get_key):
        group = Dict()
        for name, value in zip(keys, key_values):
            group[name] = value
        yield (group, wrap(list(members)))
def selector(d):
    # apply every (name, pull-function) pair to the wrapped record
    # (closes over push_and_pull from the enclosing scope)
    wrapped = wrap(d)
    result = Dict()
    for name, puller in push_and_pull:
        result[name] = puller(wrapped)
    return unwrap(result)
def __init__(self, dim, parent, qb):
    # Build a Dimension from its definition `dim`, optionally fetching its
    # partitions from the backing store via `qb` when they are not declared.
    self.name = dim.name
    self.parent = parent
    self.full_name = join_field(
        split_field(self.parent.full_name) + [self.name])
    dot.set_default(self, dim)
    self.esfilter = dim.esfilter
    self.type = coalesce(dim.type, "set")
    self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
    # index: own setting, else parent's, else the default es index
    self.index = coalesce(dim.index,
                          coalesce(parent, Null).index, qb.es.settings.name)

    if not self.index:
        Log.error("Expecting an index name")

    # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
    self.edges = Dict()
    for e in listwrap(dim.edges):
        new_e = Dimension(e, self, qb)
        self.edges[new_e.full_name] = new_e

    self.partitions = wrap(coalesce(dim.partitions, []))
    parse_partition(self)

    fields = coalesce(dim.field, dim.fields)
    if not fields:
        return  # NO FIELDS TO SEARCH
    elif isinstance(fields, Mapping):
        self.fields = wrap(fields)
        edges = wrap([{
            "name": k,
            "value": v,
            "allowNulls": False
        } for k, v in self.fields.items()])
    else:
        self.fields = listwrap(fields)
        edges = wrap([{
            "name": f,
            "value": f,
            "index": i,
            "allowNulls": False
        } for i, f in enumerate(self.fields)])

    if dim.partitions:
        return  # ALREADY HAVE PARTS
    # NOTE(review): KNOWN - ALGEBRAIC reads as a set difference between two
    # module-level sets — confirm both names are sets of type strings
    if dim.type not in KNOWN - ALGEBRAIC:
        return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

    with Timer("Get parts of {{name}}", {"name": self.name}):
        parts = qb.query({
            "from": self.index,
            "select": {
                "name": "count",
                "aggregate": "count"
            },
            "edges": edges,
            "esfilter": self.esfilter,
            "limit": self.limit
        })
        Log.note("{{name}} has {{num}} parts", name=self.name, num=len(parts))

    d = parts.edges[0].domain

    if dim.path:
        if len(edges) > 1:
            Log.error("Not supported yet")
        # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
        temp = Dict(partitions=[])
        for i, count in enumerate(parts):
            a = dim.path(d.getEnd(d.partitions[i]))
            if not isinstance(a, list):
                Log.error("The path function on " + dim.name +
                          " must return an ARRAY of parts")
            addParts(temp, dim.path(d.getEnd(d.partitions[i])), count, 0)
        self.value = coalesce(dim.value, "name")
        self.partitions = temp.partitions
    elif isinstance(fields, Mapping):
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        partitions = DictList()
        for g, p in parts.groupby(edges):
            if p:
                partitions.append({
                    "value": g,
                    "esfilter": {
                        "and": [{
                            "term": {
                                e.value: g[e.name]
                            }
                        } for e in edges]
                    },
                    "count": int(p)
                })
        self.partitions = partitions
    elif len(edges) == 1:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "esfilter": {
                    "term": {
                        edges[0].value: d.partitions[i].value
                    }
                },
                "count": count
            } for i, count in enumerate(parts)
        ])
        self.order = {p.value: i for i, p in enumerate(self.partitions)}
    elif len(edges) == 2:
        self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
        d2 = parts.edges[1].domain

        # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
        array = parts.data.values(
        )[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

        def edges2value(*values):
            # compose the two edge values into the partition's value
            if isinstance(fields, Mapping):
                output = Dict()
                for e, v in zip(edges, values):
                    output[e.name] = v
                return output
            else:
                return tuple(values)

        self.partitions = wrap([
            {
                "name": str(d.partitions[i].name),  # CONVERT TO STRING
                "value": d.getEnd(d.partitions[i]),
                "esfilter": {
                    "term": {
                        edges[0].value: d.partitions[i].value
                    }
                },
                "count": SUM(subcube),
                "partitions": [
                    {
                        "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                        "value": edges2value(d.getEnd(d.partitions[i]),
                                             d2.getEnd(d2.partitions[j])),
                        "esfilter": {
                            "and": [{
                                "term": {
                                    edges[0].value: d.partitions[i].value
                                }
                            }, {
                                "term": {
                                    edges[1].value: d2.partitions[j].value
                                }
                            }]
                        },
                        "count": count2
                    } for j, count2 in enumerate(subcube)
                    if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                ]
            } for i, subcube in enumerate(array)
        ])
    else:
        Log.error("Not supported")

    parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
def es_setop(es, mvel, query):
    # Execute a set-operation (non-aggregating) query against ES, choosing
    # between a plain filtered query, a fields query, or an MVEL facet script
    # depending on depth/complexity; return a Dict of the ES query and a cube.
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(
        query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([
        s.value == None and s.aggregate not in ("count", "none")
        for s in select
    ])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        # NOTE(review): this parses as (len==1 and not value) or (value=="*")
        # — the `or` side ignores len(select); confirm the precedence is meant
        if len(select) == 1 and not select[0].value or select[0].value == "*":
            FromES = wrap({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": simplify_esfilter(
                            jx_expression(query.where).to_esfilter())
                    }
                },
                "sort": query.sort,
                "size": 1
            })
        elif all(isinstance(v, Variable) for v in select.value):
            FromES = wrap({
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": simplify_esfilter(query.where.to_esfilter())
                    }
                },
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE_FILTER  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter":
            simplify_esfilter(jx_expression(query.where).to_esfilter())
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter":
            simplify_esfilter(jx_expression(query.where).to_esfilter())
        }

    data = es09.util.post(es, FromES, query.limit)

    # same precedence caveat as above
    if len(select) == 1 and not select[0].value or select[0].value == "*":
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            # transpose rows into one column list per select
            output = zip(*data_list)
            cube = Cube(
                select, [],
                {s.name: Matrix(list=output[i])
                 for i, s in enumerate(select)})

    return Dict(meta={"esquery": FromES}, data=cube)
def select(self, fields):
    # Select values out of this FlatList: a single field name yields a list
    # of values; a list of select clauses yields a DictList of records.
    if isinstance(fields, Mapping):
        fields = fields.value

    if isinstance(fields, basestring):
        # RETURN LIST OF VALUES
        if len(split_field(fields)) == 1:
            if self.path[0] == fields:
                return [d[1] for d in self.data]
            else:
                return [d[0][fields] for d in self.data]
        else:
            keys = split_field(fields)
            depth = coalesce(
                MIN([
                    i for i, (k, p) in enumerate(zip(keys, self.path))
                    if k != p
                ]), len(self.path))  # LENGTH OF COMMON PREFIX
            short_key = keys[depth:]

            output = DictList()
            _select1((wrap(d[depth]) for d in self.data), short_key, 0, output)
            return output

    if isinstance(fields, list):
        output = DictList()

        # build (name, extractor) pairs; a callable value is used directly
        meta = []
        for f in fields:
            if hasattr(f.value, "__call__"):
                meta.append((f.name, f.value))
            else:
                meta.append(
                    (f.name, functools.partial(lambda v, d: d[v], f.value)))

        for row in self._values():
            agg = Dict()
            for name, f in meta:
                agg[name] = f(row)
            output.append(agg)

        return output

        # meta = []
        # for f in fields:
        #     keys = split_field(f.value)
        #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))   # LENGTH OF COMMON PREFIX
        #     short_key = join_field(keys[depth:])
        #
        #     meta.append((f.name, depth, short_key))
        #
        # for row in self._data:
        #     agg = Dict()
        #     for name, depth, short_key in meta:
        #         if short_key:
        #             agg[name] = row[depth][short_key]
        #         else:
        #             agg[name] = row[depth]
        #     output.append(agg)
        # return output

    Log.error("multiselect over FlatList not supported")
def test_start(self, log):
    """
    Record a test-start structured-log line: open a fresh per-test record.
    """
    # a test name may arrive as a list of parts; normalize to one string
    if isinstance(log.test, list):
        log.test = " ".join(log.test)
    self.tests[literal_field(log.test)] = Dict(
        test=log.test,
        start_time=log.time
    )
    self.last_subtest = log.time
def __init__(self):
    """Initialize empty per-test and per-log accumulators."""
    Dict.__init__(self)
    self.last_subtest = None
    self.logs = Dict()
    self.tests = Dict()
text = replace_vars(text) data = convert.json2value(text) result = jx.run(data) output_bytes = convert.unicode2utf8(convert.value2json(result)) return wrap({ "status_code": 200, "all_content": output_bytes, "content": output_bytes }) global_settings = jsons.ref.get("file://tests/config/elasticsearch.json") constants.set(global_settings.constants) NEXT = 0 container_types = Dict(elasticsearch=ESUtils, sqlite=SQLiteUtils) # read_alternate_settings utils = None try: filename = os.environ.get("TEST_CONFIG") if filename: global_settings = jsons.ref.get("file://" + filename) else: Log.alert( "No TEST_CONFIG environment variable to point to config file. Using /tests/config/elasticsearch.json" ) if not global_settings.use: Log.error('Must have a {"use": type} set in the config file') utils = container_types[global_settings.use](global_settings)
def Parts2Term(self, domain):
    """
    TERMS ARE ALWAYS ESCAPED SO THEY CAN BE COMPOUNDED WITH PIPE (|)

    CONVERT AN ARRAY OF PARTS{name, esfilter} TO AN MVEL EXPRESSION
    RETURN expression, function PAIR, WHERE
    expression - MVEL EXPRESSION
    function - TAKES RESULT OF expression AND RETURNS PART
    """
    fields = domain.dimension.fields

    term = []
    if len(split_field(self.fromData.name)) == 1 and fields:
        if isinstance(fields, Mapping):
            # CONVERT UNORDERED FIELD DEFS
            jx_fields, es_fields = zip(*[(k, fields[k])
                                         for k in sorted(fields.keys())])
        else:
            jx_fields, es_fields = zip(*[(i, e) for i, e in enumerate(fields)])

        # NO LOOPS BECAUSE QUERY IS SHALLOW
        # DOMAIN IS FROM A DIMENSION, USE IT'S FIELD DEFS TO PULL
        if len(es_fields) == 1:
            def fromTerm(term):
                # single field: the term IS the partition key
                return domain.getPartByKey(term)

            return Dict(head="",
                        body='getDocValue(' +
                        convert.string2quote(domain.dimension.fields[0]) +
                        ')'), fromTerm
        else:
            def fromTerm(term):
                # multi-field: split the pipe-compounded term back into values
                terms = [
                    convert.pipe2value(t)
                    for t in convert.pipe2value(term).split("|")
                ]

                candidate = dict(zip(jx_fields, terms))
                # for/else: return the first partition matching ALL fields
                for p in domain.partitions:
                    for k, t in candidate.items():
                        if p.value[k] != t:
                            break
                    else:
                        return p
                if domain.type in ["uid", "default"]:
                    # open domains grow a new partition for unseen values
                    part = {"value": candidate}
                    domain.partitions.append(part)
                    return part
                else:
                    return Null

            for f in es_fields:
                term.append('Value2Pipe(getDocValue(' +
                            convert.string2quote(f) + '))')

            return Dict(head="",
                        body='Value2Pipe(' + ('+"|"+'.join(term)) +
                        ')'), fromTerm
    else:
        # deep/filter-defined parts: emit a chain of MVEL conditionals,
        # falling through to the NULL partition's key
        for v in domain.partitions:
            term.append("if (" +
                        _where(v.esfilter, lambda x: self._translate(x)) +
                        ") " + value2MVEL(domain.getKey(v)) + "; else ")
        term.append(value2MVEL(domain.getKey(domain.NULL)))

        # NOTE(review): func_name is assigned but never used — confirm intent
        func_name = "_temp" + UID()
        return self.register_function("+\"|\"+".join(term))
def _convert_from(self, frum):
    # a plain (non-Mapping) `from` clause is converted directly;
    # a Mapping has its name converted and re-wrapped
    if not isinstance(frum, Mapping):
        return self.convert(frum)
    return Dict(name=self.convert(frum.name))