def _update_cardinality(self, column):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    now = Date.now()
    if column.es_index in self.index_does_not_exist:
        return

    if column.jx_type in STRUCT:
        Log.error("not supported")
    try:
        if column.es_index == "meta.columns":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.columns, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.columns),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        if column.es_index == "meta.tables":
            partitions = jx.sort([
                g[column.es_column]
                for g, _ in jx.groupby(self.meta.tables, column.es_column)
                if g[column.es_column] != None
            ])
            self.meta.columns.update({
                "set": {
                    "partitions": partitions,
                    "count": len(self.meta.tables),
                    "cardinality": len(partitions),
                    "multi": 1,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return

        es_index = column.es_index.split(".")[0]
        is_text = [
            cc
            for cc in self.meta.columns
            if cc.es_column == column.es_column and cc.es_type == "text"
        ]
        if is_text:
            # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": {"filter": {"match_all": {}}}},
                "size": 0
            })
            count = result.hits.total
            cardinality = max(1001, count)
            multi = 1001
        elif column.es_column == "_id":
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "query": {"match_all": {}},
                "size": 0
            })
            count = cardinality = result.hits.total
            multi = 1
        elif column.es_type == BOOLEAN:
            result = self.es_cluster.post("/" + es_index + "/_search", data={
                "aggs": {"count": _counting_query(column)},
                "size": 0
            })
            count = result.hits.total
            cardinality = 2

            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": [False, True],
                    "multi": 1,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        else:
            es_query = {
                "aggs": {
                    "count": _counting_query(column),
                    "_filter": {
                        "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                        "filter": {"bool": {"should": [
                            {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                            {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                        ]}}
                    }
                },
                "size": 0
            }

            result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
            agg_results = result.aggregations
            count = result.hits.total
            cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
            multi = int(coalesce(agg_results._filter.multi.value, 1))
            if cardinality == None:
                Log.error("logic error")

        query = Data(size=0)

        if column.es_column == "_id":
            self.meta.columns.update({
                "set": {
                    "count": cardinality,
                    "cardinality": cardinality,
                    "multi": 1,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "last_updated": now
                },
                "clear": ["partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            return
        elif len(column.nested_path) != 1:
            query.aggs["_"] = {
                "nested": {"path": column.nested_path[0]},
                "aggs": {"_nested": {"terms": {"field": column.es_column}}}
            }
        elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
            query.aggs["_"] = {"terms": {"field": column.es_column}}
        else:
            query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

        result = self.es_cluster.post("/" + es_index + "/_search", data=query)

        aggs = result.aggregations._
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        DEBUG and Log.note(
            "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
            id=id(column), column=column, time=now
        )
        self.meta.columns.update({
            "set": {
                "count": count,
                "cardinality": cardinality,
                "multi": multi,
                "partitions": parts,
                "last_updated": now
            },
            "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
        })
    except Exception as e:
        # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
        # from tests.test_jx import TEST_TABLE
        e = Except.wrap(e)
        TEST_TABLE = "testdata"
        is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
        is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
        if is_missing_index:
            # WE EXPECT TEST TABLES TO DISAPPEAR
            Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index}}
            })
            self.index_does_not_exist.add(column.es_index)
        elif "No field found for" in e:
            self.meta.columns.update({
                "clear": ".",
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
        else:
            self.meta.columns.update({
                "set": {"last_updated": now},
                "clear": ["count", "cardinality", "multi", "partitions"],
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
            Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
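# A sketch of the command shape used by every metadata write above (the index and
# column names below are illustrative only):
#
#     self.meta.columns.update({
#         "set": {"count": count, "cardinality": cardinality, "multi": multi, "last_updated": now},
#         "clear": ["partitions"],   # PROPERTIES TO ERASE
#         "where": {"eq": {"es_index": "some_index", "es_column": "some.column"}}
#     })
#
# "set" assigns properties, "clear" erases them, and "where" selects the affected
# column records.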
def Parts2Term(self, domain):
    """
    TERMS ARE ALWAYS ESCAPED SO THEY CAN BE COMPOUNDED WITH PIPE (|)

    CONVERT AN ARRAY OF PARTS{name, esfilter} TO AN MVEL EXPRESSION
    RETURN expression, function PAIR, WHERE
        expression - MVEL EXPRESSION
        function - TAKES RESULT OF expression AND RETURNS PART
    """
    fields = domain.dimension.fields

    term = []
    if len(split_field(self.fromData.name)) == 1 and fields:
        if isinstance(fields, Mapping):
            # CONVERT UNORDERED FIELD DEFS
            jx_fields, es_fields = transpose(*[(k, fields[k]) for k in sorted(fields.keys())])
        else:
            jx_fields, es_fields = transpose(*[(i, e) for i, e in enumerate(fields)])

        # NO LOOPS BECAUSE QUERY IS SHALLOW
        # DOMAIN IS FROM A DIMENSION, USE ITS FIELD DEFS TO PULL
        if len(es_fields) == 1:
            def fromTerm(term):
                return domain.getPartByKey(term)

            return Data(
                head="",
                body='getDocValue(' + quote(domain.dimension.fields[0]) + ')'
            ), fromTerm
        else:
            def fromTerm(term):
                terms = [convert.pipe2value(t) for t in convert.pipe2value(term).split("|")]

                candidate = dict(zip(jx_fields, terms))
                for p in domain.partitions:
                    for k, t in candidate.items():
                        if p.value[k] != t:
                            break
                    else:
                        return p
                if domain.type in ["uid", "default"]:
                    part = {"value": candidate}
                    domain.partitions.append(part)
                    return part
                else:
                    return Null

            for f in es_fields:
                term.append('Value2Pipe(getDocValue(' + quote(f) + '))')

            return Data(
                head="",
                body='Value2Pipe(' + ('+"|"+'.join(term)) + ')'
            ), fromTerm
    else:
        for v in domain.partitions:
            term.append("if (" + _where(v.esfilter, lambda x: self._translate(x)) + ") " + value2MVEL(domain.getKey(v)) + "; else ")
        term.append(value2MVEL(domain.getKey(domain.NULL)))

        func_name = "_temp" + UID()
        return self.register_function("+\"|\"+".join(term))
def _set_op(self, query, frum): # GET LIST OF COLUMNS frum_path = split_field(frum) primary_nested_path = join_field(frum_path[1:]) vars_ = UNION([v.var for select in listwrap(query.select) for v in select.value.vars()]) schema = self.sf.tables[primary_nested_path].schema active_columns = {".": set()} for v in vars_: for c in schema.leaves(v): nest = c.nested_path[0] active_columns.setdefault(nest, set()).add(c) # ANY VARS MENTIONED WITH NO COLUMNS? for v in vars_: if not any(startswith_field(cname, v) for cname in schema.keys()): active_columns["."].add(Column( names={".": v}, type="null", es_column=".", es_index=".", nested_path=["."] )) # EVERY COLUMN, AND THE INDEX IT TAKES UP index_to_column = {} # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE) index_to_uid = {} # FROM NESTED PATH TO THE INDEX OF UID sql_selects = [] # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE) nest_to_alias = { nested_path: "__" + unichr(ord('a') + i) + "__" for i, (nested_path, sub_table) in enumerate(self.sf.tables.items()) } sorts = [] if query.sort: for select in query.sort: col = select.value.to_sql(schema)[0] for t, sql in col.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) # SQL HAS ABS TABLE REFERENCE column_alias = _make_column_name(column_number) sql_selects.append(sql_alias(sql, column_alias)) if select.sort == -1: sorts.append(column_alias + SQL_IS_NOT_NULL) sorts.append(column_alias + " DESC") else: sorts.append(column_alias + SQL_IS_NULL) sorts.append(column_alias) primary_doc_details = Data() # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED nested_path = [] for step, sub_table in self.sf.tables.items(): nested_path.insert(0, step) nested_doc_details = { "sub_table": sub_table, "children": [], "index_to_column": {}, "nested_path": nested_path } # INSERT INTO TREE if not primary_doc_details: primary_doc_details = nested_doc_details else: def place(parent_doc_details): if startswith_field(step, parent_doc_details['nested_path'][0]): for c in parent_doc_details['children']: if place(c): return True parent_doc_details['children'].append(nested_doc_details) place(primary_doc_details) alias = nested_doc_details['alias'] = nest_to_alias[step] # WE ALWAYS ADD THE UID column_number = index_to_uid[step] = nested_doc_details['id_coord'] = len(sql_selects) sql_select = join_column(alias, quoted_UID) sql_selects.append(sql_alias(sql_select, _make_column_name(column_number))) if step != ".": # ID AND ORDER FOR CHILD TABLES index_to_column[column_number] = ColumnMapping( sql=sql_select, type="number", nested_path=nested_path, column_alias=_make_column_name(column_number) ) column_number = len(sql_selects) sql_select = join_column(alias, quoted_ORDER) sql_selects.append(sql_alias(sql_select, _make_column_name(column_number))) index_to_column[column_number] = ColumnMapping( sql=sql_select, type="number", nested_path=nested_path, column_alias=_make_column_name(column_number) ) # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM if step not in active_columns: continue # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE si = 0 for select in listwrap(query.select): try: column_number = len(sql_selects) select.pull = get_column(column_number) db_columns = select.value.partial_eval().to_sql(schema) for column in db_columns: if isinstance(column.nested_path, list): column.nested_path = column.nested_path[0] # IN THE EVENT THIS "column" IS MULTIVALUED for t, 
unsorted_sql in column.sql.items(): json_type = sql_type_to_json_type[t] if json_type in STRUCT: continue column_number = len(sql_selects) column_alias = _make_column_name(column_number) sql_selects.append(sql_alias(unsorted_sql, column_alias)) if startswith_field(primary_nested_path, step) and isinstance(select.value, LeavesOp): # ONLY FLATTEN primary_nested_path AND PARENTS, NOT CHILDREN index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=literal_field(get_property_name(concat_field(select.name, column.name))), push_child=".", push_column_name=get_property_name(concat_field(select.name, column.name)), push_column=si, pull=get_column(column_number), sql=unsorted_sql, type=json_type, column_alias=column_alias, nested_path=nested_path ) si += 1 else: index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping( push_name=select.name, push_child=column.name, push_column_name=select.name, push_column=si, pull=get_column(column_number), sql=unsorted_sql, type=json_type, column_alias=column_alias, nested_path=nested_path ) finally: si += 1 where_clause = BooleanOp("boolean", query.where).partial_eval().to_sql(schema, boolean=True)[0].sql.b unsorted_sql = self._make_sql_for_one_nest_in_set_op( ".", sql_selects, where_clause, active_columns, index_to_column ) for n, _ in self.sf.tables.items(): sorts.append(quote_column(COLUMN + text_type(index_to_uid[n]))) ordered_sql = ( SQL_SELECT + "*" + SQL_FROM + sql_iso(unsorted_sql) + SQL_ORDERBY + sql_list(sorts) + SQL_LIMIT + quote_value(query.limit) ) self.db.create_new_functions() # creating new functions: regexp result = self.db.query(ordered_sql) def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord): """ :param rows: REVERSED STACK OF ROWS (WITH push() AND pop()) :param row: CURRENT ROW BEING EXTRACTED :param nested_doc_details: { "nested_path": wrap_nested_path(nested_path), "index_to_column": map from column number to column details "children": all possible direct decedents' nested_doc_details } :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop) :param parent_id_coord: the column number for the parent id (so we ca extract from each row) :return: the nested property (usually an array) """ previous_doc_id = None doc = Data() output = [] id_coord = nested_doc_details['id_coord'] while True: doc_id = row[id_coord] if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id): rows.append(row) # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc) return output if doc_id != previous_doc_id: previous_doc_id = doc_id doc = Data() curr_nested_path = nested_doc_details['nested_path'][0] index_to_column = nested_doc_details['index_to_column'].items() if index_to_column: for i, c in index_to_column: value = row[i] if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path = concat_field(c.push_name, c.push_child) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path = c.push_child if relative_path == ".": if value == '': doc = Null else: doc = value elif value != None and value != '': doc[relative_path] = value for child_details in nested_doc_details['children']: # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS child_id = row[child_details['id_coord']] if child_id is not None: nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord) if 
nested_value: push_name = child_details['nested_path'][0] if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path = relative_field(push_name, curr_nested_path) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path = "." if relative_path == "." and doc is Null: doc = nested_value elif relative_path == ".": doc = unwraplist(nested_value) else: doc[relative_path] = unwraplist(nested_value) output.append(doc) try: row = rows.pop() except IndexError: return output cols = tuple([i for i in index_to_column.values() if i.push_name != None]) rows = list(reversed(unwrap(result.data))) if rows: row = rows.pop() data = _accumulate_nested(rows, row, primary_doc_details, None, None) else: data = result.data if query.format == "cube": for f, _ in self.sf.tables.items(): if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)): num_rows = len(result.data) num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0 map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = [[None] * num_rows for _ in range(num_cols)] for rownum, d in enumerate(result.data): for c in cols: if c.push_child == ".": temp_data[c.push_column][rownum] = c.pull(d) else: column = temp_data[c.push_column][rownum] if column is None: column = temp_data[c.push_column][rownum] = {} column[c.push_child] = c.pull(d) output = Data( meta={"format": "cube"}, data={n: temp_data[c] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) return output if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): num_rows = len(data) temp_data = {c.push_column_name: [None] * num_rows for c in cols} for rownum, d in enumerate(data): for c in cols: temp_data[c.push_column_name][rownum] = d[c.push_name] return Data( meta={"format": "cube"}, data=temp_data, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) else: num_rows = len(data) map_index_to_name = {c.push_column: c.push_column_name for c in cols} temp_data = [data] return Data( meta={"format": "cube"}, data={n: temp_data[c] for c, n in map_index_to_name.items()}, edges=[{ "name": "rownum", "domain": { "type": "rownum", "min": 0, "max": num_rows, "interval": 1 } }] ) elif query.format == "table": for f, _ in self.sf.tables.items(): if frum.endswith(f): num_column = MAX([c.push_column for c in cols]) + 1 header = [None] * num_column for c in cols: header[c.push_column] = c.push_column_name output_data = [] for d in result.data: row = [None] * num_column for c in cols: set_column(row, c.push_column, c.push_child, c.pull(d)) output_data.append(row) return Data( meta={"format": "table"}, header=header, data=output_data ) if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): column_names = [None] * (max(c.push_column for c in cols) + 1) for c in cols: column_names[c.push_column] = c.push_column_name temp_data = [] for rownum, d in enumerate(data): row = [None] * len(column_names) for c in cols: row[c.push_column] = d[c.push_name] temp_data.append(row) return Data( meta={"format": "table"}, header=column_names, data=temp_data ) else: column_names = listwrap(query.select).name return Data( meta={"format": "table"}, header=column_names, data=[[d] for d in data] ) else: for f, _ in self.sf.tables.items(): if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)): data = 
[] for d in result.data: row = Data() for c in cols: if c.push_child == ".": row[c.push_name] = c.pull(d) elif c.num_push_columns: tuple_value = row[c.push_name] if not tuple_value: tuple_value = row[c.push_name] = [None] * c.num_push_columns tuple_value[c.push_child] = c.pull(d) else: row[c.push_name][c.push_child] = c.pull(d) data.append(row) return Data( meta={"format": "list"}, data=data ) if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): temp_data = [] for rownum, d in enumerate(data): row = {} for c in cols: row[c.push_column_name] = d[c.push_name] temp_data.append(row) return Data( meta={"format": "list"}, data=temp_data ) else: return Data( meta={"format": "list"}, data=data )
def _max(depth, cube):
    if depth == 0:
        return cube
    elif depth == 1:
        return _MAX(cube)
    else:
        return _MAX(_max(depth - 1, c) for c in cube)


def _min(depth, cube):
    if depth == 0:
        return cube
    elif depth == 1:
        return _MIN(cube)
    else:
        return _MIN(_min(depth - 1, c) for c in cube)


aggregates = Data(max=_max, maximum=_max, min=_min, minimum=_min)


def _iter(cube, depth):
    if depth == 1:
        return cube.__iter__()
    else:
        def iterator():
            for c in cube:
                for b in _iter(c, depth - 1):
                    yield b

        return iterator()
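# A sketch of how depth is interpreted, assuming _MAX/_MIN act like max()/min()
# over an iterable (values below are illustrative):
#
#     cube = [[1, 5], [3, 4]]
#     _max(0, cube)            # -> [[1, 5], [3, 4]]  (NO AGGREGATION)
#     _max(1, [1, 5])          # -> 5                 (AGGREGATE ONE LEVEL)
#     aggregates.max(2, cube)  # -> 5                 (AGGREGATE ROWS, THEN RESULTS)
#     list(_iter(cube, 2))     # -> [1, 5, 3, 4]      (DEPTH-2 ITERATION FLATTENS ONE LEVEL)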
def apply_diff(text, diff, reverse=False, verify=True):
    """
    SOME EXAMPLES OF diff
    #@@ -1 +1 @@
    #-before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    #+before china goes live (end January developer release, June general audience release) , the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    @@ -0,0 +1,3 @@
    +before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    +
    +kward has the details.
    @@ -1 +1 @@
    -before china goes live (end January developer release, June general audience release), the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    +before china goes live , the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    @@ -3 +3,6 @@
    -kward has the details.+kward has the details.
    +
    +Target Release Dates :
    +https://mana.mozilla.org/wiki/display/PM/Firefox+OS+Wave+Launch+Cross+Functional+View
    +
    +Content Team Engagement & Tasks : https://appreview.etherpad.mozilla.org/40
    """
    if not diff:
        return text
    output = text
    hunks = [
        (new_diff[start_hunk], new_diff[start_hunk + 1:end_hunk])
        for new_diff in [[
            d.lstrip() for d in diff if d.lstrip() and d != "\\ No newline at end of file"
        ] + ["@@"]]  # ANOTHER REPAIR
        for start_hunk, end_hunk in pairwise(i for i, l in enumerate(new_diff) if l.startswith('@@'))
    ]
    for header, hunk_body in (reversed(hunks) if reverse else hunks):
        matches = DIFF_PREFIX.match(header.strip())
        if not matches:
            if not _Log:
                _late_import()
            _Log.error("Can not handle \n---\n{{diff}}\n---\n", diff=diff)

        removes = tuple(int(i.strip()) for i in matches.group(1).split(","))  # EXPECTING start_line, length TO REMOVE
        remove = Data(start=removes[0], length=1 if len(removes) == 1 else removes[1])  # ASSUME FIRST LINE
        adds = tuple(int(i.strip()) for i in matches.group(2).split(","))  # EXPECTING start_line, length TO ADD
        add = Data(start=adds[0], length=1 if len(adds) == 1 else adds[1])

        if add.length == 0 and add.start == 0:
            add.start = remove.start

        def repair_hunk(hunk_body):
            # THE LAST DELETED LINE MAY MISS A "\n" MEANING THE FIRST
            # ADDED LINE WILL BE APPENDED TO THE LAST DELETED LINE
            # EXAMPLE: -kward has the details.+kward has the details.
            # DETECT THIS PROBLEM FOR THIS HUNK AND FIX THE DIFF
            if reverse:
                last_lines = [
                    o
                    for b, o in zip(reversed(hunk_body), reversed(output))
                    if b != "+" + o
                ]
                if not last_lines:
                    return hunk_body

                last_line = last_lines[0]
                for problem_index, problem_line in enumerate(hunk_body):
                    if problem_line.startswith('-') and problem_line.endswith('+' + last_line):
                        split_point = len(problem_line) - (len(last_line) + 1)
                        break
                    elif problem_line.startswith('+' + last_line + "-"):
                        split_point = len(last_line) + 1
                        break
                else:
                    return hunk_body
            else:
                if not output:
                    return hunk_body
                last_line = output[-1]
                for problem_index, problem_line in enumerate(hunk_body):
                    if problem_line.startswith('+') and problem_line.endswith('-' + last_line):
                        split_point = len(problem_line) - (len(last_line) + 1)
                        break
                    elif problem_line.startswith('-' + last_line + "+"):
                        split_point = len(last_line) + 1
                        break
                else:
                    return hunk_body

            new_hunk_body = (
                hunk_body[:problem_index] +
                [problem_line[:split_point], problem_line[split_point:]] +
                hunk_body[problem_index + 1:]
            )
            return new_hunk_body

        hunk_body = repair_hunk(hunk_body)

        if reverse:
            new_output = (
                output[:add.start - 1] +
                [d[1:] for d in hunk_body if d and d[0] == '-'] +
                output[add.start + add.length - 1:]
            )
        else:
            new_output = (
                output[:add.start - 1] +
                [d[1:] for d in hunk_body if d and d[0] == '+'] +
                output[add.start + remove.length - 1:]
            )
        output = new_output

    if verify:
        original = apply_diff(output, diff, not reverse, False)
        if set(text) != set(original):  # bugzilla-etl diffs are a jumble
            for t, o in zip_longest(text, original):
                if t in ['reports: https://goo.gl/70o6w6\r']:
                    break  # KNOWN INCONSISTENCIES
                if t != o:
                    if not _Log:
                        _late_import()
                    _Log.error("logical verification check failed")
                    break

    return output
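# A minimal usage sketch (hypothetical inputs): apply_diff() operates on lists of
# lines, and the line numbers in the hunk headers are 1-based.
#
#     old = ["hello world"]
#     patch = ["@@ -1 +1,2 @@", "-hello world", "+hello there", "+second line"]
#     apply_diff(old, patch)                                            # -> ["hello there", "second line"]
#     apply_diff(["hello there", "second line"], patch, reverse=True)   # -> ["hello world"]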
def update(self, command): """ :param command: EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT """ command = wrap(command) # REJECT DEEP UPDATES touched_columns = command.set.keys() | set(listwrap(command['clear'])) for c in self.schema.columns: if c.name in touched_columns and len(c.nested_path) > 1: Log.error("Deep update not supported") # ADD NEW COLUMNS where = jx_expression(command.where) _vars = where.vars() _map = { v: c.es_column for v in _vars for c in self.columns.get(v, Null) if c.jx_type not in STRUCT } where_sql = where.map(_map).to_sql(self.schema) new_columns = set(command.set.keys()) - set(self.columns.keys()) for new_column_name in new_columns: nested_value = command.set[new_column_name] ctype = get_jx_type(nested_value) column = Column( name=new_column_name, jx_type=ctype, es_index=self.name, es_type=json_type_to_sqlite_type(ctype), es_column=typed_column(new_column_name, ctype), last_updated=Date.now() ) self.add_column(column) # UPDATE THE NESTED VALUES for nested_column_name, nested_value in command.set.items(): if get_jx_type(nested_value) == "nested": nested_table_name = concat_field(self.name, nested_column_name) nested_table = nested_tables[nested_column_name] self_primary_key = sql_list(quote_column(c.es_column) for u in self.uid for c in self.columns[u]) extra_key_name = UID + text(len(self.uid)) extra_key = [e for e in nested_table.columns[extra_key_name]][0] sql_command = ( SQL_DELETE + SQL_FROM + quote_column(nested_table.name) + SQL_WHERE + "EXISTS" + sql_iso( SQL_SELECT + SQL_ONE + SQL_FROM + sql_alias(quote_column(nested_table.name), "n") + SQL_INNER_JOIN + sql_iso( SQL_SELECT + self_primary_key + SQL_FROM + quote_column(abs_schema.fact) + SQL_WHERE + where_sql ) + " t ON " + SQL_AND.join( quote_column("t", c.es_column) + SQL_EQ + quote_column("n", c.es_column) for u in self.uid for c in self.columns[u] ) ) ) self.db.execute(sql_command) # INSERT NEW RECORDS if not nested_value: continue doc_collection = {} for d in listwrap(nested_value): nested_table.flatten(d, Data(), doc_collection, path=nested_column_name) prefix = SQL_INSERT + quote_column(nested_table.name) + sql_iso(sql_list( [self_primary_key] + [quote_column(extra_key)] + [ quote_column(c.es_column) for c in doc_collection.get(".", Null).active_columns ] )) # BUILD THE PARENT TABLES parent = ( SQL_SELECT + self_primary_key + SQL_FROM + quote_column(abs_schema.fact) + SQL_WHERE + jx_expression(command.where).to_sql(schema) ) # BUILD THE RECORDS children = SQL_UNION_ALL.join( SQL_SELECT + sql_alias(quote_value(i), extra_key.es_column) + SQL_COMMA + sql_list( sql_alias(quote_value(row[c.name]), quote_column(c.es_column)) for c in doc_collection.get(".", Null).active_columns ) for i, row in enumerate(doc_collection.get(".", Null).rows) ) sql_command = ( prefix + SQL_SELECT + sql_list( [quote_column("p", c.es_column) for u in self.uid for c in self.columns[u]] + [quote_column("c", extra_key)] + [quote_column("c", c.es_column) for c in doc_collection.get(".", Null).active_columns] ) + SQL_FROM + sql_iso(parent) + " p" + SQL_INNER_JOIN + sql_iso(children) + " c" + SQL_ON + SQL_TRUE ) self.db.execute(sql_command) # THE CHILD COLUMNS COULD HAVE EXPANDED # ADD COLUMNS TO SELF for n, cs in nested_table.columns.items(): for c in cs: column = Column( name=c.name, jx_type=c.jx_type, es_type=c.es_type, es_index=c.es_index, es_column=c.es_column, nested_path=[nested_column_name] + c.nested_path, last_updated=Date.now() ) if c.name not in self.columns: self.columns[column.name] = {column} elif 
c.jx_type not in [c.jx_type for c in self.columns[c.name]]: self.columns[column.name].add(column) command = ( SQL_UPDATE + quote_column(abs_schema.fact) + SQL_SET + sql_list( [ quote_column(c) + SQL_EQ + quote_value(get_if_type(v, c.jx_type)) for k, v in command.set.items() if get_jx_type(v) != "nested" for c in self.columns[k] if c.jx_type != "nested" and len(c.nested_path) == 1 ] + [ quote_column(c) + SQL_EQ + SQL_NULL for k in listwrap(command['clear']) if k in self.columns for c in self.columns[k] if c.jx_type != "nested" and len(c.nested_path) == 1 ] ) + SQL_WHERE + where_sql ) self.db.execute(command)
def __init__(self, header=None, data=None):
    self.header = header
    self.data = data
    self.meta = Data()
def __init__(
    self,
    hg=None,  # hg CONNECTION INFO
    repo=None,  # CONNECTION INFO FOR ES CACHE
    use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
    kwargs=None,
):
    if not _hg_branches:
        _late_imports()

    if not is_text(repo.index):
        Log.error("Expecting 'index' parameter")
    self.repo_locker = Lock()
    self.moves_locker = Lock()
    self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)
    self.settings = kwargs
    self.hg = Data(
        url=hg.url,
        timeout=Duration(coalesce(hg.timeout, "30second")).seconds,
        retry={"times": 3, "sleep": DAEMON_HG_INTERVAL},
    )
    self.last_cache_miss = Date.now()

    # VERIFY CONNECTIVITY
    with Explanation("Test connect with hg"):
        http.head(self.settings.hg.url)

    set_default(repo, {
        "type": "revision",
        "schema": revision_schema,
    })
    kwargs.branches = set_default(
        {"index": repo.index + "-branches", "type": "branch"},
        repo,
    )
    moves = set_default(
        {"index": repo.index + "-moves"},
        repo,
    )

    self.branches = _hg_branches.get_branches(kwargs=kwargs)
    cluster = elasticsearch.Cluster(kwargs=repo)
    self.repo = cluster.get_or_create_index(kwargs=repo)
    self.moves = cluster.get_or_create_index(kwargs=moves)

    def setup_es(please_stop):
        with suppress_exception:
            self.repo.add_alias()
        with suppress_exception:
            self.moves.add_alias()
        with suppress_exception:
            self.repo.set_refresh_interval(seconds=1)
        with suppress_exception:
            self.moves.set_refresh_interval(seconds=1)

    Thread.run("setup_es", setup_es)
    Thread.run("hg daemon", self._daemon)
def format_list(decoders, aggs, start, query, select):
    new_edges = count_dim(aggs, decoders)

    def data():
        dims = tuple(
            len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
            for e in new_edges
        )
        is_sent = Matrix(dims=dims, zeros=0)

        if query.sort and not query.groupby:
            # TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE
            all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
            for row, coord, agg in aggs_iterator(aggs, decoders):
                missing_coord = all_coord.next()
                while coord != missing_coord:
                    # INSERT THE MISSING COORDINATE INTO THE GENERATION
                    output = Data()
                    for i, d in enumerate(decoders):
                        output[query.edges[i].name] = d.get_value(missing_coord[i])
                    for s in select:
                        if s.aggregate == "count":
                            output[s.name] = 0
                    yield output
                    missing_coord = all_coord.next()

                output = Data()
                for e, c, d in zip(query.edges, coord, decoders):
                    output[e.name] = d.get_value(c)
                for s in select:
                    output[s.name] = s.pull(agg)
                yield output
        else:
            is_sent = Matrix(dims=dims, zeros=0)
            for row, coord, agg in aggs_iterator(aggs, decoders):
                is_sent[coord] = 1

                output = Data()
                for e, c, d in zip(query.edges, coord, decoders):
                    output[e.name] = d.get_value(c)
                for s in select:
                    output[s.name] = s.pull(agg)
                yield output

            # EMIT THE MISSING CELLS IN THE CUBE
            if not query.groupby:
                for c, v in is_sent:
                    if not v:
                        output = Data()
                        for i, d in enumerate(decoders):
                            output[query.edges[i].name] = d.get_value(c[i])
                        for s in select:
                            if s.aggregate == "count":
                                output[s.name] = 0
                        yield output

    output = Data(
        meta={"format": "list"},
        data=list(data())
    )
    return output
Log.error("Can not deal with date like {{date|json}}", date=date) def minimize_repo(repo): """ RETURN A MINIMAL VERSION OF THIS CHANGESET """ if repo == None: return Null output = wrap(_copy_but(repo, _exclude_from_repo)) output.changeset.description = strings.limit(output.changeset.description, 1000) return output _exclude_from_repo = Data() for k in [ "changeset.files", "changeset.diff", "changeset.moves", "etl", "branch.last_used", "branch.description", "branch.etl", "branch.parent_name", "children", "parents", "phase", "bookmarks", "tags", ]:
def _get_from_hg(self, revision, locale=None, get_diff=False, get_moves=True):
    # RATE LIMIT CALLS TO HG (CACHE MISSES)
    next_cache_miss = self.last_cache_miss + (Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
    self.last_cache_miss = Date.now()
    if next_cache_miss > self.last_cache_miss:
        Log.note(
            "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
            seconds=next_cache_miss - self.last_cache_miss,
        )
        Till(till=next_cache_miss.unix).wait()

    # CLEAN UP BRANCH NAME
    found_revision = copy(revision)
    if isinstance(found_revision.branch, (text, binary_type)):
        lower_name = found_revision.branch.lower()
    else:
        lower_name = found_revision.branch.name.lower()

    if not lower_name:
        Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

    b = found_revision.branch = self.branches[(lower_name, locale)]
    if not b:
        b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
        if not b:
            Log.warning(
                "can not find branch ({{branch}}, {{locale}})",
                branch=lower_name,
                locale=locale,
            )
            return Null

    # REFRESH BRANCHES, IF TOO OLD
    if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
        self.branches = _hg_branches.get_branches(kwargs=self.settings)

    # FIND THE PUSH
    push = self._get_push(found_revision.branch, found_revision.changeset.id)
    id12 = found_revision.changeset.id[0:12]
    base_url = URL(found_revision.branch.url)

    with Explanation("get revision from {{url}}", url=base_url, debug=DEBUG):
        raw_rev2 = Null
        automation_details = Null
        try:
            raw_rev1 = self._get_raw_json_info((base_url / "json-info") + {"node": id12})
            raw_rev2 = self._get_raw_json_rev(base_url / "json-rev" / id12)
            automation_details = self._get_raw_json_rev(base_url / "json-automationrelevance" / id12)
        except Exception as e:
            if "Hg denies it exists" in e:
                raw_rev1 = Data(node=revision.changeset.id)
            else:
                raise e

        raw_rev3_changeset = first(r for r in automation_details.changesets if r.node[:12] == id12)
        if last(automation_details.changesets) != raw_rev3_changeset:
            Log.note("interesting")

        output = self._normalize_revision(
            set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
            found_revision,
            push,
            get_diff,
            get_moves,
        )
        if output.push.date >= Date.now() - MAX_TODO_AGE:
            self.todo.extend([
                (output.branch, listwrap(output.parents), None),
                (output.branch, listwrap(output.children), None),
                (output.branch, listwrap(output.backsoutnodes), output.push.date),
            ])

        if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
            output.changeset.diff = None
        if not get_moves:
            output.changeset.moves = None
        return output
def _output():
    for g, v in itertools.groupby(data, get_key):
        group = Data()
        for k, gg in zip(keys, g):
            group[k] = gg
        yield (group, wrap(list(v)))
def selector(d):
    output = Data()
    for n, p in push_and_pull:
        output[n] = unwraplist(p(wrap(d)))
    return unwrap(output)
def get_selects(query): schema = query.frum.schema query_level = len(schema.query_path) query_path = schema.query_path[0] # SPLIT select INTO ES_SELECT AND RESULTSET SELECT split_select = OrderedDict((p, ESSelectOp(p)) for p in schema.query_path) def expand_split_select(c_nested_path): es_select = split_select.get(c_nested_path) if not es_select: temp = [(k, v) for k, v in split_select.items()] split_select.clear() split_select.update({c_nested_path: ESSelectOp(c_nested_path)}) split_select.update(temp) return split_select[c_nested_path] new_select = FlatList() post_expressions = {} selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)]) # WHAT PATH IS _source USED, IF ANY? for select in selects: # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable): term = select.value.term leaves = schema.leaves(term.var) if any(c.jx_type == NESTED for c in leaves): split_select["."].source_path = "." elif is_op(select.value, Variable): for selected_column in schema.values(select.value.var, exclude_type=(OBJECT, EXISTS)): if selected_column.jx_type == NESTED: expand_split_select( selected_column.es_column ).source_path = selected_column.es_column continue leaves = schema.leaves(selected_column.es_column) for c in leaves: if c.jx_type == NESTED: split_select[c.es_column].source_path = c.es_column # IF WE GET THE SOURCE FOR PARENT, WE ASSUME WE GOT SOURCE FOR CHILD source_path = None source_level = 0 for level, es_select in enumerate(reversed(list(split_select.values()))): if source_path: es_select.source_path = source_path elif es_select.source_path: source_level = level + 1 source_path = es_select.source_path def get_pull_source(c): nested_path = c.nested_path nested_level = len(nested_path) pos = text(nested_level) if nested_level <= query_level: if not source_level or nested_level < source_level: field = join_field([pos, "fields", c.es_column]) return jx_expression_to_function(field) elif nested_level == source_level: field = relative_field(c.es_column, nested_path[0]) def pull_source(row): return untyped(row.get(pos, Null)._source[field]) return pull_source else: field = relative_field(c.es_column, nested_path[0]) def pull_property(row): return untyped(row.get(pos, Null)[field]) return pull_property else: pos = text(query_level) if not source_level or nested_level < source_level: # PULL FIELDS AND THEN AGGREGATE THEM value = jx_expression_to_function( join_field(["fields", c.es_column])) name = literal_field(nested_path[0]) index = jx_expression_to_function("_nested.offset") def pull_nested_field(doc): hits = doc.get(pos, Null).inner_hits[name].hits.hits if not hits: return [] temp = [(index(h), value(h)) for h in hits] acc = [None] * len(temp) for i, v in temp: acc[i] = unwraplist(v) return acc return pull_nested_field else: # PULL SOURCES value = jx_expression_to_function( concat_field("_source", relative_field(c.es_column, nested_path[0]))) name = literal_field(nested_path[0]) index = jx_expression_to_function( join_field(["_nested"] * (len(c.nested_path) - 1) + ["offset"])) def pull_nested_source(doc): hits = doc.get(pos, Null).inner_hits[name].hits.hits if not hits: return [] temp = [(index(h), value(h)) for h in hits] acc = [None] * len(temp) for i, v in temp: acc[i] = untyped(v) return acc return pull_nested_source put_index = 0 for select in selects: # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS if is_op(select.value, LeavesOp) and is_op(select.value.term, Variable): term = select.value.term leaves = 
schema.leaves(term.var) for c in leaves: c_nested_path = c.nested_path[0] simple_name = relative_field(c.es_column, query_path).lstrip(".") name = concat_field(select.name, untype_path(simple_name)) put_name = concat_field( select.name, literal_field(untype_path(simple_name))) split_select[c_nested_path].fields.append(c.es_column) new_select.append({ "name": name, "value": Variable(c.es_column), "put": { "name": put_name, "index": put_index, "child": ".", }, "pull": get_pull_source(c), }) put_index += 1 elif is_op(select.value, Variable): if select.value.var == ".": # PULL ALL SOURCE new_select.append({ "name": select.name, "value": select.value, "put": { "name": select.name, "index": put_index, "child": "." }, "pull": get_pull_source( Data(es_column=query_path, nested_path=schema.query_path)), }) continue for selected_column in schema.values(select.value.var, exclude_type=(EXISTS, OBJECT)): if selected_column.jx_type == NESTED: new_select.append({ "name": select.name, "value": select.value, "put": { "name": select.name, "index": put_index, "child": "." }, "pull": get_pull_source( Data( es_column=selected_column.es_column, nested_path=(selected_column.es_column, ) + selected_column.nested_path, )), }) continue leaves = schema.leaves( selected_column.es_column, exclude_type=INTERNAL) # LEAVES OF OBJECT if leaves: for c in leaves: if c.es_column == "_id": new_select.append({ "name": select.name, "value": Variable(c.es_column), "put": { "name": select.name, "index": put_index, "child": ".", }, "pull": pull_id, }) continue c_nested_path = c.nested_path[0] expand_split_select(c_nested_path).fields.append( c.es_column) child = untype_path( relative_field( c.es_column, selected_column.es_column, )) new_select.append({ "name": select.name, "value": Variable(c.es_column), "put": { "name": select.name, "index": put_index, "child": child, }, "pull": get_pull_source(c), }) else: new_select.append({ "name": select.name, "value": NULL, "put": { "name": select.name, "index": put_index, "child": "." }, }) put_index += 1 else: op, split_scripts = split_expression_by_path(select.value, schema, lang=Painless) for p, script in split_scripts.items(): es_select = split_select[p] es_select.scripts[select.name] = { "script": text(Painless[script].partial_eval().to_es_script(schema)) } new_select.append({ "name": select.name, "pull": jx_expression_to_function( join_field([ text(p), "fields", select.name, ])), "put": { "name": select.name, "index": put_index, "child": "." 
}, }) put_index += 1 def inners(query_path, parent_pos): """ :param query_path: :return: ITERATOR OVER TUPLES ROWS AS TUPLES, WHERE row[len(nested_path)] HAS INNER HITS AND row[0] HAS post_expressions """ pos = text(int(parent_pos) + 1) if not query_path: def base_case(row): extra = {} for k, e in post_expressions.items(): extra[k] = e(row) row["0"] = extra yield row return base_case if pos == "1": more = inners(query_path[:-1], "1") def first_case(results): for result in results: for hit in result.hits.hits: seed = {"0": None, pos: hit} for row in more(seed): yield row return first_case else: more = inners(query_path[:-1], pos) if source_path and source_path < query_path[-1]: rel_path = relative_field(query_path[-1], source_path) def source(acc): for inner_row in acc[parent_pos][rel_path]: acc[pos] = inner_row for tt in more(acc): yield tt return source else: path = literal_field(query_path[-1]) def recurse(acc): hits = acc[parent_pos].inner_hits[path].hits.hits if hits: for inner_row in hits: acc[pos] = inner_row for tt in more(acc): yield tt else: for tt in more(acc): yield tt return recurse return new_select, split_select, inners(schema.query_path, "0")
def flatten_many(self, docs, path="."): """ :param docs: THE JSON DOCUMENTS :param path: FULL PATH TO THIS (INNER/NESTED) DOCUMENT :return: TUPLE (success, command, doc_collection) WHERE success: BOOLEAN INDICATING PROPER PARSING command: SCHEMA CHANGES REQUIRED TO BE SUCCESSFUL NEXT TIME doc_collection: MAP FROM NESTED PATH TO INSERTION PARAMETERS: {"active_columns": list, "rows": list of objects} """ # TODO: COMMAND TO ADD COLUMNS # TODO: COMMAND TO NEST EXISTING COLUMNS # COLLECT AS MANY doc THAT DO NOT REQUIRE SCHEMA CHANGE _insertion = Data( active_columns=Queue(), rows=[] ) doc_collection = {".": _insertion} # KEEP TRACK OF WHAT TABLE WILL BE MADE (SHORTLY) required_changes = [] facts = self.container.get_or_create_facts(self.name) snowflake = facts.snowflake def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None): """ :param data: the data we are pulling apart :param uid: the uid we are giving this doc :param parent_id: the parent id of this (sub)doc :param order: the number of siblings before this one :param full_path: path to this (sub)doc :param nested_path: list of paths, deepest first :param row: we will be filling this :return: """ table = concat_field(self.name, nested_path[0]) insertion = doc_collection[nested_path[0]] if not row: row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order} insertion.rows.append(row) if is_data(data): items = [(concat_field(full_path, k), v ) for k, v in wrap(data).leaves()] else: # PRIMITIVE VALUES items = [(full_path, data)] for cname, v in items: jx_type = get_jx_type(v) if jx_type is None: continue insertion = doc_collection[nested_path[0]] if jx_type == NESTED: c = first( cc for cc in insertion.active_columns + snowflake.columns if cc.jx_type in STRUCT and untyped_column(cc.name)[0] == cname ) else: c = first( cc for cc in insertion.active_columns + snowflake.columns if cc.jx_type == jx_type and cc.name == cname ) if isinstance(c, list): Log.error("confused") if not c: # WHAT IS THE NESTING LEVEL FOR THIS PATH? deeper_nested_path = "." 
for path in snowflake.query_paths: if startswith_field(cname, path[0]) and len(deeper_nested_path) < len(path): deeper_nested_path = path c = Column( name=cname, jx_type=jx_type, es_type=json_type_to_sqlite_type.get(jx_type, jx_type), es_column=typed_column(cname, json_type_to_sql_type.get(jx_type)), es_index=table, nested_path=nested_path, last_updated=Date.now() ) if jx_type == NESTED: snowflake.query_paths.append(c.es_column) required_changes.append({'nest': c}) else: insertion.active_columns.add(c) required_changes.append({"add": c}) elif c.jx_type == NESTED and jx_type == OBJECT: # ALWAYS PROMOTE OBJECTS TO NESTED jx_type = NESTED v = [v] elif len(c.nested_path) < len(nested_path): from_doc = doc_collection.get(c.nested_path[0], None) column = c.es_column from_doc.active_columns.remove(c) snowflake._remove_column(c) required_changes.append({"nest": c}) deep_c = Column( name=cname, jx_type=jx_type, es_type=json_type_to_sqlite_type.get(jx_type, jx_type), es_column=typed_column(cname, json_type_to_sql_type.get(jx_type)), es_index=table, nested_path=nested_path, last_updated=Date.now() ) snowflake._add_column(deep_c) snowflake._drop_column(c) from_doc.active_columns.remove(c) for r in from_doc.rows: r1 = unwrap(r) if column in r1: row1 = {UID: self.container.next_uid(), PARENT: r1["__id__"], ORDER: 0, column: r1[column]} insertion.rows.append(row1) elif len(c.nested_path) > len(nested_path): insertion = doc_collection[c.nested_path[0]] row = {UID: self.container.next_uid(), PARENT: uid, ORDER: order} insertion.rows.append(row) # BE SURE TO NEST VALUES, IF NEEDED if jx_type == NESTED: deeper_nested_path = [cname] + nested_path if not doc_collection.get(cname): doc_collection[cname] = Data( active_columns=Queue(), rows=[] ) for i, r in enumerate(v): child_uid = self.container.next_uid() _flatten(r, child_uid, uid, i, cname, deeper_nested_path) elif jx_type == OBJECT: _flatten(v, uid, parent_id, order, cname, nested_path, row=row) elif c.jx_type: row[c.es_column] = v for doc in docs: _flatten(doc, self.container.next_uid(), 0, 0, full_path=path, nested_path=["."], guid=generateGuid()) if required_changes: snowflake.change_schema(required_changes) required_changes = [] return doc_collection
def flatten_many(self, docs, path="."): """ :param docs: THE JSON DOCUMENT :param path: FULL PATH TO THIS (INNER/NESTED) DOCUMENT :return: TUPLE (success, command, doc_collection) WHERE success: BOOLEAN INDICATING PROPER PARSING command: SCHEMA CHANGES REQUIRED TO BE SUCCESSFUL NEXT TIME doc_collection: MAP FROM NESTED PATH TO INSERTION PARAMETERS: {"active_columns": list, "rows": list of objects} """ # TODO: COMMAND TO ADD COLUMNS # TODO: COMMAND TO NEST EXISTING COLUMNS # COLLECT AS MANY doc THAT DO NOT REQUIRE SCHEMA CHANGE _insertion = Data(active_columns=set(), rows=[]) doc_collection = {".": _insertion} # KEEP TRACK OF WHAT TABLE WILL BE MADE (SHORTLY) required_changes = [] snowflake = self.facts.snowflake abs_schema = BasicSnowflake(snowflake.query_paths, snowflake.columns) def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None): """ :param data: the data we are pulling apart :param uid: the uid we are giving this doc :param parent_id: the parent id of this (sub)doc :param order: the number of siblings before this one :param full_path: path to this (sub)doc :param nested_path: list of paths, deepest first :param row: we will be filling this :return: """ table = concat_field(self.name, nested_path[0]) insertion = doc_collection[nested_path[0]] if not row: row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order} insertion.rows.append(row) if not isinstance(data, Mapping): data = {".": data} for k, v in data.items(): insertion = doc_collection[nested_path[0]] cname = concat_field(full_path, literal_field(k)) value_type = get_type(v) if value_type is None: continue if value_type in STRUCT: c = unwraplist([ cc for cc in abs_schema.column[cname] if cc.jx_type in STRUCT ]) else: c = unwraplist([ cc for cc in abs_schema.column[cname] if cc.jx_type == value_type ]) if not c: # WHAT IS THE NESTING LEVEL FOR THIS PATH? deeper_nested_path = "." 
for path in abs_schema.query_paths: if startswith_field( cname, path) and len(deeper_nested_path) < len(path): deeper_nested_path = path c = Column(name=cname, jx_type=value_type, es_type=json_type_to_sqlite_type.get( value_type, value_type), es_column=typed_column( cname, json_type_to_sql_type.get(value_type)), es_index=table, nested_path=nested_path, last_updated=Date.now()) abs_schema.columns.append(c) if value_type == "nested": abs_schema.query_paths.append(c.es_column) required_changes.append({'nest': (c, nested_path)}) else: required_changes.append({"add": c}) # INSIDE IF BLOCK BECAUSE WE DO NOT WANT IT TO ADD WHAT WE columns.get() ALREADY insertion.active_columns.add(c) elif c.jx_type == "nested" and value_type == "object": value_type = "nested" v = [v] elif len(c.nested_path) < len(nested_path): from_doc = doc_collection.get(c.nested_path[0], None) column = c.es_column from_doc.active_columns.remove(c) abs_schema._remove_column(c) required_changes.append({"nest": (c, nested_path)}) deep_c = Column(name=cname, jx_type=value_type, es_column=typed_column( cname, json_type_to_sql_type(value_type)), es_index=table, nested_path=nested_path, last_updated=Date.now()) abs_schema._add_column(deep_c) insertion.active_columns.add(deep_c) for r in from_doc.rows: r1 = unwrap(r) if column in r1: row1 = { UID: self.next_uid(), PARENT: r1["__id__"], ORDER: 0, column: r1[column] } insertion.rows.append(row1) elif len(c.nested_path) > len(nested_path): insertion = doc_collection[c.nested_path[0]] row = {UID: self.next_uid(), PARENT: uid, ORDER: order} insertion.rows.append(row) # BE SURE TO NEST VALUES, IF NEEDED if value_type == "nested": row[c.es_column] = "." deeper_nested_path = [cname] + nested_path insertion = doc_collection.get(cname, None) if not insertion: insertion = doc_collection[cname] = Data( active_columns=set(), rows=[]) for i, r in enumerate(v): child_uid = self.next_uid() _flatten(r, child_uid, uid, i, cname, deeper_nested_path) elif value_type == "object": row[c.es_column] = "." _flatten(v, uid, parent_id, order, cname, nested_path, row=row) elif c.jx_type: row[c.es_column] = v for doc in docs: _flatten(doc, self.next_uid(), 0, 0, full_path=path, nested_path=["."], guid=self.next_guid()) if required_changes: snowflake.change_schema(required_changes) required_changes = [] return doc_collection
def __new__(cls, e=None, query=None, *args, **kwargs): e.allowNulls = coalesce(e.allowNulls, True) if e.value and e.domain.type == "default": # if query.groupby: # return object.__new__(DefaultDecoder, e) if is_text(e.value): Log.error("Expecting Variable or Expression, not plain string") if is_op(e.value, LeavesOp): return object.__new__(ObjectDecoder) elif is_op(e.value, TupleOp): # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields # JUST PULL THE FIELDS if not all(is_op(t, Variable) for t in e.value.terms): Log.error("Can only handle variables in tuples") e.domain = Data(dimension={"fields": e.value.terms}) return object.__new__(DimFieldListDecoder) elif is_op(e.value, Variable): schema = query.frum.schema cols = schema.leaves(e.value.var) if not cols: return object.__new__(DefaultDecoder) if len(cols) != 1: return object.__new__(ObjectDecoder) col = first(cols) limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT) if col.cardinality == None: DEBUG and Log.warning( "metadata for column {{name|quote}} (id={{id}}) is not ready", name=concat_field(col.es_index, col.es_column), id=id(col)) e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__()) return object.__new__(DefaultDecoder) elif col.multi <= 1 and col.partitions == None: e.domain = set_default(DefaultDomain(limit=limit), e.domain.__data__()) return object.__new__(DefaultDecoder) else: DEBUG and Log.note("id={{id}} has parts!!!", id=id(col)) if col.multi > 1: return object.__new__(MultivalueDecoder) partitions = col.partitions[:limit:] if e.domain.sort == -1: partitions = list(reversed(sorted(partitions))) else: partitions = sorted(partitions) e.domain = SimpleSetDomain(partitions=partitions, limit=limit) else: return object.__new__(DefaultDecoder) if e.value and e.domain.type in PARTITION: return object.__new__(SetDecoder) if isinstance(e.domain.dimension, Dimension): e.domain = e.domain.dimension.getDomain() return object.__new__(SetDecoder) if e.value and e.domain.type == "time": return object.__new__(TimeDecoder) if e.range: return object.__new__(GeneralRangeDecoder) if e.value and e.domain.type == "duration": return object.__new__(DurationDecoder) elif e.value and e.domain.type == "range": return object.__new__(RangeDecoder) elif not e.value and e.domain.dimension.fields: # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields # JUST PULL THE FIELDS fields = e.domain.dimension.fields if is_data(fields): Log.error("No longer allowed: All objects are expressions") else: return object.__new__(DimFieldListDecoder) elif not e.value and all(e.domain.partitions.where): return object.__new__(GeneralSetDecoder) else: Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
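# A minimal sketch (illustrative classes, not the library's decoders) of the __new__-as-factory
# pattern used above: the base class inspects its arguments and returns an instance of the most
# specific subclass, and Python then runs __init__ on whatever instance came back.
class Decoder(object):
    def __new__(cls, edge):
        if cls is Decoder:                               # only dispatch from the base class
            if edge.get("partitions") is not None:
                return object.__new__(SetDecoder)
            return object.__new__(DefaultDecoder)
        return object.__new__(cls)

    def __init__(self, edge):
        self.edge = edge

class SetDecoder(Decoder):
    pass

class DefaultDecoder(Decoder):
    pass

print(type(Decoder({"partitions": ["a", "b"]})).__name__)   # SetDecoder
print(type(Decoder({})).__name__)                           # DefaultDecoder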
def _run(self): with CProfiler(): self.id = thread.get_ident() with ALL_LOCK: ALL[self.id] = self try: if self.target is not None: a, k, self.args, self.kwargs = self.args, self.kwargs, None, None response = self.target(*a, **k) with self.synch_lock: self.end_of_thread = Data(response=response) else: with self.synch_lock: self.end_of_thread = Null except Exception as e: e = Except.wrap(e) with self.synch_lock: self.end_of_thread = Data(exception=e) if self not in self.parent.children: # THREAD FAILURES ARE A PROBLEM ONLY IF NO ONE WILL BE JOINING WITH IT try: Log.fatal("Problem in thread {{name|quote}}", name=self.name, cause=e) except Exception: sys.stderr.write(b"ERROR in thread: " + str(self.name) + b" " + str(e) + b"\n") finally: try: children = copy(self.children) for c in children: try: if DEBUG: sys.stdout.write(b"Stopping thread " + str(c.name) + b"\n") c.stop() except Exception as e: Log.warning("Problem stopping thread {{thread}}", thread=c.name, cause=e) for c in children: try: if DEBUG: sys.stdout.write(b"Joining on thread " + str(c.name) + b"\n") c.join() except Exception as e: Log.warning("Problem joining thread {{thread}}", thread=c.name, cause=e) finally: if DEBUG: sys.stdout.write(b"Joined on thread " + str(c.name) + b"\n") self.stopped.go() if DEBUG: Log.note("thread {{name|quote}} stopping", name=self.name) del self.target, self.args, self.kwargs with ALL_LOCK: del ALL[self.id] except Exception as e: if DEBUG: Log.warning("problem with thread {{name|quote}}", cause=e, name=self.name) finally: self.stopped.go() if DEBUG: Log.note("thread {{name|quote}} is done", name=self.name)
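# A small, standard-library-only sketch of the same pattern as _run above: wrap the target so the
# result or exception is captured for whoever joins, instead of being lost on the worker thread.
# The Worker class is a toy, not the library's Thread implementation.
import threading

class Worker(threading.Thread):
    def __init__(self, target, *args, **kwargs):
        super(Worker, self).__init__()
        self.target, self.args, self.kwargs = target, args, kwargs
        self.result = None
        self.exception = None

    def run(self):
        try:
            self.result = self.target(*self.args, **self.kwargs)
        except Exception as e:
            self.exception = e                       # surface it to whoever joins

w = Worker(lambda x: 1 // x, 0)
w.start()
w.join()
print(type(w.exception).__name__)                    # ZeroDivisionError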
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski ([email protected])
#
from __future__ import unicode_literals

from collections import Mapping

from jx_base import container
from jx_python import containers
from mo_dots import Data
from mo_dots import wrap, set_default, split_field
from mo_logs import Log

config = Data()   # config.default IS EXPECTED TO BE SET BEFORE CALLS ARE MADE

_ListContainer = None
_meta = None


def _delayed_imports():
    global _ListContainer
    global _meta

    from jx_python import meta as _meta
    from jx_python.containers.list_usingPythonList import ListContainer as _ListContainer

    _ = _ListContainer
    _ = _meta
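# A minimal sketch of the delayed-import pattern above: the module-level name starts as None and is
# bound on first use, which breaks an import cycle (or defers a heavy dependency) at module load
# time.  The json module here is a stand-in for the real dependency.
_lazy_dep = None

def _get_lazy_dep():
    global _lazy_dep
    if _lazy_dep is None:
        import json as _lazy_module      # imported lazily, only when first needed
        _lazy_dep = _lazy_module
    return _lazy_dep

print(_get_lazy_dep().dumps({"ok": True}))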
def __init__(self, dim, parent, jx): dim = wrap(dim) self.name = dim.name self.parent = coalesce(parent) self.full_name = join_field(split_field(self.parent.full_name)+[self.name]) self.edges = None # FOR NOW dot.set_default(self, dim) self.where = dim.where self.type = coalesce(dim.type, "set") self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT) self.index = coalesce(dim.index, coalesce(parent, Null).index, jx.settings.index) if not self.index: Log.error("Expecting an index name") # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION) self.edges = Data() for e in listwrap(dim.edges): new_e = Dimension(e, self, jx) self.edges[new_e.full_name] = new_e self.partitions = wrap(coalesce(dim.partitions, [])) parse_partition(self) fields = coalesce(dim.field, dim.fields) if not fields: return # NO FIELDS TO SEARCH elif isinstance(fields, Mapping): self.fields = wrap(fields) edges = wrap([{"name": k, "value": v, "allowNulls": False} for k, v in self.fields.items()]) else: self.fields = listwrap(fields) edges = wrap([{"name": f, "value": f, "index": i, "allowNulls": False} for i, f in enumerate(self.fields)]) if dim.partitions: return # ALREADY HAVE PARTS if self.type not in KNOWN - ALGEBRAIC: return # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH jx.get_columns() with Timer("Get parts of {{name}}", {"name": self.name}): parts = jx.query({ "from": self.index, "select": {"name": "count", "aggregate": "count"}, "edges": edges, "where": self.where, "limit": self.limit }) Log.note("{{name}} has {{num}} parts", name= self.name, num= len(parts)) d = parts.edges[0].domain if dim.path: if len(edges) > 1: Log.error("Not supported yet") # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE temp = Data(partitions=[]) for i, count in enumerate(parts): a = dim.path(d.getEnd(d.partitions[i])) if not isinstance(a, list): Log.error("The path function on " + dim.name + " must return an ARRAY of parts") addParts( temp, dim.path(d.getEnd(d.partitions[i])), count, 0 ) self.value = coalesce(dim.value, "name") self.partitions = temp.partitions elif isinstance(fields, Mapping): self.value = "name" # USE THE "name" ATTRIBUTE OF PARTS partitions = FlatList() for g, p in parts.groupby(edges): if p: partitions.append({ "value": g, "where": {"and": [ {"term": {e.value: g[e.name]}} for e in edges ]}, "count": int(p) }) self.partitions = partitions elif len(edges) == 1: self.value = "name" # USE THE "name" ATTRIBUTE OF PARTS # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM self.partitions = wrap([ { "name": str(d.partitions[i].name), # CONVERT TO STRING "value": d.getEnd(d.partitions[i]), "where": {"term": {edges[0].value: d.partitions[i].value}}, "count": count } for i, count in enumerate(parts) ]) self.order = {p.value: i for i, p in enumerate(self.partitions)} elif len(edges) == 2: self.value = "name" # USE THE "name" ATTRIBUTE OF PARTS d2 = parts.edges[1].domain # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM array = parts.data.values()[0].cube # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END) def edges2value(*values): if isinstance(fields, Mapping): output = Data() for e, v in zip(edges, values): output[e.name] = v return output else: return tuple(values) self.partitions = wrap([ { "name": str(d.partitions[i].name), # CONVERT TO STRING "value": d.getEnd(d.partitions[i]), "where": {"term": {edges[0].value: d.partitions[i].value}}, "count": SUM(subcube), "partitions": [ { "name": str(d2.partitions[j].name), # CONVERT TO STRING "value": 
edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])), "where": {"and": [ {"term": {edges[0].value: d.partitions[i].value}}, {"term": {edges[1].value: d2.partitions[j].value}} ]}, "count": count2 } for j, count2 in enumerate(subcube) if count2 > 0 # ONLY INCLUDE PROPERTIES THAT EXIST ] } for i, subcube in enumerate(array) ]) else: Log.error("Not supported") parse_partition(self) # RELATE THE PARTS TO THE PARENTS
def _get_single_branch_from_hg(settings, description, dir): if dir == "users": return [] response = http.get(settings.url + "/" + dir) doc = BeautifulSoup(response.all_content, "html.parser") output = [] try: all_branches = doc("table")[0] except Exception: return [] for i, b in enumerate(all_branches("tr")): if i == 0: continue # IGNORE HEADER columns = b("td") try: path = columns[0].a.get('href') if path == "/": continue name, desc, last_used = [c.text.strip() for c in columns][0:3] if last_used.startswith('at'): last_used = last_used[2:] detail = Data( name=name.lower(), locale=DEFAULT_LOCALE, parent_name=description, url=settings.url + path, description=desc, last_used=Date(last_used), etl={"timestamp": Date.now()} ) if detail.description == "unknown": detail.description = None # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR if path in [ "/projects/dxr/", # moved to webtools "/build/compare-locales/", # ?build team likes to clone? "/build/puppet/", # ?build team likes to clone? "/SeaMonkey/puppet/", # looses the popularity contest "/releases/gaia-l10n/v1_2/en-US/", # use default branch "/releases/gaia-l10n/v1_3/en-US/", # use default branch "/releases/gaia-l10n/v1_4/en-US/", # use default branch "/releases/gaia-l10n/v2_0/en-US/", # use default branch "/releases/gaia-l10n/v2_1/en-US/", # use default branch "/build/autoland/" ]: continue # MARKUP BRANCH IF LOCALE SPECIFIC if path.startswith("/l10n-central"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "mozilla-central" elif path.startswith("/releases/l10n/"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = _path[-2].lower() elif path.startswith("/releases/gaia-l10n/"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "gaia-" + _path[-2][1::] elif path.startswith("/weave-l10n"): _path = path.strip("/").split("/") detail.locale = _path[-1] detail.name = "weave" if BRANCH_WHITELIST is not None: found = False for br in BRANCH_WHITELIST: if br in str(detail.name): found = True break if not found: continue Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale) output.append(detail) except Exception as e: Log.warning("branch digestion problem", cause=e) return output
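# A self-contained sketch of the scraping step above: parse the first HTML table and pull
# (href, name, description) out of each data row.  The inline HTML is a stand-in for the real
# hg index page, so no HTTP request is needed to run it.
from bs4 import BeautifulSoup

html = """
<table>
  <tr><th>Name</th><th>Description</th></tr>
  <tr><td><a href="/releases/l10n/fr/">fr</a></td><td>French</td></tr>
  <tr><td><a href="/projects/dxr/">dxr</a></td><td>moved</td></tr>
</table>
"""

soup = BeautifulSoup(html, "html.parser")
for i, row in enumerate(soup.find_all("table")[0].find_all("tr")):
    if i == 0:
        continue                                     # skip the header row
    cells = row.find_all("td")
    path = cells[0].a.get("href")
    name, desc = (c.text.strip() for c in cells[:2])
    print(path, name, desc)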
def format_table(aggs, es_query, query, decoders, all_selects): new_edges = wrap(count_dim(aggs, es_query, decoders)) dims = tuple(len(e.domain.partitions) + (0 if e.allowNulls is False else 1) for e in new_edges) rank = len(dims) header = tuple(new_edges.name + all_selects.name) name2index = {s.name: i + rank for i, s in enumerate(all_selects)} def data(): is_sent = Matrix(dims=dims) give_me_zeros = query.sort and not query.groupby if give_me_zeros: # WE REQUIRE THE ZEROS FOR SORTING all_coord = is_sent._all_combos() # TRACK THE EXPECTED COMBINATIONS ordered_coord = next(all_coord)[::-1] output = None for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders): if coord != ordered_coord: # output HAS BEEN YIELDED, BUT SET THE DEFAULT VALUES if output is not None: for s in all_selects: i = name2index[s.name] if output[i] is None: output[i] = s.default # WE CAN GET THE SAME coord MANY TIMES, SO ONLY ADVANCE WHEN NOT ordered_coord = next(all_coord)[::-1] while coord != ordered_coord: # HAPPENS WHEN THE coord IS AHEAD OF ordered_coord record = [d.get_value(ordered_coord[i]) for i, d in enumerate(decoders)] + [s.default for s in all_selects] yield record ordered_coord = next(all_coord)[::-1] # coord == missing_coord output = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for s in all_selects] for select in ss: v = select.pull(agg) if v != None: union(output, name2index[select.name], v, select.aggregate) yield output else: last_coord = None # HANG ONTO THE output FOR A BIT WHILE WE FILL THE ELEMENTS output = None for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders): if coord != last_coord: if output: # SET DEFAULTS for i, s in enumerate(all_selects): v = output[rank+i] if v == None: output[rank+i] = s.default yield output output = is_sent[coord] if output == None: output = is_sent[coord] = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for _ in all_selects] last_coord = coord # THIS IS A TRICK! WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED for select in ss: v = select.pull(agg) if v != None: union(output, name2index[select.name], v, select.aggregate) if output: # SET DEFAULTS ON LAST ROW for i, s in enumerate(all_selects): v = output[rank+i] if v == None: output[rank+i] = s.default yield output # EMIT THE MISSING CELLS IN THE CUBE if not query.groupby: for coord, output in is_sent: if output == None: record = [d.get_value(c) for c, d in zip(coord, decoders)] + [s.default for s in all_selects] yield record return Data( meta={"format": "table"}, header=header, data=list(data()) )
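# A compact sketch of the zero-filling idea in format_table: every coordinate of the edge cube gets
# a row, and coordinates that never appeared in the aggregation results are emitted with the
# select's default value.  Edge names and counts are made up for illustration.
from itertools import product

edges = {"platform": ["linux", "win"], "type": ["opt", "debug"]}
seen = {("linux", "opt"): 12, ("win", "debug"): 3}    # rows that came back from ES
default = 0

table = [
    list(coord) + [seen.get(coord, default)]
    for coord in product(*edges.values())
]
# [['linux', 'opt', 12], ['linux', 'debug', 0], ['win', 'opt', 0], ['win', 'debug', 3]]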
from mo_dots import Data  # Data is used below for default_headers
from mo_logs import Log
from mo_logs.exceptions import Except
from mo_logs.strings import utf82unicode, unicode2utf8
from mo_math import Math
from mo_threads import Lock
from mo_threads import Till
from mo_times.durations import Duration
from pyLibrary import convert
from pyLibrary.env.big_data import safe_size, ibytes2ilines, icompressed2ibytes

DEBUG = False
FILE_SIZE_LIMIT = 100 * 1024 * 1024
MIN_READ_SIZE = 8 * 1024
ZIP_REQUEST = False

default_headers = Data()  # TODO: MAKE THIS VARIABLE A SPECIAL TYPE OF EXPECTED MODULE PARAMETER SO IT COMPLAINS IF NOT SET
default_timeout = 600
DEFAULTS = {
    "allow_redirects": True,
    "stream": True,
    "verify": True,
    "timeout": 600,
    "zip": False,
    "retry": {"times": 1, "sleep": 0, "http": False}
}
_warning_sent = False
request_count = 0
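# A minimal sketch of how module-level DEFAULTS like those above are typically merged with per-call
# keyword arguments: explicit arguments win, defaults fill the gaps.  The request() function here is
# hypothetical, not this module's API.
DEFAULTS = {"allow_redirects": True, "stream": True, "verify": True, "timeout": 600}

def request(url, **kwargs):
    settings = dict(DEFAULTS)
    settings.update({k: v for k, v in kwargs.items() if v is not None})
    return url, settings

print(request("http://example.com", timeout=30)[1]["timeout"])   # 30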
def es_setop(es, mvel, query): FromES = es09.util.build_es_query(query) select = listwrap(query.select) isDeep = len(split_field( query.frum.name)) > 1 # LOOKING INTO NESTED WILL REQUIRE A SCRIPT isComplex = OR([ s.value == None and s.aggregate not in ("count", "none") for s in select ]) # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT if not isDeep and not isComplex: if len(select) == 1 and isinstance(select[0].value, LeavesOp): FromES = wrap({ "query": { "bool": { "query": { "match_all": {} }, "filter": query.where.to_esfilter() } }, "sort": query.sort, "size": 0 }) elif all(isinstance(v, Variable) for v in select.value): FromES = wrap({ "query": { "bool": { "query": { "match_all": {} }, "filter": query.where.to_esfilter() } }, "fields": select.value, "sort": query.sort, "size": coalesce(query.limit, 200000) }) elif not isDeep: simple_query = query.copy() simple_query.where = TRUE # THE FACET FILTER IS FASTER FromES.facets.mvel = { "terms": { "script_field": mvel.code(simple_query), "size": coalesce(simple_query.limit, 200000) }, "facet_filter": jx_expression(query.where).to_esfilter() } else: FromES.facets.mvel = { "terms": { "script_field": mvel.code(query), "size": coalesce(query.limit, 200000) }, "facet_filter": jx_expression(query.where).to_esfilter() } data = es_post(es, FromES, query.limit) if len(select) == 1 and isinstance(select[0].value, LeavesOp): # SPECIAL CASE FOR SINGLE COUNT cube = wrap(data).hits.hits._source elif isinstance(select[0].value, Variable): # SPECIAL CASE FOR SINGLE TERM cube = wrap(data).hits.hits.fields else: data_list = unpack_terms(data.facets.mvel, select) if not data_list: cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select}) else: output = zip(*data_list) cube = Cube( select, [], {s.name: Matrix(list=output[i]) for i, s in enumerate(select)}) return Data(meta={"esquery": FromES}, data=cube)
def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord): """ :param rows: REVERSED STACK OF ROWS (WITH push() AND pop()) :param row: CURRENT ROW BEING EXTRACTED :param nested_doc_details: { "nested_path": wrap_nested_path(nested_path), "index_to_column": map from column number to column details "children": all possible direct decedents' nested_doc_details } :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop) :param parent_id_coord: the column number for the parent id (so we ca extract from each row) :return: the nested property (usually an array) """ previous_doc_id = None doc = Data() output = [] id_coord = nested_doc_details['id_coord'] while True: doc_id = row[id_coord] if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id): rows.append(row) # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc) return output if doc_id != previous_doc_id: previous_doc_id = doc_id doc = Data() curr_nested_path = nested_doc_details['nested_path'][0] index_to_column = nested_doc_details['index_to_column'].items() if index_to_column: for i, c in index_to_column: value = row[i] if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path = concat_field(c.push_name, c.push_child) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path = c.push_child if relative_path == ".": if value == '': doc = Null else: doc = value elif value != None and value != '': doc[relative_path] = value for child_details in nested_doc_details['children']: # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS child_id = row[child_details['id_coord']] if child_id is not None: nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord) if nested_value: push_name = child_details['nested_path'][0] if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp): # ASSIGN INNER PROPERTIES relative_path = relative_field(push_name, curr_nested_path) else: # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT relative_path = "." if relative_path == "." and doc is Null: doc = nested_value elif relative_path == ".": doc = unwraplist(nested_value) else: doc[relative_path] = unwraplist(nested_value) output.append(doc) try: row = rows.pop() except IndexError: return output
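# A toy version of the reassembly done by _accumulate_nested: flat child rows carrying a parent id
# are folded back into a list under the parent document.  Column names (_id, _parent, children) are
# illustrative, not the query result's real layout.
parents = [{"_id": 1, "name": "a"}, {"_id": 2, "name": "b"}]
children = [
    {"_parent": 1, "value": 10},
    {"_parent": 1, "value": 20},
    {"_parent": 2, "value": 30},
]

docs = {p["_id"]: dict(p, children=[]) for p in parents}
for c in children:
    docs[c["_parent"]]["children"].append({"value": c["value"]})

print(docs[1]["children"])   # [{'value': 10}, {'value': 20}]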
def es_aggsop(es, frum, query): query = query.copy() # WE WILL MARK UP THIS QUERY schema = frum.schema select = listwrap(query.select) es_query = Data() new_select = Data() # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".": if schema.query_path == ".": s.pull = jx_expression_to_function("doc_count") else: s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]}) elif isinstance(s.value, Variable): if s.aggregate == "count": new_select["count_"+literal_field(s.value.var)] += [s] else: new_select[literal_field(s.value.var)] += [s] else: formula.append(s) for canonical_name, many in new_select.items(): for s in many: es_cols = frum.schema.values(s.value.var) if s.aggregate == "count": canonical_names = [] for es_col in es_cols: cn = literal_field(es_col.es_column + "_count") if es_col.type == EXISTS: canonical_names.append(cn + ".doc_count") es_query.aggs[cn].filter.range = {es_col.es_column: {"gt": 0}} else: canonical_names.append(cn+ ".value") es_query.aggs[cn].value_count.field = es_col.es_column if len(es_cols) == 1: s.pull = jx_expression_to_function(canonical_names[0]) else: s.pull = jx_expression_to_function({"add": canonical_names}) elif s.aggregate == "median": if len(es_cols) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = es_cols[0].es_column es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\.0") elif s.aggregate == "percentile": if len(es_cols) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance(s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile: Log.error("Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = es_cols[0].es_column es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": canonical_names = [] for es_col in es_cols: cn = literal_field(es_col.es_column + "_cardinality") canonical_names.append(cn) es_query.aggs[cn].cardinality.field = es_col.es_column if len(es_cols) == 1: s.pull = jx_expression_to_function(canonical_names[0] + ".value") else: s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0}) elif s.aggregate == "stats": if len(es_cols) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = es_cols[0].es_column # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + "_percentile") es_query.aggs[median_name].percentiles.field = es_cols[0].es_column es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate == "union": pulls = [] for es_col in es_cols: script = {"scripted_metric": { 'init_script': 'params._agg.terms = new HashSet()', 'map_script': 'for (v in doc['+quote(es_col.es_column)+'].values) params._agg.terms.add(v)', 'combine_script': 'return params._agg.terms.toArray()', 'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()', }} stats_name = encode_property(es_col.es_column) if es_col.nested_path[0] == ".": es_query.aggs[stats_name] = script pulls.append(jx_expression_to_function(stats_name + ".value")) else: es_query.aggs[stats_name] = { "nested": {"path": es_col.nested_path[0]}, "aggs": {"_nested": script} } pulls.append(jx_expression_to_function(stats_name + "._nested.value")) if len(pulls) == 0: s.pull = NULL elif len(pulls) == 1: s.pull = pulls[0] else: s.pull = lambda row: UNION(p(row) for p in pulls) else: if len(es_cols) > 1: Log.error("Do not know how to count columns with more than one type (script probably)") # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field(canonical_name)].extended_stats.field = es_cols[0].es_column s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]}) for i, s in enumerate(formula): canonical_name = literal_field(s.name) if isinstance(s.value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_painless(schema).script(schema) s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value") elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema) es_query.aggs[key].percentiles.percents += [50] s.pull = jx_expression_to_function(key + ".values.50\.0") elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema) es_query.aggs[key].percentiles.percents += [percent] s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent))) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = s.value.to_painless(schema).script(schema) s.pull = jx_expression_to_function(key + ".value") elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.script = s.value.to_painless(schema).script(schema) # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = s.value.to_painless(schema).script(schema) es_query.aggs[median_name].percentiles.percents += [50] s.pull = get_pull_stats(stats_name, median_name) elif s.aggregate=="union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = s.value.to_painless(schema).script(schema) s.pull = jx_expression_to_function(stats_name + ".buckets.key") else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate]) es_query.aggs[canonical_name].extended_stats.script = s.value.to_painless(schema).script(schema) decoders = get_decoders_by_depth(query) start = 0 #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = AndOp("and", split_where[1]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)} ) es_query = wrap({ "aggs": {"_nested": set_default( { "nested": { "path": schema.query_path } }, es_query )} }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = AndOp("and", split_where[0]).to_esfilter(schema) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)} ) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es_post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", cause=e)
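# A sketch of the request-body shape the code above builds for a percentile select: the field name
# and percent value are illustrative, but the aggregation structure matches Elasticsearch's
# percentiles aggregation.
import json

percent = 75.0
es_query = {
    "aggs": {
        "score percentile": {
            "percentiles": {"field": "score", "percents": [percent]}
        }
    },
    "size": 0,   # aggregations only, no hits
}
print(json.dumps(es_query, indent=2))
# the value is read back from aggregations["score percentile"]["values"]["75.0"]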
def es_aggsop(es, frum, query): select = wrap([s.copy() for s in listwrap(query.select)]) # [0] is a cheat; each es_column should be a dict of columns keyed on type, like in sqlite es_column_map = {v: frum.schema[v][0].es_column for v in query.vars()} es_query = Data() new_select = Data( ) #MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING formula = [] for s in select: if s.aggregate == "count" and isinstance( s.value, Variable) and s.value.var == ".": s.pull = "doc_count" elif isinstance(s.value, Variable): if s.value.var == ".": if frum.typed: # STATISITCAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING if s.aggregate in NON_STATISTICAL_AGGS: #TODO: HANDLE BOTH $value AND $objects TO COUNT Log.error("do not know how to handle") else: s.value.var = "$value" new_select["$value"] += [s] else: if s.aggregate in NON_STATISTICAL_AGGS: #TODO: WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT Log.error("do not know how to handle") else: Log.error( 'Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate) elif s.aggregate == "count": s.value = s.value.map(es_column_map) new_select["count_" + literal_field(s.value.var)] += [s] else: s.value = s.value.map(es_column_map) new_select[literal_field(s.value.var)] += [s] else: formula.append(s) for canonical_name, many in new_select.items(): representative = many[0] if representative.value.var == ".": Log.error("do not know how to handle") else: field_name = representative.value.var # canonical_name=literal_field(many[0].name) for s in many: if s.aggregate == "count": es_query.aggs[literal_field( canonical_name)].value_count.field = field_name s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES key = literal_field(canonical_name + " percentile") if isinstance( s.percentile, basestring) or s.percetile < 0 or 1 < s.percentile: Log.error( "Expecting percentile to be a float from 0.0 to 1.0") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.field = field_name es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = literal_field(canonical_name + " cardinality") es_query.aggs[key].cardinality.field = field_name s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[stats_name].extended_stats.field = field_name # GET MEDIAN TOO! 
median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.field = field_name es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.field = field_name es_query.aggs[stats_name].terms.size = Math.min( s.limit, MAX_LIMIT) s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE es_query.aggs[literal_field( canonical_name)].extended_stats.field = field_name s.pull = literal_field(canonical_name) + "." + aggregates1_4[ s.aggregate] for i, s in enumerate(formula): canonical_name = literal_field(s.name) abs_value = s.value.map(es_column_map) if isinstance(abs_value, TupleOp): if s.aggregate == "count": # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY s.pull = "doc_count" else: Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate) elif s.aggregate == "count": es_query.aggs[literal_field( canonical_name)].value_count.script = abs_value.to_ruby() s.pull = literal_field(canonical_name) + ".value" elif s.aggregate == "median": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [50] s.pull = key + ".values.50\.0" elif s.aggregate == "percentile": # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT key = literal_field(canonical_name + " percentile") percent = Math.round(s.percentile * 100, decimal=6) es_query.aggs[key].percentiles.script = abs_value.to_ruby() es_query.aggs[key].percentiles.percents += [percent] s.pull = key + ".values." + literal_field(unicode(percent)) elif s.aggregate == "cardinality": # ES USES DIFFERENT METHOD FOR CARDINALITY key = canonical_name + " cardinality" es_query.aggs[key].cardinality.script = abs_value.to_ruby() s.pull = key + ".value" elif s.aggregate == "stats": # REGULAR STATS stats_name = literal_field(canonical_name) es_query.aggs[ stats_name].extended_stats.script = abs_value.to_ruby() # GET MEDIAN TOO! median_name = literal_field(canonical_name + " percentile") es_query.aggs[median_name].percentiles.script = abs_value.to_ruby() es_query.aggs[median_name].percentiles.percents += [50] s.pull = { "count": stats_name + ".count", "sum": stats_name + ".sum", "min": stats_name + ".min", "max": stats_name + ".max", "avg": stats_name + ".avg", "sos": stats_name + ".sum_of_squares", "std": stats_name + ".std_deviation", "var": stats_name + ".variance", "median": median_name + ".values.50\.0" } elif s.aggregate == "union": # USE TERMS AGGREGATE TO SIMULATE union stats_name = literal_field(canonical_name) es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby() s.pull = stats_name + ".buckets.key" else: # PULL VALUE OUT OF THE stats AGGREGATE s.pull = canonical_name + "." 
+ aggregates1_4[s.aggregate] es_query.aggs[ canonical_name].extended_stats.script = abs_value.to_ruby() decoders = get_decoders_by_depth(query) start = 0 vars_ = query.where.vars() #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested split_where = split_expression_by_depth(query.where, schema=frum.schema) if len(split_field(frum.name)) > 1: if any(split_where[2::]): Log.error("Where clause is too deep") for d in decoders[1]: es_query = d.append_query(es_query, start) start += d.num_columns if split_where[1]: #TODO: INCLUDE FILTERS ON EDGES filter_ = simplify_esfilter( AndOp("and", split_where[1]).to_esfilter()) es_query = Data( aggs={"_filter": set_default({"filter": filter_}, es_query)}) es_query = wrap({ "aggs": { "_nested": set_default({"nested": { "path": frum.query_path }}, es_query) } }) else: if any(split_where[1::]): Log.error("Where clause is too deep") if decoders: for d in jx.reverse(decoders[0]): es_query = d.append_query(es_query, start) start += d.num_columns if split_where[0]: #TODO: INCLUDE FILTERS ON EDGES filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter()) es_query = Data( aggs={"_filter": set_default({"filter": filter}, es_query)}) # </TERRIBLE SECTION> if not es_query: es_query = wrap({"query": {"match_all": {}}}) es_query.size = 0 with Timer("ES query time") as es_duration: result = es09.util.post(es, es_query, query.limit) try: format_time = Timer("formatting") with format_time: decoders = [d for ds in decoders for d in ds] result.aggregations.doc_count = coalesce( result.aggregations.doc_count, result.hits.total) # IT APPEARS THE OLD doc_count IS GONE formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[ query.format] if query.edges: output = formatter(decoders, result.aggregations, start, query, select) elif query.groupby: output = groupby_formatter(decoders, result.aggregations, start, query, select) else: output = aggop_formatter(decoders, result.aggregations, start, query, select) output.meta.timing.formatting = format_time.duration output.meta.timing.es_search = es_duration.duration output.meta.content_type = mime_type output.meta.es_query = es_query return output except Exception as e: if query.format not in format_dispatch: Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e) Log.error("Some problem", e)
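# A sketch of the wrapping done in the "<TERRIBLE SECTION>" above: the aggregation tree is nested
# inside a filter aggregation for the shallow part of the where clause, and inside a "nested"
# aggregation when the query path is deep.  Field and path names are illustrative.
inner_aggs = {"total": {"sum": {"field": "build.duration"}}}

filtered = {"_filter": {"filter": {"term": {"build.platform": "linux"}}, "aggs": inner_aggs}}
nested = {"_nested": {"nested": {"path": "results"}, "aggs": filtered}}

es_query = {"aggs": nested, "size": 0}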
def get_value_from_row(self, row):
    output = Data()
    for k, v in zip(self.put, row[self.start:self.start + self.num_columns:]):
        output[k] = v["key"]
    return output
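# A small sketch of what get_value_from_row does: a slice of the decoded row is zipped with the
# edge's output names, pulling the "key" of each aggregation bucket.  Names and values are made up.
put = ["build.platform", "build.type"]
row = [{"key": "linux"}, {"key": "debug"}, {"key": 7}]   # decoded agg buckets
start, num_columns = 0, 2

output = {k: v["key"] for k, v in zip(put, row[start:start + num_columns])}
print(output)   # {'build.platform': 'linux', 'build.type': 'debug'}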