Python Data.Data Beispiele

Programmiersprache: Python

Namespace / Paketname: mo_dots

Klasse / Typ: Data

Methode / Funktion: Data

Beispiele auf hotexamples.com: 30

Python Data.Data - 30 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die mo_dots.Data.Data, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

Data(30)

items(11)

copy(6)

get(4)

keys(4)

append(3)

data(3)

sleep(2)

size(2)

salt(2)

length(2)

start(2)

b(2)

a(2)

locale(1)

__init__(1)

_db(1)

result(1)

prefix(1)

name(1)

marker(1)

description(1)

email_verified(1)

cause(1)

constraint_name(1)

header(1)

get_more(1)

delimiter(1)

__getitem__(1)

failAction(1)

encoding(1)

foo(1)

Beispiel #1

Datei anzeigen

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.meta.columns
                if cc.es_column == column.es_column and cc.es_type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count":
                                                      _counting_query(column)
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {
                                "multi": {
                                    "max": {
                                        "script":
                                        "doc[" + quote(column.es_column) +
                                        "].values.size()"
                                    }
                                }
                            },
                            "filter": {
                                "bool": {
                                    "should": [{
                                        "range": {
                                            "etl.timestamp.~n~": {
                                                "gte": (Date.today() - WEEK)
                                            }
                                        }
                                    }, {
                                        "bool": {
                                            "must_not": {
                                                "exists": {
                                                    "field":
                                                    "etl.timestamp.~n~"
                                                }
                                            }
                                        }
                                    }]
                                }
                            }
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value,
                                       agg_results.count._nested.value,
                                       agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.es_cluster.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note(
                "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
                id=id(column),
                column=column,
                time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith(
                (TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}",
                            col=column,
                            cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get column {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

Beispiel #2

Datei anzeigen

Datei: expressions.py Projekt: yoyogias2011/TUID

    def Parts2Term(self, domain):
        """
        TERMS ARE ALWAYS ESCAPED SO THEY CAN BE COMPOUNDED WITH PIPE (|)

        CONVERT AN ARRAY OF PARTS{name, esfilter} TO AN MVEL EXPRESSION
        RETURN expression, function PAIR, WHERE
            expression - MVEL EXPRESSION
            function - TAKES RESULT OF expression AND RETURNS PART
        """
        fields = domain.dimension.fields

        term = []
        if len(split_field(self.fromData.name)) == 1 and fields:
            if isinstance(fields, Mapping):
                # CONVERT UNORDERED FIELD DEFS
                jx_fields, es_fields = transpose(
                    *[(k, fields[k]) for k in sorted(fields.keys())])
            else:
                jx_fields, es_fields = transpose(
                    *[(i, e) for i, e in enumerate(fields)])

            # NO LOOPS BECAUSE QUERY IS SHALLOW
            # DOMAIN IS FROM A DIMENSION, USE IT'S FIELD DEFS TO PULL
            if len(es_fields) == 1:

                def fromTerm(term):
                    return domain.getPartByKey(term)

                return Data(head="",
                            body='getDocValue(' +
                            quote(domain.dimension.fields[0]) + ')'), fromTerm
            else:

                def fromTerm(term):
                    terms = [
                        convert.pipe2value(t)
                        for t in convert.pipe2value(term).split("|")
                    ]

                    candidate = dict(zip(jx_fields, terms))
                    for p in domain.partitions:
                        for k, t in candidate.items():
                            if p.value[k] != t:
                                break
                        else:
                            return p
                    if domain.type in ["uid", "default"]:
                        part = {"value": candidate}
                        domain.partitions.append(part)
                        return part
                    else:
                        return Null

                for f in es_fields:
                    term.append('Value2Pipe(getDocValue(' + quote(f) + '))')

                return Data(head="",
                            body='Value2Pipe(' + ('+"|"+'.join(term)) +
                            ')'), fromTerm
        else:
            for v in domain.partitions:
                term.append("if (" +
                            _where(v.esfilter, lambda x: self._translate(x)) +
                            ") " + value2MVEL(domain.getKey(v)) + "; else ")
            term.append(value2MVEL(domain.getKey(domain.NULL)))

            func_name = "_temp" + UID()
            return self.register_function("+\"|\"+".join(term))

Beispiel #3

Datei anzeigen

Datei: setop_table.py Projekt: nknick99/MySQL-to-S3

    def _set_op(self, query, frum):
        # GET LIST OF COLUMNS
        frum_path = split_field(frum)
        primary_nested_path = join_field(frum_path[1:])
        vars_ = UNION([v.var for select in listwrap(query.select) for v in select.value.vars()])
        schema = self.sf.tables[primary_nested_path].schema

        active_columns = {".": set()}
        for v in vars_:
            for c in schema.leaves(v):
                nest = c.nested_path[0]
                active_columns.setdefault(nest, set()).add(c)

        # ANY VARS MENTIONED WITH NO COLUMNS?
        for v in vars_:
            if not any(startswith_field(cname, v) for cname in schema.keys()):
                active_columns["."].add(Column(
                    names={".": v},
                    type="null",
                    es_column=".",
                    es_index=".",
                    nested_path=["."]
                ))

        # EVERY COLUMN, AND THE INDEX IT TAKES UP
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        index_to_uid = {}  # FROM NESTED PATH TO THE INDEX OF UID
        sql_selects = []  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path, sub_table) in enumerate(self.sf.tables.items())
        }

        sorts = []
        if query.sort:
            for select in query.sort:
                col = select.value.to_sql(schema)[0]
                for t, sql in col.sql.items():
                    json_type = sql_type_to_json_type[t]
                    if json_type in STRUCT:
                        continue
                    column_number = len(sql_selects)
                    # SQL HAS ABS TABLE REFERENCE
                    column_alias = _make_column_name(column_number)
                    sql_selects.append(sql_alias(sql, column_alias))
                    if select.sort == -1:
                        sorts.append(column_alias + SQL_IS_NOT_NULL)
                        sorts.append(column_alias + " DESC")
                    else:
                        sorts.append(column_alias + SQL_IS_NULL)
                        sorts.append(column_alias)

        primary_doc_details = Data()
        # EVERY SELECT STATEMENT THAT WILL BE REQUIRED, NO MATTER THE DEPTH
        # WE WILL CREATE THEM ACCORDING TO THE DEPTH REQUIRED
        nested_path = []
        for step, sub_table in self.sf.tables.items():
            nested_path.insert(0, step)
            nested_doc_details = {
                "sub_table": sub_table,
                "children": [],
                "index_to_column": {},
                "nested_path": nested_path
            }

            # INSERT INTO TREE
            if not primary_doc_details:
                primary_doc_details = nested_doc_details
            else:
                def place(parent_doc_details):
                    if startswith_field(step, parent_doc_details['nested_path'][0]):
                        for c in parent_doc_details['children']:
                            if place(c):
                                return True
                        parent_doc_details['children'].append(nested_doc_details)

                place(primary_doc_details)

            alias = nested_doc_details['alias'] = nest_to_alias[step]

            # WE ALWAYS ADD THE UID
            column_number = index_to_uid[step] = nested_doc_details['id_coord'] = len(sql_selects)
            sql_select = join_column(alias, quoted_UID)
            sql_selects.append(sql_alias(sql_select, _make_column_name(column_number)))
            if step != ".":
                # ID AND ORDER FOR CHILD TABLES
                index_to_column[column_number] = ColumnMapping(
                    sql=sql_select,
                    type="number",
                    nested_path=nested_path,
                    column_alias=_make_column_name(column_number)
                )
                column_number = len(sql_selects)
                sql_select = join_column(alias, quoted_ORDER)
                sql_selects.append(sql_alias(sql_select, _make_column_name(column_number)))
                index_to_column[column_number] = ColumnMapping(
                    sql=sql_select,
                    type="number",
                    nested_path=nested_path,
                    column_alias=_make_column_name(column_number)
                )

            # WE DO NOT NEED DATA FROM TABLES WE REQUEST NOTHING FROM
            if step not in active_columns:
                continue

            # ADD SQL SELECT COLUMNS FOR EACH jx SELECT CLAUSE
            si = 0
            for select in listwrap(query.select):
                try:
                    column_number = len(sql_selects)
                    select.pull = get_column(column_number)
                    db_columns = select.value.partial_eval().to_sql(schema)

                    for column in db_columns:
                        if isinstance(column.nested_path, list):
                            column.nested_path = column.nested_path[0]  # IN THE EVENT THIS "column" IS MULTIVALUED
                        for t, unsorted_sql in column.sql.items():
                            json_type = sql_type_to_json_type[t]
                            if json_type in STRUCT:
                                continue
                            column_number = len(sql_selects)
                            column_alias = _make_column_name(column_number)
                            sql_selects.append(sql_alias(unsorted_sql, column_alias))
                            if startswith_field(primary_nested_path, step) and isinstance(select.value, LeavesOp):
                                # ONLY FLATTEN primary_nested_path AND PARENTS, NOT CHILDREN
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                    push_name=literal_field(get_property_name(concat_field(select.name, column.name))),
                                    push_child=".",
                                    push_column_name=get_property_name(concat_field(select.name, column.name)),
                                    push_column=si,
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    column_alias=column_alias,
                                    nested_path=nested_path
                                )
                                si += 1
                            else:
                                index_to_column[column_number] = nested_doc_details['index_to_column'][column_number] = ColumnMapping(
                                    push_name=select.name,
                                    push_child=column.name,
                                    push_column_name=select.name,
                                    push_column=si,
                                    pull=get_column(column_number),
                                    sql=unsorted_sql,
                                    type=json_type,
                                    column_alias=column_alias,
                                    nested_path=nested_path
                                )
                finally:
                    si += 1

        where_clause = BooleanOp("boolean", query.where).partial_eval().to_sql(schema, boolean=True)[0].sql.b
        unsorted_sql = self._make_sql_for_one_nest_in_set_op(
            ".",
            sql_selects,
            where_clause,
            active_columns,
            index_to_column
        )

        for n, _ in self.sf.tables.items():
            sorts.append(quote_column(COLUMN + text_type(index_to_uid[n])))

        ordered_sql = (
            SQL_SELECT + "*" +
            SQL_FROM + sql_iso(unsorted_sql) +
            SQL_ORDERBY + sql_list(sorts) +
            SQL_LIMIT + quote_value(query.limit)
        )
        self.db.create_new_functions()  # creating new functions: regexp
        result = self.db.query(ordered_sql)

        def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
            """
            :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
            :param row: CURRENT ROW BEING EXTRACTED
            :param nested_doc_details: {
                    "nested_path": wrap_nested_path(nested_path),
                    "index_to_column": map from column number to column details
                    "children": all possible direct decedents' nested_doc_details
                 }
            :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
            :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
            :return: the nested property (usually an array)
            """
            previous_doc_id = None
            doc = Data()
            output = []
            id_coord = nested_doc_details['id_coord']

            while True:
                doc_id = row[id_coord]

                if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                    rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                    return output

                if doc_id != previous_doc_id:
                    previous_doc_id = doc_id
                    doc = Data()
                    curr_nested_path = nested_doc_details['nested_path'][0]
                    index_to_column = nested_doc_details['index_to_column'].items()
                    if index_to_column:
                        for i, c in index_to_column:
                            value = row[i]
                            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                relative_path = concat_field(c.push_name, c.push_child)
                            else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                relative_path = c.push_child

                            if relative_path == ".":
                                if value == '':
                                    doc = Null
                                else:
                                    doc = value
                            elif value != None and value != '':
                                doc[relative_path] = value

                for child_details in nested_doc_details['children']:
                    # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                    child_id = row[child_details['id_coord']]
                    if child_id is not None:
                        nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                        if nested_value:
                            push_name = child_details['nested_path'][0]
                            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                relative_path = relative_field(push_name, curr_nested_path)
                            else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                relative_path = "."

                            if relative_path == "." and doc is Null:
                                doc = nested_value
                            elif relative_path == ".":
                                doc = unwraplist(nested_value)
                            else:
                                doc[relative_path] = unwraplist(nested_value)

                output.append(doc)

                try:
                    row = rows.pop()
                except IndexError:
                    return output

        cols = tuple([i for i in index_to_column.values() if i.push_name != None])
        rows = list(reversed(unwrap(result.data)))
        if rows:
            row = rows.pop()
            data = _accumulate_nested(rows, row, primary_doc_details, None, None)
        else:
            data = result.data

        if query.format == "cube":
            for f, _ in self.sf.tables.items():
                if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                    num_rows = len(result.data)
                    num_cols = MAX([c.push_column for c in cols]) + 1 if len(cols) else 0
                    map_index_to_name = {c.push_column: c.push_column_name for c in cols}
                    temp_data = [[None] * num_rows for _ in range(num_cols)]
                    for rownum, d in enumerate(result.data):
                        for c in cols:
                            if c.push_child == ".":
                                temp_data[c.push_column][rownum] = c.pull(d)
                            else:
                                column = temp_data[c.push_column][rownum]
                                if column is None:
                                    column = temp_data[c.push_column][rownum] = {}
                                column[c.push_child] = c.pull(d)
                    output = Data(
                        meta={"format": "cube"},
                        data={n: temp_data[c] for c, n in map_index_to_name.items()},
                        edges=[{
                            "name": "rownum",
                            "domain": {
                                "type": "rownum",
                                "min": 0,
                                "max": num_rows,
                                "interval": 1
                            }
                        }]
                    )
                    return output

            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                num_rows = len(data)
                temp_data = {c.push_column_name: [None] * num_rows for c in cols}
                for rownum, d in enumerate(data):
                    for c in cols:
                        temp_data[c.push_column_name][rownum] = d[c.push_name]
                return Data(
                    meta={"format": "cube"},
                    data=temp_data,
                    edges=[{
                        "name": "rownum",
                        "domain": {
                            "type": "rownum",
                            "min": 0,
                            "max": num_rows,
                            "interval": 1
                        }
                    }]
                )
            else:
                num_rows = len(data)
                map_index_to_name = {c.push_column: c.push_column_name for c in cols}
                temp_data = [data]

                return Data(
                    meta={"format": "cube"},
                    data={n: temp_data[c] for c, n in map_index_to_name.items()},
                    edges=[{
                        "name": "rownum",
                        "domain": {
                            "type": "rownum",
                            "min": 0,
                            "max": num_rows,
                            "interval": 1
                        }
                    }]
                )

        elif query.format == "table":
            for f, _ in self.sf.tables.items():
                if frum.endswith(f):
                    num_column = MAX([c.push_column for c in cols]) + 1
                    header = [None] * num_column
                    for c in cols:
                        header[c.push_column] = c.push_column_name

                    output_data = []
                    for d in result.data:
                        row = [None] * num_column
                        for c in cols:
                            set_column(row, c.push_column, c.push_child, c.pull(d))
                        output_data.append(row)

                    return Data(
                        meta={"format": "table"},
                        header=header,
                        data=output_data
                    )
            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                column_names = [None] * (max(c.push_column for c in cols) + 1)
                for c in cols:
                    column_names[c.push_column] = c.push_column_name

                temp_data = []
                for rownum, d in enumerate(data):
                    row = [None] * len(column_names)
                    for c in cols:
                        row[c.push_column] = d[c.push_name]
                    temp_data.append(row)

                return Data(
                    meta={"format": "table"},
                    header=column_names,
                    data=temp_data
                )
            else:
                column_names = listwrap(query.select).name
                return Data(
                    meta={"format": "table"},
                    header=column_names,
                    data=[[d] for d in data]
                )

        else:
            for f, _ in self.sf.tables.items():
                if frum.endswith(f) or (test_dots(cols) and isinstance(query.select, list)):
                    data = []
                    for d in result.data:
                        row = Data()
                        for c in cols:
                            if c.push_child == ".":
                                row[c.push_name] = c.pull(d)
                            elif c.num_push_columns:
                                tuple_value = row[c.push_name]
                                if not tuple_value:
                                    tuple_value = row[c.push_name] = [None] * c.num_push_columns
                                tuple_value[c.push_child] = c.pull(d)
                            else:
                                row[c.push_name][c.push_child] = c.pull(d)

                        data.append(row)

                    return Data(
                        meta={"format": "list"},
                        data=data
                    )

            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                temp_data = []
                for rownum, d in enumerate(data):
                    row = {}
                    for c in cols:
                        row[c.push_column_name] = d[c.push_name]
                    temp_data.append(row)
                return Data(
                    meta={"format": "list"},
                    data=temp_data
                )
            else:
                return Data(
                    meta={"format": "list"},
                    data=data
                )

Beispiel #4

Datei anzeigen

Datei: matrix.py Projekt: ubdussamad/mo-parquet

    elif depth == 1:
        return _MAX(cube)
    else:
        return _MAX(_max(depth - 1, c) for c in cube)


def _min(depth, cube):
    if depth == 0:
        return cube
    elif depth == 1:
        return _MIN(cube)
    else:
        return _MIN(_min(depth - 1, c) for c in cube)


aggregates = Data(max=_max, maximum=_max, min=_min, minimum=_min)


def _iter(cube, depth):
    if depth == 1:
        return cube.__iter__()
    else:

        def iterator():
            for c in cube:
                for b in _iter(c, depth - 1):
                    yield b

        return iterator()

Beispiel #5

Datei anzeigen

def apply_diff(text, diff, reverse=False, verify=True):
    """
    SOME EXAMPLES OF diff
    #@@ -1 +1 @@
    #-before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    #+before china goes live (end January developer release, June general audience release) , the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    @@ -0,0 +1,3 @@
    +before china goes live, the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    +
    +kward has the details.
    @@ -1 +1 @@
    -before china goes live (end January developer release, June general audience release), the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    +before china goes live , the content team will have to manually update the settings for the china-ready apps currently in marketplace.
    @@ -3 +3 ,6 @@
    -kward has the details.+kward has the details.
    +
    +Target Release Dates :
    +https://mana.mozilla.org/wiki/display/PM/Firefox+OS+Wave+Launch+Cross+Functional+View
    +
    +Content Team Engagement & Tasks : https://appreview.etherpad.mozilla.org/40
    """

    if not diff:
        return text
    output = text
    hunks = [
        (new_diff[start_hunk], new_diff[start_hunk + 1:end_hunk])
        for new_diff in [[
            d.lstrip()
            for d in diff if d.lstrip() and d != "\\ No newline at end of file"
        ] + ["@@"]]  # ANOTHER REPAIR
        for start_hunk, end_hunk in pairwise(
            i for i, l in enumerate(new_diff) if l.startswith('@@'))
    ]
    for header, hunk_body in (reversed(hunks) if reverse else hunks):
        matches = DIFF_PREFIX.match(header.strip())
        if not matches:
            if not _Log:
                _late_import()

            _Log.error("Can not handle \n---\n{{diff}}\n---\n", diff=diff)

        removes = tuple(int(i.strip()) for i in matches.group(1).split(
            ","))  # EXPECTING start_line, length TO REMOVE
        remove = Data(
            start=removes[0],
            length=1 if len(removes) == 1 else removes[1])  # ASSUME FIRST LINE
        adds = tuple(int(i.strip()) for i in matches.group(2).split(
            ","))  # EXPECTING start_line, length TO ADD
        add = Data(start=adds[0], length=1 if len(adds) == 1 else adds[1])

        if add.length == 0 and add.start == 0:
            add.start = remove.start

        def repair_hunk(hunk_body):
            # THE LAST DELETED LINE MAY MISS A "\n" MEANING THE FIRST
            # ADDED LINE WILL BE APPENDED TO THE LAST DELETED LINE
            # EXAMPLE: -kward has the details.+kward has the details.
            # DETECT THIS PROBLEM FOR THIS HUNK AND FIX THE DIFF
            if reverse:
                last_lines = [
                    o for b, o in zip(reversed(hunk_body), reversed(output))
                    if b != "+" + o
                ]
                if not last_lines:
                    return hunk_body

                last_line = last_lines[0]
                for problem_index, problem_line in enumerate(hunk_body):
                    if problem_line.startswith('-') and problem_line.endswith(
                            '+' + last_line):
                        split_point = len(problem_line) - (len(last_line) + 1)
                        break
                    elif problem_line.startswith('+' + last_line + "-"):
                        split_point = len(last_line) + 1
                        break
                else:
                    return hunk_body
            else:
                if not output:
                    return hunk_body
                last_line = output[-1]
                for problem_index, problem_line in enumerate(hunk_body):
                    if problem_line.startswith('+') and problem_line.endswith(
                            '-' + last_line):
                        split_point = len(problem_line) - (len(last_line) + 1)
                        break
                    elif problem_line.startswith('-' + last_line + "+"):
                        split_point = len(last_line) + 1
                        break
                else:
                    return hunk_body

            new_hunk_body = (
                hunk_body[:problem_index] +
                [problem_line[:split_point], problem_line[split_point:]] +
                hunk_body[problem_index + 1:])
            return new_hunk_body

        hunk_body = repair_hunk(hunk_body)

        if reverse:
            new_output = (output[:add.start - 1] +
                          [d[1:] for d in hunk_body if d and d[0] == '-'] +
                          output[add.start + add.length - 1:])
        else:
            new_output = (output[:add.start - 1] +
                          [d[1:] for d in hunk_body if d and d[0] == '+'] +
                          output[add.start + remove.length - 1:])
        output = new_output

    if verify:
        original = apply_diff(output, diff, not reverse, False)
        if set(text) != set(original):  # bugzilla-etl diffs are a jumble

            for t, o in zip_longest(text, original):
                if t in ['reports: https://goo.gl/70o6w6\r']:
                    break  # KNOWN INCONSISTENCIES
                if t != o:
                    if not _Log:
                        _late_import()
                    _Log.error("logical verification check failed")
                    break

    return output

Beispiel #6

Datei anzeigen

    def update(self, command):
        """
        :param command:  EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
        """
        command = wrap(command)

        # REJECT DEEP UPDATES
        touched_columns = command.set.keys() | set(listwrap(command['clear']))
        for c in self.schema.columns:
            if c.name in touched_columns and len(c.nested_path) > 1:
                Log.error("Deep update not supported")

        # ADD NEW COLUMNS
        where = jx_expression(command.where)
        _vars = where.vars()
        _map = {
            v: c.es_column
            for v in _vars
            for c in self.columns.get(v, Null)
            if c.jx_type not in STRUCT
        }
        where_sql = where.map(_map).to_sql(self.schema)
        new_columns = set(command.set.keys()) - set(self.columns.keys())
        for new_column_name in new_columns:
            nested_value = command.set[new_column_name]
            ctype = get_jx_type(nested_value)
            column = Column(
                name=new_column_name,
                jx_type=ctype,
                es_index=self.name,
                es_type=json_type_to_sqlite_type(ctype),
                es_column=typed_column(new_column_name, ctype),
                last_updated=Date.now()
            )
            self.add_column(column)

        # UPDATE THE NESTED VALUES
        for nested_column_name, nested_value in command.set.items():
            if get_jx_type(nested_value) == "nested":
                nested_table_name = concat_field(self.name, nested_column_name)
                nested_table = nested_tables[nested_column_name]
                self_primary_key = sql_list(quote_column(c.es_column) for u in self.uid for c in self.columns[u])
                extra_key_name = UID + text(len(self.uid))
                extra_key = [e for e in nested_table.columns[extra_key_name]][0]

                sql_command = (
                    SQL_DELETE + SQL_FROM + quote_column(nested_table.name) +
                    SQL_WHERE + "EXISTS" +
                    sql_iso(
                        SQL_SELECT + SQL_ONE +
                        SQL_FROM + sql_alias(quote_column(nested_table.name), "n") +
                        SQL_INNER_JOIN + sql_iso(
                            SQL_SELECT + self_primary_key +
                            SQL_FROM + quote_column(abs_schema.fact) +
                            SQL_WHERE + where_sql
                    ) +
                    " t ON " +
                    SQL_AND.join(
                        quote_column("t", c.es_column) + SQL_EQ + quote_column("n", c.es_column)
                        for u in self.uid
                        for c in self.columns[u]
                    )
                )
                )
                self.db.execute(sql_command)

                # INSERT NEW RECORDS
                if not nested_value:
                    continue

                doc_collection = {}
                for d in listwrap(nested_value):
                    nested_table.flatten(d, Data(), doc_collection, path=nested_column_name)

                prefix = SQL_INSERT + quote_column(nested_table.name) + sql_iso(sql_list(
                    [self_primary_key] +
                    [quote_column(extra_key)] +
                    [
                        quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]
                ))

                # BUILD THE PARENT TABLES
                parent = (
                    SQL_SELECT + self_primary_key +
                    SQL_FROM + quote_column(abs_schema.fact) +
                    SQL_WHERE + jx_expression(command.where).to_sql(schema)
                )

                # BUILD THE RECORDS
                children = SQL_UNION_ALL.join(
                    SQL_SELECT +
                    sql_alias(quote_value(i), extra_key.es_column) + SQL_COMMA +
                    sql_list(
                        sql_alias(quote_value(row[c.name]), quote_column(c.es_column))
                        for c in doc_collection.get(".", Null).active_columns
                    )
                    for i, row in enumerate(doc_collection.get(".", Null).rows)
                )

                sql_command = (
                    prefix +
                    SQL_SELECT +
                    sql_list(
                        [quote_column("p", c.es_column) for u in self.uid for c in self.columns[u]] +
                        [quote_column("c", extra_key)] +
                        [quote_column("c", c.es_column) for c in doc_collection.get(".", Null).active_columns]
                    ) +
                    SQL_FROM + sql_iso(parent) + " p" +
                    SQL_INNER_JOIN + sql_iso(children) + " c" + SQL_ON + SQL_TRUE
                )

                self.db.execute(sql_command)

                # THE CHILD COLUMNS COULD HAVE EXPANDED
                # ADD COLUMNS TO SELF
                for n, cs in nested_table.columns.items():
                    for c in cs:
                        column = Column(
                            name=c.name,
                            jx_type=c.jx_type,
                            es_type=c.es_type,
                            es_index=c.es_index,
                            es_column=c.es_column,
                            nested_path=[nested_column_name] + c.nested_path,
                            last_updated=Date.now()
                        )
                        if c.name not in self.columns:
                            self.columns[column.name] = {column}
                        elif c.jx_type not in [c.jx_type for c in self.columns[c.name]]:
                            self.columns[column.name].add(column)

        command = (
            SQL_UPDATE + quote_column(abs_schema.fact) + SQL_SET +
            sql_list(
                [
                    quote_column(c) + SQL_EQ + quote_value(get_if_type(v, c.jx_type))
                    for k, v in command.set.items()
                    if get_jx_type(v) != "nested"
                    for c in self.columns[k]
                    if c.jx_type != "nested" and len(c.nested_path) == 1
                ] +
                [
                    quote_column(c) + SQL_EQ + SQL_NULL
                    for k in listwrap(command['clear'])
                    if k in self.columns
                    for c in self.columns[k]
                    if c.jx_type != "nested" and len(c.nested_path) == 1
                ]
            ) +
            SQL_WHERE + where_sql
        )

        self.db.execute(command)

Beispiel #7

Datei anzeigen

Datei: table.py Projekt: othmane-kada/SpotManager

    def __init__(self, header=None, data=None):
        self.header = header

        self.data = data
        self.meta = Data()

Beispiel #8

Datei anzeigen

    def __init__(
        self,
        hg=None,  # hg CONNECTION INFO
        repo=None,  # CONNECTION INFO FOR ES CACHE
        use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
        kwargs=None,
    ):
        if not _hg_branches:
            _late_imports()

        if not is_text(repo.index):
            Log.error("Expecting 'index' parameter")
        self.repo_locker = Lock()
        self.moves_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon",
                                     max=DAEMON_QUEUE_SIZE)
        self.settings = kwargs
        self.hg = Data(
            url=hg.url,
            timeout=Duration(coalesce(hg.timeout, "30second")).seconds,
            retry={
                "times": 3,
                "sleep": DAEMON_HG_INTERVAL
            },
        )
        self.last_cache_miss = Date.now()

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            http.head(self.settings.hg.url)

        set_default(repo, {
            "type": "revision",
            "schema": revision_schema,
        })
        kwargs.branches = set_default(
            {
                "index": repo.index + "-branches",
                "type": "branch"
            },
            repo,
        )
        moves = set_default(
            {"index": repo.index + "-moves"},
            repo,
        )

        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        cluster = elasticsearch.Cluster(kwargs=repo)
        self.repo = cluster.get_or_create_index(kwargs=repo)
        self.moves = cluster.get_or_create_index(kwargs=moves)

        def setup_es(please_stop):
            with suppress_exception:
                self.repo.add_alias()
            with suppress_exception:
                self.moves.add_alias()

            with suppress_exception:
                self.repo.set_refresh_interval(seconds=1)
            with suppress_exception:
                self.moves.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        Thread.run("hg daemon", self._daemon)

Beispiel #9

Datei anzeigen

def format_list(decoders, aggs, start, query, select):
    new_edges = count_dim(aggs, decoders)

    def data():
        dims = tuple(
            len(e.domain.partitions) + (0 if e.allowNulls is False else 1)
            for e in new_edges)

        is_sent = Matrix(dims=dims, zeros=0)
        if query.sort and not query.groupby:
            # TODO: USE THE format_table() TO PRODUCE THE NEEDED VALUES INSTEAD OF DUPLICATING LOGIC HERE
            all_coord = is_sent._all_combos(
            )  # TRACK THE EXPECTED COMBINATIONS
            for row, coord, agg in aggs_iterator(aggs, decoders):
                missing_coord = all_coord.next()
                while coord != missing_coord:
                    # INSERT THE MISSING COORDINATE INTO THE GENERATION
                    output = Data()
                    for i, d in enumerate(decoders):
                        output[query.edges[i].name] = d.get_value(
                            missing_coord[i])

                    for s in select:
                        if s.aggregate == "count":
                            output[s.name] = 0
                    yield output
                    missing_coord = all_coord.next()

                output = Data()
                for e, c, d in zip(query.edges, coord, decoders):
                    output[e.name] = d.get_value(c)

                for s in select:
                    output[s.name] = s.pull(agg)
                yield output
        else:
            is_sent = Matrix(dims=dims, zeros=0)
            for row, coord, agg in aggs_iterator(aggs, decoders):
                is_sent[coord] = 1

                output = Data()
                for e, c, d in zip(query.edges, coord, decoders):
                    output[e.name] = d.get_value(c)

                for s in select:
                    output[s.name] = s.pull(agg)
                yield output

            # EMIT THE MISSING CELLS IN THE CUBE
            if not query.groupby:
                for c, v in is_sent:
                    if not v:
                        output = Data()
                        for i, d in enumerate(decoders):
                            output[query.edges[i].name] = d.get_value(c[i])

                        for s in select:
                            if s.aggregate == "count":
                                output[s.name] = 0
                        yield output

    output = Data(meta={"format": "list"}, data=list(data()))
    return output

Beispiel #10

Datei anzeigen

        Log.error("Can not deal with date like {{date|json}}", date=date)


def minimize_repo(repo):
    """
    RETURN A MINIMAL VERSION OF THIS CHANGESET
    """
    if repo == None:
        return Null
    output = wrap(_copy_but(repo, _exclude_from_repo))
    output.changeset.description = strings.limit(output.changeset.description,
                                                 1000)
    return output


_exclude_from_repo = Data()
for k in [
        "changeset.files",
        "changeset.diff",
        "changeset.moves",
        "etl",
        "branch.last_used",
        "branch.description",
        "branch.etl",
        "branch.parent_name",
        "children",
        "parents",
        "phase",
        "bookmarks",
        "tags",
]:

Beispiel #11

Datei anzeigen

    def _get_from_hg(self,
                     revision,
                     locale=None,
                     get_diff=False,
                     get_moves=True):
        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        next_cache_miss = self.last_cache_miss + (
            Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note(
                "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
                seconds=next_cache_miss - self.last_cache_miss,
            )
            Till(till=next_cache_miss.unix).wait()

        # CLEAN UP BRANCH NAME
        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}",
                      rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name,
                                                       DEFAULT_LOCALE)]
            if not b:
                Log.warning(
                    "can not find branch ({{branch}}, {{locale}})",
                    branch=lower_name,
                    locale=locale,
                )
                return Null

        # REFRESH BRANCHES, IF TOO OLD
        if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        # FIND THE PUSH
        push = self._get_push(found_revision.branch,
                              found_revision.changeset.id)
        id12 = found_revision.changeset.id[0:12]
        base_url = URL(found_revision.branch.url)

        with Explanation("get revision from {{url}}",
                         url=base_url,
                         debug=DEBUG):
            raw_rev2 = Null
            automation_details = Null
            try:
                raw_rev1 = self._get_raw_json_info((base_url / "json-info") +
                                                   {"node": id12})
                raw_rev2 = self._get_raw_json_rev(base_url / "json-rev" / id12)
                automation_details = self._get_raw_json_rev(
                    base_url / "json-automationrelevance" / id12)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e

            raw_rev3_changeset = first(r for r in automation_details.changesets
                                       if r.node[:12] == id12)
            if last(automation_details.changesets) != raw_rev3_changeset:
                Log.note("interesting")

            output = self._normalize_revision(
                set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
                found_revision,
                push,
                get_diff,
                get_moves,
            )
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.extend([
                    (output.branch, listwrap(output.parents), None),
                    (output.branch, listwrap(output.children), None),
                    (
                        output.branch,
                        listwrap(output.backsoutnodes),
                        output.push.date,
                    ),
                ])

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None

        return output

Beispiel #12

Datei anzeigen

 def _output():
     for g, v in itertools.groupby(data, get_key):
         group = Data()
         for k, gg in zip(keys, g):
             group[k] = gg
         yield (group, wrap(list(v)))

Beispiel #13

Datei anzeigen

 def selector(d):
     output = Data()
     for n, p in push_and_pull:
         output[n] = unwraplist(p(wrap(d)))
     return unwrap(output)

Beispiel #14

Datei anzeigen

Datei: set_op.py Projekt: klahnakoski/pyLibrary

def get_selects(query):
    schema = query.frum.schema
    query_level = len(schema.query_path)
    query_path = schema.query_path[0]
    # SPLIT select INTO ES_SELECT AND RESULTSET SELECT
    split_select = OrderedDict((p, ESSelectOp(p)) for p in schema.query_path)

    def expand_split_select(c_nested_path):
        es_select = split_select.get(c_nested_path)
        if not es_select:
            temp = [(k, v) for k, v in split_select.items()]
            split_select.clear()
            split_select.update({c_nested_path: ESSelectOp(c_nested_path)})
            split_select.update(temp)
        return split_select[c_nested_path]

    new_select = FlatList()
    post_expressions = {}

    selects = list_to_data([unwrap(s.copy()) for s in listwrap(query.select)])

    # WHAT PATH IS _source USED, IF ANY?
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            if any(c.jx_type == NESTED for c in leaves):
                split_select["."].source_path = "."
        elif is_op(select.value, Variable):
            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(OBJECT,
                                                               EXISTS)):
                if selected_column.jx_type == NESTED:
                    expand_split_select(
                        selected_column.es_column
                    ).source_path = selected_column.es_column
                    continue
                leaves = schema.leaves(selected_column.es_column)
                for c in leaves:
                    if c.jx_type == NESTED:
                        split_select[c.es_column].source_path = c.es_column

    # IF WE GET THE SOURCE FOR PARENT, WE ASSUME WE GOT SOURCE FOR CHILD
    source_path = None
    source_level = 0
    for level, es_select in enumerate(reversed(list(split_select.values()))):
        if source_path:
            es_select.source_path = source_path
        elif es_select.source_path:
            source_level = level + 1
            source_path = es_select.source_path

    def get_pull_source(c):
        nested_path = c.nested_path
        nested_level = len(nested_path)
        pos = text(nested_level)

        if nested_level <= query_level:
            if not source_level or nested_level < source_level:
                field = join_field([pos, "fields", c.es_column])
                return jx_expression_to_function(field)
            elif nested_level == source_level:
                field = relative_field(c.es_column, nested_path[0])

                def pull_source(row):
                    return untyped(row.get(pos, Null)._source[field])

                return pull_source
            else:
                field = relative_field(c.es_column, nested_path[0])

                def pull_property(row):
                    return untyped(row.get(pos, Null)[field])

                return pull_property
        else:
            pos = text(query_level)

            if not source_level or nested_level < source_level:
                # PULL FIELDS AND THEN AGGREGATE THEM
                value = jx_expression_to_function(
                    join_field(["fields", c.es_column]))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function("_nested.offset")

                def pull_nested_field(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = unwraplist(v)
                    return acc

                return pull_nested_field
            else:
                # PULL SOURCES
                value = jx_expression_to_function(
                    concat_field("_source",
                                 relative_field(c.es_column, nested_path[0])))
                name = literal_field(nested_path[0])
                index = jx_expression_to_function(
                    join_field(["_nested"] * (len(c.nested_path) - 1) +
                               ["offset"]))

                def pull_nested_source(doc):
                    hits = doc.get(pos, Null).inner_hits[name].hits.hits
                    if not hits:
                        return []

                    temp = [(index(h), value(h)) for h in hits]
                    acc = [None] * len(temp)
                    for i, v in temp:
                        acc[i] = untyped(v)
                    return acc

                return pull_nested_source

    put_index = 0
    for select in selects:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if is_op(select.value, LeavesOp) and is_op(select.value.term,
                                                   Variable):
            term = select.value.term
            leaves = schema.leaves(term.var)
            for c in leaves:
                c_nested_path = c.nested_path[0]
                simple_name = relative_field(c.es_column,
                                             query_path).lstrip(".")
                name = concat_field(select.name, untype_path(simple_name))
                put_name = concat_field(
                    select.name, literal_field(untype_path(simple_name)))
                split_select[c_nested_path].fields.append(c.es_column)
                new_select.append({
                    "name": name,
                    "value": Variable(c.es_column),
                    "put": {
                        "name": put_name,
                        "index": put_index,
                        "child": ".",
                    },
                    "pull": get_pull_source(c),
                })
                put_index += 1
        elif is_op(select.value, Variable):
            if select.value.var == ".":
                # PULL ALL SOURCE
                new_select.append({
                    "name":
                    select.name,
                    "value":
                    select.value,
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                    "pull":
                    get_pull_source(
                        Data(es_column=query_path,
                             nested_path=schema.query_path)),
                })
                continue

            for selected_column in schema.values(select.value.var,
                                                 exclude_type=(EXISTS,
                                                               OBJECT)):
                if selected_column.jx_type == NESTED:
                    new_select.append({
                        "name":
                        select.name,
                        "value":
                        select.value,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                        "pull":
                        get_pull_source(
                            Data(
                                es_column=selected_column.es_column,
                                nested_path=(selected_column.es_column, ) +
                                selected_column.nested_path,
                            )),
                    })
                    continue

                leaves = schema.leaves(
                    selected_column.es_column,
                    exclude_type=INTERNAL)  # LEAVES OF OBJECT
                if leaves:
                    for c in leaves:
                        if c.es_column == "_id":
                            new_select.append({
                                "name": select.name,
                                "value": Variable(c.es_column),
                                "put": {
                                    "name": select.name,
                                    "index": put_index,
                                    "child": ".",
                                },
                                "pull": pull_id,
                            })
                            continue
                        c_nested_path = c.nested_path[0]
                        expand_split_select(c_nested_path).fields.append(
                            c.es_column)
                        child = untype_path(
                            relative_field(
                                c.es_column,
                                selected_column.es_column,
                            ))
                        new_select.append({
                            "name": select.name,
                            "value": Variable(c.es_column),
                            "put": {
                                "name": select.name,
                                "index": put_index,
                                "child": child,
                            },
                            "pull": get_pull_source(c),
                        })

                else:
                    new_select.append({
                        "name": select.name,
                        "value": NULL,
                        "put": {
                            "name": select.name,
                            "index": put_index,
                            "child": "."
                        },
                    })
                put_index += 1
        else:
            op, split_scripts = split_expression_by_path(select.value,
                                                         schema,
                                                         lang=Painless)
            for p, script in split_scripts.items():
                es_select = split_select[p]
                es_select.scripts[select.name] = {
                    "script":
                    text(Painless[script].partial_eval().to_es_script(schema))
                }
                new_select.append({
                    "name":
                    select.name,
                    "pull":
                    jx_expression_to_function(
                        join_field([
                            text(p),
                            "fields",
                            select.name,
                        ])),
                    "put": {
                        "name": select.name,
                        "index": put_index,
                        "child": "."
                    },
                })
                put_index += 1

    def inners(query_path, parent_pos):
        """
        :param query_path:
        :return:  ITERATOR OVER TUPLES ROWS AS TUPLES, WHERE  row[len(nested_path)] HAS INNER HITS
                  AND row[0] HAS post_expressions
        """
        pos = text(int(parent_pos) + 1)
        if not query_path:

            def base_case(row):
                extra = {}
                for k, e in post_expressions.items():
                    extra[k] = e(row)
                row["0"] = extra
                yield row

            return base_case

        if pos == "1":
            more = inners(query_path[:-1], "1")

            def first_case(results):
                for result in results:
                    for hit in result.hits.hits:
                        seed = {"0": None, pos: hit}
                        for row in more(seed):
                            yield row

            return first_case

        else:
            more = inners(query_path[:-1], pos)
            if source_path and source_path < query_path[-1]:
                rel_path = relative_field(query_path[-1], source_path)

                def source(acc):
                    for inner_row in acc[parent_pos][rel_path]:
                        acc[pos] = inner_row
                        for tt in more(acc):
                            yield tt

                return source
            else:
                path = literal_field(query_path[-1])

                def recurse(acc):
                    hits = acc[parent_pos].inner_hits[path].hits.hits
                    if hits:
                        for inner_row in hits:
                            acc[pos] = inner_row
                            for tt in more(acc):
                                yield tt
                    else:
                        for tt in more(acc):
                            yield tt

                return recurse

    return new_select, split_select, inners(schema.query_path, "0")

Beispiel #15

Datei anzeigen

    def flatten_many(self, docs, path="."):
        """
        :param docs: THE JSON DOCUMENTS
        :param path: FULL PATH TO THIS (INNER/NESTED) DOCUMENT
        :return: TUPLE (success, command, doc_collection) WHERE
                 success: BOOLEAN INDICATING PROPER PARSING
                 command: SCHEMA CHANGES REQUIRED TO BE SUCCESSFUL NEXT TIME
                 doc_collection: MAP FROM NESTED PATH TO INSERTION PARAMETERS:
                 {"active_columns": list, "rows": list of objects}
        """

        # TODO: COMMAND TO ADD COLUMNS
        # TODO: COMMAND TO NEST EXISTING COLUMNS
        # COLLECT AS MANY doc THAT DO NOT REQUIRE SCHEMA CHANGE

        _insertion = Data(
            active_columns=Queue(),
            rows=[]
        )
        doc_collection = {".": _insertion}
        # KEEP TRACK OF WHAT TABLE WILL BE MADE (SHORTLY)
        required_changes = []
        facts = self.container.get_or_create_facts(self.name)
        snowflake = facts.snowflake

        def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None):
            """
            :param data: the data we are pulling apart
            :param uid: the uid we are giving this doc
            :param parent_id: the parent id of this (sub)doc
            :param order: the number of siblings before this one
            :param full_path: path to this (sub)doc
            :param nested_path: list of paths, deepest first
            :param row: we will be filling this
            :return:
            """
            table = concat_field(self.name, nested_path[0])
            insertion = doc_collection[nested_path[0]]
            if not row:
                row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
                insertion.rows.append(row)

            if is_data(data):
                items = [(concat_field(full_path, k), v ) for k, v in wrap(data).leaves()]
            else:
                # PRIMITIVE VALUES
                items = [(full_path, data)]

            for cname, v in items:
                jx_type = get_jx_type(v)
                if jx_type is None:
                    continue

                insertion = doc_collection[nested_path[0]]
                if jx_type == NESTED:
                    c = first(
                        cc
                        for cc in insertion.active_columns + snowflake.columns
                        if cc.jx_type in STRUCT and untyped_column(cc.name)[0] == cname
                    )
                else:
                    c = first(
                        cc
                        for cc in insertion.active_columns + snowflake.columns
                        if cc.jx_type == jx_type and cc.name == cname
                    )

                if isinstance(c, list):
                    Log.error("confused")

                if not c:
                    # WHAT IS THE NESTING LEVEL FOR THIS PATH?
                    deeper_nested_path = "."
                    for path in snowflake.query_paths:
                        if startswith_field(cname, path[0]) and len(deeper_nested_path) < len(path):
                            deeper_nested_path = path

                    c = Column(
                        name=cname,
                        jx_type=jx_type,
                        es_type=json_type_to_sqlite_type.get(jx_type, jx_type),
                        es_column=typed_column(cname, json_type_to_sql_type.get(jx_type)),
                        es_index=table,
                        nested_path=nested_path,
                        last_updated=Date.now()
                    )
                    if jx_type == NESTED:
                        snowflake.query_paths.append(c.es_column)
                        required_changes.append({'nest': c})
                    else:
                        insertion.active_columns.add(c)
                        required_changes.append({"add": c})
                elif c.jx_type == NESTED and jx_type == OBJECT:
                    # ALWAYS PROMOTE OBJECTS TO NESTED
                    jx_type = NESTED
                    v = [v]
                elif len(c.nested_path) < len(nested_path):
                    from_doc = doc_collection.get(c.nested_path[0], None)
                    column = c.es_column
                    from_doc.active_columns.remove(c)
                    snowflake._remove_column(c)
                    required_changes.append({"nest": c})
                    deep_c = Column(
                        name=cname,
                        jx_type=jx_type,
                        es_type=json_type_to_sqlite_type.get(jx_type, jx_type),
                        es_column=typed_column(cname, json_type_to_sql_type.get(jx_type)),
                        es_index=table,
                        nested_path=nested_path,
                        last_updated=Date.now()
                    )
                    snowflake._add_column(deep_c)
                    snowflake._drop_column(c)
                    from_doc.active_columns.remove(c)

                    for r in from_doc.rows:
                        r1 = unwrap(r)
                        if column in r1:
                            row1 = {UID: self.container.next_uid(), PARENT: r1["__id__"], ORDER: 0, column: r1[column]}
                            insertion.rows.append(row1)
                elif len(c.nested_path) > len(nested_path):
                    insertion = doc_collection[c.nested_path[0]]
                    row = {UID: self.container.next_uid(), PARENT: uid, ORDER: order}
                    insertion.rows.append(row)

                # BE SURE TO NEST VALUES, IF NEEDED
                if jx_type == NESTED:
                    deeper_nested_path = [cname] + nested_path
                    if not doc_collection.get(cname):
                        doc_collection[cname] = Data(
                            active_columns=Queue(),
                            rows=[]
                        )
                    for i, r in enumerate(v):
                        child_uid = self.container.next_uid()
                        _flatten(r, child_uid, uid, i, cname, deeper_nested_path)
                elif jx_type == OBJECT:
                    _flatten(v, uid, parent_id, order, cname, nested_path, row=row)
                elif c.jx_type:
                    row[c.es_column] = v

        for doc in docs:
            _flatten(doc, self.container.next_uid(), 0, 0, full_path=path, nested_path=["."], guid=generateGuid())
            if required_changes:
                snowflake.change_schema(required_changes)
            required_changes = []

        return doc_collection

Beispiel #16

Datei anzeigen

Datei: insert_table.py Projekt: klahnakoski/annotations

    def flatten_many(self, docs, path="."):
        """
        :param docs: THE JSON DOCUMENT
        :param path: FULL PATH TO THIS (INNER/NESTED) DOCUMENT
        :return: TUPLE (success, command, doc_collection) WHERE
                 success: BOOLEAN INDICATING PROPER PARSING
                 command: SCHEMA CHANGES REQUIRED TO BE SUCCESSFUL NEXT TIME
                 doc_collection: MAP FROM NESTED PATH TO INSERTION PARAMETERS:
                 {"active_columns": list, "rows": list of objects}
        """

        # TODO: COMMAND TO ADD COLUMNS
        # TODO: COMMAND TO NEST EXISTING COLUMNS
        # COLLECT AS MANY doc THAT DO NOT REQUIRE SCHEMA CHANGE

        _insertion = Data(active_columns=set(), rows=[])
        doc_collection = {".": _insertion}
        # KEEP TRACK OF WHAT TABLE WILL BE MADE (SHORTLY)
        required_changes = []
        snowflake = self.facts.snowflake
        abs_schema = BasicSnowflake(snowflake.query_paths, snowflake.columns)

        def _flatten(data,
                     uid,
                     parent_id,
                     order,
                     full_path,
                     nested_path,
                     row=None,
                     guid=None):
            """
            :param data: the data we are pulling apart
            :param uid: the uid we are giving this doc
            :param parent_id: the parent id of this (sub)doc
            :param order: the number of siblings before this one
            :param full_path: path to this (sub)doc
            :param nested_path: list of paths, deepest first
            :param row: we will be filling this
            :return:
            """
            table = concat_field(self.name, nested_path[0])
            insertion = doc_collection[nested_path[0]]
            if not row:
                row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
                insertion.rows.append(row)

            if not isinstance(data, Mapping):
                data = {".": data}
            for k, v in data.items():
                insertion = doc_collection[nested_path[0]]
                cname = concat_field(full_path, literal_field(k))
                value_type = get_type(v)
                if value_type is None:
                    continue

                if value_type in STRUCT:
                    c = unwraplist([
                        cc for cc in abs_schema.column[cname]
                        if cc.jx_type in STRUCT
                    ])
                else:
                    c = unwraplist([
                        cc for cc in abs_schema.column[cname]
                        if cc.jx_type == value_type
                    ])

                if not c:
                    # WHAT IS THE NESTING LEVEL FOR THIS PATH?
                    deeper_nested_path = "."
                    for path in abs_schema.query_paths:
                        if startswith_field(
                                cname,
                                path) and len(deeper_nested_path) < len(path):
                            deeper_nested_path = path

                    c = Column(name=cname,
                               jx_type=value_type,
                               es_type=json_type_to_sqlite_type.get(
                                   value_type, value_type),
                               es_column=typed_column(
                                   cname,
                                   json_type_to_sql_type.get(value_type)),
                               es_index=table,
                               nested_path=nested_path,
                               last_updated=Date.now())
                    abs_schema.columns.append(c)
                    if value_type == "nested":
                        abs_schema.query_paths.append(c.es_column)
                        required_changes.append({'nest': (c, nested_path)})
                    else:
                        required_changes.append({"add": c})

                    # INSIDE IF BLOCK BECAUSE WE DO NOT WANT IT TO ADD WHAT WE columns.get() ALREADY
                    insertion.active_columns.add(c)
                elif c.jx_type == "nested" and value_type == "object":
                    value_type = "nested"
                    v = [v]
                elif len(c.nested_path) < len(nested_path):
                    from_doc = doc_collection.get(c.nested_path[0], None)
                    column = c.es_column
                    from_doc.active_columns.remove(c)
                    abs_schema._remove_column(c)
                    required_changes.append({"nest": (c, nested_path)})
                    deep_c = Column(name=cname,
                                    jx_type=value_type,
                                    es_column=typed_column(
                                        cname,
                                        json_type_to_sql_type(value_type)),
                                    es_index=table,
                                    nested_path=nested_path,
                                    last_updated=Date.now())
                    abs_schema._add_column(deep_c)
                    insertion.active_columns.add(deep_c)

                    for r in from_doc.rows:
                        r1 = unwrap(r)
                        if column in r1:
                            row1 = {
                                UID: self.next_uid(),
                                PARENT: r1["__id__"],
                                ORDER: 0,
                                column: r1[column]
                            }
                            insertion.rows.append(row1)

                elif len(c.nested_path) > len(nested_path):
                    insertion = doc_collection[c.nested_path[0]]
                    row = {UID: self.next_uid(), PARENT: uid, ORDER: order}
                    insertion.rows.append(row)

                # BE SURE TO NEST VALUES, IF NEEDED
                if value_type == "nested":
                    row[c.es_column] = "."
                    deeper_nested_path = [cname] + nested_path
                    insertion = doc_collection.get(cname, None)
                    if not insertion:
                        insertion = doc_collection[cname] = Data(
                            active_columns=set(), rows=[])
                    for i, r in enumerate(v):
                        child_uid = self.next_uid()
                        _flatten(r, child_uid, uid, i, cname,
                                 deeper_nested_path)
                elif value_type == "object":
                    row[c.es_column] = "."
                    _flatten(v,
                             uid,
                             parent_id,
                             order,
                             cname,
                             nested_path,
                             row=row)
                elif c.jx_type:
                    row[c.es_column] = v

        for doc in docs:
            _flatten(doc,
                     self.next_uid(),
                     0,
                     0,
                     full_path=path,
                     nested_path=["."],
                     guid=self.next_guid())
            if required_changes:
                snowflake.change_schema(required_changes)
            required_changes = []

        return doc_collection

Beispiel #17

Datei anzeigen

        def _flatten(data, uid, parent_id, order, full_path, nested_path, row=None, guid=None):
            """
            :param data: the data we are pulling apart
            :param uid: the uid we are giving this doc
            :param parent_id: the parent id of this (sub)doc
            :param order: the number of siblings before this one
            :param full_path: path to this (sub)doc
            :param nested_path: list of paths, deepest first
            :param row: we will be filling this
            :return:
            """
            table = concat_field(self.name, nested_path[0])
            insertion = doc_collection[nested_path[0]]
            if not row:
                row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
                insertion.rows.append(row)

            if is_data(data):
                items = [(concat_field(full_path, k), v ) for k, v in wrap(data).leaves()]
            else:
                # PRIMITIVE VALUES
                items = [(full_path, data)]

            for cname, v in items:
                jx_type = get_jx_type(v)
                if jx_type is None:
                    continue

                insertion = doc_collection[nested_path[0]]
                if jx_type == NESTED:
                    c = first(
                        cc
                        for cc in insertion.active_columns + snowflake.columns
                        if cc.jx_type in STRUCT and untyped_column(cc.name)[0] == cname
                    )
                else:
                    c = first(
                        cc
                        for cc in insertion.active_columns + snowflake.columns
                        if cc.jx_type == jx_type and cc.name == cname
                    )

                if isinstance(c, list):
                    Log.error("confused")

                if not c:
                    # WHAT IS THE NESTING LEVEL FOR THIS PATH?
                    deeper_nested_path = "."
                    for path in snowflake.query_paths:
                        if startswith_field(cname, path[0]) and len(deeper_nested_path) < len(path):
                            deeper_nested_path = path

                    c = Column(
                        name=cname,
                        jx_type=jx_type,
                        es_type=json_type_to_sqlite_type.get(jx_type, jx_type),
                        es_column=typed_column(cname, json_type_to_sql_type.get(jx_type)),
                        es_index=table,
                        nested_path=nested_path,
                        last_updated=Date.now()
                    )
                    if jx_type == NESTED:
                        snowflake.query_paths.append(c.es_column)
                        required_changes.append({'nest': c})
                    else:
                        insertion.active_columns.add(c)
                        required_changes.append({"add": c})
                elif c.jx_type == NESTED and jx_type == OBJECT:
                    # ALWAYS PROMOTE OBJECTS TO NESTED
                    jx_type = NESTED
                    v = [v]
                elif len(c.nested_path) < len(nested_path):
                    from_doc = doc_collection.get(c.nested_path[0], None)
                    column = c.es_column
                    from_doc.active_columns.remove(c)
                    snowflake._remove_column(c)
                    required_changes.append({"nest": c})
                    deep_c = Column(
                        name=cname,
                        jx_type=jx_type,
                        es_type=json_type_to_sqlite_type.get(jx_type, jx_type),
                        es_column=typed_column(cname, json_type_to_sql_type.get(jx_type)),
                        es_index=table,
                        nested_path=nested_path,
                        last_updated=Date.now()
                    )
                    snowflake._add_column(deep_c)
                    snowflake._drop_column(c)
                    from_doc.active_columns.remove(c)

                    for r in from_doc.rows:
                        r1 = unwrap(r)
                        if column in r1:
                            row1 = {UID: self.container.next_uid(), PARENT: r1["__id__"], ORDER: 0, column: r1[column]}
                            insertion.rows.append(row1)
                elif len(c.nested_path) > len(nested_path):
                    insertion = doc_collection[c.nested_path[0]]
                    row = {UID: self.container.next_uid(), PARENT: uid, ORDER: order}
                    insertion.rows.append(row)

                # BE SURE TO NEST VALUES, IF NEEDED
                if jx_type == NESTED:
                    deeper_nested_path = [cname] + nested_path
                    if not doc_collection.get(cname):
                        doc_collection[cname] = Data(
                            active_columns=Queue(),
                            rows=[]
                        )
                    for i, r in enumerate(v):
                        child_uid = self.container.next_uid()
                        _flatten(r, child_uid, uid, i, cname, deeper_nested_path)
                elif jx_type == OBJECT:
                    _flatten(v, uid, parent_id, order, cname, nested_path, row=row)
                elif c.jx_type:
                    row[c.es_column] = v

Beispiel #18

Datei anzeigen

Datei: insert_table.py Projekt: klahnakoski/annotations

        def _flatten(data,
                     uid,
                     parent_id,
                     order,
                     full_path,
                     nested_path,
                     row=None,
                     guid=None):
            """
            :param data: the data we are pulling apart
            :param uid: the uid we are giving this doc
            :param parent_id: the parent id of this (sub)doc
            :param order: the number of siblings before this one
            :param full_path: path to this (sub)doc
            :param nested_path: list of paths, deepest first
            :param row: we will be filling this
            :return:
            """
            table = concat_field(self.name, nested_path[0])
            insertion = doc_collection[nested_path[0]]
            if not row:
                row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
                insertion.rows.append(row)

            if not isinstance(data, Mapping):
                data = {".": data}
            for k, v in data.items():
                insertion = doc_collection[nested_path[0]]
                cname = concat_field(full_path, literal_field(k))
                value_type = get_type(v)
                if value_type is None:
                    continue

                if value_type in STRUCT:
                    c = unwraplist([
                        cc for cc in abs_schema.column[cname]
                        if cc.jx_type in STRUCT
                    ])
                else:
                    c = unwraplist([
                        cc for cc in abs_schema.column[cname]
                        if cc.jx_type == value_type
                    ])

                if not c:
                    # WHAT IS THE NESTING LEVEL FOR THIS PATH?
                    deeper_nested_path = "."
                    for path in abs_schema.query_paths:
                        if startswith_field(
                                cname,
                                path) and len(deeper_nested_path) < len(path):
                            deeper_nested_path = path

                    c = Column(name=cname,
                               jx_type=value_type,
                               es_type=json_type_to_sqlite_type.get(
                                   value_type, value_type),
                               es_column=typed_column(
                                   cname,
                                   json_type_to_sql_type.get(value_type)),
                               es_index=table,
                               nested_path=nested_path,
                               last_updated=Date.now())
                    abs_schema.columns.append(c)
                    if value_type == "nested":
                        abs_schema.query_paths.append(c.es_column)
                        required_changes.append({'nest': (c, nested_path)})
                    else:
                        required_changes.append({"add": c})

                    # INSIDE IF BLOCK BECAUSE WE DO NOT WANT IT TO ADD WHAT WE columns.get() ALREADY
                    insertion.active_columns.add(c)
                elif c.jx_type == "nested" and value_type == "object":
                    value_type = "nested"
                    v = [v]
                elif len(c.nested_path) < len(nested_path):
                    from_doc = doc_collection.get(c.nested_path[0], None)
                    column = c.es_column
                    from_doc.active_columns.remove(c)
                    abs_schema._remove_column(c)
                    required_changes.append({"nest": (c, nested_path)})
                    deep_c = Column(name=cname,
                                    jx_type=value_type,
                                    es_column=typed_column(
                                        cname,
                                        json_type_to_sql_type(value_type)),
                                    es_index=table,
                                    nested_path=nested_path,
                                    last_updated=Date.now())
                    abs_schema._add_column(deep_c)
                    insertion.active_columns.add(deep_c)

                    for r in from_doc.rows:
                        r1 = unwrap(r)
                        if column in r1:
                            row1 = {
                                UID: self.next_uid(),
                                PARENT: r1["__id__"],
                                ORDER: 0,
                                column: r1[column]
                            }
                            insertion.rows.append(row1)

                elif len(c.nested_path) > len(nested_path):
                    insertion = doc_collection[c.nested_path[0]]
                    row = {UID: self.next_uid(), PARENT: uid, ORDER: order}
                    insertion.rows.append(row)

                # BE SURE TO NEST VALUES, IF NEEDED
                if value_type == "nested":
                    row[c.es_column] = "."
                    deeper_nested_path = [cname] + nested_path
                    insertion = doc_collection.get(cname, None)
                    if not insertion:
                        insertion = doc_collection[cname] = Data(
                            active_columns=set(), rows=[])
                    for i, r in enumerate(v):
                        child_uid = self.next_uid()
                        _flatten(r, child_uid, uid, i, cname,
                                 deeper_nested_path)
                elif value_type == "object":
                    row[c.es_column] = "."
                    _flatten(v,
                             uid,
                             parent_id,
                             order,
                             cname,
                             nested_path,
                             row=row)
                elif c.jx_type:
                    row[c.es_column] = v

Beispiel #19

Datei anzeigen

    def __new__(cls, e=None, query=None, *args, **kwargs):
        e.allowNulls = coalesce(e.allowNulls, True)

        if e.value and e.domain.type == "default":
            # if query.groupby:
            #     return object.__new__(DefaultDecoder, e)

            if is_text(e.value):
                Log.error("Expecting Variable or Expression, not plain string")

            if is_op(e.value, LeavesOp):
                return object.__new__(ObjectDecoder)
            elif is_op(e.value, TupleOp):
                # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
                # JUST PULL THE FIELDS
                if not all(is_op(t, Variable) for t in e.value.terms):
                    Log.error("Can only handle variables in tuples")

                e.domain = Data(dimension={"fields": e.value.terms})
                return object.__new__(DimFieldListDecoder)

            elif is_op(e.value, Variable):
                schema = query.frum.schema
                cols = schema.leaves(e.value.var)
                if not cols:
                    return object.__new__(DefaultDecoder)
                if len(cols) != 1:
                    return object.__new__(ObjectDecoder)
                col = first(cols)
                limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

                if col.cardinality == None:
                    DEBUG and Log.warning(
                        "metadata for column {{name|quote}} (id={{id}}) is not ready",
                        name=concat_field(col.es_index, col.es_column),
                        id=id(col))
                    e.domain = set_default(DefaultDomain(limit=limit),
                                           e.domain.__data__())
                    return object.__new__(DefaultDecoder)
                elif col.multi <= 1 and col.partitions == None:
                    e.domain = set_default(DefaultDomain(limit=limit),
                                           e.domain.__data__())
                    return object.__new__(DefaultDecoder)
                else:
                    DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                    if col.multi > 1:
                        return object.__new__(MultivalueDecoder)

                    partitions = col.partitions[:limit:]
                    if e.domain.sort == -1:
                        partitions = list(reversed(sorted(partitions)))
                    else:
                        partitions = sorted(partitions)
                    e.domain = SimpleSetDomain(partitions=partitions,
                                               limit=limit)

            else:
                return object.__new__(DefaultDecoder)

        if e.value and e.domain.type in PARTITION:
            return object.__new__(SetDecoder)
        if isinstance(e.domain.dimension, Dimension):
            e.domain = e.domain.dimension.getDomain()
            return object.__new__(SetDecoder)
        if e.value and e.domain.type == "time":
            return object.__new__(TimeDecoder)
        if e.range:
            return object.__new__(GeneralRangeDecoder)
        if e.value and e.domain.type == "duration":
            return object.__new__(DurationDecoder)
        elif e.value and e.domain.type == "range":
            return object.__new__(RangeDecoder)
        elif not e.value and e.domain.dimension.fields:
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            fields = e.domain.dimension.fields
            if is_data(fields):
                Log.error("No longer allowed: All objects are expressions")
            else:
                return object.__new__(DimFieldListDecoder)
        elif not e.value and all(e.domain.partitions.where):
            return object.__new__(GeneralSetDecoder)
        else:
            Log.error("domain type of {{type}} is not supported yet",
                      type=e.domain.type)

Beispiel #20

Datei anzeigen

    def _run(self):
        with CProfiler():

            self.id = thread.get_ident()
            with ALL_LOCK:
                ALL[self.id] = self

            try:
                if self.target is not None:
                    a, k, self.args, self.kwargs = self.args, self.kwargs, None, None
                    response = self.target(*a, **k)
                    with self.synch_lock:
                        self.end_of_thread = Data(response=response)
                else:
                    with self.synch_lock:
                        self.end_of_thread = Null
            except Exception as e:
                e = Except.wrap(e)
                with self.synch_lock:
                    self.end_of_thread = Data(exception=e)
                if self not in self.parent.children:
                    # THREAD FAILURES ARE A PROBLEM ONLY IF NO ONE WILL BE JOINING WITH IT
                    try:
                        Log.fatal("Problem in thread {{name|quote}}", name=self.name, cause=e)
                    except Exception:
                        sys.stderr.write(b"ERROR in thread: " + str(self.name) + b" " + str(e) + b"\n")
            finally:
                try:
                    children = copy(self.children)
                    for c in children:
                        try:
                            if DEBUG:
                                sys.stdout.write(b"Stopping thread " + str(c.name) + b"\n")
                            c.stop()
                        except Exception as e:
                            Log.warning("Problem stopping thread {{thread}}", thread=c.name, cause=e)

                    for c in children:
                        try:
                            if DEBUG:
                                sys.stdout.write(b"Joining on thread " + str(c.name) + b"\n")
                            c.join()
                        except Exception as e:
                            Log.warning("Problem joining thread {{thread}}", thread=c.name, cause=e)
                        finally:
                            if DEBUG:
                                sys.stdout.write(b"Joined on thread " + str(c.name) + b"\n")

                    self.stopped.go()
                    if DEBUG:
                        Log.note("thread {{name|quote}} stopping", name=self.name)
                    del self.target, self.args, self.kwargs
                    with ALL_LOCK:
                        del ALL[self.id]

                except Exception as e:
                    if DEBUG:
                        Log.warning("problem with thread {{name|quote}}", cause=e, name=self.name)
                finally:
                    self.stopped.go()
                    if DEBUG:
                        Log.note("thread {{name|quote}} is done", name=self.name)

Beispiel #21

Datei anzeigen

# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski ([email protected])
#
from __future__ import unicode_literals

from collections import Mapping

from jx_base import container
from jx_python import containers
from mo_dots import Data
from mo_dots import wrap, set_default, split_field
from mo_logs import Log

config = Data()   # config.default IS EXPECTED TO BE SET BEFORE CALLS ARE MADE
_ListContainer = None
_meta = None


def _delayed_imports():
    global _ListContainer
    global _meta


    from jx_python import meta as _meta
    from jx_python.containers.list_usingPythonList import ListContainer as _ListContainer

    _ = _ListContainer
    _ = _meta

Beispiel #22

Datei anzeigen

Datei: dimensions.py Projekt: othmane-kada/SpotManager

    def __init__(self, dim, parent, jx):
        dim = wrap(dim)

        self.name = dim.name
        self.parent = coalesce(parent)
        self.full_name = join_field(split_field(self.parent.full_name)+[self.name])
        self.edges = None  # FOR NOW
        dot.set_default(self, dim)
        self.where = dim.where
        self.type = coalesce(dim.type, "set")
        self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
        self.index = coalesce(dim.index, coalesce(parent, Null).index, jx.settings.index)

        if not self.index:
            Log.error("Expecting an index name")

        # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
        self.edges = Data()
        for e in listwrap(dim.edges):
            new_e = Dimension(e, self, jx)
            self.edges[new_e.full_name] = new_e

        self.partitions = wrap(coalesce(dim.partitions, []))
        parse_partition(self)

        fields = coalesce(dim.field, dim.fields)
        if not fields:
            return  # NO FIELDS TO SEARCH
        elif isinstance(fields, Mapping):
            self.fields = wrap(fields)
            edges = wrap([{"name": k, "value": v, "allowNulls": False} for k, v in self.fields.items()])
        else:
            self.fields = listwrap(fields)
            edges = wrap([{"name": f, "value": f, "index": i, "allowNulls": False} for i, f in enumerate(self.fields)])

        if dim.partitions:
            return  # ALREADY HAVE PARTS
        if self.type not in KNOWN - ALGEBRAIC:
            return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

        jx.get_columns()
        with Timer("Get parts of {{name}}", {"name": self.name}):
            parts = jx.query({
                "from": self.index,
                "select": {"name": "count", "aggregate": "count"},
                "edges": edges,
                "where": self.where,
                "limit": self.limit
            })
            Log.note("{{name}} has {{num}} parts",  name= self.name,  num= len(parts))

        d = parts.edges[0].domain

        if dim.path:
            if len(edges) > 1:
                Log.error("Not supported yet")
            # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
            temp = Data(partitions=[])
            for i, count in enumerate(parts):
                a = dim.path(d.getEnd(d.partitions[i]))
                if not isinstance(a, list):
                    Log.error("The path function on " + dim.name + " must return an ARRAY of parts")
                addParts(
                    temp,
                    dim.path(d.getEnd(d.partitions[i])),
                    count,
                    0
                )
            self.value = coalesce(dim.value, "name")
            self.partitions = temp.partitions
        elif isinstance(fields, Mapping):
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

            partitions = FlatList()
            for g, p in parts.groupby(edges):
                if p:
                    partitions.append({
                        "value": g,
                        "where": {"and": [
                            {"term": {e.value: g[e.name]}}
                            for e in edges
                        ]},
                        "count": int(p)
                    })
            self.partitions = partitions
        elif len(edges) == 1:
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

            # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
            self.partitions = wrap([
                {
                    "name": str(d.partitions[i].name),  # CONVERT TO STRING
                    "value": d.getEnd(d.partitions[i]),
                    "where": {"term": {edges[0].value: d.partitions[i].value}},
                    "count": count
                }
                for i, count in enumerate(parts)
            ])
            self.order = {p.value: i for i, p in enumerate(self.partitions)}
        elif len(edges) == 2:
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
            d2 = parts.edges[1].domain

            # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
            array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

            def edges2value(*values):
                if isinstance(fields, Mapping):
                    output = Data()
                    for e, v in zip(edges, values):
                        output[e.name] = v
                    return output
                else:
                    return tuple(values)

            self.partitions = wrap([
                {
                    "name": str(d.partitions[i].name),  # CONVERT TO STRING
                    "value": d.getEnd(d.partitions[i]),
                    "where": {"term": {edges[0].value: d.partitions[i].value}},
                    "count": SUM(subcube),
                    "partitions": [
                        {
                            "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                            "value": edges2value(d.getEnd(d.partitions[i]), d2.getEnd(d2.partitions[j])),
                            "where": {"and": [
                                {"term": {edges[0].value: d.partitions[i].value}},
                                {"term": {edges[1].value: d2.partitions[j].value}}
                            ]},
                            "count": count2
                        }
                        for j, count2 in enumerate(subcube)
                        if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                    ]
                }
                for i, subcube in enumerate(array)
            ])
        else:
            Log.error("Not supported")

        parse_partition(self)  # RELATE THE PARTS TO THE PARENTS

Beispiel #23

Datei anzeigen

Datei: hg_branches.py Projekt: yoyogias2011/TUID

def _get_single_branch_from_hg(settings, description, dir):
    if dir == "users":
        return []
    response = http.get(settings.url + "/" + dir)
    doc = BeautifulSoup(response.all_content, "html.parser")

    output = []
    try:
        all_branches = doc("table")[0]
    except Exception:
        return []

    for i, b in enumerate(all_branches("tr")):
        if i == 0:
            continue  # IGNORE HEADER
        columns = b("td")

        try:
            path = columns[0].a.get('href')
            if path == "/":
                continue

            name, desc, last_used = [c.text.strip() for c in columns][0:3]

            if last_used.startswith('at'):
                last_used = last_used[2:]

            detail = Data(
                name=name.lower(),
                locale=DEFAULT_LOCALE,
                parent_name=description,
                url=settings.url + path,
                description=desc,
                last_used=Date(last_used),
                etl={"timestamp": Date.now()}
            )
            if detail.description == "unknown":
                detail.description = None

            # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR
            if path in [
                "/projects/dxr/",                   # moved to webtools
                "/build/compare-locales/",          # ?build team likes to clone?
                "/build/puppet/",                   # ?build team likes to clone?
                "/SeaMonkey/puppet/",               # looses the popularity contest
                "/releases/gaia-l10n/v1_2/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_3/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_4/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_0/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_1/en-US/",  # use default branch
                "/build/autoland/"
            ]:
                continue

            # MARKUP BRANCH IF LOCALE SPECIFIC
            if path.startswith("/l10n-central"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "mozilla-central"
            elif path.startswith("/releases/l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = _path[-2].lower()
            elif path.startswith("/releases/gaia-l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "gaia-" + _path[-2][1::]
            elif path.startswith("/weave-l10n"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "weave"

            if BRANCH_WHITELIST is not None:
                found = False
                for br in BRANCH_WHITELIST:
                    if br in str(detail.name):
                        found = True
                        break
                if not found:
                    continue

            Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale)
            output.append(detail)
        except Exception as e:
            Log.warning("branch digestion problem", cause=e)

    return output

Beispiel #24

Datei anzeigen

Datei: agg_format.py Projekt: klahnakoski/ActiveData

def format_table(aggs, es_query, query, decoders, all_selects):
    new_edges = wrap(count_dim(aggs, es_query, decoders))
    dims = tuple(len(e.domain.partitions) + (0 if e.allowNulls is False else 1) for e in new_edges)
    rank = len(dims)
    header = tuple(new_edges.name + all_selects.name)
    name2index = {s.name: i + rank for i, s in enumerate(all_selects)}

    def data():
        is_sent = Matrix(dims=dims)
        give_me_zeros = query.sort and not query.groupby
        if give_me_zeros:
            # WE REQUIRE THE ZEROS FOR SORTING
            all_coord = is_sent._all_combos()  # TRACK THE EXPECTED COMBINATIONS
            ordered_coord = next(all_coord)[::-1]
            output = None
            for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
                if coord != ordered_coord:
                    # output HAS BEEN YIELDED, BUT SET THE DEFAULT VALUES
                    if output is not None:
                        for s in all_selects:
                            i = name2index[s.name]
                            if output[i] is None:
                                output[i] = s.default
                        # WE CAN GET THE SAME coord MANY TIMES, SO ONLY ADVANCE WHEN NOT
                        ordered_coord = next(all_coord)[::-1]

                while coord != ordered_coord:
                    # HAPPENS WHEN THE coord IS AHEAD OF ordered_coord
                    record = [d.get_value(ordered_coord[i]) for i, d in enumerate(decoders)] + [s.default for s in all_selects]
                    yield record
                    ordered_coord = next(all_coord)[::-1]
                # coord == missing_coord
                output = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for s in all_selects]
                for select in ss:
                    v = select.pull(agg)
                    if v != None:
                        union(output, name2index[select.name], v, select.aggregate)
                yield output
        else:
            last_coord = None   # HANG ONTO THE output FOR A BIT WHILE WE FILL THE ELEMENTS
            output = None
            for row, coord, agg, ss in aggs_iterator(aggs, es_query, decoders):
                if coord != last_coord:
                    if output:
                        # SET DEFAULTS
                        for i, s in enumerate(all_selects):
                            v = output[rank+i]
                            if v == None:
                                output[rank+i] = s.default
                        yield output
                    output = is_sent[coord]
                    if output == None:
                        output = is_sent[coord] = [d.get_value(c) for c, d in zip(coord, decoders)] + [None for _ in all_selects]
                    last_coord = coord
                # THIS IS A TRICK!  WE WILL UPDATE A ROW THAT WAS ALREADY YIELDED
                for select in ss:
                    v = select.pull(agg)
                    if v != None:
                        union(output, name2index[select.name], v, select.aggregate)

            if output:
                # SET DEFAULTS ON LAST ROW
                for i, s in enumerate(all_selects):
                    v = output[rank+i]
                    if v == None:
                        output[rank+i] = s.default
                yield output

            # EMIT THE MISSING CELLS IN THE CUBE
            if not query.groupby:
                for coord, output in is_sent:
                    if output == None:
                        record = [d.get_value(c) for c, d in zip(coord, decoders)] + [s.default for s in all_selects]
                        yield record

    return Data(
        meta={"format": "table"},
        header=header,
        data=list(data())
    )

Beispiel #25

Datei anzeigen

from mo_logs import Log
from mo_logs.exceptions import Except
from mo_logs.strings import utf82unicode, unicode2utf8
from mo_math import Math
from mo_threads import Lock
from mo_threads import Till
from mo_times.durations import Duration
from pyLibrary import convert
from pyLibrary.env.big_data import safe_size, ibytes2ilines, icompressed2ibytes

DEBUG = False
FILE_SIZE_LIMIT = 100 * 1024 * 1024
MIN_READ_SIZE = 8 * 1024
ZIP_REQUEST = False

default_headers = Data(
)  # TODO: MAKE THIS VARIABLE A SPECIAL TYPE OF EXPECTED MODULE PARAMETER SO IT COMPLAINS IF NOT SET
default_timeout = 600
DEFAULTS = {
    "allow_redirects": True,
    "stream": True,
    "verify": True,
    "timeout": 600,
    "zip": False,
    "retry": {
        "times": 1,
        "sleep": 0,
        "http": False
    }
}
_warning_sent = False
request_count = 0

Beispiel #26

Datei anzeigen

Datei: setop.py Projekt: klahnakoski/tuid_experiment

def es_setop(es, mvel, query):
    FromES = es09.util.build_es_query(query)
    select = listwrap(query.select)

    isDeep = len(split_field(
        query.frum.name)) > 1  # LOOKING INTO NESTED WILL REQUIRE A SCRIPT
    isComplex = OR([
        s.value == None and s.aggregate not in ("count", "none")
        for s in select
    ])  # CONVERTING esfilter DEFINED PARTS WILL REQUIRE SCRIPT

    if not isDeep and not isComplex:
        if len(select) == 1 and isinstance(select[0].value, LeavesOp):
            FromES = wrap({
                "query": {
                    "bool": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": query.where.to_esfilter()
                    }
                },
                "sort": query.sort,
                "size": 0
            })
        elif all(isinstance(v, Variable) for v in select.value):
            FromES = wrap({
                "query": {
                    "bool": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": query.where.to_esfilter()
                    }
                },
                "fields": select.value,
                "sort": query.sort,
                "size": coalesce(query.limit, 200000)
            })
    elif not isDeep:
        simple_query = query.copy()
        simple_query.where = TRUE  # THE FACET FILTER IS FASTER
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(simple_query),
                "size": coalesce(simple_query.limit, 200000)
            },
            "facet_filter": jx_expression(query.where).to_esfilter()
        }
    else:
        FromES.facets.mvel = {
            "terms": {
                "script_field": mvel.code(query),
                "size": coalesce(query.limit, 200000)
            },
            "facet_filter": jx_expression(query.where).to_esfilter()
        }

    data = es_post(es, FromES, query.limit)

    if len(select) == 1 and isinstance(select[0].value, LeavesOp):
        # SPECIAL CASE FOR SINGLE COUNT
        cube = wrap(data).hits.hits._source
    elif isinstance(select[0].value, Variable):
        # SPECIAL CASE FOR SINGLE TERM
        cube = wrap(data).hits.hits.fields
    else:
        data_list = unpack_terms(data.facets.mvel, select)
        if not data_list:
            cube = Cube(select, [], {s.name: Matrix.wrap([]) for s in select})
        else:
            output = zip(*data_list)
            cube = Cube(
                select, [],
                {s.name: Matrix(list=output[i])
                 for i, s in enumerate(select)})

    return Data(meta={"esquery": FromES}, data=cube)

Beispiel #27

Datei anzeigen

Datei: setop_table.py Projekt: nknick99/MySQL-to-S3

        def _accumulate_nested(rows, row, nested_doc_details, parent_doc_id, parent_id_coord):
            """
            :param rows: REVERSED STACK OF ROWS (WITH push() AND pop())
            :param row: CURRENT ROW BEING EXTRACTED
            :param nested_doc_details: {
                    "nested_path": wrap_nested_path(nested_path),
                    "index_to_column": map from column number to column details
                    "children": all possible direct decedents' nested_doc_details
                 }
            :param parent_doc_id: the id of the parent doc (for detecting when to step out of loop)
            :param parent_id_coord: the column number for the parent id (so we ca extract from each row)
            :return: the nested property (usually an array)
            """
            previous_doc_id = None
            doc = Data()
            output = []
            id_coord = nested_doc_details['id_coord']

            while True:
                doc_id = row[id_coord]

                if doc_id == None or (parent_id_coord is not None and row[parent_id_coord] != parent_doc_id):
                    rows.append(row)  # UNDO PREVIOUS POP (RECORD IS NOT A NESTED RECORD OF parent_doc)
                    return output

                if doc_id != previous_doc_id:
                    previous_doc_id = doc_id
                    doc = Data()
                    curr_nested_path = nested_doc_details['nested_path'][0]
                    index_to_column = nested_doc_details['index_to_column'].items()
                    if index_to_column:
                        for i, c in index_to_column:
                            value = row[i]
                            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                relative_path = concat_field(c.push_name, c.push_child)
                            else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                relative_path = c.push_child

                            if relative_path == ".":
                                if value == '':
                                    doc = Null
                                else:
                                    doc = value
                            elif value != None and value != '':
                                doc[relative_path] = value

                for child_details in nested_doc_details['children']:
                    # EACH NESTED TABLE MUST BE ASSEMBLED INTO A LIST OF OBJECTS
                    child_id = row[child_details['id_coord']]
                    if child_id is not None:
                        nested_value = _accumulate_nested(rows, row, child_details, doc_id, id_coord)
                        if nested_value:
                            push_name = child_details['nested_path'][0]
                            if isinstance(query.select, list) or isinstance(query.select.value, LeavesOp):
                                # ASSIGN INNER PROPERTIES
                                relative_path = relative_field(push_name, curr_nested_path)
                            else:  # FACT IS EXPECTED TO BE A SINGLE VALUE, NOT AN OBJECT
                                relative_path = "."

                            if relative_path == "." and doc is Null:
                                doc = nested_value
                            elif relative_path == ".":
                                doc = unwraplist(nested_value)
                            else:
                                doc[relative_path] = unwraplist(nested_value)

                output.append(doc)

                try:
                    row = rows.pop()
                except IndexError:
                    return output

Beispiel #28

Datei anzeigen

Datei: aggs.py Projekt: ubdussamad/mo-parquet

def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    select = listwrap(query.select)

    es_query = Data()
    new_select = Data()  # MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            if schema.query_path == ".":
                s.pull = jx_expression_to_function("doc_count")
            else:
                s.pull = jx_expression_to_function({"coalesce": ["_nested.doc_count", "doc_count", 0]})
        elif isinstance(s.value, Variable):
            if s.aggregate == "count":
                new_select["count_"+literal_field(s.value.var)] += [s]
            else:
                new_select[literal_field(s.value.var)] += [s]
        else:
            formula.append(s)

    for canonical_name, many in new_select.items():
        for s in many:
            es_cols = frum.schema.values(s.value.var)

            if s.aggregate == "count":
                canonical_names = []
                for es_col in es_cols:
                    cn = literal_field(es_col.es_column + "_count")
                    if es_col.type == EXISTS:
                        canonical_names.append(cn + ".doc_count")
                        es_query.aggs[cn].filter.range = {es_col.es_column: {"gt": 0}}
                    else:
                        canonical_names.append(cn+ ".value")
                        es_query.aggs[cn].value_count.field = es_col.es_column
                if len(es_cols) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0])
                else:
                    s.pull = jx_expression_to_function({"add": canonical_names})
            elif s.aggregate == "median":
                if len(es_cols) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = es_cols[0].es_column
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = jx_expression_to_function(key + ".values.50\.0")
            elif s.aggregate == "percentile":
                if len(es_cols) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(s.percentile, text_type) or s.percetile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = es_cols[0].es_column
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
            elif s.aggregate == "cardinality":
                canonical_names = []
                for es_col in es_cols:
                    cn = literal_field(es_col.es_column + "_cardinality")
                    canonical_names.append(cn)
                    es_query.aggs[cn].cardinality.field = es_col.es_column
                if len(es_cols) == 1:
                    s.pull = jx_expression_to_function(canonical_names[0] + ".value")
                else:
                    s.pull = jx_expression_to_function({"add": [cn + ".value" for cn in canonical_names], "default": 0})
            elif s.aggregate == "stats":
                if len(es_cols) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = es_cols[0].es_column

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + "_percentile")
                es_query.aggs[median_name].percentiles.field = es_cols[0].es_column
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = get_pull_stats(stats_name, median_name)
            elif s.aggregate == "union":
                pulls = []
                for es_col in es_cols:
                    script = {"scripted_metric": {
                        'init_script': 'params._agg.terms = new HashSet()',
                        'map_script': 'for (v in doc['+quote(es_col.es_column)+'].values) params._agg.terms.add(v)',
                        'combine_script': 'return params._agg.terms.toArray()',
                        'reduce_script': 'HashSet output = new HashSet(); for (a in params._aggs) { if (a!=null) for (v in a) {output.add(v)} } return output.toArray()',
                    }}
                    stats_name = encode_property(es_col.es_column)
                    if es_col.nested_path[0] == ".":
                        es_query.aggs[stats_name] = script
                        pulls.append(jx_expression_to_function(stats_name + ".value"))
                    else:
                        es_query.aggs[stats_name] = {
                            "nested": {"path": es_col.nested_path[0]},
                            "aggs": {"_nested": script}
                        }
                        pulls.append(jx_expression_to_function(stats_name + "._nested.value"))

                if len(pulls) == 0:
                    s.pull = NULL
                elif len(pulls) == 1:
                    s.pull = pulls[0]
                else:
                    s.pull = lambda row: UNION(p(row) for p in pulls)
            else:
                if len(es_cols) > 1:
                    Log.error("Do not know how to count columns with more than one type (script probably)")

                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].extended_stats.field = es_cols[0].es_column
                s.pull = jx_expression_to_function({"coalesce": [literal_field(canonical_name) + "." + aggregates[s.aggregate], s.default]})

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)

        if isinstance(s.value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple", agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = s.value.partial_eval().to_painless(schema).script(schema)
            s.pull = jx_expression_to_function(literal_field(canonical_name) + ".value")
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = jx_expression_to_function(key + ".values.50\.0")
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = s.value.to_painless(schema).script(schema)
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = jx_expression_to_function(key + ".values." + literal_field(text_type(percent)))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = s.value.to_painless(schema).script(schema)
            s.pull = jx_expression_to_function(key + ".value")
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = s.value.to_painless(schema).script(schema)

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = s.value.to_painless(schema).script(schema)
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = get_pull_stats(stats_name, median_name)
        elif s.aggregate=="union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = s.value.to_painless(schema).script(schema)
            s.pull = jx_expression_to_function(stats_name + ".buckets.key")
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = jx_expression_to_function(canonical_name + "." + aggregates[s.aggregate])
            es_query.aggs[canonical_name].extended_stats.script = s.value.to_painless(schema).script(schema)

    decoders = get_decoders_by_depth(query)
    start = 0

    #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = AndOp("and", split_where[1]).to_esfilter(schema)
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {
                    "nested": {
                        "path": schema.query_path
                    }
                },
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    if decoders:
        for d in jx.reverse(decoders[0]):
            es_query = d.append_query(es_query, start)
            start += d.num_columns

    if split_where[0]:
        #TODO: INCLUDE FILTERS ON EDGES
        filter = AndOp("and", split_where[0]).to_esfilter(schema)
        es_query = Data(
            aggs={"_filter": set_default({"filter": filter}, es_query)}
        )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es_post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", cause=e)

Beispiel #29

Datei anzeigen

def es_aggsop(es, frum, query):
    select = wrap([s.copy() for s in listwrap(query.select)])
    # [0] is a cheat; each es_column should be a dict of columns keyed on type, like in sqlite
    es_column_map = {v: frum.schema[v][0].es_column for v in query.vars()}

    es_query = Data()
    new_select = Data(
    )  #MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(
                s.value, Variable) and s.value.var == ".":
            s.pull = "doc_count"
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                if frum.typed:
                    # STATISITCAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING
                    if s.aggregate in NON_STATISTICAL_AGGS:
                        #TODO: HANDLE BOTH $value AND $objects TO COUNT
                        Log.error("do not know how to handle")
                    else:
                        s.value.var = "$value"
                        new_select["$value"] += [s]
                else:
                    if s.aggregate in NON_STATISTICAL_AGGS:
                        #TODO:  WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT
                        Log.error("do not know how to handle")
                    else:
                        Log.error(
                            'Not expecting ES to have a value at "." which {{agg}} can be applied',
                            agg=s.aggregate)
            elif s.aggregate == "count":
                s.value = s.value.map(es_column_map)
                new_select["count_" + literal_field(s.value.var)] += [s]
            else:
                s.value = s.value.map(es_column_map)
                new_select[literal_field(s.value.var)] += [s]
        else:
            formula.append(s)

    for canonical_name, many in new_select.items():
        representative = many[0]
        if representative.value.var == ".":
            Log.error("do not know how to handle")
        else:
            field_name = representative.value.var

        # canonical_name=literal_field(many[0].name)
        for s in many:
            if s.aggregate == "count":
                es_query.aggs[literal_field(
                    canonical_name)].value_count.field = field_name
                s.pull = literal_field(canonical_name) + ".value"
            elif s.aggregate == "median":
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = key + ".values.50\.0"
            elif s.aggregate == "percentile":
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(
                        s.percentile,
                        basestring) or s.percetile < 0 or 1 < s.percentile:
                    Log.error(
                        "Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = key + ".values." + literal_field(unicode(percent))
            elif s.aggregate == "cardinality":
                # ES USES DIFFERENT METHOD FOR CARDINALITY
                key = literal_field(canonical_name + " cardinality")

                es_query.aggs[key].cardinality.field = field_name
                s.pull = key + ".value"
            elif s.aggregate == "stats":
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = field_name

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + " percentile")
                es_query.aggs[median_name].percentiles.field = field_name
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = {
                    "count": stats_name + ".count",
                    "sum": stats_name + ".sum",
                    "min": stats_name + ".min",
                    "max": stats_name + ".max",
                    "avg": stats_name + ".avg",
                    "sos": stats_name + ".sum_of_squares",
                    "std": stats_name + ".std_deviation",
                    "var": stats_name + ".variance",
                    "median": median_name + ".values.50\.0"
                }
            elif s.aggregate == "union":
                # USE TERMS AGGREGATE TO SIMULATE union
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].terms.field = field_name
                es_query.aggs[stats_name].terms.size = Math.min(
                    s.limit, MAX_LIMIT)
                s.pull = stats_name + ".buckets.key"
            else:
                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(
                    canonical_name)].extended_stats.field = field_name
                s.pull = literal_field(canonical_name) + "." + aggregates1_4[
                    s.aggregate]

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)
        abs_value = s.value.map(es_column_map)

        if isinstance(abs_value, TupleOp):
            if s.aggregate == "count":
                # TUPLES ALWAYS EXIST, SO COUNTING THEM IS EASY
                s.pull = "doc_count"
            else:
                Log.error("{{agg}} is not a supported aggregate over a tuple",
                          agg=s.aggregate)
        elif s.aggregate == "count":
            es_query.aggs[literal_field(
                canonical_name)].value_count.script = abs_value.to_ruby()
            s.pull = literal_field(canonical_name) + ".value"
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = key + ".values.50\.0"
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = key + ".values." + literal_field(unicode(percent))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = abs_value.to_ruby()
            s.pull = key + ".value"
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[
                stats_name].extended_stats.script = abs_value.to_ruby()

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = abs_value.to_ruby()
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = {
                "count": stats_name + ".count",
                "sum": stats_name + ".sum",
                "min": stats_name + ".min",
                "max": stats_name + ".max",
                "avg": stats_name + ".avg",
                "sos": stats_name + ".sum_of_squares",
                "std": stats_name + ".std_deviation",
                "var": stats_name + ".variance",
                "median": median_name + ".values.50\.0"
            }
        elif s.aggregate == "union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby()
            s.pull = stats_name + ".buckets.key"
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
            es_query.aggs[
                canonical_name].extended_stats.script = abs_value.to_ruby()

    decoders = get_decoders_by_depth(query)
    start = 0

    vars_ = query.where.vars()

    #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum.schema)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = simplify_esfilter(
                AndOp("and", split_where[1]).to_esfilter())
            es_query = Data(
                aggs={"_filter": set_default({"filter": filter_}, es_query)})

        es_query = wrap({
            "aggs": {
                "_nested":
                set_default({"nested": {
                    "path": frum.query_path
                }}, es_query)
            }
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    if decoders:
        for d in jx.reverse(decoders[0]):
            es_query = d.append_query(es_query, start)
            start += d.num_columns

    if split_where[0]:
        #TODO: INCLUDE FILTERS ON EDGES
        filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter())
        es_query = Data(
            aggs={"_filter": set_default({"filter": filter}, es_query)})
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(
                result.aggregations.doc_count,
                result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[
                query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query,
                                   select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations,
                                           start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start,
                                         query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet",
                      format=query.format,
                      cause=e)
        Log.error("Some problem", e)

Beispiel #30

Datei anzeigen

Datei: decoders.py Projekt: cnmade/ActiveData

 def get_value_from_row(self, row):
     output = Data()
     for k, v in zip(self.put,
                     row[self.start:self.start + self.num_columns:]):
         output[k] = v["key"]
     return output