Example #1
def format_cube(decoders, aggs, start, query, select):
    # decoders = sorted(decoders, key=lambda d: -d.edge.dim)  # REVERSE DECODER ORDER, BECAUSE ES QUERY WAS BUILT IN REVERSE ORDER
    new_edges = count_dim(aggs, decoders)

    dims = []
    for e in new_edges:
        if isinstance(e.value, TupleOp):
            e.allowNulls = False

        extra = 0 if e.allowNulls is False else 1
        dims.append(len(e.domain.partitions) + extra)

    dims = tuple(dims)
    matricies = [(s, Matrix(dims=dims, zeros=s.default)) for s in select]
    for row, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                v = s.pull(agg)
                m[coord] = v
            except Exception as e:
                # THIS HAPPENS WHEN ES RETURNS MORE TUPLE COMBINATIONS THAN DOCUMENTS
                if agg.get('doc_count') != 0:
                    Log.error("Programmer error", cause=e)

    cube = Cube(
        query.select,
        sort_using_key(
            new_edges,
            key=lambda e: e.dim),  # ENSURE EDGES ARE IN SAME ORDER AS QUERY
        {s.name: m
         for s, m in matricies})
    cube.frum = query
    return cube
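All of these examples revolve around sort_using_key. A minimal stand-in (assuming it simply behaves like Python's built-in sorted() with a key= callable, which is what every call site above relies on) makes the edge-reordering step easy to test in isolation:

# Hypothetical stand-in for sort_using_key, assuming it returns the items of
# `data` ordered by the value produced by `key` (i.e. sorted(data, key=key)).
def sort_using_key(data, key):
    return sorted(data, key=key)

# Mirrors the cube step above: put edges back in the order the query gave them.
class Edge:
    def __init__(self, dim):
        self.dim = dim

edges = [Edge(2), Edge(0), Edge(1)]
ordered = sort_using_key(edges, key=lambda e: e.dim)
assert [e.dim for e in ordered] == [0, 1, 2]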
Example #2
def _dict2json(value, sub_schema, path, net_new_properties, buffer):
    prefix = "{"
    for k, v in sort_using_key(value.items(), lambda r: r[0]):
        if v == None or v == "":
            continue
        append(buffer, prefix)
        prefix = COMMA
        if is_binary(k):
            k = k.decode("utf8")
        if not is_text(k):
            Log.error("Expecting property name to be a string")
        if k not in sub_schema:
            sub_schema[k] = {}
            net_new_properties.append(path + [k])
        append(buffer, quote(encode_property(k)))
        append(buffer, COLON)
        typed_encode(v, sub_schema[k], path + [k], net_new_properties, buffer)
    if prefix is COMMA:
        append(buffer, COMMA)
        append(buffer, QUOTED_EXISTS_KEY)
        append(buffer, "1}")
    else:
        append(buffer, "{")
        append(buffer, QUOTED_EXISTS_KEY)
        append(buffer, "1}")
Example #3
def _dict2json(value, sub_schema, path, net_new_properties, buffer):
    prefix = '{'
    for k, v in sort_using_key(value.items(), lambda r: r[0]):
        if v == None or v == '':
            continue
        append(buffer, prefix)
        prefix = COMMA
        if is_binary(k):
            k = utf82unicode(k)
        if not is_text(k):
            Log.error("Expecting property name to be a string")
        if k not in sub_schema:
            sub_schema[k] = {}
            net_new_properties.append(path + [k])
        append(buffer, encode_basestring(encode_property(k)))
        append(buffer, COLON)
        typed_encode(v, sub_schema[k], path + [k], net_new_properties, buffer)
    if prefix is COMMA:
        append(buffer, COMMA)
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
    else:
        append(buffer, '{')
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
Example #4
def format_cube(decoders, aggs, start, query, select):
    # decoders = sorted(decoders, key=lambda d: -d.edge.dim)  # REVERSE DECODER ORDER, BECAUSE ES QUERY WAS BUILT IN REVERSE ORDER
    new_edges = count_dim(aggs, decoders)

    dims = []
    for e in new_edges:
        if isinstance(e.value, TupleOp):
            e.allowNulls = False

        extra = 0 if e.allowNulls is False else 1
        dims.append(len(e.domain.partitions) + extra)

    dims = tuple(dims)
    matricies = [(s, Matrix(dims=dims, zeros=s.default)) for s in select]
    for row, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                v = s.pull(agg)
                m[coord] = v
            except Exception as e:
                # THIS HAPPENS WHEN ES RETURNS MORE TUPLE COMBINATIONS THAN DOCUMENTS
                if agg.get('doc_count') != 0:
                    Log.error("Programmer error", cause=e)

    cube = Cube(
        query.select,
        sort_using_key(new_edges, key=lambda e: e.dim),  # ENSURE EDGES ARE IN SAME ORDER AS QUERY
        {s.name: m for s, m in matricies}
    )
    cube.frum = query
    return cube
Example #5
def _dict2json(value, sub_schema, path, net_new_properties, buffer):
    prefix = '{'
    for k, v in sort_using_key(value.items(), lambda r: r[0]):
        if v == None or v == '':
            continue
        append(buffer, prefix)
        prefix = COMMA
        if is_binary(k):
            k = utf82unicode(k)
        if not is_text(k):
            Log.error("Expecting property name to be a string")
        if k not in sub_schema:
            sub_schema[k] = {}
            net_new_properties.append(path + [k])
        append(buffer, encode_basestring(encode_property(k)))
        append(buffer, COLON)
        typed_encode(v, sub_schema[k], path + [k], net_new_properties, buffer)
    if prefix is COMMA:
        append(buffer, COMMA)
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
    else:
        append(buffer, '{')
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
Example #6
    def _get_queue(self, row):
        row = wrap(row)
        if row.json:
            row.value, row.json = json2value(row.json), None
        timestamp = Date(self.rollover_field(row.value))
        if timestamp == None:
            return Null
        elif timestamp < Date.today() - self.rollover_max:
            return DATA_TOO_OLD

        rounded_timestamp = timestamp.floor(self.rollover_interval)
        with self.locker:
            queue = self.known_queues.get(rounded_timestamp.unix)
        if queue == None:
            candidates = sort_using_key(
                filter(
                    lambda r: re.match(
                        re.escape(self.settings.index) + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d$",
                        r['index']
                    ),
                    self.cluster.get_aliases()
                ),
                key=lambda r: r['index']
            )
            best = None
            for c in candidates:
                c = wrap(c)
                c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
                if timestamp > c.date:
                    best = c
            if not best or rounded_timestamp > best.date:
                if rounded_timestamp < wrap(candidates[-1]).date:
                    es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
                else:
                    try:
                        es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                        es.add_alias(self.settings.index)
                    except Exception as e:
                        e = Except.wrap(e)
                        if "IndexAlreadyExistsException" not in e:
                            Log.error("Problem creating index", cause=e)
                        return self._get_queue(row)  # TRY AGAIN
            else:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

            def refresh(please_stop):
                try:
                    es.set_refresh_interval(seconds=60 * 10, timeout=5)
                except Exception:
                    Log.note("Could not set refresh interval for {{index}}", index=es.settings.index)

            Thread.run("refresh", refresh)

            self._delete_old_indexes(candidates)
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue
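The candidate selection above filters cluster aliases whose index name is the configured prefix followed by a YYYYMMDD_HHMMSS suffix, then sorts them by name so the newest rollover index comes last. A hedged, self-contained sketch of that filter (the prefix and alias data below are made up):

import re

settings_index = "task"                        # hypothetical configured prefix
aliases = [{"index": "task20230101_000000"},
           {"index": "other20230101_000000"},  # wrong prefix, filtered out
           {"index": "task20240101_000000"}]

pattern = re.escape(settings_index) + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d$"
candidates = sorted((a for a in aliases if re.match(pattern, a["index"])),
                    key=lambda a: a["index"])  # lexical sort == chronological sort here
assert [c["index"] for c in candidates] == ["task20230101_000000", "task20240101_000000"]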
Example #7
def format_cube(aggs, es_query, query, decoders, all_selects):
    new_edges = count_dim(aggs, es_query, decoders)

    dims = []
    for e in new_edges:
        if isinstance(e.value, TupleOp):
            e.allowNulls = False

        extra = 0 if e.allowNulls is False else 1
        dims.append(len(e.domain.partitions) + extra)

    dims = tuple(dims)
    if any(s.default != canonical_aggregates[s.aggregate].default
           for s in all_selects):
        # UNUSUAL DEFAULT VALUES MESS UP THE union() FUNCTION
        is_default = Matrix(dims=dims, zeros=True)
        matricies = {s.name: Matrix(dims=dims) for s in all_selects}
        for row, coord, agg, selects in aggs_iterator(aggs, es_query,
                                                      decoders):
            for select in selects:
                m = matricies[select.name]
                v = select.pull(agg)
                if v == None:
                    continue
                is_default[coord] = False
                union(m, coord, v, select.aggregate)

        # FILL THE DEFAULT VALUES
        for c, v in is_default:
            if v:
                for s in all_selects:
                    matricies[s.name][c] = s.default
    else:
        matricies = {
            s.name: Matrix(dims=dims, zeros=s.default)
            for s in all_selects
        }
        for row, coord, agg, selects in aggs_iterator(aggs, es_query,
                                                      decoders):
            for select in selects:
                m = matricies[select.name]
                v = select.pull(agg)
                union(m, coord, v, select.aggregate)

    cube = Cube(
        query.select,
        sort_using_key(
            new_edges,
            key=lambda e: e.dim),  # ENSURE EDGES ARE IN SAME ORDER AS QUERY
        matricies)
    cube.frum = query
    return cube
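This variant of format_cube defers default values: when a select has a non-standard default, cells start empty, union() only touches coordinates that actually received data, and the default is written to the untouched coordinates afterwards. A toy version of that pattern (the dict and max-union below are only stand-ins for Matrix and the aggregate-specific union()):

cells = {}        # stands in for the Matrix, keyed by coordinate tuple
touched = set()   # inverse of the is_default matrix above

def union(coord, v):
    # stand-in union: keep the max seen for a cell (the real one depends on the aggregate)
    cells[coord] = max(cells.get(coord, v), v)
    touched.add(coord)

union((0,), 3)
union((0,), 5)

default = -1      # an "unusual" default that would corrupt a max-union if pre-filled
for coord in [(0,), (1,)]:
    if coord not in touched:
        cells[coord] = default

assert cells == {(0,): 5, (1,): -1}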
Example #8
    def to_bq_schema(self):
        top_fields = []

        def _schema_to_bq_schema(jx_path, es_path, schema):
            output = []
            nt = schema.get(NESTED_TYPE)
            if nt:
                schema = {NESTED_TYPE: nt}
            for t, sub_schema in jx.sort(schema.items(), 0):
                bqt = typed_to_bq_type.get(
                    t, {"field_type": "RECORD", "mode": "NULLABLE"}
                )
                full_name = es_path + escape_name(t)
                top_field = self._top_level_fields.get(text(full_name))
                if is_text(sub_schema):
                    new_field_type = json_type_to_bq_type.get(sub_schema, sub_schema)
                    if new_field_type != bqt["field_type"]:
                        # OVERRIDE TYPE
                        bqt = bqt.copy()
                        bqt["field_type"] = new_field_type
                    fields = ()
                else:
                    fields = _schema_to_bq_schema(jx_path + (t,), full_name, sub_schema)

                if top_field:
                    if fields:
                        Log.error("not expecting a structure")
                    if self._partition.field == top_field:
                        if bqt["field_type"] != "TIMESTAMP":
                            Log.error("Partition field must be of time type")
                    struct = SchemaField(name=top_field, fields=fields, **bqt)
                    top_fields.append(struct)
                elif not fields and bqt["field_type"] == "RECORD":
                    # THIS CAN HAPPEN WHEN WE MOVE A PRIMITIVE FIELD TO top_fields
                    pass
                else:
                    struct = SchemaField(
                        name=text(escape_name(t)), fields=fields, **bqt
                    )
                    output.append(struct)
            return output

        _ = self.columns  # ENSURE schema HAS BEEN PROCESSED
        if not self.schema:
            return []
        main_schema = _schema_to_bq_schema((), ApiName(), self.schema)
        output = sort_using_key(top_fields, key=lambda v: v.name) + main_schema
        return output
Example #9
    def groupby(self, keys, contiguous=False):
        try:
            keys = listwrap(keys)
            get_key = jx_expression_to_function(keys)
            if not contiguous:
                data = sort_using_key(self.data, key=get_key)
            else:
                data = self.data  # CALLER PROMISES EQUAL KEYS ARE ALREADY ADJACENT

            def _output():
                for g, v in itertools.groupby(data, get_key):
                    group = Data()
                    for k, gg in zip(keys, g):
                        group[k] = gg
                    yield (group, wrap(list(v)))

            return _output()
        except Exception as e:
            Log.error("Problem grouping", e)
Example #10
    def get_parquet_metadata(self, path='.'):
        """
        OUTPUT PARQUET METADATA COLUMNS
        :param path: FOR INTERNAL USE
        :return: LIST OF SchemaElement
        """
        children = []
        for name, child_schema in sort_using_key(self.more.items(),
                                                 lambda p: p[0]):
            children.extend(
                child_schema.get_parquet_metadata(concat_field(path, name)))
        if self.element.type:
            children.append(self.element)

        return [
            parquet_thrift.SchemaElement(name=path, num_children=len(children))
        ] + children
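get_parquet_metadata walks the schema tree depth-first, visiting child schemas in sorted-name order and emitting the parent element before its children. A plain-Python sketch of that flattening, with a hypothetical Node class standing in for the schema (the real method emits parquet_thrift.SchemaElement objects):

class Node:
    def __init__(self, children=None):
        self.more = children or {}   # name -> child schema, like self.more above

def flatten(node, path="."):
    out = []
    # children are visited in sorted-name order, mirroring sort_using_key(self.more.items(), ...)
    for name, child in sorted(node.more.items(), key=lambda p: p[0]):
        out.extend(flatten(child, name if path == "." else path + "." + name))
    return [path] + out              # parent element first, then the flattened children

root = Node({"b": Node(), "a": Node({"x": Node()})})
assert flatten(root) == [".", "a", "a.x", "b"]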
Example #11
    def groupby(self, keys, contiguous=False):
        try:
            keys = listwrap(keys)
            get_key = jx_expression_to_function(keys)
            if not contiguous:
                data = sort_using_key(self.data, key=get_key)
            else:
                data = self.data  # CALLER PROMISES EQUAL KEYS ARE ALREADY ADJACENT

            def _output():
                for g, v in itertools.groupby(data, get_key):
                    group = Data()
                    for k, gg in zip(keys, g):
                        group[k] = gg
                    yield (group, wrap(list(v)))

            return _output()
        except Exception as e:
            Log.error("Problem grouping", e)
Example #12
def format_cube(aggs, es_query, query, decoders, all_selects):
    new_edges = count_dim(aggs, es_query, decoders)

    dims = []
    for e in new_edges:
        if is_op(e.value, TupleOp):
            e.allowNulls = False

        extra = 0 if e.allowNulls is False else 1
        dims.append(len(e.domain.partitions) + extra)

    dims = tuple(dims)
    if any(s.default != canonical_aggregates[s.aggregate].default for s in all_selects):
        # UNUSUAL DEFAULT VALUES MESS UP THE union() FUNCTION
        is_default = Matrix(dims=dims, zeros=True)
        matricies = {s.name: Matrix(dims=dims) for s in all_selects}
        for row, coord, agg, selects in aggs_iterator(aggs, es_query, decoders):
            for select in selects:
                m = matricies[select.name]
                v = select.pull(agg)
                if v == None:
                    continue
                is_default[coord] = False
                union(m, coord, v, select.aggregate)

        # FILL THE DEFAULT VALUES
        for c, v in is_default:
            if v:
                for s in all_selects:
                    matricies[s.name][c] = s.default
    else:
        matricies = {s.name: Matrix(dims=dims, zeros=s.default) for s in all_selects}
        for row, coord, agg, selects in aggs_iterator(aggs, es_query, decoders):
            for select in selects:
                m = matricies[select.name]
                v = select.pull(agg)
                union(m, coord, v, select.aggregate)

    cube = Cube(
        query.select,
        sort_using_key(new_edges, key=lambda e: e.dim),  # ENSURE EDGES ARE IN SAME ORDER AS QUERY
        matricies
    )
    cube.frum = query
    return cube
Example #13
    def get_parquet_metadata(
        self,
        path='.'
    ):
        """
        OUTPUT PARQUET METADATA COLUMNS
        :param path: FOR INTERNAL USE
        :return: LIST OF SchemaElement
        """
        children = []
        for name, child_schema in sort_using_key(self.more.items(), lambda p: p[0]):
            children.extend(child_schema.get_parquet_metadata(concat_field(path, name)))

        if path == '.':
            return children
        else:
            self.element.num_children = len(children)
            return [self.element] + children
Example #14
    def _dict2json(self, value, sub_schema, path, net_new_properties, _buffer):
        prefix = '{'
        for k, v in sort_using_key(value.items(), lambda r: r[0]):
            if v == None or v == '':
                continue
            append(_buffer, prefix)
            prefix = COMMA
            if isinstance(k, binary_type):
                k = utf82unicode(k)
            if not isinstance(k, text_type):
                Log.error("Expecting property name to be a string")
            if k not in sub_schema:
                sub_schema[k] = {}
                net_new_properties.append(path + [k])
            append(_buffer, json.dumps(encode_property(k)))
            append(_buffer, COLON)
            self._typed_encode(v, sub_schema[k], path + [k], net_new_properties, _buffer)
        if prefix == COMMA:
            append(_buffer, COMMA + QUOTED_EXISTS_TYPE + COLON + '1}')
        else:
            append(_buffer, '{' + QUOTED_EXISTS_TYPE + COLON + '0}')
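All the _dict2json variants close the object with an "exists" marker, so even an object that wrote no properties still records its existence. A tiny illustration of the two closing branches; the constant values below are hypothetical stand-ins for the module's own COMMA/COLON/QUOTED_EXISTS_TYPE:

COMMA, COLON = ",", ":"
QUOTED_EXISTS_TYPE = '"~e~"'   # hypothetical quoted marker name

def close_object(prefix, buffer):
    # mirrors the tail of the method above: non-empty objects get ,"~e~":1}
    # while an object that wrote nothing becomes {"~e~":0}
    if prefix == COMMA:
        buffer.append(COMMA + QUOTED_EXISTS_TYPE + COLON + "1}")
    else:
        buffer.append("{" + QUOTED_EXISTS_TYPE + COLON + "0}")

buf = ['{', '"a"', ':', '1']
close_object(COMMA, buf)
assert "".join(buf) == '{"a":1,"~e~":1}'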
Example #15
        def follow_paths(position, path, nested_path, done_relations,
                         no_nested_docs):
            if position.name in self.settings.exclude:
                return

            if self.path_not_allowed(path):
                return
            if DEBUG:
                Log.note("Trace {{path}}", path=path)
            if position.name != "__ids__":
                # USED TO CONFIRM WE CAN ACCESS THE TABLE (WILL THROW ERROR IF IT FAILS)
                self.db.query(
                    ConcatSQL(
                        SQL_SELECT,
                        SQL_STAR,
                        SQL_FROM,
                        quote_column(position.schema, position.name),
                        SQL_LIMIT,
                        SQL_ONE,
                    ))

            if position.name in reference_all_tables:
                no_nested_docs = True
            if position.name in reference_only_tables:
                return
            curr_join_list = copy(nested_path_to_join[nested_path[0]])

            ###############################################################################
            # INNER OBJECTS
            ###############################################################################
            referenced_tables = list(
                sort_using_key(
                    jx.groupby(
                        jx.filter(
                            relations,
                            {
                                "eq": {
                                    "table.name": position.name,
                                    "table.schema": position.schema,
                                }
                            },
                        ),
                        "constraint.name",
                    ),
                    key=lambda p: first(p[1]).column.name,
                ))
            for g, constraint_columns in referenced_tables:
                g = unwrap(g)
                constraint_columns = deepcopy(constraint_columns)
                if g["constraint.name"] in done_relations:
                    continue
                if any(cc for cc in constraint_columns
                       if cc.referenced.table.name in self.settings.exclude):
                    continue

                done_relations.add(g["constraint.name"])

                many_to_one_joins = nested_path_to_join[nested_path[0]]
                index = len(many_to_one_joins)

                alias = "t" + text(index)
                for c in constraint_columns:
                    c.referenced.table.alias = alias
                    c.table = position
                many_to_one_joins.append({
                    "join_columns": constraint_columns,
                    "path": path,
                    "nested_path": nested_path,
                })

                # HANDLE THE COMMON *id SUFFIX
                name = []
                for cname, tname in zip(
                        constraint_columns.column.name,
                        constraint_columns.referenced.table.name,
                ):
                    if cname.startswith(tname):
                        name.append(tname)
                    elif cname.endswith("_id"):
                        name.append(cname[:-3])
                    else:
                        name.append(cname)

                relation_string = many_to_one_string(constraint_columns[0])
                step = "/".join(name)
                if len(constraint_columns) == 1:
                    step = self.name_relations.get(relation_string, step)

                referenced_column_path = concat_field(path, step)
                if self.path_not_allowed(referenced_column_path):
                    continue

                if referenced_column_path in reference_only_tables:
                    continue

                col_pointer_name = relative_field(referenced_column_path,
                                                  nested_path[0])
                for col in columns:
                    if (col.table.name
                            == constraint_columns[0].referenced.table.name
                            and col.table.schema
                            == constraint_columns[0].referenced.table.schema):
                        col_full_name = concat_field(
                            col_pointer_name, literal_field(col.column.name))

                        if (col.is_id and len(nested_path) == 1
                                and col.table.name == fact_table.name
                                and col.table.schema == fact_table.schema):
                            # ALWAYS SHOW THE ID OF THE FACT
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": True,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_full_name,
                            })
                        elif col.column.name == constraint_columns[0].column.name:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": False,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None,
                            })
                        elif col.is_id:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": False,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_full_name if self.settings.show_foreign_keys else None,
                            })
                        elif col.reference:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": False,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                # REFERENCE FIELDS CAN REPLACE THE WHOLE OBJECT BEING REFERENCED
                                "put": col_pointer_name if not self.settings.show_foreign_keys else col_full_name,
                            })
                        elif col.include:
                            c_index = len(output_columns)
                            output_columns.append({
                                "table_alias": alias,
                                "column_alias": "c" + text(c_index),
                                "column": col,
                                "sort": False,
                                "path": referenced_column_path,
                                "nested_path": nested_path,
                                "put": col_full_name,
                            })

                if position.name in reference_only_tables:
                    continue

                todo.append(
                    Data(
                        position=copy(constraint_columns[0].referenced.table),
                        path=referenced_column_path,
                        nested_path=nested_path,
                        done_relations=copy(done_relations),
                        no_nested_docs=no_nested_docs,
                    ))
            ###############################################################################
            # NESTED OBJECTS
            ###############################################################################
            if not no_nested_docs:
                nesting_tables = list(
                    sort_using_key(
                        jx.groupby(
                            jx.filter(
                                relations,
                                {
                                    "eq": {
                                        "referenced.table.name": position.name,
                                        "referenced.table.schema":
                                        position.schema,
                                    }
                                },
                            ),
                            "constraint.name",
                        ),
                        key=lambda p: [(r.table.name, r.column.name)
                                       for r in [first(p[1])]][0],
                    ))

                for g, constraint_columns in nesting_tables:
                    g = unwrap(g)
                    constraint_columns = deepcopy(constraint_columns)
                    if g["constraint.name"] in done_relations:
                        continue
                    done_relations.add(g["constraint.name"])

                    many_table = set(constraint_columns.table.name)
                    if not (many_table - self.settings.exclude):
                        continue

                    relation_string = one_to_many_string(constraint_columns[0])
                    step = "/".join(many_table)
                    if len(constraint_columns) == 1:
                        step = self.name_relations.get(relation_string, step)

                    referenced_column_path = concat_field(path, step)
                    if self.path_not_allowed(referenced_column_path):
                        continue

                    new_nested_path = [referenced_column_path] + nested_path
                    all_nested_paths.append(new_nested_path)

                    if referenced_column_path in nested_path_to_join:
                        Log.error(
                            "{{path}} already exists, try adding entry to name_relations",
                            path=referenced_column_path,
                        )
                    one_to_many_joins = nested_path_to_join[
                        referenced_column_path] = copy(curr_join_list)
                    index = len(one_to_many_joins)
                    alias = "t" + text(index)
                    for c in constraint_columns:
                        c.table.alias = alias
                        c.referenced.table = position
                    one_to_many_joins.append(
                        set_default(
                            {},
                            g,
                            {
                                "children": True,
                                "join_columns": constraint_columns,
                                "path": path,
                                "nested_path": nested_path,
                            },
                        ))
                    for col in columns:
                        if (col.table.name == constraint_columns[0].table.name
                                and col.table.schema
                                == constraint_columns[0].table.schema):
                            col_full_name = join_field(
                                split_field(referenced_column_path)
                                [len(split_field(new_nested_path[0])):] +
                                [literal_field(col.column.name)])

                            if col.column.name == constraint_columns[0].column.name:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias": alias,
                                    "column_alias": "c" + text(c_index),
                                    "column": col,
                                    "sort": col.is_id,
                                    "path": referenced_column_path,
                                    "nested_path": new_nested_path,
                                    "put": col_full_name if self.settings.show_foreign_keys else None,
                                })
                            elif col.is_id:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias": alias,
                                    "column_alias": "c" + text(c_index),
                                    "column": col,
                                    "sort": col.is_id,
                                    "path": referenced_column_path,
                                    "nested_path": new_nested_path,
                                    "put": col_full_name if self.settings.show_foreign_keys else None,
                                })
                            else:
                                c_index = len(output_columns)
                                output_columns.append({
                                    "table_alias": alias,
                                    "column_alias": "c" + text(c_index),
                                    "column": col,
                                    "sort": col.is_id,
                                    "path": referenced_column_path,
                                    "nested_path": new_nested_path,
                                    "put": col_full_name if col.include else None,
                                })

                    todo.append(
                        Data(
                            position=constraint_columns[0].table,
                            path=referenced_column_path,
                            nested_path=new_nested_path,
                            done_relations=copy(done_relations),
                            no_nested_docs=no_nested_docs,
                        ))
Example #16
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if not v:
                    return []
                else:
                    return [v] + nested_path_of(all_paths[v])

            all = sort_using_key(set(step for path in self.snowflake.query_paths for step in path), key=lambda p: len(split_field(p)))
            for step in sorted(all):
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                nested_path = nested_path_of(all_paths[p])
                if not nested_path:
                    nested_path = ['.']
                self.namespace.meta.columns.add(Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=OBJECT,
                    jx_type=EXISTS,
                    nested_path=nested_path,
                    last_updated=Date.now()
                ))
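The existence-column block above builds all_paths, a map from every query path to its nearest ancestor: paths are processed shortest-first and each one is attached to the longest already-known prefix. A standalone sketch of that parent-map construction (assuming startswith_field is a dotted-path prefix test):

def startswith_field(field, prefix):
    # assumption: dotted-path prefix test, e.g. "a.b.c" starts with "a.b", "a" and "."
    return prefix == "." or field == prefix or field.startswith(prefix + ".")

paths = {"a.b.c", "a", "a.b"}
all_paths = {".": None}                                      # MAP FROM path TO parent TO MAKE A TREE
for step in sorted(paths, key=lambda p: len(p.split("."))):  # shortest paths first
    best = "."
    for candidate in all_paths:
        if startswith_field(step, candidate) and startswith_field(candidate, best):
            best = candidate                                 # keep the longest known prefix
    all_paths[step] = best

assert all_paths == {".": None, "a": ".", "a.b": "a", "a.b.c": "a.b"}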
Example #17
def pretty_json(value):
    try:
        if value is False:
            return "false"
        elif value is True:
            return "true"
        elif is_data(value):
            try:
                items = sort_using_key(value.items(), lambda r: r[0])
                values = [encode_basestring(k) + PRETTY_COLON + pretty_json(v) for k, v in items if v != None]
                if not values:
                    return "{}"
                elif len(values) == 1:
                    return "{" + values[0] + "}"
                else:
                    return "{\n" + ",\n".join(indent(v) for v in values) + "\n}"
            except Exception as e:
                from mo_logs import Log
                from mo_math import OR

                if OR(not is_text(k) for k in value.keys()):
                    Log.error(
                        "JSON must have string keys: {{keys}}:",
                        keys=[k for k in value.keys()],
                        cause=e
                    )

                Log.error(
                    "problem making dict pretty: keys={{keys}}:",
                    keys=[k for k in value.keys()],
                    cause=e
                )
        elif value in (None, Null):
            return "null"
        elif value.__class__ in (binary_type, text_type):
            if is_binary(value):
                value = utf82unicode(value)
            try:
                return quote(value)
            except Exception as e:
                from mo_logs import Log

                try:
                    Log.note("try explicit convert of string with length {{length}}", length=len(value))
                    acc = [QUOTE]
                    for c in value:
                        try:
                            try:
                                c2 = ESCAPE_DCT[c]
                            except Exception:
                                c2 = c
                            c3 = text_type(c2)
                            acc.append(c3)
                        except BaseException:
                            pass
                            # Log.warning("odd character {{ord}} found in string.  Ignored.",  ord= ord(c)}, cause=g)
                    acc.append(QUOTE)
                    output = u"".join(acc)
                    Log.note("return value of length {{length}}", length=len(output))
                    return output
                except BaseException as f:
                    Log.warning("can not convert {{type}} to json", type=f.__class__.__name__, cause=f)
                    return "null"
        elif is_list(value):
            if not value:
                return "[]"

            if ARRAY_MAX_COLUMNS == 1:
                return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

            if len(value) == 1:
                j = pretty_json(value[0])
                if j.find("\n") >= 0:
                    return "[\n" + indent(j) + "\n]"
                else:
                    return "[" + j + "]"

            js = [pretty_json(v) for v in value]
            max_len = max(*[len(j) for j in js])
            if max_len <= ARRAY_ITEM_MAX_LENGTH and max(*[j.find("\n") for j in js]) == -1:
                # ALL TINY VALUES
                num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2)))))  # +2 TO COMPENSATE FOR COMMAS
                if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                    return "[" + PRETTY_COMMA.join(js) + "]"
                if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                    return "[\n" + ",\n".join([indent(pretty_json(v)) for v in value]) + "\n]"

                content = ",\n".join(
                    PRETTY_COMMA.join(j.rjust(max_len) for j in js[r:r + num_columns])
                    for r in xrange(0, len(js), num_columns)
                )
                return "[\n" + indent(content) + "\n]"

            pretty_list = js

            output = ["[\n"]
            for i, p in enumerate(pretty_list):
                try:
                    if i > 0:
                        output.append(",\n")
                    output.append(indent(p))
                except Exception:
                    from mo_logs import Log

                    Log.warning("problem concatenating string of length {{len1}} and {{len2}}",
                        len1=len("".join(output)),
                        len2=len(p)
                    )
            output.append("\n]")
            try:
                return "".join(output)
            except Exception as e:
                from mo_logs import Log

                Log.error("not expected", cause=e)
        elif hasattr(value, '__data__'):
            d = value.__data__()
            return pretty_json(d)
        elif hasattr(value, '__json__'):
            j = value.__json__()
            if j == None:
                return "   null   "  # TODO: FIND OUT WHAT CAUSES THIS
            return pretty_json(json_decoder(j))
        elif scrub(value) is None:
            return "null"
        elif hasattr(value, '__iter__'):
            return pretty_json(list(value))
        elif hasattr(value, '__call__'):
            return "null"
        else:
            try:
                if int(value) == value:
                    return text_type(int(value))
            except Exception:
                pass

            try:
                if float(value) == value:
                    return text_type(float(value))
            except Exception:
                pass

            return pypy_json_encode(value)

    except Exception as e:
        problem_serializing(value, e)
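In the ALL TINY VALUES branch above, pretty_json packs short array items into fixed-width columns; the width arithmetic is easier to follow with concrete numbers (ARRAY_ROW_LENGTH and ARRAY_MAX_COLUMNS are module constants, so the figures below are illustrative only):

from math import floor

ARRAY_ROW_LENGTH = 50      # illustrative; the module defines its own value
ARRAY_MAX_COLUMNS = 10     # illustrative
max_len = 10               # longest rendered item in js

# +2 compensates for the ", " between items, as the comment above says
num_columns = max(1, min(ARRAY_MAX_COLUMNS, int(floor((ARRAY_ROW_LENGTH + 2.0) / float(max_len + 2)))))
assert num_columns == 4    # four 10-character items per row fit a ~50-character target width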
Example #18
    def __init__(
            self,
            host,
            index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
            alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
            type=None,
            name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
        self.worker = None
        self.settings = kwargs
        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.name = name = self._namespace._find_alias(
            coalesce(alias, index, name))
        if read_only:
            self.es = elasticsearch.Alias(alias=name,
                                          index=None,
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._ensure_max_result_window_set(name)
        self.settings.type = self.es.settings.type
        self.stats = QueryStats(self.es.cluster)

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if v == '.':
                    return ('.', )
                return (v, ) + nested_path_of(all_paths[v])

            query_paths = sort_using_key(set(
                step for path in self.snowflake.query_paths for step in path),
                                         key=lambda p: len(split_field(p)))
            for step in query_paths:
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                if p == ".":
                    nested_path = ('.', )
                else:
                    nested_path = nested_path_of(p)[1:]

                jx_type = (OBJECT if p == "." else NESTED)
                self.namespace.meta.columns.add(
                    Column(name=p,
                           es_column=p,
                           es_index=self.name,
                           es_type=jx_type,
                           jx_type=jx_type,
                           cardinality=1,
                           nested_path=nested_path,
                           multi=1001 if jx_type is NESTED else 1,
                           last_updated=Date.now()))
Example #19
def pretty_json(value):
    try:
        if value is False:
            return "false"
        elif value is True:
            return "true"
        elif isinstance(value, Mapping):
            try:
                items = sort_using_key(list(value.items()), lambda r: r[0])
                values = [
                    encode_basestring(k) + PRETTY_COLON +
                    indent(pretty_json(v)).strip() for k, v in items
                    if v != None
                ]
                if not values:
                    return "{}"
                elif len(values) == 1:
                    return "{" + values[0] + "}"
                else:
                    return "{\n" + INDENT + (",\n" +
                                             INDENT).join(values) + "\n}"
            except Exception as e:
                from mo_logs import Log
                from mo_math import OR

                if OR(not isinstance(k, text_type) for k in value.keys()):
                    Log.error("JSON must have string keys: {{keys}}:",
                              keys=[k for k in value.keys()],
                              cause=e)

                Log.error("problem making dict pretty: keys={{keys}}:",
                          keys=[k for k in value.keys()],
                          cause=e)
        elif value in (None, Null):
            return "null"
        elif isinstance(value, (text_type, binary_type)):
            if isinstance(value, binary_type):
                value = utf82unicode(value)
            try:
                return quote(value)
            except Exception as e:
                from mo_logs import Log

                try:
                    Log.note(
                        "try explicit convert of string with length {{length}}",
                        length=len(value))
                    acc = [QUOTE]
                    for c in value:
                        try:
                            try:
                                c2 = ESCAPE_DCT[c]
                            except Exception:
                                c2 = c
                            c3 = text_type(c2)
                            acc.append(c3)
                        except BaseException:
                            pass
                            # Log.warning("odd character {{ord}} found in string.  Ignored.",  ord= ord(c)}, cause=g)
                    acc.append(QUOTE)
                    output = u"".join(acc)
                    Log.note("return value of length {{length}}",
                             length=len(output))
                    return output
                except BaseException as f:
                    Log.warning("can not even explicit convert {{type}}",
                                type=f.__class__.__name__,
                                cause=f)
                    return "null"
        elif isinstance(value, list):
            if not value:
                return "[]"

            if ARRAY_MAX_COLUMNS == 1:
                return "[\n" + ",\n".join(
                    [indent(pretty_json(v)) for v in value]) + "\n]"

            if len(value) == 1:
                j = pretty_json(value[0])
                if j.find("\n") >= 0:
                    return "[\n" + indent(j) + "\n]"
                else:
                    return "[" + j + "]"

            js = [pretty_json(v) for v in value]
            max_len = max(*[len(j) for j in js])
            if max_len <= ARRAY_ITEM_MAX_LENGTH and max(
                    *[j.find("\n") for j in js]) == -1:
                # ALL TINY VALUES
                num_columns = max(
                    1,
                    min(
                        ARRAY_MAX_COLUMNS,
                        int(
                            floor((ARRAY_ROW_LENGTH + 2.0) /
                                  float(max_len +
                                        2)))))  # +2 TO COMPENSATE FOR COMMAS
                if len(js) <= num_columns:  # DO NOT ADD \n IF ONLY ONE ROW
                    return "[" + PRETTY_COMMA.join(js) + "]"
                if num_columns == 1:  # DO NOT rjust IF THERE IS ONLY ONE COLUMN
                    return "[\n" + ",\n".join(
                        [indent(pretty_json(v)) for v in value]) + "\n]"

                content = ",\n".join(
                    PRETTY_COMMA.join(
                        j.rjust(max_len) for j in js[r:r + num_columns])
                    for r in xrange(0, len(js), num_columns))
                return "[\n" + indent(content) + "\n]"

            pretty_list = js

            output = ["[\n"]
            for i, p in enumerate(pretty_list):
                try:
                    if i > 0:
                        output.append(",\n")
                    output.append(indent(p))
                except Exception:
                    from mo_logs import Log

                    Log.warning(
                        "problem concatenating string of length {{len1}} and {{len2}}",
                        len1=len("".join(output)),
                        len2=len(p))
            output.append("\n]")
            try:
                return "".join(output)
            except Exception as e:
                from mo_logs import Log

                Log.error("not expected", cause=e)
        elif hasattr(value, '__data__'):
            d = value.__data__()
            return pretty_json(d)
        elif hasattr(value, '__json__'):
            j = value.__json__()
            if j == None:
                return "   null   "  # TODO: FIND OUT WHAT CAUSES THIS
            return pretty_json(json_decoder(j))
        elif scrub(value) is None:
            return "null"
        elif hasattr(value, '__iter__'):
            return pretty_json(list(value))
        elif hasattr(value, '__call__'):
            return "null"
        else:
            try:
                if int(value) == value:
                    return text_type(int(value))
            except Exception:
                pass

            try:
                if float(value) == value:
                    return text_type(float(value))
            except Exception:
                pass

            return pypy_json_encode(value)

    except Exception as e:
        problem_serializing(value, e)
Example #20
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if not v:
                    return []
                else:
                    return [v] + nested_path_of(all_paths[v])

            all = sort_using_key(set(step for path in self.snowflake.query_paths for step in path), key=lambda p: len(split_field(p)))
            for step in sorted(all):
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                nested_path = nested_path_of(all_paths[p])
                if not nested_path:
                    nested_path = ['.']
                self.namespace.meta.columns.add(Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=OBJECT,
                    jx_type=EXISTS,
                    nested_path=nested_path,
                    last_updated=Date.now()
                ))