Example #1
def quote_value(value):
    """
    convert a value to its MySQL literal representation
    mostly delegates directly to the mysql lib, but some exceptions exist
    """
    try:
        if value == None:
            return SQL_NULL
        elif isinstance(value, SQL):
            return quote_sql(value.template, value.param)
        elif is_text(value):
            return SQL("'" + "".join(ESCAPE_DCT.get(c, c) for c in value) + "'")
        elif is_data(value):
            return quote_value(json_encode(value))
        elif is_number(value):
            return SQL(text_type(value))
        elif isinstance(value, datetime):
            return SQL("str_to_date('" + value.strftime("%Y%m%d%H%M%S.%f") + "', '%Y%m%d%H%i%s.%f')")
        elif isinstance(value, Date):
            return SQL("str_to_date('" + value.format("%Y%m%d%H%M%S.%f") + "', '%Y%m%d%H%i%s.%f')")
        elif hasattr(value, '__iter__'):
            return quote_value(json_encode(value))
        else:
            return quote_value(text_type(value))
    except Exception as e:
        Log.error("problem quoting SQL {{value}}", value=repr(value), cause=e)
Example #2
    def insert_list(self, table_name, records):
        if not records:
            return

        columns = set()
        for r in records:
            columns |= set(r.keys())
        columns = jx.sort(columns)

        try:
            self.execute(
                "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
                {"ids": self.quote_column([r["_id"] for r in records])}
            )

            command = (
                "INSERT INTO " + self.quote_column(table_name) + "(" +
                ",".join([self.quote_column(k) for k in columns]) +
                ") VALUES " + ",\n".join([
                sql_iso(",".join([self.quote_value(r.get(k, None)) for k in columns]))
                for r in records
            ])
            )
            self.execute(command)
        except Exception as e:
            Log.error("problem with insert", e)
Example #3
def jx_sort_to_es_sort(sort, schema):
    if not sort:
        return []

    output = []
    for s in sort:
        if isinstance(s.value, Variable):
            cols = schema.leaves(s.value.var)
            if s.sort == -1:
                types = OBJECT, STRING, NUMBER, BOOLEAN
            else:
                types = BOOLEAN, NUMBER, STRING, OBJECT

            for type in types:
                for c in cols:
                    if c.jx_type == type:
                        if s.sort == -1:
                            output.append({c.es_column: "desc"})
                        else:
                            output.append(c.es_column)
        else:
            from mo_logs import Log

            Log.error("do not know how to handle")
    return output
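For reference, both shapes appended above are valid Elasticsearch sort clauses: a bare field name sorts ascending by default, while {"<es_column>": "desc"} requests descending order, which is why only the descending branch wraps the column in a dict.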
Example #4
 def _open(self):
     """ DO NOT USE THIS UNLESS YOU close() FIRST"""
     try:
         self.db = connect(
             host=self.settings.host,
             port=self.settings.port,
             user=coalesce(self.settings.username, self.settings.user),
             passwd=coalesce(self.settings.password, self.settings.passwd),
             db=coalesce(self.settings.schema, self.settings.db),
             read_timeout=coalesce(self.settings.read_timeout, (EXECUTE_TIMEOUT / 1000) - 10 if EXECUTE_TIMEOUT else None, 5*60),
             charset=u"utf8",
             use_unicode=True,
             ssl=coalesce(self.settings.ssl, None),
             cursorclass=cursors.SSCursor
         )
     except Exception as e:
         if self.settings.host.find("://") == -1:
             Log.error(
                 u"Failure to connect to {{host}}:{{port}}",
                 host=self.settings.host,
                 port=self.settings.port,
                 cause=e
             )
         else:
             Log.error(u"Failure to connect.  PROTOCOL PREFIX IS PROBABLY BAD", e)
     self.cursor = None
     self.partial_rollback = False
     self.transaction_level = 0
     self.backlog = []  # accumulate the write commands so they are sent at once
     if self.readonly:
         self.begin()
Example #5
    def execute(
        self,
        command,
        param=None,
        retry=True     # IF False, A FAILED command RAISES INSTEAD OF BEING RETRIED
    ):
        if param:
            command = expand_template(command, self.quote_param(param))

        output = None
        done = False
        while not done:
            try:
                with self.locker:
                    if not self.connection:
                        self._connect()

                with Closer(self.connection.cursor()) as curs:
                    curs.execute(command)
                    if curs.rowcount >= 0:
                        output = curs.fetchall()
                self.connection.commit()
                done = True
            except Exception as e:
                with suppress_exception:
                    self.connection.rollback()
                    # TODO: FIGURE OUT WHY rollback() DOES NOT HELP
                    self.connection.close()
                self.connection = None
                self._connect()
                if not retry:
                    Log.error("Problem with command:\n{{command|indent}}",  command= command, cause=e)
        return output
Example #6
def problem_serializing(value, e=None):
    """
    THROW ERROR ABOUT SERIALIZING
    """
    from mo_logs import Log

    try:
        typename = type(value).__name__
    except Exception:
        typename = "<error getting name>"

    try:
        rep = text_type(repr(value))
    except Exception as _:
        rep = None

    if rep == None:
        Log.error(
            "Problem turning value of type {{type}} to json",
            type=typename,
            cause=e
        )
    else:
        Log.error(
            "Problem turning value ({{value}}) of type {{type}} to json",
            value=rep,
            type=typename,
            cause=e
        )
Example #7
def pypy_json_encode(value, pretty=False):
    """
    pypy DOES NOT OPTIMIZE GENERATOR CODE WELL
    """
    global _dealing_with_problem
    if pretty:
        return pretty_json(value)

    try:
        _buffer = UnicodeBuilder(2048)
        _value2json(value, _buffer)
        output = _buffer.build()
        return output
    except Exception as e:
        # THE PRETTY JSON WILL PROVIDE MORE DETAIL ABOUT THE SERIALIZATION CONCERNS
        from mo_logs import Log

        if _dealing_with_problem:
            Log.error("Serialization of JSON problems", e)
        else:
            Log.warning("Serialization of JSON problems", e)
        _dealing_with_problem = True
        try:
            return pretty_json(value)
        except Exception as f:
            Log.error("problem serializing object", f)
        finally:
            _dealing_with_problem = False
Example #8
def _dict2json(value, sub_schema, path, net_new_properties, buffer):
    prefix = '{'
    for k, v in sort_using_key(value.items(), lambda r: r[0]):
        if v == None or v == '':
            continue
        append(buffer, prefix)
        prefix = COMMA
        if is_binary(k):
            k = utf82unicode(k)
        if not is_text(k):
            Log.error("Expecting property name to be a string")
        if k not in sub_schema:
            sub_schema[k] = {}
            net_new_properties.append(path + [k])
        append(buffer, encode_basestring(encode_property(k)))
        append(buffer, COLON)
        typed_encode(v, sub_schema[k], path + [k], net_new_properties, buffer)
    if prefix is COMMA:
        append(buffer, COMMA)
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
    else:
        append(buffer, '{')
        append(buffer, QUOTED_EXISTS_TYPE)
        append(buffer, '1}')
Example #9
def Stats2ZeroMoment(stats):
    # MODIFIED FROM http://statsmodels.sourceforge.net/devel/_modules/statsmodels/stats/moment_helpers.html
    # ADDED count
    mc0, mc1, mc2, skew, kurt = stats.count, coalesce(stats.mean, 0), coalesce(stats.variance, 0), coalesce(stats.skew, 0), coalesce(stats.kurtosis, 0)

    mz0 = mc0
    mz1 = mc1 * mc0
    mz2 = (mc2 + mc1 * mc1) * mc0
    mc3 = coalesce(skew, 0) * (mc2 ** 1.5) # 3rd central moment
    mz3 = (mc3 + 3 * mc1 * mc2 + mc1 ** 3) * mc0  # 3rd non-central moment
    mc4 = (coalesce(kurt, 0) + 3.0) * (mc2 ** 2.0) # 4th central moment
    mz4 = (mc4 + 4 * mc1 * mc3 + 6 * mc1 * mc1 * mc2 + mc1 ** 4) * mc0

    m = ZeroMoment(mz0, mz1, mz2, mz3, mz4)
    if DEBUG:
        from mo_testing.fuzzytestcase import assertAlmostEqualValue

        globals()["DEBUG"] = False
        try:
            v = ZeroMoment2Stats(m)
            assertAlmostEqualValue(v.count, stats.count, places=10)
            assertAlmostEqualValue(v.mean, stats.mean, places=10)
            assertAlmostEqualValue(v.variance, stats.variance, places=10)
            assertAlmostEqualValue(v.skew, stats.skew, places=10)
            assertAlmostEqualValue(v.kurtosis, stats.kurtosis, places=10)
        except Exception as e:
            v = ZeroMoment2Stats(m)
            Log.error("programmer error")
        globals()["DEBUG"] = True
    return m
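Example #9 converts summary statistics (count, mean, variance, skew, excess kurtosis) into raw power sums. As a quick, self-contained check of that algebra on made-up numbers, the sketch below computes the central moments of a small sample directly (instead of going through skew and kurtosis) and confirms the same formulas reproduce sum(x), sum(x**2), sum(x**3) and sum(x**4); no library types are used.

def check_zero_moments(xs):
    n = len(xs)
    mean = sum(xs) / n
    mc2 = sum((x - mean) ** 2 for x in xs) / n  # 2ND CENTRAL MOMENT (POPULATION VARIANCE)
    mc3 = sum((x - mean) ** 3 for x in xs) / n  # 3RD CENTRAL MOMENT
    mc4 = sum((x - mean) ** 4 for x in xs) / n  # 4TH CENTRAL MOMENT

    # SAME ALGEBRA AS Stats2ZeroMoment
    mz1 = mean * n
    mz2 = (mc2 + mean * mean) * n
    mz3 = (mc3 + 3 * mean * mc2 + mean ** 3) * n
    mz4 = (mc4 + 4 * mean * mc3 + 6 * mean * mean * mc2 + mean ** 4) * n

    # COMPARE AGAINST THE RAW POWER SUMS COMPUTED DIRECTLY
    assert abs(mz1 - sum(x ** 1 for x in xs)) < 1e-6
    assert abs(mz2 - sum(x ** 2 for x in xs)) < 1e-6
    assert abs(mz3 - sum(x ** 3 for x in xs)) < 1e-6
    assert abs(mz4 - sum(x ** 4 for x in xs)) < 1e-6

check_zero_moments([1.0, 2.0, 2.5, 4.0])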
Example #10
    def __init__(self, name, params, cwd=None, env=None, debug=False, shell=False, bufsize=-1):
        self.name = name
        self.service_stopped = Signal("stopped signal for " + strings.quote(name))
        self.stdin = Queue("stdin for process " + strings.quote(name), silent=True)
        self.stdout = Queue("stdout for process " + strings.quote(name), silent=True)
        self.stderr = Queue("stderr for process " + strings.quote(name), silent=True)

        try:
            self.debug = debug or DEBUG
            self.service = service = subprocess.Popen(
                params,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                bufsize=bufsize,
                cwd=cwd if isinstance(cwd, (basestring, NullType, NoneType)) else cwd.abspath,
                env=unwrap(set_default(env, os.environ)),
                shell=shell
            )

            self.please_stop = Signal()
            self.please_stop.on_go(self._kill)
            self.thread_locker = Lock()
            self.children = [
                Thread.run(self.name + " stdin", self._writer, service.stdin, self.stdin, please_stop=self.service_stopped, parent_thread=self),
                Thread.run(self.name + " stdout", self._reader, "stdout", service.stdout, self.stdout, please_stop=self.service_stopped, parent_thread=self),
                Thread.run(self.name + " stderr", self._reader, "stderr", service.stderr, self.stderr, please_stop=self.service_stopped, parent_thread=self),
                Thread.run(self.name + " waiter", self._monitor, parent_thread=self),
            ]
        except Exception as e:
            Log.error("Can not call", e)

        if self.debug:
            Log.note("{{process}} START: {{command}}", process=self.name, command=" ".join(map(strings.quote, params)))
Example #11
 def output(*args, **kwargs):
     if len(args):
         if len(kwargs.keys()):
             Log.error("Not allowed to use both args and kwargs")
         return self._execute({item: args})
     else:
         return self._execute({item: kwargs})
Example #12
def filter(data, where):
    """
    where  - a function that accepts (record, rownum, rows) and returns boolean
    """
    if len(data) == 0 or where == None or where == TRUE:
        return data

    if isinstance(data, Container):
        return data.filter(where)

    if is_container(data):
        temp = jx_expression_to_function(where)
        dd = wrap(data)
        return wrap([unwrap(d) for i, d in enumerate(data) if temp(wrap(d), i, dd)])
    else:
        Log.error(
            "Do not know how to handle type {{type}}", type=data.__class__.__name__
        )

    try:
        return drill_filter(where, data)
    except Exception as _:
        # WOW!  THIS IS INEFFICIENT!
        return wrap(
            [unwrap(d) for d in drill_filter(where, [DataObject(d) for d in data])]
        )
Example #13
    def parse_field(fieldname, data, depth):
        """
        RETURN (first, rest) OF fieldname
        """
        col = split_field(fieldname)
        d = data
        for i, c in enumerate(col):
            try:
                d = d[c]
            except Exception as e:
                Log.error("{{name}} does not exist", name=fieldname)
            if is_list(d) and len(col) > 1:
                if len(primary_column) <= depth + i:
                    primary_nested.append(True)
                    primary_column.append(c)
                    primary_branch.append(d)
                elif primary_nested[depth] and primary_column[depth + i] != c:
                    Log.error("only one branch of tree allowed")
                else:
                    primary_nested[depth + i] = True
                    primary_column[depth + i] = c
                    primary_branch[depth + i] = d

                return c, join_field(col[i + 1 :])
            else:
                if len(primary_column) <= depth + i:
                    primary_nested.append(False)
                    primary_column.append(c)
                    primary_branch.append([d])
        return fieldname, None
Example #14
def _select(template, data, fields, depth):
    output = FlatList()
    deep_path = []
    deep_fields = UniqueIndex(["name"])
    for d in data:
        if d.__class__ is Data:
            Log.error("programmer error, _select can not handle Data, only dict")

        record = template.copy()
        children = None
        for f in fields:
            index, c = _select_deep(d, f, depth, record)
            children = c if children is None else children
            if index:
                path = f.value[0:index:]
                if not deep_fields[f]:
                    deep_fields.add(f)  # KEEP TRACK OF WHICH FIELDS NEED DEEPER SELECT
                short = MIN([len(deep_path), len(path)])
                if path[:short:] != deep_path[:short:]:
                    Log.error("Dangerous to select into more than one branch at time")
                if len(deep_path) < len(path):
                    deep_path = path
        if not children:
            output.append(record)
        else:
            output.extend(_select(record, children, deep_fields, depth + 1))

    return output
Example #15
def _select_deep(v, field, depth, record):
    """
    field = {"name":name, "value":["attribute", "path"]}
    r[field.name]=v[field.value], BUT WE MUST DEAL WITH POSSIBLE LIST IN field.value PATH
    """
    if hasattr(field.value, "__call__"):
        try:
            record[field.name] = field.value(wrap(v))
        except Exception as e:
            record[field.name] = None
        return 0, None

    for i, f in enumerate(field.value[depth : len(field.value) - 1 :]):
        v = v.get(f)
        if v is None:
            return 0, None
        if is_list(v):
            return depth + i + 1, v

    f = field.value.last()
    try:
        if not f:  # NO NAME FIELD INDICATES SELECT VALUE
            record[field.name] = v
        else:
            record[field.name] = v.get(f)
    except Exception as e:
        Log.error(
            "{{value}} does not have {{field}} property", value=v, field=f, cause=e
        )
    return 0, None
Example #16
    def _decode_token(index, c, full_path, path, name2index, destination, expected_vars):
        if c == b'{':
            if not expected_vars:
                index = jump_to_end(index, c)
                value = None
            elif expected_vars[0] == ".":
                json.mark(index-1)
                index = jump_to_end(index, c)
                value = json_decoder(json.release(index).decode("utf8"))
            else:
                count = 0
                for v, i in _decode_object(index, full_path, path, name2index, destination, expected_vars=expected_vars):
                    index = i
                    value = v
                    count += 1
                if count != 1:
                    Log.error("Expecting object, nothing nested")
        elif c == b'[':
            if not expected_vars:
                index = jump_to_end(index, c)
                value = None
            else:
                json.mark(index - 1)
                index = jump_to_end(index, c)
                value = json_decoder(json.release(index).decode("utf8"))
        else:
            if expected_vars and expected_vars[0] == ".":
                value, index = simple_token(index, c)
            else:
                index = jump_to_end(index, c)
                value = None

        return value, index
Example #17
def tuple(data, field_name):
    """
    RETURN LIST  OF TUPLES
    """
    if isinstance(data, Cube):
        Log.error("not supported yet")

    if isinstance(data, FlatList):
        Log.error("not supported yet")

    if is_data(field_name) and "value" in field_name:
        # SIMPLIFY {"value":value} AS STRING
        field_name = field_name["value"]

    # SIMPLE PYTHON ITERABLE ASSUMED
    if is_text(field_name):
        if len(split_field(field_name)) == 1:
            return [(d[field_name],) for d in data]
        else:
            path = split_field(field_name)
            output = []
            flat_list._tuple1(data, path, 0, output)
            return output
    elif is_list(field_name):
        paths = [_select_a_field(f) for f in field_name]
        output = FlatList()
        _tuple((), unwrap(data), paths, 0, output)
        return output
    else:
        paths = [_select_a_field(field_name)]
        output = FlatList()
        _tuple((), data, paths, 0, output)
        return output
Example #18
 def simple_token(index, c):
     if c == b'"':
         json.mark(index - 1)
         while True:
             c = json[index]
             index += 1
             if c == b"\\":
                 index += 1
             elif c == b'"':
                 break
         return json_decoder(json.release(index).decode("utf8")), index
     elif c in b"{[":
         Log.error("Expecting a primitive value")
     elif c == b"t" and json.slice(index, index + 3) == "rue":
         return True, index + 3
     elif c == b"n" and json.slice(index, index + 3) == "ull":
         return None, index + 3
     elif c == b"f" and json.slice(index, index + 4) == "alse":
         return False, index + 4
     else:
         json.mark(index-1)
         while True:
             c = json[index]
             if c in b',]}':
                 break
             index += 1
         return float(json.release(index)), index
Example #19
    def _decode(index, parent_path, path, name2index, expected_vars=NO_VARS):
        c, index = skip_whitespace(index)

        if not path:
            if c != b"[":
                # TREAT VALUE AS SINGLE-VALUE ARRAY
                yield _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
            else:
                c, index = skip_whitespace(index)
                if c == b']':
                    return  # EMPTY ARRAY

                while True:
                    value, index = _decode_token(index, c, parent_path, path, name2index, None, expected_vars)
                    c, index = skip_whitespace(index)
                    if c == b']':
                        yield value, index
                        return
                    elif c == b',':
                        c, index = skip_whitespace(index)
                        yield value, index

        else:
            if c != b'{':
                Log.error("Expecting all objects to at least have {{path}}", path=path[0])

            for j, i in _decode_object(index, parent_path, path, name2index, expected_vars=expected_vars):
                yield j, i
Example #20
def replacePrefix(value, prefix, new_prefix):
    try:
        if value.startswith(prefix):
            return new_prefix+value[len(prefix)::]
        return value
    except Exception as e:
        Log.error("can not replace prefix", e)
Example #21
    def __init__(self, logger):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.queue = Queue("Queue for " + self.__class__.__name__, max=10000, silent=True, allow_add_after_close=True)
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    logs = self.queue.pop_all()
                    if not logs:
                        (Till(seconds=1) | please_stop).wait()
                        continue
                    for log in logs:
                        if log is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**log)
            except Exception as e:
                print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
            finally:
                Log.note("stop the child")
                logger.stop()

        self.thread = Thread("Thread for " + self.__class__.__name__, worker, logger)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
Example #22
def utf82unicode(value):
    """
    WITH EXPLANATION FOR FAILURE
    """
    try:
        return value.decode("utf8")
    except Exception as e:
        if not _Log:
            _late_import()

        if not is_binary(value):
            _Log.error("Can not convert {{type}} to unicode because it's not bytes",  type= type(value).__name__)

        e = _Except.wrap(e)
        for i, c in enumerate(value):
            try:
                c.decode("utf8")
            except Exception as f:
                _Log.error("Can not convert charcode {{c}} in string index {{i}}", i=i, c=ord(c), cause=[e, _Except.wrap(f)])

        try:
            latin1 = text_type(value.decode("latin1"))
            _Log.error("Can not explain conversion failure, but seems to be latin1", e)
        except Exception:
            pass

        try:
            a = text_type(value.decode("latin1"))
            _Log.error("Can not explain conversion failure, but seems to be latin1", e)
        except Exception:
            pass

        _Log.error("Can not explain conversion failure of " + type(value).__name__ + "!", e)
Example #23
    def __getitem__(self, index):
        offset = index - self.start
        if offset < len(self.buffer):
            return self.buffer[offset:offset + 1]

        if offset < 0:
            Log.error("Can not go in reverse on stream index=={{index}} (offset={{offset}})", index=index, offset=offset)

        if self._mark == -1:
            self.start += self.buffer_length
            offset = index - self.start
            self.buffer = self.get_more()
            self.buffer_length = len(self.buffer)
            while self.buffer_length <= offset:
                more = self.get_more()
                self.buffer += more
                self.buffer_length = len(self.buffer)
            return self.buffer[offset:offset+1]

        needless_bytes = self._mark - self.start
        if needless_bytes:
            self.start = self._mark
            offset = index - self.start
            self.buffer = self.buffer[needless_bytes:]
            self.buffer_length = len(self.buffer)

        while self.buffer_length <= offset:
            more = self.get_more()
            self.buffer += more
            self.buffer_length = len(self.buffer)

        try:
            return self.buffer[offset:offset+1]
        except Exception as e:
            Log.error("error", cause=e)
Example #24
    def _decode_object_items(index, c, parent_path, query_path, expected_vars):
        """
        ITERATE THROUGH THE PROPERTIES OF AN OBJECT
        """
        c, index = skip_whitespace(index)
        num_items = 0
        while True:
            if c == b',':
                c, index = skip_whitespace(index)
            elif c == b'"':
                name, index = simple_token(index, c)
                if "name" in expected_vars:
                    for i, e in enumerate(expected_vars):
                        if e == "name":
                            destination[i] = name

                c, index = skip_whitespace(index)
                if c != b':':
                    Log.error("Expecting colon")
                c, index = skip_whitespace(index)

                child_expected = needed("value", expected_vars)
                index = _assign_token(index, c, child_expected)
                c, index = skip_whitespace(index)
                DEBUG and not num_items % 1000 and Log.note("{{num}} items iterated", num=num_items)
                yield index
                num_items += 1
            elif c == b"}":
                break
Example #25
 def simple_token(index, c):
     if c == b'"':
         json.mark(index - 1)
         while True:
             c = json[index]
             index += 1
             if c == b"\\":
                 index += 1
             elif c == b'"':
                 break
         return json_decoder(json.release(index).decode("utf8")), index
     elif c in b"{[":
         json.mark(index-1)
         index = jump_to_end(index, c)
         value = wrap(json_decoder(json.release(index).decode("utf8")))
         return value, index
     elif c == b"t" and json.slice(index, index + 3) == b"rue":
         return True, index + 3
     elif c == b"n" and json.slice(index, index + 3) == b"ull":
         return None, index + 3
     elif c == b"f" and json.slice(index, index + 4) == b"alse":
         return False, index + 4
     else:
         json.mark(index-1)
         while True:
             c = json[index]
             if c in b',]}':
                 break
             index += 1
         text = json.release(index)
         try:
             return float(text), index
         except Exception:
             Log.error("Not a known JSON primitive: {{text|quote}}", text=text)
Example #26
def _exec(code):
    try:
        # exec() CAN NOT REBIND A LOCAL VARIABLE IN PYTHON 3, SO CAPTURE THE RESULT IN AN EXPLICIT NAMESPACE
        scope = {}
        exec("temp = " + code, globals(), scope)
        return scope["temp"]
    except Exception as e:
        Log.error("Could not execute {{code|quote}}", code=code, cause=e)
Example #27
def _normalize_groupby(groupby, limit, schema=None):
    if groupby == None:
        return None
    output = wrap([n for ie, e in enumerate(listwrap(groupby)) for n in _normalize_group(e, ie, limit, schema=schema) ])
    if any(o==None for o in output):
        Log.error("not expected")
    return output
Example #28
def _expand(template, seq):
    """
    seq IS TUPLE OF OBJECTS IN PATH ORDER INTO THE DATA TREE
    """
    if is_text(template):
        return _simple_expand(template, seq)
    elif is_data(template):
        # EXPAND LISTS OF ITEMS USING THIS FORM
        # {"from":from, "template":template, "separator":separator}
        template = wrap(template)
        assert template["from"], "Expecting template to have 'from' attribute"
        assert template.template, "Expecting template to have 'template' attribute"

        data = seq[-1][template["from"]]
        output = []
        for d in data:
            s = seq + (d,)
            output.append(_expand(template.template, s))
        return coalesce(template.separator, "").join(output)
    elif is_list(template):
        return "".join(_expand(t, seq) for t in template)
    else:
        if not _Log:
            _late_import()

        _Log.error("can not handle")
Example #29
def format_cube(decoders, aggs, start, query, select):
    # decoders = sorted(decoders, key=lambda d: -d.edge.dim)  # REVERSE DECODER ORDER, BECAUSE ES QUERY WAS BUILT IN REVERSE ORDER
    new_edges = count_dim(aggs, decoders)

    dims = []
    for e in new_edges:
        if isinstance(e.value, TupleOp):
            e.allowNulls = False

        extra = 0 if e.allowNulls is False else 1
        dims.append(len(e.domain.partitions) + extra)

    dims = tuple(dims)
    matricies = [(s, Matrix(dims=dims, zeros=s.default)) for s in select]
    for row, coord, agg in aggs_iterator(aggs, decoders):
        for s, m in matricies:
            try:
                v = _pull(s, agg)
                m[coord] = v
            except Exception as e:
                Log.error("", e)

    cube = Cube(
        query.select,
        sorted(new_edges, key=lambda e: e.dim),  # ENSURE EDGES ARE IN SAME ORDER AS QUERY
        {s.name: m for s, m in matricies}
    )
    cube.frum = query
    return cube
Example #30
    def select(self, selectList, fromPath, varName, sourceVar):
        path = split_field(fromPath)
        is_deep = len(path) > 1
        heads = []
        list = []
        for s in selectList:
            if is_deep:
                if s.value and is_variable_name(s.value):
                    shortForm = self._translate(s.value)
                    list.append("Value2Pipe(" + shortForm + ")\n")
                else:
                    Log.error("do not know how to handle yet")
            else:
                if s.value and is_variable_name(s.value):
                    list.append("Value2Pipe(getDocValue(" + value2MVEL(s.value) + "))\n")
                elif s.value:
                    shortForm = self._translate(s.value)
                    list.append("Value2Pipe(" + shortForm + ")\n")
                else:
                    code, decode = self.Parts2Term(s.domain)
                    heads.append(code.head)
                    list.append("Value2Pipe(" + code.body + ")\n")


        if len(split_field(fromPath)) > 1:
            output = 'if (' + varName + ' != "") ' + varName + '+="|";\n' + varName + '+=' + '+"|"+'.join(["Value2Pipe("+v+")\n" for v in list]) + ';\n'
        else:
            output = varName + ' = ' + '+"|"+'.join(["Value2Pipe("+v+")\n" for v in list]) + ';\n'

        return Data(
            head="".join(heads),
            body=output
        )
Example #31
 def insert(self, docs):
     if not is_many(docs):
         Log.error("Expecting a list of documents")
     doc_collection = self.flatten_many(docs)
     self._insert(doc_collection)
Example #32
    def __init__(
        self,
        hg=None,  # CONNECT TO hg
        repo=None,  # CONNECTION INFO FOR ES CACHE
        use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
        timeout=30 * SECOND,
        kwargs=None,
    ):
        if not _hg_branches:
            _late_imports()

        if not is_text(repo.index):
            Log.error("Expecting 'index' parameter")
        self.repo_locker = Lock()
        self.moves_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon",
                                     max=DAEMON_QUEUE_SIZE)
        self.settings = kwargs
        self.timeout = Duration(timeout)
        self.last_cache_miss = Date.now()

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            http.head(self.settings.hg.url)

        set_default(repo, {
            "type": "revision",
            "schema": revision_schema,
        })
        kwargs.branches = set_default(
            {
                "index": repo.index + "-branches",
                "type": "branch",
            },
            repo,
        )
        moves = set_default(
            {
                "index": repo.index + "-moves",
            },
            repo,
        )

        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        cluster = elasticsearch.Cluster(kwargs=repo)
        self.repo = cluster.get_or_create_index(kwargs=repo)
        self.moves = cluster.get_or_create_index(kwargs=moves)

        def setup_es(please_stop):
            with suppress_exception:
                self.repo.add_alias()
            with suppress_exception:
                self.moves.add_alias()

            with suppress_exception:
                self.repo.set_refresh_interval(seconds=1)
            with suppress_exception:
                self.moves.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        Thread.run("hg daemon", self._daemon)
Example #33
    def _get_push(self, branch, changeset_id):
        if self.repo.cluster.version.startswith("1.7."):
            query = {
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": {
                            "and": [
                                {
                                    "term": {
                                        "branch.name": branch.name
                                    }
                                },
                                {
                                    "prefix": {
                                        "changeset.id": changeset_id[0:12]
                                    }
                                },
                            ]
                        },
                    }
                },
                "size": 1,
            }
        else:
            query = {
                "query": {
                    "bool": {
                        "must": [
                            {
                                "term": {
                                    "branch.name": branch.name
                                }
                            },
                            {
                                "prefix": {
                                    "changeset.id": changeset_id[0:12]
                                }
                            },
                        ]
                    }
                },
                "size": 1,
            }

        try:
            # ALWAYS TRY ES FIRST
            with self.repo_locker:
                response = self.repo.search(query)
                json_push = response.hits.hits[0]._source.push
            if json_push:
                return json_push
        except Exception:
            pass

        url = branch.url.rstrip(
            "/") + "/json-pushes?full=1&changeset=" + changeset_id
        with Explanation("Pulling pushlog from {{url}}", url=url, debug=DEBUG):
            Log.note("Reading pushlog from {{url}}",
                     url=url,
                     changeset=changeset_id)
            data = self._get_and_retry(url, branch)
            # QUEUE UP THE OTHER CHANGESETS IN THE PUSH
            self.todo.add(
                (branch,
                 [c.node for cs in data.values().changesets
                  for c in cs], None))
            pushes = [
                Push(id=int(index), date=_push.date, user=_push.user)
                for index, _push in data.items()
            ]

        if len(pushes) == 0:
            return Null
        elif len(pushes) == 1:
            return pushes[0]
        else:
            Log.error("do not know what to do")
Example #34
    def get_revision(self,
                     revision,
                     locale=None,
                     get_diff=False,
                     get_moves=True,
                     after=None):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision,
                                              locale=locale,
                                              get_diff=get_diff,
                                              get_moves=get_moves,
                                              after=after)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            DEBUG and Log.note(
                "Got hg ({{branch}}, {{locale}}, {{revision}}) from ES",
                branch=output.branch.name,
                locale=locale,
                revision=output.changeset.id,
            )
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents), None))
                self.todo.add((output.branch, listwrap(output.children), None))
            if output.push.date:
                return output

        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        next_cache_miss = self.last_cache_miss + (
            Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note(
                "delaying next hg call for {{seconds|round(decimal=1)}} seconds",
                seconds=next_cache_miss - self.last_cache_miss,
            )
            Till(till=next_cache_miss.unix).wait()

        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}",
                      rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name,
                                                       DEFAULT_LOCALE)]
            if not b:
                Log.warning(
                    "can not find branch ({{branch}}, {{locale}})",
                    branch=lower_name,
                    locale=locale,
                )
                return Null

        if Date.now() - Date(b.etl.timestamp) > _hg_branches.OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        push = self._get_push(found_revision.branch,
                              found_revision.changeset.id)
        id12 = found_revision.changeset.id[0:12]

        url1 = found_revision.branch.url.rstrip(
            "/") + "/json-info?node=" + id12
        url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + id12
        url3 = (found_revision.branch.url.rstrip("/") +
                "/json-automationrelevance/" + id12)
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            automation_details = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
                automation_details = self._get_raw_json_rev(
                    url3, found_revision.branch)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e

            raw_rev3_changeset = first(r for r in automation_details.changesets
                                       if r.node[:12] == id12)
            if last(automation_details.changesets) != raw_rev3_changeset:
                Log.note("interesting")

            output = self._normalize_revision(
                set_default(raw_rev1, raw_rev2, raw_rev3_changeset),
                found_revision,
                push,
                get_diff,
                get_moves,
            )
            if output.push.date >= Date.now() - MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents), None))
                self.todo.add((output.branch, listwrap(output.children), None))
                self.todo.add((output.branch, listwrap(output.backsoutnodes),
                               output.push.date))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            return output
Example #35
 def milli(self, value):
     if not isinstance(value, float):
         from mo_logs import Log
         Log.error("not allowed")
     self._milli = value
Example #36
    def select(self, fields):
        if is_data(fields):
            fields = fields.value

        if is_text(fields):
            # RETURN LIST OF VALUES
            if len(split_field(fields)) == 1:
                if self.path[0] == fields:
                    return [d[1] for d in self.data]
                else:
                    return [d[0][fields] for d in self.data]
            else:
                keys = split_field(fields)
                depth = coalesce(
                    MIN([
                        i for i, (k, p) in enumerate(zip(keys, self.path))
                        if k != p
                    ]), len(self.path))  # LENGTH OF COMMON PREFIX
                short_key = keys[depth:]

                output = FlatList()
                _select1((wrap(d[depth]) for d in self.data), short_key, 0,
                         output)
                return output

        if is_list(fields):
            output = FlatList()

            meta = []
            for f in fields:
                if hasattr(f.value, "__call__"):
                    meta.append((f.name, f.value))
                else:
                    meta.append(
                        (f.name, functools.partial(lambda v, d: d[v],
                                                   f.value)))

            for row in self._values():
                agg = Data()
                for name, f in meta:
                    agg[name] = f(row)

                output.append(agg)

            return output

            # meta = []
            # for f in fields:
            #     keys = split_field(f.value)
            #     depth = coalesce(MIN([i for i, (k, p) in enumerate(zip(keys, self.path)) if k != p]), len(self.path))  # LENGTH OF COMMON PREFIX
            #     short_key = join_field(keys[depth:])
            #
            #     meta.append((f.name, depth, short_key))
            #
            # for row in self._data:
            #     agg = Data()
            #     for name, depth, short_key in meta:
            #         if short_key:
            #             agg[name] = row[depth][short_key]
            #         else:
            #             agg[name] = row[depth]
            #     output.append(agg)
            # return output

        Log.error("multiselect over FlatList not supported")
Example #37
    def update(self, command):
        """
        :param command:  EXPECTING dict WITH {"set": s, "clear": c, "where": w} FORMAT
        """
        command = wrap(command)
        clear_columns = set(listwrap(command['clear']))

        # REJECT DEEP UPDATES
        touched_columns = command.set.keys() | clear_columns
        for c in self.schema.columns:
            if c.name in touched_columns and len(c.nested_path) > 1:
                Log.error("Deep update not supported")

        # ADD NEW COLUMNS
        where = jx_expression(command.where) or TRUE
        _vars = where.vars()
        _map = {
            v: c.es_column
            for v in _vars for c in self.columns.get(v, Null)
            if c.jx_type not in STRUCT
        }
        where_sql = where.map(_map).to_sql(self.schema)[0].sql.b
        new_columns = set(command.set.keys()) - set(
            c.name for c in self.schema.columns)
        for new_column_name in new_columns:
            nested_value = command.set[new_column_name]
            ctype = get_jx_type(nested_value)
            column = Column(name=new_column_name,
                            jx_type=ctype,
                            es_index=self.name,
                            es_type=json_type_to_sqlite_type(ctype),
                            es_column=typed_column(new_column_name, ctype),
                            last_updated=Date.now())
            self.add_column(column)

        # UPDATE THE NESTED VALUES
        for nested_column_name, nested_value in command.set.items():
            if get_jx_type(nested_value) == "nested":
                nested_table_name = concat_field(self.name, nested_column_name)
                nested_table = nested_tables[nested_column_name]
                self_primary_key = sql_list(
                    quote_column(c.es_column) for u in self.uid
                    for c in self.columns[u])
                extra_key_name = UID + text(len(self.uid))
                extra_key = [e
                             for e in nested_table.columns[extra_key_name]][0]

                sql_command = (
                    SQL_DELETE + SQL_FROM + quote_column(nested_table.name) +
                    SQL_WHERE + "EXISTS" +
                    sql_iso(SQL_SELECT + SQL_ONE + SQL_FROM +
                            sql_alias(quote_column(nested_table.name), "n") +
                            SQL_INNER_JOIN +
                            sql_iso(SQL_SELECT + self_primary_key + SQL_FROM +
                                    quote_column(abs_schema.fact) + SQL_WHERE +
                                    where_sql) + " t ON " +
                            SQL_AND.join(
                                quote_column("t", c.es_column) + SQL_EQ +
                                quote_column("n", c.es_column)
                                for u in self.uid for c in self.columns[u])))
                self.db.execute(sql_command)

                # INSERT NEW RECORDS
                if not nested_value:
                    continue

                doc_collection = {}
                for d in listwrap(nested_value):
                    nested_table.flatten(d,
                                         Data(),
                                         doc_collection,
                                         path=nested_column_name)

                prefix = SQL_INSERT + quote_column(nested_table.name) + sql_iso(
                    sql_list([self_primary_key] + [quote_column(extra_key)] + [
                        quote_column(c.es_column)
                        for c in doc_collection.get(".", Null).active_columns
                    ]))

                # BUILD THE PARENT TABLES
                parent = (SQL_SELECT + self_primary_key + SQL_FROM +
                          quote_column(abs_schema.fact) + SQL_WHERE +
                          jx_expression(command.where).to_sql(schema))

                # BUILD THE RECORDS
                children = SQL_UNION_ALL.join(
                    SQL_SELECT +
                    sql_alias(quote_value(i), extra_key.es_column) +
                    SQL_COMMA + sql_list(
                        sql_alias(quote_value(row[c.name]),
                                  quote_column(c.es_column))
                        for c in doc_collection.get(".", Null).active_columns)
                    for i, row in enumerate(
                        doc_collection.get(".", Null).rows))

                sql_command = (prefix + SQL_SELECT + sql_list([
                    quote_column("p", c.es_column) for u in self.uid
                    for c in self.columns[u]
                ] + [quote_column("c", extra_key)] + [
                    quote_column("c", c.es_column)
                    for c in doc_collection.get(".", Null).active_columns
                ]) + SQL_FROM + sql_iso(parent) + " p" + SQL_INNER_JOIN +
                               sql_iso(children) + " c" + SQL_ON + SQL_TRUE)

                self.db.execute(sql_command)

                # THE CHILD COLUMNS COULD HAVE EXPANDED
                # ADD COLUMNS TO SELF
                for n, cs in nested_table.columns.items():
                    for c in cs:
                        column = Column(name=c.name,
                                        jx_type=c.jx_type,
                                        es_type=c.es_type,
                                        es_index=c.es_index,
                                        es_column=c.es_column,
                                        nested_path=[nested_column_name] +
                                        c.nested_path,
                                        last_updated=Date.now())
                        if c.name not in self.columns:
                            self.columns[column.name] = {column}
                        elif c.jx_type not in [
                                c.jx_type for c in self.columns[c.name]
                        ]:
                            self.columns[column.name].add(column)

        command = ConcatSQL(
            SQL_UPDATE, quote_column(self.name), SQL_SET,
            sql_list([
                quote_column(c.es_column) + SQL_EQ +
                quote_value(get_if_type(v, c.jx_type))
                for c in self.schema.columns
                if c.jx_type != NESTED and len(c.nested_path) == 1
                for v in [command.set[c.name]] if v != None
            ] + [
                quote_column(c.es_column) + SQL_EQ + SQL_NULL
                for c in self.schema.columns
                if (c.name in clear_columns and command.set[c.name] != None
                    and c.jx_type != NESTED and len(c.nested_path) == 1)
            ]), SQL_WHERE, where_sql)

        with self.db.transaction() as t:
            t.execute(command)
Example #38
def es_deepop(es, query):
    schema = query.frum.schema
    columns = schema.columns
    query_path = schema.query_path

    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es_query_template(query_path)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, schema)
    for i, f in enumerate(es_filters):
        script = AndOp("and", wheres[i]).partial_eval().to_esfilter(schema)
        set_default(f, script)

    if not wheres[1]:
        more_filter = {
            "bool": {
                "must":
                [AndOp("and", wheres[0]).partial_eval().to_esfilter(schema)],
                "must_not": {
                    "nested": {
                        "path": query_path,
                        "query": {
                            "match_all": {}
                        }
                    }
                }
            }
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)

    # es_query.sort = jx_sort_to_es_sort(query.sort)
    map_to_es_columns = schema.map_to_es()
    # {c.names["."]: c.es_column for c in schema.leaves(".")}
    query_for_es = query.map(map_to_es_columns)
    es_query.sort = jx_sort_to_es_sort(query_for_es.sort, schema)

    es_query.stored_fields = []

    is_list = isinstance(query.select, list)
    new_select = FlatList()

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp) and isinstance(
                s.value.term, Variable):
            # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
            leaves = schema.leaves(s.value.term.var)
            col_names = set()
            for c in leaves:
                if c.nested_path[0] == ".":
                    if c.type == NESTED:
                        continue
                    es_query.stored_fields += [c.es_column]
                c_name = untype_path(c.names[query_path])
                col_names.add(c_name)
                new_select.append({
                    "name": concat_field(s.name, c_name),
                    "nested_path": c.nested_path[0],
                    "put": {
                        "name": concat_field(s.name, literal_field(c_name)),
                        "index": i,
                        "child": "."
                    },
                    "pull": get_pull_function(c)
                })
                i += 1

            # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
            for n in new_select:
                if n.name.startswith("..") and n.name.lstrip(
                        ".") not in col_names:
                    n.put.name = n.name = n.name.lstrip(".")
                    col_names.add(n.name)
        elif isinstance(s.value, Variable):
            net_columns = schema.leaves(s.value.var)
            if not net_columns:
                new_select.append({
                    "name": s.name,
                    "nested_path": ".",
                    "put": {
                        "name": s.name,
                        "index": i,
                        "child": "."
                    },
                    "pull": NULL
                })
            else:
                for n in net_columns:
                    pull = get_pull_function(n)
                    if n.nested_path[0] == ".":
                        if n.type == NESTED:
                            continue
                        es_query.stored_fields += [n.es_column]

                    # WE MUST FIGURE OUT WHICH NAMESPACE s.value.var IS USING SO WE CAN EXTRACT THE child
                    for np in n.nested_path:
                        c_name = untype_path(n.names[np])
                        if startswith_field(c_name, s.value.var):
                            child = relative_field(c_name, s.value.var)
                            break
                    else:
                        child = relative_field(
                            untype_path(n.names[n.nested_path[0]]),
                            s.value.var)

                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": n.nested_path[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": child
                        }
                    })
            i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for c in schema[v]:
                    if c.nested_path[0] == ".":
                        es_query.stored_fields += [c.es_column]
                    # else:
                    #     Log.error("deep field not expected")

            pull_name = EXPRESSION_PREFIX + s.name
            map_to_local = {
                untype_path(k): get_pull(cc)
                for k, c in schema.lookup.items() for cc in c
                if cc.type not in STRUCT
            }
            pull = jx_expression_to_function(pull_name)
            post_expressions[pull_name] = compile_expression(
                expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.__data__(),
                "put": {
                    "name": s.name,
                    "index": i,
                    "child": "."
                }
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(
            es_post(
                es,
                Data(query=more_filter, stored_fields=es_query.stored_fields),
                query.limit))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es_post(es, es_query, query.limit)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH THE INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t

    #</COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
Example #39
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        desc = wrap(desc)

        self.type = "set"
        self.order = {}
        self.NULL = Null
        self.partitions = FlatList()
        self.primitive = True  # True IF DOMAIN IS A PRIMITIVE VALUE SET

        if isinstance(self.key, set):
            Log.error("problem")

        if not desc.key and (len(desc.partitions) == 0
                             or isinstance(desc.partitions[0],
                                           (basestring, Number, tuple))):
            # ASSUME PARTS ARE STRINGS, CONVERT TO REAL PART OBJECTS
            self.key = "value"
            self.map = {}
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                part = {"name": p, "value": p, "dataIndex": i}
                self.partitions.append(part)
                self.map[p] = part
                self.order[p] = i
            self.label = coalesce(self.label, "name")
            self.primitive = True
            return

        if desc.partitions and desc.dimension.fields and len(
                desc.dimension.fields) > 1:
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.dimension.fields)
        elif desc.partitions and isinstance(desc.key, (list, set)):
            # TODO: desc.key CAN BE MUCH LIKE A SELECT, WHICH UniqueIndex CAN NOT HANDLE
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
        elif desc.partitions and isinstance(desc.partitions[0][desc.key],
                                            Mapping):
            self.key = desc.key
            self.map = UniqueIndex(keys=desc.key)
            # self.key = UNION(set(d[desc.key].keys()) for d in desc.partitions)
            # self.map = UniqueIndex(keys=self.key)
        elif len(desc.partitions) == 0:
            # CREATE AN EMPTY DOMAIN
            self.key = "value"
            self.map = {}
            self.order[None] = 0
            self.label = coalesce(self.label, "name")
            return
        elif desc.key == None:
            if desc.partitions and (all(desc.partitions.where)
                                    or all(desc.partitions.esfilter)):
                if not all(desc.partitions.name):
                    Log.error("Expecting all partitions to have a name")
                from pyLibrary.queries.expressions import jx_expression

                self.key = "name"
                self.map = dict()
                self.map[None] = self.NULL
                self.order[None] = len(desc.partitions)
                for i, p in enumerate(desc.partitions):
                    self.partitions.append({
                        "where": jx_expression(coalesce(p.where, p.esfilter)),
                        "name": p.name,
                        "dataIndex": i
                    })
                    self.map[p.name] = p
                    self.order[p.name] = i
                return
            elif desc.partitions and len(set(desc.partitions.value) -
                                         {None}) == len(desc.partitions):
                # TRY A COMMON KEY CALLED "value".  IT APPEARS UNIQUE
                self.key = "value"
                self.map = dict()
                self.map[None] = self.NULL
                self.order[None] = len(desc.partitions)
                for i, p in enumerate(desc.partitions):
                    self.map[p[self.key]] = p
                    self.order[p[self.key]] = i
                self.primitive = False
            else:
                Log.error("Domains must have keys, or partitions")
        elif self.key:
            self.key = desc.key
            self.map = dict()
            self.map[None] = self.NULL
            self.order[None] = len(desc.partitions)
            for i, p in enumerate(desc.partitions):
                self.map[p[self.key]] = p
                self.order[p[self.key]] = i
            self.primitive = False
        else:
            Log.error("Can not hanldle")

        self.label = coalesce(self.label, "name")

        if hasattr(desc.partitions, "__iter__"):
            self.partitions = wrap(list(desc.partitions))
        else:
            Log.error("expecting a list of partitions")
        def _flatten(data,
                     uid,
                     parent_id,
                     order,
                     full_path,
                     nested_path,
                     row=None,
                     guid=None):
            """
            :param data: the data we are pulling apart
            :param uid: the uid we are giving this doc
            :param parent_id: the parent id of this (sub)doc
            :param order: the number of siblings before this one
            :param full_path: path to this (sub)doc
            :param nested_path: list of paths, deepest first
            :param row: we will be filling this
            :param guid: the globally unique id to assign to the top-level document
            :return:
            """
            table = concat_field(self.name, nested_path[0])
            insertion = doc_collection[nested_path[0]]
            if not row:
                row = {GUID: guid, UID: uid, PARENT: parent_id, ORDER: order}
                insertion.rows.append(row)

            if is_data(data):
                items = [(concat_field(full_path, k), v)
                         for k, v in wrap(data).leaves()]
            else:
                # PRIMITIVE VALUES
                items = [(full_path, data)]

            for cname, v in items:
                jx_type = get_jx_type(v)
                if jx_type is None:
                    continue

                insertion = doc_collection[nested_path[0]]
                if jx_type == NESTED:
                    c = first(cc for cc in insertion.active_columns +
                              snowflake.columns if cc.jx_type in STRUCT
                              and untyped_column(cc.name)[0] == cname)
                else:
                    c = first(cc for cc in insertion.active_columns +
                              snowflake.columns
                              if cc.jx_type == jx_type and cc.name == cname)

                if isinstance(c, list):
                    Log.error("confused")

                if not c:
                    # WHAT IS THE NESTING LEVEL FOR THIS PATH?
                    deeper_nested_path = "."
                    for path in snowflake.query_paths:
                        if startswith_field(cname, path[0]) and len(
                                deeper_nested_path) < len(path):
                            deeper_nested_path = path

                    c = Column(name=cname,
                               jx_type=jx_type,
                               es_type=json_type_to_sqlite_type.get(
                                   jx_type, jx_type),
                               es_column=typed_column(
                                   cname, json_type_to_sql_type.get(jx_type)),
                               es_index=table,
                               nested_path=nested_path,
                               last_updated=Date.now())
                    if jx_type == NESTED:
                        snowflake.query_paths.append(c.es_column)
                        required_changes.append({'nest': c})
                    else:
                        insertion.active_columns.add(c)
                        required_changes.append({"add": c})
                elif c.jx_type == NESTED and jx_type == OBJECT:
                    # ALWAYS PROMOTE OBJECTS TO NESTED
                    jx_type = NESTED
                    v = [v]
                elif len(c.nested_path) < len(nested_path):
                    from_doc = doc_collection.get(c.nested_path[0], None)
                    column = c.es_column
                    from_doc.active_columns.remove(c)
                    snowflake._remove_column(c)
                    required_changes.append({"nest": c})
                    deep_c = Column(name=cname,
                                    jx_type=jx_type,
                                    es_type=json_type_to_sqlite_type.get(
                                        jx_type, jx_type),
                                    es_column=typed_column(
                                        cname,
                                        json_type_to_sql_type.get(jx_type)),
                                    es_index=table,
                                    nested_path=nested_path,
                                    last_updated=Date.now())
                    snowflake._add_column(deep_c)
                    snowflake._drop_column(c)
                    from_doc.active_columns.remove(c)

                    for r in from_doc.rows:
                        r1 = unwrap(r)
                        if column in r1:
                            row1 = {
                                UID: self.container.next_uid(),
                                PARENT: r1["__id__"],
                                ORDER: 0,
                                column: r1[column]
                            }
                            insertion.rows.append(row1)
                elif len(c.nested_path) > len(nested_path):
                    insertion = doc_collection[c.nested_path[0]]
                    row = {
                        UID: self.container.next_uid(),
                        PARENT: uid,
                        ORDER: order
                    }
                    insertion.rows.append(row)

                # BE SURE TO NEST VALUES, IF NEEDED
                if jx_type == NESTED:
                    deeper_nested_path = [cname] + nested_path
                    if not doc_collection.get(cname):
                        doc_collection[cname] = Data(active_columns=Queue(),
                                                     rows=[])
                    for i, r in enumerate(v):
                        child_uid = self.container.next_uid()
                        _flatten(r, child_uid, uid, i, cname,
                                 deeper_nested_path)
                elif jx_type == OBJECT:
                    _flatten(v,
                             uid,
                             parent_id,
                             order,
                             cname,
                             nested_path,
                             row=row)
                elif c.jx_type:
                    row[c.es_column] = v
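# A much-simplified, standalone illustration of what _flatten does: walk a nested
# document and emit (path, value) leaves, recursing into lists as child rows. This
# sketch ignores the column bookkeeping, typing, and schema changes handled above.
def flatten_leaves(doc, path=""):
    if isinstance(doc, dict):
        for k, v in doc.items():
            yield from flatten_leaves(v, path + "." + k if path else k)
    elif isinstance(doc, list):
        for i, v in enumerate(doc):
            yield from flatten_leaves(v, path + "[" + str(i) + "]")
    else:
        yield path, doc

# list(flatten_leaves({"a": 1, "b": {"c": [2, 3]}}))
# -> [("a", 1), ("b.c[0]", 2), ("b.c[1]", 3)]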
Exemple #41
0
def bytes2sha1(value):
    if is_text(value):
        Log.error("can not convert unicode to sha1")
    sha = hashlib.sha1(value)
    return sha.hexdigest()
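# Usage sketch: bytes in, hex digest out (the guard above rejects unicode text):
#
#     bytes2sha1(b"hello")   # -> 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'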
Exemple #42
0
 def getDomain(self):
     Log.error("Not implemented")
 def error(self, message):
     Log.error("argparse error: {{error}}", error=message)
Exemple #44
0
 def __bool__(self):
     Log.error(
         "Detecting truthiness of NullOp is too confusing to be allowed")
Exemple #45
0
                assertAlmostEqual(a,
                                  b,
                                  msg=msg,
                                  digits=digits,
                                  places=places,
                                  delta=delta)
        else:
            assertAlmostEqualValue(test,
                                   expected,
                                   msg=msg,
                                   digits=digits,
                                   places=places,
                                   delta=delta)
    except Exception as e:
        Log.error("{{test|json}} does not match expected {{expected|json}}",
                  test=test if show_detail else "[can not show]",
                  expected=expected if show_detail else "[can not show]",
                  cause=e)


def assertAlmostEqualValue(test,
                           expected,
                           digits=None,
                           places=None,
                           msg=None,
                           delta=None):
    """
    Snagged from unittest/case.py, then modified (Aug2014)
    """
    if expected.__class__.__name__ == "NullOp":
        if test == None:
            return
Exemple #46
0
    def login(self, please_stop=None):
        """
        WILL REGISTER THIS DEVICE, AND SHOW A QR-CODE TO LOGIN
        WILL POLL THE SERVICE ENDPOINT UNTIL LOGIN IS COMPLETED, OR FAILED

        :param please_stop: SIGNAL TO STOP EARLY
        :return: SESSION THAT CAN BE USED TO SEND AUTHENTICATED REQUESTS
        """
        # SEND PUBLIC KEY
        now = Date.now().unix
        login_session = requests.session()
        signed = rsa_crypto.sign(
            Data(public_key=self.public_key, timestamp=now), self.private_key)
        DEBUG and Log.note("register (unsigned)\n{{request|json}}",
                           request=rsa_crypto.verify(signed, self.public_key))
        DEBUG and Log.note("register (signed)\n{{request|json}}",
                           request=signed)
        try:
            response = login_session.request(
                "POST",
                str(URL(self.config.service) / self.config.endpoints.register),
                data=value2json(signed))
        except Exception as e:
            raise Log.error("problem registering device", cause=e)

        device = wrap(response.json())
        DEBUG and Log.note("response:\n{{response}}", response=device)
        device.interval = parse(device.interval).seconds
        expires = Till(till=parse(device.expires).unix)
        session_id = self.session_id = device.session_id
        if not session_id:
            Log.error("expecting a session cookie")

        # SHOW URL AS QR CODE
        image = text2QRCode(device.url)

        sys.stdout.write("\n\nLogin using thie URL:\n")
        sys.stdout.write(device.url + CR)
        sys.stdout.write(image)

        while not please_stop and not expires:
            Log.note("waiting for login...")
            try:
                now = Date.now()
                signed = rsa_crypto.sign(
                    Data(timestamp=now, session_id=session_id),
                    self.private_key)
                url = URL(self.config.service) / self.config.endpoints.status
                DEBUG and Log.note("ping (unsigned) {{url}}\n{{request|json}}",
                                   url=url,
                                   request=rsa_crypto.verify(
                                       signed, self.public_key))
                response = login_session.request("POST",
                                                 url,
                                                 data=value2json(signed))
                ping = wrap(response.json())
                DEBUG and Log.note("response\n{{response|json}}",
                                   response=ping)
                if ping.status == "verified":
                    return self
                if not ping.try_again:
                    Log.note("Failed to login {{reason}}", reason=ping.status)
                    return
            except Exception as e:
                Log.warning(
                    "problem calling {{url}}",
                    url=URL(self.config.service) /
                    self.config.endpoints.status,
                    cause=e,
                )
            (Till(seconds=device.interval) | please_stop | expires).wait()
        return self
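    # Hedged usage sketch of the login flow above (the owning class and its
    # construction are not shown in this snippet; names below are illustrative):
    #
    #     auth = AuthClient(config=config)   # holds the key pair and service endpoints
    #     session = auth.login()             # prints a QR code, polls until verified
    #     if session:
    #         ...                            # session.session_id authenticates later calls
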
    def query(self, query=None):
        """
        :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
        :return:
        """
        if not query:
            query = {}
        if not query.get('from'):
            query['from'] = self.name
        elif not startswith_field(query['from'], self.name):
            Log.error("Expecting table, or some nested table")
        query = QueryOp.wrap(query, self.container, self.namespace)
        new_table = "temp_" + unique_name()

        if query.format == "container":
            create_table = SQL_CREATE + quote_column(new_table) + SQL_AS
        else:
            create_table = ""

        if query.groupby and query.format != "cube":
            op, index_to_columns = self._groupby_op(query, self.schema)
            command = create_table + op
        elif query.groupby:
            query.edges, query.groupby = query.groupby, query.edges
            op, index_to_columns = self._edges_op(query, self.schema)
            command = create_table + op
            query.edges, query.groupby = query.groupby, query.edges
        elif query.edges or any(a != "none" for a in listwrap(query.select).aggregate):
            op, index_to_columns = self._edges_op(query, query.frum.schema)
            command = create_table + op
        else:
            op = self._set_op(query)
            return op

        result = self.db.query(command)

        if query.format == "container":
            output = QueryTable(new_table, db=self.db, uid=self.uid, exists=True)
        elif query.format == "cube" or (not query.format and query.edges):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name

            if len(query.edges) == 0 and len(query.groupby) == 0:
                data = {n: Data() for n in column_names}
                for s in index_to_columns.values():
                    data[s.push_name][s.push_child] = unwrap(s.pull(result.data[0]))
                if is_list(query.select):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(
                    data=unwrap(data),
                    select=select,
                    meta={"format": "cube"}
                )

            if not result.data:
                edges = []
                dims = []
                for i, e in enumerate(query.edges + query.groupby):
                    allowNulls = coalesce(e.allowNulls, True)

                    if e.domain.type == "set" and e.domain.partitions:
                        domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                    elif e.domain.type == "range":
                        domain = e.domain
                    elif is_op(e.value, TupleOp):
                        pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name],
                                        "push_child").pull
                        parts = [tuple(p(d) for p in pulls) for d in result.data]
                        domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                    else:
                        domain = SimpleSetDomain(partitions=[])

                    dims.append(1 if allowNulls else 0)
                    edges.append(Data(
                        name=e.name,
                        allowNulls=allowNulls,
                        domain=domain
                    ))

                data = {}
                for si, s in enumerate(listwrap(query.select)):
                    if s.aggregate == "count":
                        data[s.name] = Matrix(dims=dims, zeros=0)
                    else:
                        data[s.name] = Matrix(dims=dims)

                if is_list(query.select):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(
                    meta={"format": "cube"},
                    edges=edges,
                    select=select,
                    data={k: v.cube for k, v in data.items()}
                )

            columns = None

            edges = []
            dims = []
            for g in query.groupby:
                g.is_groupby = True

            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif e.domain.type == "time":
                    domain = wrap(mo_json.scrub(e.domain))
                elif e.domain.type == "duration":
                    domain = wrap(mo_json.scrub(e.domain))
                elif is_op(e.value, TupleOp):
                    pulls = jx.sort([c for c in index_to_columns.values() if c.push_name == e.name], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    if not columns:
                        columns = transpose(*result.data)
                    parts = set(columns[i])
                    if e.is_groupby and None in parts:
                        allowNulls = True
                    parts -= {None}

                    if query.sort[i].sort == -1:
                        domain = SimpleSetDomain(partitions=wrap(sorted(parts, reverse=True)))
                    else:
                        domain = SimpleSetDomain(partitions=jx.sort(parts))

                dims.append(len(domain.partitions) + (1 if allowNulls else 0))
                edges.append(Data(
                    name=e.name,
                    allowNulls=allowNulls,
                    domain=domain
                ))

            data_cubes = {}
            for si, s in enumerate(listwrap(query.select)):
                if s.aggregate == "count":
                    data_cubes[s.name] = Matrix(dims=dims, zeros=0)
                else:
                    data_cubes[s.name] = Matrix(dims=dims)

            r2c = index_to_coordinate(dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
            for rownum, row in enumerate(result.data):
                coord = r2c(rownum)

                for i, s in enumerate(index_to_columns.values()):
                    if s.is_edge:
                        continue
                    if s.push_child == ".":
                        data_cubes[s.push_name][coord] = s.pull(row)
                    else:
                        data_cubes[s.push_name][coord][s.push_child] = s.pull(row)

            if query.select == None:
                select = Null
            elif is_list(query.select):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(
                meta={"format": "cube"},
                edges=edges,
                select=select,
                data={k: v.cube for k, v in data_cubes.items()}
            )
        elif query.format == "table" or (not query.format and query.groupby):
            column_names = [None] * (max(c.push_column for c in index_to_columns.values()) + 1)
            for c in index_to_columns.values():
                column_names[c.push_column] = c.push_column_name
            data = []
            for d in result.data:
                row = [None for _ in column_names]
                for s in index_to_columns.values():
                    if s.push_child == ".":
                        row[s.push_column] = s.pull(d)
                    elif s.num_push_columns:
                        tuple_value = row[s.push_column]
                        if tuple_value == None:
                            tuple_value = row[s.push_column] = [None] * s.num_push_columns
                        tuple_value[s.push_child] = s.pull(d)
                    elif row[s.push_column] == None:
                        row[s.push_column] = Data()
                        row[s.push_column][s.push_child] = s.pull(d)
                    else:
                        row[s.push_column][s.push_child] = s.pull(d)
                data.append(tuple(unwrap(r) for r in row))

            output = Data(
                meta={"format": "table"},
                header=column_names,
                data=data
            )
        elif query.format == "list" or (not query.edges and not query.groupby):
            if not query.edges and not query.groupby and any(listwrap(query.select).aggregate):
                if is_list(query.select):
                    data = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            if data[c.push_name] == None:
                                data[c.push_name] = c.pull(result.data[0])
                            elif is_list(data[c.push_name]):
                                data[c.push_name].append(c.pull(result.data[0]))
                            else:
                                data[c.push_name] = [data[c.push_name], c.pull(result.data[0])]
                        else:
                            data[c.push_name][c.push_child] = c.pull(result.data[0])

                    output = Data(
                        meta={"format": "value"},
                        data=data
                    )
                else:
                    data = Data()
                    for s in index_to_columns.values():
                        if not data[s.push_child]:
                            data[s.push_child] = s.pull(result.data[0])
                        else:
                            data[s.push_child] += [s.pull(result.data[0])]
                    output = Data(
                        meta={"format": "value"},
                        data=unwrap(data)
                    )
            else:
                data = []
                for rownum in result.data:
                    row = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(rownum)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(rownum)
                        else:
                            row[c.push_name][c.push_child] = c.pull(rownum)

                    data.append(row)

                output = Data(
                    meta={"format": "list"},
                    data=data
                )
        else:
            Log.error("unknown format {{format}}", format=query.format)

        return output
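# Hedged usage sketch of the query() formats above (field names and results are
# illustrative; `t` is an instance of the table class this method belongs to):
#
#     t.query({"groupby": "state", "format": "table"})
#         # -> Data(meta={"format": "table"}, header=[...], data=[...])
#     t.query({"edges": ["state"], "format": "cube"})
#         # -> Data(meta={"format": "cube"}, edges=[...], data={...})
#     t.query({"format": "container"})
#         # -> a new QueryTable named "temp_<unique>" holding the result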
Exemple #48
0
def assertAlmostEqual(test,
                      expected,
                      digits=None,
                      places=None,
                      msg=None,
                      delta=None):
    show_detail = True
    test = unwrap(test)
    expected = unwrap(expected)
    try:
        if test is None and expected is None:
            return
        elif isinstance(test, UniqueIndex):
            if test ^ expected:
                Log.error("Sets do not match")
        elif isinstance(expected, Mapping) and isinstance(test, Mapping):
            for k, v2 in unwrap(expected).items():
                v1 = test.get(k)
                assertAlmostEqual(v1,
                                  v2,
                                  msg=msg,
                                  digits=digits,
                                  places=places,
                                  delta=delta)
        elif isinstance(expected, Mapping):
            for k, v2 in expected.items():
                if isinstance(k, basestring):
                    v1 = mo_dots.get_attr(test, literal_field(k))
                else:
                    v1 = test[k]
                assertAlmostEqual(v1,
                                  v2,
                                  msg=msg,
                                  digits=digits,
                                  places=places,
                                  delta=delta)
        elif isinstance(test, (set, list)) and isinstance(expected, set):
            test = set(test)
            if len(test) != len(expected):
                Log.error(
                    "Sets do not match, element count different:\n{{test|json|indent}}\nexpecting{{expectedtest|json|indent}}",
                    test=test,
                    expected=expected)

            for e in expected:
                for t in test:
                    try:
                        assertAlmostEqual(t,
                                          e,
                                          msg=msg,
                                          digits=digits,
                                          places=places,
                                          delta=delta)
                        break
                    except Exception:
                        pass
                else:
                    Log.error(
                        "Sets do not match. {{value|json}} not found in {{test|json}}",
                        value=e,
                        test=test)

        elif isinstance(expected, types.FunctionType):
            return expected(test)
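# Hedged usage sketch (values are illustrative): mappings are compared key-by-key,
# a list/set `test` is matched element-by-element against a set `expected`, and a
# function passed as `expected` is simply applied to `test`:
#
#     assertAlmostEqual({"a": 1.00001}, {"a": 1}, places=3)
#     assertAlmostEqual([1, 2, 3], {3, 2, 1})
#     assertAlmostEqual(5, lambda t: t > 0)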
Exemple #49
0
 def __init__(self, value):
     SQL.__init__(self)
     if DEBUG and isinstance(value, SQL):
         Log.error("Expecting text, not SQL")
     self.value = value
Exemple #50
0
 def add(self, record):
     if isinstance(record, list):
         Log.error("no longer accepting lists, use extend()")
     return self.extend([record])
Exemple #51
0
    def __new__(cls, e=None, query=None, *args, **kwargs):
        e.allowNulls = coalesce(e.allowNulls, True)

        if e.value and e.domain.type == "default":
            # if query.groupby:
            #     return object.__new__(DefaultDecoder, e)

            if is_text(e.value):
                Log.error("Expecting Variable or Expression, not plain string")

            if is_op(e.value, LeavesOp):
                return object.__new__(ObjectDecoder)
            elif is_op(e.value, TupleOp):
                # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
                # JUST PULL THE FIELDS
                if not all(is_op(t, Variable) for t in e.value.terms):
                    Log.error("Can only handle variables in tuples")

                e.domain = Data(dimension={"fields": e.value.terms})
                return object.__new__(DimFieldListDecoder)

            elif is_op(e.value, Variable):
                schema = query.frum.schema
                cols = schema.leaves(e.value.var)
                if not cols:
                    return object.__new__(DefaultDecoder)
                if len(cols) != 1:
                    return object.__new__(ObjectDecoder)
                col = first(cols)
                limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

                if col.cardinality == None:
                    DEBUG and Log.warning(
                        "metadata for column {{name|quote}} (id={{id}}) is not ready",
                        name=concat_field(col.es_index, col.es_column),
                        id=id(col))
                    e.domain = set_default(DefaultDomain(limit=limit),
                                           e.domain.__data__())
                    return object.__new__(DefaultDecoder)
                elif col.partitions == None:
                    e.domain = set_default(DefaultDomain(limit=limit),
                                           e.domain.__data__())
                    return object.__new__(DefaultDecoder)
                else:
                    DEBUG and Log.note("id={{id}} has parts!!!", id=id(col))
                    if col.multi > 1:
                        return object.__new__(MultivalueDecoder)

                    partitions = col.partitions[:limit:]
                    if e.domain.sort == -1:
                        partitions = list(reversed(sorted(partitions)))
                    else:
                        partitions = sorted(partitions)
                    e.domain = SimpleSetDomain(partitions=partitions,
                                               limit=limit)

            else:
                return object.__new__(DefaultDecoder)

        if e.value and e.domain.type in PARTITION:
            return object.__new__(SetDecoder)
        if isinstance(e.domain.dimension, Dimension):
            e.domain = e.domain.dimension.getDomain()
            return object.__new__(SetDecoder)
        if e.value and e.domain.type == "time":
            return object.__new__(TimeDecoder)
        if e.range:
            return object.__new__(GeneralRangeDecoder)
        if e.value and e.domain.type == "duration":
            return object.__new__(DurationDecoder)
        elif e.value and e.domain.type == "range":
            return object.__new__(RangeDecoder)
        elif not e.value and e.domain.dimension.fields:
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            fields = e.domain.dimension.fields
            if is_data(fields):
                Log.error("No longer allowed: All objects are expressions")
            else:
                return object.__new__(DimFieldListDecoder)
        elif not e.value and all(e.domain.partitions.where):
            return object.__new__(GeneralSetDecoder)
        else:
            Log.error("domain type of {{type}} is not supported yet",
                      type=e.domain.type)
    def query_metadata(self, query):
        frum, query['from'] = query['from'], self
        schema = self.snowflake.tables["."].schema
        query = QueryOp.wrap(query, schema)
        columns = self.snowflake.columns
        where = query.where
        table_name = None
        column_name = None

        if query.edges or query.groupby:
            Log.error("Aggregates(groupby or edge) are not supported")

        if where.op == "eq" and where.lhs.var == "table":
            table_name = mo_json.json2value(where.rhs.json)
        elif where.op == "eq" and where.lhs.var == "name":
            column_name = mo_json.json2value(where.rhs.json)
        else:
            Log.error("Only simple filters are expected like: \"eq\" on table and column name")

        tables = [concat_field(self.snowflake.fact_name, i) for i in self.tables.keys()]

        metadata = []
        if columns[-1].es_column != GUID:
            columns.append(Column(
                name=GUID,
                jx_type=STRING,
                es_column=GUID,
                es_index=self.snowflake.fact_name,
                nested_path=["."]
            ))

        for tname, table in zip(self.tables.keys(), tables):
            if table_name != None and table_name != table:
                continue

            for col in columns:
                cname, ctype = untyped_column(col.es_column)
                if column_name != None and column_name != cname:
                    continue

                metadata.append((table, relative_field(col.name, tname), col.type, unwraplist(col.nested_path)))

        if query.format == "cube":
            num_rows = len(metadata)
            header = ["table", "name", "type", "nested_path"]
            temp_data = dict(zip(header, zip(*metadata)))
            return Data(
                meta={"format": "cube"},
                data=temp_data,
                edges=[{
                    "name": "rownum",
                    "domain": {
                        "type": "rownum",
                        "min": 0,
                        "max": num_rows,
                        "interval": 1
                    }
                }]
            )
        elif query.format == "table":
            header = ["table", "name", "type", "nested_path"]
            return Data(
                meta={"format": "table"},
                header=header,
                data=metadata
            )
        else:
            header = ["table", "name", "type", "nested_path"]
            return Data(
                meta={"format": "list"},
                data=[dict(zip(header, r)) for r in metadata]
            )
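# Hedged usage sketch of query_metadata() (table name and results are illustrative;
# `t` is the container instance this method belongs to):
#
#     t.query_metadata({"where": {"eq": {"table": "my_facts"}}, "format": "table"})
#         # -> Data(meta={"format": "table"},
#         #         header=["table", "name", "type", "nested_path"],
#         #         data=[...])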
Exemple #53
0
 def append_query(self, query_path, es_query):
     Log.error("Not supported")
Exemple #54
0
 def __iter__(self):
     raise Log.error("not implemented")
Exemple #55
0
 def verify_attributes_not_null(self, attribute_names):
     for name in attribute_names:
         if getattr(self, name) == None:
             Log.error('{{type}} domain expects a {{name|quote}} parameter',
                       type=self.type,
                       name=name)
Exemple #56
0
 def get_index(self, row, es_query=None, index=None):
     try:
         key = row[0].get('key')
         return self.domain.getIndexByKey(key)
     except Exception as e:
         Log.error("problem", cause=e)
Exemple #57
0
    def __init__(
        self,
        host,
        index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
        alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
        type=None,
        name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None,
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs),
            }
        self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
        self.worker = None
        self.settings = kwargs
        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.name = name = self._namespace._find_alias(
            coalesce(alias, index, name))
        if read_only:
            self.es = elasticsearch.Alias(alias=name,
                                          index=None,
                                          kwargs=kwargs)
        else:
            self.es = (elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs))

        self._ensure_max_result_window_set(name)
        self.settings.type = self.es.settings.type
        self.stats = QueryStats(self.es.cluster)

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed,
                )
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if v == ".":
                    return (".", )
                return (v, ) + nested_path_of(all_paths[v])

            query_paths = sort_using_key(
                set(step for path in self.snowflake.query_paths
                    for step in path),
                key=lambda p: len(split_field(p)),
            )
            for step in query_paths:
                if step in all_paths:
                    continue
                else:
                    best = "."
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                if p == ".":
                    nested_path = (".", )
                else:
                    nested_path = nested_path_of(p)[1:]

                jx_type = OBJECT if p == "." else NESTED
                self.namespace.meta.columns.add(
                    Column(
                        name=p,
                        es_column=p,
                        es_index=self.name,
                        es_type=jx_type,
                        jx_type=jx_type,
                        cardinality=1,
                        nested_path=nested_path,
                        multi=1001 if jx_type is NESTED else 1,
                        last_updated=Date.now(),
                    ))
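# Hedged construction sketch (class name, host, and index are illustrative; the
# snippet above does not name the container class this __init__ belongs to):
#
#     container = ESContainer(host="http://localhost", port=9200,
#                             index="my_index", read_only=True)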
Exemple #58
0
    def extract(self, settings, force, restart, start, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if start:
                state = start, 0
            elif restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                get_ids = sql_query({
                    "from": "job",
                    "select": ["id"],
                    "where": {
                        "or": [
                            {
                                "gt": {
                                    "last_modified": Date(last_modified)
                                }
                            },
                            {
                                "and": [
                                    {
                                        "eq": {
                                            "last_modified":
                                            Date(last_modified)
                                        }
                                    },
                                    {
                                        "gt": {
                                            "id": job_id
                                        }
                                    },
                                ]
                            },
                        ]
                    },
                    "sort": ["last_modified", "id"],
                    "limit": settings.extractor.chunk_size,
                })
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # SOME LIMITS PLACED ON STRING SIZE
                for fl in jx.drill(acc, "job_log.failure_line"):
                    fl.message = strings.limit(fl.message, 10000)

                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
Exemple #59
0
    def update(self, command):
        """
        EXPECTING command == {"set":term, "where":where}
        THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
        THE where CLAUSE IS AN ES FILTER
        """
        command = to_data(command)
        table = self.get_table(command["update"])

        es_index = self.es.cluster.get_index(read_only=False,
                                             alias=None,
                                             kwargs=self.es.settings)

        schema = table.schema

        # GET IDS OF DOCUMENTS
        query = {
            "from": command["update"],
            "select": (
                [{"value": "_id"}]
                + [{"name": k, "value": v} for k, v in command.set.items()]
            ),
            "where": command.where,
            "format": "list",
            "limit": 10000,
        }

        results = self.query(query)

        if results.data:
            content = "".join(t for r in results.data
                              for _id, row in [(r._id, r)]
                              for _ in [row.__setitem__("_id", None)
                                        ]  # WARNING! DESTRUCTIVE TO row
                              for update in map(value2json, ({
                                  "update": {
                                      "_id": _id
                                  }
                              }, {
                                  "doc": row
                              })) for t in (update, "\n"))
            response = self.es.cluster.post(
                es_index.path + "/" + "_bulk",
                data=content,
                timeout=self.settings.timeout,
                params={
                    "wait_for_active_shards":
                    self.settings.wait_for_active_shards
                },
            )
            if response.errors:
                Log.error(
                    "could not update: {{error}}",
                    error=[
                        e.error for i in response["items"] for e in i.values()
                        if e.status not in (200, 201)
                    ],
                )

        # DELETE BY QUERY, IF NEEDED
        if "." in listwrap(command["clear"]):
            es_filter = (ES52Lang[jx_expression(
                command.where)].partial_eval().to_es(schema))
            self.es.delete_record(es_filter)
            return

        es_index.refresh()
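# Hedged usage sketch of the command shape update() expects (index and field names
# are illustrative; `container` is the instance this method belongs to):
#
#     container.update({
#         "update": "my_index",
#         "set": {"status": "done"},
#         "where": {"eq": {"id": 42}},
#     })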
Exemple #60
0
def parse(json, query_path, expected_vars=NO_VARS):
    """
    INTENDED TO TREAT JSON AS A STREAM; USING MINIMAL MEMORY WHILE IT ITERATES
    THROUGH THE STRUCTURE.  ASSUMING THE JSON IS LARGE, AND HAS A HIGH LEVEL
    ARRAY STRUCTURE, IT WILL yield EACH OBJECT IN THAT ARRAY.  NESTED ARRAYS
    ARE HANDLED BY REPEATING THE PARENT PROPERTIES FOR EACH MEMBER OF THE
    NESTED ARRAY. DEEPER NESTED PROPERTIES ARE TREATED AS PRIMITIVE VALUES;
    THE STANDARD JSON DECODER IS USED.

    LARGE MANY-PROPERTY OBJECTS CAN BE HANDLED BY `items()`

    :param json:       SOME STRING-LIKE STRUCTURE THAT WE CAN ASSUME IS READ
                       ONE CHARACTER AT A TIME, IN ORDER
    :param query_path: A DOT-SEPARATED STRING INDICATING THE PATH TO THE
                       NESTED ARRAY. OPTIONALLY, PASS {"items":query_path} TO
                       FURTHER ITERATE OVER PROPERTIES OF OBJECTS FOUND AT
                       query_path
    :param expected_vars: REQUIRED PROPERTY NAMES, USED TO DETERMINE IF
                          MORE-THAN-ONE PASS IS REQUIRED
    :return: RETURNS AN ITERATOR OVER ALL OBJECTS FROM ARRAY LOCATED AT query_path
    """
    if hasattr(json, "read"):
        # ASSUME IT IS A STREAM
        temp = json

        def get_more():
            return temp.read(MIN_READ_SIZE)

        json = List_usingStream(get_more)
    elif hasattr(json, "__call__"):
        json = List_usingStream(json)
    elif isinstance(json, GeneratorType):
        json = List_usingStream(json.next)
    else:
        Log.error(
            "Expecting json to be a stream, or a function that will return more bytes"
        )

    def _iterate_list(index, c, parent_path, path, expected_vars):
        c, index = skip_whitespace(index)
        if c == b']':
            yield index
            return

        while True:
            if not path:
                index = _assign_token(index, c, expected_vars)
                c, index = skip_whitespace(index)
                if c == b']':
                    yield index
                    _done(parent_path)
                    return
                elif c == b',':
                    yield index
                    c, index = skip_whitespace(index)
            else:
                for index in _decode_token(index, c, parent_path, path,
                                           expected_vars):
                    c, index = skip_whitespace(index)
                    if c == b']':
                        yield index
                        _done(parent_path)
                        return
                    elif c == b',':
                        yield index
                        c, index = skip_whitespace(index)

    def _done(parent_path):
        if len(parent_path) < len(done[0]):
            done[0] = parent_path

    def _decode_object(index, c, parent_path, query_path, expected_vars):
        if "." in expected_vars:
            if len(done[0]) <= len(parent_path) and all(
                    d == p for d, p in zip(done[0], parent_path)):
                Log.error("Can not pick up more variables, iterator is done")

            if query_path:
                Log.error("Can not extract objects that contain the iteration",
                          var=join_field(query_path))

            index = _assign_token(index, c, expected_vars)
            # c, index = skip_whitespace(index)
            yield index
            return

        did_yield = False
        while True:
            c, index = skip_whitespace(index)
            if c == b',':
                continue
            elif c == b'"':
                name, index = simple_token(index, c)

                c, index = skip_whitespace(index)
                if c != b':':
                    Log.error("Expecting colon")
                c, index = skip_whitespace(index)

                child_expected = needed(name, expected_vars)
                child_path = parent_path + [name]
                if any(child_expected):
                    if not query_path:
                        index = _assign_token(index, c, child_expected)
                    elif query_path[0] == name:
                        for index in _decode_token(index, c, child_path,
                                                   query_path[1:],
                                                   child_expected):
                            did_yield = True
                            yield index
                    else:
                        if len(done[0]) <= len(child_path):
                            Log.error(
                                "Can not pick up more variables, iterator over {{path}} is done",
                                path=join_field(done[0]))
                        index = _assign_token(index, c, child_expected)
                elif query_path and query_path[0] == name:
                    for index in _decode_token(index, c, child_path,
                                               query_path[1:], child_expected):
                        yield index
                else:
                    index = jump_to_end(index, c)
            elif c == b"}":
                if not did_yield:
                    yield index
                break

    def set_destination(expected_vars, value):
        for i, e in enumerate(expected_vars):
            if e is None:
                pass
            elif e == ".":
                destination[i] = value
            elif isinstance(value, Mapping):
                destination[i] = value[e]
            else:
                destination[i] = Null

    def _decode_object_items(index, c, parent_path, query_path, expected_vars):
        """
        ITERATE THROUGH THE PROPERTIES OF AN OBJECT
        """
        c, index = skip_whitespace(index)
        num_items = 0
        while True:
            if c == b',':
                c, index = skip_whitespace(index)
            elif c == b'"':
                name, index = simple_token(index, c)
                if "name" in expected_vars:
                    for i, e in enumerate(expected_vars):
                        if e == "name":
                            destination[i] = name

                c, index = skip_whitespace(index)
                if c != b':':
                    Log.error("Expecting colon")
                c, index = skip_whitespace(index)

                child_expected = needed("value", expected_vars)
                index = _assign_token(index, c, child_expected)
                c, index = skip_whitespace(index)
                DEBUG and not num_items % 1000 and Log.note(
                    "{{num}} items iterated", num=num_items)
                yield index
                num_items += 1
            elif c == b"}":
                break

    def _decode_token(index, c, parent_path, query_path, expected_vars):
        if c == b'{':
            if query_path and query_path[0] == "$items":
                if any(expected_vars):
                    for index in _decode_object_items(index, c, parent_path,
                                                      query_path[1:],
                                                      expected_vars):
                        yield index
                else:
                    index = jump_to_end(index, c)
                    yield index
            elif not any(expected_vars):
                index = jump_to_end(index, c)
                yield index
            else:
                for index in _decode_object(index, c, parent_path, query_path,
                                            expected_vars):
                    yield index
        elif c == b'[':
            for index in _iterate_list(index, c, parent_path, query_path,
                                       expected_vars):
                yield index
        else:
            index = _assign_token(index, c, expected_vars)
            yield index

    def _assign_token(index, c, expected_vars):
        if not any(expected_vars):
            return jump_to_end(index, c)

        value, index = simple_token(index, c)
        set_destination(expected_vars, value)

        return index

    def jump_to_end(index, c):
        """
        DO NOT PROCESS THIS JSON OBJECT, JUST RETURN WHERE IT ENDS
        """
        if c == b'"':
            while True:
                c = json[index]
                index += 1
                if c == b'\\':
                    index += 1
                elif c == b'"':
                    break
            return index
        elif c not in b"[{":
            while True:
                c = json[index]
                index += 1
                if c in b',]}':
                    break
            return index - 1

        # OBJECTS AND ARRAYS ARE MORE INVOLVED
        stack = [None] * 1024
        stack[0] = CLOSE[c]
        i = 0  # FOR INDEXING THE STACK
        while True:
            c = json[index]
            index += 1

            if c == b'"':
                while True:
                    c = json[index]
                    index += 1
                    if c == b'\\':
                        index += 1
                    elif c == b'"':
                        break
            elif c in b'[{':
                i += 1
                stack[i] = CLOSE[c]
            elif c == stack[i]:
                i -= 1
                if i == -1:
                    return index  # FOUND THE MATCH!  RETURN
            elif c in b']}':
                Log.error("expecting {{symbol}}", symbol=stack[i])

    def simple_token(index, c):
        if c == b'"':
            json.mark(index - 1)
            while True:
                c = json[index]
                index += 1
                if c == b"\\":
                    index += 1
                elif c == b'"':
                    break
            return json_decoder(json.release(index).decode("utf8")), index
        elif c in b"{[":
            json.mark(index - 1)
            index = jump_to_end(index, c)
            value = wrap(json_decoder(json.release(index).decode("utf8")))
            return value, index
        elif c == b"t" and json.slice(index, index + 3) == b"rue":
            return True, index + 3
        elif c == b"n" and json.slice(index, index + 3) == b"ull":
            return None, index + 3
        elif c == b"f" and json.slice(index, index + 4) == b"alse":
            return False, index + 4
        else:
            json.mark(index - 1)
            while True:
                c = json[index]
                if c in b',]}':
                    break
                index += 1
            text = json.release(index)
            try:
                return float(text), index
            except Exception:
                Log.error("Not a known JSON primitive: {{text|quote}}",
                          text=text)

    def skip_whitespace(index):
        """
        RETURN NEXT NON-WHITESPACE CHAR, AND ITS INDEX
        """
        c = json[index]
        while c in WHITESPACE:
            index += 1
            c = json[index]
        return c, index + 1

    if isinstance(query_path, Mapping) and query_path.get("items"):
        path_list = split_field(query_path.get("items")) + [
            "$items"
        ]  # INSERT A MARKER SO THAT OBJECT IS STREAM DECODED
    else:
        path_list = split_field(query_path)

    destination = [None] * len(expected_vars)
    c, index = skip_whitespace(0)
    done = [path_list + [None]]
    for _ in _decode_token(index, c, [], path_list, expected_vars):
        output = Data()
        for i, e in enumerate(expected_vars):
            output[e] = destination[i]
        yield output
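# Hedged usage sketch of parse() (file name and JSON shape are illustrative): stream
# a large file and receive one Data() per element of the array at "hits.hits":
#
#     with open("results.json", "rb") as f:
#         for doc in parse(f, "hits.hits", expected_vars=["_id", "_source"]):
#             process(doc)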