Example #1
class Log_usingThread(BaseLog):

    def __init__(self, logger):
        # DELAYED LOAD FOR THREADS MODULE
        from pyLibrary.thread.threads import Queue

        self.queue = Queue("logs", max=10000, silent=True)
        self.logger = logger

        def worker(please_stop):
            while not please_stop:
                Thread.sleep(1)
                logs = self.queue.pop_all()
                for log in logs:
                    if log is Thread.STOP:
                        if DEBUG_LOGGING:
                            sys.stdout.write("Log_usingThread.worker() sees stop, filling rest of queue\n")
                        please_stop.go()
                    else:
                        self.logger.write(**log)

        self.thread = Thread("log thread", worker)
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception, e:
            sys.stdout.write("IF YOU SEE THIS, IT IS LIKELY YOU FORGOT TO RUN Log.start() FIRST\n")
            raise e  # OH NO!
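Example #1 decouples callers from a slow logging backend: write() only enqueues, and a background worker drains the queue. A minimal standard-library sketch of the same pattern, with illustrative names (pyLibrary's Queue adds batching via pop_all() and a please_stop signal on top of this):

import threading
from Queue import Queue  # STDLIB QUEUE (NAMED `queue` ON PYTHON 3)

STOP = object()  # SENTINEL, PLAYING THE ROLE OF Thread.STOP ABOVE

class ThreadedLog(object):
    def __init__(self, logger):
        self.queue = Queue(maxsize=10000)
        self.logger = logger
        self.thread = threading.Thread(target=self._worker)
        self.thread.daemon = True
        self.thread.start()

    def _worker(self):
        # BLOCKING get() REPLACES THE sleep/pop_all POLLING LOOP ABOVE
        while True:
            log = self.queue.get()
            if log is STOP:
                return
            self.logger.write(**log)

    def write(self, template, params):
        self.queue.put({"template": template, "params": params})
        return self

    def stop(self):
        self.queue.put(STOP)
        self.thread.join()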
Example #2
def etl_one(settings):
    queue = Queue("temp work queue")
    queue.__setattr__(b"commit", Null)
    queue.__setattr__(b"rollback", Null)

    settings.param.wait_forever = False
    already_in_queue = set()
    for w in settings.workers:
        source = get_container(w.source)
        # source.settings.fast_forward = True
        if id(source) in already_in_queue:
            continue
        try:
            for i in parse_id_argument(settings.args.id):
                data = source.get_key(i)
                if data != None:
                    already_in_queue.add(id(source))
                    queue.add(Dict(
                        bucket=w.source.bucket,
                        key=i
                    ))
        except Exception, e:
            if "Key {{key}} does not exist" in e:
                already_in_queue.add(id(source))
                queue.add(Dict(
                    bucket=w.source.bucket,
                    key=settings.args.id
                ))
            Log.warning("Problem", cause=e)
Example #3
    def worker(please_stop):
        pending = Queue("pending ids", max=BATCH_SIZE*3, silent=False)

        pending_thread = Thread.run(
            "get pending",
            get_pending,
            source=source,
            since=last_updated,
            pending_bugs=pending,
            please_stop=please_stop
        )
        diff_thread = Thread.run(
            "diff",
            diff,
            source,
            destination,
            pending,
            please_stop=please_stop
        )
        replication_thread = Thread.run(
            "replication",
            replicate,
            source,
            destination,
            pending,
            config.fix,
            please_stop=please_stop
        )
        pending_thread.join()
        diff_thread.join()
        pending.add(Thread.STOP)
        replication_thread.join()
        done.go()
        please_stop.go()
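The shutdown order above is the point of the example: join the producer stages first, then enqueue a single Thread.STOP so the consumer drains the remaining work before exiting. A standard-library sketch of that sequencing, names illustrative:

import threading
from Queue import Queue

STOP = object()

def producer(q):
    for i in range(10):
        q.put(i)

def consumer(q):
    while True:
        item = q.get()
        if item is STOP:
            return
        print "replicating", item

q = Queue()
p = threading.Thread(target=producer, args=(q,))
c = threading.Thread(target=consumer, args=(q,))
p.start()
c.start()
p.join()     # ALL WORK IS ENQUEUED BEFORE...
q.put(STOP)  # ...THE STOP SENTINEL, SO NOTHING IS LOST
c.join()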
Example #4
class TextLog_usingElasticSearch(TextLog):
    @use_settings
    def __init__(self, host, index, type="log", max_size=1000, batch_size=100, settings=None):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        self.es = Cluster(settings).get_or_create_index(
            schema=convert.json2value(convert.value2json(SCHEMA), leaves=True),
            limit_replicas=True,
            tjson=True,
            settings=settings
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(settings.alias, settings.index))
        self.queue = Queue("debug logs to es", max=max_size, silent=True)
        self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
        self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
        Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        if params.get("template"):
            # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE
            self.queue.add({"value": params})
        else:
            template = strings.limit(template, 2000)
            self.queue.add({"value": {"template": template, "params": params}}, timeout=3 * MINUTE)
        return self

    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                Thread.sleep(seconds=1)
                messages = wrap(self.queue.pop_all())
                if messages:
                    # for m in messages:
                    #     m.value.params = leafer(m.value.params)
                    #     m.value.error = leafer(m.value.error)
                    for g, mm in jx.groupby(messages, size=self.batch_size):
                        self.es.extend(mm)
                    bad_count = 0
            except Exception, e:
                Log.warning("Problem inserting logs into ES", cause=e)
                bad_count += 1
                if bad_count > 5:
                    break
        Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Thread.sleep(seconds=1)
                self.queue.pop_all()
            except Exception, e:
                Log.warning("Should not happen", cause=e)
class Log_usingThreadedStream(BaseLog):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream

        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from pyLibrary.thread.threads import Queue

        if use_UTF8:

            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("log to stream", max=10000, silent=True)
        self.thread = Thread("log to " + name,
                             time_delta_pusher,
                             appender=appender,
                             queue=self.queue,
                             interval=timedelta(seconds=0.3))
        self.thread.parent.remove_child(
            self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception, e:
            raise e  # OH NO!
Example #7
class Multithread(object):
    """
    SIMPLE SEMANTICS FOR SYMMETRIC MULTITHREADING
    PASS A SET OF functions TO BE EXECUTED (ONE PER THREAD)
    SET outbound=False TO SIMPLY THROW AWAY RETURN VALUES, IF ANY
    threads - IF functions IS NOT AN ARRAY, THEN threads IS USED TO MAKE AN ARRAY
    THE inbound QUEUE IS EXPECTING dicts, EACH dict IS USED AS kwargs TO GIVEN functions
    """

    def __init__(self, functions, threads=None, outbound=None, silent_queues=None):
        if outbound is None:
            self.outbound = Queue("multithread", silent=silent_queues)
        elif outbound is False:
            self.outbound = None
        else:
            self.outbound = outbound

        self.inbound = Queue("multithread", silent=silent_queues)

        # MAKE THREADS
        if isinstance(functions, Iterable):
            Log.error("Not supported anymore")

        self.threads = []
        for t in range(coalesce(threads, 1)):
            thread = worker_thread("worker " + unicode(t), self.inbound, self.outbound, functions)
            self.threads.append(thread)

    def __enter__(self):
        return self

    # WAIT FOR ALL QUEUED WORK TO BE DONE BEFORE RETURNING
    def __exit__(self, type, value, traceback):
        try:
            if isinstance(value, Exception):
                self.inbound.close()

                for t in self.threads:
                    t.keep_running = False
            else:
                # ADD STOP MESSAGE, ONE FOR EACH THREAD, FOR ORDERLY SHUTDOWN
                for t in self.threads:
                    self.inbound.add(Thread.STOP)
            self.join()
        except Exception, e:
            Log.warning("Problem sending stops", e)
Example #8
class Log_usingQueue(BaseLog):
    def __init__(self):
        self.queue = Queue("log messages")

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
class TextLog_usingThreadedStream(TextLog):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream

        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from pyLibrary.thread.threads import Queue

        if use_UTF8:
            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("log to stream", max=10000, silent=True)
        self.thread = Thread("log to " + name, time_delta_pusher, appender=appender, queue=self.queue, interval=timedelta(seconds=0.3))
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception, e:
            raise e  # OH NO!
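Both threaded-stream loggers above choose an appender once, up front: a plain stream.write for unicode-capable streams, or a UTF-8 encoding shim for the old sys.* byte streams. The shim in isolation (Python 2 semantics, as in the code above):

import sys

def make_appender(stream):
    def appender(value):
        # ENCODE unicode TO BYTES; BYTE STRINGS PASS THROUGH UNTOUCHED
        if isinstance(value, unicode):
            value = value.encode('utf8')
        stream.write(value)
    return appender

append = make_appender(sys.stdout)
append(u"caf\xe9\n")  # WRITTEN AS UTF-8 BYTES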
Example #10
    def find_changeset(self, revision, please_stop=False):
        locker = Lock()
        output = []
        queue = Queue("branches", max=2000)
        queue.extend(self.branches)
        queue.add(Thread.STOP)

        problems = []
        def _find(please_stop):
            for b in queue:
                if please_stop:
                    return
                try:
                    url = b.url + "json-info?node=" + revision
                    response = http.get(url, timeout=30)
                    if response.status_code == 200:
                        with locker:
                            output.append(b)
                        Log.note("{{revision}} found at {{url}}", url=url, revision=revision)
                except Exception, f:
                    problems.append(f)
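The snippet above defines only the _find worker; presumably several copies run against the shared queue, appending hits to output under the lock. A standard-library sketch of that fan-out shape, names illustrative:

import threading

def fan_out(items, probe, n_threads=4):
    lock = threading.Lock()
    work = list(items)
    output = []

    def worker():
        while True:
            with lock:
                if not work:
                    return
                item = work.pop()
            if probe(item):  # E.G. AN HTTP CHECK, AS ABOVE
                with lock:
                    output.append(item)

    threads = [threading.Thread(target=worker) for _ in range(n_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return output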
Example #11
def etl_one(settings):
    queue = Queue("temp work queue")
    queue.__setattr__(b"commit", Null)
    queue.__setattr__(b"rollback", Null)

    settings.param.wait_forever = False
    already_in_queue = set()
    for w in settings.workers:
        source = get_container(w.source)
        # source.settings.fast_forward = True
        if id(source) in already_in_queue:
            continue
        try:
            for i in parse_id_argument(settings.args.id):
                data = source.get_key(i)
                if data != None:
                    already_in_queue.add(id(source))
                    queue.add(Dict(bucket=w.source.bucket, key=i))
        except Exception, e:
            if "Key {{key}} does not exist" in e:
                already_in_queue.add(id(source))
                queue.add(Dict(bucket=w.source.bucket, key=settings.args.id))
            Log.warning("Problem", cause=e)
class TextLog_usingQueue(TextLog):

    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " "+name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
Example #13
class Log_usingThread(BaseLog):
    def __init__(self, logger):
        # DELAYED LOAD FOR THREADS MODULE
        from pyLibrary.thread.threads import Queue

        self.queue = Queue("logs", max=10000, silent=True)
        self.logger = logger

        def worker(please_stop):
            while not please_stop:
                Thread.sleep(1)
                logs = self.queue.pop_all()
                for log in logs:
                    if log is Thread.STOP:
                        if DEBUG_LOGGING:
                            sys.stdout.write(
                                "Log_usingThread.worker() sees stop, filling rest of queue\n"
                            )
                        please_stop.go()
                    else:
                        self.logger.write(**log)

        self.thread = Thread("log thread", worker)
        self.thread.parent.remove_child(
            self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception, e:
            sys.stdout.write(
                "IF YOU SEE THIS, IT IS LIKELY YOU FORGOT TO RUN Log.start() FIRST\n"
            )
            raise e  # OH NO!
Example #14
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, filename=None, db=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread safe database
        """
        if not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.wait()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(coalesce(self.filename, ':memory:'))
            try:
                full_path = File("pyLibrary/vendor/sqlite/libsqlitefunctions.so").abspath
                # self.db.execute("SELECT sqlite3_enable_load_extension(1)")
                self.db.enable_load_extension(True)
                self.db.execute("SELECT load_extension('" + full_path + "')")
            except Exception, e:
                Log.warning("loading sqlite extension functions failed, doing without. (no SQRT for you!)", cause=e)

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop(till=please_stop)
                if DEBUG:
                    Log.note("done pop")

                if DEBUG:
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                            if DEBUG and result.data:
                                text = convert.table2csv(list(result.data))
                                Log.note("Result:\n{{data}}", data=text)
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()  # UNBLOCK THE WAITING query() CALL
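The queue carries (command, result, signal, trace) tuples: execute() is fire-and-forget, while query() blocks on a per-request signal that the worker fires once the rows are in result. A standard-library sketch of the protocol, with threading.Event standing in for pyLibrary's Signal:

import threading
from Queue import Queue

def query(q, command):
    done = threading.Event()           # LIKE Signal() ABOVE
    result = {}
    q.put((command, result, done))
    done.wait()                        # LIKE signal.wait()
    if "exception" in result:
        raise result["exception"]
    return result["rows"]

def worker(q, db):
    while True:
        command, result, done = q.get()
        try:
            result["rows"] = db.execute(command).fetchall()
        except Exception, e:
            result["exception"] = e
        finally:
            done.set()                 # UNBLOCK THE WAITING query() CALL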
Example #15
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """
    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self,
                 host,
                 index,
                 alias=None,
                 name=None,
                 port=9200,
                 settings=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.name: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap(
                [t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker IS HELD
        existing_columns = self.meta.columns.find(c.table, c.name)
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                Log.note("todo: {{table}}::{{column}}",
                         table=c.table,
                         column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}::{{column}}",
                     table=canonical.table,
                     column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        meta = self.es_metadata.indices[table]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[table]

        for _, properties in meta.mappings.items():
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        abs_columns = _elasticsearch.parse_properties(abs_index, None,
                                                      properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=") == -1 and
                      r.es_column.find(" ") == -1
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):

            def add_column(c, query_path):
                c.last_updated = Date.now()
                c.table = join_field([c.es_index] + split_field(query_path[0]))

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        c.table = join_field([alias] +
                                             split_field(query_path[0]))
                        self._upsert_column(c)

            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                full_path = abs_column.nested_path
                abs_depth = len(full_path) - 1
                abs_parent = full_path[1] if abs_depth else ""

                for query_path in query_paths:
                    rel_depth = len(query_path) - 1
                    rel_parent = query_path[0]
                    rel_column = copy(abs_column)
                    rel_column.relative = True

                    add_column(copy(abs_column), query_path)

                    if rel_parent == ".":
                        add_column(rel_column, query_path)
                    elif abs_column.es_column.startswith(rel_parent + "."):
                        rel_column.name = abs_column.es_column[len(rel_parent) + 1:]
                        add_column(rel_column, query_path)
                    elif abs_column.es_column == rel_parent:
                        rel_column.name = "."
                        add_column(rel_column, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        rel_column.name = "." + (
                            "." *
                            (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent.startswith(abs_parent + "."):
                        rel_column.name = "." + (
                            "." *
                            (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.as_dict())))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(name=short_name,
                              url=None,
                              query_path=None,
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    Log.note("waiting for columns to update {{columns|json}}",
                             columns=[
                                 c.table + "." + c.es_column for c in columns
                                 if not c.last_updated
                             ])
                    Thread.sleep(seconds=1)
                return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        if column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=column_name)
        else:
            self._get_columns(table=table_name)
            Log.error("no columns for {{table}}?!", table=table_name)
Example #16
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """
    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self,
                 host,
                 index,
                 alias=None,
                 name=None,
                 port=9200,
                 settings=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.lists import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.name: c
                                     for c in table_columns}))
        self.meta.columns = ListContainer(
            "meta.columns", [], wrap({c.name: c
                                      for c in column_columns}))
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return self.meta.tables.query(
                {"where": {
                    "eq": {
                        "name": table_name
                    }
                }})

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker IS HELD
        existing_columns = [
            r for r in self.meta.columns.data
            if r.table == c.table and r.name == c.name
        ]
        if not existing_columns:
            self.meta.columns.add(c)
            Log.note("todo: {{table}}.{{column}}",
                     table=c.table,
                     column=c.es_column)
            self.todo.add(c)

            # MARK meta.columns AS DIRTY TOO
            cols = [
                r for r in self.meta.columns.data if r.table == "meta.columns"
            ]
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}.{{column}}",
                     table=canonical.table,
                     column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None, metadata=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        if not metadata:
            metadata = self.default_es.get_metadata(force=True)

        def parse_all(please_stop):
            for abs_index, meta in jx.sort(metadata.indices.items(), {
                    "value": 0,
                    "sort": -1
            }):
                if meta.index != abs_index:
                    continue

                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)

        if table:
            for abs_index, meta in jx.sort(metadata.indices.items(), {
                    "value": 0,
                    "sort": -1
            }):
                if table == meta.index:
                    for _, properties in meta.mappings.items():
                        self._parse_properties(abs_index, properties, meta)
                    return
                if table == abs_index:
                    self._get_columns(table=meta.index, metadata=metadata)
                    return
        else:
            self.parser = Thread.run("parse properties", parse_all)

    def _parse_properties(self, abs_index, properties, meta):
        abs_columns = _elasticsearch.parse_properties(abs_index, None,
                                                      properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug")
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):

            def add_column(c, query_path):
                c.last_updated = Date.now()
                if query_path:
                    c.table = c.es_index + "." + query_path.last()
                else:
                    c.table = c.es_index

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        if query_path:
                            c.table = alias + "." + query_path.last()
                        else:
                            c.table = alias
                        self._upsert_column(c)

            # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
            query_paths = wrap([[c.es_column] for c in abs_columns
                                if c.type == "nested"])
            for a, b in itertools.product(query_paths, query_paths):
                aa = a.last()
                bb = b.last()
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) < len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(0, aa)
                        break
            query_paths.append([])

            for c in abs_columns:
                # ADD RELATIVE COLUMNS
                full_path = listwrap(c.nested_path)
                abs_depth = len(full_path)
                abs_parent = coalesce(full_path.last(), "")
                for query_path in query_paths:
                    rel_depth = len(query_path)

                    # ABSOLUTE
                    add_column(copy(c), query_path)
                    cc = copy(c)
                    cc.relative = True

                    if not query_path:
                        add_column(cc, query_path)
                        continue

                    rel_parent = query_path.last()

                    if c.es_column.startswith(rel_parent + "."):
                        cc.name = c.es_column[len(rel_parent) + 1:]
                        add_column(cc, query_path)
                    elif c.es_column == rel_parent:
                        cc.name = "."
                        add_column(cc, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        cc.name = "." + ("." *
                                         (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent.startswith(abs_parent + "."):
                        cc.name = "." + ("." *
                                         (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.as_dict())))

    def get_columns(self,
                    table_name,
                    column_name=None,
                    fail_when_not_found=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            with self.meta.columns.locker:
                columns = [
                    c for c in self.meta.columns.data
                    if c.table == table_name and (
                        column_name is None or c.name == column_name)
                ]
            if columns:
                columns = jx.sort(columns, "name")
                if fail_when_not_found:
                    # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                    while len(self.todo) and not all(
                            columns.get("last_updated")):
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.table + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                        Thread.sleep(seconds=1)
                    return columns
                elif all(columns.get("last_updated")):
                    return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        if fail_when_not_found:
            if column_name:
                Log.error("no columns matching {{table}}.{{column}}",
                          table=table_name,
                          column=column_name)
            else:
                self._get_columns(table=table_name)
                Log.error("no columns for {{table}}", table=table_name)

        self._get_columns(table=join_field(split_field(table_name)[0:1]))
        return self.get_columns(table_name=table_name,
                                column_name=column_name,
                                fail_when_not_found=True)
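The fallback at the end retries on the table's short name via join_field(split_field(table_name)[0:1]). Assuming split_field/join_field split and join on dots, as their use throughout these examples suggests, the arithmetic is:

def split_field(field):
    return field.split(".")  # ASSUMED SEMANTICS, IGNORING ESCAPING

def join_field(path):
    return ".".join(path)

assert join_field(split_field("bug.changes.field_name")[0:1]) == "bug"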
Example #17
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """

    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.lists import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.meta.columns = ListContainer("meta.columns", [], wrap({c.name: c for c in column_columns}))
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return self.meta.tables.query({"where": {"eq": {"name": table_name}}})

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker IS HELD
        existing_columns = [r for r in self.meta.columns.data if r.table == c.table and r.name == c.name]
        if not existing_columns:
            self.meta.columns.add(c)
            Log.note("todo: {{table}}.{{column}}", table=c.table, column=c.es_column)
            self.todo.add(c)

            # MARK meta.columns AS DIRTY TOO
            cols = [r for r in self.meta.columns.data if r.table == "meta.columns"]
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}.{{column}}", table=canonical.table, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None, metadata=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        if not metadata:
            metadata = self.default_es.get_metadata(force=True)

        def parse_all(please_stop):
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if meta.index != abs_index:
                    continue

                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)

        if table:
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if table == meta.index:
                    for _, properties in meta.mappings.items():
                        self._parse_properties(abs_index, properties, meta)
                    return
                if table == abs_index:
                    self._get_columns(table=meta.index, metadata=metadata)
                    return
        else:
            self.parser = Thread.run("parse properties", parse_all)

    def _parse_properties(self, abs_index, properties, meta):
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug")
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            def add_column(c, query_path):
                c.last_updated = Date.now()
                if query_path:
                    c.table = c.es_index + "." + query_path.last()
                else:
                    c.table = c.es_index

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        if query_path:
                            c.table = alias + "." + query_path.last()
                        else:
                            c.table = alias
                        self._upsert_column(c)

            # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
            query_paths = wrap([[c.es_column] for c in abs_columns if c.type == "nested"])
            for a, b in itertools.product(query_paths, query_paths):
                aa = a.last()
                bb = b.last()
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) < len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(0, aa)
                        break
            query_paths.append([])

            for c in abs_columns:
                # ADD RELATIVE COLUMNS
                full_path = listwrap(c.nested_path)
                abs_depth = len(full_path)
                abs_parent = coalesce(full_path.last(), "")
                for query_path in query_paths:
                    rel_depth = len(query_path)

                    # ABSOLUTE
                    add_column(copy(c), query_path)
                    cc = copy(c)
                    cc.relative = True

                    if not query_path:
                        add_column(cc, query_path)
                        continue

                    rel_parent = query_path.last()

                    if c.es_column.startswith(rel_parent+"."):
                        cc.name = c.es_column[len(rel_parent)+1:]
                        add_column(cc, query_path)
                    elif c.es_column == rel_parent:
                        cc.name = "."
                        add_column(cc, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent.startswith(abs_parent+"."):
                        cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.as_dict()
        )))

    def get_columns(self, table_name, column_name=None, fail_when_not_found=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            with self.meta.columns.locker:
                columns = [c for c in self.meta.columns.data if c.table == table_name and (column_name is None or c.name==column_name)]
            if columns:
                columns = jx.sort(columns, "name")
                if fail_when_not_found:
                    # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                    while len(self.todo) and not all(columns.get("last_updated")):
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                        Thread.sleep(seconds=1)
                    return columns
                elif all(columns.get("last_updated")):
                    return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        if fail_when_not_found:
            if column_name:
                Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name)
            else:
                self._get_columns(table=table_name)
                Log.error("no columns for {{table}}", table=table_name)

        self._get_columns(table=join_field(split_field(table_name)[0:1]))
        return self.get_columns(table_name=table_name, column_name=column_name, fail_when_not_found=True)
Example #18
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """

    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @use_settings
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker IS HELD
        existing_columns = self.meta.columns.find(c.table, c.name)
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                Log.note("todo: {{table}}::{{column}}", table=c.table, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}::{{column}}", table=canonical.table, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        meta = self.es_metadata.indices[table]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[table]

        for _, properties in meta.mappings.items():
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=")==-1 and
                      r.es_column.find(" ")==-1
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            def add_column(c, query_path):
                c.last_updated = Date.now()
                c.table = join_field([c.es_index]+split_field(query_path[0]))

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        c.table = join_field([alias]+split_field(query_path[0]))
                        self._upsert_column(c)

            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                full_path = abs_column.nested_path
                abs_depth = len(full_path)-1
                abs_parent = full_path[1] if abs_depth else ""

                for query_path in query_paths:
                    rel_depth = len(query_path)-1
                    rel_parent = query_path[0]
                    rel_column = copy(abs_column)
                    rel_column.relative = True

                    add_column(copy(abs_column), query_path)

                    if rel_parent == ".":
                        add_column(rel_column, query_path)
                    elif abs_column.es_column.startswith(rel_parent+"."):
                        rel_column.name = abs_column.es_column[len(rel_parent)+1:]
                        add_column(rel_column, query_path)
                    elif abs_column.es_column == rel_parent:
                        rel_column.name = "."
                        add_column(rel_column, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent.startswith(abs_parent+"."):
                        rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.as_dict()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(
                    name=short_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                    Thread.sleep(seconds=1)
                return columns
        except Exception, e:
            Log.error("Not expected", cause=e)

        if column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name)
        else:
            self._get_columns(table=table_name)
            Log.error("no columns for {{table}}?!", table=table_name)
Example #19
0
class Sqlite(object):

    canonical = None

    def __init__(self, db=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        self.db = None
        self.queue = Queue("sql commands")  # HOLD (command, result, signal, trace) TUPLES
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH COMMANDS FROM OTHER THREADS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK THE CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Dict()
        self.queue.add((command, result, signal, None))
        signal.wait_for_go()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(':memory:')

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop()
                if DEBUG:
                    Log.note("done pop")

                if DEBUG:
                    Log.note("Running command\n{{command|indent}}",
                             command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.data = curr.fetchall()
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(
                                ERROR,
                                "Problem with\n{{command|indent}}",
                                command=command,
                                cause=e)
                        finally:
                            signal.go()
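
A hedged usage sketch for the wrapper above (the table name and values are invented): execute() is fire-and-forget, while query() parks the caller on a Signal until the worker thread has run the command.

db = Sqlite()
db.execute("CREATE TABLE temp (value INTEGER)")  # QUEUED; RETURNS IMMEDIATELY
db.execute("INSERT INTO temp VALUES (42)")       # COMMANDS RUN IN QUEUE ORDER
result = db.query("SELECT value FROM temp")      # BLOCKS UNTIL THE WORKER ANSWERS
# result.data == [(42,)]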
Example #20
0
class Sqlite(object):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, db=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if not _upgraded:
            _upgrade()

        self.db = None
        self.queue = Queue("sql commands")  # HOLD (command, result, signal, trace) TUPLES
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH COMMANDS FROM OTHER THREADS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK THE CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Dict()
        self.queue.add((command, result, signal, None))
        signal.wait_for_go()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(':memory:')
            try:
                full_path = File(
                    "pyLibrary/vendor/sqlite/libsqlitefunctions.so").abspath
                # self.db.execute("SELECT sqlite3_enable_load_extension(1)")
                self.db.enable_load_extension(True)
                self.db.execute("SELECT load_extension('" + full_path + "')")
            except Exception, e:
                Log.warning(
                    "loading sqlite extension functions failed, doing without. (no SQRT for you!)",
                    cause=e)

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop()
                if DEBUG:
                    Log.note("done pop")

                if DEBUG:
                    Log.note("Running command\n{{command|indent}}",
                             command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(
                                ERROR,
                                "Problem with\n{{command|indent}}",
                                command=command,
                                cause=e)
                        finally:
                            signal.go()
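
For reference, the extension-loading step in isolation, against the standard sqlite3 module. The .so path is the project's own, and whether enable_load_extension exists depends on how Python was compiled, so this is a sketch rather than a guarantee:

import sqlite3

db = sqlite3.connect(':memory:')
try:
    db.enable_load_extension(True)  # RAISES AttributeError IF SUPPORT WAS NOT COMPILED IN
    db.execute("SELECT load_extension('pyLibrary/vendor/sqlite/libsqlitefunctions.so')")
    print(db.execute("SELECT SQRT(2)").fetchone())  # (1.4142135623730951,)
except Exception as e:
    print("no extension functions, doing without:", e)  # SAME FALLBACK AS ABOVE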
Example #21
0
class Sqlite(object):

    canonical = None

    def __init__(self, db=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        self.db = None
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH COMMANDS FROM OTHER THREADS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK THE CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Dict()
        self.queue.add((command, result, signal, None))
        signal.wait_for_go()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(':memory:')

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop()
                if DEBUG:
                    Log.note("done pop")

                if DEBUG:
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.data = curr.fetchall()
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()
Example #22
0
def loop(source, coverage_summary_index, settings, please_stop):
    try:
        cluster = elasticsearch.Cluster(source)
        aliases = cluster.get_aliases()
        candidates = []
        for pairs in aliases:
            if pairs.alias == source.index:
                candidates.append(pairs.index)
        candidates = jx.sort(candidates, {".": "desc"})

        for index_name in candidates:
            coverage_index = elasticsearch.Index(index=index_name, read_only=False, settings=source)
            push_date_filter = unicode2Date(coverage_index.settings.index[-15:], elasticsearch.INDEX_DATE_FORMAT)

            while not please_stop:
                # IDENTIFY NEW WORK
                Log.note("Working on index {{index}}", index=index_name)
                coverage_index.refresh()

                todo = http.post_json(settings.url, json={
                    "from": "coverage",
                    "groupby": ["source.file.name", "build.revision12"],
                    "where": {"and": [
                        {"missing": "source.method.name"},
                        {"missing": "source.file.min_line_siblings"},
                        {"gte": {"repo.push.date": push_date_filter}}
                    ]},
                    "format": "list",
                    "limit": coalesce(settings.batch_size, 100)
                })

                if not todo.data:
                    break

                queue = Queue("pending source files to review")
                queue.extend(todo.data[0:coalesce(settings.batch_size, 100)])

                threads = [
                    Thread.run(
                        "processor" + unicode(i),
                        process_batch,
                        queue,
                        coverage_index,
                        coverage_summary_index,
                        settings,
                        please_stop=please_stop
                    )
                    for i in range(NUM_THREAD)
                ]

                # ADD STOP MESSAGE
                queue.add(Thread.STOP)

                # WAIT FOR THEM TO COMPLETE
                for t in threads:
                    t.join()

        please_stop.go()
        return

    except Exception, e:
        Log.warning("Problem processing", cause=e)
Example #23
0
class FromESMetadata(object):
    """
    QUERY THE METADATA
    """

    def __new__(cls, *args, **kwargs):
        global singleton
        if singleton:
            return singleton
        else:
            singleton = object.__new__(cls)
            return singleton

    @use_settings
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.lists import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.columns = ListContainer("meta.columns", [], wrap({c.name: c for c in column_columns}))
        self.columns.insert(column_columns)
        self.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.tables.locker:
            return self.tables.query({"where": {"eq": {"name": table_name}}})

    def upsert_column(self, c):
        existing_columns = filter(lambda r: r.table == c.table and r.abs_name == c.abs_name, self.columns.data)
        if not existing_columns:
            self.columns.add(c)
            cols = filter(lambda r: r.table == "meta.columns", self.columns.data)
            for cc in cols:
                cc.partitions = cc.cardinality = cc.last_updated = None
            self.todo.add(c)
            self.todo.extend(cols)
        else:
            set_default(existing_columns[0], c)
            self.todo.add(existing_columns[0])

            # TEST CONSISTENCY
            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                if c.abs_name == d.abs_name and c.table == d.table and c != d:
                    Log.error("duplicate column in todo queue")

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        alias_done = set()
        index = split_field(table)[0]
        query_path = split_field(table)[1:]
        metadata = self.default_es.get_metadata(index=index)
        for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
            for _, properties in meta.mappings.items():
                columns = _elasticsearch.parse_properties(index, None, properties.properties)
                columns = columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
                    lambda r: not r.abs_name.startswith("other.") and
                              not r.abs_name.startswith("previous_values.cf_")
                )
                with Timer("upserting {{num}} columns", {"num": len(columns)}, debug=DEBUG):
                    with self.columns.locker:
                        for c in columns:
                            # ABSOLUTE
                            c.table = join_field([index]+query_path)
                            self.upsert_column(c)

                            for alias in meta.aliases:
                                # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS
                                if alias in alias_done:
                                    continue
                                alias_done.add(alias)
                                c = copy(c)
                                c.table = join_field([alias]+query_path)
                                self.upsert_column(c)

    def query(self, _query):
        return self.columns.query(Query(set_default(
            {
                "from": self.columns,
                "sort": ["table", "name"]
            },
            _query.as_dict()
        )))

    def get_columns(self, table):
        """
        RETURN METADATA COLUMNS
        """
        with self.columns.locker:
            columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
            if columns:
                return columns

        self._get_columns(table=table)
        with self.columns.locker:
            columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name")
            if columns:
                return columns

        # self._get_columns(table=table)
        Log.error("no columns for {{table}}", table=table)

    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in ["object", "nested"]:
            Log.error("not supported")
        try:
            if c.table == "meta.columns":
                with self.columns.locker:
                    partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                    })
                return
            if c.table == "meta.tables":
                with self.columns.locker:
                    partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return

            es_index = c.table.split(".")[0]
            result = self.default_es.post("/"+es_index+"/_search", data={
                "aggs": {c.name: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value)
            if cardinality == None:
                Log.error("logic error")

            query = Dict(size=0)
            if c.type in ["object", "nested"]:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / float(count) > 0.99):
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif c.nested_path:
                query.aggs[literal_field(c.name)] = {
                    "nested": {"path": listwrap(c.nested_path)[0]},
                    "aggs": {"_nested": {"terms": {"field": c.abs_name, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.name)] = {"terms": {"field": c.abs_name, "size": 0}}

            result = self.default_es.post("/"+es_index+"/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = qb.sort(aggs._nested.buckets.key)
            else:
                parts = qb.sort(aggs.buckets.key)

            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
        except Exception, e:
            if "IndexMissingException" in e and c.table.startswith("testing"):
                Log.alert("{{col.table}} does not exist", col=c)
            else:
                self.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
                Log.warning("Could not get {{col.table}}.{{col.abs_name}} info", col=c, cause=e)