Example #1
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                else:
                    column.last_updated = Date.now()
Example #2
    def _send_email(self):
        try:
            if self.accumulation:
                with Emailer(self.settings) as emailer:
                    # WHO ARE WE SENDING TO
                    emails = Data()
                    for template, params in self.accumulation:
                        content = expand_template(template, params)
                        emails[literal_field(self.settings.to_address)] += [content]
                        for c in self.cc:
                            if any(d in params.params.error for d in c.contains):
                                emails[literal_field(c.to_address)] += [content]

                    # SEND TO EACH
                    for to_address, content in emails.items():
                        emailer.send_email(
                            from_address=self.settings.from_address,
                            to_address=listwrap(to_address),
                            subject=self.settings.subject,
                            text_data="\n\n".join(content)
                        )

            self.next_send = Date.now() + self.settings.max_interval
            self.accumulation = []
        except Exception as e:
            self.next_send = Date.now() + self.settings.max_interval
            Log.warning("Could not send", e)
Example #3
    def _send_email(self):
        try:
            if not self.accumulation:
                return
            with Closer(
                    connect_to_region(
                        self.settings.region,
                        aws_access_key_id=unwrap(
                            self.settings.aws_access_key_id),
                        aws_secret_access_key=unwrap(
                            self.settings.aws_secret_access_key))) as conn:

                # WHO ARE WE SENDING TO
                emails = Data()
                for template, params in self.accumulation:
                    content = expand_template(template, params)
                    emails[literal_field(
                        self.settings.to_address)] += [content]
                    for c in self.cc:
                        if any(d in params.params.error for d in c.contains):
                            emails[literal_field(c.to_address)] += [content]

                # SEND TO EACH
                for to_address, content in emails.items():
                    conn.send_email(source=self.settings.from_address,
                                    to_addresses=listwrap(to_address),
                                    subject=self.settings.subject,
                                    body="\n\n".join(content),
                                    format="text")

            self.next_send = Date.now() + self.settings.max_interval
            self.accumulation = []
        except Exception as e:
            self.next_send = Date.now() + self.settings.max_interval
            Log.warning("Could not send", e)
Example #4
    def _send_email(self):
        try:
            if self.accumulation:
                with Emailer(self.settings) as emailer:
                    # WHO ARE WE SENDING TO
                    emails = Data()
                    for template, params in self.accumulation:
                        content = expand_template(template, params)
                        emails[literal_field(
                            self.settings.to_address)] += [content]
                        for c in self.cc:
                            if any(d in params.params.error
                                   for d in c.contains):
                                emails[literal_field(
                                    c.to_address)] += [content]

                    # SEND TO EACH
                    for to_address, content in emails.items():
                        emailer.send_email(
                            from_address=self.settings.from_address,
                            to_address=listwrap(to_address),
                            subject=self.settings.subject,
                            text_data="\n\n".join(content))

            self.next_send = Date.now() + self.settings.max_interval
            self.accumulation = []
        except Exception as e:
            self.next_send = Date.now() + self.settings.max_interval
            Log.warning("Could not send", e)
Example #5
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now() - TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}",
                       param={"col": c},
                       silent=not DEBUG,
                       too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
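
The command passed to self.meta.columns.update is a small declarative mutation: "set" assigns fields, "clear" nulls them out, and "where".eq selects rows by equality. A rough sketch of how such a command could be applied to plain dicts follows; this is a hypothetical helper, not the library's implementation.

def apply_update(rows, command):
    """Apply a {"set", "clear", "where"} command to a list of dicts."""
    eq = command.get("where", {}).get("eq", {})
    for row in rows:
        # A ROW MATCHES WHEN EVERY where.eq FIELD AGREES
        if all(row.get(k) == v for k, v in eq.items()):
            for k, v in command.get("set", {}).items():
                row[k] = v
            for k in command.get("clear", []):
                row[k] = None

columns = [
    {"es_index": "repo", "es_column": "branch", "cardinality": 12},
    {"es_index": "repo", "es_column": "author", "cardinality": 400},
]
apply_update(columns, {
    "set": {"last_updated": "2020-01-01"},
    "clear": ["cardinality"],
    "where": {"eq": {"es_index": "repo", "es_column": "branch"}},
})
print(columns[0])  # cardinality CLEARED, last_updated SET; SECOND ROW UNTOUCHED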
Example #6
    def _send_email(self):
        try:
            if not self.accumulation:
                return
            with Closer(connect_to_region(
                self.settings.region,
                aws_access_key_id=unwrap(self.settings.aws_access_key_id),
                aws_secret_access_key=unwrap(self.settings.aws_secret_access_key)
            )) as conn:

                # WHO ARE WE SENDING TO
                emails = Data()
                for template, params in self.accumulation:
                    content = expand_template(template, params)
                    emails[literal_field(self.settings.to_address)] += [content]
                    for c in self.cc:
                        if any(d in params.params.error for d in c.contains):
                            emails[literal_field(c.to_address)] += [content]

                # SEND TO EACH
                for to_address, content in emails.items():
                    conn.send_email(
                        source=self.settings.from_address,
                        to_addresses=listwrap(to_address),
                        subject=self.settings.subject,
                        body="\n\n".join(content),
                        format="text"
                    )

            self.next_send = Date.now() + self.settings.max_interval
            self.accumulation = []
        except Exception as e:
            self.next_send = Date.now() + self.settings.max_interval
            Log.warning("Could not send", e)
Example #7
def metadata_tables():
    return wrap(
        [
            Column(
                name=c,
                es_index="meta.tables",
                es_column=c,
                es_type="string",
                jx_type=STRING,
                last_updated=Date.now(),
                nested_path=ROOT_PATH
            )
            for c in [
                "name",
                "url",
                "query_path"
            ]
        ]+[
            Column(
                name=c,
                es_index="meta.tables",
                es_column=c,
                es_type="integer",
                jx_type=INTEGER,
                last_updated=Date.now(),
                nested_path=ROOT_PATH
            )
            for c in [
                "timestamp"
            ]
        ]
    )
Example #8
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
            if DEBUG:
                Log.note(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=c)
Example #9
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #10
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated >= Date.now()-TOO_OLD:
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #11
    def test_bad_exists_properties(self):
        test = {
            "data": [{
                "~e~": 1
            }, {
                "~e~": 1
            }],
            "query": {
                "from": TEST_TABLE,
                "select": [{
                    "name": "count",
                    "aggregate": "count"
                }],
            },
            "expecting_list": {
                "meta": {
                    "format": "value"
                },
                "data": {
                    "count": 2
                }
            }
        }

        subtest = wrap(test)

        cont = self.utils.fill_container(subtest, typed=False)
        db = Sqlite(filename="metadata.localhost.sqlite")
        try:
            with db.transaction() as t:
                t.execute(
                    "insert into " + quote_column("meta.columns") +
                    "(name, es_type, jx_type, nested_path, es_column, es_index, last_updated) VALUES "
                    + quote_set([
                        ".", "object", "exists", '["."]', ".", cont.alias,
                        Date.now()
                    ]))
        except Exception as e:
            pass
        try:
            with db.transaction() as t:
                t.execute(
                    "insert into " + quote_column("meta.columns") +
                    "(name, es_type, jx_type, nested_path, es_column, es_index, last_updated) VALUES "
                    + quote_set([
                        "~e~", "long", "exists", '["."]', "~e~", cont.alias,
                        Date.now()
                    ]))
        except Exception as e:
            pass

        self.utils.send_queries(subtest)
Example #12
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(
            query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(name=es_index_name,
                              url=None,
                              query_path=['.'],
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
Example #13
    def test_column_constraints(self):
        multi = Column(
            name="name",
            es_column="es_column.~N~",
            es_index="es_index",
            es_type="nested",
            jx_type=NESTED,
            cardinality=1,
            multi=2,
            nested_path=".",
            last_updated=Date.now(),
        )

        self.assertRaises(
            Exception,
            Column,
            name="name",
            es_column="es_column.~N~",
            es_index="es_index",
            es_type="es_type",
            jx_type=INTEGER,
            multi=1,
            nested_path=".",
            last_updated=Date.now(),
        )

        self.assertRaises(
            Exception,
            Column,
            name="name",
            es_column="es_column.~N~",
            es_index="es_index",
            es_type="es_type",
            jx_type=INTEGER,
            multi=0,
            nested_path=".",
            last_updated=Date.now(),
        )

        self.assertRaises(
            Exception,
            Column,
            name="name",
            es_column="es_column.~N~",
            es_index="es_index",
            es_type="es_type",
            jx_type=INTEGER,
            nested_path=".",
            last_updated=Date.now(),
        )
Example #14
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #15
    def _upsert_column(self, c):
        # ASSUMING self.meta.columns.locker IS HELD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        for canonical in existing_columns:
            if canonical.type == c.type and canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}",
                             table=canonical.es_index,
                             column=canonical.es_column)
                self.todo.add(canonical)
                break
        else:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}",
                             table=c.es_index,
                             column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now() - TOO_OLD
                self.todo.extend(cols)
Example #16
def get_stats(query):
    frum = query.frum
    if isinstance(frum, Table):
        vars_record = {"table": frum.name}
    elif is_text(frum):
        vars_record = {"table": frum}
    else:
        vars_record = get_stats(frum)
    now = Date.now()
    vars_record['timestamp'] = now

    output = []
    for clause in ["select", "edges", "groupby", "window", "sort"]:
        vars_record["mode"] = clause
        for expr in listwrap(getattr(query, clause)):
            if isinstance(expr.value, Expression):
                for v in expr.value.vars():
                    output.append(set_default({"column": v.var}, vars_record))
    for v in query.where.vars():
        output.append(
            set_default({
                "column": v.var,
                "mode": "where"
            }, vars_record))
    return output
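
get_stats flattens a query into one record per (table, clause, column) so column usage can be tallied downstream. A toy version over plain dicts, where variables are just strings, follows; this deliberately simplifies away the library's Expression.vars().

def column_usage(query):
    """Emit one {"table", "mode", "column"} record per variable the query touches."""
    output = []
    for clause in ["select", "edges", "groupby", "window", "sort"]:
        for var in query.get(clause, []):
            output.append({"table": query["from"], "mode": clause, "column": var})
    for var in query.get("where", []):
        output.append({"table": query["from"], "mode": "where", "column": var})
    return output

print(column_usage({"from": "task", "select": ["run.type"], "where": ["build.date"]}))
# [{'table': 'task', 'mode': 'select', 'column': 'run.type'},
#  {'table': 'task', 'mode': 'where', 'column': 'build.date'}]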
Example #17
    def _send_email(self):
        try:
            if not self.accumulation:
                return
            with Emailer(self.settings) as emailer:
                # WHO ARE WE SENDING TO
                emails = Data()
                for template, params in self.accumulation:
                    content = expand_template(template, params)
                    emails[literal_field(self.settings.to_address)] += [content]
                    for c in self.cc:
                        if any(d in params.params.error for d in c.contains):
                            emails[literal_field(c.to_address)] += [content]

                # SEND TO EACH
                for to_address, content in emails.items():
                    emailer.send_email(
                        source=self.settings.from_address,
                        to_addresses=listwrap(to_address),
                        subject=self.settings.subject,
                        body="\n\n".join(content),
                        format="text"
                    )

            self.accumulation = []
        except Exception as e:
            Log.warning("Could not send", e)
        finally:
            self.next_send = Date.now() + self.settings.average_interval * (2 * Random.float())
Example #18
def queue_consumer(pull_queue, please_stop=None):
    queue = aws.Queue(pull_queue)
    time_offset = None
    request_count = 0

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break
        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue

        if SKIP_TRY_REQUESTS and 'try' in request.where['and'].eq.branch:
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}", wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request "+text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()
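
The time_offset trick in queue_consumer replays recorded requests on their original cadence: the first pop anchors recorded time to wall-clock time, and each later request waits until its shifted timestamp. A minimal sketch with time.time and time.sleep; the request records here are fabricated for illustration.

import time

def replay(requests):
    """requests: iterable of (request_time, payload) pairs, ordered by time."""
    time_offset = None
    for request_time, payload in requests:
        now = time.time()
        if time_offset is None:
            # ANCHOR RECORDED TIME TO WALL-CLOCK TIME ON THE FIRST REQUEST
            time_offset = now - request_time
        next_request = request_time + time_offset
        if next_request > now:
            time.sleep(next_request - now)  # PRESERVE THE ORIGINAL SPACING
        print("dispatch", payload)

# TWO REQUESTS RECORDED HALF A SECOND APART REPLAY HALF A SECOND APART
replay([(1000.0, "first"), (1000.5, "second")])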
Example #19
    def replay(
        self,
        name,
        confirm_delay_seconds=60,
        next_emit_serial=1,
        look_ahead_serial=1000
    ):
        """
        A SUBSCRIBER FOR REVIEWING THE QUEUE CONTENTS, IN ORDER
        """
        queue = self.get_or_create_queue(name)
        id = self.next_id()

        with self.db.transaction() as t:
            t.execute(
                sql_insert(
                    table=SUBSCRIBER,
                    records={
                        "id": id,
                        "queue": queue.id,
                        "last_emit_timestamp": Date.now(),
                        "confirm_delay_seconds": confirm_delay_seconds,
                        "last_confirmed_serial": next_emit_serial - 1,
                        "next_emit_serial": next_emit_serial,
                        "look_ahead_serial": look_ahead_serial,
                    },
                )
            )

        return Subscription(
            id=id, queue=queue, confirm_delay_seconds=confirm_delay_seconds
        )
Example #20
    def __init__(self,
                 from_address,
                 to_address,
                 subject,
                 region,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 cc=None,
                 log_type="ses",
                 average_interval=HOUR,
                 kwargs=None):
        """
        SEND WARNINGS AND ERRORS VIA EMAIL

        settings = {
            "log_type": "ses",
            "from_address": "*****@*****.**",
            "to_address": "*****@*****.**",
            "cc":[
                {"to_address":"*****@*****.**", "where":{"eq":{"template":"gr"}}}
            ],
            "subject": "[ALERT][STAGING] Problem in ETL",
            "aws_access_key_id": "userkey"
            "aws_secret_access_key": "secret"
            "region":"us-west-2"
        }
        """
        assert kwargs.log_type == "ses", "Expecing settings to be of type 'ses'"
        self.settings = kwargs
        self.accumulation = []
        self.cc = listwrap(cc)
        self.next_send = Date.now() + MINUTE
        self.locker = Lock()
        self.settings.average_interval = Duration(kwargs.average_interval)
Example #21
    def __init__(
        self,
        interval,  # TIME INTERVAL BETWEEN RUNS
        starting,  # THE TIME TO START THE INTERVAL COUNT
        max_runtime=MAX_RUNTIME,  # LIMIT HOW LONG THE PROCESS IS ALIVE
        wait_for_shutdown=WAIT_FOR_SHUTDOWN,  # LIMIT PATIENCE WHEN ASKING FOR SHUTDOWN, THEN SEND KILL
        process=None,
    ):
        self.duration = Duration(interval)
        self.starting = coalesce(Date(starting), Date.now())
        self.max_runtime = Duration(max_runtime)
        self.wait_for_shutdown = Duration(wait_for_shutdown)
        # Process parameters
        self.process = process

        # STATE
        self.last_started = None
        self.last_finished = None
        self.run_count = 0
        self.fail_count = 0
        self.current = None
        self.terminator = None  # SIGNAL TO KILL THE PROCESS
        self.next_run = self._next_run()
        self.next = Till(till=self.next_run)
        self.next.then(self.run)
Example #22
    def _worker(self, please_stop):
        while not please_stop:
            pair = self.requests.pop(till=please_stop)
            if please_stop:
                break
            ready, method, path, req_headers, timestamp = pair

            try:
                url = self.url / path
                self.outbound_rate.add(Date.now())
                response = http.request(method, url, req_headers)

                del response.headers['transfer-encoding']
                resp_headers = value2json(response.headers)
                resp_content = response.raw.read()

                please_cache = self.please_cache(path)
                if please_cache:
                    with self.db.transaction() as t:
                        t.execute("INSERT INTO cache (path, headers, response, timestamp) VALUES" + quote_list((path, resp_headers, resp_content.decode('latin1'), timestamp)))
                with self.cache_locker:
                    self.cache[path] = (ready, resp_headers, resp_content, timestamp)
            except Exception as e:
                Log.warning("problem with request to {{path}}", path=path, cause=e)
                with self.cache_locker:
                    ready, headers, response = self.cache[path]
                    del self.cache[path]
            finally:
                ready.go()
Example #23
def queue_consumer(pull_queue, please_stop=None):
    queue = aws.Queue(pull_queue)
    time_offset = None
    request_count = 0

    while not please_stop:
        request = queue.pop(till=please_stop)
        if please_stop:
            break
        if not request:
            Log.note("Nothing in queue, pausing for 5 seconds...")
            (please_stop | Till(seconds=5)).wait()
            continue

        if SKIP_TRY_REQUESTS and 'try' in request.where['and'].eq.branch:
            Log.note("Skipping try revision.")
            queue.commit()
            continue

        now = Date.now().unix
        if time_offset is None:
            time_offset = now - request.meta.request_time

        next_request = request.meta.request_time + time_offset
        if next_request > now:
            Log.note("Next request in {{wait_time}}",
                     wait_time=Duration(seconds=next_request - now))
            Till(till=next_request).wait()

        Thread.run("request " + text_type(request_count), one_request, request)
        request_count += 1
        queue.commit()
Example #24
    def __init__(self, name):
        self.name = name
        self.lock = Lock("rate locker")
        self.request_rate = 0.0
        self.last_request = Date.now()

        Thread.run("rate logger", self._daemon)
Example #25
    def setup(
        self,
        instance,   # THE boto INSTANCE OBJECT FOR THE MACHINE TO SETUP
        utility     # THE utility OBJECT FOUND IN CONFIG
    ):
        with self.locker:
            if not self.settings.setup_timeout:
                Log.error("expecting instance.setup_timeout to prevent setup from locking")

            def worker(please_stop):
                cpu_count = int(round(utility.cpu))

                with hide('output'):
                    Log.note("setup {{instance}}", instance=instance.id)
                    self._config_fabric(instance)
                    Log.note("update packages on {{instance}} ip={{ip}}", instance=instance.id, ip=instance.ip_address)
                    try:
                        self._update_ubuntu_packages()
                    except Exception as e:
                        Log.warning("Can not setup {{instance}}, type={{type}}", instance=instance.id, type=instance.instance_type, cause=e)
                        return
                    Log.note("setup etl on {{instance}}", instance=instance.id)
                    self._setup_etl_code()
                    Log.note("setup grcov on {{instance}}", instance=instance.id)
                    self._setup_grcov()
                    Log.note("add config file on {{instance}}", instance=instance.id)
                    self._add_private_file()
                    Log.note("setup supervisor on {{instance}}", instance=instance.id)
                    self._setup_etl_supervisor(cpu_count)
                    Log.note("setup done {{instance}}", instance=instance.id)
            worker_thread = Thread.run("etl setup started at "+unicode(Date.now().format()), worker)
            (Till(timeout=Duration(self.settings.setup_timeout).seconds) | worker_thread.stopped).wait()
            if not worker_thread.stopped:
                Log.error("critical failure in thread {{name|quote}}", name=worker_thread.name)
            worker_thread.join()
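
The setup wrapper races the worker thread against a timeout and treats a still-running worker as a critical failure. The stdlib equivalent is Thread.join with a timeout, sketched here under assumed names:

import threading
import time

def run_with_timeout(target, timeout_seconds):
    worker = threading.Thread(target=target, daemon=True)
    worker.start()
    worker.join(timeout_seconds)  # RETURNS AT COMPLETION OR AT TIMEOUT, WHICHEVER IS FIRST
    if worker.is_alive():
        raise RuntimeError("critical failure: setup still running after timeout")

run_with_timeout(lambda: time.sleep(0.1), timeout_seconds=5)  # COMPLETES QUIETLY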
Example #26
    def __init__(
            self,
            elasticsearch,  # the ElasticSearch settings for filling the backend
            kwargs=None):
        if elasticsearch.schema == None:
            Log.error("Expecting backed_es to have a schema defined")

        letters = text(ascii_lowercase)
        self.random_letter = letters[int(Date.now().unix / 30) % 26]
        self.elasticsearch = elasticsearch
        self.settings = kwargs
        self._es_test_settings = None
        self._es_cluster = None
        self._index = None

        if not jx_containers.config.default:
            jx_containers.config.default = {
                "type": "elasticsearch",
                "settings": elasticsearch
            }

        self.server = FakeHttp()
        jx_containers.config.default = {
            "type": "elasticsearch",
            "settings": kwargs.elasticsearch.copy()
        }
Example #27
    def get_or_create_facts(self, fact_name, uid=UID):
        """
        FIND TABLE BY NAME, OR CREATE IT IF IT DOES NOT EXIST
        :param fact_name:  NAME FOR THE CENTRAL INDEX
        :param uid: name, or list of names, for the GUID
        :return: Facts
        """
        about = self.db.about(fact_name)
        if not about:
            if uid != UID:
                Log.error("do not know how to handle yet")

            self.ns.columns._snowflakes[fact_name] = ["."]
            self.ns.columns.add(Column(
                name="_id",
                es_column="_id",
                es_index=fact_name,
                es_type=json_type_to_sqlite_type[STRING],
                jx_type=STRING,
                nested_path=['.'],
                multi=1,
                last_updated=Date.now()
            ))
            command = sql_create(fact_name, {UID: "INTEGER PRIMARY KEY", GUID: "TEXT"}, unique=UID)

            with self.db.transaction() as t:
                t.execute(command)

        return QueryTable(fact_name, self)
Example #28
    def __init__(self, name):
        self.name = name
        self.lock = Lock("rate locker")
        self.request_rate = 0.0
        self.last_request = Date.now()

        Thread.run("rate logger", self._daemon)
Example #29
    def write(self, template, params):
        with self.locker:
            if params.context not in [NOTE, ALARM]:  # SEND ONLY THE NOT BORING STUFF
                self.accumulation.append((template, params))

            if Date.now() > self.next_send:
                self._send_email()
Example #30
    def _worker(self, please_stop):
        while not please_stop:
            pair = self.requests.pop(till=please_stop)
            if please_stop:
                break
            ready, method, path, req_headers, timestamp = pair

            try:
                url = self.url / path
                self.outbound_rate.add(Date.now())
                response = http.request(method, url, req_headers)

                del response.headers['transfer-encoding']
                resp_headers = value2json(response.headers)
                resp_content = response.raw.read()

                please_cache = self.please_cache(path)
                if please_cache:
                    with self.db.transaction() as t:
                        t.execute("INSERT INTO cache (path, headers, response, timestamp) VALUES" + quote_list((path, resp_headers, resp_content.decode('latin1'), timestamp)))
                with self.cache_locker:
                    self.cache[path] = (ready, resp_headers, resp_content, timestamp)
            except Exception as e:
                Log.warning("problem with request to {{path}}", path=path, cause=e)
                with self.cache_locker:
                    ready, headers, response = self.cache[path]
                    del self.cache[path]
            finally:
                ready.go()
Example #31
    def _send_email(self):
        try:
            if not self.accumulation:
                return
            with Emailer(self.settings) as emailer:
                # WHO ARE WE SENDING TO
                emails = Data()
                for template, params in self.accumulation:
                    content = expand_template(template, params)
                    emails[literal_field(
                        self.settings.to_address)] += [content]
                    for c in self.cc:
                        if any(d in params.params.error for d in c.contains):
                            emails[literal_field(c.to_address)] += [content]

                # SEND TO EACH
                for to_address, content in emails.items():
                    emailer.send_email(source=self.settings.from_address,
                                       to_addresses=listwrap(to_address),
                                       subject=self.settings.subject,
                                       body="\n\n".join(content),
                                       format="text")

            self.accumulation = []
        except Exception as e:
            Log.warning("Could not send", e)
        finally:
            self.next_send = Date.now() + self.settings.average_interval * (
                2 * randoms.float())
Example #32
    def write(self, template, params):
        with self.locker:
            if params.context not in [NOTE, ALARM]:  # SEND ONLY THE NOT BORING STUFF
                self.accumulation.append((template, params))

            if Date.now() > self.next_send:
                self._send_email()
Example #33
        def process_task(mail, please_stop=None):
            try:
                if not this.call:
                    this.call = this.celery.Task.__call__
                    this.dummy = this.celery.Task()

                name = mail.sender.name
                args = (mail,) if mail.sender.bind else tuple()

                this._status_update(mail, states.STARTED, {"response": {"start_time": Date.now().format()}})
                fun = this.celery._tasks[name]
                mail.result = this.call(this.dummy, fun, *args, **unwrap(mail.message))
                mail.status = states.SUCCESS
            except Exception as e:
                mail.result = Except.wrap(e)
                mail.status = states.FAILURE
                # mail = wrap({"request": {"id": mail.request.id}, "sender": {"name": "mail.sender.name"}})
                Log.warning("worker failed to process {{mail}}", mail=mail, cause=e)

            mail.response.end_time = Date.now().format()
            if isinstance(mail.result, Exception):
                mail.result = Except.wrap(mail.result)
            mail.receiver.thread = None

            Log.note("Add {{id}} ({{name}}) to response queue\n{{result}}", id=mail.request.id, name=mail.sender.name, result=mail)
            this.response_queue.add(value2json(mail))
            with work_list_lock:
                del work_list[mail.request.id]
Example #34
    def run(self):
        self.last_started = Date.now()
        self.run_count += 1
        self.current = Process(**self.process)
        self.terminator = Till(seconds=self.max_runtime.seconds)
        self.terminator.then(self.killer)
        self.current.service_stopped.then(self.done)
Example #35
    def _after_test(self):
        Log.main_log = self.temp
        cluster = self.es_logger.es.cluster

        # WAIT FOR SOMETHING TO SHOW UP IN THE LOG
        found = False
        while not found:
            for q in self.es_logger.es.known_queues.values():
                result = q.slow_queue.search({
                    "from": 0,
                    "size": 1,
                    "stored_fields": ["_source"]
                })
                if result.hits.total:
                    found = True
                    break
                q.slow_queue.refresh()
            else:
                print("wait for es log content")
                Till(seconds=1).wait()

        self.es_logger.stop()
        cluster.get_metadata(after=Date.now())
        index = cluster.get_index(TEST_CONFIG)
        return index
Example #36
    def setUpClass(self):
        while True:
            try:
                es = test_jx.global_settings.backend_es
                http.get_json(URL(es.host, port=es.port))
                break
            except Exception as e:
                e = Except.wrap(e)
                if "No connection could be made because the target machine actively refused it" in e or "Connection refused" in e:
                    Log.alert("Problem connecting")
                else:
                    Log.error("Server raised exception", e)

        # REMOVE OLD INDEXES
        cluster = elasticsearch.Cluster(test_jx.global_settings.backend_es)
        aliases = cluster.get_aliases()
        for a in aliases:
            try:
                if a.index.startswith("testing_"):
                    create_time = Date(
                        a.index[-15:], "%Y%m%d_%H%M%S"
                    )  # EXAMPLE testing_0ef53e45b320160118_180420
                    if create_time < Date.now() - 10 * MINUTE:
                        cluster.delete_index(a.index)
            except Exception as e:
                Log.warning("Problem removing {{index|quote}}",
                            index=a.index,
                            cause=e)
Example #37
    def login(self, path=None):
        """
        EXPECT AN ACCESS TOKEN, RETURN A SESSION TOKEN
        """
        now = Date.now().unix
        try:
            access_token = get_token_auth_header()
            # if access_token.error:
            #     Log.error("{{error}}: {{error_description}}", access_token)
            if len(access_token.split(".")) == 3:
                access_details = self.verify_jwt_token(access_token)
                session.scope = access_details["scope"]

            # ADD TO SESSION
            self.session_manager.setup_session(session)
            user_details = self.verify_opaque_token(access_token)
            session.user = self.permissions.get_or_create_user(user_details)
            session.last_used = now

            self.markup_user()

            return Response(
                value2json(self.session_manager.make_cookie(session)),
                status=200
            )
        except Exception as e:
            session.user = None
            session.last_used = None
            Log.error("failure to authorize", cause=e)
Example #38
def write_profiles(main_thread_profile):
    if cprofiler_stats is None:
        return

    from mo_files import File

    cprofiler_stats.add(pstats.Stats(main_thread_profile.cprofiler))
    stats = cprofiler_stats.pop_all()

    Log.note("aggregating {{num}} profile stats", num=len(stats))
    acc = stats[0]
    for s in stats[1:]:
        acc.add(s)

    stats = [
        {
            "num_calls": d[1],
            "self_time": d[2],
            "total_time": d[3],
            "self_time_per_call": d[2] / d[1],
            "total_time_per_call": d[3] / d[1],
            "file": (f[0] if f[0] != "~" else "").replace("\\", "/"),
            "line": f[1],
            "method": f[2].lstrip("<").rstrip(">")
        }
        for f, d in iteritems(acc.stats)
    ]
    from mo_times import Date

    stats_file = File(FILENAME, suffix=Date.now().format("_%Y%m%d_%H%M%S"))
    stats_file.write(list2tab(stats))
    Log.note("profile written to {{filename}}", filename=stats_file.abspath)
Example #39
    def done(self):
        self.last_finished = Date.now()
        self.terminator.remove_go(self.killer)
        self.terminator = None
        self.current = None
        self.next_run = self._next_run_time()
        self.next = Till(till=self.next_run.unix)
        self.next.then(self.run)
Example #40
    def _next_run_time(self):
        """
        :return: time of the next scheduled run
        """

        interval = mo_math.floor((Date.now() - self.starting) / self.duration)
        next_time = self.starting + (interval + 1) * self.duration  # NEXT BOUNDARY AFTER now
        return next_time
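
The arithmetic aligns runs to fixed boundaries measured from starting, rather than drifting by however long each run took. The same computation in plain seconds, with made-up numbers:

import math

def next_run_time(now, starting, duration):
    """Next boundary of the form starting + k * duration that is strictly after now."""
    interval = math.floor((now - starting) / duration)
    return starting + (interval + 1) * duration

# STARTED AT t=100 WITH A 60s INTERVAL; AT t=250 THE NEXT BOUNDARY IS t=280
print(next_run_time(250, 100, 60))  # 280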
Example #41
    def _delete_testindex(self):
        # DELETE OLD INDEX
        try:
            i = self.cluster.get_best_matching_index(TEST_CONFIG.index)
            if i:
                self.cluster.delete_index(i.index)
        except Exception as e:
            pass
        self.cluster.get_metadata(after=Date.now())
Example #42
    def output(*args, **kwargs):
        # IS THIS A NEW SESSION
        now = Date.now().unix
        user = session.get("user")
        if not user:
            Log.error("must authorize first")

        session.last_used = now
        return func(*args, user=user, **kwargs)
Example #43
    def required_utility(self):
        queue = aws.Queue(self.settings.work_queue)
        pending = len(queue)

        tod_minimum = None
        if Date.now().hour not in [4, 5, 6, 7, 8, 9, 10, 11]:
            tod_minimum = 100

        return max(self.settings.minimum_utility, tod_minimum, Math.ceiling(pending / 30))
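
Note that max(self.settings.minimum_utility, tod_minimum, ...) relies on Python 2 ordering None below every number; under Python 3 the same call raises TypeError. A Python-3-safe equivalent filters the missing bound out first (names and constants here mirror the example, but this is a sketch, not the project's code):

import math

def required_utility(minimum_utility, pending, hour):
    # OUTSIDE THE QUIET HOURS, ENFORCE A TIME-OF-DAY FLOOR OF 100
    tod_minimum = None if hour in [4, 5, 6, 7, 8, 9, 10, 11] else 100
    candidates = [minimum_utility, tod_minimum, math.ceil(pending / 30)]
    return max(c for c in candidates if c is not None)

print(required_utility(minimum_utility=2, pending=900, hour=3))  # 100
print(required_utility(minimum_utility=2, pending=900, hour=5))  # 30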
Example #44
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #45
    def _daemon(self, please_stop):
        while not please_stop:
            timestamp = Date.now()
            with self.lock:
                decay = METRIC_DECAY_RATE ** (timestamp - self.last_request).seconds
                request_rate = self.request_rate = decay * self.request_rate
                self.last_request = timestamp

            Log.note("{{name}} request rate: {{rate|round(places=2)}} requests per second", name=self.name, rate=request_rate)
            (please_stop | Till(seconds=METRIC_REPORT_PERIOD.seconds)).wait()
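
METRIC_DECAY_RATE ** seconds gives an exponentially weighted request rate: elapsed time multiplies the old rate by a constant below one. A self-contained sketch of the idea, folding the decay and the per-request increment into a single add step; the constant is an assumed value, not the library's.

import time

METRIC_DECAY_RATE = 0.9  # PER-SECOND DECAY; AN ASSUMED VALUE

class RateMeter:
    def __init__(self):
        self.request_rate = 0.0
        self.last_request = time.time()

    def add(self, timestamp=None):
        if timestamp is None:
            timestamp = time.time()
        elapsed = timestamp - self.last_request
        # OLD TRAFFIC FADES GEOMETRICALLY WITH ELAPSED TIME, THEN THIS REQUEST COUNTS FOR 1
        self.request_rate = METRIC_DECAY_RATE ** elapsed * self.request_rate + 1
        self.last_request = timestamp

meter = RateMeter()
for _ in range(3):
    meter.add()
print(round(meter.request_rate, 2))  # NEAR 3.0 WHEN REQUESTS ARRIVE BACK-TO-BACK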
Example #46
    def _request_spot_instances(self, price, availability_zone_group, instance_type, kwargs):
        kwargs.kwargs = None

        # m3 INSTANCES ARE NOT ALLOWED PLACEMENT GROUP
        if instance_type.startswith("m3."):
            kwargs.placement_group = None

        kwargs.network_interfaces = NetworkInterfaceCollection(*(
            NetworkInterfaceSpecification(**i)
            for i in listwrap(kwargs.network_interfaces)
            if self.vpc_conn.get_all_subnets(subnet_ids=i.subnet_id, filters={"availabilityZone": availability_zone_group})
        ))

        if len(kwargs.network_interfaces) == 0:
            Log.error("No network interface specifications found for {{availability_zone}}!", availability_zone=kwargs.availability_zone_group)

        block_device_map = BlockDeviceMapping()

        # GENERIC BLOCK DEVICE MAPPING
        for dev, dev_settings in kwargs.block_device_map.items():
            block_device_map[dev] = BlockDeviceType(
                delete_on_termination=True,
                **dev_settings
            )

        kwargs.block_device_map = block_device_map

        # INCLUDE EPHEMERAL STORAGE IN BlockDeviceMapping
        num_ephemeral_volumes = ephemeral_storage[instance_type]["num"]
        for i in range(num_ephemeral_volumes):
            letter = convert.ascii2char(98 + i)  # START AT "b"
            kwargs.block_device_map["/dev/sd" + letter] = BlockDeviceType(
                ephemeral_name='ephemeral' + unicode(i),
                delete_on_termination=True
            )

        if kwargs.expiration:
            kwargs.valid_until = (Date.now() + Duration(kwargs.expiration)).format(ISO8601)
            kwargs.expiration = None

        # ATTACH NEW EBS VOLUMES
        for i, drive in enumerate(self.settings.utility[instance_type].drives):
            letter = convert.ascii2char(98 + i + num_ephemeral_volumes)
            device = drive.device = coalesce(drive.device, "/dev/sd" + letter)
            d = drive.copy()
            d.path = None  # path AND device PROPERTY IS NOT ALLOWED IN THE BlockDeviceType
            d.device = None
            if d.size:
                kwargs.block_device_map[device] = BlockDeviceType(
                    delete_on_termination=True,
                    **d
                )

        output = list(self.ec2_conn.request_spot_instances(**kwargs))
        return output
Example #47
    def request(self, method, path, headers):
        now = Date.now()
        self.inbound_rate.add(now)
        ready = Signal(path)

        # TEST CACHE
        with self.cache_locker:
            pair = self.cache.get(path)
            if pair is None:
                self.cache[path] = (ready, None, None, now)


        if pair is not None:
            # REQUEST IS IN THE QUEUE ALREADY, WAIT
            ready, headers, response, then = pair
            if response is None:
                ready.wait()
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache.get(path)
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # TEST DB
        db_response = self.db.query("SELECT headers, response FROM cache WHERE path=" + quote_value(path)).data
        if db_response:
            headers, response = db_response[0]
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            with self.cache_locker:
                self.cache[path] = (ready, headers, response.encode('latin1'), now)
            ready.go()

            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # MAKE A NETWORK REQUEST
        self.todo.add((ready, method, path, headers, now))
        ready.wait()
        with self.cache_locker:
            ready, headers, response, timestamp = self.cache[path]
        return Response(
            response,
            status=200,
            headers=json.loads(headers)
        )
Example #48
    def _cache_cleaner(self, please_stop):
        while not please_stop:
            now = Date.now()
            too_old = now-CACHE_RETENTION

            remove = set()
            with self.cache_locker:
                for path, (ready, headers, response, timestamp) in self.cache:
                    if timestamp < too_old:
                        remove.add(path)
                for r in remove:
                    del self.cache[r]
            (please_stop | Till(seconds=CACHE_RETENTION.seconds / 2)).wait()
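
_cache_cleaner sweeps on a half-retention cadence, so a stale entry lives at most about one and a half times CACHE_RETENTION. The sweep itself reduces to one dict comprehension, sketched here with an assumed retention value:

import time

CACHE_RETENTION = 60.0  # SECONDS; AN ASSUMED VALUE

def sweep(cache):
    """Drop cache entries whose timestamp fell out of the retention window."""
    too_old = time.time() - CACHE_RETENTION
    return {path: entry for path, entry in cache.items() if entry["timestamp"] >= too_old}

cache = {
    "/fresh": {"timestamp": time.time()},
    "/stale": {"timestamp": time.time() - 3600},
}
print(sorted(sweep(cache)))  # ['/fresh']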
Example #49
    def _rate_limiter(self, please_stop):
        try:
            max_requests = self.requests.max
            recent_requests = []

            while not please_stop:
                now = Date.now()
                too_old = now - self.amortization_period

                recent_requests = [t for t in recent_requests if t > too_old]

                num_recent = len(recent_requests)
                if num_recent >= max_requests:
                    space_free_at = recent_requests[0] + self.amortization_period
                    (please_stop | Till(till=space_free_at.unix)).wait()
                    continue
                for _ in xrange(num_recent, max_requests):
                    request = self.todo.pop()
                    now = Date.now()
                    recent_requests.append(now)
                    self.requests.add(request)
        except Exception as e:
            Log.warning("failure", cause=e)
Example #50
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now()-TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": c}, silent=not DEBUG, too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
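
The shutdown wiring in Example #50 (please_stop.on_go(lambda: self.todo.add(THREAD_STOP))) is the classic queue-sentinel pattern: pushing a marker wakes the blocked pop() so the loop can exit instead of waiting forever. A runnable standard-library sketch:

    import queue
    import threading

    STOP = object()  # SENTINEL, STANDING IN FOR THREAD_STOP

    def drain(todo, handle):
        # BLOCK ON THE QUEUE; EXIT CLEANLY WHEN THE SENTINEL ARRIVES
        while True:
            item = todo.get()
            if item is STOP:
                break
            handle(item)

    todo = queue.Queue()
    worker = threading.Thread(target=drain, args=(todo, print))
    worker.start()
    todo.put("work item")
    todo.put(STOP)      # REQUEST SHUTDOWN; THE BLOCKED get() WAKES AND THE LOOP EXITS
    worker.join()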
Example #51
0
    def __init__(
        self,
        from_address,
        to_address,
        subject,
        region,
        aws_access_key_id=None,
        aws_secret_access_key=None,
        cc=None,
        log_type="ses",
        max_interval=HOUR,
        kwargs=None
    ):
        assert kwargs.log_type == "ses", "Expecting settings to be of type 'ses'"
        self.settings = kwargs
        self.accumulation = []
        self.cc = listwrap(cc)
        self.next_send = Date.now() + MINUTE
        self.locker = Lock()
        self.settings.max_interval = Duration(kwargs.max_interval)
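
This constructor establishes an accumulate-then-flush schedule: the first send is permitted after a one-minute warm-up, and every flush pushes next_send out by max_interval, so email is batched rather than sent per log line. A sketch of that scheduling with illustrative names and plain floats for times:

    import time

    class BatchingSender:
        # BUFFER MESSAGES; FLUSH AT MOST ONCE PER max_interval SECONDS
        def __init__(self, send, max_interval=3600.0, warm_up=60.0):
            self.send = send
            self.max_interval = max_interval
            self.accumulation = []
            self.next_send = time.time() + warm_up

        def emit(self, message):
            self.accumulation.append(message)
            if time.time() >= self.next_send:
                self.next_send = time.time() + self.max_interval
                batch, self.accumulation = self.accumulation, []
                self.send("\n\n".join(batch))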
Example #52
0
    def __init__(
        self,
        from_address,
        to_address,
        subject,
        host,
        username,
        password,
        port=465,
        use_ssl=1,
        cc=None,
        log_type="email",
        max_interval=HOUR,
        kwargs=None
    ):
        """
        SEND WARNINGS AND ERRORS VIA EMAIL

        settings = {
            "log_type":"email",
            "from_address": "*****@*****.**",
            "to_address": "*****@*****.**",
            "cc":[
                {"to_address":"*****@*****.**", "where":{"eq":{"template":"gr"}}}
            ],
            "subject": "Problem in Pulse Logger",
            "host": "mail.mozilla.com",
            "port": 465,
            "username": "******",
            "password": "******",
            "use_ssl": 1
        }

        """
        assert kwargs.log_type == "email", "Expecing settings to be of type 'email'"
        self.settings = kwargs
        self.accumulation = []
        self.cc = listwrap(cc)
        self.next_send = Date.now() + MINUTE
        self.locker = Lock()
        self.settings.max_interval = Duration(kwargs.max_interval)
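
Given settings like those in the docstring, delivery for the "email" log type goes over SSL SMTP. A hedged sketch of what one send might look like using only the standard library (this is not the project's Emailer, just the shape of the call):

    import smtplib
    from email.mime.text import MIMEText

    def send_email(settings, body):
        # SEND ONE PLAIN-TEXT MESSAGE USING SETTINGS SHAPED LIKE THE DOCSTRING'S
        msg = MIMEText(body)
        msg["Subject"] = settings["subject"]
        msg["From"] = settings["from_address"]
        msg["To"] = settings["to_address"]
        with smtplib.SMTP_SSL(settings["host"], settings["port"]) as server:
            server.login(settings["username"], settings["password"])
            server.sendmail(settings["from_address"], [settings["to_address"]], msg.as_string())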
Example #53
0
    def __init__(
        self,
        from_address,
        to_address,
        subject,
        region,
        aws_access_key_id=None,
        aws_secret_access_key=None,
        cc=None,
        log_type="ses",
        average_interval=HOUR,
        kwargs=None
    ):
        """
        SEND WARNINGS AND ERRORS VIA EMAIL

        settings = {
            "log_type": "ses",
            "from_address": "*****@*****.**",
            "to_address": "*****@*****.**",
            "cc":[
                {"to_address":"*****@*****.**", "where":{"eq":{"template":"gr"}}}
            ],
            "subject": "[ALERT][STAGING] Problem in ETL",
            "aws_access_key_id": "userkey"
            "aws_secret_access_key": "secret"
            "region":"us-west-2"
        }
        """
        assert kwargs.log_type == "ses", "Expecing settings to be of type 'ses'"
        self.settings = kwargs
        self.accumulation = []
        self.cc = listwrap(cc)
        self.next_send = Date.now() + MINUTE
        self.locker = Lock()
        self.settings.average_interval = Duration(kwargs.average_interval)
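
The "ses" variants (Examples #51 and #53) deliver through Amazon SES instead of a mail host. For orientation, a hedged sketch of an equivalent send with boto3, the current AWS SDK (settings mirror the docstring; this is not the project's code):

    import boto3

    def send_via_ses(settings, body):
        # ONE PLAIN-TEXT MESSAGE THROUGH AMAZON SES
        client = boto3.client(
            "ses",
            region_name=settings["region"],
            aws_access_key_id=settings["aws_access_key_id"],
            aws_secret_access_key=settings["aws_secret_access_key"],
        )
        client.send_email(
            Source=settings["from_address"],
            Destination={"ToAddresses": [settings["to_address"]]},
            Message={
                "Subject": {"Data": settings["subject"]},
                "Body": {"Text": {"Data": body}},
            },
        )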
Example #54
0
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if not v:
                    return []
                else:
                    return [v] + nested_path_of(all_paths[v])

            all = sort_using_key(set(step for path in self.snowflake.query_paths for step in path), key=lambda p: len(split_field(p)))
            for step in sorted(all):
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                nested_path = nested_path_of(all_paths[p])
                if not nested_path:
                    nested_path = ['.']
                self.namespace.meta.columns.add(Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=OBJECT,
                    jx_type=EXISTS,
                    nested_path=nested_path,
                    last_updated=Date.now()
                ))
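
The existence-column block in Example #54 builds a tree over the query paths by assigning each path the longest already-known prefix as its parent, then unwinding that map into a nested_path chain ending at ".". A self-contained sketch of the construction, approximating startswith_field with a dotted-prefix test:

    def startswith_field(field, prefix):
        # TRUE WHEN field EQUALS prefix OR LIES UNDER IT IN DOTTED NOTATION
        return prefix == "." or field == prefix or field.startswith(prefix + ".")

    def build_nested_paths(paths):
        all_paths = {".": None}  # MAP FROM path TO ITS PARENT
        for step in sorted(paths, key=lambda p: len(p.split("."))):  # SHALLOW PATHS FIRST
            best = "."
            for candidate in all_paths:
                if startswith_field(step, candidate) and startswith_field(candidate, best):
                    best = candidate          # KEEP THE LONGEST MATCHING PREFIX
            all_paths[step] = best

        def parent_chain(v):
            # [v, parent(v), ..., "."]; EMPTY WHEN v IS None
            return [] if not v else [v] + parent_chain(all_paths[v])

        return {p: parent_chain(all_paths[p]) or ["."] for p in all_paths}

    # build_nested_paths({"a", "a.b", "a.b.c"})["a.b.c"]  ->  ['a.b', 'a', '.']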
Example #55
0
        def life_cycle_watcher(please_stop):
            failed_attempts = Data()

            while not please_stop:
                spot_requests = self._get_managed_spot_requests()
                last_get = Date.now()
                instances = wrap({i.id: i for r in self.ec2_conn.get_all_instances() for i in r.instances})
                # INSTANCES THAT REQUIRE SETUP
                time_to_stop_trying = {}
                please_setup = [
                    (i, r) for i, r in [(instances[r.instance_id], r) for r in spot_requests]
                    if i.id and not i.tags.get("Name") and i._state.name == "running" and Date.now() > Date(i.launch_time) + DELAY_BEFORE_SETUP
                ]
                for i, r in please_setup:
                    try:
                        p = self.settings.utility[i.instance_type]
                        if p == None:
                            try:
                                self.ec2_conn.terminate_instances(instance_ids=[i.id])
                                with self.net_new_locker:
                                    self.net_new_spot_requests.remove(r.id)
                            finally:
                                Log.error("Can not setup unknown {{instance_id}} of type {{type}}", instance_id=i.id, type=i.instance_type)
                        i.markup = p
                        try:
                            self.instance_manager.setup(i, coalesce(p, 0))
                        except Exception as e:
                            e = Except.wrap(e)
                            failed_attempts[r.id] += [e]
                            Log.error(ERROR_ON_CALL_TO_SETUP, e)
                        i.add_tag("Name", self.settings.ec2.instance.name + " (running)")
                        with self.net_new_locker:
                            self.net_new_spot_requests.remove(r.id)
                    except Exception as e:
                        e = Except.wrap(e)
                        if not time_to_stop_trying.get(i.id):
                            time_to_stop_trying[i.id] = Date.now() + TIME_FROM_RUNNING_TO_LOGIN
                        if Date.now() > time_to_stop_trying[i.id]:
                            # FAIL TO SETUP AFTER x MINUTES, THEN TERMINATE INSTANCE
                            self.ec2_conn.terminate_instances(instance_ids=[i.id])
                            with self.net_new_locker:
                                self.net_new_spot_requests.remove(r.id)
                            Log.warning("Problem with setup of {{instance_id}}.  Time is up.  Instance TERMINATED!", instance_id=i.id, cause=e)
                        elif "Can not setup unknown " in e:
                            Log.warning("Unexpected failure on startup", instance_id=i.id, cause=e)
                        elif ERROR_ON_CALL_TO_SETUP in e:
                            if len(failed_attempts[r.id]) > 2:
                                Log.warning("Problem with setup() of {{instance_id}}", instance_id=i.id, cause=failed_attempts[r.id])
                        else:
                            Log.warning("Unexpected failure on startup", instance_id=i.id, cause=e)

                if Date.now() - last_get > 5 * SECOND:
                    # REFRESH STALE
                    spot_requests = self._get_managed_spot_requests()
                    last_get = Date.now()

                pending = wrap([r for r in spot_requests if r.status.code in PENDING_STATUS_CODES])
                give_up = wrap([r for r in spot_requests if r.status.code in PROBABLY_NOT_FOR_A_WHILE | TERMINATED_STATUS_CODES])
                ignore = wrap([r for r in spot_requests if r.status.code in MIGHT_HAPPEN])  # MIGHT HAPPEN, BUT NO NEED TO WAIT FOR IT

                if self.done_spot_requests:
                    with self.net_new_locker:
                        expired = Date.now() - self.settings.run_interval + 2 * MINUTE
                        for ii in list(self.net_new_spot_requests):
                            if Date(ii.create_time) < expired:
                                # SOMETIMES REQUESTS NEVER GET INTO THE MAIN LIST OF REQUESTS
                                self.net_new_spot_requests.remove(ii)
                        for g in give_up:
                            self.net_new_spot_requests.remove(g.id)
                        for g in ignore:
                            self.net_new_spot_requests.remove(g.id)
                        pending = UniqueIndex(("id",), data=pending)
                        pending = pending | self.net_new_spot_requests

                    if give_up:
                        self.ec2_conn.cancel_spot_instance_requests(request_ids=give_up.id)
                        Log.note("Cancelled spot requests {{spots}}, {{reasons}}", spots=give_up.id, reasons=give_up.status.code)

                if not pending and not time_to_stop_trying and self.done_spot_requests:
                    Log.note("No more pending spot requests")
                    please_stop.go()
                    break
                elif pending:
                    Log.note("waiting for spot requests: {{pending}}", pending=[p.id for p in pending])

                (Till(seconds=10) | please_stop).wait()

            Log.note("life cycle watcher has stopped")
Example #56
0
    def pricing(self):
        with self.price_locker:
            if self.prices:
                return self.prices

            prices = self._get_spot_prices_from_aws()

            now = Date.now()
            expressions.ALLOW_SCRIPTING = True

            with Timer("processing pricing data"):
                hourly_pricing = jx.run({
                    "from": {
                        # AWS PRICING ONLY SENDS timestamp OF CHANGES, MATCH WITH NEXT INSTANCE
                        "from": prices,
                        "window": [
                            {
                                "name": "expire",
                                "value": {"coalesce": [{"rows": {"timestamp": 1}}, {"date": "eod"}]},
                                "edges": ["availability_zone", "instance_type"],
                                "sort": "timestamp"
                            },
                            {  # MAKE THIS PRICE EFFECTIVE INTO THE PAST, THIS HELPS SPREAD PRICE SPIKES OVER TIME
                                "name": "effective",
                                "value": {"sub": {"timestamp": self.settings.uptime.duration.seconds}}
                            }
                        ]
                    },
                    "edges": [
                        "availability_zone",
                        "instance_type",
                        {
                            "name": "time",
                            "range": {"min": "effective", "max": "expire", "mode": "inclusive"},
                            "allowNulls": False,
                            "domain": {"type": "time", "min": now.floor(HOUR) - self.settings.uptime.history, "max": Date.now().floor(HOUR)+HOUR, "interval": "hour"}
                        }
                    ],
                    "select": [
                        {"value": "price", "aggregate": "max"},
                        {"aggregate": "count"}
                    ],
                    "where": {"gt": {"expire": now.floor(HOUR) - self.settings.uptime.history}},
                    "window": [
                        {
                            "name": "current_price",
                            "value": "rows.last.price",
                            "edges": ["availability_zone", "instance_type"],
                            "sort": "time"
                        }
                    ]
                }).data

                bid80 = jx.run({
                    "from": hourly_pricing,
                    "edges": [
                        {
                            "value": "availability_zone",
                            "allowNulls": False
                        },
                        {
                            "name": "type",
                            "value": "instance_type",
                            "allowNulls": False,
                            "domain": {"type": "set", "key": "instance_type", "partitions": self.settings.utility}
                        }
                    ],
                    "select": [
                        {"name": "price_80", "value": "price", "aggregate": "percentile", "percentile": self.settings.uptime.bid_percentile},
                        {"name": "max_price", "value": "price", "aggregate": "max"},
                        {"aggregate": "count"},
                        {"value": "current_price", "aggregate": "one"},
                        {"name": "all_price", "value": "price", "aggregate": "list"}
                    ],
                    "window": [
                        {"name": "estimated_value", "value": {"div": ["type.utility", "price_80"]}},
                        {"name": "higher_price", "value": lambda row, rownum, rows: find_higher(row.all_price, row.price_80)}  # TODO: SUPPORT {"from":"all_price", "where":{"gt":[".", "price_80"]}, "select":{"aggregate":"min"}}
                    ]
                })

                output = jx.run({
                    "from": bid80,
                    "sort": {"value": "estimated_value", "sort": -1}
                })

                self.prices = wrap(output.data)
                self.price_lookup = UniqueIndex(("type.instance_type", "availability_zone"), data=self.prices)
            return self.prices
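
The key statistic in the bid80 query is a per-(zone, type) percentile of recent prices: bidding at the configured percentile (80th by default here) keeps the instance under the bid most of the time, and utility divided by that bid gives the estimated value used for ranking. The same arithmetic on plain lists, as a sketch:

    import math

    def percentile(samples, p):
        # NEAREST-RANK PERCENTILE (1-INDEXED RANK ceil(p*n)) OF A NON-EMPTY LIST
        ordered = sorted(samples)
        rank = max(1, math.ceil(p * len(ordered)))
        return ordered[rank - 1]

    def bid_for(prices, utility, bid_percentile=0.80):
        # ESTIMATED VALUE = UTILITY PER DOLLAR AT THE PERCENTILE BID
        price_80 = percentile(prices, bid_percentile)
        return {"price_80": price_80, "estimated_value": utility / price_80}

    # bid_for([0.10, 0.12, 0.11, 0.50, 0.13], utility=4.0)
    # -> {'price_80': 0.13, 'estimated_value': 30.76...}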
Example #57
0
    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                   Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
Example #58
0
        except Exception as e:
            Log.error("Unexpected error getting changset-log for {{url}}", url=clog_url, error=e)

        cset = ''
        for clog_cset in clog_obj['changesets']:
            cset = clog_cset['node'][:12]
            if len(csets) < rev_count:
                csets.append(cset)

        final_rev = cset

    # Oldest is now first
    csets.reverse()

    # Make the initial insertion (always slow)
    with Timer("Make initialization request for TUID", {"timestamp": Date.now()}):
        resp = service.get_tuids_from_files(files, csets[0])

    # Backup the database now
    shutil.copy('resources/tuid_app.db', 'resources/tuid_app_tmp.db')

    # While we haven't hit the breaking point perform
    # the stress test
    while tries_after_wait >= 0:
        # Restore the database to before any testing
        # shutil.copy('resources/test_tmp.db', 'resources/test.db')

        for rev in csets[1:]:
            # Request each revision in order
            request = wrap({
                "from": "files",
Example #59
0
    def get_tuids(self, branch, revision, files):
        """
        GET TUIDS FROM ENDPOINT, AND STORE IN DB
        :param branch: BRANCH TO FIND THE REVISION/FILE
        :param revision: THE REVISION NUMBER
        :param files: THE FULL PATHS TO THE FILES
        :return: MAP FROM FILENAME TO TUID LIST
        """

        # SCRUB INPUTS
        revision = revision[:12]
        files = [file.lstrip('/') for file in files]

        with Timer(
            "ask tuid service for {{num}} files at {{revision|left(12)}}",
            {"num": len(files), "revision": revision},
            silent=not self.enabled
        ):
            response = self.db.query(
                "SELECT file, tuids FROM tuid WHERE revision=" + quote_value(revision) +
                " AND file IN " + quote_list(files)
            )
            found = {file: json2value(tuids) for file, tuids in response.data}

            try:
                remaining = set(files) - set(found.keys())
                new_response = None
                if remaining:
                    request = wrap({
                        "from": "files",
                        "where": {"and": [
                            {"eq": {"revision": revision}},
                            {"in": {"path": remaining}},
                            {"eq": {"branch": branch}}
                        ]},
                        "branch": branch,
                        "meta": {
                            "format": "list",
                            "request_time": Date.now()
                        }
                    })
                    if self.push_queue is not None:
                        if DEBUG:
                            Log.note("record tuid request to SQS: {{timestamp}}", timestamp=request.meta.request_time)
                        self.push_queue.add(request)
                    else:
                        if DEBUG:
                            Log.note("no recorded tuid request")

                    if not self.enabled:
                        return found

                    new_response = http.post_json(
                        self.endpoint,
                        json=request,
                        timeout=self.timeout
                    )

                    with self.db.transaction() as transaction:
                        command = "INSERT INTO tuid (revision, file, tuids) VALUES " + sql_list(
                            quote_list((revision, r.path, value2json(r.tuids)))
                            for r in new_response.data
                            if r.tuids != None
                        )
                        if not command.endswith(" VALUES "):
                            transaction.execute(command)
                    self.num_bad_requests = 0

                found.update({r.path: r.tuids for r in new_response.data} if new_response else {})
                return found

            except Exception as e:
                self.num_bad_requests += 1
                Till(seconds=SLEEP_ON_ERROR).wait()
                if self.enabled and self.num_bad_requests >= 3:
                    self.enabled = False
                    Log.error("TUID service has problems.", cause=e)
                return found
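
Example #59 is a read-through cache over HTTP: answer from the local table where possible, ask the service only for the misses, and persist whatever comes back inside one transaction. The shape of that flow with sqlite3 and requests (illustrative schema, payload, and endpoint; not the project's API):

    import json
    import requests

    def read_through(db, endpoint, revision, files, timeout=30):
        # db IS AN OPEN sqlite3.Connection; LOCAL HITS FIRST
        marks = ",".join("?" * len(files))
        rows = db.execute(
            "SELECT file, tuids FROM tuid WHERE revision=? AND file IN (" + marks + ")",
            [revision] + list(files),
        ).fetchall()
        found = {file: json.loads(tuids) for file, tuids in rows}

        # FETCH ONLY THE MISSES, THEN PERSIST THEM
        remaining = sorted(set(files) - set(found))
        if remaining:
            payload = {"revision": revision, "files": remaining}
            data = requests.post(endpoint, json=payload, timeout=timeout).json()["data"]
            with db:  # ONE TRANSACTION FOR ALL INSERTS
                db.executemany(
                    "INSERT INTO tuid (revision, file, tuids) VALUES (?, ?, ?)",
                    [(revision, r["path"], json.dumps(r["tuids"])) for r in data if r.get("tuids") is not None],
                )
            found.update({r["path"]: r["tuids"] for r in data})
        return found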