def _worker(self, please_stop):
        curr = "0.0"
        acc = []
        last_count_written = -1
        next_write = Date.now()

        while not please_stop:
            d = self.temp_queue.pop(timeout=MINUTE)
            if d == None:
                if not acc:
                    continue
                # WRITE THE INCOMPLETE DATA TO S3, BUT NOT TOO OFTEN
                next_write = Date.now() + MINUTE
                try:
                    if last_count_written != len(acc):
                        if DEBUG:
                            Log.note("write incomplete data ({{num}} lines) to {{uid}} in S3 next (time = {{next_write}})", uid=curr, next_write=next_write, num=len(acc))
                        self.bucket.write_lines(curr, (convert.value2json(a) for a in acc))
                        last_count_written = len(acc)
                except Exception as e:
                    Log.note("Problem with write to S3", cause=e)
            elif d[UID_PATH] != curr:
                # WRITE acc TO S3 IF WE ARE MOVING TO A NEW KEY
                try:
                    if acc:
                        if DEBUG:
                            Log.note("write complete data ({{num}} lines) to {{curr}} in S3", num=len(acc), curr=curr)
                        self.bucket.write_lines(curr, (convert.value2json(a) for a in acc))
                        last_count_written = 0
                    curr = d[UID_PATH]
                    acc = [d]
                except Exception as e:
                    Log.warning("Can not store data", cause=e)
                    Thread.sleep(30*MINUTE)
            else:
                # SAME KEY: KEEP ACCUMULATING (branch implied by the key-change logic above)
                acc.append(d)
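
A condensed, dependency-free sketch of the same flush-on-key-change / flush-on-idle pattern (all names here are illustrative, not from the source; pop_message and write_lines stand in for the queue and the S3 bucket):

def batch_writer(pop_message, write_lines, should_stop):
    # pop_message() RETURNS (key, record), OR None ON TIMEOUT
    # write_lines(key, records) PERSISTS ONE BATCH (e.g. TO S3)
    curr = None
    acc = []
    while not should_stop():
        msg = pop_message()
        if msg is None:
            if acc:
                write_lines(curr, acc)  # IDLE: FLUSH THE INCOMPLETE BATCH
        else:
            key, record = msg
            if key != curr:
                if acc:
                    write_lines(curr, acc)  # KEY CHANGED: FLUSH THE FINISHED BATCH
                curr, acc = key, [record]
            else:
                acc.append(record)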
Example #2
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(
                    name=short_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                    Thread.sleep(seconds=1)
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)
Example #3
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(Thread.STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == Thread.STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
            Log.note("Could not get {{col.es_index}}.{{col.es_column}} info",
                     col=c)
Example #4
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(Thread.STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.columns.locker:
                        old_columns = filter(
                            lambda c: (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in ["object", "nested"],
                            self.columns
                        )
                        if old_columns:
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.abs_name == d.abs_name and c.table == d.table and c != d:
                                    Log.error("duplicate column in todo queue")
                        else:
                            Log.note("no more metadata to update")

                column = self.todo.pop(timeout=10*MINUTE)
                if column:
                    if column.type in ["object", "nested"]:
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #5
 def __init__(self,
              host,
              index,
              type="query",
              max_size=10,
              batch_size=10,
              settings=None):
     """
     settings ARE FOR THE ELASTICSEARCH INDEX
     """
     es = Cluster(settings).get_or_create_index(
         schema=convert.json2value(convert.value2json(SCHEMA), leaves=True),
         limit_replicas=True,
         settings=settings
     )
     # ENSURE THE TYPE EXISTS FOR PROBING
     try:
         es.add({
             "id": "dummy",
             "value": {
                 "hash": "dummy",
                 "create_time": Date.now(),
                 "last_used": Date.now(),
                 "query": {}
             }
         })
     except Exception as e:
         Log.warning("Problem saving query", cause=e)
Example #6
    def test_timing(self):
        if self.not_real_service():
            return

        test = wrap({
            "query": {
                "from": {
                    "type": "elasticsearch",
                    "settings": {
                        "host": ES_CLUSTER_LOCATION,
                        "index": "unittest",
                        "type": "test_result"
                    }
                },
                "select": [{
                    "name": "count",
                    "value": "run.duration",
                    "aggregate": "count"
                }, {
                    "name": "total",
                    "value": "run.duration",
                    "aggregate": "sum"
                }],
                "edges": [{
                    "name": "chunk",
                    "value": ["run.suite", "run.chunk"]
                }, "result.ok"],
                "where": {
                    "and": [{
                        "lt": {
                            "timestamp": Date.floor(Date.now()).milli / 1000
                        }
                    }, {
                        "gte": {
                            "timestamp":
                            Date.floor(Date.now() - (Duration.DAY * 7),
                                       Duration.DAY).milli / 1000
                        }
                    }]
                },
                "format":
                "cube",
                "samples": {
                    "limit": 30
                }
            }
        })

        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #7
def log_loop(settings, synch, queue, bucket, please_stop):
    with aws.Queue(settings.work_queue) as work_queue:
        for i, g in qb.groupby(queue, size=settings.param.size):
            Log.note(
                "Preparing {{num}} pulse messages to bucket={{bucket}}",
                num=len(g),
                bucket=bucket.name
            )

            full_key = unicode(synch.next_key) + ":" + unicode(MIN(g.select("_meta.count")))
            try:
                output = [
                    set_default(
                        d,
                        {"etl": {
                            "name": "Pulse block",
                            "bucket": settings.destination.bucket,
                            "timestamp": Date.now().unix,
                            "id": synch.next_key,
                            "source": {
                                "name": "pulse.mozilla.org",
                                "id": d._meta.count,
                                "count": d._meta.count,
                                "message_id": d._meta.message_id,
                                "sent": Date(d._meta.sent),
                            },
                            "type": "aggregation"
                        }}
                    )
                    for d in g
                    if d != None  # HAPPENS WHEN PERSISTENT QUEUE FAILS TO LOG start
                ]
                bucket.write(full_key, "\n".join(convert.value2json(d) for d in output))
                synch.advance()
                synch.source_key = MAX(g.select("_meta.count")) + 1

                now = Date.now()
                work_queue.add({
                    "bucket": bucket.name,
                    "key": full_key,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

                synch.ping()
                queue.commit()
                Log.note("Wrote {{num}} pulse messages to bucket={{bucket}}, key={{key}} ",
                    num= len(g),
                    bucket= bucket.name,
                    key= full_key)
            except Exception as e:
                queue.rollback()
                if not queue.closed:
                    Log.warning("Problem writing {{key}} to S3", key=full_key, cause=e)

            if please_stop:
                break
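
For reference, the full_key written above pairs the synchronizer's next_key with the smallest _meta.count in the block; with illustrative values, next_key 3077 and a minimum count of 12 produce the S3 key "3077:12".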
Example #8
    def test_timeout(self):
        def test(please_stop):
            Till(seconds=10).wait()

        now = Date.now()
        thread = Thread.run("sleeper", test)
        Till(seconds=0.5).wait()
        thread.stop()
        self.assertGreater(now.unix+1, Date.now().unix, "Expecting quick stop")
        Log.note("done")
Example #9
    def save(self, query):
        query.meta = None
        json = convert.value2json(query)
        hash = convert.unicode2utf8(json)

        #TRY MANY HASHES AT ONCE
        hashes = [None] * HASH_BLOCK_SIZE
        for i in range(HASH_BLOCK_SIZE):
            hash = hashlib.sha1(hash).digest()
            hashes[i] = hash

        short_hashes = [
            convert.bytes2base64(h[0:6]).replace("/", "_") for h in hashes
        ]
        available = {h: True for h in short_hashes}

        existing = self.es.query({
            "from": {
                "type": "elasticsearch",
                "settings": self.es.settings
            },
            "where": {
                "terms": {
                    "hash": short_hashes
                }
            },
            "meta": {
                "timeout": "2second"
            }
        })

        for e in Cube(select=existing.select,
                      edges=existing.edges,
                      data=existing.data).values():
            if e.query == json:
                return e.hash
            available[e.hash] = False

        # THIS WILL THROW AN ERROR IF THERE ARE NONE, HOW UNLUCKY!
        best = [h for h in short_hashes if available[h]][0]

        self.queue.add({
            "id": best,
            "value": {
                "hash": best,
                "create_time": Date.now(),
                "last_used": Date.now(),
                "query": json
            }
        })

        Log.note("Saved query as {{hash}}", hash=best)

        return best
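
A standalone sketch of the hash-chaining trick used above, with only the standard library (candidate_ids and payload are illustrative names, not from the source):

import base64
import hashlib

def candidate_ids(payload, count=10):
    # CHAIN SHA-1 OVER THE PREVIOUS DIGEST TO PRODUCE count CANDIDATE SHORT IDS
    digest = payload.encode("utf8")
    ids = []
    for _ in range(count):
        digest = hashlib.sha1(digest).digest()
        # FIRST 6 BYTES -> EXACTLY 8 BASE64 CHARACTERS, MADE FILENAME-SAFE
        ids.append(base64.b64encode(digest[:6]).decode("ascii").replace("/", "_"))
    return ids

The caller then takes the first candidate not already claimed, exactly as save() does with its available dict.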
Example #10
 def _send_email(self):
     try:
         if self.accumulation:
             with Emailer(self.settings) as emailer:
                 emailer.send_email(from_address=self.settings.from_address,
                                    to_address=self.settings.to_address,
                                    subject=self.settings.subject,
                                    text_data="\n\n".join(
                                        self.accumulation))
         self.next_send = Date.now() + self.settings.max_interval
         self.accumulation = []
     except Exception as e:
         self.next_send = Date.now() + self.settings.max_interval
         Log.warning("Could not send", cause=e)
Example #11
 def _send_email(self):
     try:
         if self.accumulation:
             with Emailer(self.settings) as emailer:
                 emailer.send_email(
                     from_address=self.settings.from_address,
                     to_address=self.settings.to_address,
                     subject=self.settings.subject,
                     text_data="\n\n".join(self.accumulation)
                 )
         self.next_send = Date.now() + self.settings.max_interval
         self.accumulation = []
     except Exception as e:
         self.next_send = Date.now() + self.settings.max_interval
         Log.warning("Could not send", cause=e)
Example #12
def get_branches(hg, branches, use_cache=True, settings=None):
    if not settings.branches or not use_cache:
        found_branches = _get_branches_from_hg(hg)

        es = elasticsearch.Cluster(settings=branches).get_or_create_index(settings=branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
        es.flush()
        return found_branches

    # TRY ES
    try:
        es = elasticsearch.Cluster(settings=branches).get_index(settings=branches)
        query = {
            "query": {"match_all": {}},
            "size": 20000
        }

        docs = es.search(query).hits.hits._source
        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(Math.MAX(docs.etl.timestamp))
        if Date.now() - oldest > OLD_BRANCH:
            return get_branches(hg, branches, use_cache=False, settings=settings)

        try:
            return UniqueIndex(["name", "locale"], data=docs, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        if "Can not find index " in e:
            return get_branches(hg, branches, use_cache=False, settings=settings)
        Log.error("problem getting branches", cause=e)
Example #13
    def write(self, template, params):
        with self.locker:
            if params.context not in [NOTE, ALARM]:  # DO NOT SEND THE BORING STUFF
                self.accumulation.append(expand_template(template, params))

            if Date.now() > self.next_send:
                self._send_email()
Example #14
    def _upsert_column(self, c):
        # ASSUMING self.meta.columns.locker IS HELD
        existing_columns = [
            r for r in self.meta.columns.data
            if r.table == c.table and r.name == c.name
        ]
        if not existing_columns:
            self.meta.columns.add(c)
            Log.note("todo: {{table}}.{{column}}",
                     table=c.table,
                     column=c.es_column)
            self.todo.add(c)

            # MARK meta.columns AS DIRTY TOO
            cols = [
                r for r in self.meta.columns.data if r.table == "meta.columns"
            ]
            for cc in cols:
                cc.partitions = cc.cardinality = None
                cc.last_updated = Date.now()
            self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            for key in Column.__slots__:
                canonical[key] = c[key]
            Log.note("todo: {{table}}.{{column}}",
                     table=canonical.table,
                     column=canonical.es_column)
            self.todo.add(canonical)
Example #15
def record_request(request, query_, data, error):
    try:
        if request_log_queue == None:
            return

        if data and len(data) > 10000:
            data = data[:10000]

        log = wrap({
            "timestamp": Date.now(),
            "http_user_agent": request.headers.get("user_agent"),
            "http_accept_encoding": request.headers.get("accept_encoding"),
            "path": request.headers.environ["werkzeug.request"].full_path,
            "content_length": request.headers.get("content_length"),
            "remote_addr": request.remote_addr,
            "query": query_,
            "data": data,
            "error": error
        })
        log["from"] = request.headers.get("from")
        request_log_queue.add({"value": log})
    except Exception as e:
        Log.warning("Can not record", cause=e)
Example #16
    def loop(self, please_stop):
        with self.work_queue:
            while not please_stop:
                if self.settings.wait_forever:
                    todo = None
                    while not please_stop and not todo:
                        if isinstance(self.work_queue, aws.Queue):
                            todo = self.work_queue.pop(wait=EXTRA_WAIT_TIME)
                        else:
                            todo = self.work_queue.pop()
                else:
                    if isinstance(self.work_queue, aws.Queue):
                        todo = self.work_queue.pop()
                    else:
                        todo = self.work_queue.pop(till=Date.now())
                    if todo == None:
                        please_stop.go()
                        return

                try:
                    is_ok = self._dispatch_work(todo)
                    if is_ok:
                        self.work_queue.commit()
                    else:
                        self.work_queue.rollback()
                except Exception as e:
                    self.work_queue.rollback()
                    Log.warning("could not process {{key}}.  Returned to work queue.", key=todo.key, cause=e)
Example #17
    def loop(self, please_stop):
        with self.work_queue:
            while not please_stop:
                if self.settings.wait_forever:
                    todo = None
                    while not please_stop and not todo:
                        if isinstance(self.work_queue, aws.Queue):
                            todo = self.work_queue.pop(wait=EXTRA_WAIT_TIME)
                        else:
                            todo = self.work_queue.pop()
                else:
                    if isinstance(self.work_queue, aws.Queue):
                        todo = self.work_queue.pop()
                    else:
                        todo = self.work_queue.pop(till=Date.now())
                    if todo == None:
                        please_stop.go()
                        return

                try:
                    is_ok = self._dispatch_work(todo)
                    if is_ok:
                        self.work_queue.commit()
                    else:
                        self.work_queue.rollback()
                except Exception as e:
                    self.work_queue.rollback()
                    Log.warning(
                        "could not process {{key}}.  Returned to work queue.",
                        key=todo.key,
                        cause=e)
Example #18
    def output(*args):
        with cache_store.locker:
            if using_self:
                self = args[0]
                args = args[1:]
            else:
                self = cache_store

            now = Date.now()
            try:
                _cache = getattr(self, attr_name)
            except Exception:
                _cache = {}
                setattr(self, attr_name, _cache)

            if Random.int(100) == 0:
                # REMOVE OLD CACHE
                _cache = {
                    k: v
                    for k, v in _cache.items() if v[0] == None or v[0] > now
                }
                setattr(self, attr_name, _cache)

            timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))
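
The fragment above is cut off mid-function; a minimal self-contained sketch of the same cache-with-expiry idea (illustrative only, not the library's implementation):

import functools
import time

def cache_for(seconds):
    def decorator(func):
        store = {}  # MAPS args -> (expiry_time, value)

        @functools.wraps(func)
        def wrapper(*args):
            now = time.time()
            hit = store.get(args)
            if hit is not None and hit[0] > now:
                return hit[1]  # STILL FRESH: SERVE FROM CACHE
            value = func(*args)
            store[args] = (now + seconds, value)
            return value
        return wrapper
    return decorator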
Example #19
    def write(self, template, params):
        with self.locker:
            if params.params.warning.template:
                self.accumulation.append(expand_template(template, params))

                if Date.now() > self.last_sent + WAIT_TO_SEND_MORE:
                    self._send_email()
Example #21
    def __init__(
        self,
        from_address,
        to_address,
        subject,
        host,
        username,
        password,
        port=465,
        use_ssl=1,
        log_type="email",
        settings=None
    ):
        """
        SEND WARNINGS AND ERRORS VIA EMAIL

        settings = {
            "log_type":"email",
            "from_address": "*****@*****.**",
            "to_address": "*****@*****.**",
            "subject": "Problem in Pulse Logger",
            "host": "mail.mozilla.com",
            "port": 465,
            "username": "******",
            "password": "******",
            "use_ssl": 1
        }

        """
        assert settings.log_type == "email", "Expecing settings to be of type 'email'"
        self.settings = settings
        self.accumulation = []
        self.last_sent = Date.now()-Duration.YEAR
        self.locker = Lock()
Example #22
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #23
    def __init__(self,
                 from_address,
                 to_address,
                 subject,
                 host,
                 username,
                 password,
                 port=465,
                 use_ssl=1,
                 log_type="email",
                 max_interval=HOUR,
                 settings=None):
        """
        SEND WARNINGS AND ERRORS VIA EMAIL

        settings = {
            "log_type":"email",
            "from_address": "*****@*****.**",
            "to_address": "*****@*****.**",
            "subject": "Problem in Pulse Logger",
            "host": "mail.mozilla.com",
            "port": 465,
            "username": "******",
            "password": "******",
            "use_ssl": 1
        }

        """
        assert settings.log_type == "email", "Expecing settings to be of type 'email'"
        self.settings = settings
        self.accumulation = []
        self.next_send = Date.now() + MINUTE
        self.locker = Lock()
        self.settings.max_interval = Duration(settings.max_interval)
Example #24
    def get_markup(self, branch, revision, task_id=None, buildername=None, timestamp=None):
        # TRY CACHE
        if not branch or not revision:
            Log.error("expecting branch and revision")

        if self.settings.use_cache:
            if task_id:
                _filter = {"term": {"task.id": task_id}}
            else:
                _filter = {"term": {"ref_data_name": buildername}}

            query = {
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        _filter,
                        {"term": {"repo.branch": branch}},
                        {"prefix": {"repo.revision": revision}},
                        {"or": [
                            {"range": {"etl.timestamp": {"gte": (Date.now() - HOUR).unix}}},
                            {"range": {"job.timing.last_modified": {"lt": (Date.now() - DAY).unix}}}
                        ]}
                    ]}
                }},
                "size": 10000
            }

            try:
                docs = self.cache.search(query, timeout=120).hits.hits
            except Exception as e:
                docs = None
                Log.warning("Bad ES call, fall back to TH", cause=e)

            if not docs:
                pass
            elif len(docs) == 1:
                if DEBUG:
                    Log.note("Used ES cache to get TH details on {{value|quote}}", value=coalesce(task_id, buildername))
                return docs[0]._source
            elif timestamp == None:
                Log.error("timestamp required to find best match")
            else:
                # MISSING docs._source.job.timing.end WHEN A PLACEHOLDER WAS ADDED
                # TODO: SHOULD DELETE OVERLAPPING PLACEHOLDER RECORDS
                timestamp = Date(timestamp).unix
                best_index = jx.sort([(i, abs(coalesce(e, 0) - timestamp)) for i, e in enumerate(docs._source.job.timing.end)], 1)[0][0]
                return docs[best_index]._source
Example #25
 def ping(self):
     self.ping_time = Date.now()
     self.synch.write(convert.value2json({
         "action": "ping",
         "next_key": self.next_key,
         "source_key": self.source_key,
         "timestamp": self.ping_time.milli
     }))
Example #26
    def write(self, template, params):
        with self.locker:
            if params.context not in [NOTE, ALARM]:  # DO NOT SEND THE BORING STUFF
                self.accumulation.append(expand_template(template, params))
                self.accumulation.append(expand_template(template, params))

            if Date.now() > self.next_send:
                self._send_email()
Example #27
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(Thread.STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated <
                                Date.now() - TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            Log.note("Old columns wth dates {{dates|json}}",
                                     dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue),
                                                list(self.todo.queue)):
                                if c.es_column == d.es_column and c.table == d.table and c != d:
                                    Log.error("")
                        else:
                            Log.note("no more metatdata to update")

                column = self.todo.pop(Till(timeout=10 * MINUTE))
                if column:
                    Log.note("update {{table}}.{{column}}",
                             table=column.table,
                             column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.table.startswith(
                                TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #28
    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        meta = self.es_metadata.indices[table]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[table]

        for _, properties in meta.mappings.items():
            self._parse_properties(meta.index, properties, meta)
Example #29
 def ping(self):
     self.ping_time = Date.now()
     self.synch.write(
         convert.value2json({
             "action": "ping",
             "next_key": self.next_key,
             "source_key": self.source_key,
             "timestamp": self.ping_time.milli
         }))
Example #30
    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        meta = self.es_metadata.indices[table]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[table]

        for _, properties in meta.mappings.items():
            self._parse_properties(meta.index, properties, meta)
Example #31
 def shutdown(self):
     self.pinger_thread.stop()
     self.pinger_thread.join()
     self.synch.write(convert.value2json({
         "action": "shutdown",
         "next_key": self.next_key,
         "source_key": self.source_key,
         "timestamp": Date.now().milli
     }))
Example #32
            def add_column(c, query_path):
                c.last_updated = Date.now()
                c.table = join_field([c.es_index]+split_field(query_path[0]))

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        c.table = join_field([alias]+split_field(query_path[0]))
                        self._upsert_column(c)
Example #33
 def shutdown(self):
     self.pinger_thread.stop()
     self.pinger_thread.join()
     self.synch.write(
         convert.value2json({
             "action": "shutdown",
             "next_key": self.next_key,
             "source_key": self.source_key,
             "timestamp": Date.now().milli
         }))
Example #34
 def __init__(self, synch):
     """
     synch HAS read() AND write() SO SEPARATE INSTANCES CAN DETERMINE IF OTHERS ARE ALIVE
     RAISE EXCEPTION IF SOME OTHER INSTANCE HAS BEEN DETECTED
     RETURN START OF COUNT (always >=1)
     """
     self.synch = synch
     self.pinger_thread = None
     self.next_key = 1
     self.ping_time = Date.now()
     self.source_key = 0
Example #35
 def __init__(self, synch):
     """
     synch HAS read() AND write() SO SEPARATE INSTANCES CAN DETERMINE IF OTHERS ARE ALIVE
     RAISE EXCEPTION IF SOME OTHER INSTANCE HAS BEEN DETECTED
     RETURN START OF COUNT (always >=1)
     """
     self.synch = synch
     self.pinger_thread = None
     self.next_key = 1
     self.ping_time = Date.now()
     self.source_key = 0
Example #36
            def add_column(c, query_path):
                c.last_updated = Date.now()
                c.table = join_field([c.es_index] + split_field(query_path[0]))

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        c.table = join_field([alias] +
                                             split_field(query_path[0]))
                        self._upsert_column(c)
Example #37
def _test_mode_wait(query):
    """
    WAIT FOR METADATA TO ARRIVE ON INDEX
    :param query: dict() OF REQUEST BODY
    :return: nothing
    """
    try:
        m = meta.singlton
        now = Date.now()
        end_time = now + MINUTE

        # MARK COLUMNS DIRTY
        m.meta.columns.update({
            "clear": ["partitions", "count", "cardinality", "last_updated"],
            "where": {
                "eq": {
                    "table": join_field(split_field(query["from"])[0:1])
                }
            }
        })

        # BE SURE THEY ARE ON THE todo QUEUE FOR RE-EVALUATION
        cols = [
            c for c in m.get_columns(table_name=query["from"], force=True)
            if c.type not in STRUCT
        ]
        for c in cols:
            Log.note("Mark {{column}} dirty at {{time}}",
                     column=c.name,
                     time=now)
            c.last_updated = now - TOO_OLD
            m.todo.push(c)

        while end_time > Date.now():
            # GET FRESH VERSIONS
            cols = [
                c for c in m.get_columns(table_name=query["from"])
                if c.type not in STRUCT
            ]
            for c in cols:
                if not c.last_updated or c.cardinality == None:
                    Log.note(
                        "wait for column (table={{col.table}}, name={{col.name}}) metadata to arrive",
                        col=c)
                    break
            else:
                break
            Thread.sleep(seconds=1)
        for c in cols:
            Log.note(
                "fresh column name={{column.name}} updated={{column.last_updated|date}} parts={{column.partitions}}",
                column=c)
    except Exception as e:
        Log.warning("could not pickup columns", cause=e)
Example #38
    def _rate_limited_get_json(self, *args, **kwargs):
        now = Date.now().unix
        with self.rate_locker:
            if self.request_times[self.request_pointer] >= now - 1:
                Log.note("Rate limiting")
                Thread.sleep(seconds=self.request_times[self.request_pointer] - now + 1)
            self.request_times[self.request_pointer] = now
            self.request_pointer += 1
            self.request_pointer %= len(self.request_times)

        return http.get_json(*args, **kwargs)
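
A self-contained sketch of the same ring-buffer throttle (at most N requests per second), assuming only the standard library (Throttle is an illustrative name):

import threading
import time

class Throttle(object):
    def __init__(self, per_second=10):
        self.lock = threading.Lock()
        self.times = [0.0] * per_second  # TIMESTAMPS OF THE LAST N CALLS
        self.pointer = 0

    def wait(self):
        with self.lock:
            now = time.time()
            oldest = self.times[self.pointer]
            if oldest >= now - 1:
                # THE CALL N-BACK HAPPENED UNDER A SECOND AGO: SLEEP OUT THE REMAINDER
                time.sleep(oldest - now + 1)
                now = time.time()
            self.times[self.pointer] = now
            self.pointer = (self.pointer + 1) % len(self.times)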
Example #39
    def _wait_for_stable(self, detect_function, timeout):
        """
        WAIT FOR RESULTS OF detect_function TO BE STABLE
        """
        if not isinstance(timeout, timedelta):
            Log.error("Expecting a timeout as a timedelta")

        detectTime = Date.now()
        oldValue = "probably never an initial value"
        newValue = detect_function()
        while True:
            now = Date.now()
            potentialValue = detect_function()
            if potentialValue != newValue:
                oldValue = newValue
                newValue = potentialValue
                detectTime = now
            if now - detectTime > timeout:
                return
            Thread.sleep(seconds=0.5)
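
A hypothetical caller, assuming some object exposing _wait_for_stable and a zero-argument probe function (harness and output.log are illustrative, not from the source):

import os
from datetime import timedelta

def output_size():
    # PROBE: SIZE OF A FILE ANOTHER PROCESS IS STILL WRITING
    return os.path.getsize("output.log")

# RETURNS ONCE THE SIZE HAS NOT CHANGED FOR A FULL 5 SECONDS
harness._wait_for_stable(output_size, timeout=timedelta(seconds=5))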
Example #40
    def _wait_for_stable(self, detect_function, timeout):
        """
        WAIT FOR RESULTS OF detect_function TO BE STABLE
        """
        if not isinstance(timeout, timedelta):
            Log.error("Expecting a timeout as a timedelta")

        detectTime = Date.now()
        oldValue = "probably never an initial value"
        newValue = detect_function()
        while True:
            now = Date.now()
            potentialValue = detect_function()
            if potentialValue != newValue:
                oldValue = newValue
                newValue = potentialValue
                detectTime = now
            if now - detectTime > timeout:
                return
            Thread.sleep(seconds=0.5)
Example #41
 def _pinger(self, please_stop):
     Log.note("pinger started")
     while not please_stop:
         Thread.sleep(till=self.ping_time + PING_PERIOD, please_stop=please_stop)
          if please_stop:  # EXIT EARLY, OTHERWISE WE MAY OVERWRITE THE shutdown
             break
         if Date.now() < self.ping_time + PING_PERIOD:
             continue
         try:
             self.ping()
          except Exception as e:
              Log.warning("synchro.py could not ping", cause=e)
Example #42
    def _rate_limited_get_json(self, *args, **kwargs):
        now = Date.now().unix
        with self.rate_locker:
            if self.request_times[self.request_pointer] >= now - 1:
                Log.note("Rate limiting")
                Thread.sleep(seconds=self.request_times[self.request_pointer] -
                             now + 1)
            self.request_times[self.request_pointer] = now
            self.request_pointer += 1
            self.request_pointer %= len(self.request_times)

        return http.get_json(*args, **kwargs)
Example #43
 def _pinger(self, please_stop):
     Log.note("pinger started")
     while not please_stop:
         Thread.sleep(till=self.ping_time + PING_PERIOD,
                      please_stop=please_stop)
          if please_stop:  # EXIT EARLY, OTHERWISE WE MAY OVERWRITE THE shutdown
             break
         if Date.now() < self.ping_time + PING_PERIOD:
             continue
         try:
             self.ping()
          except Exception as e:
              Log.warning("synchro.py could not ping", cause=e)
Example #44
    def trigger_job(self):
        while not self.please_stop:
            now = Date.now()
            next = now + DAY

            for j in self.jobs:
                if j.next_run_time < now:
                    j.next_run_time = next_run(j)
                    self.run_job(j)

                next = Date.min(next, j.next_run_time)

            Thread.sleep(till=next, please_stop=self.please_stop)
Example #45
    def _send_email(self):
        try:
            if self.accumulation:
                conn = connect_to_region(
                    self.settings.region,
                    aws_access_key_id=unwrap(self.settings.aws_access_key_id),
                    aws_secret_access_key=unwrap(
                        self.settings.aws_secret_access_key))

                conn.send_email(source=self.settings.from_address,
                                to_addresses=listwrap(
                                    self.settings.to_address),
                                subject=self.settings.subject,
                                body="\n\n".join(self.accumulation),
                                format="text")

                conn.close()
            self.next_send = Date.now() + WAIT_TO_SEND_MORE
            self.accumulation = []
        except Exception as e:
            self.next_send = Date.now() + WAIT_TO_SEND_MORE
            Log.warning("Could not send", cause=e)
Example #46
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(Thread.STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.table == d.table and c != d:
                                    Log.error("")
                        else:
                            Log.note("no more metatdata to update")

                column = self.todo.pop(Till(timeout=10*MINUTE))
                if column:
                    Log.note("update {{table}}.{{column}}", table=column.table, column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.table.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #47
 def __init__(self,
              from_address,
              to_address,
              subject,
              region,
              aws_access_key_id=None,
              aws_secret_access_key=None,
              log_type="ses",
              settings=None):
     assert settings.log_type == "ses", "Expecing settings to be of type 'ses'"
     self.settings = settings
     self.accumulation = []
     self.next_send = Date.now() + MINUTE
     self.locker = Lock()
Example #48
def next_run(job):
    if job.settings.start_next:
        formula = next_run(job.settings.start_next)
    elif job.settings.start_interval:
        formula = "now|" + job.settings.start_interval + "+" + job.settings.start_interval
    else:
        Log.error("Expecting `start_next` or `start_interval` for job {{job}}",
                  job=job.name)

    now = Date.now()
    next = Date(formula)
    if next < now:
        Log.error("{{formula|quote}} does not calculate a future date")
    return next
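
For instance (illustrative), start_interval = "day" builds the formula "now|day+day": floor the current time to the start of the day, then add one day, assuming pyLibrary's relative-date string syntax.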
Example #49
    def _send_email(self):
        try:
            if self.accumulation:
                conn = connect_to_region(
                    self.settings.region,
                    aws_access_key_id=unwrap(self.settings.aws_access_key_id),
                    aws_secret_access_key=unwrap(self.settings.aws_secret_access_key)
                )

                conn.send_email(
                    source=self.settings.from_address,
                    to_addresses=listwrap(self.settings.to_address),
                    subject=self.settings.subject,
                    body="\n\n".join(self.accumulation),
                    format="text"
                )

                conn.close()
            self.next_send = Date.now() + WAIT_TO_SEND_MORE
            self.accumulation = []
        except Exception as e:
            self.next_send = Date.now() + WAIT_TO_SEND_MORE
            Log.warning("Could not send", cause=e)
Example #50
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(name=short_name,
                              url=None,
                              query_path=None,
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    Log.note("waiting for columns to update {{columns|json}}",
                             columns=[
                                 c.table + "." + c.es_column for c in columns
                                 if not c.last_updated
                             ])
                    Thread.sleep(seconds=1)
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)
Example #51
    def startup(self):
        try:
            json = self.synch.read()
            if not json:
                Log.note("{{synchro_key}} does not exist.  Starting.",
                         synchro_key=SYNCHRONIZATION_KEY)
                return

            last_run = convert.json2value(json)
            self.next_key = last_run.next_key
            self.source_key = last_run.source_key
            if last_run.action == "shutdown":
                Log.note(
                    "{{synchro_key}} shutdown detected.  Starting at {{num}}",
                    synchro_key=SYNCHRONIZATION_KEY,
                    num=self.next_key)
            else:
                resume_time = Date(last_run.timestamp) + WAIT_FOR_ACTIVITY
                Log.note(
                    "Shutdown not detected, waiting until {{time}} to see if existing pulse_logger is running...",
                    time=resume_time)
                while resume_time > Date.now():
                    Thread.sleep(seconds=10)
                    json = self.synch.read()
                    if json == None:
                        Log.note(
                            "{{synchro_key}} disappeared!  Starting over.",
                            synchro_key=SYNCHRONIZATION_KEY)
                        self._start()
                        self.pinger_thread = Thread.run(
                            "synch pinger", self._pinger)
                        return

                    last_run = convert.json2value(json)  # RE-PARSE THE FRESH READ; A STALE last_run WOULD NEVER CHANGE
                    self.next_key = last_run.next_key
                    self.source_key = last_run.source_key
                    if last_run.action == "shutdown":
                        Log.note("Shutdown detected!  Resuming...")
                        self._start()
                        self.pinger_thread = Thread.run(
                            "synch pinger", self._pinger)
                        return

                    if last_run.timestamp > self.ping_time:
                        Log.error(
                            "Another instance of pulse_logger is running!")
                    Log.note("No activity, still waiting...")
                Log.note("No activity detected!  Resuming...")
        except Exception as e:
            Log.error("Can not start", cause=e)
Example #52
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(Thread.STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == Thread.STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Example #53
def add_to_queue(work_queue, redo, bucket_name):
    now = Date.now()
    for r in redo:
        k = literal_field(r)
        counter[k] += 1
        if counter[k] > 3:
            Log.error("Problem backfilling {{key}}: Tried >=3 times, giving up",  key= r)
            continue

        work_queue.add({
            "bucket": bucket_name,
            "key": r,
            "timestamp": now.unix,
            "date/time": now.format()
        })
Example #54
def submit_data(bucket, permissions, value):
    global containers
    # CONFIRM THIS IS JSON, AND ANNOTATE
    data = {
        "etl": {
            "user": permissions.hawk.id,
            "bucket": bucket,
            "timestamp": Date.now()
        },
        "data": value
    }

    storage = containers.get(bucket)
    if storage == None:
        storage = containers[bucket] = Storage(bucket=bucket, settings=config.aws)
    return storage.add(data)
Example #55
            def add_column(c, query_path):
                c.last_updated = Date.now()
                if query_path:
                    c.table = c.es_index + "." + query_path.last()
                else:
                    c.table = c.es_index

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        if query_path:
                            c.table = alias + "." + query_path.last()
                        else:
                            c.table = alias
                        self._upsert_column(c)
Example #56
 def setUpClass(self):
     # REMOVE OLD INDEXES
     cluster = elasticsearch.Cluster(global_settings.backend_es)
     aliases = cluster.get_aliases()
     for a in aliases:
         try:
             if a.index.startswith("testing_"):
                 create_time = Date(
                     a.index[-15:], "%Y%m%d_%H%M%S"
                 )  # EXAMPLE testing_0ef53e45b320160118_180420
                 if create_time < Date.now() - 10 * MINUTE:
                     cluster.delete_index(a.index)
          except Exception as e:
             Log.warning("Problem removing {{index|quote}}",
                         index=a.index,
                         cause=e)
Example #57
            def add_column(c, query_path):
                c.last_updated = Date.now()
                if query_path:
                    c.table = c.es_index + "." + query_path.last()
                else:
                    c.table = c.es_index

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        if query_path:
                            c.table = alias + "." + query_path.last()
                        else:
                            c.table = alias
                        self._upsert_column(c)
Example #58
 def __init__(
     self,
     from_address,
     to_address,
     subject,
     region,
     aws_access_key_id=None,
     aws_secret_access_key=None,
     log_type="ses",
     settings=None
 ):
     assert settings.log_type == "ses", "Expecing settings to be of type 'ses'"
     self.settings = settings
     self.accumulation = []
     self.next_send = Date.now() + MINUTE
     self.locker = Lock()
Example #59
def seen_nonce(sender_id, nonce, timestamp):
    global seen
    key = '{id}:{nonce}:{ts}'.format(
        id=sender_id,
        nonce=nonce,
        ts=timestamp
    )

    if Random.int(1000) == 0:
        old = (Date.now() - HOUR).unix
        seen = {k: v for k, v in seen.items() if v["timestamp"] >= old}

    if seen.get(key):
        return True
    else:
        seen[key] = {"timestamp": timestamp}
        return False
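
A quick illustration of the replay check above, assuming the module-level seen dict starts empty:

import time

seen = {}
ts = time.time()
assert seen_nonce("client-1", "abc123", ts) == False  # FIRST USE: RECORDED, NOT A REPLAY
assert seen_nonce("client-1", "abc123", ts) == True   # SECOND USE: FLAGGED AS A REPLAY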
Example #60
    def next(
        self,
        source_etl,  # ETL STRUCTURE DESCRIBING SOURCE
        name  # NAME FOR HUMANS TO BETTER UNDERSTAND WHICH SOURCE THIS IS
    ):
        num = self.next_id
        self.next_id = num + 1
        dest_key = self.source_key + "." + unicode(num)

        dest_etl = wrap({
            "id": num,
            "name": name,
            "source": source_etl,
            "type": "join",
            "timestamp": Date.now().unix
        })

        return dest_key, dest_etl