Example #1
def test_wrap_2():
    Log.alert("Random types")
    switch = [
        lambda: Random.int(20),
        lambda: Random.string(20),
        lambda: {"i": Random.int(2000)},
        lambda: Data(i=Random.int(2000)),
        lambda: FlatList([{"i": Random.int(2000)}]),
        lambda: [{"i": Random.int(2000)}]
    ]

    # PICK A RANDOM GENERATOR, GEOMETRICALLY WEIGHTED TOWARD THE EARLIER ENTRIES
    inputs = [switch[min(len(switch) - 1, int(floor(-log(Random.float(), 2))))]() for i in range(NUM_INPUT)]

    for i in range(NUM_REPEAT):
        results = []
        gc.collect()
        with Timer("more string: to_data"):
            for v in inputs:
                results.append(to_data(v))

        results = []
        gc.collect()
        with Timer("more string: baseline"):
            for v in inputs:
                results.append(baseline(v))

        Log.note("Done {{i}} of {{num}}", i=i, num=NUM_REPEAT)
Example #2
    def test_compare_isinstance_to_text(self):
        num = 1 * 1000 * 1000
        options = {
            0: lambda: 6,
            1: lambda: "string"
            # 2: lambda: {},
            # 3: lambda: Data(),
            # 4: lambda: Null,
        }
        data = [options[Random.int(len(options))]() for _ in range(num)]

        with Timer("isinstance check") as i_time:
            i_result = [isinstance(d, text) for d in data]

        with Timer("set check") as s_time:
            s_result = [d.__class__ in (text, ) for d in data]

        with Timer("eq check") as e_time:
            e_result = [d.__class__ is text for d in data]

        with Timer("name check") as n_time:
            n_result = [is_instance(d, text) for d in data]

        with Timer("check w method") as m_time:
            m_result = [is_text(d) for d in data]

        self.assertEqual(s_result, i_result)
        self.assertEqual(m_result, i_result)
        self.assertEqual(e_result, i_result)
        self.assertEqual(n_result, i_result)

        self.assertGreater(i_time.duration, s_time.duration)
        self.assertGreater(m_time.duration, s_time.duration)
Example #3
    def extend(self, rows):
        if self.read_only:
            Log.error("not for writing")

        try:
            update = {}
            with Timer("encoding"):
                while True:
                    output = []
                    for rownum, row in enumerate(rows):
                        typed, more, add_nested = typed_encode(row, self.flake)
                        update.update(more)
                        if add_nested:
                            # row HAS NEW NESTED COLUMN!
                            # GO OVER THE rows AGAIN SO "RECORD" GET MAPPED TO "REPEATED"
                            break
                        output.append(typed)
                    else:
                        break

            if update or not self.shard:
                # BATCH HAS ADDITIONAL COLUMNS!!
                # WE CAN NOT USE THE EXISTING SHARD, MAKE A NEW ONE:
                self._create_new_shard()
                Log.note("added new shard with name: {{shard}}",
                         shard=self.shard.table_id)
            with Timer("insert {{num}} rows to bq", param={"num": len(rows)}):
                failures = self.container.client.insert_rows_json(
                    self.shard,
                    json_rows=output,
                    row_ids=[None] * len(output),
                    skip_invalid_rows=False,
                    ignore_unknown_values=False,
                )
            if failures:
                if all(r == "stopped" for r in wrap(failures).errors.reason):
                    self._create_new_shard()
                    Log.note(
                        "STOPPED encountered: Added new shard with name: {{shard}}",
                        shard=self.shard.table_id,
                    )
                Log.error(
                    "Got {{num}} failures:\n{{failures|json}}",
                    num=len(failures),
                    failures=failures[:5],
                )
            else:
                self.last_extend = Date.now()
                Log.note("{{num}} rows added", num=len(output))
        except Exception as e:
            e = Except.wrap(e)
            if len(rows) > 1 and "Request payload size exceeds the limit" in e:
                # TRY A SMALLER BATCH
                cut = len(rows) // 2
                self.extend(rows[:cut])
                self.extend(rows[cut:])
                return
            Log.error("Do not know how to handle", cause=e)
Example #4
    def test_id_vs_id(self):

        # COMPARE LOOKUP BY BUILTIN id() IN A dict TO LOOKUP BY AN id ATTRIBUTE IN A list
        ops = [Op() for _ in range(200)]
        lang1 = {id(o): o for o in ops}

        sample = Random.sample(ops, 1000 * 1000)
        with Timer("using id()"):
            result1 = [lang1[id(o)] for o in sample]

        lang2 = [None] * (max(o.id for o in ops) + 1)
        for o in ops:
            lang2[o.id] = o
        # lang2 = tuple(lang2)

        with Timer("using o.id"):
            result2 = [lang2[o.id] for o in sample]
Example #5
    def _get(self, key):
        with Timer("get {{key}} from S3", {"key": key},
                   verbose=False) as timer:
            output = s3_get(self, key)
            if output is not None:
                timer.verbose = True
            return output
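The snippet above also shows Timer's verbose switch: the timer is created with verbose=False so nothing is logged in the common case, and timer.verbose is flipped to True inside the block when the result is worth reporting (here, a successful S3 read). A small sketch of the same toggle, assuming the mo-times Timer; the helper name is hypothetical and a plain dict stands in for the S3 bucket.

# SKETCH OF THE verbose-TOGGLE IDIOM FROM THE SNIPPET ABOVE
# get_if_cached IS A HYPOTHETICAL HELPER; A dict STANDS IN FOR THE S3 BUCKET
from mo_times.timer import Timer

def get_if_cached(cache, key):
    with Timer("get {{key}} from cache", {"key": key}, verbose=False) as timer:
        output = cache.get(key)
        if output is not None:
            timer.verbose = True  # ONLY SUCCESSFUL LOOKUPS GET LOGGED
        return output

value = get_if_cached({"a": 1}, "a")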
Example #6
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now() - TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}",
                       param={"col": c},
                       silent=not DEBUG,
                       too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
Example #7
    def _parse_properties(self, alias, mapping, meta):
        abs_columns = elasticsearch.parse_properties(alias, None,
                                                     mapping.properties)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = es_type_to_json_type[abs_column.es_type]
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(
                        abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))
        pass
Example #8
    def _db_load(self):
        self.last_load = Date.now()

        try:
            self.es_index = self.es_cluster.get_index(
                id=ID,
                index=META_COLUMNS_NAME,
                type=META_COLUMNS_TYPE_NAME,
                read_only=False)

            result = self.es_index.search({
                "query": {
                    "bool": {
                        "should": [
                            {
                                "bool": {
                                    "must_not": {
                                        "exists": {
                                            "field": "cardinality.~n~"
                                        }
                                    }
                                }
                            },
                            {  # ASSUME UNUSED COLUMNS DO NOT EXIST
                                "range": {
                                    "cardinality.~n~": {
                                        "gte": 0
                                    }
                                }
                            },
                        ]
                    }
                },
                "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                "size":
                10000,
            })

            with Timer("adding columns to structure"):
                for r in result.hits.hits._source:
                    col = doc_to_column(r)
                    if col:
                        self._add(col)

            Log.note("{{num}} columns loaded", num=result.hits.total)
            if not self.data.get(META_COLUMNS_NAME):
                Log.error("metadata missing from index!")

        except Exception as e:
            metadata = self.es_cluster.get_metadata(after=Date.now())
            if any(
                    index.startswith(META_COLUMNS_NAME)
                    for index in metadata.indices.keys()):
                Log.error("metadata already exists!", cause=e)

            Log.warning("no {{index}} exists, making one",
                        index=META_COLUMNS_NAME,
                        cause=e)
            self._db_create()
Example #9
def update_local_database(config, deviant_summary, candidates, since):
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": {
            "and": [
                {
                    "in": {
                        "signature_hash": candidates.signature_hash
                    }
                },
                {
                    "exists": "num_pushes"
                },
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.signature_hash for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.display.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example #10
    def encode(self, value, pretty=False):
        if pretty:
            return pretty_json(value)

        try:
            with Timer("scrub", too_long=0.1):
                scrubbed = scrub(value)
            with Timer("encode", too_long=0.1):
                return text_type(self.encoder(scrubbed))
        except Exception as e:
            from mo_logs.exceptions import Except
            from mo_logs import Log

            e = Except.wrap(e)
            Log.warning("problem serializing {{type}}",
                        type=text_type(repr(value)),
                        cause=e)
            raise e
Example #11
    def _monitor(self, please_stop):
        with Timer(self.name):
            self.service.wait()
            self.debug and Log.note(
                "{{process}} STOP: returncode={{returncode}}",
                process=self.name,
                returncode=self.service.returncode)
            self.service_stopped.go()
            please_stop.go()
Example #12
    def __init__(self, kwargs=None):
        # GENERATE PRIVATE KEY
        self.config = kwargs
        self.session = None
        with Timer("generate {{bits}} bits rsa key",
                   {"bits": self.config.rsa.bits}):
            Log.note("This will take a while....")
            self.public_key, self.private_key = rsa_crypto.generate_key(
                bits=self.config.rsa.bits)
Example #13
def test_long_file(service):
    timer = Timer("test", silent=True)

    with timer:
        service.get_tuids(
            files="gfx/angle/checkout/src/libANGLE/formatutils.cpp",
            revision="29dcc9cb77c3")

    assert timer.duration.seconds < 30
Example #14
    def encode(self, value, pretty=False):
        if pretty:
            return pretty_json(value)

        try:
            with Timer("scrub", too_long=0.1):
                scrubbed = scrub(value)
            param = {"size": 0}
            with Timer("encode {{size}} characters", param=param, too_long=0.1):
                output = text_type(self.encoder(scrubbed))
                param["size"] = len(output)
                return output
        except Exception as e:
            from mo_logs.exceptions import Except
            from mo_logs import Log

            e = Except.wrap(e)
            Log.warning("problem serializing {{type}}", type=text_type(repr(value)), cause=e)
            raise e
Example #15
    def test_save_then_load(self):

        test = {
            "data": [{
                "a": "b"
            }],
            "query": {
                "meta": {
                    "save": True
                },
                "from": TEST_TABLE,
                "select": "a"
            },
            "expecting_list": {
                "meta": {
                    "format": "list"
                },
                "data": ["b"]
            }
        }

        settings = self.utils.fill_container(test)

        bytes = unicode2utf8(
            value2json({
                "from": settings.index,
                "select": "a",
                "format": "list"
            }))
        expected_hash = convert.bytes2base64(
            hashlib.sha1(bytes).digest()[0:6]).replace("/", "_")
        wrap(test).expecting_list.meta.saved_as = expected_hash

        self.utils.send_queries(test)

        # ENSURE THE QUERY HAS BEEN INDEXED
        Log.note("Flush saved query (with hash {{hash}})", hash=expected_hash)
        container = elasticsearch.Index(index="saved_queries",
                                        type=save_query.DATA_TYPE,
                                        kwargs=settings)
        container.flush(forced=True)
        with Timer("wait for 5 seconds"):
            Till(seconds=5).wait()

        url = URL(self.utils.testing.query)
        response = self.utils.try_till_response(url.scheme + "://" + url.host +
                                                ":" + text_type(url.port) +
                                                "/find/" + expected_hash,
                                                data=b'')
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.all_content, bytes)
Example #16
    def test_compare_isinstance_to_class_checks(self):
        num = 1 * 1000 * 1000
        options = {
            0: lambda: {},
            1: lambda: Data(),
            2: lambda: Null,
            3: lambda: 6,
            4: lambda: "string",
        }
        data = [options[Random.int(len(options))]() for _ in range(num)]

        with Timer("isinstance check") as i_time:
            i_result = [isinstance(d, Mapping) for d in data]

        with Timer("set check") as s_time:
            s_result = [d.__class__ in MAPPING_TYPES for d in data]

        with Timer("eq check") as e_time:
            e_result = [
                d.__class__ is Data or d.__class__ is dict for d in data
            ]

        with Timer("name check") as n_time:
            n_result = [
                is_instance(d, Data) or is_instance(d, dict) for d in data
            ]

        with Timer("check w method") as m_time:
            m_result = [is_mapping(d) for d in data]

        self.assertEqual(s_result, i_result)
        self.assertEqual(m_result, i_result)
        self.assertEqual(e_result, i_result)
        self.assertEqual(n_result, i_result)

        self.assertGreater(i_time.duration, s_time.duration)
        self.assertGreater(m_time.duration, s_time.duration)
Example #17
def upload(filename, temp_file):
    with Timer("upload file to S3 {{file}}", param={"file": filename}):
        try:
            connection = Connection(S3_CONFIG).connection
            bucket = connection.get_bucket(S3_CONFIG.bucket, validate=False)
            storage = bucket.new_key(filename)
            storage.set_contents_from_filename(
                temp_file.abspath, headers={"Content-Type": mimetype.JSON})
            if S3_CONFIG.public:
                storage.set_acl("public-read")

        except Exception as e:
            Log.error("Problem connecting to {{bucket}}",
                      bucket=S3_CONFIG.bucket,
                      cause=e)
Example #18
def test_mode_wait(query, please_stop):
    """
    WAIT FOR METADATA TO ARRIVE ON INDEX
    :param query: dict() OF REQUEST BODY
    :return: nothing
    """

    if not query["from"]:
        return

    try:
        if query["from"].startswith("meta."):
            return

        alias = split_field(query["from"])[0]
        after = Date.now()
        require_cardinality = meta.ENABLE_META_SCAN
        with Timer(
            "Get columns for {{table}} after {{after}}",
            {"table": alias, "after": after},
            verbose=DEBUG,
        ):
            metadata_manager = find_container(alias, after=after).namespace

            timeout = Till(seconds=MINUTE.seconds) | please_stop
            while not timeout:
                # GET FRESH VERSIONS
                cols = metadata_manager.get_columns(table_name=alias,
                                                    after=after,
                                                    timeout=timeout)
                not_ready = [
                    c for c in cols if c.jx_type not in STRUCT and (
                        after >= c.last_updated or
                        (require_cardinality and c.cardinality == None))
                ]
                if not_ready:
                    Log.note(
                        "wait for column (table={{col.es_index}}, name={{col.es_column}}, cardinality={{col.cardinality|json}}, last_updated={{col.last_updated|datetime}}) metadata to arrive",
                        col=first(not_ready),
                    )
                else:
                    break
                Till(seconds=1).wait()
    except Exception as e:
        Log.warning("could not pickup columns", cause=e)
Example #19
def update_local_database():
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {
            "and": [{
                "in": {
                    "id": candidates.id
                }
            }, {
                "exists": "num_pushes"
            }]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.id for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example #20
def main():
    try:
        config = startup.read_settings()
        constants.set(config.constants)
        inject_secrets(config)

        with Timer("PATCH ADR: dd update() method to Configuration class"):

            def update(self, config):
                """
                Update the configuration object with new parameters
                :param config: dict of configuration
                """
                for k, v in config.items():
                    if v != None:
                        self._config[k] = v

                self._config["sources"] = sorted(
                    map(os.path.expanduser, set(self._config["sources"]))
                )

                # Use the NullStore by default. This allows us to control whether
                # caching is enabled or not at runtime.
                self._config["cache"].setdefault("stores", {"null": {"driver": "null"}})
                object.__setattr__(self, "cache", CacheManager(self._config["cache"]))
                self.cache.extend("null", lambda driver: NullStore())

            setattr(Configuration, "update", update)

        # UPDATE ADR CONFIGURATION
        adr.config.update(config.adr)

        Log.start(config.debug)

        # SHUNT ADR LOGGING TO MAIN LOGGING
        # https://loguru.readthedocs.io/en/stable/api/logger.html#loguru._logger.Logger.add
        loguru.logger.remove()
        loguru.logger.add(
            _logging, level="DEBUG", format="{message}", filter=lambda r: True,
        )

        Schedulers(config).process()
    except Exception as e:
        Log.warning("Problem with etl! Shutting down.", cause=e)
    finally:
        Log.stop()
Example #21
    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        def add_column(c, query_path):
            c.last_updated = Date.now() - TOO_OLD
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(
                    c.names["."], query_path[0])

            with self.meta.columns.locker:
                for alias in meta.aliases:
                    c_ = copy(c)
                    c_.es_index = alias
                    self._upsert_column(c_)
                self._upsert_column(c)

        abs_columns = elasticsearch.parse_properties(abs_index, None,
                                                     properties.properties)
        self.abs_columns.update(abs_columns)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(SELF_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                abs_column = abs_column.__copy__()
                abs_column.type = es_type_to_json_type[abs_column.type]
                for query_path in query_paths:
                    add_column(abs_column, query_path)
        pass
Example #22
def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
    ************************************************************************
    """
    with Timer("get secrets"):
        options = taskcluster.optionsFromEnvironment()
        secrets = taskcluster.Secrets(options)
        acc = Data()
        for s in listwrap(SECRET_NAMES):
            acc[s] = secrets.get(concat_field(SECRET_PREFIX, s))['secret']
        set_default(config, acc)
Example #23
def inject_secrets(config):
    """
    INJECT THE SECRETS INTO THE CONFIGURATION
    :param config: CONFIG DATA

    ************************************************************************
    ** ENSURE YOU HAVE AN ENVIRONMENT VARIABLE SET:
    ** TASKCLUSTER_ROOT_URL = https://community-tc.services.mozilla.com
    ************************************************************************
    """
    with Timer("get secrets"):
        secrets = taskcluster.Secrets(config.taskcluster)
        acc = Data()
        for s in listwrap(SECRET_NAMES):
            secret_name = concat_field(SECRET_PREFIX, s)
            Log.note("get secret named {{name|quote}}", name=secret_name)
            acc[s] = secrets.get(secret_name)["secret"]
        set_default(config, acc)
Example #24
    def _parse_properties(self, alias, mapping, meta):
        abs_columns = elasticsearch.parse_properties(alias, None,
                                                     mapping.properties)
        if any(c.cardinality == 0 and c.names['.'] != '_id'
               for c in abs_columns):
            Log.warning("Some columns are not stored {{names}}",
                        names=[
                            ".".join((c.es_index, c.names['.']))
                            for c in abs_columns if c.cardinality == 0
                        ])

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths
            for i in self.index_to_alias.get_domain(alias):
                self.alias_to_query_paths[i] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = jx_type(abs_column)
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(
                        abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))
        pass
Example #25
def one_request(request, please_stop):
    and_op = request.where['and']

    files = []
    for a in and_op:
        if a['in'].path:
            files = a['in'].path
        elif a.eq.path:
            files = [a.eq.path]

    with Timer("Make TUID request from {{timestamp|datetime}}",
               {"timestamp": request.meta.request_time}):
        try:
            result = http.post_json("http://localhost:5000/tuid",
                                    json=request,
                                    timeout=30)
            if result is None or len(result.data) != len(files):
                Log.note("incomplete response for {{thread}}",
                         thread=Thread.current().name)
        except Exception as e:
            Log.warning("Request failure", cause=e)
Example #26
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break
            # if untype_path(column.name) in ["build.type", "run.type"]:
            #     Log.note("found")

            if column.jx_type in STRUCT or split_field(
                    column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct",
                                   column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note(
                    "{{column.es_column}} is still fresh ({{ago}} ago)",
                    column=column,
                    ago=(Date.now() - Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}",
                       param={"col": column},
                       silent=not DEBUG,
                       too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
                else:
                    column.last_updated = Date.now()
Example #27
    def test_recovery_of_empty_string(self):
        test = wrap({
            "data": [
                {"a": "bee"}
            ],
            "query": {
                "from": TEST_TABLE,
                "select": "a",
                "where": {"prefix": {"a": ""}},
                "format": "list"
            },
            "expecting_list": {
                "meta": {
                    "format": "list"
                },
                "data": ["bee"]
            }
        })

        settings = self.utils.fill_container(test)

        bytes = value2json(test.query).encode('utf8')
        expected_hash = convert.bytes2base64(hashlib.sha1(bytes).digest()[0:6]).replace("/", "_")
        test.expecting_list.meta.saved_as = expected_hash

        test.query.meta = {"save": True}
        self.utils.send_queries(test)

        # ENSURE THE QUERY HAS BEEN INDEXED
        Log.note("Flush saved query")
        container = elasticsearch.Index(index="saved_queries", kwargs=settings)
        container.flush(forced=True)
        with Timer("wait for 5 seconds"):
            Till(seconds=5).wait()

        url = URL(self.utils.testing.query)
        response = self.utils.try_till_response(url.scheme + "://" + url.host + ":" + text(url.port) + "/find/" + expected_hash, data=b'')
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.all_content, bytes)
Example #28
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.then(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            pair = self.todo.pop()
            if pair is THREAD_STOP:
                break
            column, after = pair

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
                if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                    # DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                    continue
                elif after and column.last_updated > after:
                    continue  # COLUMN IS STILL YOUNG
                elif column.last_updated > Date.now() - TOO_OLD and column.cardinality > 0:
                    # DO NOT UPDATE FRESH COLUMN METADATA
                    DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                    continue

                if untype_path(column.name) in KNOWN_MULTITYPES:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                    continue

                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
Example #29
def write_status(guid, status):
    try:
        filename = guid + ".status.json"
        with Timer("upload status to S3 {{file}}", param={"file": filename}, verbose=DEBUG):
            try:
                connection = Connection(S3_CONFIG).connection
                bucket = connection.get_bucket(S3_CONFIG.bucket, validate=False)
                storage = bucket.new_key(filename)
                storage.set_contents_from_string(
                    value2json(status), headers={"Content-Type": mimetype.JSON}
                )
                if S3_CONFIG.public:
                    storage.set_acl("public-read")

            except Exception as e:
                Log.error(
                    "Problem connecting to {{bucket}}",
                    bucket=S3_CONFIG.bucket,
                    cause=e
                )
    except Exception as e:
        Log.warning("problem setting status", cause=e)
def es_bulkaggsop(esq, frum, query):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html#_filtering_values_with_partitions
    query = query.copy()  # WE WILL MARK UP THIS QUERY

    chunk_size = min(coalesce(query.chunk_size, MAX_CHUNK_SIZE),
                     MAX_CHUNK_SIZE)
    schema = frum.schema
    query_path = first(schema.query_path)
    selects = listwrap(query.select)

    variable = first(query.groupby).value
    # FIND CARDINALITY

    cardinality_check = Timer("Get cardinality for {{column}}",
                              param={"column": variable.var})

    with cardinality_check:
        columns = schema.leaves(variable.var)
        if len(columns) != 1:
            Log.error("too many columns to bulk groupby:\n{{columns|json}}",
                      columns=columns)
        column = first(columns)

        if query.where is TRUE:
            cardinality = column.cardinality
            if cardinality == None:
                esq.namespace._update_cardinality(column)
                cardinality = column.cardinality
        else:
            cardinality = esq.query({
                "select": {
                    "name": "card",
                    "value": variable,
                    "aggregate": "cardinality",
                },
                "from": frum.name,
                "where": query.where,
                "format": "cube",
            }).card

        num_partitions = (cardinality + chunk_size - 1) // chunk_size

        if num_partitions > MAX_PARTITIONS:
            Log.error("Requesting more than {{num}} partitions",
                      num=num_partitions)
        if num_partitions == 0:
            num_partitions = 1

        acc, decoders, es_query = aggop_to_es_queries(selects, query_path,
                                                      schema, query)
        guid = randoms.base64(32, extra="-_")
        abs_limit = mo_math.MIN(
            (query.limit, first(query.groupby).domain.limit))
        formatter = formatters[query.format](abs_limit)

        Thread.run(
            "extract to " + guid + ".json",
            extractor,
            guid,
            num_partitions,
            esq,
            query,
            selects,
            query_path,
            schema,
            chunk_size,
            cardinality,
            abs_limit,
            formatter,
            parent_thread=Null,
        ).release()

    output = to_data({
        "url": URL_PREFIX / (guid + ".json"),
        "status": URL_PREFIX / (guid + ".status.json"),
        "meta": {
            "format": query.format,
            "timing": {
                "cardinality_check": cardinality_check.duration
            },
            "es_query": es_query,
            "num_partitions": num_partitions,
            "cardinality": cardinality,
        },
    })
    return output
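The cardinality_check timer in the snippet above also shows that a Timer does not have to be created inline: it is constructed first, entered later as a context manager, and its duration is read afterwards so the timing can be embedded in the response payload. A minimal sketch of that pattern, again assuming the mo-times Timer; the label and the workload are made up.

# SKETCH OF THE PRE-CONSTRUCTED Timer PATTERN FROM es_bulkaggsop ABOVE
from mo_times.timer import Timer

setup_check = Timer("set up {{name}}", param={"name": "demo"})

with setup_check:
    total = sum(range(10000))  # STAND-IN FOR THE CARDINALITY QUERY

# THE DURATION CAN BE REPORTED AFTER THE BLOCK, E.G. IN A RESPONSE PAYLOAD
report = {"timing": {"setup": setup_check.duration}, "total": total}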