Example #1
    def write_lines(self, key, *lines):
        self._verify_key_format(key)
        storage = self.bucket.new_key(key + ".json.gz")

        buff = BytesIO()
        archive = gzip.GzipFile(fileobj=buff, mode='w')
        count = 0
        for l in lines:
            if hasattr(l, "__iter__"):
                for ll in l:
                    archive.write(ll.encode("utf8"))
                    archive.write(b"\n")
                    count += 1
            else:
                archive.write(l.encode("utf8"))
                archive.write(b"\n")
                count += 1
        archive.close()
        file_length = buff.tell()
        buff.seek(0)
        with Timer("Sending {{count}} lines in {{file_length|comma}} bytes", {
                "file_length": file_length,
                "count": count
        },
                   debug=self.settings.debug):
            storage.set_contents_from_file(buff)

        if self.settings.public:
            storage.set_acl('public-read')
        return
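All of these examples lean on the same context-manager pattern: Timer takes a message template with {{name}} placeholders, an optional params dict, and flags such as debug or silent; it times the enclosed block, reports on exit, and exposes the elapsed time as .duration afterwards (Examples #14 and #15 even read .duration.seconds). Below is a minimal sketch of that pattern, assuming only the behavior visible in these examples; the real Timer expands the {{name}} placeholders through its logging library and its .duration is an object rather than a plain float, which this sketch does not model.

import time

class TimerSketch(object):
    """Illustrative stand-in for the Timer used throughout these examples."""

    def __init__(self, template, params=None, debug=True, silent=False):
        self.template = template          # e.g. "Sending {{count}} lines"
        self.params = params or {}
        self.enabled = debug and not silent
        self.duration = None              # filled in on exit

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.duration = time.time() - self.start
        if self.enabled:
            # the real Timer interpolates self.params into the template; this sketch does not
            print("%s took %.3f seconds" % (self.template, self.duration))

with TimerSketch("Sending {{count}} lines", {"count": 3}) as t:
    time.sleep(0.01)
print(t.duration)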
Example #2
def get_raw_json(path):
    active_data_timer = Timer("total duration")
    body = flask.request.get_data()
    try:
        with active_data_timer:
            args = wrap(Dict(**flask.request.args))
            limit = args.limit if args.limit else 10
            args.limit = None
            frum = wrap_from(path)
            result = jx.run(
                {
                    "from": path,
                    "where": {
                        "eq": args
                    },
                    "limit": limit,
                    "format": "list"
                }, frum)

            if isinstance(
                    result, Container
            ):  #TODO: REMOVE THIS CHECK, jx SHOULD ALWAYS RETURN Containers
                result = result.format("list")

        result.meta.active_data_response_time = active_data_timer.duration

        response_data = convert.unicode2utf8(
            convert.value2json(result.data, pretty=True))
        Log.note("Response is {{num}} bytes", num=len(response_data))
        return Response(response_data, status=200)
    except Exception, e:
        e = Except.wrap(e)
        return _send_error(active_data_timer, body, e)
Example #3
    def get_treeherder_job(self):
        try:
            with Timer("Process Request"):
                args = Dict(**flask.request.args)

                # IS THE branch/revision PENDING?

                result = self.get_markup(unwraplist(args.branch),
                                         unwraplist(args.revision),
                                         unwraplist(args.task_id),
                                         unwraplist(args.buildername),
                                         unwraplist(args.timestamp))

                response_data = convert.unicode2utf8(
                    convert.value2json(result))
                return Response(response_data,
                                status=200,
                                headers={
                                    "access-control-allow-origin": "*",
                                    "content-type": "text/plain"
                                })
        except Exception, e:
            e = Except.wrap(e)
            Log.warning("Could not process", cause=e)
            e = e.as_dict()

            return Response(convert.unicode2utf8(convert.value2json(e)),
                            status=400,
                            headers={
                                "access-control-allow-origin": "*",
                                "content-type": "application/json"
                            })
Example #4
    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(':memory:')

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop()
                if DEBUG:
                    Log.note("done pop")

                if DEBUG:
                    Log.note("Running command\n{{command|indent}}",
                             command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.data = curr.fetchall()
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(
                                ERROR,
                                "Problem with\n{{command|indent}}",
                                command=command,
                                cause=e)
                        finally:
                            signal.go()
Example #5
    def test_multiple_agg_on_same_field(self):
        if self.not_real_service():
            return

        test = wrap({
            "query": {
                "from": {
                    "type": "elasticsearch",
                    "settings": {
                        "host": ES_CLUSTER_LOCATION,
                        "index": "unittest",
                        "type": "test_result"
                    }
                },
                "select": [{
                    "name": "max_bytes",
                    "value": "run.stats.bytes",
                    "aggregate": "max"
                }, {
                    "name": "count",
                    "value": "run.stats.bytes",
                    "aggregate": "count"
                }]
            }
        })

        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
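The query tests in this listing all share the same round trip: serialize the query to UTF-8 JSON, send it to self.service_url inside a Timer, fail on anything other than a 200 status, and decode the JSON body. For orientation, here is roughly the same round trip written against the standard requests library instead of the http/convert wrappers used above; service_url and the example query are placeholders, not values from the source.

import json
import requests

def run_query(service_url, query):
    # send the query as the request body, mirroring http.get(self.service_url, data=query)
    response = requests.get(service_url, data=json.dumps(query).encode("utf8"))
    if response.status_code != 200:
        raise Exception("query failed with status %s" % response.status_code)
    return json.loads(response.content.decode("utf8"))

# result = run_query("http://localhost:5000/query", {"from": "unittest"})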
Example #6
def verify_blobber_file(line_number, name, url):
    """
    :param line_number:  for debugging
    :param name:  for debugging
    :param url:  TO BE READ
    :return:  RETURNS BYTES **NOT** UNICODE
    """
    if name in ["emulator-5554.log", "qemu.log"] or any(
            map(name.endswith, [".png", ".html"])):
        return None, 0

    with Timer("Read {{name}}: {{url}}", {
            "name": name,
            "url": url
    },
               debug=DEBUG):
        response = http.get(url)
        try:
            logs = response.all_lines
        except Exception, e:
            if name.endswith("_raw.log"):
                Log.error(
                    "Line {{line}}: {{name}} = {{url}} is NOT structured log",
                    line=line_number,
                    name=name,
                    url=url,
                    cause=e)
            if DEBUG:
                Log.note(
                    "Line {{line}}: {{name}} = {{url}} is NOT structured log",
                    line=line_number,
                    name=name,
                    url=url)
            return None, 0
Example #7
 def extend(self, values):
     records = []
     for v in wrap(values):
         row = {"_id": v.id}
         for k, vv in v.value.leaves():
             row[k] = vv
         records.append(row)
     with Timer("Push {{num}} records to Redshift", {"num": len(records)}):
         self.db.insert_list(self.settings.table, records)
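The loop above flattens each nested value into a flat Redshift row keyed by dotted paths, via .leaves(). Here is a small sketch of that flattening, assuming only that leaves() yields (dotted_path, leaf_value) pairs; the flatten helper below is hypothetical and not part of the library.

def flatten(value, prefix=""):
    # walk a nested dict and yield ("a.b.c", leaf) pairs, like .leaves()
    for k, v in sorted(value.items()):
        path = k if not prefix else prefix + "." + k
        if isinstance(v, dict):
            for pair in flatten(v, path):
                yield pair
        else:
            yield path, v

row = {"_id": "274:13455"}
row.update(flatten({"run": {"suite": "mochitest", "chunk": 3}}))
# row is now {"_id": "274:13455", "run.suite": "mochitest", "run.chunk": 3}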
Example #8
    def test_timing(self):
        if self.not_real_service():
            return

        test = wrap({
            "query": {
                "from": {
                    "type": "elasticsearch",
                    "settings": {
                        "host": ES_CLUSTER_LOCATION,
                        "index": "unittest",
                        "type": "test_result"
                    }
                },
                "select": [{
                    "name": "count",
                    "value": "run.duration",
                    "aggregate": "count"
                }, {
                    "name": "total",
                    "value": "run.duration",
                    "aggregate": "sum"
                }],
                "edges": [{
                    "name": "chunk",
                    "value": ["run.suite", "run.chunk"]
                }, "result.ok"],
                "where": {
                    "and": [{
                        "lt": {
                            "timestamp": Date.floor(Date.now()).milli / 1000
                        }
                    }, {
                        "gte": {
                            "timestamp": Date.floor(Date.now() - (Duration.DAY * 7),
                                                    Duration.DAY).milli / 1000
                        }
                    }]
                },
                "format": "cube",
                "samples": {
                    "limit": 30
                }
            }
        })

        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #9
    def test_simple_query(self):
        if self.not_real_service():
            return

        query = convert.unicode2utf8(convert.value2json({"from": "unittest"}))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #10
 def find_largest_key(self):
     """
     FIND LARGEST VERSION NUMBER (with dots (.) and colons (:)) IN
     THE KEYS OF AN S3 BUCKET.
     """
     with Timer("Full scan of {{bucket}} for max key",
                {"bucket": self.bucket.name}):
         maxi = 0
         for k in self.bucket.bucket.list(delimiter=":"):
             try:
                 v = key_prefix(k.name)
                 maxi = max(maxi, v)
             except Exception, e:
                 self.bucket.bucket.delete_key(k.name)
         return maxi
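key_prefix() is not shown in this example. A hypothetical version consistent with the docstring, where keys start with a numeric prefix followed by colons and dots, could look like this; the exact key format is an assumption.

import re

def key_prefix(key_name):
    # pull the leading integer out of keys such as "123:45.67" (hypothetical format)
    match = re.match(r"(\d+)[.:]", key_name + ":")
    if not match:
        raise ValueError("no numeric prefix in %r" % key_name)
    return int(match.group(1))

# key_prefix("123:45.67") -> 123, so find_largest_key() would keep 123 as maxi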
Example #11
def list_s3(settings, filter):
    """
    LIST THE KEYS AND TIMESTAMPS FOUND IN AN S3 BUCKET
    """

    with Timer("get all metadata"):
        metas = Bucket(settings).metas()

    filtered = qb.run({
        "from": metas,
        "where": filter,
        "sort": "last_modified"
    })
    for meta in filtered:
        Log.note("Read {{key}} {{timestamp}}",
                 key=meta.key,
                 timestamp=meta.last_modified)
Example #12
def process_unittest(source_key,
                     etl_header,
                     buildbot_summary,
                     unittest_log,
                     destination,
                     please_stop=None):

    timer = Timer("Process log {{file}} for {{key}}", {
        "file": etl_header.name,
        "key": source_key
    })
    try:
        with timer:
            summary = accumulate_logs(source_key, etl_header.name,
                                      unittest_log, please_stop)
    except Exception, e:
        Log.error("Problem processing {{key}}", key=source_key, cause=e)
        summary = None
Example #13
    def test_longest_running_tests(self):
        test = wrap({
            "query": {
                "sort": {
                    "sort": -1,
                    "field": "avg"
                },
                "from": {
                    "from": "unittest",
                    "where": {
                        "and": [{
                            "gt": {
                                "build.date": "1439337600"
                            }
                        }]
                    },
                    "groupby": [
                        "build.platform", "build.type", "run.suite",
                        "result.test"
                    ],
                    "select": [{
                        "aggregate": "avg",
                        "name": "avg",
                        "value": "result.duration"
                    }],
                    "format": "table",
                    "limit": 100
                },
                "limit": 100,
                "format": "list"
            }
        })
        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #14
    def copy(self,
             keys,
             source,
             sample_only_filter=None,
             sample_size=None,
             done_copy=None):
        """
        :param keys: THE KEYS TO LOAD FROM source
        :param source: THE SOURCE (USUALLY S3 BUCKET)
        :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
        :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
        :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
        :return: LIST OF SUB-keys PUSHED INTO ES
        """
        num_keys = 0
        queue = None
        for key in keys:
            timer = Timer("key")
            try:
                with timer:
                    for rownum, line in enumerate(
                            source.read_lines(strip_extension(key))):
                        if not line:
                            continue

                        row, please_stop = fix(rownum, line, source,
                                               sample_only_filter, sample_size)
                        num_keys += 1

                        if queue == None:
                            queue = self._get_queue(row)
                        queue.add(row)

                        if please_stop:
                            break
            except Exception, e:
                done_copy = None
                Log.warning(
                    "Could not process {{key}} after {{duration|round(places=2)}}seconds",
                    key=key,
                    duration=timer.duration.seconds,
                    cause=e)
Example #15
def copy2es(es, settings, work_queue, please_stop=None):
    # COPY EVERYTHING FROM THE S3 SOURCE BUCKET INTO ELASTICSEARCH
    bucket = s3.Bucket(settings.source)

    for key in iter(work_queue.pop, ""):
        if please_stop:
            return
        if key == None:
            continue

        key = unicode(key)
        extend_time = Timer("insert", silent=True)
        Log.note("Indexing {{key}}", key=key)
        with extend_time:
            if settings.sample_only:
                sample_filter = {
                    "terms": {
                        "build.branch": settings.sample_only
                    }
                }
            elif settings.sample_size:
                sample_filter = True
            else:
                sample_filter = None

            if key.find(":") >= 0:
                more_keys = bucket.keys(prefix=key)
            else:
                more_keys = bucket.keys(prefix=key + ":")
            num_keys = es.copy(more_keys, bucket, sample_filter,
                               settings.sample_size)

        if num_keys > 1:
            Log.note(
                "Added {{num}} keys from {{key}} block in {{duration}} ({{rate|round(places=3)}} keys/second)",
                num=num_keys,
                key=key,
                duration=extend_time.duration,
                rate=num_keys / Math.max(extend_time.duration.seconds, 0.01))

        work_queue.commit()
Example #16
    def test_branch_count(self):
        if self.not_real_service():
            return

        test = wrap({
            "query": {
                "from": {
                    "type": "elasticsearch",
                    "settings": {
                        "host": ES_CLUSTER_LOCATION,
                        "index": "unittest",
                        "type": "test_result"
                    }
                },
                "select": [
                    {
                        "aggregate": "count"
                    },
                ],
                "edges": ["build.branch"],
                "where": {
                    "or": [
                        {"missing": "build.id"},
                        # {"gte": {"timestamp": Date.floor(Date.now() - (Duration.DAY * 7), Duration.DAY).milli / 1000}}
                    ]
                },
                "format": "table"
            }
        })

        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #17
    def test_51586(self):
        debug_settings = {
            "trace": True,
            "cprofile": {
                "enabled": True,
                "filename": "tests/results/test_51586_profile.tab"
            }
        }
        Log.start(debug_settings)

        source_key = "51586_5124145.52"
        content = File("tests/resources/51586_5124145.52.json.gz").read_bytes()
        source = Dict(read_lines=lambda: GzipLines(content))
        with Accumulator(
                File("tests/results/51586_5124145.52.json")) as destination:
            with Timer("ETL file"):
                process_unittest_in_s3(source_key,
                                       source,
                                       destination,
                                       please_stop=None)
        Log.stop()
Example #18
def extract_rows(es, es_query, source, select, query):
    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits
    for i, s in enumerate(select.copy()):
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if s.value == "*":
            try:
                column_names = set(c.name for c in query.frum.get_columns()
                                   if (c.type not in ["object"] or c.useSource)
                                   and not c.depth)
            except Exception, e:
                Log.warning("can not get columns", e)
                column_names = UNION(*[[k for k, v in row.items()]
                                       for row in T.select(source)])
            column_names -= set(select.name)
            select = select[:i:] + [{
                "name": n,
                "value": n
            } for n in column_names] + select[i + 1::]
            break
Example #19
    def write_lines(self, key, lines):
        self._verify_key_format(key)
        storage = self.bucket.new_key(key + ".json.gz")

        buff = TemporaryFile()
        archive = gzip.GzipFile(fileobj=buff, mode='w')
        count = 0
        for l in lines:
            if hasattr(l, "__iter__"):
                for ll in l:
                    archive.write(ll.encode("utf8"))
                    archive.write(b"\n")
                    count += 1
            else:
                archive.write(l.encode("utf8"))
                archive.write(b"\n")
                count += 1
        archive.close()
        file_length = buff.tell()

        retry = 3
        while retry:
            try:
                with Timer(
                        "Sending {{count}} lines in {{file_length|comma}} bytes",
                    {
                        "file_length": file_length,
                        "count": count
                    },
                        debug=self.settings.debug):
                    buff.seek(0)
                    storage.set_contents_from_file(buff)
                break
            except Exception, e:
                Log.warning("could not push data to s3", cause=e)
                retry -= 1
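The retry loop above is a small, reusable pattern: attempt the upload, warn and retry on failure, give up after three tries. Below is a generic sketch of it; note that the example above only warns when all attempts fail, while this sketch re-raises the last error, and the helper name and the pause between attempts are assumptions, not from the library.

import time

def retry_call(action, attempts=3, warn=None, pause=1.0):
    last_error = None
    for _ in range(attempts):
        try:
            return action()
        except Exception as e:            # the examples use Python 2's "except Exception, e"
            last_error = e
            if warn:
                warn(e)
            time.sleep(pause)             # assumed: brief pause between attempts
    raise last_error                      # unlike the example, surface the final failure

# retry_call(lambda: storage.set_contents_from_file(buff),
#            warn=lambda e: Log.warning("could not push data to s3", cause=e))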
Example #20
    def test_failures_by_directory(self):
        if self.not_real_service():
            return

        test = wrap({
            "query": {
                "from": {
                    "type": "elasticsearch",
                    "settings": {
                        "host": ES_CLUSTER_LOCATION,
                        "index": "unittest",
                        "type": "test_result"
                    }
                },
                "select": [{
                    "aggregate": "count"
                }],
                "edges": ["result.test", "result.ok"],
                "where": {
                    "prefix": {
                        "result.test": "/"
                    }
                },
                "format": "table"
            }
        })

        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #21
def pull_repo(repo):
    if not File(os.path.join(repo.directory, ".hg")).exists:
        File(repo.directory).delete()

        # REPO DOES NOT EXIST, CLONE IT
        with Timer("Clone hg log for {{name}}", {"name": repo.name}):
            proc = subprocess.Popen(
                ["hg", "clone", repo.url,
                 File(repo.directory).filename],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=-1)
            try:
                while True:
                    line = proc.stdout.readline()
                    if line.startswith("abort:"):
                        Log.error(
                            "Can not clone {{repos.url}}, because {{problem}}",
                            {
                                "repos": repo,
                                "problem": line
                            })
                    if line == '':
                        break
                    Log.note("Mercurial cloning: {{status}}", {"status": line})
            finally:
                proc.wait()

    else:
        hgrc_file = File(os.path.join(repo.directory, ".hg", "hgrc"))
        if not hgrc_file.exists:
            hgrc_file.write("[paths]\ndefault = " + repo.url + "\n")

        # REPO EXISTS, PULL TO UPDATE
        with Timer("Pull hg log for {{name}}", {"name": repo.name}):
            proc = subprocess.Popen(
                ["hg", "pull", "--cwd",
                 File(repo.directory).filename],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                bufsize=-1)
            (output, _) = proc.communicate()

            if output.find("abort: repository default not found!") >= 0:
                File(repo.directory).delete()
                pull_repo(repo)
                return
            if output.find("abort: abandoned transaction found") >= 0:
                Log.error(
                    "Problem pulling repos, try \"hg recover\"\n{{reason|indent}}",
                    {"reason": output})
                File(repo.directory).delete()
                pull_repo(repo)
                return
            if output.find("abort: ") >= 0:
                Log.error("Problem with pull {{reason}}",
                          {"reason": between(output, "abort:", "\n")})

            Log.note("Mercurial pull results:\n{{pull_results}}",
                     {"pull_results": output})
Example #22
def extract_rows(es, es_query, query):
    is_list = isinstance(query.select, list)
    select = wrap([s.copy() for s in listwrap(query.select)])
    new_select = DictList()
    columns = query.frum.get_columns()
    leaf_columns = set(c.name for c in columns if c.type not in ["object", "nested"] and (not c.nested_path or c.es_column == c.nested_path))
    nested_columns = set(c.name for c in columns if c.nested_path)

    i = 0
    source = "fields"
    for s in select:
        # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    es_query.fields = None
                    source = "_source"

                    net_columns = leaf_columns - set(select.name)
                    for n in net_columns:
                        new_select.append({
                            "name": n,
                            "value": n,
                            "put": {"name": n, "index": i, "child": "."}
                        })
                        i += 1
                else:
                    parent = s.value.var + "."
                    prefix = len(parent)
                    for c in leaf_columns:
                        if c.startswith(parent):
                            if es_query.fields is not None:
                                es_query.fields.append(c)

                            new_select.append({
                                "name": s.name + "." + c[prefix:],
                                "value": c,
                                "put": {"name": s.name + "." + c[prefix:], "index": i, "child": "."}
                            })
                            i += 1

        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            elif s.value.var in nested_columns:
                es_query.fields = None
                source = "_source"

                new_select.append({
                    "name": s.name,
                    "value": s.value,
                    "put": {"name": s.name, "index": i, "child": "."}
                })
                i += 1
            else:
                parent = s.value.var + "."
                prefix = len(parent)
                net_columns = [c for c in leaf_columns if c.startswith(parent)]
                if not net_columns:
                    # LEAF
                    if es_query.fields is not None:
                        es_query.fields.append(s.value.var)
                    new_select.append({
                        "name": s.name,
                        "value": s.value,
                        "put": {"name": s.name, "index": i, "child": "."}
                    })
                else:
                    # LEAVES OF OBJECT
                    for n in net_columns:
                        if es_query.fields is not None:
                            es_query.fields.append(n)
                        new_select.append({
                            "name": s.name,
                            "value": n,
                            "put": {"name": s.name, "index": i, "child": n[prefix:]}
                        })
                i += 1
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": s.value.to_ruby()}
            new_select.append({
                "name": s.name,
                "pull": "fields." + literal_field(s.name),
                "put": {"name": s.name, "index": i, "child": "."}
            })
            i += 1

    for n in new_select:
        if n.pull:
            continue
        if source == "_source":
            n.pull = join_field(["_source"] + split_field(n.value))
        else:
            n.pull = "fields." + literal_field(n.value)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    T = data.hits.hits

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        Log.error("problem formatting", e)
Example #23
def es_aggsop(es, frum, query):
    select = wrap([s.copy() for s in listwrap(query.select)])
    es_column_map = {c.name: unwraplist(c.es_column) for c in frum.schema.all_columns}

    es_query = Dict()
    new_select = Dict()  #MAP FROM canonical_name (USED FOR NAMES IN QUERY) TO SELECT MAPPING
    formula = []
    for s in select:
        if s.aggregate == "count" and isinstance(s.value, Variable) and s.value.var == ".":
            s.pull = "doc_count"
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                if frum.typed:
                    # STATISTICAL AGGS IMPLY $value, WHILE OTHERS CAN BE ANYTHING
                    if s.aggregate in NON_STATISTICAL_AGGS:
                        #TODO: HANDLE BOTH $value AND $objects TO COUNT
                        Log.error("do not know how to handle")
                    else:
                        s.value.var = "$value"
                        new_select["$value"] += [s]
                else:
                    if s.aggregate in NON_STATISTICAL_AGGS:
                        #TODO:  WE SHOULD BE ABLE TO COUNT, BUT WE MUST *OR* ALL LEAF VALUES TO DO IT
                        Log.error("do not know how to handle")
                    else:
                        Log.error('Not expecting ES to have a value at "." which {{agg}} can be applied', agg=s.aggregate)
            elif s.aggregate == "count":
                s.value = s.value.map(es_column_map)
                new_select["count_"+literal_field(s.value.var)] += [s]
            else:
                s.value = s.value.map(es_column_map)
                new_select[literal_field(s.value.var)] += [s]
        else:
            formula.append(s)

    for canonical_name, many in new_select.items():
        representative = many[0]
        if representative.value.var == ".":
            Log.error("do not know how to handle")
        else:
            field_name = representative.value.var

        # canonical_name=literal_field(many[0].name)
        for s in many:
            if s.aggregate == "count":
                es_query.aggs[literal_field(canonical_name)].value_count.field = field_name
                s.pull = literal_field(canonical_name) + ".value"
            elif s.aggregate == "median":
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [50]
                s.pull = key + ".values.50\.0"
            elif s.aggregate == "percentile":
                # ES USES DIFFERENT METHOD FOR PERCENTILES
                key = literal_field(canonical_name + " percentile")
                if isinstance(s.percentile, basestring) or s.percentile < 0 or 1 < s.percentile:
                    Log.error("Expecting percentile to be a float from 0.0 to 1.0")
                percent = Math.round(s.percentile * 100, decimal=6)

                es_query.aggs[key].percentiles.field = field_name
                es_query.aggs[key].percentiles.percents += [percent]
                s.pull = key + ".values." + literal_field(unicode(percent))
            elif s.aggregate == "cardinality":
                # ES USES DIFFERENT METHOD FOR CARDINALITY
                key = literal_field(canonical_name + " cardinality")

                es_query.aggs[key].cardinality.field = field_name
                s.pull = key + ".value"
            elif s.aggregate == "stats":
                # REGULAR STATS
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].extended_stats.field = field_name

                # GET MEDIAN TOO!
                median_name = literal_field(canonical_name + " percentile")
                es_query.aggs[median_name].percentiles.field = field_name
                es_query.aggs[median_name].percentiles.percents += [50]

                s.pull = {
                    "count": stats_name + ".count",
                    "sum": stats_name + ".sum",
                    "min": stats_name + ".min",
                    "max": stats_name + ".max",
                    "avg": stats_name + ".avg",
                    "sos": stats_name + ".sum_of_squares",
                    "std": stats_name + ".std_deviation",
                    "var": stats_name + ".variance",
                    "median": median_name + ".values.50\.0"
                }
            elif s.aggregate == "union":
                # USE TERMS AGGREGATE TO SIMULATE union
                stats_name = literal_field(canonical_name)
                es_query.aggs[stats_name].terms.field = field_name
                es_query.aggs[stats_name].terms.size = Math.min(s.limit, MAX_LIMIT)
                s.pull = stats_name + ".buckets.key"
            else:
                # PULL VALUE OUT OF THE stats AGGREGATE
                es_query.aggs[literal_field(canonical_name)].extended_stats.field = field_name
                s.pull = literal_field(canonical_name) + "." + aggregates1_4[s.aggregate]

    for i, s in enumerate(formula):
        canonical_name = literal_field(s.name)
        abs_value = s.value.map(es_column_map)

        if s.aggregate == "count":
            es_query.aggs[literal_field(canonical_name)].value_count.script = abs_value.to_ruby()
            s.pull = literal_field(canonical_name) + ".value"
        elif s.aggregate == "median":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [50]
            s.pull = key + ".values.50\.0"
        elif s.aggregate == "percentile":
            # ES USES DIFFERENT METHOD FOR PERCENTILES THAN FOR STATS AND COUNT
            key = literal_field(canonical_name + " percentile")
            percent = Math.round(s.percentile * 100, decimal=6)

            es_query.aggs[key].percentiles.script = abs_value.to_ruby()
            es_query.aggs[key].percentiles.percents += [percent]
            s.pull = key + ".values." + literal_field(unicode(percent))
        elif s.aggregate == "cardinality":
            # ES USES DIFFERENT METHOD FOR CARDINALITY
            key = canonical_name + " cardinality"

            es_query.aggs[key].cardinality.script = abs_value.to_ruby()
            s.pull = key + ".value"
        elif s.aggregate == "stats":
            # REGULAR STATS
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].extended_stats.script = abs_value.to_ruby()

            # GET MEDIAN TOO!
            median_name = literal_field(canonical_name + " percentile")
            es_query.aggs[median_name].percentiles.script = abs_value.to_ruby()
            es_query.aggs[median_name].percentiles.percents += [50]

            s.pull = {
                "count": stats_name + ".count",
                "sum": stats_name + ".sum",
                "min": stats_name + ".min",
                "max": stats_name + ".max",
                "avg": stats_name + ".avg",
                "sos": stats_name + ".sum_of_squares",
                "std": stats_name + ".std_deviation",
                "var": stats_name + ".variance",
                "median": median_name + ".values.50\.0"
            }
        elif s.aggregate=="union":
            # USE TERMS AGGREGATE TO SIMULATE union
            stats_name = literal_field(canonical_name)
            es_query.aggs[stats_name].terms.script_field = abs_value.to_ruby()
            s.pull = stats_name + ".buckets.key"
        else:
            # PULL VALUE OUT OF THE stats AGGREGATE
            s.pull = canonical_name + "." + aggregates1_4[s.aggregate]
            es_query.aggs[canonical_name].extended_stats.script = abs_value.to_ruby()

    decoders = get_decoders_by_depth(query)
    start = 0

    vars_ = query.where.vars()

    #<TERRIBLE SECTION> THIS IS WHERE WE WEAVE THE where CLAUSE WITH nested
    split_where = split_expression_by_depth(query.where, schema=frum, map_=es_column_map)

    if len(split_field(frum.name)) > 1:
        if any(split_where[2::]):
            Log.error("Where clause is too deep")

        for d in decoders[1]:
            es_query = d.append_query(es_query, start)
            start += d.num_columns

        if split_where[1]:
            #TODO: INCLUDE FILTERS ON EDGES
            filter_ = simplify_esfilter(AndOp("and", split_where[1]).to_esfilter())
            es_query = Dict(
                aggs={"_filter": set_default({"filter": filter_}, es_query)}
            )

        es_query = wrap({
            "aggs": {"_nested": set_default(
                {
                    "nested": {
                        "path": frum.query_path
                    }
                },
                es_query
            )}
        })
    else:
        if any(split_where[1::]):
            Log.error("Where clause is too deep")

    for d in decoders[0]:
        es_query = d.append_query(es_query, start)
        start += d.num_columns

    if split_where[0]:
        #TODO: INCLUDE FILTERS ON EDGES
        filter = simplify_esfilter(AndOp("and", split_where[0]).to_esfilter())
        es_query = Dict(
            aggs={"_filter": set_default({"filter": filter}, es_query)}
        )
    # </TERRIBLE SECTION>

    if not es_query:
        es_query = wrap({"query": {"match_all": {}}})

    es_query.size = 0

    with Timer("ES query time") as es_duration:
        result = es09.util.post(es, es_query, query.limit)

    try:
        format_time = Timer("formatting")
        with format_time:
            decoders = [d for ds in decoders for d in ds]
            result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)  # IT APPEARS THE OLD doc_count IS GONE

            formatter, groupby_formatter, aggop_formatter, mime_type = format_dispatch[query.format]
            if query.edges:
                output = formatter(decoders, result.aggregations, start, query, select)
            elif query.groupby:
                output = groupby_formatter(decoders, result.aggregations, start, query, select)
            else:
                output = aggop_formatter(decoders, result.aggregations, start, query, select)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception, e:
        if query.format not in format_dispatch:
            Log.error("Format {{format|quote}} not supported yet", format=query.format, cause=e)
        Log.error("Some problem", e)
Example #24
    def _parse_properties(self, abs_index, properties, meta):
        abs_columns = _elasticsearch.parse_properties(abs_index, None,
                                                      properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.")
            and not r.es_column.startswith("previous_values.cf_")
            and not r.es_index.startswith("debug"))
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):

            def add_column(c, query_path):
                c.last_updated = Date.now()
                if query_path:
                    c.table = c.es_index + "." + query_path.last()
                else:
                    c.table = c.es_index

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        if query_path:
                            c.table = alias + "." + query_path.last()
                        else:
                            c.table = alias
                        self._upsert_column(c)

            # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL
            query_paths = wrap([[c.es_column] for c in abs_columns
                                if c.type == "nested"])
            for a, b in itertools.product(query_paths, query_paths):
                aa = a.last()
                bb = b.last()
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) < len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(0, aa)
                        break
            query_paths.append([])

            for c in abs_columns:
                # ADD RELATIVE COLUMNS
                full_path = listwrap(c.nested_path)
                abs_depth = len(full_path)
                abs_parent = coalesce(full_path.last(), "")
                for query_path in query_paths:
                    rel_depth = len(query_path)

                    # ABSOLUTE
                    add_column(copy(c), query_path)
                    cc = copy(c)
                    cc.relative = True

                    if not query_path:
                        add_column(cc, query_path)
                        continue

                    rel_parent = query_path.last()

                    if c.es_column.startswith(rel_parent + "."):
                        cc.name = c.es_column[len(rel_parent) + 1:]
                        add_column(cc, query_path)
                    elif c.es_column == rel_parent:
                        cc.name = "."
                        add_column(cc, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        cc.name = "." + ("." *
                                         (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent.startswith(abs_parent + "."):
                        cc.name = "." + ("." *
                                         (rel_depth - abs_depth)) + c.es_column
                        add_column(cc, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")
Example #25
    def _get_job_results_from_th(self, branch, revision):
        output = []

        with self.locker:
            waiting_threads = self.pending.get((branch, revision))
            if waiting_threads is None:
                sig = None
                waiting_threads = self.pending[(branch, revision)] = [output]
            else:
                sig = Signal()
                waiting_threads.append(sig)

        if sig is not None:
            Log.note("Holding thread for {{branch}}/{{revision}}",
                     branch=branch,
                     revision=revision)
            sig.wait_for_go()
            return waiting_threads[0]

        try:
            results = DictList()
            while True:
                response = self._rate_limited_get_json(
                    expand_template(RESULT_SET_URL, {
                        "branch": branch,
                        "revision": revision[0:12:]
                    }))
                results.extend(response.results)
                if len(response.results) != 1000:
                    break

            for g, repo_ids in jx.groupby(results.id, size=10):
                jobs = DictList()
                with Timer("Get {{num}} jobs", {"num": len(repo_ids)},
                           debug=DEBUG):
                    while True:
                        response = self._rate_limited_get_json(
                            expand_template(
                                JOBS_URL, {
                                    "branch": branch,
                                    "offset": len(jobs),
                                    "result_set_id": ",".join(map(unicode, repo_ids))
                                }))
                        jobs.extend(response.results)
                        if len(response.results) != 2000:
                            break

                with Timer("Get (up to {{num}}) details from TH",
                           {"num": len(jobs)},
                           debug=DEBUG):
                    details = []
                    for _, ids in jx.groupby(jobs.id, size=40):
                        details.extend(
                            self._rate_limited_get_json(url=expand_template(
                                DETAILS_URL, {
                                    "branch": branch,
                                    "job_id": ",".join(map(unicode, ids))
                                }),
                                                        retry={
                                                            "times": 3
                                                        }).results)
                    details = {
                        k.job_guid: list(v)
                        for k, v in jx.groupby(details, "job_guid")
                    }

                with Timer("Get (up to {{num}}) stars from TH",
                           {"num": len(jobs)},
                           debug=DEBUG):
                    stars = []
                    for _, ids in jx.groupby(jobs.id, size=40):
                        response = self._rate_limited_get_json(
                            expand_template(
                                JOB_BUG_MAP, {
                                    "branch": branch,
                                    "job_id": "&job_id=".join(map(
                                        unicode, ids))
                                }))
                        stars.extend(response)
                    stars = {
                        k.job_id: list(v)
                        for k, v in jx.groupby(stars, "job_id")
                    }

                with Timer("Get notes from TH", debug=DEBUG):
                    notes = []
                    for jid in set([
                            j.id
                            for j in jobs if j.failure_classification_id != 1
                    ] + stars.keys()):
                        response = self._rate_limited_get_json(
                            expand_template(NOTES_URL, {
                                "branch": branch,
                                "job_id": unicode(jid)
                            }))
                        notes.extend(response)
                    notes = {
                        k.job_id: list(v)
                        for k, v in jx.groupby(notes, "job_id")
                    }

                for j in jobs:
                    output.append(
                        self._normalize_job_result(branch, revision, j,
                                                   details, notes, stars))

            if output:
                with Timer("Write to ES cache", debug=DEBUG):
                    self.cache.extend(
                        {
                            "id": "-".join([c.repo.branch,
                                            unicode(c.job.id)]),
                            "value": c
                        } for c in output)
                    try:
                        self.cache.flush()
                    except Exception, e:
                        Log.warning("problem flushing. nevermind.", cause=e)
        finally:
            with self.locker:
                for p in waiting_threads[1:]:
                    if DEBUG:
                        Log.note(
                            "releasing thread for {{branch}}/{{revision}}",
                            branch=branch,
                            revision=revision)
                    p.go()
                self.pending[(branch, revision)] = None

        return output
Example #26
def main():
    """
    CLEAR OUT KEYS FROM BUCKET BY RANGE, OR BY FILE
    """
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--bucket"],
            "help": "bucket to reprocess",
            "type": str,
            "dest": "bucket",
            "required": True
        }, {
            "name": ["--begin", "--start"],
            "help": "lowest key (or prefix) to reprocess",
            "type": str,
            "dest": "start",
            "default": "1",
            "required": False
        }, {
            "name": ["--end", "--stop"],
            "help": "highest key (or prefix) to reprocess",
            "type": str,
            "dest": "end",
            "default": None,
            "required": False
        }, {
            "name": ["--file"],
            "help": "path to file with CR-delimited prefix list",
            "type": str,
            "dest": "file",
            "default": None,
            "required": False
        }])
        Log.start(settings.debug)

        with aws.Queue(settings.work_queue) as work_queue:
            source = Connection(settings.aws).get_bucket(settings.args.bucket)

            if settings.args.file:
                now = Date.now()
                for prefix in File(settings.args.file):
                    all_keys = source.keys(prefix=key_prefix(prefix))
                    for k in all_keys:
                        Log.note("Adding {{key}}", key=k)
                        work_queue.add({
                            "bucket": settings.args.bucket,
                            "key": k,
                            "timestamp": now.unix,
                            "date/time": now.format()
                        })
                return

            if settings.args.end and settings.args.start:
                up_to = str(int(settings.args.end) - 1)
                prefix = strings.common_prefix(settings.args.start, up_to)
            else:
                prefix = None
            start = Version(settings.args.start)
            end = Version(settings.args.end)

            all_keys = source.keys(prefix=prefix)
            with Timer("filtering {{num}} keys", {"num": len(all_keys)}):
                all_keys = [(k, Version(k)) for k in all_keys
                            if k.find("None") == -1]
                all_keys = [(k, p) for k, p in all_keys if start <= p < end]
            with Timer("sorting {{num}} keys", {"num": len(all_keys)}):
                all_keys = qb.sort(all_keys, 1)
            for k, p in all_keys:
                Log.note("Adding {{key}}", key=k)
                now = Date.now()
                work_queue.add({
                    "bucket": settings.args.bucket,
                    "key": k,
                    "timestamp": now.unix,
                    "date/time": now.format()
                })

    except Exception, e:
        Log.error("Problem with etl", e)
Example #27
def process_pulse_block(source_key, source, destination, please_stop=None):
    """
    SIMPLE CONVERT pulse_block INTO S3 LOGFILES
    PREPEND WITH ETL HEADER AND PULSE ENVELOPE
    """
    output = []
    stats = Dict()
    etl_header_gen = EtlHeadGenerator(source_key)

    for i, line in enumerate(source.read_lines()):
        if please_stop:
            Log.error("Stopping early")

        pulse_record = scrub_pulse_record(source_key, i, line, stats)
        if not pulse_record:
            continue

        if DEBUG or DEBUG_SHOW_LINE:
            Log.note(
                "Source {{key}}, line {{line}}, buildid = {{buildid|quote}}",
                key=source_key,
                line=i,
                buildid=pulse_record.payload.builddate)

        file_num = 0
        for name, url in pulse_record.payload.blobber_files.items():
            try:
                if url == None:
                    if DEBUG:
                        Log.note(
                            "Line {{line}}: found structured log with NULL url",
                            line=i)
                    continue

                log_content, num_lines = verify_blobber_file(i, name, url)
                if not log_content:
                    continue

                with Timer(
                        "Copied {{line}}, {{name}} with {{num_lines}} lines", {
                            "line": i,
                            "name": name,
                            "num_lines": num_lines
                        },
                        debug=DEBUG):
                    dest_key, dest_etl = etl_header_gen.next(
                        pulse_record.payload.etl, name)

                    destination.write_lines(
                        dest_key,
                        convert.value2json(dest_etl),  # ETL HEADER
                        line,  # PULSE MESSAGE
                        log_content)
                    file_num += 1
                    output.append(dest_key)

                    if DEBUG_SHOW_LINE:
                        Log.note("Copied {{key}}: {{url}}",
                                 key=dest_key,
                                 url=url)
            except Exception, e:
                Log.error("Problem processing {{name}} = {{url}}",
                          name=name,
                          url=url,
                          cause=e)

        if not file_num and DEBUG_SHOW_NO_LOG:
            Log.note("No structured log {{json}}", json=pulse_record.payload)
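As the docstring says, each blobber file becomes one S3 object whose first line is the ETL header, whose second line is the original pulse message, and whose remaining lines are the log content, matching the destination.write_lines(...) call above. A labeled illustration of that layout follows; every value below is made up for illustration, not real output.

s3_object_lines = [
    '{"id": 0, "source": {"name": "mochitest-1_raw.log"}}',         # ETL HEADER (dest_etl, as JSON)
    '{"payload": {"builddate": "20150811", "blobber_files": {}}}',  # PULSE MESSAGE (the raw line)
    '{"action": "test_start", "test": "some_test.js"}',             # LOG CONTENT, one line per record
]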
Example #28
def verify_blobber_file(line_number, name, url):
    """
    :param line_number:  for debugging
    :param name:  for debugging
    :param url:  TO BE READ
    :return:  RETURNS BYTES **NOT** UNICODE
    """
    if name in ["emulator-5554.log", "qemu.log"] or any(
            map(name.endswith, [".png", ".html"])):
        return None, 0

    with Timer("Read {{name}}: {{url}}", {
            "name": name,
            "url": url
    },
               debug=DEBUG):
        response = http.get(url)
        try:
            logs = response.all_lines
        except Exception, e:
            if name.endswith("_raw.log"):
                Log.error(
                    "Line {{line}}: {{name}} = {{url}} is NOT structured log",
                    line=line_number,
                    name=name,
                    url=url,
                    cause=e)
            if DEBUG:
                Log.note(
                    "Line {{line}}: {{name}} = {{url}} is NOT structured log",
                    line=line_number,
                    name=name,
                    url=url)
            return None, 0

    if any(name.endswith(e) for e in STRUCTURED_LOG_ENDINGS):
        # FAST TRACK THE FILES WE SUSPECT TO BE STRUCTURED LOGS ALREADY
        return logs, "unknown"

    # DETECT IF THIS IS A STRUCTURED LOG
    with Timer("Structured log detection {{name}}:", {"name": name},
               debug=DEBUG):
        try:
            total = 0  # ENSURE WE HAVE A SIDE EFFECT
            count = 0
            bad = 0
            for blobber_line in logs:
                blobber_line = strings.strip(blobber_line)
                if not blobber_line:
                    continue

                try:
                    total += len(convert.json2value(blobber_line))
                    count += 1
                except Exception, e:
                    if DEBUG:
                        Log.note("Not JSON: {{line}}",
Example #29
    def __init__(self, dim, parent, qb):
        self.name = dim.name
        self.parent = parent
        self.full_name = join_field(
            split_field(self.parent.full_name) + [self.name])
        dot.set_default(self, dim)
        self.esfilter = dim.esfilter
        self.type = coalesce(dim.type, "set")
        self.limit = coalesce(dim.limit, DEFAULT_QUERY_LIMIT)
        self.index = coalesce(dim.index,
                              coalesce(parent, Null).index,
                              qb.es.settings.name)

        if not self.index:
            Log.error("Expecting an index name")

        # ALLOW ACCESS TO SUB-PART BY NAME (IF ONLY THERE IS NO NAME COLLISION)
        self.edges = Dict()
        for e in listwrap(dim.edges):
            new_e = Dimension(e, self, qb)
            self.edges[new_e.full_name] = new_e

        self.partitions = wrap(coalesce(dim.partitions, []))
        parse_partition(self)

        fields = coalesce(dim.field, dim.fields)
        if not fields:
            return  # NO FIELDS TO SEARCH
        elif isinstance(fields, Mapping):
            self.fields = wrap(fields)
            edges = wrap([{
                "name": k,
                "value": v,
                "allowNulls": False
            } for k, v in self.fields.items()])
        else:
            self.fields = listwrap(fields)
            edges = wrap([{
                "name": f,
                "value": f,
                "index": i,
                "allowNulls": False
            } for i, f in enumerate(self.fields)])

        if dim.partitions:
            return  # ALREADY HAVE PARTS
        if dim.type not in KNOWN - ALGEBRAIC:
            return  # PARTS OR TOO FUZZY (OR TOO NUMEROUS) TO FETCH

        with Timer("Get parts of {{name}}", {"name": self.name}):
            parts = qb.query({
                "from": self.index,
                "select": {
                    "name": "count",
                    "aggregate": "count"
                },
                "edges": edges,
                "esfilter": self.esfilter,
                "limit": self.limit
            })
            Log.note("{{name}} has {{num}} parts",
                     name=self.name,
                     num=len(parts))

        d = parts.edges[0].domain

        if dim.path:
            if len(edges) > 1:
                Log.error("Not supported yet")
            # EACH TERM RETURNED IS A PATH INTO A PARTITION TREE
            temp = Dict(partitions=[])
            for i, count in enumerate(parts):
                a = dim.path(d.getEnd(d.partitions[i]))
                if not isinstance(a, list):
                    Log.error("The path function on " + dim.name +
                              " must return an ARRAY of parts")
                addParts(temp, dim.path(d.getEnd(d.partitions[i])), count, 0)
            self.value = coalesce(dim.value, "name")
            self.partitions = temp.partitions
        elif isinstance(fields, Mapping):
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

            partitions = DictList()
            for g, p in parts.groupby(edges):
                if p:
                    partitions.append({
                        "value": g,
                        "esfilter": {
                            "and": [{
                                "term": {
                                    e.value: g[e.name]
                                }
                            } for e in edges]
                        },
                        "count": int(p)
                    })
            self.partitions = partitions
        elif len(edges) == 1:
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS

            # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
            self.partitions = wrap([
                {
                    "name": str(d.partitions[i].name),  # CONVERT TO STRING
                    "value": d.getEnd(d.partitions[i]),
                    "esfilter": {
                        "term": {
                            edges[0].value: d.partitions[i].value
                        }
                    },
                    "count": count
                } for i, count in enumerate(parts)
            ])
            self.order = {p.value: i for i, p in enumerate(self.partitions)}
        elif len(edges) == 2:
            self.value = "name"  # USE THE "name" ATTRIBUTE OF PARTS
            d2 = parts.edges[1].domain

            # SIMPLE LIST OF PARTS RETURNED, BE SURE TO INTERRELATE THEM
            array = parts.data.values()[0].cube  # DIG DEEP INTO RESULT (ASSUME SINGLE VALUE CUBE, WITH NULL AT END)

            def edges2value(*values):
                if isinstance(fields, Mapping):
                    output = Dict()
                    for e, v in zip(edges, values):
                        output[e.name] = v
                    return output
                else:
                    return tuple(values)

            self.partitions = wrap([
                {
                    "name": str(d.partitions[i].name),  # CONVERT TO STRING
                    "value": d.getEnd(d.partitions[i]),
                    "esfilter": {"term": {edges[0].value: d.partitions[i].value}},
                    "count": SUM(subcube),
                    "partitions": [
                        {
                            "name": str(d2.partitions[j].name),  # CONVERT TO STRING
                            "value": edges2value(d.getEnd(d.partitions[i]),
                                                 d2.getEnd(d2.partitions[j])),
                            "esfilter": {"and": [
                                {"term": {edges[0].value: d.partitions[i].value}},
                                {"term": {edges[1].value: d2.partitions[j].value}}
                            ]},
                            "count": count2
                        } for j, count2 in enumerate(subcube)
                        if count2 > 0  # ONLY INCLUDE PROPERTIES THAT EXIST
                    ]
                } for i, subcube in enumerate(array)
            ])
        else:
            Log.error("Not supported")

        parse_partition(self)  # RELATE THE PARTS TO THE PARENTS
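
A minimal usage sketch for the Dimension constructor above (illustrative only: the config values and the already-configured query container qb are assumptions, not part of the original example):

platform_dim = wrap({
    "name": "platform",            # hypothetical dimension name
    "type": "set",                 # the constructor's default type
    "field": "build.platform"      # hypothetical field; a single field becomes one query edge
})
platform = Dimension(platform_dim, Null, qb)  # qb is assumed to supply the default index and run the part-count query
Log.note("{{name}} has {{num}} parts", name=platform.name, num=len(platform.partitions))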
Ejemplo n.º 30
0
class Sqlite(object):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, db=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if not _upgraded:
            _upgrade()

        self.db = None
        self.queue = Queue("sql commands")  # HOLD (command, result, signal, stacktrace) TUPLES
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Dict()
        self.queue.add((command, result, signal, None))
        signal.wait_for_go()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(':memory:')
            try:
                full_path = File(
                    "pyLibrary/vendor/sqlite/libsqlitefunctions.so").abspath
                # self.db.execute("SELECT sqlite3_enable_load_extension(1)")
                self.db.enable_load_extension(True)
                self.db.execute("SELECT load_extension('" + full_path + "')")
            except Exception, e:
                Log.warning(
                    "loading sqlite extension functions failed, doing without. (no SQRT for you!)",
                    cause=e)

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop()
                if DEBUG:
                    Log.note("done pop")

                if DEBUG:
                    Log.note("Running command\n{{command|indent}}",
                             command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(
                                ERROR,
                                "Problem with\n{{command|indent}}",
                                command=command,
                                cause=e)
                        finally: