Code example #1
File: aggs.py  Project: davehunt/ActiveData
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum
    output = DictList()
    for e in wrap(coalesce(query.edges, query.groupby, [])):
        if e.value != None and not isinstance(e.value, NullOp):
            e = e.copy()
            vars_ = e.value.vars()

            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)

            e.value = e.value.map({schema[v].name: schema[v].es_column for v in vars_})
        elif e.range:
            e = e.copy()
            min_ = e.range.min
            max_ = e.range.max
            vars_ = min_.vars() | max_.vars()

            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)

            map_ = {schema[v].name: schema[v].es_column for v in vars_}
            e.range = {
                "min": min_.map(map_),
                "max": max_.map(map_)
            }
        elif e.domain.dimension:
            vars_ = e.domain.dimension.fields
            e.domain.dimension = e.domain.dimension.copy()
            e.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(e.domain.partitions.where):
            vars_ = set()
            for p in e.domain.partitions:
                vars_ |= p.where.vars()

        try:
            depths = set(len(schema[v].nested_path)-1 for v in vars_)
            if -1 in depths:
                Log.error(
                    "Do not know of column {{column}}",
                    column=unwraplist([v for v in vars_ if schema[v]==None])
                )
            if len(depths) > 1:
                Log.error("expression {{expr}} spans tables, can not handle", expr=e.value)
            max_depth = Math.MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception:
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            # (THE EXCEPTION IS NOT BOUND, SO IT CANNOT SHADOW THE EDGE e)
            max_depth = 0
            output.append([])

        limit = 0
        output[max_depth].append(AggsDecoder(e, query, limit))
    return output
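
In plain terms, get_decoders_by_depth drops one AggsDecoder per edge into output[depth], growing the outer list on demand so there is one inner list per nesting depth. Below is a minimal, self-contained sketch of that bucketing pattern; depth_of and the plain strings are hypothetical stand-ins for the schema lookup (len(schema[v].nested_path) - 1) and for AggsDecoder.

def group_by_depth(edges, depth_of):
    output = []                      # output[d] collects the edges whose variables live at depth d
    for e in edges:
        d = depth_of(e)              # stands in for len(schema[v].nested_path) - 1
        while len(output) <= d:      # grow the outer list until it covers this depth
            output.append([])
        output[d].append(e)
    return output

# Example: "a" and "c" are top-level, "b.x" is one nesting level deep.
print(group_by_depth(["a", "b.x", "c"], lambda e: e.count(".")))
# -> [['a', 'c'], ['b.x']]
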
Code example #2
def dictwrap(v):
    type_ = _get(v, "__class__")

    if type_ is dict:
        m = Dict()
        _set(m, "_dict", v)  # INJECT m.__dict__=v SO THERE IS NO COPY
        return m
    elif type_ is NoneType:
        return None  # So we allow `is None`
    elif type_ is list:
        return DictList(v)
    elif type_ is GeneratorType:
        return (wrap(vv) for vv in v)
    elif isinstance(v, (basestring, int, float, Decimal, datetime, date, Dict,
                        DictList, NullType, NoneType)):
        return v
    else:
        return DictObject(v)
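
Dispatching on __class__ (rather than isinstance) keeps the common dict/list/None cases fast, and injecting the original dict into the wrapper means no copy is made. A rough usage sketch, assuming dictwrap and Dict behave as in pyLibrary's dot module (attribute-style access over the same underlying dict):

# Hedged usage sketch: Dict exposes the SAME underlying dict through
# attribute access, so no data is copied on wrap.
data = {"build": {"revision": "abc123"}}
d = dictwrap(data)
assert d.build.revision == "abc123"   # dot access instead of data["build"]["revision"]
assert dictwrap(None) is None         # NoneType passes through, so `is None` still works
assert dictwrap(7) == 7               # scalars are returned untouched
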
Code example #3
    def _get_job_results_from_th(self, branch, revision):
        output = []

        with self.locker:
            waiting_threads = self.pending.get((branch, revision))
            if waiting_threads is None:
                sig = None
                waiting_threads = self.pending[(branch, revision)] = [output]
            else:
                sig = Signal()
                waiting_threads.append(sig)  # APPEND THE SAME SIGNAL THIS THREAD WILL WAIT ON

        if sig is not None:
            Log.note("Holding thread for {{branch}}/{{revision}}",
                     branch=branch,
                     revision=revision)
            sig.wait_for_go()
            return waiting_threads[0]

        try:
            results = DictList()
            while True:
                response = self._rate_limited_get_json(
                    expand_template(RESULT_SET_URL, {
                        "branch": branch,
                        "revision": revision[0:12:]
                    }))
                results.extend(response.results)
                if len(response.results) != 1000:
                    break

            for g, repo_ids in jx.groupby(results.id, size=10):
                jobs = DictList()
                with Timer("Get {{num}} jobs", {"num": len(repo_ids)},
                           debug=DEBUG):
                    while True:
                        response = self._rate_limited_get_json(
                            expand_template(JOBS_URL, {
                                "branch": branch,
                                "offset": len(jobs),
                                "result_set_id": ",".join(map(unicode, repo_ids))
                            }))
                        jobs.extend(response.results)
                        if len(response.results) != 2000:
                            break

                with Timer("Get (up to {{num}}) details from TH",
                           {"num": len(jobs)},
                           debug=DEBUG):
                    details = []
                    for _, ids in jx.groupby(jobs.id, size=40):
                        details.extend(
                            self._rate_limited_get_json(
                                url=expand_template(DETAILS_URL, {
                                    "branch": branch,
                                    "job_id": ",".join(map(unicode, ids))
                                }),
                                retry={"times": 3}
                            ).results)
                    details = {
                        k.job_guid: list(v)
                        for k, v in jx.groupby(details, "job_guid")
                    }

                with Timer("Get (up to {{num}}) stars from TH",
                           {"num": len(jobs)},
                           debug=DEBUG):
                    stars = []
                    for _, ids in jx.groupby(jobs.id, size=40):
                        response = self._rate_limited_get_json(
                            expand_template(
                                JOB_BUG_MAP, {
                                    "branch": branch,
                                    "job_id": "&job_id=".join(map(
                                        unicode, ids))
                                }))
                        stars.extend(response)
                    stars = {
                        k.job_id: list(v)
                        for k, v in jx.groupby(stars, "job_id")
                    }

                with Timer("Get notes from TH", debug=DEBUG):
                    notes = []
                    for jid in set([
                            j.id
                            for j in jobs if j.failure_classification_id != 1
                    ] + stars.keys()):
                        response = self._rate_limited_get_json(
                            expand_template(NOTES_URL, {
                                "branch": branch,
                                "job_id": unicode(jid)
                            }))
                        notes.extend(response)
                    notes = {
                        k.job_id: list(v)
                        for k, v in jx.groupby(notes, "job_id")
                    }

                for j in jobs:
                    output.append(
                        self._normalize_job_result(branch, revision, j,
                                                   details, notes, stars))

            if output:
                with Timer("Write to ES cache", debug=DEBUG):
                    self.cache.extend(
                        {
                            "id": "-".join([c.repo.branch,
                                            unicode(c.job.id)]),
                            "value": c
                        } for c in output)
                    try:
                        self.cache.flush()
                    except Exception as e:
                        Log.warning("problem flushing. nevermind.", cause=e)
        finally:
            with self.locker:
                for p in waiting_threads[1:]:
                    if DEBUG:
                        Log.note(
                            "releasing thread for {{branch}}/{{revision}}",
                            branch=branch,
                            revision=revision)
                    p.go()
                self.pending[(branch, revision)] = None

        return output
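
The pending/Signal dance at the top of _get_job_results_from_th is a request-coalescing pattern: the first thread to ask for a given (branch, revision) performs the Treeherder fetch, later threads park on a signal and reuse the first thread's output list, and the finally block releases everyone. Below is a self-contained sketch of the same pattern, using threading.Event in place of Signal and a hypothetical fetch callable in place of the Treeherder requests.

import threading

class Coalescer(object):
    """First caller for a key does the expensive fetch; concurrent callers wait and share its output."""

    def __init__(self, fetch):
        self.fetch = fetch                 # hypothetical expensive call (stands in for the TH requests)
        self.lock = threading.Lock()
        self.pending = {}                  # key -> [shared_output_list, event, event, ...]

    def get(self, key):
        with self.lock:
            waiters = self.pending.get(key)
            if waiters is None:
                waiters = self.pending[key] = [[]]   # slot 0 is the shared output list; we do the work
                event = None
            else:
                event = threading.Event()
                waiters.append(event)                # park until the working thread finishes

        if event is not None:
            event.wait()
            return waiters[0]                        # reuse the first thread's output

        output = waiters[0]
        try:
            output.extend(self.fetch(key))           # the expensive part runs exactly once per burst
        finally:
            with self.lock:
                for e in waiters[1:]:                # wake every thread that piled up behind us
                    e.set()
                self.pending[key] = None             # mirror the original: mark the key as handled
        return output

# Example: concurrent callers asking for the same (branch, revision) share one fetch.
c = Coalescer(lambda key: ["result for %s/%s" % key])
print(c.get(("mozilla-central", "abc123")))
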
Code example #4
def es_deepop(es, query):
    columns = query.frum.get_columns(query.frum.name)
    query_path = query.frum.query_path
    columns = UniqueIndex(keys=["name"],
                          data=sorted(
                              columns,
                              lambda a, b: cmp(len(listwrap(b.nested_path)),
                                               len(listwrap(a.nested_path)))),
                          fail_on_dup=False)
    map_to_es_columns = {c.name: c.es_column for c in columns}
    map_to_local = {
        c.name: "_inner" + c.es_column[len(listwrap(c.nested_path)[0]):]
        if c.nested_path else "fields." + literal_field(c.es_column)
        for c in columns
    }
    # TODO: FIX THE GREAT SADNESS CAUSED BY EXECUTING post_expressions
    # THE EXPRESSIONS SHOULD BE PUSHED TO THE CONTAINER:  ES ALLOWS
    # {"inner_hit":{"script_fields":[{"script":""}...]}}, BUT THEN YOU
    # LOSE "_source" BUT GAIN "fields", FORCING ALL FIELDS TO BE EXPLICIT
    post_expressions = {}
    es_query, es_filters = es14.util.es_query_template(query.frum.name)

    # SPLIT WHERE CLAUSE BY DEPTH
    wheres = split_expression_by_depth(query.where, query.frum,
                                       map_to_es_columns)
    for i, f in enumerate(es_filters):
        # PROBLEM IS {"match_all": {}} DOES NOT SURVIVE set_default()
        for k, v in unwrap(
                simplify_esfilter(AndOp("and",
                                        wheres[i]).to_esfilter())).items():
            f[k] = v

    if not wheres[1]:
        more_filter = {
            "and": [
                simplify_esfilter(AndOp("and", wheres[0]).to_esfilter()), {
                    "not": {
                        "nested": {
                            "path": query_path,
                            "filter": {
                                "match_all": {}
                            }
                        }
                    }
                }
            ]
        }
    else:
        more_filter = None

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort)
    es_query.fields = []

    is_list = isinstance(query.select, list)
    new_select = DictList()

    def get_pull(column):
        if column.nested_path:
            return "_inner" + column.es_column[
                len(listwrap(column.nested_path)[0]):]
        else:
            return "fields." + literal_field(column.es_column)

    i = 0
    for s in listwrap(query.select):
        if isinstance(s.value, LeavesOp):
            if isinstance(s.value.term, Variable):
                if s.value.term.var == ".":
                    # IF THERE IS A *, THEN INSERT THE EXTRA COLUMNS
                    for c in columns:
                        if c.relative and c.type not in ["nested", "object"]:
                            if not c.nested_path:
                                es_query.fields += [c.es_column]
                            new_select.append({
                                "name": c.name,
                                "pull": get_pull(c),
                                "nested_path": listwrap(c.nested_path)[0],
                                "put": {
                                    "name": literal_field(c.name),
                                    "index": i,
                                    "child": "."
                                }
                            })
                            i += 1

                    # REMOVE DOTS IN PREFIX IF NAME NOT AMBIGUOUS
                    col_names = [c.name for c in columns if c.relative]
                    for n in new_select:
                        if n.name.startswith("..") and n.name.lstrip(
                                ".") not in col_names:
                            n.name = n.put.name = n.name.lstrip(".")
                else:
                    column = s.value.term.var + "."  # THE Variable INSIDE THE LeavesOp
                    prefix = len(column)
                    for c in columns:
                        if c.name.startswith(column) and c.type not in [
                                "object", "nested"
                        ]:
                            pull = get_pull(c)
                            if len(listwrap(c.nested_path)) == 0:
                                es_query.fields += [c.es_column]

                            new_select.append({
                                "name": s.name + "." + c.name[prefix:],
                                "pull": pull,
                                "nested_path": listwrap(c.nested_path)[0],
                                "put": {
                                    "name": s.name + "." + literal_field(c.name[prefix:]),
                                    "index": i,
                                    "child": "."
                                }
                            })
                            i += 1
        elif isinstance(s.value, Variable):
            if s.value.var == ".":
                for c in columns:
                    if c.relative and c.type not in ["nested", "object"]:
                        if not c.nested_path:
                            es_query.fields += [c.es_column]
                        new_select.append({
                            "name": c.name,
                            "pull": get_pull(c),
                            "nested_path": listwrap(c.nested_path)[0],
                            "put": {
                                "name": ".",
                                "index": i,
                                "child": c.es_column
                            }
                        })
                i += 1
            elif s.value.var == "_id":
                new_select.append({
                    "name": s.name,
                    "value": s.value.var,
                    "pull": "_id",
                    "put": {
                        "name": s.name,
                        "index": i,
                        "child": "."
                    }
                })
                i += 1
            else:
                column = columns[(s.value.var, )]
                parent = column.es_column + "."
                prefix = len(parent)
                net_columns = [
                    c for c in columns if c.es_column.startswith(parent)
                    and c.type not in ["object", "nested"]
                ]
                if not net_columns:
                    pull = get_pull(column)
                    if not column.nested_path:
                        es_query.fields += [column.es_column]
                    new_select.append({
                        "name": s.name,
                        "pull": pull,
                        "nested_path": listwrap(column.nested_path)[0],
                        "put": {
                            "name": s.name,
                            "index": i,
                            "child": "."
                        }
                    })
                else:
                    done = set()
                    for n in net_columns:
                        # THE COLUMNS CAN HAVE DUPLICATE REFERENCES TO THE SAME ES_COLUMN
                        if n.es_column in done:
                            continue
                        done.add(n.es_column)

                        pull = get_pull(n)
                        if not n.nested_path:
                            es_query.fields += [n.es_column]
                        new_select.append({
                            "name": s.name,
                            "pull": pull,
                            "nested_path": listwrap(n.nested_path)[0],
                            "put": {
                                "name": s.name,
                                "index": i,
                                "child": n.es_column[prefix:]
                            }
                        })
                i += 1
        else:
            expr = s.value
            for v in expr.vars():
                for n in columns:
                    if n.name == v:
                        if not n.nested_path:
                            es_query.fields += [n.es_column]

            pull = EXPRESSION_PREFIX + s.name
            post_expressions[pull] = compile_expression(
                expr.map(map_to_local).to_python())

            new_select.append({
                "name": s.name if is_list else ".",
                "pull": pull,
                "value": expr.to_dict(),
                "put": {
                    "name": s.name,
                    "index": i,
                    "child": "."
                }
            })
            i += 1

    # <COMPLICATED> ES needs two calls to get all documents
    more = []

    def get_more(please_stop):
        more.append(
            es09.util.post(es, Dict(filter=more_filter,
                                    fields=es_query.fields), query.limit))

    if more_filter:
        need_more = Thread.run("get more", target=get_more)

    with Timer("call to ES") as call_timer:
        data = es09.util.post(es, es_query, query.limit)

    # EACH HIT IS RETURNED MULTIPLE TIMES, ONCE FOR EACH INNER HIT, WITH THE INNER HIT INCLUDED
    def inners():
        for t in data.hits.hits:
            for i in t.inner_hits[literal_field(query_path)].hits.hits:
                t._inner = i._source
                for k, e in post_expressions.items():
                    t[k] = e(t)
                yield t
        if more_filter:
            Thread.join(need_more)
            for t in more[0].hits.hits:
                yield t

    #</COMPLICATED>

    try:
        formatter, groupby_formatter, mime_type = format_dispatch[query.format]

        output = formatter(inners(), new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", cause=e)
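
The <COMPLICATED> block overlaps two round trips: the secondary query (more_filter, which picks up parent documents with no nested children) runs on a background thread while the main nested query runs on the caller's thread, and inners() only joins that thread once the primary hits are exhausted. Below is a self-contained sketch of the overlap pattern, with a hypothetical run_query standing in for es09.util.post.

import threading

def overlapped(run_query, primary, secondary=None):
    """Run `secondary` on a background thread while `primary` runs here; yield rows from both."""
    extra = []

    def worker():
        extra.append(run_query(secondary))        # background call, e.g. the more_filter query

    background = None
    if secondary is not None:
        background = threading.Thread(target=worker)
        background.start()

    for row in run_query(primary):                # the main call happens on the caller's thread
        yield row

    if background is not None:
        background.join()                         # block only after the primary rows are consumed
        for row in extra[0]:
            yield row

# Example with a fake run_query that just echoes its input list:
print(list(overlapped(lambda q: list(q), primary=[1, 2], secondary=[3])))
# -> [1, 2, 3]
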