Example #1
def format_table(T, select, query=None):
    # PIVOT EACH ROW OF T INTO A FIXED-WIDTH LIST, ONE SLOT PER put.index
    data = []
    num_columns = Math.MAX(select.put.index) + 1
    for row in T:
        r = [None] * num_columns
        for s in select:
            value = unwraplist(row[s.pull])

            if value == None:
                # == (NOT is) SO BOTH None AND pyLibrary Null ARE SKIPPED
                continue

            index, child = s.put.index, s.put.child
            if child == ".":
                # LEAF COLUMN: STORE THE VALUE DIRECTLY
                r[index] = value
            else:
                # NESTED COLUMN: ACCUMULATE CHILD PROPERTIES IN A Dict
                if r[index] is None:
                    r[index] = Dict()
                r[index][child] = value

        data.append(r)

    header = [None] * num_columns
    for s in select:
        # THE FIRST select CLAUSE TO CLAIM A COLUMN NAMES IT
        if header[s.put.index]:
            continue
        header[s.put.index] = s.name

    return Dict(
        meta={"format": "table"},
        header=header,
        data=data
    )
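
A minimal usage sketch (not part of the source): the select clauses and rows below are made up for illustration, and the pyLibrary.dot import path is an assumption that varies between pyLibrary versions.

# HYPOTHETICAL USAGE; IMPORT PATH IS AN ASSUMPTION (OLDER pyLibrary LAYOUT)
from pyLibrary.dot import wrap

select = wrap([
    {"name": "a", "pull": "a", "put": {"index": 0, "child": "."}},
    {"name": "b.x", "pull": "b.x", "put": {"index": 1, "child": "x"}}
])
rows = wrap([{"a": 1, "b": {"x": 2}}, {"a": 3, "b": {"x": 4}}])

result = format_table(rows, select)
# EXPECT result.header == ["a", "b.x"]; EACH data ROW HOLDS THE "a" VALUE
# AT INDEX 0 AND A Dict LIKE {"x": 2} AT INDEX 1
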
Example #2
def get_decoders_by_depth(query):
    """
    RETURN A LIST OF DECODER ARRAYS, ONE ARRAY FOR EACH NESTED DEPTH
    """
    schema = query.frum
    output = DictList()
    for e in wrap(coalesce(query.edges, query.groupby, [])):
        if e.value != None and not isinstance(e.value, NullOp):
            e = e.copy()
            vars_ = e.value.vars()

            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)

            e.value = e.value.map({schema[v].name: schema[v].es_column for v in vars_})
        elif e.range:
            e = e.copy()
            min_ = e.range.min
            max_ = e.range.max
            vars_ = min_.vars() | max_.vars()

            for v in vars_:
                if not schema[v]:
                    Log.error("{{var}} does not exist in schema", var=v)

            map_ = {schema[v].name: schema[v].es_column for v in vars_}
            e.range = {
                "min": min_.map(map_),
                "max": max_.map(map_)
            }
        elif e.domain.dimension:
            vars_ = e.domain.dimension.fields
            e.domain.dimension = e.domain.dimension.copy()
            e.domain.dimension.fields = [schema[v].es_column for v in vars_]
        elif all(e.domain.partitions.where):
            vars_ = set()
            for p in e.domain.partitions:
                vars_ |= p.where.vars()

        try:
            # A MISSING COLUMN HAS len(nested_path) == 0, HENCE DEPTH -1
            depths = set(len(schema[v].nested_path) - 1 for v in vars_)
            if -1 in depths:
                Log.error(
                    "Do not know of column {{column}}",
                    column=unwraplist([v for v in vars_ if schema[v]==None])
                )
            if len(depths) > 1:
                Log.error("expression {{expr}} spans tables, can not handle", expr=e.value)
            max_depth = Math.MAX(depths)
            while len(output) <= max_depth:
                output.append([])
        except Exception:
            # USUALLY THE SCHEMA IS EMPTY, SO WE ASSUME THIS IS A SIMPLE QUERY
            # (DO NOT BIND THE EXCEPTION TO e, WHICH STILL HOLDS THE CURRENT EDGE)
            max_depth = 0
            output.append([])

        limit = 0
        output[max_depth].append(AggsDecoder(e, query, limit))
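
A hedged consumption sketch (not from the source): query stands for an already-built QB query object, which this snippet does not construct.

# HYPOTHETICAL CONSUMER; query IS ASSUMED TO BE A PRE-BUILT QB QUERY
for depth, decoders in enumerate(get_decoders_by_depth(query)):
    Log.note(
        "depth {{depth}} has {{num}} decoder(s)",
        depth=depth,
        num=len(decoders)
    )
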
Example #3
def diff(settings, please_stop=None):
    # EVERYTHING FROM ELASTICSEARCH
    es = MultiDayIndex(settings.elasticsearch, queue_size=100000)

    in_es = get_all_in_es(es)
    in_s3 = get_all_s3(in_es, settings)

    # IGNORE THE 500 MOST RECENT BLOCKS, BECAUSE THEY ARE PROBABLY NOT DONE
    in_s3 = in_s3[500:500 + settings.limit]

    Log.note(
        "Queueing {{num}} keys (from {{min}} to {{max}}) for insertion to ES",
        num=len(in_s3),
        min=Math.MIN(in_s3),
        max=Math.MAX(in_s3)
    )
    work_queue = aws.Queue(settings=settings.work_queue)
    work_queue.extend(in_s3)
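
An illustrative call (not from the source): the field names mirror what diff() reads above; the values and the pyLibrary.dot import path are placeholders and assumptions.

# HYPOTHETICAL SETTINGS; FIELD NAMES FOLLOW WHAT diff() READS ABOVE
from pyLibrary.dot import wrap

settings = wrap({
    "elasticsearch": {"host": "http://localhost", "port": 9200, "index": "unittest"},
    "work_queue": {"name": "active-data-etl", "region": "us-west-2"},
    "limit": 1000
})
diff(settings)
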
Example #4
def get_branches(hg, branches, use_cache=True, settings=None):
    if not settings.branches or not use_cache:
        found_branches = _get_branches_from_hg(hg)

        es = elasticsearch.Cluster(settings=branches).get_or_create_index(
            settings=branches)
        es.add_alias()
        es.extend({
            "id": b.name + " " + b.locale,
            "value": b
        } for b in found_branches)
        es.flush()
        return found_branches

    # TRY ES
    try:
        es = elasticsearch.Cluster(settings=branches).get_index(
            settings=branches)
        query = {"query": {"match_all": {}}, "size": 20000}

        docs = es.search(query).hits.hits._source
        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(Math.MAX(docs.etl.timestamp))
        if Date.now() - oldest > OLD_BRANCH:
            return get_branches(hg, branches, use_cache=False, settings=settings)

        try:
            return UniqueIndex(["name", "locale"],
                               data=docs,
                               fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        if "Can not find index " in e:
            return get_branches(hg, branches, use_cache=False, settings=settings)
        Log.error("problem getting branches", cause=e)