Example #1
    def _aggs_iterator(agg, d):
        deeper = coalesce(agg._filter, agg._nested)
        while deeper:
            agg = deeper
            deeper = coalesce(agg._filter, agg._nested)

        if d > 0:
            for b in agg._match.buckets:
                parts[d] = b
                for a in _aggs_iterator(b, d - 1):
                    yield a
            parts[d] = Null
            for b in agg._other.buckets:
                for a in _aggs_iterator(b, d - 1):
                    yield a
            b = agg._missing
            if b.doc_count:
                for a in _aggs_iterator(b, d - 1):
                    yield a
        else:
            for b in agg._match.buckets:
                parts[d] = b
                if b.doc_count:
                    yield b
            parts[d] = Null
            for b in agg._other.buckets:
                if b.doc_count:
                    yield b
            b = agg._missing
            if b.doc_count:
                yield b
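
Every snippet on this page leans on the same coalesce() helper. As a minimal sketch of the assumed behavior (for reading the examples, not the library's actual implementation), it returns the first argument that is not null:

def coalesce(*args):
    # ASSUMED BEHAVIOR: RETURN FIRST NON-None ARGUMENT, ELSE None
    for arg in args:
        if arg is not None:
            return arg
    return None

# e.g. coalesce(settings.timeout, 30) falls back to 30 when settings.timeout is unset
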
Example #2
 def __init__(self, **desc):
     desc = wrap(desc)
     self._set_slots_to_null(self.__class__)
     set_default(self, desc)
     self.name = coalesce(desc.name, desc.type)
     self.isFacet = coalesce(desc.isFacet, False)
     self.dimension = Null
Example #3
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if is_keyword(edge.value):
        calc = {"field": edge.value}
    else:
        calc = {"script": qb_expression_to_ruby(edge.value)}

    if is_keyword(edge.value):
        missing_range = {"or": [
            {"range": {edge.value: {"lt": to_float(_min)}}},
            {"range": {edge.value: {"gte": to_float(_max)}}}
        ]}
    else:
        missing_range = {"script": {"script": qb_expression_to_ruby({"or": [
            {"lt": [edge.value, to_float(_min)]},
            {"gt": [edge.value, to_float(_max)]},
        ]})}}

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": set_default(
            {"filter": {"or": [
                missing_range,
                {"missing": {"field": get_all_vars(edge.value)}}
            ]}},
            es_query
        ),
    }})
Example #4
 def __init__(self, **desc):
     desc = wrap(desc)
     self._set_slots_to_null(self.__class__)
     set_default(self, desc)
     self.name = coalesce(desc.name, desc.type)
     self.isFacet = coalesce(desc.isFacet, False)
     self.dimension = Null
Example #5
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """

    if sort == None:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring):
            output.append({"value": jx_expression(s), "sort": 1})
        elif isinstance(s, Expression):
            output.append({"value": s, "sort": 1})
        elif Math.is_integer(s):
            output.append({"value": OffsetOp("offset", s), "sort": 1})
        elif all(d in sort_direction
                 for d in s.values()) and not s.sort and not s.value:
            for v, d in s.items():
                output.append({"value": jx_expression(v), "sort": -1})
        else:
            output.append({
                "value": jx_expression(coalesce(s.value, s.field)),
                "sort": coalesce(sort_direction[s.sort], 1)
            })
    return output
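
The sort_direction lookup referenced above is not part of the snippet; a plausible shape for it, assumed here only so the branches can be read (not copied from the library), maps the accepted spellings of a direction onto +1/-1:

sort_direction = {
    1: 1,
    -1: -1,
    None: 1,
    "asc": 1,
    "desc": -1,
    "ascending": 1,
    "descending": -1,
}
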
Example #6
    def _aggs_iterator(agg, d):
        deeper = coalesce(agg._filter, agg._nested)
        while deeper:
            agg = deeper
            deeper = coalesce(agg._filter, agg._nested)

        if d > 0:
            for b in agg._match.buckets:
                parts[d] = b
                for a in _aggs_iterator(b, d - 1):
                    yield a
            parts[d] = Null
            for b in agg._other.buckets:
                for a in _aggs_iterator(b, d - 1):
                    yield a
            b = agg._missing
            if b.doc_count:
                for a in _aggs_iterator(b, d - 1):
                    yield a
        else:
            for b in agg._match.buckets:
                parts[d] = b
                if b.doc_count:
                    yield b
            parts[d] = Null
            for b in agg._other.buckets:
                if b.doc_count:
                    yield b
            b = agg._missing
            if b.doc_count:
                yield b
Example #7
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if isinstance(edge.value, Variable):
        calc = {"field": edge.value.var}
    else:
        calc = {"script_field": edge.value.to_ruby()}

    if edge.allowNulls:    # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": {"or": [
                OrOp("or", [
                    InequalityOp("lt", [edge.value, Literal(None, to_float(_min))]),
                    InequalityOp("gte", [edge.value, Literal(None, to_float(_max))]),
                ]).to_esfilter(),
                edge.value.missing().to_esfilter()
            ]}},
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
Example #8
    def __init__(
        self, host, index, type=None, alias=None, name=None, port=9200, read_only=True, typed=None, settings=None
    ):
        Container.__init__(self, None)
        if not containers.config.default:
            containers.config.default.settings = settings
        self.settings = settings
        self.name = coalesce(name, alias, index)
        if read_only:
            self._es = elasticsearch.Alias(alias=coalesce(alias, index), settings=settings)
        else:
            self._es = elasticsearch.Cluster(settings=settings).get_index(read_only=read_only, settings=settings)

        self.meta = FromESMetadata(settings=settings)
        self.settings.type = self._es.settings.type
        self.edges = Dict()
        self.worker = None

        columns = self.get_columns(table_name=name)
        self._schema = Schema(columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = any(c.name in ("$value", "$object") for c in columns)
        else:
            self.typed = typed
Example #9
    def __init__(self,
                 host,
                 index,
                 type=None,
                 alias=None,
                 name=None,
                 port=9200,
                 read_only=True,
                 typed=None,
                 settings=None):
        Container.__init__(self, None)
        if not containers.config.default:
            containers.config.default.settings = settings
        self.settings = settings
        self.name = coalesce(name, alias, index)
        if read_only:
            self._es = elasticsearch.Alias(alias=coalesce(alias, index),
                                           settings=settings)
        else:
            self._es = elasticsearch.Cluster(settings=settings).get_index(
                read_only=read_only, settings=settings)

        self.meta = FromESMetadata(settings=settings)
        self.settings.type = self._es.settings.type
        self.edges = Dict()
        self.worker = None
        if typed == None:
            self._columns = self.get_columns(table_name=index)
            # SWITCH ON TYPED MODE
            self.typed = any(c.name in ("$value", "$object")
                             for c in self._columns)
        else:
            self.typed = typed
Example #10
def _range_composer(edge, domain, es_query, to_float):
    # USE RANGES
    _min = coalesce(domain.min, MAX(domain.partitions.min))
    _max = coalesce(domain.max, MAX(domain.partitions.max))

    if isinstance(edge.value, Variable):
        calc = {"field": edge.value.var}
    else:
        calc = {"script_field": edge.value.to_ruby()}

    if edge.allowNulls:    # TODO: Use Expression.missing().esfilter() TO GET OPTIMIZED FILTER
        missing_filter = set_default(
            {"filter": {"or": [
                OrOp("or", [
                    BinaryOp("lt", [edge.value, Literal(None, to_float(_min))]),
                    BinaryOp("gte", [edge.value, Literal(None, to_float(_max))]),
                ]).to_esfilter(),
                edge.value.missing().to_esfilter()
            ]}},
            es_query
        )
    else:
        missing_filter = None

    return wrap({"aggs": {
        "_match": set_default(
            {"range": calc},
            {"range": {"ranges": [{"from": to_float(p.min), "to": to_float(p.max)} for p in domain.partitions]}},
            es_query
        ),
        "_missing": missing_filter
    }})
Example #11
    def _convert_select(self, select):
        if isinstance(select, basestring):
            return Dict(
                name=select.rstrip("."),  # TRAILING DOT INDICATES THE VALUE, BUT IS INVALID FOR THE NAME
                value=select,
                aggregate="none")
        else:
            select = wrap(select)
            output = copy(select)
            if not select.value or isinstance(select.value, basestring):
                if select.value == ".":
                    output.name = coalesce(select.name, select.aggregate)
                else:
                    output.name = coalesce(select.name, select.value,
                                           select.aggregate)
            elif not output.name:
                Log.error("Must give name to each column in select clause")

            if not output.name:
                Log.error("expecting select to have a name: {{select}}",
                          select=select)

            output.aggregate = coalesce(
                canonical_aggregates.get(select.aggregate), select.aggregate,
                "none")
            return output
Example #12
def Stats2ZeroMoment(stats):
    # MODIFIED FROM http://statsmodels.sourceforge.net/devel/_modules/statsmodels/stats/moment_helpers.html
    # ADDED count
    mc0, mc1, mc2, skew, kurt = stats.count, coalesce(stats.mean, 0), coalesce(stats.variance, 0), coalesce(stats.skew, 0), coalesce(stats.kurtosis, 0)

    mz0 = mc0
    mz1 = mc1 * mc0
    mz2 = (mc2 + mc1 * mc1) * mc0
    mc3 = coalesce(skew, 0) * (mc2 ** 1.5) # 3rd central moment
    mz3 = (mc3 + 3 * mc1 * mc2 + mc1 ** 3) * mc0  # 3rd non-central moment
    mc4 = (coalesce(kurt, 0) + 3.0) * (mc2 ** 2.0) # 4th central moment
    mz4 = (mc4 + 4 * mc1 * mc3 + 6 * mc1 * mc1 * mc2 + mc1 ** 4) * mc0

    m = ZeroMoment(mz0, mz1, mz2, mz3, mz4)
    if DEBUG:
        from pyLibrary.testing.fuzzytestcase import assertAlmostEqualValue

        globals()["DEBUG"] = False
        try:
            v = ZeroMoment2Stats(m)
            assertAlmostEqualValue(v.count, stats.count, places=10)
            assertAlmostEqualValue(v.mean, stats.mean, places=10)
            assertAlmostEqualValue(v.variance, stats.variance, places=10)
            assertAlmostEqualValue(v.skew, stats.skew, places=10)
            assertAlmostEqualValue(v.kurtosis, stats.kurtosis, places=10)
        except Exception, e:
            v = ZeroMoment2Stats(m)
            Log.error("programmer error")
        globals()["DEBUG"] = True
Example #13
    def __init__(
        self,
        exchange,  # name of the Pulse exchange
        topic,  # message name pattern to subscribe to  ('#' is wildcard)
        target=None,  # WILL BE CALLED WITH PULSE PAYLOADS AND ack() IF COMPLETED WITHOUT EXCEPTION
        target_queue=None,  # (aka self.queue) WILL BE FILLED WITH PULSE PAYLOADS
        host='pulse.mozilla.org',  # url to connect,
        port=5671,  # tcp port
        user=None,
        password=None,
        vhost="/",
        start=0,  # USED AS STARTING POINT FOR ASSIGNING THE _meta.count ATTRIBUTE
        ssl=True,
        applabel=None,
        heartbeat=False,  # True to also get the Pulse heartbeat message
        durable=False,  # True to keep queue after shutdown
        serializer='json',
        broker_timezone='GMT',
        settings=None
    ):
        self.target_queue = target_queue
        self.pulse_target = target
        if (target_queue == None and target == None) or (target_queue != None and target != None):
            Log.error("Expecting a queue (for fast digesters) or a target (for slow digesters)")

        Thread.__init__(self, name="Pulse consumer for " + settings.exchange, target=self._worker)
        self.settings = settings
        settings.callback = self._got_result
        settings.user = coalesce(settings.user, settings.username)
        settings.applabel = coalesce(settings.applabel, settings.queue, settings.queue_name)
        settings.topic = topic

        self.pulse = ModifiedGenericConsumer(settings, connect=True, **settings)
        self.count = coalesce(start, 0)
        self.start()
Example #14
 def add_alias(self, alias=None):
     if alias:
         self.cluster_state = None
         self.cluster._post("/_aliases",
                            data=convert.unicode2utf8(
                                convert.value2json({
                                    "actions": [{
                                        "add": {
                                            "index": self.settings.index,
                                            "alias": alias
                                        }
                                    }]
                                })),
                            timeout=coalesce(self.settings.timeout, 30))
     else:
         # SET ALIAS ACCORDING TO LIFECYCLE RULES
         self.cluster_state = None
         self.cluster._post("/_aliases",
                            data=convert.unicode2utf8(
                                convert.value2json({
                                    "actions": [{
                                        "add": {
                                            "index": self.settings.index,
                                            "alias": self.settings.alias
                                        }
                                    }]
                                })),
                            timeout=coalesce(self.settings.timeout, 30))
Example #15
def window(data, param):
    """
    MAYBE WE CAN DO THIS WITH NUMPY (no, the edges of windows are not graceful with numpy)
    data - list of records
    """
    name = param.name            # column to assign window function result
    edges = param.edges          # columns to group by
    where = param.where          # DO NOT CONSIDER THESE VALUES
    sortColumns = param.sort     # columns to sort by
    calc_value = wrap_function(jx_expression_to_function(param.value))  # function that takes a record and returns a value (for aggregation)
    aggregate = param.aggregate  # WindowFunction to apply
    _range = param.range         # of form {"min":-10, "max":0} to specify the size and relative position of window

    data = filter(data, where)

    if not aggregate and not edges:
        if sortColumns:
            data = sort(data, sortColumns, already_normalized=True)
        # SIMPLE CALCULATED VALUE
        for rownum, r in enumerate(data):
            r[name] = calc_value(r, rownum, data)
        return

    if not aggregate or aggregate == "none":
        for _, values in groupby(data, edges.value):
            if not values:
                continue     # CAN DO NOTHING WITH THIS ZERO-SAMPLE

            sequence = sort(values, sortColumns, already_normalized=True)

            for rownum, r in enumerate(sequence):
                r[name] = calc_value(r, rownum, sequence)
        return

    for keys, values in groupby(data, edges.value):
        if not values:
            continue     # CAN DO NOTHING WITH THIS ZERO-SAMPLE

        sequence = sort(values, sortColumns)

        for rownum, r in enumerate(sequence):
            r["__temp__"] = calc_value(r, rownum, sequence)

        head = coalesce(_range.max, _range.stop)
        tail = coalesce(_range.min, _range.start)

        # PRELOAD total
        total = aggregate()
        for i in range(tail, head):
            total.add(sequence[i].__temp__)

        # WINDOW FUNCTION APPLICATION
        for i, r in enumerate(sequence):
            r[name] = total.end()
            total.add(sequence[i + head].__temp__)
            total.sub(sequence[i + tail].__temp__)

    for r in data:
        r["__temp__"] = None  # CLEANUP
Example #16
    def __new__(cls, e=None, query=None, *args, **kwargs):
        e.allowNulls = coalesce(e.allowNulls, True)

        if e.value and e.domain.type == "default":
            if query.groupby:
                return object.__new__(DefaultDecoder, e)

            if isinstance(e.value, basestring):
                Log.error("Expecting Variable or Expression, not plain string")

            if isinstance(e.value, TupleOp):
                # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
                # JUST PULL THE FIELDS
                if not all(isinstance(t, Variable) for t in e.value.terms):
                    Log.error("Can only handle variables in tuples")

                e.domain = Dict(dimension={"fields": e.value.terms})
                return object.__new__(DimFieldListDecoder, e)
            elif isinstance(e.value, Variable):
                cols = query.frum.get_columns()
                col = cols.filter(lambda c: c.name == e.value.var)[0]
                if not col:
                    return object.__new__(DefaultDecoder, e)
                limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

                if col.partitions != None:
                    e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
                else:
                    e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
                    return object.__new__(DefaultDecoder, e)

            else:
                return object.__new__(DefaultDecoder, e)

        if e.value and e.domain.type in PARTITION:
            return object.__new__(SetDecoder, e)
        if isinstance(e.domain.dimension, Dimension):
            e.domain = e.domain.dimension.getDomain()
            return object.__new__(SetDecoder, e)
        if e.value and e.domain.type == "time":
            return object.__new__(TimeDecoder, e)
        if e.range:
            return object.__new__(GeneralRangeDecoder, e)
        if e.value and e.domain.type == "duration":
            return object.__new__(DurationDecoder, e)
        elif e.value and e.domain.type == "range":
            return object.__new__(RangeDecoder, e)
        elif not e.value and e.domain.dimension.fields:
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            fields = e.domain.dimension.fields
            if isinstance(fields, Mapping):
                Log.error("No longer allowed: All objects are expressions")
            else:
                return object.__new__(DimFieldListDecoder, e)
        elif not e.value and all(e.domain.partitions.where):
            return object.__new__(GeneralSetDecoder, e)
        else:
            Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
Example #17
def window(data, param):
    """
    MAYBE WE CAN DO THIS WITH NUMPY (no, the edges of windows are not graceful with numpy)
    data - list of records
    """
    name = param.name            # column to assign window function result
    edges = param.edges          # columns to group by
    where = param.where          # DO NOT CONSIDER THESE VALUES
    sortColumns = param.sort     # columns to sort by
    calc_value = wrap_function(jx_expression_to_function(param.value))  # function that takes a record and returns a value (for aggregation)
    aggregate = param.aggregate  # WindowFunction to apply
    _range = param.range         # of form {"min":-10, "max":0} to specify the size and relative position of window

    data = filter(data, where)

    if not aggregate and not edges:
        if sortColumns:
            data = sort(data, sortColumns, already_normalized=True)
        # SIMPLE CALCULATED VALUE
        for rownum, r in enumerate(data):
            r[name] = calc_value(r, rownum, data)
        return

    if not aggregate or aggregate == "none":
        for _, values in groupby(data, edges.value):
            if not values:
                continue     # CAN DO NOTHING WITH THIS ZERO-SAMPLE

            sequence = sort(values, sortColumns, already_normalized=True)

            for rownum, r in enumerate(sequence):
                r[name] = calc_value(r, rownum, sequence)
        return

    for keys, values in groupby(data, edges.value):
        if not values:
            continue     # CAN DO NOTHING WITH THIS ZERO-SAMPLE

        sequence = sort(values, sortColumns)

        for rownum, r in enumerate(sequence):
            r["__temp__"] = calc_value(r, rownum, sequence)

        head = coalesce(_range.max, _range.stop)
        tail = coalesce(_range.min, _range.start)

        # PRELOAD total
        total = aggregate()
        for i in range(tail, head):
            total.add(sequence[i].__temp__)

        # WINDOW FUNCTION APPLICATION
        for i, r in enumerate(sequence):
            r[name] = total.end()
            total.add(sequence[i + head].__temp__)
            total.sub(sequence[i + tail].__temp__)

    for r in data:
        r["__temp__"] = None  # CLEANUP
Example #18
    def __new__(cls, e=None, query=None, *args, **kwargs):
        if query.groupby:
            # GROUPBY ASSUMES WE IGNORE THE DOMAIN RANGE
            e.allowNulls = False
        else:
            e.allowNulls = coalesce(e.allowNulls, True)

        if e.value and e.domain.type == "default":
            if query.groupby:
                return object.__new__(DefaultDecoder, e)

            if isinstance(e.value, basestring):
                Log.error("Expecting Variable or Expression, not plain string")

            if isinstance(e.value, Variable):
                cols = query.frum.get_columns()
                col = cols.filter(lambda c: c.name == e.value.var)[0]
                if not col:
                    return object.__new__(DefaultDecoder, e)
                limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

                if col.partitions != None:
                    e.domain = SimpleSetDomain(
                        partitions=col.partitions[:limit:])
                else:
                    e.domain = set_default(DefaultDomain(limit=limit),
                                           e.domain.as_dict())
                    return object.__new__(DefaultDecoder, e)

            else:
                return object.__new__(DefaultDecoder, e)

        if e.value and e.domain.type in PARTITION:
            return object.__new__(SetDecoder, e)
        if isinstance(e.domain.dimension, Dimension):
            e.domain = e.domain.dimension.getDomain()
            return object.__new__(SetDecoder, e)
        if e.value and e.domain.type == "time":
            return object.__new__(TimeDecoder, e)
        if e.range:
            return object.__new__(GeneralRangeDecoder, e)
        if e.value and e.domain.type == "duration":
            return object.__new__(DurationDecoder, e)
        elif e.value and e.domain.type == "range":
            return object.__new__(RangeDecoder, e)
        elif not e.value and e.domain.dimension.fields:
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            fields = e.domain.dimension.fields
            if isinstance(fields, Mapping):
                Log.error("No longer allowed: All objects are expressions")
            else:
                return object.__new__(DimFieldListDecoder, e)
        elif not e.value and all(e.domain.partitions.where):
            return object.__new__(GeneralSetDecoder, e)
        else:
            Log.error("domain type of {{type}} is not supported yet",
                      type=e.domain.type)
Example #19
def _normalize_edge(edge, schema=None):
    if not _Column:
        _late_import()

    if isinstance(edge, basestring):
        if schema:
            e = schema[edge]
            if e:
                if isinstance(e, _Column):
                    return Dict(
                        name=edge,
                        value=edge,
                        allowNulls=True,
                        domain=_normalize_domain(schema=schema)
                    )
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(
                        name=e.name,
                        value=e.fields[0],
                        allowNulls=True,
                        domain=e.getDomain()
                    )
                else:
                    return Dict(
                        name=e.name,
                        allowNulls=True,
                        domain=e.getDomain()
                    )
        return Dict(
            name=edge,
            value=edge,
            allowNulls=True,
            domain=_normalize_domain(schema=schema)
        )
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        if isinstance(edge.value, (list, set)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Dict(fields=edge.value)

            return Dict(
                name=edge.name,
                allowNulls=bool(coalesce(edge.allowNulls, True)),
                domain=domain
            )

        domain = _normalize_domain(edge.domain, schema=schema)
        return Dict(
            name=coalesce(edge.name, edge.value),
            value=edge.value,
            range=edge.range,
            allowNulls=bool(coalesce(edge.allowNulls, True)),
            domain=domain
        )
Example #20
def full_etl(settings):
    schema = convert.json2value(convert.value2json(SCHEMA), leaves=True)
    Cluster(settings.destination).get_or_create_index(settings=settings.destination, schema=schema, limit_replicas=True)
    destq = FromES(settings.destination)
    if settings.incremental:
        min_bug_id = destq.query({
            "from": coalesce(settings.destination.alias, settings.destination.index),
            "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
        })

        min_bug_id = int(MAX(min_bug_id-1000, 0))
    else:
        min_bug_id = 0

    sourceq = FromES(settings.source)
    max_bug_id = sourceq.query({
        "from": coalesce(settings.source.alias, settings.source.index),
        "select": {"name": "max_bug_id", "value": "bug_id", "aggregate": "max"}
    }) + 1
    max_bug_id = int(coalesce(max_bug_id, 0))

    # FIRST, GET ALL MISSING BUGS
    for s, e in qb.reverse(list(qb.intervals(min_bug_id, max_bug_id, 10000))):
        with Timer("pull {{start}}..{{end}} from ES", {"start": s, "end": e}):
            children = sourceq.query({
                "from": settings.source.alias,
                "select": ["bug_id", "dependson", "blocked", "modified_ts", "expires_on"],
                "where": {"and": [
                    {"range": {"bug_id": {"gte": s, "lt": e}}},
                    {"or": [
                        {"exists": "dependson"},
                        {"exists": "blocked"}
                    ]}
                ]},
                "limit": 10000
            })

        with Timer("fixpoint work"):
            to_fix_point(settings, destq, children.data)

    # PROCESS RECENT CHANGES
    with Timer("pull recent dependancies from ES"):
        children = sourceq.query({
            "from": settings.source.alias,
            "select": ["bug_id", "dependson", "blocked"],
            "where": {"and": [
                {"range": {"modified_ts": {"gte": convert.datetime2milli(datetime.utcnow() - timedelta(days=7))}}},
                {"or": [
                    {"exists": "dependson"},
                    {"exists": "blocked"}
                ]}
            ]},
            "limit": 100000
        })

    to_fix_point(settings, destq, children.data)
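
A note on qb.intervals(), which drives the backfill batching above: as read here (an assumption about its behavior, not confirmed from the source), it yields consecutive (start, end) pairs of at most the given size, which qb.reverse() then walks newest-first. A hypothetical equivalent:

def intervals(_min, _max, size):
    # ASSUMED SHAPE: CONSECUTIVE HALF-OPEN (start, end) PAIRS COVERING [_min, _max)
    return [(s, min(s + size, _max)) for s in range(_min, _max, size)]

# intervals(0, 25000, 10000) -> [(0, 10000), (10000, 20000), (20000, 25000)]
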
Example #21
 def comparer(left, right):
     left = coalesce(left)
     right = coalesce(right)
     for f in formal:
         try:
             result = value_compare(f.func(left), f.func(right), f.sort)
             if result != 0:
                 return result
         except Exception, e:
             Log.error("problem with compare", e)
Example #22
 def single(col, r):
     min = coalesce(r["gte"], r[">="])
     max = coalesce(r["lte"], r["<="])
     if min and max:
         # SPECIAL CASE (BETWEEN)
         return db.quote_column(col) + " BETWEEN " + db.quote_value(min) + " AND " + db.quote_value(max)
     else:
         return " AND ".join(
             db.quote_column(col) + name2sign[sign] + db.quote_value(value)
             for sign, value in r.items()
         )
Example #23
 def comparer(left, right):
     left = coalesce(left)
     right = coalesce(right)
     for f in formal:
         try:
             result = value_compare(left[f.field], right[f.field],
                                    f.sort)
             if result != 0:
                 return result
         except Exception, e:
             Log.error("problem with compare", e)
Example #24
 def single(col, r):
     min = coalesce(r["gte"], r[">="])
     max = coalesce(r["lte"], r["<="])
     if min and max:
         # SPECIAL CASE (BETWEEN)
         return db.quote_column(col) + " BETWEEN " + db.quote_value(
             min) + " AND " + db.quote_value(max)
     else:
         return " AND ".join(
             db.quote_column(col) + name2sign[sign] +
             db.quote_value(value) for sign, value in r.items())
Example #25
def main():
    settings = startup.read_settings(defs={
       "name": ["--restart", "--reset", "--redo"],
       "help": "force a reprocessing of all data",
       "action": "store_true",
       "dest": "restart"
    })
    Log.start(settings.debug)

    try:
        with startup.SingleInstance(flavor_id=settings.args.filename):
            if settings.args.restart:
                reviews = Cluster(settings.destination).create_index(settings.destination)
            else:
                reviews = Cluster(settings.destination).get_proto(settings.destination)

            bugs = Cluster(settings.source).get_index(settings.source)

            with FromES(bugs) as esq:
                es_max_bug = esq.query({
                    "from": "private_bugs",
                    "select": {"name": "max_bug", "value": "bug_id", "aggregate": "maximum"}
                })

            #PROBE WHAT RANGE OF BUGS IS LEFT TO DO (IN EVENT OF FAILURE)
            with FromES(reviews) as esq:
                es_min_bug = esq.query({
                    "from": "reviews",
                    "select": {"name": "min_bug", "value": "bug_id", "aggregate": "minimum"}
                })

            batch_size = coalesce(bugs.settings.batch_size, settings.size, 1000)
            threads = coalesce(settings.threads, 4)
            Log.note(str(settings.min_bug))
            min_bug = int(coalesce(settings.min_bug, 0))
            max_bug = int(coalesce(settings.max_bug, Math.min(es_min_bug + batch_size * threads, es_max_bug)))

            with ThreadedQueue(reviews, batch_size=coalesce(reviews.settings.batch_size, 100)) as sink:
                func = functools.partial(full_etl, settings, sink)
                with Multithread(func, threads=threads) as m:
                    m.inbound.silent = True
                    Log.note("bugs from {{min}} to {{max}}, step {{step}}", {
                        "min": min_bug,
                        "max": max_bug,
                        "step": batch_size
                    })
                    m.execute(reversed([{"bugs": range(s, e)} for s, e in qb.intervals(min_bug, max_bug, size=1000)]))

            if settings.args.restart:
                reviews.add_alias()
                reviews.delete_all_but_self()
    finally:
        Log.stop()
Example #26
def format_list_from_aggop(decoders, aggs, start, query, select):
    agg = aggs
    b = coalesce(agg._filter, agg._nested)
    while b:
        agg = b
        b = coalesce(agg._filter, agg._nested)

    item = Dict()
    for s in select:
        item[s.name] = agg[s.pull]

    return wrap({"meta": {"format": "list"}, "data": [item]})
Example #27
def format_cube_from_aggop(decoders, aggs, start, query, select):
    agg = aggs
    b = coalesce(agg._filter, agg._nested)
    while b:
        agg = b
        b = coalesce(agg._filter, agg._nested)

    matricies = [(s, Matrix(dims=[], zeros=(s.aggregate == "count"))) for s in select]
    for s, m in matricies:
        m[tuple()] = agg[s.pull]
    cube = Cube(query.select, [], {s.name: m for s, m in matricies})
    cube.frum = query
    return cube
Example #28
    def __new__(cls, e=None, query=None, *args, **kwargs):
        if query.groupby:
            # GROUPBY ASSUMES WE IGNORE THE DOMAIN RANGE
            e.allowNulls = False
        else:
            e.allowNulls = coalesce(e.allowNulls, True)

        if e.value and e.domain.type == "default":
            if query.groupby:
                return object.__new__(DefaultDecoder, e)

            if isinstance(e.value, basestring):
                Log.error("Not expected anymore")

            if isinstance(e.value, Variable):
                cols = query.frum.get_columns()
                col = cols.filter(lambda c: c.name == e.value.var)[0]
                if not col:
                    return object.__new__(DefaultDecoder, e)
                limit = coalesce(e.domain.limit, query.limit, DEFAULT_LIMIT)

                if col.partitions != None:
                    e.domain = SimpleSetDomain(partitions=col.partitions[:limit:])
                else:
                    e.domain = set_default(DefaultDomain(limit=limit), e.domain.as_dict())
                    return object.__new__(DefaultDecoder, e)

            else:
                return object.__new__(DefaultDecoder, e)

        if e.value and e.domain.type in PARTITION:
            return object.__new__(SetDecoder, e)
        if isinstance(e.domain.dimension, Dimension):
            e.domain = e.domain.dimension.getDomain()
            return object.__new__(SetDecoder, e)
        if e.value and e.domain.type == "time":
            return object.__new__(TimeDecoder, e)
        if e.value and e.domain.type == "duration":
            return object.__new__(DurationDecoder, e)
        elif e.value and e.domain.type == "range":
            return object.__new__(RangeDecoder, e)
        elif not e.value and e.domain.dimension.fields:
            # THIS domain IS FROM A dimension THAT IS A SIMPLE LIST OF fields
            # JUST PULL THE FIELDS
            fields = e.domain.dimension.fields
            if isinstance(fields, Mapping):
                Log.error("No longer allowed: All objects are expressions")
            else:
                return object.__new__(DimFieldListDecoder, e)
        else:
            Log.error("domain type of {{type}} is not supported yet", type=e.domain.type)
Example #29
def format_table_from_aggop(decoders, aggs, start, query, select):
    header = select.name

    agg = aggs
    b = coalesce(agg._filter, agg._nested)
    while b:
        agg = b
        b = coalesce(agg._filter, agg._nested)

    row = []
    for s in select:
        row.append(agg[s.pull])

    return Dict(meta={"format": "table"}, header=header, data=[row])
Example #30
def percent(value, decimal=None, digits=None, places=None):
    value = float(value)
    if value == 0.0:
        return "0%"

    digits = coalesce(digits, places)
    if digits != None:
        left_of_decimal = int(math.ceil(math.log10(abs(value)))) + 2
        decimal = digits - left_of_decimal

    decimal = coalesce(decimal, 0)
    right_of_decimal = max(decimal, 0)
    format = "{:." + _unicode(right_of_decimal) + "%}"
    return format.format(__builtin__.round(value, decimal + 2))
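
A few outputs worked by hand from the logic above, to make the digits/decimal interplay concrete (computed here, not taken from any test suite):

# percent(0.0)                -> "0%"
# percent(0.1234, decimal=1)  -> "12.3%"    {:.1%} APPLIED AFTER round(0.1234, 3)
# percent(0.1234, digits=2)   -> "12%"      TWO SIGNIFICANT DIGITS => decimal == 0
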
Example #31
def format_cube_from_aggop(decoders, aggs, start, query, select):
    agg = aggs
    b = coalesce(agg._filter, agg._nested)
    while b:
        agg = b
        b = coalesce(agg._filter, agg._nested)

    matricies = [(s, Matrix(dims=[], zeros=(s.aggregate == "count")))
                 for s in select]
    for s, m in matricies:
        m[tuple()] = agg[s.pull]
    cube = Cube(query.select, [], {s.name: m for s, m in matricies})
    cube.frum = query
    return cube
Example #32
def percent(value, decimal=None, digits=None, places=None):
    value = float(value)
    if value == 0.0:
        return "0%"

    digits = coalesce(digits, places)
    if digits != None:
        left_of_decimal = int(math.ceil(math.log10(abs(value)))) + 2
        decimal = digits - left_of_decimal

    decimal = coalesce(decimal, 0)
    right_of_decimal = max(decimal, 0)
    format = "{:." + _unicode(right_of_decimal) + "%}"
    return format.format(__builtin__.round(value, decimal + 2))
Example #33
    def send_email(self,
            from_address=None,
            to_address=None,
            subject=None,
            text_data=None,
            html_data=None
    ):
        """Sends an email.

        from_addr is an email address; to_addrs is a list of email addresses.
        Addresses can be plain (e.g. "*****@*****.**") or with real names
        (e.g. "John Smith <*****@*****.**>").

        text_data and html_data are both strings.  You can specify one or both.
        If you specify both, the email will be sent as a MIME multipart
        alternative, i.e., the recipient will see the HTML content if his
        viewer supports it; otherwise he'll see the text content.
        """

        settings = self.settings

        from_address = coalesce(from_address, settings["from"], settings.from_address)
        to_address = listwrap(coalesce(to_address, settings.to_address, settings.to_addrs))

        if not from_address or not to_address:
            raise Exception("Both from_addr and to_addrs must be specified")
        if not text_data and not html_data:
            raise Exception("Must specify either text_data or html_data")

        if not html_data:
            msg = MIMEText(text_data)
        elif not text_data:
            msg = MIMEText(html_data, 'html')
        else:
            msg = MIMEMultipart('alternative')
            msg.attach(MIMEText(text_data, 'plain'))
            msg.attach(MIMEText(html_data, 'html'))

        msg['Subject'] = coalesce(subject, settings.subject)
        msg['From'] = from_address
        msg['To'] = ', '.join(to_address)

        if self.server:
            # CALL AS PART OF A SMTP SESSION
            self.server.sendmail(from_address, to_address, msg.as_string())
        else:
            # CALL AS STAND-ALONE
            with self:
                self.server.sendmail(from_address, to_address, msg.as_string())
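
The text/HTML rules described in the docstring map onto a small stdlib pattern; a standalone sketch of just the message-assembly decision (independent of the class and its settings):

from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

def build_message(text_data=None, html_data=None):
    if not html_data:
        return MIMEText(text_data)                    # PLAIN TEXT ONLY
    if not text_data:
        return MIMEText(html_data, 'html')            # HTML ONLY
    msg = MIMEMultipart('alternative')                # BOTH: THE RECIPIENT'S VIEWER PICKS
    msg.attach(MIMEText(text_data, 'plain'))
    msg.attach(MIMEText(html_data, 'html'))           # LAST ALTERNATIVE IS PREFERRED
    return msg
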
Example #34
def _normalize_sort(sort=None):
    """
    CONVERT SORT PARAMETERS TO A NORMAL FORM SO EASIER TO USE
    """

    if not sort:
        return DictList.EMPTY

    output = DictList()
    for s in listwrap(sort):
        if isinstance(s, basestring) or Math.is_integer(s):
            output.append({"field": s, "sort": 1})
        else:
            output.append({"field": coalesce(s.field, s.value), "sort": coalesce(sort_direction[s.sort], 1)})
    return wrap(output)
Example #35
def format_list_from_aggop(decoders, aggs, start, query, select):
    agg = aggs
    b = coalesce(agg._filter, agg._nested)
    while b:
        agg = b
        b = coalesce(agg._filter, agg._nested)

    item = Dict()
    for s in select:
        item[s.name] = agg[s.pull]

    return wrap({
        "meta": {"format": "list"},
        "data": [item]
    })
Example #36
def _normalize_edge(edge, schema=None):
    if not _Column:
        _late_import()

    if isinstance(edge, basestring):
        if schema:
            e = schema[edge]
            if e:
                if isinstance(e, _Column):
                    return Dict(name=edge,
                                value=jx_expression(edge),
                                allowNulls=True,
                                domain=_normalize_domain(schema=schema))
                elif isinstance(e.fields, list) and len(e.fields) == 1:
                    return Dict(name=e.name,
                                value=jx_expression(e.fields[0]),
                                allowNulls=True,
                                domain=e.getDomain())
                else:
                    return Dict(name=e.name,
                                allowNulls=True,
                                domain=e.getDomain())
        return Dict(name=edge,
                    value=jx_expression(edge),
                    allowNulls=True,
                    domain=_normalize_domain(schema=schema))
    else:
        edge = wrap(edge)
        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound and complex edges: {{edge}}",
                      edge=edge)

        if isinstance(edge.value, (list, set)) and not edge.domain:
            # COMPLEX EDGE IS SHORT HAND
            domain = _normalize_domain(schema=schema)
            domain.dimension = Dict(fields=edge.value)

            return Dict(name=edge.name,
                        allowNulls=bool(coalesce(edge.allowNulls, True)),
                        domain=domain)

        domain = _normalize_domain(edge.domain, schema=schema)

        return Dict(name=coalesce(edge.name, edge.value),
                    value=jx_expression(edge.value),
                    range=_normalize_range(edge.range),
                    allowNulls=bool(coalesce(edge.allowNulls, True)),
                    domain=domain)
Example #37
    def _convert_edge(self, edge):
        if isinstance(edge, basestring):
            return Dict(
                name=edge,
                value=edge,
                domain=self._convert_domain()
            )
        else:
            edge = wrap(edge)
            if not edge.name and not isinstance(edge.value, basestring):
                Log.error("You must name compound edges: {{edge}}",  edge= edge)

            if isinstance(edge.value, (Mapping, list)) and not edge.domain:
                # COMPLEX EDGE IS SHORT HAND
                domain = self._convert_domain()
                domain.dimension = Dict(fields=edge.value)

                return Dict(
                    name=edge.name,
                    allowNulls=False if edge.allowNulls is False else True,
                    domain=domain
                )

            domain = self._convert_domain(edge.domain)
            return Dict(
                name=coalesce(edge.name, edge.value),
                value=edge.value,
                range=edge.range,
                allowNulls=False if edge.allowNulls is False else True,
                domain=domain
            )
Example #38
    def query(self, sql, param=None):
        """
        RETURN LIST OF dicts
        """
        self._execute_backlog()
        try:
            old_cursor = self.cursor
            if not old_cursor:  # ALLOW NON-TRANSACTIONAL READS
                self.cursor = self.db.cursor()
                self.cursor.execute("SET TIME_ZONE='+00:00'")
                self.cursor.close()
                self.cursor = self.db.cursor()

            if param:
                sql = expand_template(sql, self.quote_param(param))
            sql = self.preamble + outdent(sql)
            if self.debug:
                Log.note("Execute SQL:\n{{sql}}", sql=indent(sql))

            self.cursor.execute(sql)
            columns = [utf8_to_unicode(d[0]) for d in coalesce(self.cursor.description, [])]
            fixed = [[utf8_to_unicode(c) for c in row] for row in self.cursor]
            result = convert.table2list(columns, fixed)

            if not old_cursor:   # CLEANUP AFTER NON-TRANSACTIONAL READS
                self.cursor.close()
                self.cursor = None

            return result
        except Exception, e:
            if isinstance(e, InterfaceError) or e.message.find("InterfaceError") >= 0:
                Log.error("Did you close the db connection?", e)
            Log.error("Problem executing SQL:\n{{sql|indent}}",  sql= sql, cause=e, stack_depth=1)
Example #39
 def __init__(self,
              host,
              index,
              type=None,
              alias=None,
              name=None,
              port=9200,
              settings=None):
     self.settings = settings
     self.name = coalesce(name, alias, index)
     self._es = elasticsearch.Alias(alias=coalesce(alias, index),
                                    settings=settings)
     self.settings.type = self._es.settings.type  # Alias() WILL ASSIGN A TYPE IF IT WAS MISSING
     self.edges = Dict()
     self.worker = None
     self.ready = False
Example #40
def main():

    try:
        settings = startup.read_settings(defs=[{
            "name": ["--id"],
            "help": "id(s) to process.  Use \"..\" for a range.",
            "type": str,
            "dest": "id",
            "required": False
        }])
        constants.set(settings.constants)
        Log.start(settings.debug)

        if settings.args.id:
            etl_one(settings)
            return

        hg = HgMozillaOrg(settings=settings.hg)
        resources = Dict(hg=dictwrap(hg))
        stopper = Signal()
        for i in range(coalesce(settings.param.threads, 1)):
            ETL(
                name="ETL Loop " + unicode(i),
                work_queue=settings.work_queue,
                resources=resources,
                workers=settings.workers,
                settings=settings.param,
                please_stop=stopper
            )

        Thread.wait_for_shutdown_signal(stopper, allow_exit=True)
    except Exception, e:
        Log.error("Problem with etl", e)
Example #41
 def convert(self, expr):
     """
     EXPAND INSTANCES OF name TO value
     """
     if expr is True or expr == None or expr is False:
         return expr
     elif Math.is_number(expr):
         return expr
     elif expr == ".":
         return "."
     elif is_keyword(expr):
         return coalesce(self.dimensions[expr], expr)
     elif isinstance(expr, basestring):
         Log.error("{{name|quote}} is not a valid variable name", name=expr)
     elif isinstance(expr, Date):
         return expr
     elif isinstance(expr, QueryOp):
         return self._convert_query(expr)
     elif isinstance(expr, Mapping):
         if expr["from"]:
             return self._convert_query(expr)
         elif len(expr) >= 2:
             #ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
             return wrap({
                 name: self.convert(value)
                 for name, value in expr.leaves()
             })
         else:
             # ASSUME SINGLE-CLAUSE EXPRESSION
             k, v = expr.items()[0]
             return converter_map.get(k, self._convert_bop)(self, k, v)
     elif isinstance(expr, (list, set, tuple)):
         return wrap([self.convert(value) for value in expr])
     else:
         return expr
Example #42
    def get_meta(self, key, conforming=True):
        try:
            metas = list(self.bucket.list(prefix=key))
            metas = wrap([m for m in metas if m.name.find(".json") != -1])

            perfect = Null
            favorite = Null
            too_many = False
            error = None
            for m in metas:
                try:
                    simple = strip_extension(m.key)
                    if conforming:
                        self._verify_key_format(simple)
                    if simple == key:
                        perfect = m
                        too_many = False
                    if simple.startswith(key + ".") or simple.startswith(key +
                                                                         ":"):
                        if favorite and not perfect:
                            too_many = True
                        favorite = m
                except Exception, e:
                    error = e

            if too_many:
                Log.error(
                    "multiple keys in {{bucket}} with prefix={{prefix|quote}}: {{list}}",
                    bucket=self.name,
                    prefix=key,
                    list=[k.name for k in metas])
            if not perfect and error:
                Log.error("Problem with key request", error)
            return coalesce(perfect, favorite)
Example #43
def es_setop(es, query):
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)

    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.fields = DictList()
    es_query.sort = qb_sort_to_es_sort(query.sort)
    source = "fields"
    for s in select:
        if s.value == "*":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif s.value == ".":
            es_query.fields = None
            es_query.script_fields = None
            source = "_source"
        elif isinstance(s.value, basestring) and is_keyword(s.value):
            es_query.fields.append(s.value)
        elif isinstance(s.value, list) and es_query.fields is not None:
            es_query.fields.extend(s.value)
        else:
            es_query.script_fields[literal_field(s.name)] = {"script": qb_expression_to_ruby(s.value)}

    return extract_rows(es, es_query, source, select, query)
Example #44
def compileDuration2Term(edge):
    if edge.esscript:
        Log.error("edge script not supported yet")

    # IS THERE A LIMIT ON THE DOMAIN?
    numPartitions = len(edge.domain.partitions)
    value = edge.value
    if isKeyword(value):
        value = "doc[\"" + value + "\"].value"

    ref = coalesce(edge.domain.min, edge.domain.max, durations.ZERO)
    nullTest = compileNullTest(edge)

    ms = edge.domain.interval.milli
    if edge.domain.interval.month > 0:
        ms = durations.YEAR.milli / 12 * edge.domain.interval.month

    partition2int = "Math.floor((" + value + "-" + value2MVEL(ref) + ")/" + ms + ")"
    partition2int = "((" + nullTest + ") ? " + numPartitions + " : " + partition2int + ")"

    def int2Partition(value):
        if Math.round(value) == numPartitions:
            return edge.domain.NULL
        return edge.domain.getPartByKey(ref.add(edge.domain.interval.multiply(value)))

    return Dict(toTerm={"head": "", "body": partition2int}, fromTerm=int2Partition)
Example #45
 def __init__(self, edge, query, limit):
     AggsDecoder.__init__(self, edge, query, limit)
     self.domain = edge.domain
     self.domain.limit = Math.min(coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT)
     self.parts = list()
     self.key2index = {}
     self.computed_domain = False
 def __init__(self, host, index, type="log", max_size=1000, batch_size=100, settings=None):
     """
     settings ARE FOR THE ELASTICSEARCH INDEX
     """
     self.es = Cluster(settings).get_or_create_index(
         schema=convert.json2value(convert.value2json(SCHEMA), leaves=True),
         limit_replicas=True,
         tjson=True,
         settings=settings
     )
     self.batch_size = batch_size
     self.es.add_alias(coalesce(settings.alias, settings.index))
     self.queue = Queue("debug logs to es", max=max_size, silent=True)
     self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
     self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
     Thread.run("add debug logs to es", self._insert_loop)
Example #47
 def __init__(self, edge, query):
     AggsDecoder.__init__(self, edge, query)
     self.fields = edge.domain.dimension.fields
     self.domain = self.edge.domain
     self.domain.limit = Math.min(
         coalesce(self.domain.limit, query.limit, 10), MAX_LIMIT)
     self.parts = list()
Example #48
def _normalize_group(edge, schema=None):
    if isinstance(edge, basestring):
        return wrap({
            "name": edge,
            "value": jx_expression(edge),
            "allowNulls": True,
            "domain": {
                "type": "default"
            }
        })
    else:
        edge = wrap(edge)
        if (edge.domain
                and edge.domain.type != "default") or edge.allowNulls != None:
            Log.error("groupby does not accept complicated domains")

        if not edge.name and not isinstance(edge.value, basestring):
            Log.error("You must name compound edges: {{edge}}", edge=edge)

        return wrap({
            "name": coalesce(edge.name, edge.value),
            "value": jx_expression(edge.value),
            "allowNulls": True,
            "domain": {
                "type": "default"
            }
        })
Example #49
    def execute_sql(host,
                    username,
                    password,
                    sql,
                    schema=None,
                    param=None,
                    settings=None):
        """EXECUTE MANY LINES OF SQL (FROM SQLDUMP FILE, MAYBE?"""
        settings.schema = coalesce(settings.schema, settings.database)

        if param:
            with MySQL(settings) as temp:
                sql = expand_template(sql, temp.quote_param(param))

        # We have no way to execute an entire SQL file in bulk, so we
        # have to shell out to the commandline client.
        args = [
            "mysql", "-h{0}".format(settings.host),
            "-u{0}".format(settings.username),
            "-p{0}".format(settings.password)
        ]
        if settings.schema:
            args.append("{0}".format(settings.schema))

        try:
            proc = subprocess.Popen(args,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    bufsize=-1)
            if isinstance(sql, unicode):
                sql = sql.encode("utf8")
            (output, _) = proc.communicate(sql)
        except Exception, e:
            Log.error("Can not call \"mysql\"", e)
Example #50
    def _convert_group(self, column):
        if isinstance(column, basestring):
            return wrap({
                "name": column,
                "value": column,
                "domain": {
                    "type": "default"
                }
            })
        else:
            column = wrap(column)
            if (column.domain and column.domain.type != "default"
                ) or column.allowNulls != None:
                Log.error("groupby does not accept complicated domains")

            if not column.name and not isinstance(column.value, basestring):
                Log.error("You must name compound edges: {{edge}}",
                          edge=column)

            return wrap({
                "name": coalesce(column.name, column.value),
                "value": column.value,
                "domain": {
                    "type": "default"
                }
            })
Example #51
    def _get_from_elasticsearch(self, revision, locale=None):
        rev = revision.changeset.id
        query = {
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"prefix": {"changeset.id": rev[0:12]}},
                    {"term": {"branch.name": revision.branch.name}},
                    {"term": {"branch.locale": coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)}}
                ]}
            }},
            "size": 2000,
        }
        try:
            docs = self.es.search(query, timeout=120).hits.hits
            if len(docs) > 1:
                for d in docs:
                    if d._id.endswith(d._source.branch.locale):
                        return d._source
                Log.warning("expecting no more than one document")

            return docs[0]._source
        except Exception, e:
            Log.warning("Bad ES call", e)
            return None
Beispiel #52
0
 def convert(self, expr):
     """
     EXPAND INSTANCES OF name TO value
     """
     if expr is True or expr == None or expr is False:
         return expr
     elif Math.is_number(expr):
         return expr
     elif expr == ".":
         return "."
     elif is_keyword(expr):
         return coalesce(self.dimensions[expr], expr)
     elif isinstance(expr, basestring):
         Log.error("{{name|quote}} is not a valid variable name", name=expr)
     elif isinstance(expr, Date):
         return expr
     elif isinstance(expr, Query):
         return self._convert_query(expr)
     elif isinstance(expr, Mapping):
         if expr["from"]:
             return self._convert_query(expr)
         elif len(expr) >= 2:
             # ASSUME WE HAVE A NAMED STRUCTURE, NOT AN EXPRESSION
             return wrap({name: self.convert(value) for name, value in expr.leaves()})
         else:
             # ASSUME SINGLE-CLAUSE EXPRESSION
             k, v = expr.items()[0]
             return converter_map.get(k, self._convert_bop)(self, k, v)
     elif isinstance(expr, (list, set, tuple)):
         return wrap([self.convert(value) for value in expr])
     else:
         return expr
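A hedged illustration of the expansion above. converter stands in for whatever object defines convert(), and its dimensions mapping is invented for the example; only the dispatch rules visible in the snippet are relied on.

    # Illustrative only: assume converter.dimensions maps "platform" -> "build.platform".
    converter.convert(42)                              # numbers pass through unchanged
    converter.convert("platform")                      # -> "build.platform" (keyword lookup)
    converter.convert(["platform", "os"])              # lists convert element-wise
    converter.convert({"eq": {"platform": "linux"}})   # single-clause Mapping, dispatched
                                                       # via converter_map.get("eq", _convert_bop)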
Beispiel #53
0
    def _normalize_edge(self, edge):
        """
        RETURN AN EDGE DEFINITION AS A SIMPLE ARRAY OF PATH-LEAF
        DEFINITIONS [ {"name":<pathA>, "value":<pathB>}, ... ]

        USEFUL FOR DECLARING HIGH-LEVEL DIMENSIONS, AND RELIEVING LOW LEVEL PATH PAIRS
        """
        if isinstance(edge, basestring):
            e = self[edge]
            if e:
                domain = e.getDomain()
                fields = domain.dimension.fields

                if isinstance(fields, list):
                    if len(fields) == 1:
                        return [{"value": fields[0]}]
                    else:
                        return [{
                            "name": (edge + "[" + str(i) + "]"),
                            "value": v
                        } for i, v in enumerate(fields)]
                elif isinstance(fields, Mapping):
                    return [{
                        "name": (edge + "." + k),
                        "value": v
                    } for k, v in fields.items()]
                else:
                    Log.error("do not know how to handle")

            return [{"name": edge, "value": edge}]
        else:
            return [{
                "name": coalesce(edge.name, edge.value),
                "value": edge.value
            }]
Beispiel #54
0
    def __init__(self, host, index, alias=None, name=None, port=9200, settings=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.lists import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = settings
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(settings=settings)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta = Dict()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.meta.columns = ListContainer("meta.columns", [], wrap({c.name: c for c in column_columns}))
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Beispiel #55
0
 def backup_name(self, timestamp=None):
     """
     RETURN A FILENAME THAT CAN SERVE AS A BACKUP FOR THIS FILE
     """
     suffix = convert.datetime2string(coalesce(timestamp, datetime.now()),
                                      "%Y%m%d_%H%M%S")
     return File.add_suffix(self._filename, suffix)
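A hedged usage note: File.add_suffix is not shown here, so the exact placement of the timestamp in the returned name is an assumption.

    # Illustrative only: derive a timestamped name before overwriting a config file.
    f = File("settings/config.json")
    print f.backup_name()    # something like "settings/config.20150101_120000.json";
                             # exact form depends on File.add_suffix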
Beispiel #56
0
    def _get_from_elasticsearch(self, revision, locale=None):
        rev = revision.changeset.id
        query = {
            "query": {
                "filtered": {
                    "query": {
                        "match_all": {}
                    },
                    "filter": {
                        "and": [{
                            "prefix": {
                                "changeset.id": rev[0:12]
                            }
                        }, {
                            "term": {
                                "branch.name": revision.branch.name
                            }
                        }, {
                            "term": {
                                "branch.locale":
                                coalesce(locale, DEFAULT_LOCALE)
                            }
                        }]
                    }
                }
            },
            "size": 2000,
        }
        docs = self.es.search(query).hits.hits
        if len(docs) > 1:
            Log.error("expecting no more than one document")

        return docs[0]._source
Beispiel #57
0
def es_fieldop(es, query):
    es_query = es14.util.es_query_template()
    select = listwrap(query.select)
    es_query.query = {
        "filtered": {
            "query": {
                "match_all": {}
            },
            "filter": simplify_esfilter(qb_expression_to_esfilter(query.where))
        }
    }
    es_query.size = coalesce(query.limit, queries.query.DEFAULT_LIMIT)
    es_query.sort = qb_sort_to_es_sort(query.sort)
    es_query.fields = DictList()
    source = "fields"
    for s in select.value:
        if s == "*":
            es_query.fields = None
            source = "_source"
        elif s == ".":
            es_query.fields = None
            source = "_source"
        elif isinstance(s, basestring) and is_keyword(s):
            es_query.fields.append(s)
        elif isinstance(s, list) and es_query.fields is not None:
            es_query.fields.extend(s)
        elif isinstance(s, Mapping) and es_query.fields is not None:
            es_query.fields.extend(s.values())
        elif es_query.fields is not None:
            es_query.fields.append(s)
    es_query.sort = [{s.field: "asc" if s.sort >= 0 else "desc"} for s in query.sort]

    return extract_rows(es, es_query, source, select, query)
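A hedged sketch of the query shape es_fieldop reads. The function only touches query.select, query.where, query.sort and query.limit, so only those are shown; the field names and the es handle are made up, and in the real pipeline the query has normally been normalized before it gets here.

    # Illustrative only: select a few fields from documents matching a filter.
    query = wrap({
        "select": [{"name": "date", "value": "build.date"}],
        "where": {"eq": {"build.platform": "linux64"}},
        "sort": [{"field": "build.date", "sort": -1}],
        "limit": 10
    })
    # rows = es_fieldop(es, query)    # es: an elasticsearch wrapper from the same library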
Beispiel #58
0
    def get_meta(self, key, conforming=True):
        try:
            # key_prefix("2")
            metas = list(self.bucket.list(prefix=key))
            metas = wrap([m for m in metas if m.name.find(".json") != -1])

            perfect = Null
            favorite = Null
            too_many = False
            error = None
            for m in metas:
                try:
                    simple = strip_extension(m.key)
                    if conforming:
                        self._verify_key_format(simple)
                    if simple == key:
                        perfect = m
                        too_many = False
                    if simple.startswith(key + ".") or simple.startswith(key + ":"):
                        if favorite and not perfect:
                            too_many = True
                        favorite = m
                except Exception, e:
                    error = e

            if too_many:
                Log.error(
                    "multiple keys in {{bucket}} with prefix={{prefix|quote}}: {{list}}",
                    bucket=self.name,
                    prefix=key,
                    list=[k.name for k in metas]
                )
            if not perfect and error:
                Log.error("Problem with key request", error)
            return coalesce(perfect, favorite)