Example #1
def find_container(frum, after):
    """
    :param frum:
    :return:
    """
    global namespace
    if not namespace:
        if not container.config.default.settings:
            Log.error(
                "expecting jx_base.container.config.default.settings to contain default elasticsearch connection info"
            )
        namespace = ElasticsearchMetadata(container.config.default.settings)
    if not frum:
        Log.error("expecting json query expression with from clause")

    # FORCE A RELOAD
    namespace.get_columns(frum, after=after)

    if is_text(frum):
        if frum in container_cache:
            return container_cache[frum]

        path = split_field(frum)
        if path[0] == "meta":
            if path[1] == "columns":
                return namespace.meta.columns.denormalized()
            elif path[1] == "tables":
                return namespace.meta.tables
            else:
                fact_table_name = join_field(path[:2])
        else:
            fact_table_name = path[0]

        type_ = container.config.default.type

        settings = set_default(
            {
                "alias": fact_table_name,
                "name": frum,
                "exists": True
            },
            container.config.default.settings,
        )
        settings.type = None
        output = container.type2container[type_](settings)
        container_cache[frum] = output
        return output
    elif is_data(frum) and frum.type and container.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        return container.type2container[frum.type](frum.settings)
    elif is_data(frum) and (frum["from"] or is_container(frum["from"])):
        from jx_base.query import QueryOp

        return QueryOp.wrap(frum)
    elif is_container(frum):
        return ListContainer("test_list", frum)
    else:
        return frum
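
For orientation, a minimal call sketch (the table name is illustrative, not from the source):

    # Resolve a fact table name to a queryable container, forcing a column reload:
    table = find_container("unittest", after=Date.now())

    # "meta.columns" and "meta.tables" short-circuit to the namespace metadata:
    columns = find_container("meta.columns", after=None)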
Example #2
    def fill_container(self, subtest, typed=True):
        """
        RETURN SETTINGS THAT CAN BE USED TO POINT TO THE INDEX THAT'S FILLED
        """
        subtest = wrap(subtest)
        _settings = self._es_test_settings  # ALREADY COPIED AT setUp()

        try:
            url = "file://resources/schema/basic_schema.json.template?{{.|url}}"
            url = expand_template(url, {
                "type": _settings.type,
                "metadata": subtest.metadata
            })
            _settings.schema = mo_json_config.get(url)

            # MAKE CONTAINER
            container = self._es_cluster.get_or_create_index(
                typed=typed,
                schema=subtest.schema or _settings.schema,
                kwargs=_settings
            )
            container.add_alias(_settings.index)

            _settings.alias = container.settings.alias
            _settings.index = container.settings.index
            ESUtils.indexes.append(_settings.index)

            # INSERT DATA
            if '"null"' in value2json(subtest.data):
                Log.error("not expected")
            container.extend([{"value": d} for d in subtest.data])
            container.flush()

            now = Date.now()
            namespace = ElasticsearchMetadata(self._es_cluster.settings)
            namespace.get_columns(_settings.alias, after=now)  # FORCE A RELOAD

            # ENSURE query POINTS TO CONTAINER
            frum = subtest.query["from"]
            if frum == None:
                subtest.query["from"] = _settings.alias
            elif is_text(frum):
                subtest.query["from"] = frum.replace(test_jx.TEST_TABLE, _settings.alias)
            else:
                Log.error("Do not know how to handle")

        except Exception as e:
            Log.error("can not load {{data}} into container", data=subtest.data, cause=e)

        return _settings
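
A sketch of the subtest structure this helper consumes, inferred from the body above (field names taken from the code, values illustrative):

    subtest = wrap({
        "metadata": {},                        # substituted into the schema template URL
        "schema": None,                        # optional override for the template schema
        "data": [{"a": 1}, {"a": 2}],          # rows, each inserted as {"value": d}
        "query": {"from": test_jx.TEST_TABLE}  # "from" is rewritten to the filled index
    })
    settings = self.fill_container(subtest, typed=True)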
Example #3
    def __new__(cls, *args, **kwargs):
        if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta":
            output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs)
            output.__init__(*args, **kwargs)
            return output
        else:
            return Container.__new__(cls)
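
The effect of this dispatch, sketched with the ES52 subclass that carries the same __new__ below (host and index values are illustrative):

    meta = ES52(host="http://localhost:9200", index="meta")      # an ElasticsearchMetadata
    data = ES52(host="http://localhost:9200", index="unittest")  # a regular container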
Example #4
    def __init__(
            self,
            host,
            index,
            type=None,
            alias=None,
            name=None,
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, alias, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=coalesce(alias, index),
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self._namespace.get_snowflake(
            self.es.settings.alias).columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed
Example #5
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed
Example #6
class ES52(Container):
    """
    SEND jx QUERIES TO ElasticSearch
    """

    def __new__(cls, *args, **kwargs):
        if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta":
            output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs)
            output.__init__(*args, **kwargs)
            return output
        else:
            return Container.__new__(cls)

    @override
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed

    @property
    def snowflake(self):
        return self._namespace.get_snowflake(self.es.settings.alias)

    @property
    def namespace(self):
        return self._namespace

    def get_table(self, full_name):
        return Table(full_name, self)

    def get_schema(self, query_path):
        return self._namespace.get_schema(query_path)

    def __data__(self):
        settings = self.settings.copy()
        settings.settings = None
        return settings

    def __enter__(self):
        Log.error("No longer used")
        return self

    def __exit__(self, type, value, traceback):
        if not self.worker:
            return

        if isinstance(value, Exception):
            self.worker.stop()
            self.worker.join()
        else:
            self.worker.join()

    @property
    def query_path(self):
        return join_field(split_field(self.name)[1:])

    @property
    def url(self):
        return self.es.url

    def query(self, _query):
        try:
            query = QueryOp.wrap(_query, container=self, namespace=self.namespace)

            for s in listwrap(query.select):
                if s.aggregate != None and not aggregates.get(s.aggregate):
                    Log.error(
                        "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                        name=s.name,
                        aggregate=s.aggregate
                    )

            frum = query["from"]
            if isinstance(frum, QueryOp):
                result = self.query(frum)
                q2 = query.copy()
                q2.frum = result
                return jx.run(q2)

            if is_deepop(self.es, query):
                return es_deepop(self.es, query)
            if is_aggsop(self.es, query):
                return es_aggsop(self.es, frum, query)
            if is_setop(self.es, query):
                return es_setop(self.es, query)
            Log.error("Can not handle")
        except Exception as e:
            e = Except.wrap(e)
            if "Data too large, data for" in e:
                http.post(self.es.cluster.url / "_cache/clear")
                Log.error("Problem (Tried to clear Elasticsearch cache)", e)
            Log.error("problem", e)

    def addDimension(self, dim):
        if isinstance(dim, list):
            Log.error("Expecting dimension to be a object, not a list:\n{{dim}}",  dim= dim)
        self._addDimension(dim, [])

    def _addDimension(self, dim, path):
        dim.full_name = dim.name
        for e in dim.edges:
            d = Dimension(e, dim, self)
            self.edges[d.full_name] = d

    def __getitem__(self, item):
        c = self.get_columns(table_name=self.name, column_name=item)
        if c:
            if len(c) > 1:
                Log.error("Do not know how to handle multipole matches")
            return c[0]

        e = self.edges[item]
        if not c:
            Log.warning("Column with name {{column|quote}} can not be found in {{table}}", column=item, table=self.name)
        return e

    def __getattr__(self, item):
        return self.edges[item]

    def update(self, command):
        """
        EXPECTING command == {"set":term, "where":where}
        THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
        THE where CLAUSE IS AN ES FILTER
        """
        command = wrap(command)
        table = self.get_table(command['update'])

        es_index = self.es.cluster.get_index(read_only=False, alias=None, kwargs=self.es.settings)

        schema = table.schema
        es_filter = jx_expression(command.where).to_esfilter(schema)

        # GET IDS OF DOCUMENTS
        query = {
            "from": command['update'],
            "select": ["_id"] + [
                {"name": k, "value": v}
                for k, v in command.set.items()
            ],
            "where": command.where,
            "format": "list",
            "limit": 10000
        }

        results = self.query(query)

        if results.data:
            content = "".join(
                t
                for r in results.data
                for _id, row in [(r._id, r)]
                for _ in [row.__setitem__('_id', None)]  # WARNING! DESTRUCTIVE TO row
                for update in map(value2json, ({"update": {"_id": _id}}, {"doc": row}))
                for t in (update, "\n")
            )
            response = self.es.cluster.post(
                es_index.path + "/" + "_bulk",
                data=content,
                headers={"Content-Type": "application/json"},
                timeout=self.settings.timeout,
                params={"wait_for_active_shards": self.settings.wait_for_active_shards}
            )
            if response.errors:
                Log.error("could not update: {{error}}", error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)])

        # DELETE BY QUERY, IF NEEDED
        if '.' in listwrap(command.clear):
            self.es.delete_record(es_filter)
            return

        es_index.flush()
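
A minimal usage sketch for this container (host, index, and field names are illustrative):

    es52 = ES52(host="http://localhost:9200", index="unittest")
    result = es52.query({
        "from": "unittest",
        "select": [{"name": "num", "value": "a", "aggregate": "count"}],
        "where": {"gt": {"a": 0}},
        "format": "list"
    })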
Example #7
class ES52(Container):
    """
    SEND jx QUERIES TO ElasticSearch
    """
    def __new__(cls, *args, **kwargs):
        if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta":
            output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs)
            output.__init__(*args, **kwargs)
            return output
        else:
            return Container.__new__(cls)

    @override
    def __init__(
            self,
            host,
            index,
            type=None,
            alias=None,
            name=None,
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, alias, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=coalesce(alias, index),
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self._namespace.get_snowflake(
            self.es.settings.alias).columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed

    @property
    def snowflake(self):
        return self._namespace.get_snowflake(self.es.settings.alias)

    @property
    def namespace(self):
        return self._namespace

    def get_table(self, full_name):
        return Table(full_name, self)

    def get_schema(self, query_path):
        return self._namespace.get_schema(query_path)

    def __data__(self):
        settings = self.settings.copy()
        settings.settings = None
        return settings

    def __enter__(self):
        Log.error("No longer used")
        return self

    def __exit__(self, type, value, traceback):
        if not self.worker:
            return

        if isinstance(value, Exception):
            self.worker.stop()
            self.worker.join()
        else:
            self.worker.join()

    @property
    def query_path(self):
        return join_field(split_field(self.name)[1:])

    @property
    def url(self):
        return self.es.url

    def query(self, _query):
        try:
            query = QueryOp.wrap(_query,
                                 container=self,
                                 namespace=self.namespace)

            for s in listwrap(query.select):
                if s.aggregate != None and not aggregates.get(s.aggregate):
                    Log.error(
                        "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                        name=s.name,
                        aggregate=s.aggregate)

            frum = query["from"]
            if isinstance(frum, QueryOp):
                result = self.query(frum)
                q2 = query.copy()
                q2.frum = result
                return jx.run(q2)

            if is_deepop(self.es, query):
                return es_deepop(self.es, query)
            if is_aggsop(self.es, query):
                return es_aggsop(self.es, frum, query)
            if is_setop(self.es, query):
                return es_setop(self.es, query)
            Log.error("Can not handle")
        except Exception as e:
            e = Except.wrap(e)
            if "Data too large, data for" in e:
                http.post(self.es.cluster.path + "/_cache/clear")
                Log.error("Problem (Tried to clear Elasticsearch cache)", e)
            Log.error("problem", e)

    def addDimension(self, dim):
        if isinstance(dim, list):
            Log.error(
                "Expecting dimension to be a object, not a list:\n{{dim}}",
                dim=dim)
        self._addDimension(dim, [])

    def _addDimension(self, dim, path):
        dim.full_name = dim.name
        for e in dim.edges:
            d = Dimension(e, dim, self)
            self.edges[d.full_name] = d

    def __getitem__(self, item):
        c = self.get_columns(table_name=self.name, column_name=item)
        if c:
            if len(c) > 1:
                Log.error("Do not know how to handle multipole matches")
            return c[0]

        e = self.edges[item]
        if not c:
            Log.warning(
                "Column with name {{column|quote}} can not be found in {{table}}",
                column=item,
                table=self.name)
        return e

    def __getattr__(self, item):
        return self.edges[item]

    def update(self, command):
        """
        EXPECTING command == {"set":term, "where":where}
        THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
        THE where CLAUSE IS AN ES FILTER
        """
        command = wrap(command)
        schema = self.es.get_properties()

        # GET IDS OF DOCUMENTS
        results = self.es.search({
            "stored_fields": listwrap(schema._routing.path),
            "query": {
                "bool": {
                    "filter": jx_expression(command.where).to_esfilter(Null)
                }
            },
            "size": 10000
        })

        # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
        scripts = FlatList()
        for k, v in command.set.items():
            if not is_variable_name(k):
                Log.error("Only support simple paths for now")
            if isinstance(v, Mapping) and v.doc:
                scripts.append({"doc": v.doc})
            else:
                v = scrub(v)
                scripts.append({
                    "script": "ctx._source." + k + " = " + jx_expression(v).to_es_script(schema).script(schema)
                })

        if results.hits.hits:
            updates = []
            for h in results.hits.hits:
                for s in scripts:
                    updates.append({
                        "update": {
                            "_id": h._id,
                            "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])
                        }
                    })
                    updates.append(s)
            content = ("\n".join(value2json(c) for c in updates) + "\n")
            response = self.es.cluster.post(
                self.es.path + "/_bulk",
                data=content,
                headers={"Content-Type": "application/json"},
                timeout=self.settings.timeout,
                params={"wait_for_active_shards": self.settings.wait_for_active_shards}
            )
            if response.errors:
                Log.error("could not update: {{error}}",
                          error=[
                              e.error for i in response["items"]
                              for e in i.values() if e.status not in (200, 201)
                          ])
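
The _bulk body assembled above alternates action and payload lines; for one matched document and one script it would look roughly like this (values illustrative):

    {"update": {"_id": "abc123", "_routing": null}}
    {"script": "ctx._source.status = \"done\""}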
Example #8
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if not v:
                    return []
                else:
                    return [v] + nested_path_of(all_paths[v])

            all_steps = sort_using_key(set(step for path in self.snowflake.query_paths for step in path), key=lambda p: len(split_field(p)))
            for step in all_steps:
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                nested_path = nested_path_of(all_paths[p])
                if not nested_path:
                    nested_path = ['.']
                self.namespace.meta.columns.add(Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=OBJECT,
                    jx_type=EXISTS,
                    nested_path=nested_path,
                    last_updated=Date.now()
                ))
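
The all_paths loop above builds a parent map by giving each path the longest already-registered prefix as its parent; a small trace with illustrative paths:

    # query paths ".", "a", "a.b", processed shortest-first:
    #   all_paths = {".": None}
    #   "a"   -> best prefix "."  -> all_paths["a"] = "."
    #   "a.b" -> best prefix "a"  -> all_paths["a.b"] = "a"
    # the existence column for "a.b" then gets nested_path ["a", "."]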
Example #9
    def __init__(
            self,
            host,
            index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
            alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
            type=None,
            name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
        self.worker = None
        self.settings = kwargs
        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.name = name = self._namespace._find_alias(
            coalesce(alias, index, name))
        if read_only:
            self.es = elasticsearch.Alias(alias=name,
                                          index=None,
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._ensure_max_result_window_set(name)
        self.settings.type = self.es.settings.type
        self.stats = QueryStats(self.es.cluster)

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if v == '.':
                    return ('.', )
                return (v, ) + nested_path_of(all_paths[v])

            query_paths = sort_using_key(
                set(step for path in self.snowflake.query_paths for step in path),
                key=lambda p: len(split_field(p))
            )
            for step in query_paths:
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                if p == ".":
                    nested_path = ('.', )
                else:
                    nested_path = nested_path_of(p)[1:]

                jx_type = (OBJECT if p == "." else NESTED)
                self.namespace.meta.columns.add(
                    Column(name=p,
                           es_column=p,
                           es_index=self.name,
                           es_type=jx_type,
                           jx_type=jx_type,
                           cardinality=1,
                           nested_path=nested_path,
                           multi=1001 if jx_type is NESTED else 1,
                           last_updated=Date.now()))
Example #10
class ES52(Container):
    """
    SEND jx QUERIES TO ElasticSearch
    """
    def __new__(cls, *args, **kwargs):
        if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta":
            output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs)
            output.__init__(*args, **kwargs)
            return output
        else:
            return Container.__new__(cls)

    @override
    def __init__(
            self,
            host,
            index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
            alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
            type=None,
            name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
        self.worker = None
        self.settings = kwargs
        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.name = name = self._namespace._find_alias(
            coalesce(alias, index, name))
        if read_only:
            self.es = elasticsearch.Alias(alias=name,
                                          index=None,
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._ensure_max_result_window_set(name)
        self.settings.type = self.es.settings.type
        self.stats = QueryStats(self.es.cluster)

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if v == '.':
                    return ('.', )
                return (v, ) + nested_path_of(all_paths[v])

            query_paths = sort_using_key(
                set(step for path in self.snowflake.query_paths for step in path),
                key=lambda p: len(split_field(p))
            )
            for step in query_paths:
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                if p == ".":
                    nested_path = ('.', )
                else:
                    nested_path = nested_path_of(p)[1:]

                jx_type = (OBJECT if p == "." else NESTED)
                self.namespace.meta.columns.add(
                    Column(name=p,
                           es_column=p,
                           es_index=self.name,
                           es_type=jx_type,
                           jx_type=jx_type,
                           cardinality=1,
                           nested_path=nested_path,
                           multi=1001 if jx_type is NESTED else 1,
                           last_updated=Date.now()))

    @property
    def snowflake(self):
        return self._namespace.get_snowflake(self.es.settings.alias)

    @property
    def namespace(self):
        return self._namespace

    def get_table(self, full_name):
        return Table(full_name, self)

    def get_schema(self, query_path):
        return self._namespace.get_schema(query_path)

    def __data__(self):
        settings = self.settings.copy()
        settings.settings = None
        return settings

    @property
    def url(self):
        return self.es.url

    def _ensure_max_result_window_set(self, name):
        # TODO : CHECK IF THIS IS ALREADY SET, IT TAKES TOO LONG
        for i, s in self.es.cluster.get_metadata().indices.items():
            if name == i or name in s.aliases:
                if s.settings.index.max_result_window != '100000' or s.settings.index.max_inner_result_window != '100000':
                    Log.note("setting max_result_window")
                    self.es.cluster.put("/" + name + "/_settings",
                                        data={
                                            "index": {
                                                "max_inner_result_window":
                                                100000,
                                                "max_result_window": 100000
                                            }
                                        })
                    break

    def query(self, _query):
        try:
            query = QueryOp.wrap(_query,
                                 container=self,
                                 namespace=self.namespace)

            self.stats.record(query)

            for s in listwrap(query.select):
                if s.aggregate != None and not aggregates.get(s.aggregate):
                    Log.error(
                        "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                        name=s.name,
                        aggregate=s.aggregate)

            frum = query["from"]
            if is_op(frum, QueryOp):
                result = self.query(frum)
                q2 = query.copy()
                q2.frum = result
                return jx.run(q2)

            if is_bulk_agg(self.es, query):
                return es_bulkaggsop(self, frum, query)
            if is_bulk_set(self.es, query):
                return es_bulksetop(self, frum, query)

            query.limit = temper_limit(query.limit, query)

            if is_deepop(self.es, query):
                return es_deepop(self.es, query)
            if is_aggsop(self.es, query):
                return es_aggsop(self.es, frum, query)
            if is_setop(self.es, query):
                return es_setop(self.es, query)
            Log.error("Can not handle")
        except Exception as e:
            e = Except.wrap(e)
            if "Data too large, data for" in e:
                http.post(self.es.cluster.url / "_cache/clear")
                Log.error("Problem (Tried to clear Elasticsearch cache)", e)
            Log.error("problem", e)

    def update(self, command):
        """
        EXPECTING command == {"set":term, "where":where}
        THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
        THE where CLAUSE IS AN ES FILTER
        """
        command = to_data(command)
        table = self.get_table(command['update'])

        es_index = self.es.cluster.get_index(read_only=False,
                                             alias=None,
                                             kwargs=self.es.settings)

        schema = table.schema

        # GET IDS OF DOCUMENTS
        query = {
            "from": command['update'],
            "select": [{"value": "_id"}] + [
                {"name": k, "value": v}
                for k, v in command.set.items()
            ],
            "where": command.where,
            "format": "list",
            "limit": 10000
        }

        results = self.query(query)

        if results.data:
            content = "".join(t for r in results.data
                              for _id, row in [(r._id, r)]
                              for _ in [row.__setitem__('_id', None)
                                        ]  # WARNING! DESTRUCTIVE TO row
                              for update in map(value2json, ({
                                  "update": {
                                      "_id": _id
                                  }
                              }, {
                                  "doc": row
                              })) for t in (update, "\n"))
            response = self.es.cluster.post(
                es_index.path + "/" + "_bulk",
                data=content,
                timeout=self.settings.timeout,
                params={"wait_for_active_shards": self.settings.wait_for_active_shards}
            )
            if response.errors:
                Log.error("could not update: {{error}}",
                          error=[
                              e.error for i in response["items"]
                              for e in i.values() if e.status not in (200, 201)
                          ])

        # DELETE BY QUERY, IF NEEDED
        if "." in listwrap(command['clear']):
            es_filter = ES52Lang[jx_expression(command.where)].partial_eval().to_es(schema)
            self.es.delete_record(es_filter)
            return

        es_index.refresh()
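
A usage sketch for this bulk-capable variant (index and field names illustrative):

    es52.update({
        "update": "unittest",
        "set": {"status": "done"},
        "where": {"eq": {"owner": "kyle"}},
        "clear": "."  # also delete the matched documents by query
    })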
Example #11
class ES14(Container):
    """
    SEND jx QUERIES TO ElasticSearch
    """

    def __new__(cls, *args, **kwargs):
        if (len(args) == 1 and args[0].get("index") == "meta") or kwargs.get("index") == "meta":
            output = ElasticsearchMetadata.__new__(ElasticsearchMetadata, *args, **kwargs)
            output.__init__(*args, **kwargs)
            return output
        else:
            return Container.__new__(cls)

    @override
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self._namespace.get_snowflake(self.es.settings.alias).columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed

    @property
    def snowflake(self):
        return self._namespace.get_snowflake(self.es.settings.alias)

    @property
    def namespace(self):
        return self._namespace


    def get_table(self, full_name):
        return Table(full_name, self)

    def get_schema(self, query_path):
        return self._namespace.get_schema(query_path)

    def __data__(self):
        settings = self.settings.copy()
        settings.settings = None
        return settings

    def __enter__(self):
        Log.error("No longer used")
        return self

    def __exit__(self, type, value, traceback):
        if not self.worker:
            return

        if isinstance(value, Exception):
            self.worker.stop()
            self.worker.join()
        else:
            self.worker.join()

    @property
    def query_path(self):
        return join_field(split_field(self.name)[1:])

    @property
    def url(self):
        return self.es.url

    def query(self, _query):
        try:
            query = QueryOp.wrap(_query, container=self, namespace=self.namespace)

            for s in listwrap(query.select):
                if s.aggregate != None and not aggregates.get(s.aggregate):
                    Log.error(
                        "ES can not aggregate {{name}} because {{aggregate|quote}} is not a recognized aggregate",
                        name=s.name,
                        aggregate=s.aggregate
                    )

            frum = query["from"]
            if isinstance(frum, QueryOp):
                result = self.query(frum)
                q2 = query.copy()
                q2.frum = result
                return jx.run(q2)

            if is_deepop(self.es, query):
                return es_deepop(self.es, query)
            if is_aggsop(self.es, query):
                return es_aggsop(self.es, frum, query)
            if is_setop(self.es, query):
                return es_setop(self.es, query)
            Log.error("Can not handle")
        except Exception as e:
            e = Except.wrap(e)
            if "Data too large, data for" in e:
                http.post(self.es.cluster.url / "_cache/clear")
                Log.error("Problem (Tried to clear Elasticsearch cache)", e)
            Log.error("problem", e)

    def addDimension(self, dim):
        if isinstance(dim, list):
            Log.error("Expecting dimension to be a object, not a list:\n{{dim}}",  dim= dim)
        self._addDimension(dim, [])

    def _addDimension(self, dim, path):
        dim.full_name = dim.name
        for e in dim.edges:
            d = Dimension(e, dim, self)
            self.edges[d.full_name] = d

    def __getitem__(self, item):
        c = self.get_columns(table_name=self.name, column_name=item)
        if c:
            if len(c) > 1:
                Log.error("Do not know how to handle multipole matches")
            return c[0]

        e = self.edges[item]
        if not c:
            Log.warning("Column with name {{column|quote}} can not be found in {{table}}", column=item, table=self.name)
        return e

    def __getattr__(self, item):
        return self.edges[item]

    def update(self, command):
        """
        EXPECTING command == {"set":term, "where":where}
        THE set CLAUSE IS A DICT MAPPING NAMES TO VALUES
        THE where CLAUSE IS AN ES FILTER
        """
        command = wrap(command)
        schema = self.es.get_properties()

        # GET IDS OF DOCUMENTS
        results = self.es.search({
            "fields": listwrap(schema._routing.path),
            "query": {"filtered": {
                "filter": jx_expression(command.where).to_esfilter(Null)
            }},
            "size": 10000
        })

        # SCRIPT IS SAME FOR ALL (CAN ONLY HANDLE ASSIGNMENT TO CONSTANT)
        scripts = FlatList()
        for k, v in command.set.items():
            if not is_variable_name(k):
                Log.error("Only support simple paths for now")
            if isinstance(v, Mapping) and v.doc:
                scripts.append({"doc": v.doc})
            else:
                v = scrub(v)
                scripts.append({"script": "ctx._source." + k + " = " + jx_expression(v).to_es_script(schema).script(schema)})

        if results.hits.hits:
            updates = []
            for h in results.hits.hits:
                for s in scripts:
                    updates.append({"update": {"_id": h._id, "_routing": unwraplist(h.fields[literal_field(schema._routing.path)])}})
                    updates.append(s)
            content = ("\n".join(value2json(c) for c in updates) + "\n")
            response = self.es.cluster.post(
                self.es.path + "/_bulk",
                data=content,
                headers={"Content-Type": "application/json"},
                timeout=self.settings.timeout,
                params={"wait_for_active_shards": self.settings.wait_for_active_shards}
            )
            if response.errors:
                Log.error("could not update: {{error}}", error=[e.error for i in response["items"] for e in i.values() if e.status not in (200, 201)])
Example #12
    def __init__(
        self,
        host,
        index,
        type=None,
        name=None,
        port=9200,
        read_only=True,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
        typed=None,
        kwargs=None
    ):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.settings = kwargs
        self.name = name = coalesce(name, index)
        if read_only:
            self.es = elasticsearch.Alias(alias=index, kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(read_only=read_only, kwargs=kwargs)

        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.settings.type = self.es.settings.type
        self.edges = Data()
        self.worker = None

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error("Expecting given typed {{typed}} to match {{is_typed}}", typed=typed, is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {".": None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if not v:
                    return []
                else:
                    return [v] + nested_path_of(all_paths[v])

            all_steps = sort_using_key(set(step for path in self.snowflake.query_paths for step in path), key=lambda p: len(split_field(p)))
            for step in all_steps:
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                nested_path = nested_path_of(all_paths[p])
                if not nested_path:
                    nested_path = ['.']
                self.namespace.meta.columns.add(Column(
                    name=p,
                    es_column=p,
                    es_index=self.name,
                    es_type=OBJECT,
                    jx_type=EXISTS,
                    nested_path=nested_path,
                    last_updated=Date.now()
                ))
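
A short trace of nested_path_of over a toy parent map (paths illustrative):

    # all_paths = {".": None, "a": ".", "a.b": "a"}
    # nested_path_of(all_paths["a.b"]) == nested_path_of("a") == ["a", "."]
    # so the column registered for "a.b" carries nested_path ["a", "."],
    # while "." itself falls back to ['.']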