Example #1
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [
            {
                "host": host,
                "port": port,
            }
        ]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout)
                )
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())

        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []

        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    },
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
                "crawler_metadata": {
                    "properties": {
                        "last_commit_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "last_post_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "total_docs_posted": {"type": "integer"},
                        "total_changes_updated": {"type": "integer"},
                        "total_orphans_updated": {"type": "integer"},
                    }
                },
                "tasks_data": {
                    "properties": {
                        "tid": {"type": "keyword"},
                        "ttype": {"type": "keyword"},
                        "crawler_name": {"type": "keyword"},
                        "updated_at": {"type": "date", "format": "date_time_no_millis"},
                        "change_url": {"type": "keyword"},
                        "severity": {"type": "keyword"},
                        "priority": {"type": "keyword"},
                        "score": {"type": "integer"},
                        "url": {"type": "keyword"},
                        "title": {
                            "type": "text",
                            "fields": {
                                "keyword": {"type": "keyword", "ignore_above": 8191}
                            },
                        },
                        "_adopted": {"type": "boolean"},
                    }
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo query fails in some contexts when the time slice is
        # large: "Must be less than or equal to: [10000] but was [10001]. This limit
        # can be set by changing the [search.max_buckets] cluster level setting."
        # Raising the limit below is an attempt to mitigate that issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def update_task_data(
        self,
        source_it: Union[
            List[TaskDataForEL],
            List[OrphanTaskDataForEL],
            List[AdoptedTaskDataForEL],
        ],
    ) -> Optional[BulkIndexError]:
        def gen(it):
            for _source in it:
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = _source._id
                d["doc"] = {}
                d["doc"].update({"id": _source._id})
                if isinstance(_source, TaskDataForEL):
                    d["doc"].update(
                        {"tasks_data": [asdict(td) for td in _source.tasks_data]}
                    )
                if isinstance(_source, OrphanTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                    d["doc"]["type"] = "OrphanTaskData"
                if isinstance(_source, AdoptedTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                d["doc_as_upsert"] = True
                yield d

        ret = None
        try:
            bulk(self.es, gen(source_it))
        except BulkIndexError as err:
            ret = err
        self.es.indices.refresh(index=self.index)
        return ret

    def compute_crawler_id_by_name(self, name, _type):
        return "crawler/%s/%s" % (_type, name)

    def get_task_crawler_metadata(self, name: str) -> Dict:
        try:
            ret = self.es.get(
                self.index, self.compute_crawler_id_by_name(name, "tasks_crawler")
            )
            return ret["_source"]["crawler_metadata"]
        except Exception:
            return {}

    def set_task_crawler_metadata(
        self,
        name: str,
        commit_date: Optional[datetime] = None,
        push_infos: Optional[Dict] = None,
    ):
        metadata = {}
        if commit_date:
            metadata.update({"last_commit_at": commit_date})
        if push_infos:
            prev_metadata = self.get_task_crawler_metadata(name)
            metadata.update(
                {
                    "last_post_at": push_infos["last_post_at"],
                    "total_docs_posted": prev_metadata.get("total_docs_posted", 0)
                    + push_infos["total_docs_posted"],
                    "total_changes_updated": prev_metadata.get(
                        "total_changes_updated", 0
                    )
                    + push_infos["total_changes_updated"],
                    "total_orphans_updated": prev_metadata.get(
                        "total_orphans_updated", 0
                    )
                    + push_infos["total_orphans_updated"],
                }
            )
        body = {
            "doc": {"type": "TaskCrawlerDataCommit", "crawler_metadata": metadata},
            "doc_as_upsert": True,
        }
        ret = None
        try:
            self.es.update(
                self.index,
                self.compute_crawler_id_by_name(name, "tasks_crawler"),
                body=body,
            )
            self.es.indices.refresh(index=self.index)
        except Exception as err:
            ret = err
        return ret

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {
                            "regexp": {
                                "repository_fullname": {"value": repository_fullname}
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def get_changes_by_url(self, change_urls, size):
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"type": "Change"}},
                            {"terms": {"url": change_urls}},
                        ]
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_by_change_urls(self, change_urls):
        assert len(change_urls) <= 50
        size = 5000  # Assuming no more than 100 task data items related to a change
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "must_not": {"exists": {"field": "tasks_data._adopted"}},
                        "filter": [
                            {"term": {"type": "OrphanTaskData"}},
                            {"terms": {"tasks_data.change_url": change_urls}},
                        ],
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_and_declare_adpotion(self, changes_url):
        assert len(changes_url) <= 50
        tds = self.get_orphan_tds_by_change_urls(changes_url)
        if tds:
            adopted_tds = [
                AdoptedTaskDataForEL(
                    _id=td["id"],
                    task_data=AdoptedTaskData(_adopted=True),
                )
                for td in tds
            ]
            self.update_task_data(adopted_tds)
        return tds

    def update_changes_with_orphan_tds(self, mapping: Dict[str, str]):
        change_urls = list(mapping.keys())
        while change_urls:
            change_urls_to_process = change_urls[:50]
            change_urls = change_urls[50:]
            tds = self.get_orphan_tds_and_declare_adpotion(change_urls_to_process)
            # Group tds in buckets by change_url
            _map: Dict[str, List] = dict()
            for td in tds:
                _map.setdefault(td["tasks_data"]["change_url"], []).append(
                    td["tasks_data"]
                )
            # Create update docs to attach tds to matching changes
            to_update = []
            for change_url, tds in _map.items():
                to_update.append(
                    TaskDataForEL(
                        _id=mapping[change_url],
                        tasks_data=createELTaskData(tds),
                    )
                )
            self.update_task_data(to_update)

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:

        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:

            prev_hash = get_obj_hash(obj)

            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            if obj["type"] in get_events_list():
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = prev_hash != get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)

        bulk_update(to_update)
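
A minimal usage sketch for ELmonocleDB (not part of the original module): it assumes the monocle imports used by the class are in scope, an Elasticsearch server is reachable at localhost:9200, and the index name, repository name and URLs are hypothetical.

# Usage sketch, assumptions as noted above.
db = ELmonocleDB(elastic_conn="localhost:9200", index="demo", create=True)

# `changes` would be Change/Event objects produced by a crawler (hypothetical here):
# db.update(changes)

# Query helpers return plain `_source` dicts, or empty values on error.
last = db.get_last_updated("acme/widgets")
docs = db.get_changes_by_url(["https://example.com/acme/widgets/pull/1"], size=10)
print(last, docs)
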
Example #2
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend tries to configure the client.
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform
    # server-related queries, such as "ping" or "info". The connection wrapper
    # acts as a proxy for them.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but they can be used.
    # Since it can make sense not to give an index, developers are free to use
    # these methods as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(self.indices, doc_type, body, doc_id,
                                  **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(self.indices, doc_type, body, doc_id,
                                 **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(self.indices, doc_type, doc_id, body,
                                  **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(self.indices, doc_type, body,
                                           **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(self.indices, doc_type, doc_id, body,
                                   **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(self.indices, doc_type, body,
                                           **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(self.indices, doc_type, doc_id, body,
                                     **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(self.indices, doc_type, doc_id,
                                           body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(self.indices, doc_type, doc_id, body,
                                      **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
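
To make the configuration entry points from the class docstring concrete, here is a hedged sketch of a subclass that supplies a connection class. It assumes an elasticsearch-py release that still ships ``Urllib3HttpConnection`` (pre-8.x); the server settings shown in the comments are hypothetical.

from elasticsearch import Urllib3HttpConnection


class Urllib3ElasticsearchBackend(BaseElasticsearchBackend):
    # Subclasses only have to name the connection class; transport_class is
    # inherited from the base wrapper.
    connection_class = Urllib3HttpConnection


# With hypothetical settings such as
#   server = {'HOSTS': ['localhost:9200'], 'PARAMS': {'timeout': 30}}
# configure_client() would build the Elasticsearch client from HOSTS, pass
# PARAMS as keyword arguments, and wire in the transport/connection classes.
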
Example #3
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend tries to configure the client.
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform
    # server-related queries, such as "ping" or "info". The connection wrapper
    # acts as a proxy for them.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but they can be used.
    # Since it can make sense not to give an index, developers are free to use
    # these methods as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(
            self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(
            self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(
            self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(
            self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(
            self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(
            self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
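
The query methods above always forward the configured ``self.indices`` to the client, so call sites never name an index explicitly. A hedged usage sketch (the backend instance, index configuration, document type and id are hypothetical, and the backend is assumed to be already configured):

# `backend` is a configured subclass instance whose indices resolve to ['blog'].
doc = backend.get(doc_id='42', doc_type='post')
hits = backend.search(doc_type='post', body={'query': {'match_all': {}}})
count = backend.count(doc_type='post')
# Each call is forwarded as client.method(self.indices, ...), so changing the
# configured indices or aliases redirects every query without touching callers.
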
Example #4
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [{
            "host": host,
            "port": port,
        }]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds." %
                    (elastic_conn, excpt, timeout))
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())

        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []

        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {
                    "type": "keyword"
                },
                "type": {
                    "type": "keyword"
                },
                "number": {
                    "type": "keyword"
                },
                "change_id": {
                    "type": "keyword"
                },
                "title": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 8191
                        }
                    },
                },
                "text": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 8191
                        }
                    },
                },
                "url": {
                    "type": "keyword"
                },
                "commit_count": {
                    "type": "integer"
                },
                "additions": {
                    "type": "integer"
                },
                "deletions": {
                    "type": "integer"
                },
                "changed_files_count": {
                    "type": "integer"
                },
                "changed_files": {
                    "properties": {
                        "additions": {
                            "type": "integer"
                        },
                        "deletions": {
                            "type": "integer"
                        },
                        "path": {
                            "type": "keyword"
                        },
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {
                            "type": "keyword"
                        },
                        "author": {
                            "properties": {
                                "uid": {
                                    "type": "keyword"
                                },
                                "muid": {
                                    "type": "keyword"
                                },
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {
                                    "type": "keyword"
                                },
                                "muid": {
                                    "type": "keyword"
                                },
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {
                            "type": "integer"
                        },
                        "deletions": {
                            "type": "integer"
                        },
                        "title": {
                            "type": "text"
                        },
                    }
                },
                "repository_prefix": {
                    "type": "keyword"
                },
                "repository_fullname": {
                    "type": "keyword"
                },
                "repository_shortname": {
                    "type": "keyword"
                },
                "author": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "branch": {
                    "type": "keyword"
                },
                "target_branch": {
                    "type": "keyword"
                },
                "created_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "on_created_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "merged_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "updated_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "closed_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "state": {
                    "type": "keyword"
                },
                "duration": {
                    "type": "integer"
                },
                "mergeable": {
                    "type": "keyword"
                },
                "labels": {
                    "type": "keyword"
                },
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    },
                },
                "approval": {
                    "type": "keyword"
                },
                "draft": {
                    "type": "boolean"
                },
                "self_merged": {
                    "type": "boolean"
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo query fails in some contexts when the time slice is
        # large: "Must be less than or equal to: [10000] but was [10001]. This limit
        # can be set by changing the [search.max_buckets] cluster level setting."
        # Raising the limit below is an attempt to mitigate that issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {
                                "value": repository_fullname
                            }
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{
                "updated_at": {
                    "order": "desc"
                }
            }],
            "query": {
                "bool": {
                    "filter": [
                        {
                            "term": {
                                "type": "Change"
                            }
                        },
                        {
                            "regexp": {
                                "repository_fullname": {
                                    "value": repository_fullname
                                }
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:

        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"],
                                             self.idents_config)
            return dict_ident

        def _update_idents(
                obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:

            prev_hash = get_obj_hash(obj)

            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident,
                                                obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            else:
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = prev_hash != get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)

        bulk_update(to_update)
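
The batching pattern used by update_idents above (hash the serialized object before and after mutation, queue only the objects that actually changed, and flush every bulk_size items) can be illustrated on its own. A minimal standalone sketch of that pattern, independent of Elasticsearch and using hypothetical data and a hypothetical mutation:

import json
from typing import Dict, List


def obj_hash(obj: Dict) -> int:
    # Stable hash of the JSON form, used to detect whether a mutation changed anything.
    return hash(json.dumps(obj, sort_keys=True))


def flush(batch: List[Dict]) -> List[Dict]:
    print("Updating %s objects ..." % len(batch))  # stand-in for a bulk write
    return []


BULK_SIZE = 3  # small value for illustration; the class above uses 7500


def process(objs: List[Dict]) -> None:
    batch: List[Dict] = []
    for obj in objs:
        before = obj_hash(obj)
        obj["muid"] = obj.get("uid", "").lower()  # hypothetical mutation
        if obj_hash(obj) != before:  # only queue objects that really changed
            batch.append(obj)
        if len(batch) == BULK_SIZE:
            batch = flush(batch)
    flush(batch)  # flush the remainder


process([{"uid": "Alice"}, {"uid": "bob"}, {"uid": "Carol"}, {"uid": "dave"}])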