"must_not": { "exists": { "field": "processedText" } } } } }, index=indexName) if len(results['hits']['hits']) > 0: print("Records Found: ", len(results['hits']['hits']) , "Processing Now") import re from bs4 import BeautifulSoup for item in range(len(results['hits']['hits'])): print("Processing", results['hits']['hits'][item]['_id']) soup = BeautifulSoup(results['hits']['hits'][item]['_source']['html'], 'html.parser') for script in soup(["script", "style",""]): script.extract() text = soup.get_text() lines = (line.strip() for line in text.splitlines()) chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) text = ' '.join(chunk for chunk in chunks if chunk) body = { "doc": { "processedText": text } } _update = es.update(index=results['hits']['hits'][item]['_index'], id=results['hits']['hits'][item]['_id'], body=body) print("HTML Cleaned, and Converted to Searchable Text, Updating Doc", _update) print("Quiting Now")
class ELmonocleDB:
    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [{"host": host, "port": port}]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        # Wait until the Elasticsearch TCP port is reachable before creating the client
        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout)
                )
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())
        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []
        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    },
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
                "crawler_metadata": {
                    "properties": {
                        "last_commit_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "last_post_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "total_docs_posted": {"type": "integer"},
                        "total_changes_updated": {"type": "integer"},
                        "total_orphans_updated": {"type": "integer"},
                    }
                },
                "tasks_data": {
                    "properties": {
                        "tid": {"type": "keyword"},
                        "ttype": {"type": "keyword"},
                        "crawler_name": {"type": "keyword"},
                        "updated_at": {"type": "date", "format": "date_time_no_millis"},
                        "change_url": {"type": "keyword"},
                        "severity": {"type": "keyword"},
                        "priority": {"type": "keyword"},
                        "score": {"type": "integer"},
                        "url": {"type": "keyword"},
                        "title": {
                            "type": "text",
                            "fields": {
                                "keyword": {"type": "keyword", "ignore_above": 8191}
                            },
                        },
                        "_adopted": {"type": "boolean"},
                    }
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo aggregation fails in some contexts when the time slice is
        # large: "Must be less than or equal to: [10000] but was [10001]." (This limit
        # can be set by changing the [search.max_buckets] cluster level setting.)
        # This is an attempt to mitigate the issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def update_task_data(
        self,
        source_it: Union[
            List[TaskDataForEL],
            List[OrphanTaskDataForEL],
            List[AdoptedTaskDataForEL],
        ],
    ) -> Optional[BulkIndexError]:
        def gen(it):
            for _source in it:
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = _source._id
                d["doc"] = {}
                d["doc"].update({"id": _source._id})
                if isinstance(_source, TaskDataForEL):
                    d["doc"].update(
                        {"tasks_data": [asdict(td) for td in _source.tasks_data]}
                    )
                if isinstance(_source, OrphanTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                    d["doc"]["type"] = "OrphanTaskData"
                if isinstance(_source, AdoptedTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                d["doc_as_upsert"] = True
                yield d

        ret = None
        try:
            bulk(self.es, gen(source_it))
        except BulkIndexError as err:
            ret = err
        self.es.indices.refresh(index=self.index)
        return ret

    def compute_crawler_id_by_name(self, name, _type):
        return "crawler/%s/%s" % (_type, name)

    def get_task_crawler_metadata(self, name: str) -> Dict:
        try:
            ret = self.es.get(
                self.index, self.compute_crawler_id_by_name(name, "tasks_crawler")
            )
            return ret["_source"]["crawler_metadata"]
        except Exception:
            return {}

    def set_task_crawler_metadata(
        self, name: str, commit_date: datetime = None, push_infos: Dict = None
    ):
        metadata = {}
        if commit_date:
            metadata.update({"last_commit_at": commit_date})
        if push_infos:
            prev_metadata = self.get_task_crawler_metadata(name)
            metadata.update(
                {
                    "last_post_at": push_infos["last_post_at"],
                    "total_docs_posted": prev_metadata.get("total_docs_posted", 0)
                    + push_infos["total_docs_posted"],
                    "total_changes_updated": prev_metadata.get(
                        "total_changes_updated", 0
                    )
                    + push_infos["total_changes_updated"],
                    "total_orphans_updated": prev_metadata.get(
                        "total_orphans_updated", 0
                    )
                    + push_infos["total_orphans_updated"],
                }
            )
        body = {
            "doc": {"type": "TaskCrawlerDataCommit", "crawler_metadata": metadata},
            "doc_as_upsert": True,
        }
        ret = None
        try:
            self.es.update(
                self.index,
                self.compute_crawler_id_by_name(name, "tasks_crawler"),
                body=body,
            )
            self.es.indices.refresh(index=self.index)
        except Exception as err:
            ret = err
        return ret

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {
                            "regexp": {
                                "repository_fullname": {"value": repository_fullname}
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def get_changes_by_url(self, change_urls, size):
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"type": "Change"}},
                            {"terms": {"url": change_urls}},
                        ]
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_by_change_urls(self, change_urls):
        assert len(change_urls) <= 50
        size = 5000  # Assuming no more than 100 TD entries related to a change
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "must_not": {"exists": {"field": "tasks_data._adopted"}},
                        "filter": [
                            {"term": {"type": "OrphanTaskData"}},
                            {"terms": {"tasks_data.change_url": change_urls}},
                        ],
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_and_declare_adpotion(self, changes_url):
        assert len(changes_url) <= 50
        tds = self.get_orphan_tds_by_change_urls(changes_url)
        if tds:
            adopted_tds = [
                AdoptedTaskDataForEL(
                    _id=td["id"],
                    task_data=AdoptedTaskData(_adopted=True),
                )
                for td in tds
            ]
            self.update_task_data(adopted_tds)
        return tds

    def update_changes_with_orphan_tds(self, mapping: Dict[str, str]):
        change_urls = list(mapping.keys())
        while change_urls:
            change_urls_to_process = change_urls[:50]
            change_urls = change_urls[50:]
            tds = self.get_orphan_tds_and_declare_adpotion(change_urls_to_process)
            # Group tds in buckets by change_url
            _map: Dict[str, List] = dict()
            for td in tds:
                _map.setdefault(td["tasks_data"]["change_url"], []).append(
                    td["tasks_data"]
                )
            # Create update docs to attach tds to matching changes
            to_update = []
            for change_url, tds in _map.items():
                to_update.append(
                    TaskDataForEL(
                        _id=mapping[change_url],
                        tasks_data=createELTaskData(tds),
                    )
                )
            self.update_task_data(to_update)

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:
        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:
            prev_hash = get_obj_hash(obj)

            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            if obj["type"] in get_events_list():
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])

            updated = prev_hash != get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)

        bulk_update(to_update)
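# A minimal usage sketch, not from the original module: it assumes the helpers referenced
# above (CHANGE_PREFIX, the Change/Event dataclasses, queries, create_muid, ...) are
# importable from the surrounding package and that an Elasticsearch node answers on
# localhost:9200. The index name and change URL below are placeholders.
db = ELmonocleDB(elastic_conn="localhost:9200", index="demo")

# Look up already-indexed changes by URL.
changes = db.get_changes_by_url(["https://example.com/repo/pull/1"], size=10)
print(len(changes), "change(s) found")

# Recompute the muid field of every stored object, e.g. after idents_config changed.
db.update_idents()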
class ESConnector:
    """
    As many microservices will communicate with Elasticsearch, centralize
    access with this library.
    """

    def __init__(self, host=None, port=9200, timeout=10, local_env=False):
        self.host = host
        self.port = port
        self.timeout = timeout
        self.local_env = local_env
        self.es = None

    def _connect(self):
        """
        Connect to a member of the Elasticsearch cluster.
        """
        try:
            if self.local_env:
                self.es = Elasticsearch([{'host': self.host, 'port': self.port}])
            else:
                self.es = Elasticsearch([{'host': self.host, 'port': self.port}],
                                        sniff_on_start=True,
                                        sniff_on_connection_fail=True,
                                        sniffer_timeout=self.timeout)
            self.idx = IndicesClient(self.es)
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace, str(e))

    def _create_index(self, index, doc_type, settings=None, mappings=None):
        """
        Create a new empty index.

        mandatory args:
            index    = index name
            doc_type = document type, i.e. any valid string
            settings = Elasticsearch cluster configuration
            mappings = dict of document fields by type and indexing preference
        """
        if not settings:
            settings = {'index': {'number_of_shards': '1', 'number_of_replicas': '0'}}
        if not mappings:
            mappings = {'properties': {'id': {'type': 'string', 'index': 'not_analyzed'}}}
        try:
            # Index creation goes through the indices client, not the document API
            response = self.idx.create(index=index, body=dumps(settings))
            self.idx.put_mapping(index=index, doc_type=doc_type, body=dumps(mappings))
            if 'acknowledged' not in response or not response['acknowledged']:
                return ElasticSearchError.unable_to_create_index(index)
            log.info('Index: {} created'.format(index))
            log.info('ES create(): response: {}'.format(response))
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace, str(e))

    def drop_index(self, index):
        try:
            if index in self.es.indices.stats()['indices'].keys():
                self.es.indices.delete(index=index, ignore=[400, 404])
                log.info('Index: {} deleted'.format(index))
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace, str(e))

    def add_document(self, index=None, doc_type=None, doc_id=0, settings=None,
                     mappings=None, values=None):
        """
        Add a new document to an existing index.

        mandatory args:
            index    = index name
            doc_type = document type, i.e. any valid string
            settings = Elasticsearch cluster configuration
            mappings = dict of document fields by type and indexing preference
            values   = dictionary of fields and values
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            if index not in self.es.indices.stats()['indices'].keys():
                err_msg = self._create_index(index, doc_type, settings, mappings)
                if err_msg:
                    return err_msg
            response = self.es.create(index=index, doc_type=doc_type, id=doc_id,
                                      body=dumps(values))
            log.info('ES create(): response: {}'.format(response))
            return ElasticSearchWrite.object_created(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchWriteError.unknown_exception(doc_id, values, backtrace, str(e))

    def update_document(self, index, doc_type, doc_id, values):
        """
        Update an existing document in an existing index.

        mandatory args:
            index    = index name
            doc_type = document type, i.e. any valid string
            doc_id   = document id
            values   = dictionary of fields and values
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            log.info('ES body: {}'.format(values))
            response = self.es.update(index=index, doc_type=doc_type, id=doc_id,
                                      body=dumps(values))
            log.info('ES update(): response: {}'.format(response))
            return ElasticSearchWrite.object_updated(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchWriteError.unknown_exception(doc_id, values, backtrace, str(e))

    def find_document(self, index, doc_type, dsl=None, fields=None):
        """
        Find an existing document in an existing index.

        mandatory args:
            index    = index name
            doc_type = document type, i.e. any valid string
            dsl      = query parameters in DSL format
            fields   = list of fields to return
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            response = self.es.search(index=index, doc_type=doc_type, body=dumps(dsl),
                                      _source=fields)
            return ElasticSearchRead.object_found(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchReadError.unknown_exception(dsl, fields, backtrace, str(e))

    def search_documents(self, index, doc_type, dsl, fields=None):
        """
        Find existing documents in an existing index.

        mandatory args:
            index    = index name
            doc_type = document type, i.e. any valid string
            dsl      = query parameters in DSL format
            fields   = list of fields to return
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            response = self.es.search(index=index, doc_type=doc_type, body=dumps(dsl),
                                      _source=fields)
            return ElasticSearchRead.objects_found(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchReadError.unknown_exception(dsl, fields, backtrace, str(e))
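# A minimal usage sketch, not part of the original module: it assumes the companion
# ElasticSearchError / ElasticSearchWrite / ElasticSearchRead response helpers and the
# module-level `log` are importable from the same package, and that an Elasticsearch
# node old enough to accept doc_type (pre-7.x) listens on localhost:9200. The index,
# doc_type, and field values below are placeholders.
conn = ESConnector(host='localhost', port=9200, local_env=True)

# Index a document; the index and its mapping are created on first use.
result = conn.add_document(index='people', doc_type='person', doc_id=1,
                           values={'id': '1', 'name': 'Ada Lovelace'})

# Query the document back with a simple term query expressed as a DSL dict.
found = conn.find_document(index='people', doc_type='person',
                           dsl={'query': {'term': {'id': '1'}}},
                           fields=['id', 'name'])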
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the official Elasticsearch library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is up to the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    is raised when the backend tries to configure the client.
    """
    #: Elasticsearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: Elasticsearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the Elasticsearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the Elasticsearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform server
    # related queries, such as "ping" or "info". The connection wrapper acts
    # as a proxy for them.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but they can be
    # given. As it makes sense not to force an index, developers are free to
    # use these as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
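# A minimal subclass sketch, not from the original package: the wrapper above leaves
# ``connection_class`` undefined, so a concrete backend only has to name one. The
# urllib3-based connection class shipped with elasticsearch-py (pre-8.x) is assumed to
# be available; ``self.server`` and ``self.indices`` are expected to be provided by the
# djangoes ``Base`` class from the backend settings.
from elasticsearch.connection import Urllib3HttpConnection

class Urllib3ElasticsearchBackend(BaseElasticsearchBackend):
    # The transport stays the default; only the connection class is supplied here.
    connection_class = Urllib3HttpConnection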