import os

import certifi
from elasticsearch.client import Elasticsearch

# SECURITY: the HTTP auth credentials ("user:password") were previously
# hard-coded in this file. They must come from the environment instead;
# rotate any credentials that were committed here.
ES_HOST = os.environ.get(
    "ES_HOST",
    "https://cb35af4e37890e33bd330e954f130550.ap-northeast-1.aws.found.io",
)
ES_HTTP_AUTH = os.environ["ES_HTTP_AUTH"]  # e.g. "myuser:mypassword"

# TLS verification is on and uses certifi's CA bundle.
es = Elasticsearch(
    [ES_HOST],
    port=9243,
    http_auth=ES_HTTP_AUTH,
    use_ssl=True,
    verify_certs=True,
    ca_certs=certifi.where(),
)

# Fetch one document and print its source body.
res = es.get(index="elasticmini", doc_type="users", id="1")["_source"]
print(res)
class ELmonocleDB:
    """Elasticsearch-backed store for Monocle Change/Event documents.

    Wraps an ``Elasticsearch`` client, owns the index (creating it with the
    Monocle mapping when asked), and exposes bulk-upsert and query helpers
    used by the crawlers and the query API.
    """

    # Module-scoped logger shared by all instances.
    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",  # "host:port" string
        index=None,  # logical index name; the prefix is prepended
        timeout=10,  # seconds to sleep between connection retries
        prefix=CHANGE_PREFIX,
        create=True,  # create the index (with mapping) if missing
        previous_schema=False,  # use the previous-schema prefix instead
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        # Probe raw TCP reachability before handing the host to the ES
        # client, so we can retry indefinitely until the server is up.
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)
        # Rebind elastic_conn to the list-of-dicts form expected by the
        # Elasticsearch client constructor.
        elastic_conn = [
            {
                "host": host,
                "port": port,
            }
        ]
        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl
        # NOTE(review): these two set the key only when the value is falsy
        # (including the default None) — presumably to disable verification
        # explicitly; confirm against the client's expected defaults.
        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs
        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn
        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)
        # Block until a TCP connection to the server succeeds.
        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout)
                )
                time.sleep(timeout)
        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())
        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix
        if not index:
            # No index requested: stop after establishing the connection
            # (self.index / self.mapping / self.ic are NOT set in this case).
            self.log.info("No index provided")
            return
        self.idents_config = idents_config or []
        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        # Explicit mapping for Change/Event documents and the attached
        # task data. `keyword` fields are exact-match; `text` fields are
        # analyzed and carry a `.keyword` sub-field for aggregations.
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                # `nested` so each assignee's uid/muid pair is queried as a unit.
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    },
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
                "crawler_metadata": {
                    "properties": {
                        "last_commit_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "last_post_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "total_docs_posted": {"type": "integer"},
                        "total_changes_updated": {"type": "integer"},
                        "total_orphans_updated": {"type": "integer"},
                    }
                },
                "tasks_data": {
                    "properties": {
                        "tid": {"type": "keyword"},
                        "ttype": {"type": "keyword"},
                        "crawler_name": {"type": "keyword"},
                        "updated_at": {"type": "date", "format": "date_time_no_millis"},
                        "change_url": {"type": "keyword"},
                        "severity": {"type": "keyword"},
                        "priority": {"type": "keyword"},
                        "score": {"type": "integer"},
                        "url": {"type": "keyword"},
                        "title": {
                            "type": "text",
                            "fields": {
                                "keyword": {"type": "keyword", "ignore_above": 8191}
                            },
                        },
                        "_adopted": {"type": "boolean"},
                    }
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            # ignore=400: index may already exist; creation is idempotent here.
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo aggregation fails on some contexts when the time
        # slice is large ("Must be less than or equal to: [10000] but was
        # [10001]"; the limit is controlled by the [search.max_buckets]
        # cluster-level setting). Raise the limit to mitigate the issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        """Bulk-upsert Change/Event objects into the index, then refresh."""

        def gen(it):
            # Build one update-with-upsert action per object, keyed by id.
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def update_task_data(
        self,
        source_it: Union[
            List[TaskDataForEL],
            List[OrphanTaskDataForEL],
            List[AdoptedTaskDataForEL],
        ],
    ) -> Optional[BulkIndexError]:
        """Bulk-upsert task data documents.

        Returns the BulkIndexError when the bulk call failed, else None.
        """

        def gen(it):
            for _source in it:
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = _source._id
                d["doc"] = {}
                d["doc"].update({"id": _source._id})
                if isinstance(_source, TaskDataForEL):
                    # Task data attached to an existing Change document.
                    d["doc"].update(
                        {"tasks_data": [asdict(td) for td in _source.tasks_data]}
                    )
                if isinstance(_source, OrphanTaskDataForEL):
                    # Task data with no matching Change yet: store standalone.
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                    d["doc"]["type"] = "OrphanTaskData"
                if isinstance(_source, AdoptedTaskDataForEL):
                    # Mark a previously-orphan task data as adopted.
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                d["doc_as_upsert"] = True
                yield d

        ret = None
        try:
            bulk(self.es, gen(source_it))
        except BulkIndexError as err:
            ret = err
        self.es.indices.refresh(index=self.index)
        return ret

    def compute_crawler_id_by_name(self, name, _type):
        """Return the document id used to persist a crawler's metadata."""
        return "crawler/%s/%s" % (_type, name)

    def get_task_crawler_metadata(self, name: str) -> Dict:
        """Fetch the stored crawler_metadata for a tasks crawler.

        Returns {} when the document does not exist or ES is unreachable.
        """
        try:
            ret = self.es.get(
                self.index, self.compute_crawler_id_by_name(name, "tasks_crawler")
            )
            return ret["_source"]["crawler_metadata"]
        except Exception:
            return {}

    def set_task_crawler_metadata(
        self,
        name: str,
        commit_date: Optional[datetime] = None,
        push_infos: Optional[Dict] = None,
    ):
        """Upsert crawler metadata; counters accumulate across calls.

        Returns the exception on failure, else None.
        """
        metadata = {}
        if commit_date:
            metadata.update({"last_commit_at": commit_date})
        if push_infos:
            # Add the new counters on top of the previously stored totals.
            prev_metadata = self.get_task_crawler_metadata(name)
            metadata.update(
                {
                    "last_post_at": push_infos["last_post_at"],
                    "total_docs_posted": prev_metadata.get("total_docs_posted", 0)
                    + push_infos["total_docs_posted"],
                    "total_changes_updated": prev_metadata.get(
                        "total_changes_updated", 0
                    )
                    + push_infos["total_changes_updated"],
                    "total_orphans_updated": prev_metadata.get(
                        "total_orphans_updated", 0
                    )
                    + push_infos["total_orphans_updated"],
                }
            )
        body = {
            "doc": {"type": "TaskCrawlerDataCommit", "crawler_metadata": metadata},
            "doc_as_upsert": True,
        }
        ret = None
        try:
            self.es.update(
                self.index,
                self.compute_crawler_id_by_name(name, "tasks_crawler"),
                body=body,
            )
            self.es.indices.refresh(index=self.index)
        except Exception as err:
            ret = err
        return ret

    def delete_index(self):
        """Drop the whole index."""
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        """Delete every document whose repository_fullname matches the regexp."""
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        """Return the most recently updated Change of a repository.

        Returns [] when nothing matches or the search fails.
        """
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {
                            "regexp": {
                                "repository_fullname": {"value": repository_fullname}
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def get_changes_by_url(self, change_urls, size):
        """Return Change documents matching any of the given urls."""
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"type": "Change"}},
                            {"terms": {"url": change_urls}},
                        ]
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_by_change_urls(self, change_urls):
        """Return not-yet-adopted OrphanTaskData docs for the given change urls."""
        assert len(change_urls) <= 50
        # Assuming no more than ~100 task data related to a change
        # (50 urls x 100 = 5000 fits in one page).
        size = 5000
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        # Exclude docs already flagged as adopted.
                        "must_not": {"exists": {"field": "tasks_data._adopted"}},
                        "filter": [
                            {"term": {"type": "OrphanTaskData"}},
                            {"terms": {"tasks_data.change_url": change_urls}},
                        ],
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_and_declare_adpotion(self, changes_url):
        """Fetch orphan task data for the urls and mark them adopted."""
        assert len(changes_url) <= 50
        tds = self.get_orphan_tds_by_change_urls(changes_url)
        if tds:
            adopted_tds = [
                AdoptedTaskDataForEL(
                    _id=td["id"],
                    task_data=AdoptedTaskData(_adopted=True),
                )
                for td in tds
            ]
            self.update_task_data(adopted_tds)
        return tds

    def update_changes_with_orphan_tds(self, mapping: Dict[str, str]):
        """Attach orphan task data to their Changes.

        `mapping` maps change_url -> change document id. Processes urls in
        batches of 50 (the limit asserted by the orphan lookup).
        """
        change_urls = list(mapping.keys())
        while change_urls:
            change_urls_to_process = change_urls[:50]
            change_urls = change_urls[50:]
            tds = self.get_orphan_tds_and_declare_adpotion(change_urls_to_process)
            # Group tds in buckets by change_url
            _map: Dict[str, List] = dict()
            for td in tds:
                _map.setdefault(td["tasks_data"]["change_url"], []).append(
                    td["tasks_data"]
                )
            # Create update docs to attach tds to matching changes
            to_update = []
            for change_url, tds in _map.items():
                to_update.append(
                    TaskDataForEL(
                        _id=mapping[change_url],
                        tasks_data=createELTaskData(tds),
                    )
                )
            self.update_task_data(to_update)

    def run_named_query(self, name, *args, **kwargs):
        """Dispatch to a public query function from the queries module."""
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        """List index names (with the prefix stripped) under this prefix."""
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        """Yield every raw document of the index via the scroll API."""
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:
        """Recompute author muids for all documents and bulk-update changes.

        Walks the whole index; a document is rewritten only when recomputing
        its ident muids actually changed it (detected by hashing the dict
        before/after).
        """
        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            # Stable content hash: canonical JSON with sorted keys.
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            # Recompute the muid in place from the uid + idents config.
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:
            # Returns (typed object, True) when the doc changed, else (None, False).
            prev_hash = get_obj_hash(obj)
            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            if obj["type"] in get_events_list():
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = not prev_hash == get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            # Flush the pending batch; returns a fresh empty batch.
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)
        # Flush the final partial batch.
        bulk_update(to_update)
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend will try to configure the client.
    """

    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport

    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']
        # Both hooks must be provided (by this class or a subclass) before
        # the client can be built.
        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)
        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)
        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform server
    # related queries, such as "ping" or "info". The connection wrapper act
    # for them as a proxy.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but it can be used.
    # As it makes sense to not give an index, developers are free to use these
    # as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically uses the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        # NOTE: the client's suggest takes the body first, then the index.
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
# NOTE(review): this class duplicates an earlier identical definition of
# BaseElasticsearchBackend in this file; being defined later, this copy is
# the one that takes effect. Consider removing one of the two.
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend will try to configure the client.
    """

    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport

    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']
        # Both hooks must be provided (by this class or a subclass) before
        # the client can be built.
        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)
        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)
        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform server
    # related queries, such as "ping" or "info". The connection wrapper act
    # for them as a proxy.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but it can be used.
    # As it makes sense to not give an index, developers are free to use these
    # as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically uses the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(
            self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(
            self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(
            self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(
            self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(
            self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(
            self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        # NOTE: the client's suggest takes the body first, then the index.
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
class TestSchemaManager(unittest.TestCase):
    """Integration tests for SchemaManager against a local Elasticsearch.

    Each test builds an alias/index configuration, applies it through the
    manager, and checks the compiled schema document stored under id
    'master' in the test schema index.
    """

    test_schema_index = 'test_pseudonym'

    def setUp(self):
        self.client = Elasticsearch()
        self.manager = SchemaManager(self.client,
                                     schema_index=self.test_schema_index)

    def tearDown(self):
        # Best-effort cleanup: the index may not exist if a test failed
        # early. Catch Exception, never a bare except (which would also
        # swallow KeyboardInterrupt/SystemExit).
        try:
            self.client.indices.delete(self.test_schema_index)
        except Exception:
            pass

    def test_schema_compiling(self):
        """Each update() bumps the schema doc version and merges config."""
        cfg = {'aliases': [{'name': 'alias1',
                            'strategy': {'date': {'indexes': {
                                '201401': datetime.date(2014, 1, 1).isoformat()}}}}]}
        self.manager.update(cfg)
        schema = self.client.get(index=self.test_schema_index, id='master')
        self.assertEqual(schema['_version'], 1)
        source = schema.pop('_source')
        schema_doc = json.loads(source.get('schema'))
        self.assertEqual({a['name'] for a in schema_doc['aliases']}, {'alias1'})
        self.assertEqual({i['name'] for i in schema_doc['indexes']}, {'201401'})

        # Adding an index to the existing alias bumps the version to 2.
        cfg['aliases'][0]['strategy']['date']['indexes']['201402'] = \
            datetime.date(2014, 2, 1).isoformat()
        self.manager.update(cfg)
        schema = self.client.get(index=self.test_schema_index, id='master')
        self.assertEqual(schema['_version'], 2)
        source = schema.pop('_source')
        schema_doc = json.loads(source.get('schema'))
        self.assertEqual({a['name'] for a in schema_doc['aliases']}, {'alias1'})
        self.assertEqual({i['name'] for i in schema_doc['indexes']},
                         {'201401', '201402'})

        # Adding a second alias bumps the version to 3.
        cfg['aliases'].append({'name': 'alias2',
                               'strategy': {'date': {'indexes': {
                                   '201501': datetime.date(2015, 1, 1).isoformat()}}}})
        self.manager.update(cfg)
        schema = self.client.get(index=self.test_schema_index, id='master')
        self.assertEqual(schema['_version'], 3)
        source = schema.pop('_source')
        schema_doc = json.loads(source.get('schema'))
        self.assertEqual({a['name'] for a in schema_doc['aliases']},
                         {'alias1', 'alias2'})
        self.assertEqual({i['name'] for i in schema_doc['indexes']},
                         {'201401', '201402', '201501'})

    def test_add_index(self):
        """add_index() attaches a new index to the alias and the index list."""
        cfg = {'aliases': [{'name': 'alias1',
                            'strategy': {'date': {'indexes': {
                                '201401': datetime.date(2014, 1, 1).isoformat()}}}}]}
        self.manager.update(cfg)
        self.manager.add_index('alias1', '201402',
                               datetime.date(2014, 1, 2).isoformat())
        schema = self.client.get(index=self.test_schema_index, id='master')
        source = schema.pop('_source')
        schema_doc = json.loads(source.get('schema'))
        for alias in schema_doc['aliases']:
            if alias['name'] == 'alias1':
                break
        self.assertIn('201402', alias['indexes'])
        self.assertIn('201402', [i['name'] for i in schema_doc['indexes']])

    def test_remove_index(self):
        """remove_index() drops the index from the schema and its aliases."""
        cfg = {'aliases': [{'name': 'alias1',
                            'strategy': {'date': {'indexes': {
                                '201501': datetime.date(2015, 1, 1).isoformat(),
                                '201401': datetime.date(2014, 1, 1).isoformat()}}}}]}
        self.manager.update(cfg)
        self.manager.remove_index('201401')
        schema = self.client.get(index=self.test_schema_index,
                                 id='master')['_source']
        schema_doc = json.loads(schema.get('schema'))
        self.assertEqual(len(schema_doc['indexes']), 1)
        self.assertEqual(schema_doc['indexes'][0]['name'], '201501')
        self.assertEqual(len(schema_doc['aliases']), 1)
        self.assertEqual(schema_doc['aliases'][0]['indexes'], ['201501'])

    def test_reindex_cutover(self):
        """reindex_cutover() swaps source for target index, keeping routing."""
        source_index = "reindex_2017_01"
        # Add both indexes to aliases before cutover
        target_index = '%s-a' % source_index
        alias1 = 'cutover1'
        cfg = {'aliases': [{'name': alias1,
                            'strategy': {'date': {'indexes': {
                                source_index: datetime.date(2017, 1, 1).isoformat()}}}}]}
        self.manager.update(cfg)
        _, schema = self.manager.get_current_schema(True)
        # BUG FIX: assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(schema['aliases'][0]['name'], alias1)
        source_routing = None
        for index in schema['indexes']:
            if index['name'] == source_index:
                source_routing = index.get('routing')
        self.manager.reindex_cutover(source_index)
        _, schema = self.manager.get_current_schema(True)
        # BUG FIX: the filter previously used `alias['name'] is alias1`;
        # identity comparison of strings is unreliable, so the list could be
        # empty and the assertions below would silently never run.
        aliases = [alias for alias in schema['aliases']
                   if alias['name'] == alias1]
        for alias in aliases:
            self.assertIn(target_index, alias['indexes'])
            self.assertNotIn(source_index, alias['indexes'])
        target_routing = None
        for index in schema['indexes']:
            if index['name'] == target_index:
                target_routing = index.get('routing')
        self.assertIsNotNone(target_routing)
        self.assertEqual(source_routing, target_routing)

    def test_get_target_index(self):
        """_get_target_index() cycles the '-a'/'-b' suffix."""
        source_name = 'assets_2017_01'
        target = self.manager._get_target_index(source_name)
        self.assertEqual(target, 'assets_2017_01-a')
        target = self.manager._get_target_index(target)
        self.assertEqual(target, 'assets_2017_01-b')