def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', default=os.environ.get('ES_HOST', None), help='Elasticsearch host')
    parser.add_argument('--port', default=os.environ.get('ES_PORT', None), type=int, help='Elasticsearch port')
    parser.add_argument('--username', default=os.environ.get('ES_USERNAME', None), help='Elasticsearch username')
    parser.add_argument('--password', default=os.environ.get('ES_PASSWORD', None), help='Elasticsearch password')
    parser.add_argument('--url-prefix', help='Elasticsearch URL prefix')
    parser.add_argument('--no-auth', action='store_const', const=True, help='Suppress prompt for basic auth')
    parser.add_argument('--ssl', action='store_true', default=env('ES_USE_SSL', None), help='Use TLS')
    parser.add_argument('--no-ssl', dest='ssl', action='store_false', help='Do not use TLS')
    parser.add_argument('--verify-certs', action='store_true', default=None, help='Verify TLS certificates')
    parser.add_argument('--no-verify-certs', dest='verify_certs', action='store_false', help='Do not verify TLS certificates')
    parser.add_argument('--index', help='Index name to create')
    parser.add_argument('--old-index', help='Old index name to copy')
    parser.add_argument('--send_get_body_as', default='GET', help='Method for querying Elasticsearch - POST, GET or source')
    parser.add_argument(
        '--boto-profile',
        default=None,
        dest='profile',
        help='DEPRECATED: (use --profile) Boto profile to use for signing requests')
    parser.add_argument(
        '--profile',
        default=None,
        help='AWS profile to use for signing requests. Optionally use the AWS_DEFAULT_PROFILE environment variable')
    parser.add_argument(
        '--aws-region',
        default=None,
        help='AWS Region to use for signing requests. Optionally use the AWS_DEFAULT_REGION environment variable')
    parser.add_argument('--timeout', default=60, help='Elasticsearch request timeout')
    parser.add_argument('--config', default='config.yaml', help='Global config file (default: config.yaml)')
    parser.add_argument('--recreate', type=bool, default=False,
                        help='Force re-creation of the index (this will cause data loss).')
    args = parser.parse_args()

    if os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    elif os.path.isfile(args.config):
        filename = args.config
    else:
        filename = ''

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = args.host if args.host else data.get('es_host')
        port = args.port if args.port else data.get('es_port')
        username = args.username if args.username else data.get('es_username')
        password = args.password if args.password else data.get('es_password')
        url_prefix = args.url_prefix if args.url_prefix is not None else data.get('es_url_prefix', '')
        use_ssl = args.ssl if args.ssl is not None else data.get('use_ssl')
        verify_certs = args.verify_certs if args.verify_certs is not None else data.get('verify_certs') is not False
        aws_region = data.get('aws_region', None)
        send_get_body_as = data.get('send_get_body_as', 'GET')
        ca_certs = data.get('ca_certs')
        client_cert = data.get('client_cert')
        client_key = data.get('client_key')
        index = args.index if args.index is not None else data.get('writeback_index')
        old_index = args.old_index if args.old_index is not None else None
    else:
        username = args.username if args.username else None
        password = args.password if args.password else None
        aws_region = args.aws_region
        host = args.host if args.host else raw_input('Enter Elasticsearch host: ')
        port = args.port if args.port else int(raw_input('Enter Elasticsearch port: '))
        use_ssl = (args.ssl if args.ssl is not None
                   else raw_input('Use SSL? t/f: ').lower() in ('t', 'true'))
        if use_ssl:
            verify_certs = (args.verify_certs if args.verify_certs is not None
                            else raw_input('Verify TLS certificates? t/f: ').lower() not in ('f', 'false'))
        else:
            verify_certs = True
        if args.no_auth is None and username is None:
            username = raw_input('Enter optional basic-auth username (or leave blank): ')
            password = getpass.getpass('Enter optional basic-auth password (or leave blank): ')
        url_prefix = (args.url_prefix if args.url_prefix is not None
                      else raw_input('Enter optional Elasticsearch URL prefix (prepends a string to the URL of every request): '))
        send_get_body_as = args.send_get_body_as
        ca_certs = None
        client_cert = None
        client_key = None
        index = args.index if args.index is not None else raw_input('New index name? (Default elastalert_status) ')
        if not index:
            index = 'elastalert_status'
        old_index = (args.old_index if args.old_index is not None
                     else raw_input('Name of existing index to copy? (Default None) '))

    timeout = args.timeout

    auth = Auth()
    http_auth = auth(host=host,
                     username=username,
                     password=password,
                     aws_region=aws_region,
                     profile_name=args.profile)

    es = Elasticsearch(
        host=host,
        port=port,
        timeout=timeout,
        use_ssl=use_ssl,
        verify_certs=verify_certs,
        connection_class=RequestsHttpConnection,
        http_auth=http_auth,
        url_prefix=url_prefix,
        send_get_body_as=send_get_body_as,
        client_cert=client_cert,
        ca_certs=ca_certs,
        client_key=client_key)

    esversion = es.info()["version"]["number"]
    print("Elastic Version:" + esversion.split(".")[0])
    elasticversion = int(esversion.split(".")[0])

    if elasticversion > 5:
        mapping = {'type': 'keyword'}
    else:
        mapping = {'index': 'not_analyzed', 'type': 'string'}
    print("Mapping used for string:" + str(mapping))

    silence_mapping = {
        'silence': {
            'properties': {
                'rule_name': mapping,
                'until': {'type': 'date', 'format': 'dateOptionalTime'},
                '@timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
            },
        },
    }
    ess_mapping = {
        'elastalert_status': {
            'properties': {
                'rule_name': mapping,
                '@timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
            },
        },
    }
    es_mapping = {
        'elastalert': {
            'properties': {
                'rule_name': mapping,
                '@timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                'alert_time': {'type': 'date', 'format': 'dateOptionalTime'},
                'match_time': {'type': 'date', 'format': 'dateOptionalTime'},
                'match_body': {'type': 'object', 'enabled': False},
                'aggregate_id': mapping,
            },
        },
    }
    past_mapping = {
        'past_elastalert': {
            'properties': {
                'rule_name': mapping,
                'match_body': {'type': 'object', 'enabled': False},
                '@timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                'aggregate_id': mapping,
            },
        },
    }
    error_mapping = {
        'elastalert_error': {
            'properties': {
                'data': {'type': 'object', 'enabled': False},
                '@timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
            },
        },
    }

    es_index = IndicesClient(es)
    if not args.recreate:
        if es_index.exists(index):
            print('Index ' + index + ' already exists. Skipping index creation.')
            return None

    # (Re-)Create indices.
    if elasticversion > 5:
        index_names = (
            index,
            index + '_status',
            index + '_silence',
            index + '_error',
            index + '_past',
        )
    else:
        index_names = (
            index,
        )
    for index_name in index_names:
        if es_index.exists(index_name):
            print('Deleting index ' + index_name + '.')
            try:
                es_index.delete(index_name)
            except NotFoundError:
                # Why does this ever occur?? It shouldn't. But it does.
                pass
        es_index.create(index_name)

    # To avoid a race condition. TODO: replace this with a real check
    time.sleep(2)

    if elasticversion > 5:
        es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
        es.indices.put_mapping(index=index + '_status', doc_type='elastalert_status', body=ess_mapping)
        es.indices.put_mapping(index=index + '_silence', doc_type='silence', body=silence_mapping)
        es.indices.put_mapping(index=index + '_error', doc_type='elastalert_error', body=error_mapping)
        es.indices.put_mapping(index=index + '_past', doc_type='past_elastalert', body=past_mapping)
        print('New index %s created' % index)
    else:
        es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
        es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping)
        es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping)
        es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping)
        es.indices.put_mapping(index=index, doc_type='past_elastalert', body=past_mapping)
        print('New index %s created' % index)

    if old_index:
        print("Copying all data from old index '{0}' to new index '{1}'".format(old_index, index))
        # Use the defaults for chunk_size, scroll, scan_kwargs, and bulk_kwargs
        elasticsearch.helpers.reindex(es, old_index, index)

    print('Done!')
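

# The time.sleep(2) above only papers over the race between index creation and
# the subsequent put_mapping calls. Below is a minimal sketch of the "real
# check" the TODO asks for, using the cluster-health wait that elasticsearch-py
# exposes; the helper name and the 30-second timeout are illustrative and not
# part of the original script.
def wait_for_indices(es, index_names, timeout_seconds=30):
    """Block until every freshly created index reports at least yellow health."""
    for index_name in index_names:
        # wait_for_status makes the request block server-side until the index
        # reaches the requested health, or the request times out.
        es.cluster.health(index=index_name,
                          wait_for_status='yellow',
                          request_timeout=timeout_seconds)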
class ELmonocleDB:
    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [
            {
                "host": host,
                "port": port,
            }
        ]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout)
                )
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())
        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []

        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "authored_at": {"type": "date", "format": "date_time_no_millis"},
                        "committed_at": {"type": "date", "format": "date_time_no_millis"},
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    },
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
                "crawler_metadata": {
                    "properties": {
                        "last_commit_at": {"type": "date", "format": "date_time_no_millis"},
                        "last_post_at": {"type": "date", "format": "date_time_no_millis"},
                        "total_docs_posted": {"type": "integer"},
                        "total_changes_updated": {"type": "integer"},
                        "total_orphans_updated": {"type": "integer"},
                    }
                },
                "tasks_data": {
                    "properties": {
                        "tid": {"type": "keyword"},
                        "ttype": {"type": "keyword"},
                        "crawler_name": {"type": "keyword"},
                        "updated_at": {"type": "date", "format": "date_time_no_millis"},
                        "change_url": {"type": "keyword"},
                        "severity": {"type": "keyword"},
                        "priority": {"type": "keyword"},
                        "score": {"type": "integer"},
                        "url": {"type": "keyword"},
                        "title": {
                            "type": "text",
                            "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                        },
                        "_adopted": {"type": "boolean"},
                    }
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo aggregation fails in some contexts when the time
        # slice is large, with this error: "Must be less than or equal to:
        # [10000] but was [10001]. This limit can be set by changing the
        # [search.max_buckets] cluster level setting." This is an attempt to
        # mitigate the issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def update_task_data(
        self,
        source_it: Union[
            List[TaskDataForEL],
            List[OrphanTaskDataForEL],
            List[AdoptedTaskDataForEL],
        ],
    ) -> Optional[BulkIndexError]:
        def gen(it):
            for _source in it:
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = _source._id
                d["doc"] = {}
                d["doc"].update({"id": _source._id})
                if isinstance(_source, TaskDataForEL):
                    d["doc"].update(
                        {"tasks_data": [asdict(td) for td in _source.tasks_data]}
                    )
                if isinstance(_source, OrphanTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                    d["doc"]["type"] = "OrphanTaskData"
                if isinstance(_source, AdoptedTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                d["doc_as_upsert"] = True
                yield d

        ret = None
        try:
            bulk(self.es, gen(source_it))
        except BulkIndexError as err:
            ret = err
        self.es.indices.refresh(index=self.index)
        return ret

    def compute_crawler_id_by_name(self, name, _type):
        return "crawler/%s/%s" % (_type, name)

    def get_task_crawler_metadata(self, name: str) -> Dict:
        try:
            ret = self.es.get(
                self.index, self.compute_crawler_id_by_name(name, "tasks_crawler")
            )
            return ret["_source"]["crawler_metadata"]
        except Exception:
            return {}

    def set_task_crawler_metadata(
        self, name: str, commit_date: datetime = None, push_infos: Dict = None
    ):
        metadata = {}
        if commit_date:
            metadata.update({"last_commit_at": commit_date})
        if push_infos:
            prev_metadata = self.get_task_crawler_metadata(name)
            metadata.update(
                {
                    "last_post_at": push_infos["last_post_at"],
                    "total_docs_posted": prev_metadata.get("total_docs_posted", 0)
                    + push_infos["total_docs_posted"],
                    "total_changes_updated": prev_metadata.get("total_changes_updated", 0)
                    + push_infos["total_changes_updated"],
                    "total_orphans_updated": prev_metadata.get("total_orphans_updated", 0)
                    + push_infos["total_orphans_updated"],
                }
            )
        body = {
            "doc": {"type": "TaskCrawlerDataCommit", "crawler_metadata": metadata},
            "doc_as_upsert": True,
        }
        ret = None
        try:
            self.es.update(
                self.index,
                self.compute_crawler_id_by_name(name, "tasks_crawler"),
                body=body,
            )
            self.es.indices.refresh(index=self.index)
        except Exception as err:
            ret = err
        return ret

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {
                            "regexp": {
                                "repository_fullname": {"value": repository_fullname}
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def get_changes_by_url(self, change_urls, size):
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"type": "Change"}},
                            {"terms": {"url": change_urls}},
                        ]
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_by_change_urls(self, change_urls):
        assert len(change_urls) <= 50
        size = 5000  # Assuming not more than 100 TD items related to a change
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "must_not": {"exists": {"field": "tasks_data._adopted"}},
                        "filter": [
                            {"term": {"type": "OrphanTaskData"}},
                            {"terms": {"tasks_data.change_url": change_urls}},
                        ],
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_and_declare_adpotion(self, changes_url):
        assert len(changes_url) <= 50
        tds = self.get_orphan_tds_by_change_urls(changes_url)
        if tds:
            adopted_tds = [
                AdoptedTaskDataForEL(
                    _id=td["id"],
                    task_data=AdoptedTaskData(_adopted=True),
                )
                for td in tds
            ]
            self.update_task_data(adopted_tds)
        return tds

    def update_changes_with_orphan_tds(self, mapping: Dict[str, str]):
        change_urls = list(mapping.keys())
        while change_urls:
            change_urls_to_process = change_urls[:50]
            change_urls = change_urls[50:]
            tds = self.get_orphan_tds_and_declare_adpotion(change_urls_to_process)
            # Group tds in buckets by change_url
            _map: Dict[str, List] = dict()
            for td in tds:
                _map.setdefault(td["tasks_data"]["change_url"], []).append(
                    td["tasks_data"]
                )
            # Create update docs to attach tds to matching changes
            to_update = []
            for change_url, tds in _map.items():
                to_update.append(
                    TaskDataForEL(
                        _id=mapping[change_url],
                        tasks_data=createELTaskData(tds),
                    )
                )
            self.update_task_data(to_update)

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:
        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:
            prev_hash = get_obj_hash(obj)
            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            if obj["type"] in get_events_list():
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = not prev_hash == get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)
        bulk_update(to_update)
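

# For reference, the "update as upsert" bulk pattern used by ELmonocleDB.update()
# above, reduced to a standalone sketch against a plain Elasticsearch client.
# The helper name, index name and documents below are illustrative only; the
# bulk action shape (_op_type, doc, doc_as_upsert) mirrors what gen() yields.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


def upsert_docs(es: Elasticsearch, index: str, docs) -> None:
    def gen():
        for doc in docs:
            yield {
                "_index": index,
                "_op_type": "update",   # apply as a partial update ...
                "_id": doc["id"],
                "doc": doc,
                "doc_as_upsert": True,  # ... or insert if the _id does not exist
            }

    bulk(es, gen())
    es.indices.refresh(index=index)


# Example (assumes a local cluster and a throwaway index name):
# upsert_docs(Elasticsearch(["localhost:9200"]), "monocle.changes.demo",
#             [{"id": "c1", "type": "Change", "state": "OPEN"}])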
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    is raised when the backend tries to configure the client.
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform server
    # related queries, such as "ping" or "info". The connection wrapper acts
    # as a proxy for them.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but they can be
    # given. As it makes sense not to force an index, developers are free to
    # use these methods as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
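

# A sketch of the kind of subclass the docstring above calls for: the wrapper
# only becomes usable once a connection_class is supplied. RequestsHttpConnection
# is a real connection class shipped with elasticsearch-py; the backend class
# name itself is illustrative and not part of the original code.
from elasticsearch.connection import RequestsHttpConnection


class RequestsElasticsearchBackend(BaseElasticsearchBackend):
    """Backend whose client performs HTTP requests through the requests library."""

    connection_class = RequestsHttpConnection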