def _get_es_status(host: str = "localhost", port: int = None):
    """Checks Elasticsearch status; a specific host and port can be given."""
    es_status = False
    es_res = {"status": ""}
    try:
        # Use the default client unless an explicit host/port was supplied.
        if port is None:
            es_client = ElasticsearchClient.get()
        else:
            es_client = ElasticsearchClient._get(host, port, 1)
        es_res = es_client.cluster.health()
    except ConnectionError as exception:
        logger.warning("connection error with Elasticsearch: %s", exception)
        es_res['status'] = 'red'
    if es_res['status'] == 'green':
        es_status = True
    return es_status, es_res

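# A minimal usage sketch (hypothetical wiring; assumes the same logger is in scope):
def log_es_health():
    es_status, es_res = _get_es_status()
    if not es_status:
        logger.warning("Elasticsearch cluster status is %s", es_res.get('status') or 'unknown')
    return es_status
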
def test_verify_dynamic_mapping(self):
    doc1 = {
        "manifest": {
            "data": "hello world!"
        },
        "description": "Scooby dooby do, where are you, we got some work to do now.",
        "time1": "2017-11-02T09:50:20.123123Z",
        "time2": "2017-11-02 09:55:12",
        "time3": "2017-11-02",
    }
    bundle_uuid = str(uuid.uuid4())
    version = get_version()
    bundle_fqid = f"{bundle_uuid}.{version}"
    es_client = ElasticsearchClient.get()
    es_client.index(index=self.dss_index_name,
                    doc_type=ESDocType.doc.name,
                    id=bundle_fqid,
                    body=doc1)
    mapping = es_client.indices.get_mapping(self.dss_index_name)[self.dss_index_name]['mappings']
    self.assertEqual(mapping['query']['properties']['query']['type'], 'percolator')
    self.assertEqual(mapping['doc']['properties']['description']['type'], 'keyword')
    self.assertEqual(mapping['doc']['properties']['description']['fields']['text']['type'], 'text')
    self.assertEqual(mapping['doc']['properties']['time1']['type'], 'date')
    self.assertEqual(mapping['doc']['properties']['time2']['type'], 'date')
    self.assertEqual(mapping['doc']['properties']['time3']['type'], 'date')

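# The assertions above imply a dynamic template along these lines: new string fields are
# indexed as keyword with a text sub-field, and the three timestamp layouts are detected
# as dates. This is an illustrative reconstruction, not the actual index configuration:
illustrative_dynamic_mapping = {
    "dynamic_templates": [{
        "strings": {
            "match_mapping_type": "string",
            "mapping": {
                "type": "keyword",
                "fields": {"text": {"type": "text"}}
            }
        }
    }],
    "date_detection": True,
    # dynamic_date_formats would need to cover the three timestamp layouts asserted above
}
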
def get(uuid: str, replica: str):
    owner = security.get_token_email(request.token_info)
    es_client = ElasticsearchClient.get()
    try:
        response = es_client.get(index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                                 doc_type=ESDocType.subscription.name,
                                 id=uuid)
    except NotFoundError:
        raise DSSException(requests.codes.not_found, "not_found", "Cannot find subscription!")
    source = response['_source']
    source['uuid'] = uuid
    source['replica'] = replica
    # The HMAC key ID (already in `source`, if set) may be returned, but the secret must not be.
    source.pop('hmac_secret_key', None)
    if source['owner'] != owner:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exceptions. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden", "Your credentials can't access this subscription!")
    return jsonify(source), requests.codes.okay

def delete(uuid: str, replica: str):
    owner = security.get_token_email(request.token_info)
    es_client = ElasticsearchClient.get()
    try:
        response = es_client.get(index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                                 doc_type=ESDocType.subscription.name,
                                 id=uuid)
    except NotFoundError:
        raise DSSException(requests.codes.not_found, "not_found", "Cannot find subscription!")
    stored_metadata = response['_source']
    if stored_metadata['owner'] != owner:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exceptions. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden", "Your credentials can't access this subscription!")
    _delete_subscription(es_client, uuid)
    timestamp = datetime.datetime.utcnow()
    time_deleted = timestamp.strftime("%Y-%m-%dT%H%M%S.%fZ")
    return jsonify({'timeDeleted': time_deleted}), requests.codes.okay

def _refresh_percolate_queries(self, index_name: str):
    # When dynamic templates are used and queries for percolation have been added
    # to an index before the index contains mappings of fields referenced by those queries,
    # the queries must be reloaded when the mappings are present for the queries to match.
    # See: https://github.com/elastic/elasticsearch/issues/5750
    subscription_index_name = Config.get_es_index_name(ESIndexType.subscriptions, self.replica)
    es_client = ElasticsearchClient.get()
    if not es_client.indices.exists(subscription_index_name):
        return
    subscription_queries = [{'_index': index_name,
                             '_type': ESDocType.query.name,
                             '_id': hit['_id'],
                             '_source': hit['_source']['es_query']}
                            for hit in scan(es_client,
                                            index=subscription_index_name,
                                            doc_type=ESDocType.subscription.name,
                                            query={'query': {'match_all': {}}})]
    if subscription_queries:
        try:
            bulk(es_client, iter(subscription_queries), refresh=True)
        except BulkIndexError as ex:
            logger.error(f"Error occurred when adding subscription queries "
                         f"to index {index_name}. Errors: {ex.errors}")

def delete(uuid: str, replica: str):
    authenticated_user_email = request.token_info['email']
    es_client = ElasticsearchClient.get()
    try:
        response = es_client.get(index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                                 doc_type=ESDocType.subscription.name,
                                 id=uuid)
    except NotFoundError:
        raise DSSException(requests.codes.not_found, "not_found", "Cannot find subscription!")
    stored_metadata = response['_source']
    if stored_metadata['owner'] != authenticated_user_email:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exceptions. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden", "Your credentials can't access this subscription!")
    # get all indexes that use the current alias
    alias_name = Config.get_es_alias_name(ESIndexType.docs, Replica[replica])
    doc_indexes = _get_indexes_by_alias(es_client, alias_name)
    _unregister_percolate(es_client, doc_indexes, uuid)
    es_client.delete(index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                     doc_type=ESDocType.subscription.name,
                     id=uuid)
    timestamp = datetime.datetime.utcnow()
    time_deleted = timestamp.strftime("%Y-%m-%dT%H%M%S.%fZ")
    return jsonify({'timeDeleted': time_deleted}), requests.codes.okay

def _prepare_index(self, dryrun):
    shape_descriptor = self.get_shape_descriptor()
    index_name = Config.get_es_index_name(ESIndexType.docs, self.replica, shape_descriptor)
    es_client = ElasticsearchClient.get()
    if not dryrun:
        IndexManager.create_index(es_client, self.replica, index_name)
    return index_name

def _write_to_index(self, index_name: str, version: typing.Optional[int] = None):
    es_client = ElasticsearchClient.get()
    initial_mappings = es_client.indices.get_mapping(index_name)[index_name]['mappings']
    super()._write_to_index(index_name, version=version)
    current_mappings = es_client.indices.get_mapping(index_name)[index_name]['mappings']
    if initial_mappings != current_mappings:
        self._refresh_percolate_queries(index_name)

def test_put(self):
    uuid_ = self._put_subscription()
    es_client = ElasticsearchClient.get()
    response = es_client.get(index=self.doc_index_name,
                             doc_type=dss.ESDocType.query.name,
                             id=uuid_)
    registered_query = response['_source']
    self.assertEqual(self.sample_percolate_query, registered_query)

def from_index(cls, replica: Replica, bundle_fqid: BundleFQID, index_name, version=None):
    es_client = ElasticsearchClient.get()
    source = es_client.get(index_name, str(bundle_fqid), ESDocType.doc.name, version=version)['_source']
    return cls(replica, bundle_fqid, source)

def check_count(self, es_query, expected_count, timeout=5):
    es_client = ElasticsearchClient.get()
    timeout_time = timeout + time.time()
    while time.time() <= timeout_time:
        count_resp = es_client.count(index=self.dss_index_name,
                                     doc_type=ESDocType.doc.name,
                                     body=es_query)
        if count_resp['count'] == expected_count:
            break
        else:
            time.sleep(0.5)
    else:
        self.fail("elasticsearch failed to return all results.")

def elasticsearch_delete_index(index_name: str):
    # ensure the index is a test index
    assert Config._CURRENT_CONFIG == BucketConfig.TEST
    assert Config.test_index_suffix.value
    assert index_name.endswith(Config.test_index_suffix.value)
    try:
        es_client = ElasticsearchClient.get()
        es_client.indices.delete(index=index_name, ignore=[404])
    except Exception as e:
        logger.warning("Error occurred while removing Elasticsearch index: %s Exception: %s", index_name, e)

def _prepare_index(self, dryrun):
    shape_descriptor = self['shape_descriptor']
    if shape_descriptor is not None:
        hashed_shape_descriptor = hashlib.sha1(str(shape_descriptor).encode("utf-8")).hexdigest()
    else:
        hashed_shape_descriptor = ""
    index_name = Config.get_es_index_name(ESIndexType.docs, self.replica, hashed_shape_descriptor)
    es_client = ElasticsearchClient.get()
    if not dryrun:
        IndexManager.create_index(es_client, self.replica, index_name)
    return index_name

def _remove_versions(self, versions: typing.MutableMapping[str, int]):
    """
    Remove this document from each given index provided that it contains the given version of this document.
    """
    es_client = ElasticsearchClient.get()
    num_ok, errors = bulk(es_client, raise_on_error=False, actions=[{
        '_op_type': 'delete',
        '_index': index_name,
        '_type': ESDocType.doc.name,
        '_version': version,
        '_id': str(self.fqid),
    } for index_name, version in versions.items()])
    for item in errors:
        logger.warning(f"Document deletion failed: {json.dumps(item)}")

def _get_indexed_versions(self) -> typing.MutableMapping[str, int]:
    """
    Returns a dictionary mapping the name of each index containing this document to the version
    of this document in that index. Note that `version` denotes document version, not bundle version.
    """
    es_client = ElasticsearchClient.get()
    alias_name = Config.get_es_alias_name(ESIndexType.docs, self.replica)
    # First attempt to get the single instance of the document. The common case is that there is
    # zero or one instance.
    try:
        doc = es_client.get(id=str(self.fqid),
                            index=alias_name,
                            _source=False,
                            stored_fields=[])
        # One instance found
        return {doc['_index']: doc['_version']}
    except TransportError as e:
        if e.status_code == 404:
            # No instance found
            return {}
        elif e.status_code == 400:
            # This could be a general error or one complaining that we attempted a single-index
            # operation against a multi-index alias. If the latter, we can actually avoid a round
            # trip by parsing the index names out of the error message generated at
            # https://github.com/elastic/elasticsearch/blob/5.5
            # /core/src/main/java/org/elasticsearch/cluster/metadata/IndexNameExpressionResolver.java#L194
            error = e.info.get('error')
            if error:
                reason = error.get('reason')
                if reason:
                    match = self.multi_index_error.fullmatch(reason)
                    if match:
                        indices = map(str.strip, match.group(2).split(','))
                        # Now get the document version from all indices in the alias
                        doc = es_client.mget(_source=False, stored_fields=[], body={
                            'docs': [{
                                '_id': str(self.fqid),
                                '_index': index
                            } for index in indices]
                        })
                        return {doc['_index']: doc['_version']
                                for doc in doc['docs'] if doc.get('found')}
        raise

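# The `multi_index_error` pattern referenced above is not shown in this excerpt. A plausible
# definition, matching the ES 5.5 message format ("Alias [a] has more than one indices
# associated with it [[i1, i2]], can't execute a single index op"), might look like this
# (illustrative reconstruction, not the canonical source):
import re

# group(2) captures the comma-separated index names inside the double brackets
multi_index_error = re.compile(
    r"Alias \[(.+)\] has more than one indices associated with it "
    r"\[\[(.+)\]\], can't execute a single index op")
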
def _find_matching_subscriptions(self, index_name: str) -> typing.MutableSet[str]:
    percolate_document = {
        'query': {
            'percolate': {
                'field': "query",
                'document_type': ESDocType.doc.name,
                'document': self
            }
        }
    }
    subscription_ids = set()
    for hit in scan(ElasticsearchClient.get(),
                    index=index_name,
                    query=percolate_document):
        subscription_ids.add(hit["_id"])
    logger.debug(f"Found {len(subscription_ids)} matching subscription(s).")
    return subscription_ids

def _write_to_index(self, index_name: str, version: typing.Optional[int] = None):
    """
    Place this document into the given index.

    :param version: if 0, write only if this document is currently absent from the given index
                    if > 0, write only if the specified version of this document is currently present
                    if None, write regardless
    """
    es_client = ElasticsearchClient.get()
    body = self.to_json()
    logger.debug(f"Writing document to index {index_name}: {body}")
    es_client.index(index=index_name,
                    doc_type=ESDocType.doc.name,
                    id=str(self.fqid),
                    body=body,
                    op_type='create' if version == 0 else 'index',
                    version=version if version else None)

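# Illustrative calls showing the three version modes described in the docstring above
# (`doc` and the index name are hypothetical):
doc._write_to_index("docs-index-v1", version=0)  # create only; fails if the document exists
doc._write_to_index("docs-index-v1", version=7)  # write only if version 7 is currently present
doc._write_to_index("docs-index-v1")             # unconditional write
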
def setUp(self):
    super().setUp()
    self.alias_name = dss.Config.get_es_alias_name(dss.ESIndexType.docs, self.replica)
    self.sub_index_name = dss.Config.get_es_index_name(dss.ESIndexType.subscriptions, self.replica)
    shape_identifier = self.index_document.get_shape_descriptor()
    self.doc_index_name = dss.Config.get_es_index_name(dss.ESIndexType.docs, self.replica, shape_identifier)
    es_client = ElasticsearchClient.get()
    IndexManager.create_index(es_client, self.replica, self.doc_index_name)
    es_client.index(index=self.doc_index_name,
                    doc_type=dss.ESDocType.doc.name,
                    id=str(uuid.uuid4()),
                    body=self.index_document,
                    refresh=True)
    self.callback_url = "https://example.com"
    self.sample_percolate_query = smartseq2_paired_ends_v2_or_v3_query

def populate_search_index(self, index_document: dict, count: int) -> list:
    es_client = ElasticsearchClient.get()
    bundles = []
    for i in range(count):
        bundle_uuid = str(uuid.uuid4())
        version = get_version()
        index_document['manifest']['version'] = version
        bundle_fqid = f"{bundle_uuid}.{version}"
        bundle_url = (f"https://127.0.0.1:{self.app._port}"
                      f"/v1/bundles/{bundle_uuid}?version={version}&replica={self.replica.name}")
        es_client.index(index=self.dss_index_name,
                        doc_type=ESDocType.doc.name,
                        id=bundle_fqid,
                        body=index_document,
                        refresh=(i == count - 1))
        bundles.append((bundle_fqid, bundle_url))
    return bundles

def _get_subscription(self, subscription_id: str) -> dict:
    subscription_query = {
        'query': {
            'ids': {
                'type': ESDocType.subscription.name,
                'values': [subscription_id]
            }
        }
    }
    response = ElasticsearchClient.get().search(
        index=Config.get_es_index_name(ESIndexType.subscriptions, self.replica),
        body=subscription_query)
    hits = response['hits']['hits']
    assert len(hits) == 1
    hit = hits[0]
    assert hit['_id'] == subscription_id
    subscription = hit['_source']
    assert 'id' not in subscription
    subscription['id'] = subscription_id
    return subscription

def clear_indexes(index_names: List[str], doctypes: List[str]):
    """
    Erases all of the documents in the given indexes that have any of the doctypes provided.
    This can only be used in the TEST configuration with IndexSuffix.name set. Only indexes
    with the same IndexSuffix.name can be erased.
    """
    # ensure the indexes are test indexes
    assert Config._CURRENT_CONFIG == BucketConfig.TEST
    assert Config.test_index_suffix.value
    for index_name in index_names:
        assert index_name.endswith(Config.test_index_suffix.value)
    es_client = ElasticsearchClient.get()
    if es_client.indices.exists(index_names):
        es_client.delete_by_query(index=index_names,
                                  body={'query': {'match_all': {}}},
                                  doc_type=doctypes,
                                  refresh=True,
                                  conflicts='proceed')

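# Hypothetical teardown usage: wipe the test doc and subscription indexes in one call
# (index names and doctypes are the ones built in the setUp snippets above):
clear_indexes([self.doc_index_name, self.sub_index_name],
              [dss.ESDocType.doc.name, dss.ESDocType.query.name, dss.ESDocType.subscription.name])
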
def find(replica: str):
    owner = security.get_token_email(request.token_info)
    es_client = ElasticsearchClient.get()
    search_obj = Search(using=es_client,
                        index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                        doc_type=ESDocType.subscription.name)
    search = search_obj.query({'bool': {'must': [{'term': {'owner': owner}}]}})
    responses = [{
        'uuid': hit.meta.id,
        'replica': replica,
        'owner': owner,
        **{k: v for k, v in hit.to_dict().items() if k != 'hmac_secret_key'}
    } for hit in search.scan()]
    full_response = {'subscriptions': responses}
    return jsonify(full_response), requests.codes.okay

def find(replica: str):
    owner = request.token_info['email']
    es_client = ElasticsearchClient.get()
    search_obj = Search(using=es_client,
                        index=Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica]),
                        doc_type=ESDocType.subscription.name)
    search = search_obj.query({'match': {'owner': owner}})
    responses = [{
        'uuid': hit.meta.id,
        'replica': replica,
        'owner': owner,
        'callback_url': hit.callback_url,
        'es_query': hit.es_query.to_dict()
    } for hit in search.scan()]
    full_response = {'subscriptions': responses}
    return jsonify(full_response), requests.codes.okay

def test_search_session_expired_when_session_deleted(self):
    self.populate_search_index(self.index_document, 20)
    self.check_count(smartseq2_paired_ends_v3_query, 20)
    url = self.build_url({"per_page": 10})
    search_obj = self.assertPostResponse(
        path=url,
        json_request_body=dict(es_query=smartseq2_paired_ends_v3_query),
        expected_code=requests.codes.partial)
    self.verify_search_result(search_obj.json, smartseq2_paired_ends_v3_query, 20, 10)
    next_url = self.get_next_url(search_obj.response.headers)
    scroll_id = self.verify_next_url(next_url, 10)
    es_client = ElasticsearchClient.get()
    es_client.clear_scroll(scroll_id)
    self.assertPostResponse(
        path=self.strip_next_url(next_url),
        json_request_body=dict(es_query=smartseq2_paired_ends_v3_query),
        expected_code=requests.codes.not_found,
        expected_error=ExpectedErrorFields(code="elasticsearch_context_not_found",
                                           status=requests.codes.not_found))

def _es_search_page(es_query: dict,
                    replica: Replica,
                    per_page: int,
                    _scroll_id: typing.Optional[str],
                    output_format: str) -> dict:
    es_query = deepcopy(es_query)
    es_client = ElasticsearchClient.get()

    # Do not return the raw indexed data unless it is requested
    if output_format != 'raw':
        es_query['_source'] = False

    # The time for a scroll search context to stay open per page. A page of results must be
    # retrieved before this timeout expires. Subsequent calls to search will refresh the scroll
    # timeout. For more details on the time format see:
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#time-units
    scroll = '2m'  # keep the search context alive for 2 minutes; reset on each scroll call

    # From: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html
    # Scroll requests have optimizations that make them faster when the sort order is _doc. If you
    # want to iterate over all documents regardless of the order, this is the most efficient option:
    # {
    #   "sort": [
    #     "_doc"
    #   ]
    # }
    sort = {"sort": ["_doc"]}

    if _scroll_id is None:
        page = es_client.search(index=Config.get_es_alias_name(ESIndexType.docs, replica),
                                doc_type=ESDocType.doc.name,
                                scroll=scroll,
                                size=per_page,
                                body=es_query,
                                sort=sort)
        logger.debug("Created ES scroll instance")
    else:
        page = es_client.scroll(scroll_id=_scroll_id, scroll=scroll)
        logger.debug(f"Retrieved ES results from scroll instance. Scroll_id: {_scroll_id}")
    return page

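# A minimal paging sketch (hypothetical driver): feed each page's _scroll_id back into
# _es_search_page until a page returns no hits. 'summary' stands in for any non-'raw'
# output format.
scroll_id = None
while True:
    page = _es_search_page(es_query, Replica.aws, 500, scroll_id, 'summary')
    hits = page['hits']['hits']
    if not hits:
        break
    scroll_id = page['_scroll_id']
    for hit in hits:
        print(hit['_id'])
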
def setUp(self):
    super().setUp()
    self.alias_name = dss.Config.get_es_alias_name(dss.ESIndexType.docs, self.replica)
    self.sub_index_name = dss.Config.get_es_index_name(dss.ESIndexType.subscriptions, self.replica)
    shape_identifier = self.index_document._get_shape_descriptor()
    shape_identifier = hashlib.sha1(f"{shape_identifier}".encode("utf-8")).hexdigest()
    self.doc_index_name = dss.Config.get_es_index_name(dss.ESIndexType.docs, self.replica, shape_identifier)
    es_client = ElasticsearchClient.get()
    IndexManager.create_index(es_client, self.replica, self.doc_index_name)
    es_client.index(index=self.doc_index_name,
                    doc_type=dss.ESDocType.doc.name,
                    id=str(uuid.uuid4()),
                    body=self.index_document,
                    refresh=True)
    self.endpoint = Endpoint(callback_url="https://example.com",
                             method="POST",
                             encoding="application/json",
                             form_fields={'foo': 'bar'},
                             payload_form_field='baz')
    self.sample_percolate_query = smartseq2_paired_ends_vx_query
    self.hmac_key_id = 'dss_test'
    self.hmac_secret_key = '23/33'

def get_es_client():
    domain_name = "dss-index-" + os.environ['DSS_DEPLOYMENT_STAGE']
    host = boto3.client("es").describe_elasticsearch_domain(
        DomainName=domain_name)['DomainStatus']['Endpoint']
    os.environ['DSS_ES_ENDPOINT'] = host
    return ElasticsearchClient.get()

def setUp(self):
    super().setUp()
    self.dss_index_name = dss.Config.get_es_index_name(dss.ESIndexType.docs, self.replica)
    es_client = ElasticsearchClient.get()
    IndexManager.create_index(es_client, self.replica, self.dss_index_name)

def put(json_request_body: dict, replica: str):
    uuid = str(uuid4())
    es_query = json_request_body['es_query']
    owner = request.token_info['email']
    es_client = ElasticsearchClient.get()

    index_mapping = {
        "mappings": {
            ESDocType.subscription.name: {
                "properties": {
                    "owner": {
                        "type": "string",
                        "index": "not_analyzed"
                    },
                    "es_query": {
                        "type": "object",
                        "enabled": "false"
                    }
                }
            }
        }
    }
    # Elasticsearch preprocesses inputs by splitting strings on punctuation.
    # So for [email protected], if I searched for people with the email address [email protected],
    # [email protected] would show up because elasticsearch matched example w/ example.
    # By including "index": "not_analyzed", Elasticsearch leaves all owner inputs alone.
    index_name = Config.get_es_index_name(ESIndexType.subscriptions, Replica[replica])
    IndexManager.get_subscription_index(es_client, index_name, index_mapping)

    # get all indexes that use the current alias
    alias_name = Config.get_es_alias_name(ESIndexType.docs, Replica[replica])
    doc_indexes = _get_indexes_by_alias(es_client, alias_name)

    # try to subscribe the query to each of the indexes
    subscribed_indexes = []
    last_ex = None
    for doc_index in doc_indexes:
        try:
            percolate_registration = _register_percolate(es_client, doc_index, uuid, es_query, replica)
        except ElasticsearchException as ex:
            logger.debug(f"Exception occurred when registering a document to an index. Exception: {ex}")
            last_ex = ex
        else:
            logger.debug(f"Percolate query registration succeeded:\n{percolate_registration}")
            subscribed_indexes.append(doc_index)

    # Queries are unlikely to fit in all of the indexes, therefore errors will almost always occur.
    # Only return an error if no queries were successfully indexed.
    if doc_indexes and not subscribed_indexes:
        logger.critical(f"Percolate query registration failed: owner: {owner}, uuid: {uuid}, "
                        f"replica: {replica}, es_query: {es_query}, Exception: {last_ex}")
        raise DSSException(requests.codes.internal_server_error,
                           "elasticsearch_error",
                           "Unable to register elasticsearch percolate query!") from last_ex

    json_request_body['owner'] = owner

    try:
        subscription_registration = _register_subscription(es_client, uuid, json_request_body, replica)
        logger.debug(f"Event Subscription succeeded:\n{subscription_registration}")
    except ElasticsearchException as ex:
        logger.critical(f"Event Subscription failed: owner: {owner}, uuid: {uuid}, "
                        f"replica: {replica}, Exception: {ex}")
        # Delete the percolate query to make sure queries and subscriptions are in sync.
        doc_indexes = _get_indexes_by_alias(es_client, alias_name)
        _unregister_percolate(es_client, doc_indexes, uuid)
        raise DSSException(requests.codes.internal_server_error,
                           "elasticsearch_error",
                           "Unable to register subscription! Rolling back percolate query.")

    return jsonify(dict(uuid=uuid)), requests.codes.created

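# Illustrative effect of "index": "not_analyzed": a term query on the owner field matches
# only the exact stored string, never analyzed fragments of it (the address is made up):
exact_owner_query = {
    "query": {
        "term": {
            "owner": "alice@example.org"  # matches only this exact address
        }
    }
}
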
# The head of this function's signature was lost in extraction; the name `_format_page`
# is hypothetical, and the `page` and `es_query` parameters are inferred from the body.
def _format_page(page: dict, es_query: dict, output_format: str) -> dict:
    result_list = []  # type: typing.List[dict]
    for hit in page['hits']['hits']:
        result = {'bundle_fqid': hit['_id'], 'search_score': hit['_score']}
        if output_format == 'raw':
            result['metadata'] = hit['_source']
        result_list.append(result)
    return {
        'es_query': es_query,
        'results': result_list,
        'total_hits': page['hits']['total']
    }

# Standalone snippet: query setup for bundles whose files include links_json, against the AWS replica.
es_client = ElasticsearchClient.get()
replica = Replica.aws
es_query = {
    "query": {
        "bool": {
            "must": [{
                "exists": {
                    "field": "files.links_json"
                }
            }]
        }
    }
}
output_format = 'raw'
per_page = 1000
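
# Hypothetical driver combining the snippet above with _es_search_page (shown earlier):
page = _es_search_page(es_query, replica, per_page, None, output_format)
results = _format_page(page, es_query, output_format)
print(f"total hits: {results['total_hits']}, page size: {len(results['results'])}")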