def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Create the Elasticsearch client and store the manager settings.

    url is passed to Elasticsearch as the single entry of ``hosts``.
    auto_commit_interval, unique_key, chunk_size, meta_index_name,
    meta_type and attachment_field are stored on the instance unchanged.

    Recognised keyword arguments:
      clientOptions -- dict of extra keyword arguments for Elasticsearch.
      aws -- dict that enables AWS request signing. It may carry
             'access_id' and 'secret_key' (explicit credentials) and
             'region' (used only when the session has no region).

    Raises ConfigurationError when 'aws' is given but the aws extras are
    not importable (_HAS_AWS is false).
    """
    # Work on a copy: the signing options added below must not leak back
    # into the configuration dict owned by the caller.
    client_options = dict(kwargs.get('clientOptions') or {})

    if 'aws' in kwargs:
        if not _HAS_AWS:
            raise ConfigurationError(
                'aws extras must be installed to sign Elasticsearch requests')
        aws_args = kwargs['aws'] or {}
        if 'access_id' in aws_args and 'secret_key' in aws_args:
            aws = aws_session.Session(
                aws_access_key_id=aws_args['access_id'],
                aws_secret_access_key=aws_args['secret_key'])
        else:
            aws = aws_session.Session()
        credentials = aws.get_credentials()
        # Prefer the session's region; fall back to the configured one and
        # finally to 'us-east-1' instead of failing with a KeyError.
        region = aws.region_name or aws_args.get('region', 'us-east-1')
        client_options['http_auth'] = AWSV4Sign(credentials, region, 'es')
        client_options['use_ssl'] = True
        client_options['verify_certs'] = True
        client_options['connection_class'] = \
            es_connection.RequestsHttpConnection

    self.elastic = Elasticsearch(hosts=[url], **client_options)
    self.auto_commit_interval = auto_commit_interval
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    # An interval of None or 0 means no periodic refresh is scheduled.
    if self.auto_commit_interval not in [None, 0]:
        self.run_auto_commit()
    self._formatter = DefaultDocumentFormatter()

    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Open a Graph for url and remember the manager settings.

    The 'clientOptions' entry of kwargs (None when absent) is kept on the
    instance as ``self.kwargs``.
    """
    client_options = kwargs.get("clientOptions")

    self.graph = Graph(url)
    self.auto_commit_interval = auto_commit_interval
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self._formatter = DefaultDocumentFormatter()
    self.kwargs = client_options
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Connect to Elasticsearch at url and store the manager settings.

    Starts the periodic auto-commit when auto_commit_interval is neither
    None nor 0.
    """
    client = Elasticsearch(hosts=[url])
    self.elastic = client
    self.auto_commit_interval = auto_commit_interval
    # default type is string, change if needed
    self.doc_type = 'string'
    self.unique_key = unique_key
    self.chunk_size = chunk_size

    periodic_commit_enabled = self.auto_commit_interval not in (None, 0)
    if periodic_commit_enabled:
        self.run_auto_commit()

    self._formatter = DefaultDocumentFormatter()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='uid', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Open a Graph for url and store the manager settings.

    The 'clientOptions' entry of kwargs (None when absent) is kept as
    ``self.kwargs``. ``self.authorization_token`` holds the base64
    encoding of the NEO4J_AUTH environment variable, or None when that
    variable is not set.
    """
    self.graph = Graph(url)
    self.url = url
    self.auto_commit_interval = auto_commit_interval
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self._formatter = DefaultDocumentFormatter()
    self.kwargs = kwargs.get("clientOptions")

    credentials = os.getenv('NEO4J_AUTH')
    if credentials is None:
        # b64encode(None) raises TypeError; keep the manager constructible
        # when no credentials are configured.
        self.authorization_token = None
    else:
        # b64encode needs bytes: on Python 3 os.getenv returns text.
        if not isinstance(credentials, bytes):
            credentials = credentials.encode('utf-8')
        self.authorization_token = base64.b64encode(credentials)
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             **kwargs):
    """Connect to Elasticsearch at url and store the manager settings.

    meta_index_name and meta_type are stored for later use by the
    manager. The periodic auto-commit is started when
    auto_commit_interval is neither None nor 0.
    """
    client = Elasticsearch(hosts=[url])
    self.elastic = client
    self.auto_commit_interval = auto_commit_interval
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size

    periodic_commit_enabled = self.auto_commit_interval not in (None, 0)
    if periodic_commit_enabled:
        self.run_auto_commit()

    self._formatter = DefaultDocumentFormatter()
def __init__(
    self,
    url,
    auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
    unique_key="_id",
    chunk_size=DEFAULT_MAX_BULK,
    meta_index_name="mongodb_meta",
    meta_type="mongodb_meta",
    attachment_field="content",
    **kwargs
):
    """Create the Elasticsearch client, the bulk buffer and the committer.

    url may be a single host or a list of hosts. Recognised keyword
    arguments:
      clientOptions -- dict of extra keyword arguments for Elasticsearch.
      aws -- configuration handed to create_aws_auth; enables signed
             requests over SSL with certificate verification.
      autoSendInterval -- passed to AutoCommiter; defaults to
             DEFAULT_SEND_INTERVAL.

    Raises errors.InvalidConfiguration when 'aws' is given but the aws
    extras are not importable (_HAS_AWS is false).
    """
    # Work on a copy: the options added below must not leak back into the
    # configuration dict owned by the caller.
    client_options = dict(kwargs.get("clientOptions") or {})
    if "aws" in kwargs:
        if not _HAS_AWS:
            raise errors.InvalidConfiguration(
                "aws extras must be installed to sign Elasticsearch "
                "requests. Install with: "
                "pip install elastic2-doc-manager[aws]"
            )
        client_options["http_auth"] = create_aws_auth(kwargs["aws"])
        client_options["use_ssl"] = True
        client_options["verify_certs"] = True
        client_options["connection_class"] = es_connection.RequestsHttpConnection
    # isinstance also accepts list subclasses, which the exact type check
    # used to wrap in a second list.
    if not isinstance(url, list):
        url = [url]
    self.elastic = Elasticsearch(hosts=url, **client_options)

    self._formatter = DefaultDocumentFormatter()
    self.BulkBuffer = BulkBuffer(self)

    # Bulk operations can run in another thread, so access to BulkBuffer
    # is guarded by a lock while documents are being committed to
    # Elasticsearch; otherwise BulkBuffer might read outdated documents
    # from Elasticsearch while a bulk request is still in flight.
    self.lock = threading.Lock()

    self.auto_commit_interval = auto_commit_interval
    self.auto_send_interval = kwargs.get("autoSendInterval", DEFAULT_SEND_INTERVAL)
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
    self.auto_commiter = AutoCommiter(
        self, self.auto_send_interval, self.auto_commit_interval
    )
    self.auto_commiter.start()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Build the Elasticsearch client and store the manager settings.

    The client comes from self._create_elasticsearch_client, which
    receives url and the 'clientOptions' entry of kwargs (an empty dict
    when absent). The periodic auto-commit is started when
    auto_commit_interval is neither None nor 0.
    """
    client_options = kwargs.get('clientOptions', {})
    self.elastic = self._create_elasticsearch_client(url, client_options)

    self.auto_commit_interval = auto_commit_interval
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size

    periodic_commit_enabled = self.auto_commit_interval not in (None, 0)
    if periodic_commit_enabled:
        self.run_auto_commit()

    self._formatter = DefaultDocumentFormatter()
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Create the Elasticsearch client and the bulk buffer.

    url may be a single host or a list of hosts. Recognised keyword
    arguments:
      clientOptions -- dict of extra keyword arguments for Elasticsearch.
             Sniffing defaults (sniff_on_start, sniff_on_connection_fail,
             sniffer_timeout=60) are applied unless already present.
      aws -- configuration handed to create_aws_auth; enables signed
             requests over SSL with certificate verification.

    Raises errors.InvalidConfiguration when 'aws' is given but the aws
    extras are not importable (_HAS_AWS is false).
    """
    # Work on a copy: the defaults and signing options added below must
    # not leak back into the configuration dict owned by the caller.
    client_options = dict(kwargs.get('clientOptions') or {})
    client_options.setdefault('sniff_on_start', True)
    client_options.setdefault('sniff_on_connection_fail', True)
    client_options.setdefault('sniffer_timeout', 60)
    if 'aws' in kwargs:
        if not _HAS_AWS:
            raise errors.InvalidConfiguration(
                'aws extras must be installed to sign Elasticsearch '
                'requests. Install with: '
                'pip install elastic2-doc-manager[aws]')
        client_options['http_auth'] = create_aws_auth(kwargs['aws'])
        client_options['use_ssl'] = True
        client_options['verify_certs'] = True
        client_options['connection_class'] = \
            es_connection.RequestsHttpConnection
    # isinstance also accepts list subclasses, which the exact type check
    # used to wrap in a second list.
    if not isinstance(url, list):
        url = [url]
    self.elastic = Elasticsearch(hosts=url, **client_options)

    self._formatter = DefaultDocumentFormatter()
    self.BulkBuffer = BulkBuffer(self)

    # Bulk operations can run in another thread, so access to BulkBuffer
    # is guarded by a lock while documents are being committed to
    # Elasticsearch; otherwise BulkBuffer might read outdated documents
    # from Elasticsearch while a bulk request is still in flight.
    self.lock = Lock()

    self.auto_commit_interval = auto_commit_interval
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    # An interval of None or 0 means no periodic refresh is scheduled.
    if self.auto_commit_interval not in [None, 0]:
        self.run_auto_commit()

    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Connect to Elasticsearch at url and store the manager settings.

        The 'clientOptions' entry of kwargs, when present, is expanded
        into keyword arguments for the Elasticsearch client. The periodic
        auto-commit is started when auto_commit_interval is neither None
        nor 0.
        """
        self.elastic = Elasticsearch(hosts=[url],
                                     **kwargs.get('clientOptions', {}))
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread."""
        # run_auto_commit only reschedules itself while the interval is set.
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply update_spec to doc and return the resulting document.

        A spec without "$set" and "$unset" is a full replacement and is
        returned as is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Mirror a MongoDB command document onto Elasticsearch.

        Handles 'dropDatabase', 'create' and 'drop'. Raises
        errors.OperationFailed for 'renameCollection'.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={"_source": {"enabled": True}})

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                self.elastic.indices.delete_mapping(index=db.lower(),
                                                    doc_type=coll)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.
        """
        # Refresh first so the document fetched below is up to date.
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)
        document = self.elastic.get(index=index, doc_type=doc_type,
                                    id=u(document_id))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            "ns": namespace,
            "_ts": timestamp
        }
        # An interval of 0 means every write is refreshed immediately.
        refresh = (self.auto_commit_interval == 0)
        # Index the source document, using lowercase namespace as index name.
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id, refresh=refresh)
        # Index document metadata with original namespace (mixed upper/lower).
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id, refresh=refresh)
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        # Store the original namespace, as upsert() and
                        # insert_file() do, not the lowercased index name.
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index the file f as a base64-encoded attachment document."""
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        refresh = (self.auto_commit_interval == 0)
        self.elastic.index(index=index, doc_type=doc_type,
                           body=doc, id=doc_id, refresh=refresh)
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id, refresh=refresh)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        refresh = (self.auto_commit_interval == 0)
        self.elastic.delete(index=index, doc_type=doc_type,
                            id=u(document_id), refresh=refresh)
        self.elastic.delete(index=self.meta_index_name,
                            doc_type=self.meta_type,
                            id=u(document_id), refresh=refresh)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results."""
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Reschedule only while auto-commit is enabled; stop() clears the
        # interval, which ends the chain of timers.
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Returns None when the meta index holds no documents.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

        if not result:
            return None
        last = result[0]
        last['_source']['_id'] = last['_id']
        return last['_source']
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id", chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Create the Elasticsearch client, the bulk buffer and the committer.

        url may be a single host or a list of hosts. Recognised keyword
        arguments:
          clientOptions -- dict of extra keyword arguments for Elasticsearch.
          aws -- configuration handed to create_aws_auth; enables signed
                 requests over SSL with certificate verification.
          autoSendInterval -- passed to AutoCommiter; defaults to
                 DEFAULT_SEND_INTERVAL.
          createMultiType -- when true, collection names are used as
                 document types; otherwise defaultType is used everywhere.
          defaultType -- document type used when createMultiType is false
                 (default "_doc").

        Raises errors.InvalidConfiguration when 'aws' is given but the aws
        extras are not importable (_HAS_AWS is false).
        """
        # Work on a copy: the options added below must not leak back into
        # the configuration dict owned by the caller.
        client_options = dict(kwargs.get("clientOptions") or {})
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = True
            client_options[
                "connection_class"] = es_connection.RequestsHttpConnection
        # isinstance also accepts list subclasses, which the exact type
        # check used to wrap in a second list.
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # Bulk operations can run in another thread, so access to
        # BulkBuffer is guarded by a lock while documents are being
        # committed to Elasticsearch; otherwise BulkBuffer might read
        # outdated documents while a bulk request is still in flight.
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)

        # es6 deprecates support for multiple document types, so
        # default_type is used for consistency. Multiple document types
        # are only used when explicitly requested.
        self.create_multi_type = kwargs.get("createMultiType", False)
        self.default_type = kwargs.get("defaultType", "_doc")

        self.meta_index_name = meta_index_name
        self.meta_type = (
            meta_type if self.create_multi_type else self.default_type)
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split(".", 1)
        # A conditional expression works for any truthy/falsy setting,
        # whereas indexing a tuple with it required a real bool.
        if not self.create_multi_type:
            doc_type = self.default_type
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commiter.join()
        self.auto_commit_interval = 0
        # Commit any remaining docs from buffer
        self.commit()

    def apply_update(self, doc, update_spec):
        """Apply update_spec to doc and return the resulting document.

        A spec without "$set" and "$unset" is a full replacement and is
        returned as is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Mirror a MongoDB command document onto Elasticsearch.

        Handles 'dropDatabase', 'create' and 'drop'. Raises
        errors.OperationFailed for 'renameCollection'.
        """
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if not self.create_multi_type:
                coll = self.default_type
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={"_source": {"enabled": True}})

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if not self.create_multi_type:
                coll = self.default_type
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn(
                    "Deleting all documents of type %s on index %s. "
                    "The mapping definition will persist and must be "
                    "removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type="delete")
                     for result in scan(self.elastic, index=db.lower(),
                                        doc_type=coll)),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r", resp)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # Hold the lock only for the buffer lookup: upsert() ends up in
        # index(), which takes the same non-reentrant lock.
        with self.lock:
            # Check if document source is stored in local buffer
            document = self.BulkBuffer.get_from_sources(
                index, doc_type, str(document_id))
        if document:
            # Document source collected from local buffer.
            # Perform apply_update on it and then it will be
            # ready for committing to Elasticsearch
            updated = self.apply_update(document, update_spec)
            # _id is immutable in MongoDB, so won't have changed in update
            updated["_id"] = document_id
            self.upsert(updated, namespace, timestamp)
        else:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp, update_spec=None):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        metadata = {"ns": namespace, "_ts": timestamp}

        # Index the source document, using lowercase namespace as index name.
        action = {
            "_op_type": "index",
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": self._formatter.format_document(doc),
        }
        # Index document metadata with original namespace (mixed upper/lower).
        meta_action = {
            "_op_type": "index",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": bson.json_util.dumps(metadata),
        }

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc["_id"] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = str(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc),
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {"ns": namespace, "_ts": timestamp},
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw["chunk_size"] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Buffer the file f as a base64-encoded attachment document."""
        doc = f.get_metadata()
        doc_id = str(doc.pop("_id"))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {"ns": namespace, "_ts": timestamp}

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        action = {
            "_op_type": "index",
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": doc,
        }
        meta_action = {
            "_op_type": "index",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": bson.json_util.dumps(metadata),
        }

        self.index(action, meta_action)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        action = {
            "_op_type": "delete",
            "_index": index,
            "_type": doc_type,
            "_id": str(document_id),
        }
        meta_action = {
            "_op_type": "delete",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": str(document_id),
        }

        self.index(action, meta_action)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results."""
        for hit in scan(self.elastic, query=kwargs.pop("body", None),
                        scroll="10m", **kwargs):
            hit["_source"]["_id"] = hit["_id"]
            yield hit["_source"]

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "range": {"_ts": {"gte": start_ts, "lte": end_ts}}
                }
            },
        )

    def index(self, action, meta_action, doc_source=None, update_spec=None):
        """Buffer an action with its meta action; commit when due.

        A commit is triggered once the number of buffered documents
        reaches chunk_size, or on every call when auto_commit_interval
        is 0.
        """
        with self.lock:
            self.BulkBuffer.add_upsert(action, meta_action, doc_source,
                                       update_spec)

        # commit() takes the lock itself, so it must run outside the
        # ``with`` block above.
        # Divide by two to account for meta actions
        if (len(self.BulkBuffer.action_buffer) / 2 >= self.chunk_size
                or self.auto_commit_interval == 0):
            self.commit()

    def send_buffered_operations(self):
        """Send buffered operations to Elasticsearch.

        This method is periodically called by the AutoCommitThread.
        """
        with self.lock:
            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    # Named ``failures`` so the ``errors`` module is not
                    # shadowed inside this method.
                    successes, failures = bulk(self.elastic, action_buffer)
                    LOG.debug(
                        "Bulk request finished, successfully sent %d "
                        "operations",
                        successes,
                    )
                    if failures:
                        LOG.error("Bulk request finished with errors: %r",
                                  failures)
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception")

    def commit(self):
        """Send buffered requests and refresh all indexes."""
        self.send_buffered_operations()
        retry_until_ok(self.elastic.indices.refresh, index="")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Returns None when the meta index holds no documents.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1,
            )["hits"]["hits"]
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

        if not result:
            return None
        last = result[0]
        last["_source"]["_id"] = last["_id"]
        return last["_source"]
class DocManager(DocManagerBase):
    """DLKit -> Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.

    Massages the DLKit data to include repository ID and run info for assets
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Connect to Elasticsearch at url and store the manager settings.

        The 'clientOptions' entry of kwargs, when present, is expanded
        into keyword arguments for the Elasticsearch client. The periodic
        auto-commit is started when auto_commit_interval is neither None
        nor 0.
        """
        self.elastic = Elasticsearch(hosts=[url],
                                     **kwargs.get('clientOptions', {}))
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread."""
        # run_auto_commit only reschedules itself while the interval is set.
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply update_spec to doc and return the resulting document.

        A spec without "$set" and "$unset" is a full replacement and is
        returned as is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Mirror a MongoDB command document onto Elasticsearch.

        Handles 'dropDatabase', 'create' and 'drop'. Raises
        errors.OperationFailed for 'renameCollection'.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={"_source": {"enabled": True}})

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                self.elastic.indices.delete_mapping(index=db.lower(),
                                                    doc_type=coll)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        When the update is to an 'edx-composition' document that lists
        'assetIds', the referenced Asset documents are re-indexed with
        run / course / domain metadata, and assets no longer listed have
        that metadata removed.
        """
        # Refresh first so the document fetched below is up to date.
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)
        document = self.elastic.get(index=index, doc_type=doc_type,
                                    id=u(document_id))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated, namespace, timestamp)

        # If the update is to a repository.Composition, make sure the
        # assets listed in assetIds also have a reference to the
        # repositoryId / parent of the given composition.
        # .get() keeps update specs without genusTypeId (for example
        # "$set" specs) from failing with a KeyError.
        genus_type = update_spec.get('genusTypeId') or ''
        if 'edx-composition' in genus_type and 'assetIds' in update_spec:
            self._sync_composition_assets(document, update_spec, index,
                                          timestamp)

        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    def _sync_composition_assets(self, document, update_spec, index,
                                 timestamp):
        """Propagate a composition's run/course/domain ids to its assets."""
        # get repositoryId and its parents
        app_user = User(username='******', authenticated=True)
        dummy_request = create_test_request(app_user)
        activate_managers(dummy_request)
        rm = get_session_data(dummy_request, 'rm')
        run_repo_id = update_spec['repositoryId']
        run_repo = rm.get_repository(clean_id(run_repo_id))
        # The builtin next() works on Python 2 and 3 iterators alike.
        course_repo = next(rm.get_parent_repositories(run_repo.ident))
        domain_repo = next(rm.get_parent_repositories(course_repo.ident))

        asset_namespace = '{0}.{1}'.format(index, 'Asset')

        # now get the assets referenced in update_spec['assetIds'] and
        # append the course_run_name to their docs
        for asset_id in update_spec['assetIds']:
            asset_doc_id = ObjectId(clean_id(asset_id).identifier)
            asset_document = self.elastic.get(index=index,
                                              doc_type='Asset',
                                              id=u(asset_doc_id))
            source = asset_document['_source']
            if 'enclosedObjectId' in source:
                am = get_session_data(dummy_request, 'am')
                bank = am.get_bank(clean_id(source['repositoryId']))
                items = bank.get_assessment_items(
                    clean_id(source['enclosedObjectId']))
                try:
                    item_text = ' '.join(
                        [i.get_text('edxml') for i in items])
                except AttributeError:
                    item_text = ''
            else:
                item_text = ' '.join(
                    [ac['text']['text'] for ac in source['assetContents']])

            full_text = '{0} {1} {2}'.format(
                source['displayName']['text'],
                source['description']['text'],
                item_text)

            denormalized_asset = source.copy()
            add_metadata(denormalized_asset, 'runs', str(run_repo.ident))
            add_metadata(denormalized_asset, 'courses',
                         str(course_repo.ident))
            add_metadata(denormalized_asset, 'domains',
                         str(domain_repo.ident))
            add_metadata(denormalized_asset, 'fullText', full_text)

            updated_asset = self.apply_update(source, denormalized_asset)
            # _id is immutable in MongoDB, so won't have changed in update
            updated_asset['_id'] = asset_document['_id']
            self.upsert(updated_asset, asset_namespace, timestamp)

        # also remove any runs, if asset removed from a composition
        if 'assetIds' in document['_source']:
            removed_assets = [i for i in document['_source']['assetIds']
                              if i not in update_spec['assetIds']]
            # Ids are stored as strings above, so they are removed as
            # strings too (the domain id used to be removed unconverted).
            stale = (
                ('runs', str(run_repo.ident)),
                ('courses', str(course_repo.ident)),
                ('domains', str(domain_repo.ident)),
            )
            for asset_id in removed_assets:
                asset_doc_id = ObjectId(clean_id(asset_id).identifier)
                asset_document = self.elastic.get(index=index,
                                                  doc_type='Asset',
                                                  id=u(asset_doc_id))
                denormalized_asset = asset_document['_source'].copy()
                for field, value in stale:
                    # Skip assets that never carried this metadata
                    # instead of failing on a missing key or value.
                    values = denormalized_asset.get(field)
                    if values and value in values:
                        values.remove(value)

                updated_asset = self.apply_update(
                    asset_document['_source'], denormalized_asset)
                # _id is immutable in MongoDB, so won't have changed in
                # update
                updated_asset['_id'] = asset_document['_id']
                self.upsert(updated_asset, asset_namespace, timestamp)

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            "ns": namespace,
            "_ts": timestamp
        }
        # An interval of 0 means every write is refreshed immediately.
        refresh = (self.auto_commit_interval == 0)
        # Index the source document, using lowercase namespace as index name.
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id, refresh=refresh)
        # Index document metadata with original namespace (mixed upper/lower).
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id, refresh=refresh)
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        # Store the original namespace, as upsert() and
                        # insert_file() do, not the lowercased index name.
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            # ``is None`` rather than falsiness: a last document that held
            # only '_id' is an empty dict here but is not an empty sequence.
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error("Could not bulk-upsert document "
                              "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index the file f as a base64-encoded attachment document."""
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        refresh = (self.auto_commit_interval == 0)
        self.elastic.index(index=index, doc_type=doc_type,
                           body=doc, id=doc_id, refresh=refresh)
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id, refresh=refresh)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        refresh = (self.auto_commit_interval == 0)
        self.elastic.delete(index=index, doc_type=doc_type,
                            id=u(document_id), refresh=refresh)
        self.elastic.delete(index=self.meta_index_name,
                            doc_type=self.meta_type,
                            id=u(document_id), refresh=refresh)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results."""
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Reschedule only while auto-commit is enabled; stop() clears the
        # interval, which ends the chain of timers.
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Returns None when the meta index holds no documents.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None

        if not result:
            return None
        last = result[0]
        last['_source']['_id'] = last['_id']
        return last['_source']
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions
    on Elasticsearch. This variant routes everything it indexes through the
    module-level ``elastic_doc`` transformer and writes it to the
    ``catalog.variant`` index/type.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Connect to Elasticsearch and start the auto-commit timer.

        :param url: a host string or a list of host strings.
        :param auto_commit_interval: seconds between index refreshes; ``0``
            refreshes on every write, ``None`` disables the timer.
        :param unique_key: stored on the instance as ``unique_key``.
        :param chunk_size: bulk chunk size passed to ``streaming_bulk`` when
            greater than zero.
        :param meta_index_name: index that receives the ns/_ts metadata.
        :param meta_type: mapping type used in the metadata index.
        :param attachment_field: field that receives base64 file content.
        :param kwargs: recognised keys are ``clientOptions`` (extra keyword
            arguments for the Elasticsearch client), ``aws`` (request signing
            configuration) and ``routing`` (parent-field lookup table).
        :raises errors.InvalidConfiguration: ``aws`` was given but the aws
            extras are not installed.
        """
        # Work on a copy: the defaults and AWS settings added below must not
        # leak into the caller's configuration dict.
        client_options = dict(kwargs.get('clientOptions') or {})
        client_options.setdefault('sniff_on_start', True)
        client_options.setdefault('sniff_on_connection_fail', True)
        client_options.setdefault('sniffer_timeout', 60)
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.routing = kwargs.get('routing', {})
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _index_and_mapping(self, namespace):
        """Split ``namespace`` into a lower-cased index and a mapping type."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def _get_parent_field(self, index, doc_type):
        """Return the configured parent field for this collection, or None.

        The value is read from ``self.routing[index][doc_type]['variant_id']``.
        """
        try:
            return self.routing[index][doc_type]['variant_id']
        except KeyError:
            return None

    def _is_child_type(self, index, doc_type):
        """Return True if this mapping type is a child."""
        return self._get_parent_field(index, doc_type) is not None

    def _get_parent_id_from_mongodb(self, index, doc_type, doc):
        """Pop the parent ID out of a MongoDB document.

        Returns None when no parent field is configured. The parent field is
        removed from ``doc`` as a side effect.
        """
        parent_field = self._get_parent_field(index, doc_type)
        if parent_field is None:
            return None

        return self._formatter.transform_value(doc.pop(parent_field, None))

    def _get_parent_id_from_elastic(self, doc):
        """Return the ``_parent`` value of an Elasticsearch hit, if any."""
        return doc.get('_parent')

    def _search_doc_by_id(self, index, doc_type, doc_id):
        """Return the single hit whose _id is ``doc_id``, or None.

        None is returned unless the search reports exactly one hit.
        """
        result = self.elastic.search(
            index=index, doc_type=doc_type,
            body={
                'query': {
                    'ids': {
                        'type': doc_type,
                        'values': [u(doc_id)]
                    }
                }
            })
        if result['hits']['total'] == 1:
            return result['hits']['hits'][0]
        return None

    def stop(self):
        """Stop the auto-commit thread."""
        # run_auto_commit() only re-arms its Timer while the interval is set.
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the result.

        A spec without ``$set``/``$unset`` is a full replacement and is
        returned as-is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Handle dropDatabase / renameCollection / create / drop commands.

        :raises errors.OperationFailed: for ``renameCollection``, which is
            not supported.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={
                        "_source": {"enabled": True}
                    })

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must "
                              "be removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type='delete') for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)))
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r", resp)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        ``facility_variant`` updates are applied to the matching
        ``catalog/variant`` document already in Elasticsearch; every other
        type is rebuilt from MongoDB (or taken from a full-replacement spec)
        and re-indexed through ``upsert``.
        """
        # NOTE(review): the if/else nesting of this method was reconstructed
        # from whitespace-collapsed source -- confirm against the original.
        self.commit()
        _, doc_type = self._index_and_mapping(namespace)
        if doc_type == "facility_variant":
            self._update_facility_variant(
                document_id, update_spec, namespace, timestamp)
        else:
            self._update_variant(
                document_id, update_spec, namespace, timestamp)

    def _update_facility_variant(self, document_id, update_spec, namespace,
                                 timestamp):
        """Patch the catalog/variant document matching a facility variant."""
        if not document_id:
            return
        result = self.elastic.search(
            index="catalog", doc_type="variant",
            body={
                "query": {
                    "match": {"facility_variant_id": u(document_id)}
                }
            })
        # Only act on an unambiguous match.
        if result['hits']['total'] != 1:
            return
        hit = result['hits']['hits'][0]
        source = hit.get('_source')
        if not source:
            return

        if "$set" in update_spec:
            # Only overwrite fields the indexed document already has.
            changes = update_spec['$set']
            for field in changes:
                if field in source:
                    source[str(field)] = changes[field]
        else:
            # Full-replacement spec: copy the facility-level fields across.
            for field in ('status', 'comment', 'reason', 'is_available',
                          'mrp', 'selling_price', 'discount'):
                source[field] = update_spec[field]
        source['_id'] = hit['_id']
        # Tells upsert() to index this document as-is instead of running it
        # through elastic_doc().
        source['is_direct_update'] = True
        self.upsert(source, namespace, timestamp)

    def _update_variant(self, document_id, update_spec, namespace, timestamp):
        """Re-index a variant from a replacement spec or from MongoDB."""
        if "_id" in update_spec:
            self.upsert(update_spec, namespace, timestamp)
            return
        variant = m_variant.find_one({"_id": document_id})
        if variant and "$set" in update_spec:
            changes = update_spec['$set']
            for field in changes:
                if str(field) == "reason":
                    variant['variant_reason'] = changes[field]
                else:
                    variant[str(field)] = changes[field]
            variant['variant_id'] = str(document_id)
            self.upsert(variant, namespace, timestamp)

    def _stream_bulk(self, actions):
        """Send ``actions`` with streaming_bulk and log per-item failures.

        Commits afterwards when ``auto_commit_interval`` is 0. An
        ``EmptyDocsError`` raised by the action generator is swallowed.
        """
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=actions,
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch.

        Nothing is sent unless the namespace's type is ``variant`` or ``doc``
        carries the ``is_direct_update`` marker. Documents are always written
        to ``catalog.variant`` regardless of ``namespace``.
        """
        LOG.info("calling : upsert with : %s,%s", doc, namespace)

        def docs_to_upsert():
            target_ns = 'catalog.variant'
            index, doc_type = self._index_and_mapping(target_ns)
            if 'is_direct_update' in doc:
                # Already in Elasticsearch shape: index it unchanged.
                doc.pop("is_direct_update")
                elastic_docs = [doc]
                LOG.info("final object %s, %s,%s",
                         elastic_docs, index, doc_type)
            else:
                elastic_docs = elastic_doc(doc)

            # Check before iterating: looping over None would raise
            # TypeError instead of the intended EmptyDocsError.
            if elastic_docs is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

            for item in elastic_docs:
                # Remove metadata and redundant _id
                doc_id = u(item.pop("_id"))
                yield {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(item)
                }
                yield {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": target_ns,
                        "_ts": timestamp
                    }
                }

        _, doc_type = self._index_and_mapping(namespace)
        # Evaluated before the generator runs, i.e. before the marker is
        # popped from doc.
        if "variant" == doc_type or 'is_direct_update' in doc:
            self._stream_bulk(docs_to_upsert())

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        Nothing is sent unless the namespace's type is ``variant``. Each
        input document is expanded through ``elastic_doc`` first.
        """
        LOG.info("calling : bulk_upsert")

        def docs_to_upsert():
            doc = None
            for doc in docs:
                for item in elastic_doc(doc):
                    # Remove metadata and redundant _id
                    index, doc_type = self._index_and_mapping(namespace)
                    doc_id = u(item.pop("_id"))
                    yield {
                        "_index": index,
                        "_type": doc_type,
                        "_id": doc_id,
                        "_source": self._formatter.format_document(item)
                    }
                    yield {
                        "_index": self.meta_index_name,
                        "_type": self.meta_type,
                        "_id": doc_id,
                        "_source": {
                            "ns": namespace,
                            "_ts": timestamp
                        }
                    }
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        _, doc_type = self._index_and_mapping(namespace)
        if "variant" == doc_type:
            self._stream_bulk(docs_to_upsert())

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index a file: its metadata plus base64-encoded content."""
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        # Remove parent id field
        parent_id = self._get_parent_id_from_mongodb(index, doc_type, doc)

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        parent_args = {}
        if parent_id is not None:
            parent_args['parent'] = parent_id

        refresh = (self.auto_commit_interval == 0)
        self.elastic.index(
            index=index, doc_type=doc_type, body=doc, id=doc_id,
            refresh=refresh, **parent_args)

        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id,
                           refresh=refresh)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        parent_args = {}
        if self._is_child_type(index, doc_type):
            # We can't use delete() directly here and have to do a full search
            # first. This is due to the fact that Elasticsearch needs the
            # parent ID to know where to route the delete request. We do
            # not have the parent ID available in our remove request though.
            document = self._search_doc_by_id(index, doc_type, document_id)
            if document is None:
                LOG.error('Could not find document with ID "%s" in '
                          'Elasticsearch to apply remove', u(document_id))
                return

            parent_id = self._get_parent_id_from_elastic(document)
            if parent_id is not None:
                parent_args['parent'] = parent_id

        refresh = (self.auto_commit_interval == 0)
        self.elastic.delete(index=index, doc_type=doc_type,
                            id=u(document_id), refresh=refresh,
                            **parent_args)

        self.elastic.delete(index=self.meta_index_name,
                            doc_type=self.meta_type,
                            id=u(document_id), refresh=refresh)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with ``_id`` (and ``_parent`` when the
        hit has one) copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            if '_parent' in hit:
                hit['_source']['_parent'] = hit['_parent']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "range": {
                        "_ts": {"gte": start_ts, "lte": end_ts}
                    }
                }
            })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Re-arm only while an interval is configured; stop() clears it.
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback. Returns None
        when the metadata index holds no documents.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions
    on Elasticsearch. Single-document writes are buffered in a BulkBuffer and
    flushed by an AutoCommiter thread or when the buffer fills up.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Connect to Elasticsearch and start the auto-commit thread.

        :param url: a host string or a list of host strings.
        :param auto_commit_interval: passed to the AutoCommiter; ``0`` makes
            every buffered write commit immediately.
        :param unique_key: stored on the instance as ``unique_key``.
        :param chunk_size: number of buffered documents that triggers a
            commit, and the chunk size for ``streaming_bulk``.
        :param meta_index_name: index that receives the ns/_ts metadata.
        :param meta_type: mapping type used in the metadata index.
        :param attachment_field: field that receives base64 file content.
        :param kwargs: recognised keys are ``clientOptions``, ``aws`` and
            ``autoSendInterval``.
        :raises errors.InvalidConfiguration: ``aws`` was given but the aws
            extras are not installed.
        """
        # Work on a copy so the AWS settings added below do not leak into
        # the caller's configuration dict.
        client_options = dict(kwargs.get('clientOptions') or {})
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        if not isinstance(url, list):
            url = [url]
        # config timeout
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while commiting documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get('autoSendInterval',
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()

    def _index_and_mapping(self, namespace):
        """Split ``namespace`` into a lower-cased index and a mapping type."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commiter.join()
        self.auto_commit_interval = 0
        # Commit any remaining docs from buffer
        self.commit()

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the result.

        A spec without ``$set``/``$unset`` is a full replacement and is
        returned as-is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Handle dropDatabase / renameCollection / create / drop commands.

        :raises errors.OperationFailed: for ``renameCollection``, which is
            not supported.
        """
        # Flush buffer before handle command
        self.commit()
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={
                        "_source": {"enabled": True}
                    })

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must "
                              "be removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type='delete') for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)))
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r", resp)

    @wrap_exceptions
    def search_exist(self, index, document_id):
        """Return True if ``index`` holds a document with this _id."""
        # Query through the configured client; a fresh Elasticsearch() would
        # talk to the library's default host instead of ``url``.
        res = self.elastic.search(
            index=index,
            body={"query": {"match": {"_id": document_id}}})
        return res['hits']['total'] >= 1

    @wrap_exceptions
    def search_doc(self, document_id):
        """Return a cursor over the source MongoDB document(s) with this id.

        Reads the ``videos_ver2`` collection of the ``fteluv`` database on a
        hard-coded host.
        """
        # Reuse one client: building a new MongoClient per call leaves a
        # fresh connection pool behind every time.
        client = getattr(self, '_mongo_client', None)
        if client is None:
            client = MongoClient("playdb01.prod.hcm.fplay", 27017,
                                 maxPoolSize=50)
            self._mongo_client = client
        # DB Mongo Fteluv, Collection videos_v2
        collection = client.fteluv['videos_ver2']
        return collection.find({"_id": ObjectId(document_id)})

    @wrap_exceptions
    def key_value_update(self, update_spec):
        """Return the last (operator, field) pair found in ``update_spec``.

        Returns ``(None, None)`` parts for anything that was never seen,
        e.g. an empty spec.
        """
        # NOTE(review): the position of the return relative to the loops was
        # reconstructed from whitespace-collapsed source -- confirm.
        key = item = None
        for key, value in update_spec.items():
            LOG.debug("update operator: %s", key)
            for item in value:
                LOG.debug("update field: %s", item)
        return key, item

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        If the document is missing from Elasticsearch it is first re-indexed
        from MongoDB. Returns the updated document (``_id`` plus whatever
        fields are known locally).
        """
        LOG.debug("update document_id: %s update_spec: %s",
                  document_id, update_spec)
        document_id = str(document_id)

        index, doc_type = self._index_and_mapping(namespace)
        with self.lock:
            # Check if document source is stored in local buffer
            document = self.BulkBuffer.get_from_sources(index,
                                                        doc_type,
                                                        u(document_id))

        # Kept outside the lock: bulk_upsert_update() may call commit(),
        # which takes the same non-reentrant lock.
        if not self.search_exist(index, document_id):
            LOG.debug("document %s not found in Elasticsearch, "
                      "re-indexing it from MongoDB", document_id)
            docs = self.search_doc(document_id)
            self.bulk_upsert_update(docs, namespace, timestamp)

        if document:
            # Document source collected from local buffer
            # Perform apply_update on it and then it will be
            # ready for commiting to Elasticsearch
            updated = self.apply_update(document, update_spec)
            # _id is immutable in MongoDB, so won't have changed in update
            updated['_id'] = document_id
            self.upsert(updated, namespace, timestamp)
        else:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp, update_spec=None):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            'ns': namespace,
            '_ts': timestamp
        }

        # Index the source document, using lowercase namespace as index name.
        action = {
            '_op_type': 'index',
            '_index': index,
            '_type': doc_type,
            '_id': doc_id,
            '_source': self._formatter.format_document(doc)
        }
        # Index document metadata with original namespace (mixed upper/lower).
        meta_action = {
            '_op_type': 'index',
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': doc_id,
            '_source': bson.json_util.dumps(metadata)
        }

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    def _bulk_index(self, docs, namespace, timestamp):
        """Stream ``docs`` (skipping status 0) and their metadata to ES.

        Commits afterwards when ``auto_commit_interval`` is 0. An empty
        ``docs`` sequence is silently ignored.
        """
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = u(doc.pop("_id"))
                if doc["status"] == 0:
                    LOG.debug("skipping document %s with status 0", doc_id)
                    continue
                yield {
                    '_index': index,
                    '_type': doc_type,
                    '_id': doc_id,
                    '_source': self._formatter.format_document(doc)
                }
                yield {
                    '_index': self.meta_index_name,
                    '_type': self.meta_type,
                    '_id': doc_id,
                    '_source': {
                        'ns': namespace,
                        '_ts': timestamp
                    }
                }
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        Documents whose ``status`` is 0 are skipped.
        """
        self._bulk_index(docs, namespace, timestamp)
        LOG.debug("Done Bulk_upsert")

    @wrap_exceptions
    def bulk_upsert_update(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        Same behaviour as ``bulk_upsert``; used by ``update`` to re-index
        documents that are missing from Elasticsearch.
        """
        self._bulk_index(docs, namespace, timestamp)
        LOG.debug("Done Bulk_upsert_update")

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Buffer a file for indexing: metadata plus base64 content."""
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        action = {
            '_op_type': 'index',
            '_index': index,
            '_type': doc_type,
            '_id': doc_id,
            '_source': doc
        }
        meta_action = {
            '_op_type': 'index',
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': doc_id,
            '_source': bson.json_util.dumps(metadata)
        }

        self.index(action, meta_action)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        action = {
            '_op_type': 'delete',
            '_index': index,
            '_type': doc_type,
            '_id': u(document_id)
        }

        meta_action = {
            '_op_type': 'delete',
            '_index': self.meta_index_name,
            '_type': self.meta_type,
            '_id': u(document_id)
        }

        self.index(action, meta_action)
        LOG.debug("buffered removal of document %s", document_id)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with ``_id`` copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "range": {
                        "_ts": {"gte": start_ts, "lte": end_ts}
                    }
                }
            })

    def index(self, action, meta_action, doc_source=None, update_spec=None):
        """Buffer an action pair and commit once the buffer is full.

        A commit is also forced on every call when ``auto_commit_interval``
        is 0.
        """
        with self.lock:
            self.BulkBuffer.add_upsert(action, meta_action,
                                       doc_source, update_spec)

        # Checked after releasing the lock: commit() re-acquires it.
        # Divide by two to account for meta actions
        if len(self.BulkBuffer.action_buffer) / 2 >= self.chunk_size \
                or self.auto_commit_interval == 0:
            self.commit()

    def send_buffered_operations(self):
        """Send buffered operations to Elasticsearch.

        This method is periodically called by the AutoCommitThread.
        Elasticsearch exceptions are logged, not raised.
        """
        with self.lock:
            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    successes, failures = bulk(self.elastic, action_buffer)
                    LOG.debug("Bulk request finished, successfully sent %d "
                              "operations", successes)
                    if failures:
                        LOG.error(
                            "Bulk request finished with errors: %r", failures)
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception")

    def commit(self):
        """Send buffered requests and refresh all indexes."""
        self.send_buffered_operations()
        retry_until_ok(self.elastic.indices.refresh, index="")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback. Returns None
        when the metadata index holds no documents.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions
    on Elasticsearch. Documents carry their own ``ns`` and ``_ts`` fields;
    these are stripped before indexing and stored in a separate metadata
    index.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 **kwargs):
        """Connect to Elasticsearch and start the auto-commit timer.

        :param url: the Elasticsearch host.
        :param auto_commit_interval: seconds between index refreshes; ``0``
            refreshes on every write, ``None`` disables the timer.
        :param unique_key: stored on the instance as ``unique_key``.
        :param chunk_size: bulk chunk size passed to ``streaming_bulk`` when
            greater than zero.
        :param meta_index_name: index that receives the ns/_ts metadata.
        :param meta_type: mapping type used in the metadata index.
        """
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

    def stop(self):
        """Stop the auto-commit thread."""
        # run_auto_commit() only re-arms its Timer while the interval is set.
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the result.

        A spec without ``$set``/``$unset`` is a full replacement and is
        returned as-is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document.
        """
        # Refresh first so the get() below sees any pending writes.
        self.commit()
        document = self.elastic.get(index=doc['ns'],
                                    id=str(doc['_id']))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        # Add metadata fields back into updated, for the purposes of
        # calling upsert(). Need to do this until these become separate
        # arguments in 2.x
        updated['ns'] = doc['ns']
        updated['_ts'] = doc['_ts']
        self.upsert(updated)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Insert a document into Elasticsearch.

        ``ns`` and ``_ts`` are removed from ``doc``; ``_id`` is restored as a
        string after indexing.
        """
        doc_type = self.doc_type
        index = doc.pop('ns')
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        metadata = {
            "ns": index,
            "_ts": doc.pop("_ts")
        }
        refresh = (self.auto_commit_interval == 0)
        # Index the source document
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id, refresh=refresh)
        # Index document metadata
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata),
                           id=doc_id, refresh=refresh)
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Insert multiple documents into Elasticsearch.

        ``docs`` may be any iterable; an empty one is silently ignored.
        """
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index = doc.pop("ns")
                doc_id = str(doc.pop("_id"))
                timestamp = doc.pop("_ts")
                yield {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                yield {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": index,
                        "_ts": timestamp
                    }
                }
            # Compare against None, not truthiness: once ns/_id/_ts are
            # popped, a document with no other fields is an empty (falsy)
            # dict and must not be mistaken for "no documents at all".
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error("Could not bulk-upsert document "
                                  "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Remove a document from Elasticsearch."""
        doc_id = str(doc["_id"])
        refresh = (self.auto_commit_interval == 0)
        self.elastic.delete(index=doc['ns'], doc_type=self.doc_type,
                            id=doc_id, refresh=refresh)
        self.elastic.delete(index=self.meta_index_name,
                            doc_type=self.meta_type,
                            id=doc_id, refresh=refresh)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with ``_id`` copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Re-arm only while an interval is configured; stop() clears it.
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback. Returns None
        when the metadata index holds no documents.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
class DocManager(DocManagerBase):
    """Create a connection to the backend engine and add/remove documents.

    In the case of a rollback, the manager also searches for documents.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.

    We are using elastic native fields for _id and ns, but we also store
    them as fields in the document, due to compatibility issues.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Establish a connection to Elastic.

        A non-zero, non-None ``auto_commit_interval`` starts the periodic
        refresh immediately (see ``run_auto_commit``).
        """
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

    def stop(self):
        """Stop the instance.

        Setting the interval to None makes ``run_auto_commit`` stop
        re-scheduling itself.
        """
        self.auto_commit_interval = None

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc, and return the updated document.
        """
        document = self.elastic.get(index=doc['ns'], id=str(doc['_id']))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated)
        return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Update or insert a document into Elastic.

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])
        """
        doc_type = self.doc_type
        index = doc['ns']
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        try:
            self.elastic.index(index=index, doc_type=doc_type,
                               body=self._formatter.format_document(doc),
                               id=doc_id,
                               refresh=(self.auto_commit_interval == 0))
        finally:
            # Don't mutate doc argument: put '_id' back even when indexing
            # fails, so the caller never sees a document without its id.
            doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Elastic.

        docs may be any iterable. An empty iterable is silently ignored.
        """
        def docs_to_upsert():
            doc = None
            for doc in docs:
                index = doc["ns"]
                doc_id = str(doc.pop("_id"))
                yield {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
            # 'doc' is still the None sentinel only if the loop never ran.
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error("Could not bulk-upsert document "
                                  "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Remove a document from Elastic.

        The input is a python dictionary that represents a mongo document.
        """
        self.elastic.delete(index=doc['ns'], doc_type=self.doc_type,
                            id=str(doc["_id"]),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with the hit's ``_id`` copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elastic for documents whose _ts is in [start_ts, end_ts]."""
        return self._stream_search(
            index="_all",
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Force a refresh/commit, retrying until it succeeds."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server.

        Refreshes once, then re-schedules itself on a Timer for as long as
        ``auto_commit_interval`` is neither None nor 0.
        """
        self.elastic.indices.refresh()
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Return the last document stored in the Elastic engine.

        Returns None when there are no hits or the search is rejected.
        """
        try:
            result = self.elastic.search(
                index="_all",
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions
    on Elasticsearch.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Connect to Elasticsearch and optionally start auto-commit.

        ``kwargs['clientOptions']``, when given, is forwarded to the
        Elasticsearch client constructor.
        """
        hosts = self._get_hosts(url)
        self.elastic = Elasticsearch(
            hosts=hosts, timeout=60, **kwargs.get('clientOptions', {}))
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        # True once at least one attachment mapping has been created.
        self.has_attachment_mapping = False
        # (index, doc_type) pairs that already received the attachment
        # mapping; mappings are per index/type, so one flag is not enough.
        self._attachment_mapped = set()
        self.attachment_field = attachment_field

    def _get_hosts(self, url):
        """Normalize url into a list of hosts.

        A list is returned unchanged; a string is split on commas. Anything
        else raises errors.ConnectionFailed.
        """
        if isinstance(url, list):
            return url
        elif isinstance(url, str):
            # NOTE(review): on Python 2 a unicode url would not match str
            # and fall through to the error below - confirm if intended.
            return url.split(',')
        else:
            raise errors.ConnectionFailed("Invalid URI for Elastic")

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply update_spec to doc.

        A spec without "$set"/"$unset" is a full replacement and is returned
        as-is; otherwise the base class implementation is used.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Mirror dropDatabase/create/drop commands onto Elasticsearch.

        Raises errors.OperationFailed for renameCollection.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={
                        "_source": {"enabled": True}
                    })

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                self.elastic.indices.delete_mapping(index=db.lower(),
                                                    doc_type=coll)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc, and return the updated document.
        """
        # commit() refreshes the indexes before the document is fetched.
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)
        document = self.elastic.get(index=index, doc_type=doc_type,
                                    id=u(document_id))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            "ns": namespace,
            "_ts": timestamp
        }
        # Index the source document, using lowercase namespace as index name.
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Index document metadata with original namespace (mixed upper/lower).
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata), id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        An empty iterable is silently ignored.
        """
        def docs_to_upsert():
            # The namespace is the same for every document: split it once.
            index, doc_type = self._index_and_mapping(namespace)
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                doc_id = u(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            # 'doc' is still the None sentinel only if the loop never ran.
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")

        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index a file: its metadata plus base64 content as an attachment."""
        doc = f.get_metadata()
        # Use u() like update/upsert/remove so ids are converted the same way.
        doc_id = u(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file; the mapping
        # must be created once for every index/type that receives files.
        mapping_key = (index, doc_type)
        if mapping_key not in self._attachment_mapped:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self._attachment_mapped.add(mapping_key)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        self.elastic.index(index=index, doc_type=doc_type,
                           body=doc, id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        self.elastic.index(index=self.meta_index_name,
                           doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata), id=doc_id,
                           refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document and its metadata entry from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        self.elastic.delete(index=index, doc_type=doc_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))
        self.elastic.delete(index=self.meta_index_name,
                            doc_type=self.meta_type,
                            id=u(document_id),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with the hit's ``_id`` copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server.

        Refreshes once, then re-schedules itself on a Timer for as long as
        ``auto_commit_interval`` is neither None nor 0.
        """
        self.elastic.indices.refresh()
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback. Returns None
        when there are no hits or the search is rejected.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
            for r in result:
                r['_source']['_id'] = r['_id']
                return r['_source']
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
class DocManager(DocManagerBase):
    """Neo4j implementation for the DocManager.

    Receives documents and communicates with Neo4j Server.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Open a Graph connection to url and store the settings."""
        self.graph = Graph(url)
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        self.kwargs = kwargs.get("clientOptions")

    def apply_id_constraint(self, doc_types):
        """Create a uniqueness constraint on _id for each given label."""
        for doc_type in doc_types:
            constraint = (
                "CREATE CONSTRAINT ON (d:`{doc_type}`) "
                "ASSERT d._id IS UNIQUE"
            ).format(doc_type=doc_type)
            self.graph.cypher.execute(constraint)

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commit_interval = None

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Neo4j.

        The node and relationship statements for the document are executed
        in one Cypher transaction. '_id' is popped from doc.
        """
        _, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("_id"))
        doc = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id)
        self.apply_id_constraint(builder.doc_types)
        tx = self.graph.cypher.begin()
        for statement, params in builder.query_nodes.items():
            tx.append(statement, params)
        for relationship, params in builder.relationships_query.items():
            tx.append(relationship, params)
        tx.commit()

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Neo4j.

        All statements for all documents are appended to one transaction,
        which is committed at the end.

        NOTE(review): the original comment here said "Maximum chunk size is
        1000. Transaction blocks won't have more than 1000 statements.", but
        nothing in this method enforces such a limit - confirm upstream.
        """
        tx = self.graph.cypher.begin()
        # The namespace is the same for every document: split it once.
        _, doc_type = self._index_and_mapping(namespace)
        for doc in docs:
            doc_id = u(doc.pop("_id"))
            doc = self._formatter.format_document(doc)
            builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id)
            self.apply_id_constraint(builder.doc_types)
            for statement, params in builder.query_nodes.items():
                tx.append(statement, params)
            for relationship, params in builder.relationships_query.items():
                tx.append(relationship, params)
        tx.commit()

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply update_spec to the node(s) of document_id in one
        transaction.
        """
        doc_id = u(document_id)
        tx = self.graph.cypher.begin()
        _, doc_type = self._index_and_mapping(namespace)
        updater = NodesAndRelationshipsUpdater()
        updater.run_update(update_spec, doc_id, doc_type)
        for statement in updater.statements_with_params:
            for query, params in statement.items():
                tx.append(query, params)
        tx.commit()

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document, and its relationships, from Neo4j."""
        doc_id = u(document_id)
        # NOTE(review): the split result is unused - the statement below
        # matches the fixed label `Document`, not the namespace's type.
        # The call is kept because it raises on a namespace without a '.'.
        self._index_and_mapping(namespace)
        params_dict = {"doc_id": doc_id}
        tx = self.graph.cypher.begin()
        statement = (
            "MATCH (d:Document) WHERE d._id={doc_id} "
            "OPTIONAL MATCH (d)-[r]-() DELETE d, r"
        )
        tx.append(statement, params_dict)
        tx.commit()

    def search(self, start_ts, end_ts):
        """Not implemented: only logs an error and returns None."""
        LOG.error("Search")

    def commit(self):
        """Not implemented: only logs an error and returns None."""
        LOG.error("Commit")

    def get_last_doc(self):
        """Not implemented: only logs an error and returns None."""
        LOG.error("get last doc")

    def handle_command(self, doc, namespace, timestamp):
        """Commands are ignored by this doc manager."""

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions
    on Elasticsearch.
    """

    def __init__(
        self,
        url,
        auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
        unique_key="_id",
        chunk_size=DEFAULT_MAX_BULK,
        meta_index_name="mongodb_meta",
        meta_type="mongodb_meta",
        attachment_field="content",
        **kwargs
    ):
        """Connect to Elasticsearch and start the auto-commit thread.

        ``kwargs['clientOptions']`` is forwarded to the Elasticsearch
        client; ``kwargs['aws']`` enables request signing and raises
        errors.InvalidConfiguration when the aws extras are not installed.
        """
        # Copy so the caller's clientOptions dict is not modified below.
        client_options = dict(kwargs.get("clientOptions", {}))
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]"
                )
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = True
            client_options["connection_class"] = (
                es_connection.RequestsHttpConnection
            )
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while committing documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get(
            "autoSendInterval", DEFAULT_SEND_INTERVAL
        )
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        # True once at least one attachment mapping has been created.
        self.has_attachment_mapping = False
        # (index, doc_type) pairs that already received the attachment
        # mapping; mappings are per index/type, so one flag is not enough.
        self._attachment_mapped = set()
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(
            self, self.auto_send_interval, self.auto_commit_interval
        )
        self.auto_commiter.start()

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split(".", 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commiter.join()
        self.auto_commit_interval = 0
        # Commit any remaining docs from buffer
        self.commit()

    def apply_update(self, doc, update_spec):
        """Apply update_spec to doc.

        A spec without "$set"/"$unset" is a full replacement and is returned
        as-is; otherwise the base class implementation is used.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Mirror dropDatabase/create/drop commands onto Elasticsearch.

        Raises errors.OperationFailed for renameCollection.
        """
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping."
            )

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(),
                    doc_type=coll,
                    body={"_source": {"enabled": True}},
                )

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn(
                    "Deleting all documents of type %s on index %s. "
                    "The mapping definition will persist and must be "
                    "removed manually." % (coll, db)
                )
                responses = streaming_bulk(
                    self.elastic,
                    (
                        dict(result, _op_type="delete")
                        for result in scan(
                            self.elastic, index=db.lower(), doc_type=coll
                        )
                    ),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch "
                            "document during handling of 'drop' command: %r",
                            resp,
                        )

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc, and return the updated document.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # Hold the lock only for the buffer lookup: upsert() -> index()
        # takes the same non-reentrant lock again.
        with self.lock:
            # Check if document source is stored in local buffer
            document = self.BulkBuffer.get_from_sources(
                index, doc_type, str(document_id)
            )
        if document:
            # Document source collected from local buffer
            # Perform apply_update on it and then it will be
            # ready for committing to Elasticsearch
            updated = self.apply_update(document, update_spec)
            # _id is immutable in MongoDB, so won't have changed in update
            updated["_id"] = document_id
            self.upsert(updated, namespace, timestamp)
        else:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp, update_spec=None):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        metadata = {"ns": namespace, "_ts": timestamp}

        # Index the source document, using lowercase namespace as index name.
        action = {
            "_op_type": "index",
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": self._formatter.format_document(doc),
        }
        # Index document metadata with original namespace (mixed upper/lower).
        meta_action = {
            "_op_type": "index",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": bson.json_util.dumps(metadata),
        }

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc["_id"] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch.

        An empty iterable is silently ignored.
        """
        def docs_to_upsert():
            # The namespace is the same for every document: split it once.
            index, doc_type = self._index_and_mapping(namespace)
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                doc_id = str(doc.pop("_id"))
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc),
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {"ns": namespace, "_ts": timestamp},
                }
                yield document_action
                yield document_meta
            # 'doc' is still the None sentinel only if the loop never ran.
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search"
                )

        try:
            kw = {}
            if self.chunk_size > 0:
                kw["chunk_size"] = self.chunk_size

            responses = streaming_bulk(
                client=self.elastic, actions=docs_to_upsert(), **kw
            )

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp
                    )
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Buffer a file: its metadata plus base64 content as attachment."""
        doc = f.get_metadata()
        doc_id = str(doc.pop("_id"))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file; the mapping
        # must be created once for every index/type that receives files.
        mapping_key = (index, doc_type)
        if mapping_key not in self._attachment_mapped:
            body = {
                "properties": {self.attachment_field: {"type": "attachment"}}
            }
            self.elastic.indices.put_mapping(
                index=index, doc_type=doc_type, body=body
            )
            self._attachment_mapped.add(mapping_key)
            self.has_attachment_mapping = True

        metadata = {"ns": namespace, "_ts": timestamp}

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        action = {
            "_op_type": "index",
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": doc,
        }
        meta_action = {
            "_op_type": "index",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": bson.json_util.dumps(metadata),
        }

        self.index(action, meta_action)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document and its metadata entry from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        action = {
            "_op_type": "delete",
            "_index": index,
            "_type": doc_type,
            "_id": str(document_id),
        }
        meta_action = {
            "_op_type": "delete",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": str(document_id),
        }

        self.index(action, meta_action)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with the hit's ``_id`` copied into it.
        """
        for hit in scan(
            self.elastic, query=kwargs.pop("body", None), scroll="10m",
            **kwargs
        ):
            hit["_source"]["_id"] = hit["_id"]
            yield hit["_source"]

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "range": {"_ts": {"gte": start_ts, "lte": end_ts}}
                }
            },
        )

    def index(self, action, meta_action, doc_source=None, update_spec=None):
        """Buffer an action pair; commit when the buffer is full or
        auto-commit is immediate (interval 0).
        """
        with self.lock:
            self.BulkBuffer.add_upsert(
                action, meta_action, doc_source, update_spec
            )

        # commit() re-acquires the lock, so it must run outside the block
        # above. Divide by two to account for meta actions
        if (
            len(self.BulkBuffer.action_buffer) / 2 >= self.chunk_size
            or self.auto_commit_interval == 0
        ):
            self.commit()

    def send_buffered_operations(self):
        """Send buffered operations to Elasticsearch.

        This method is periodically called by the AutoCommitThread.
        Elasticsearch exceptions are logged, not raised.
        """
        with self.lock:
            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    # Named bulk_errors so the 'errors' module is not
                    # shadowed inside this method.
                    successes, bulk_errors = bulk(self.elastic, action_buffer)
                    LOG.debug(
                        "Bulk request finished, successfully sent %d "
                        "operations",
                        successes,
                    )
                    if bulk_errors:
                        LOG.error(
                            "Bulk request finished with errors: %r",
                            bulk_errors,
                        )
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception")

    def commit(self):
        """Send buffered requests and refresh all indexes."""
        self.send_buffered_operations()
        retry_until_ok(self.elastic.indices.refresh, index="")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback. Returns None
        when there are no hits or the search is rejected.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={"query": {"match_all": {}}, "sort": [{"_ts": "desc"}]},
                size=1,
            )["hits"]["hits"]
            for r in result:
                r["_source"]["_id"] = r["_id"]
                return r["_source"]
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions
    on Elasticsearch. Operations are buffered in a ``BulkBuffer`` and flushed
    either when the buffer reaches ``chunk_size`` or by the ``AutoCommiter``
    thread. Prometheus ``Summary``/``Counter`` metrics are updated on bulk
    operations.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id", chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Create the Elasticsearch client, metrics and auto-commit thread.

        The connection target is assembled from the ``ELASTIC_*`` environment
        variables (``ELASTIC_HOST``, ``ELASTIC_PORT``, ``ELASTIC_USER``,
        ``ELASTIC_PASSWORD``, ``ELASTIC_SSL_ENABLED``, ``ELASTIC_TIMEOUT``,
        ``ELASTIC_MAX_RETRY``, ``ELASTIC_RETRY_ON_TIMEOUT``); ``url`` is only
        logged.

        Raises:
            errors.InvalidConfiguration: if ``aws`` options are supplied but
                the AWS extras are not installed.
        """
        client_options = kwargs.get("clientOptions", {})
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
        client_options["use_ssl"] = True
        client_options["verify_certs"] = False
        client_options[
            "connection_class"] = es_connection.RequestsHttpConnection
        # NOTE(review): client_options is populated but not passed to the
        # Elasticsearch constructor below (the original call
        # `Elasticsearch(hosts=url, **client_options)` was disabled) --
        # confirm whether AWS signing is still expected to work.

        if not isinstance(url, list):
            url = [url]
        LOG.always('URL IN DOC MANAGER:')
        LOG.always(url)

        use_ssl = os.environ.get('ELASTIC_SSL_ENABLED') != "false"
        protocol = "https" if use_ssl else "http"
        username = os.environ.get('ELASTIC_USER')
        password = os.environ.get('ELASTIC_PASSWORD')
        hostname = os.environ.get('ELASTIC_HOST')
        port = os.environ.get('ELASTIC_PORT')

        timeout = int(__get_os_environ_or_default__('ELASTIC_TIMEOUT', 30))
        max_retries = int(
            __get_os_environ_or_default__('ELASTIC_MAX_RETRY', 20))
        retry_on_timeout = bool(
            int(__get_os_environ_or_default__('ELASTIC_RETRY_ON_TIMEOUT',
                                              True)))
        LOG.info(" value of ELASTIC_TIMEOUT: %s", timeout)
        LOG.info(" value of ELASTIC_MAX_RETRY: %s", max_retries)
        LOG.info(" value of ELASTIC_RETRY_ON_TIMEOUT: %s", retry_on_timeout)

        if username and password:
            elastic_url = "{0}://{1}:{2}@{3}:{4}/".format(
                protocol, username, password, hostname, port)
            # Never write the password to the log; mask it instead.
            loggable_url = "{0}://{1}:***@{2}:{3}/".format(
                protocol, username, hostname, port)
        else:
            elastic_url = "{0}://{1}:{2}/".format(protocol, hostname, port)
            loggable_url = elastic_url
        LOG.always('SELF-ASSEMBLED ELASTIC URL IN DOC MANAGER:')
        LOG.always(loggable_url)

        # https://stackoverflow.com/questions/25908484/how-to-fix-read-timed-out-in-elasticsearch
        # https://elasticsearch-py.readthedocs.io/en/master/#sniffing
        # Sniffing caused an authentication issue - it appears it was using
        # username/password to retry. It is disabled for now and will be
        # revisited later if needed. SEAR-392
        self.elastic = Elasticsearch(
            hosts=[elastic_url],
            verify_certs=False,
            use_ssl=use_ssl,
            timeout=timeout,
            max_retries=max_retries,
            retry_on_timeout=retry_on_timeout
        )

        # Prometheus metrics for bulk ingestion.
        self.summary_title = 'dm_ingestion_time'
        self.counter_title = 'dm_ingest'
        self.REQUEST_TIME = Summary(self.summary_title,
                                    'Bulk operations throughput')
        self.ingest_rate = Counter(
            self.counter_title,
            'Number of documents ingested per bulk operation',
            ['collectionName'])

        # Prometheus metrics for buffered single-document operations.
        self.doc_summary_title = 'new_doc_operation_time'
        self.doc_count_title = 'new_doc_operation'
        self.REQUEST_TIME_OP = Summary(
            self.doc_summary_title,
            'Operations on documents for Elasticsearch')
        self.doc_operation_count = Counter(self.doc_count_title,
                                           'Document operation',
                                           ['operation_type', 'index'])

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # As bulk operation can be done in another thread
        # lock is needed to prevent access to BulkBuffer
        # while commiting documents to Elasticsearch
        # It is because BulkBuffer might get outdated
        # docs from Elasticsearch if bulk is still ongoing
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace.

        Splits ``namespace`` on the first ``"."`` and returns
        ``(index_lowercased, doc_type)``.
        """
        index, doc_type = namespace.split(".", 1)
        return index.lower(), doc_type

    def stop(self):
        """Stop the auto-commit thread and flush what is still buffered."""
        self.auto_commiter.join()
        self.auto_commit_interval = 0
        # Commit any remaining docs from buffer
        self.commit()

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc``.

        A spec without ``$set``/``$unset`` is a full replacement document and
        is returned unchanged; otherwise the base class implementation is used.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Handle the dropDatabase/renameCollection/create/drop commands.

        Raises:
            errors.OperationFailed: for ``renameCollection``, which is not
                supported.
        """
        # Flush buffer before handle command
        self.commit()
        db = namespace.split(".", 1)[0]
        if doc.get("dropDatabase"):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get("renameCollection"):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get("create"):
            db, coll = self.command_helper.map_collection(db, doc["create"])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(),
                    doc_type=coll,
                    body={"_source": {"enabled": True}})

        if doc.get("drop"):
            db, coll = self.command_helper.map_collection(db, doc["drop"])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must be "
                              "removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type="delete") for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)),
                )
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r" % resp)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the document passed on to ``upsert`` (``_id`` plus whatever
        source fields were available locally).
        """
        index, doc_type = self._index_and_mapping(namespace)

        # Only the buffer lookup is done under the lock: upsert() -> index()
        # acquires the same non-reentrant lock.
        with self.lock:
            # Check if document source is stored in local buffer
            document = self.BulkBuffer.get_from_sources(
                index, doc_type, str(document_id))

        LOG.debug('_________________________ UPDATING FILE')
        LOG.debug(update_spec)

        if document:
            # Document source collected from local buffer
            # Perform apply_update on it and then it will be
            # ready for commiting to Elasticsearch
            updated = self.apply_update(document, update_spec)
            # _id is immutable in MongoDB, so won't have changed in update
            updated["_id"] = document_id
            self.upsert(updated, namespace, timestamp, None, True)
        else:
            # Document source needs to be retrieved from Elasticsearch
            # before performing update. Pass update_spec to upsert function
            updated = {"_id": document_id}
            self.upsert(updated, namespace, timestamp, update_spec, False)

        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp, update_spec=None,
               is_update=False):
        """Insert a document into Elasticsearch.

        Queues an index action for the document and one for its metadata.
        When ``is_update`` is true both actions are flagged with ``_update``.
        """
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        metadata = {"ns": namespace, "_ts": timestamp}

        # Index the source document, using lowercase namespace as index name.
        action = {
            "_op_type": "index",
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": self._formatter.format_document(doc),
        }
        LOG.debug('_________________________ UPSERTING FILE')

        # Index document metadata with original namespace (mixed upper/lower).
        meta_action = {
            "_op_type": "index",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": bson.json_util.dumps(metadata),
        }

        if is_update:
            action['_update'] = True
            meta_action['_update'] = True

        self.index(action, meta_action, doc, update_spec)

        # Leave _id, since it's part of the original document
        doc["_id"] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp, collectionName):
        """Insert multiple documents into Elasticsearch.

        Documents are streamed through ``streaming_bulk``; every successfully
        indexed non-metadata document increments the ingest counter labelled
        with the collection part of ``collectionName`` (or of ``namespace``
        when ``collectionName`` is empty).
        """

        def docs_to_upsert():
            """Yield a document action and a metadata action per document."""
            join_index = os.environ.get('JOIN_INDEX')
            join_field = os.environ.get('JOIN_FIELD')
            child_field_1 = os.environ.get('CHILD_FIELD_1')
            child_field_2 = os.environ.get('CHILD_FIELD_2')
            doc_count = 0
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index, doc_type = self._index_and_mapping(namespace)
                doc_id = str(doc.pop("_id"))
                routing = False

                if join_index and namespace == join_index + "." + join_index:
                    if doc.get(child_field_1) and doc.get(child_field_2):
                        # Child document: route it to its parent.
                        routing = True
                        doc["data_join"] = {
                            "name": join_field,
                            "parent": doc.get(join_field)
                        }
                    else:
                        doc["data_join"] = {"name": "_id"}

                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc),
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": namespace,
                        "_ts": timestamp
                    },
                }
                if routing:
                    document_meta["_routing"] = doc.get(join_field)
                    document_action["_routing"] = doc.get(join_field)

                yield document_action
                yield document_meta
                doc_count += 1

            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
            LOG.always(" - - - - - COLLECTION")
            LOG.always(collectionName)
            LOG.always(" - - - - - # OF DOCS")
            LOG.always(doc_count)

        try:
            kw = {}
            if self.chunk_size > 0:
                kw["chunk_size"] = self.chunk_size
            kw["max_retries"] = 10

            # Metric label: collection part of collectionName when given,
            # otherwise the database part of the namespace.
            ns = namespace.split(".", 1)[0]
            if collectionName:
                ns = collectionName.split(".", 1)[1]

            @self.REQUEST_TIME.time()
            def process_request(metric):
                metric.inc()

            @ERROR_TIME.time()
            def error_catch(error):
                error.inc()

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        '_ Could not bulk-upsert document. ERROR RESP: {r}'.
                        format(r=resp))
                    error_catch(
                        ERROR_CAUGHT.labels(
                            'Could not bulk-upsert document into ElasticSearch',
                            resp))
                # Metadata documents are not counted as ingested documents.
                elif resp.get('index', {}).get('_type') != self.meta_type:
                    process_request(self.ingest_rate.labels(ns))

            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index a file: its metadata plus base64-encoded content.

        The attachment mapping for ``self.attachment_field`` is installed on
        first use.
        """
        doc = f.get_metadata()
        doc_id = str(doc.pop("_id"))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {
                        "type": "attachment"
                    }
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {"ns": namespace, "_ts": timestamp}

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        action = {
            "_op_type": "index",
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": doc,
        }
        meta_action = {
            "_op_type": "index",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": doc_id,
            "_source": bson.json_util.dumps(metadata),
        }
        LOG.debug('_________________________ INSERTING FILE')

        self.index(action, meta_action)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)

        action = {
            "_op_type": "delete",
            "_index": index,
            "_type": doc_type,
            "_id": str(document_id),
        }
        meta_action = {
            "_op_type": "delete",
            "_index": self.meta_index_name,
            "_type": self.meta_type,
            "_id": str(document_id),
        }

        # When removing a runData doc, we need to get the routing field into
        # our action data. This allows the parent+child relationship to
        # successfully dissolve on removal. Without the _routing field, this
        # operation will throw an exception.
        join_index = os.environ.get('JOIN_INDEX')
        if join_index and index == join_index:
            try:
                hits = self.elastic.search(
                    index=index,
                    body={"query": {"match": {"_id": str(document_id)}}},
                    size=1)["hits"]["hits"]
                for result in hits:
                    # Hits for unrouted documents carry no '_routing' key.
                    routing = result.get('_routing') if result else None
                    if routing:
                        action['_routing'] = routing
                        meta_action['_routing'] = routing
            except Exception:
                # Best effort: still queue the delete without routing, but
                # keep the traceback so the real cause is visible.
                LOG.exception(
                    'EXCEPTION: COULD NOT FIND DOCUMENT IN ELASTICSEARCH FOR REMOVAL OPERATION'
                )

        self.index(action, meta_action)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results."""
        for hit in scan(self.elastic,
                        query=kwargs.pop("body", None),
                        scroll="10m",
                        **kwargs):
            hit["_source"]["_id"] = hit["_id"]
            yield hit["_source"]

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "range": {
                        "_ts": {
                            "gte": start_ts,
                            "lte": end_ts
                        }
                    }
                }
            },
        )

    def index(self, action, meta_action, doc_source=None, update_spec=None):
        """Queue an action/meta-action pair and commit when the buffer is full.

        When ``JOIN_INDEX`` is configured and the action's type matches it,
        the ``data_join`` field (and, for child documents, ``_routing``) is
        added before buffering.
        """
        join_index = os.environ.get('JOIN_INDEX')
        if join_index and action["_type"] == join_index and doc_source:
            join_field = os.environ.get('JOIN_FIELD')
            child_field_1 = os.environ.get('CHILD_FIELD_1')
            child_field_2 = os.environ.get('CHILD_FIELD_2')
            source = action['_source']

            is_child1 = (doc_source.get(child_field_1)
                         and doc_source.get(child_field_2))
            is_child2 = (source.get(child_field_1)
                         and source.get(child_field_2))

            if is_child1 or is_child2:
                source['data_join'] = {
                    "name": join_field,
                    "parent": source[join_field]
                }
                doc_source['data_join'] = {
                    "name": join_field,
                    "parent": doc_source[join_field]
                }
                action["_routing"] = doc_source.get(join_field)
                meta_action["_routing"] = doc_source.get(join_field)
            else:
                source['data_join'] = {'name': '_id'}
                doc_source['data_join'] = {'name': '_id'}

        with self.lock:
            self.BulkBuffer.add_upsert(action, meta_action, doc_source,
                                       update_spec)

        # Divide by two to account for meta actions.
        # commit() must run outside the lock: send_buffered_operations()
        # acquires the same non-reentrant lock.
        if (len(self.BulkBuffer.action_buffer) / 2 >= self.chunk_size
                or self.auto_commit_interval == 0):
            self.commit()

    def send_buffered_operations(self):
        """Send buffered operations to Elasticsearch.

        This method is periodically called by the AutoCommitThread.
        """
        with self.lock:

            @ERROR_TIME.time()
            def error_catch(error):
                error.inc()

            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    # Local is named bulk_errors so it does not shadow the
                    # module-level `errors` import.
                    successes, bulk_errors = bulk(self.elastic, action_buffer)
                    LOG.debug(
                        "Bulk request finished, successfully sent %d "
                        "operations",
                        successes,
                    )
                    LOG.debug(' ')
                    LOG.debug(' ')
                    LOG.debug('*****************************************')
                    LOG.debug(' ')
                    LOG.debug('SUCCESSES')
                    LOG.debug(successes)
                    LOG.debug(' ')
                    LOG.debug('ACTION BUFFER')
                    LOG.debug(action_buffer)
                    LOG.debug(' ')
                    LOG.debug('*****************************************')
                    LOG.debug(' ')
                    LOG.debug(' ')

                    if bulk_errors:
                        for error in bulk_errors:
                            error_catch(
                                ERROR_CAUGHT.labels('Bulk request error',
                                                    error))
                        LOG.error("Bulk request finished with errors: %r",
                                  bulk_errors)

                    # TODO: Add collection name as label
                    @self.REQUEST_TIME_OP.time()
                    def process_request(operation_type, index):
                        self.doc_operation_count.labels(
                            operation_type, index).inc()

                    # Only the first buffered action is inspected to classify
                    # the flush for the operation counter.
                    doc = action_buffer[0]
                    index = doc.get('_index')
                    operation_type = doc.get('_op_type')

                    if doc.get('_update'):
                        process_request('update', index)
                        LOG.debug('UPDATE!')
                    elif operation_type == 'index':
                        process_request('add', index)
                        LOG.debug('ADD!')
                    elif operation_type == 'delete':
                        process_request('remove', index)
                        LOG.debug('REMOVE!')
            except es_exceptions.ElasticsearchException as e:
                error_catch(
                    ERROR_CAUGHT.labels('Bulk request failed with exception',
                                        'send_buffered_operations'))
                LOG.exception(
                    "Bulk request failed with exception {}".format(e))

    def commit(self):
        """Send buffered requests and refresh all indexes."""
        self.send_buffered_operations()
        retry_until_ok(self.elastic.indices.refresh, index="")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback.

        Returns the metadata source of the newest document (with ``_id``
        added), or ``None`` when there is none.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {
                        "match_all": {}
                    },
                    "sort": [{
                        "_ts": "desc"
                    }]
                },
                size=1,
            )["hits"]["hits"]
            for r in result:
                r["_source"]["_id"] = r["_id"]
                return r["_source"]
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        return None
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key="_id", chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Create the Elasticsearch client, metrics and auto-commit thread.

    The connection target is assembled from the ``ELASTIC_*`` environment
    variables (``ELASTIC_HOST``, ``ELASTIC_PORT``, ``ELASTIC_USER``,
    ``ELASTIC_PASSWORD``, ``ELASTIC_SSL_ENABLED``, ``ELASTIC_TIMEOUT``,
    ``ELASTIC_MAX_RETRY``, ``ELASTIC_RETRY_ON_TIMEOUT``); ``url`` is only
    logged.

    Raises:
        errors.InvalidConfiguration: if ``aws`` options are supplied but the
            AWS extras are not installed.
    """
    client_options = kwargs.get("clientOptions", {})
    if "aws" in kwargs:
        if not _HAS_AWS:
            raise errors.InvalidConfiguration(
                "aws extras must be installed to sign Elasticsearch "
                "requests. Install with: "
                "pip install elastic2-doc-manager[aws]")
        client_options["http_auth"] = create_aws_auth(kwargs["aws"])
    client_options["use_ssl"] = True
    client_options["verify_certs"] = False
    client_options["connection_class"] = es_connection.RequestsHttpConnection
    # NOTE(review): client_options is populated but not passed to the
    # Elasticsearch constructor below (the original call
    # `Elasticsearch(hosts=url, **client_options)` was disabled) -- confirm
    # whether AWS signing is still expected to work.

    if not isinstance(url, list):
        url = [url]
    LOG.always('URL IN DOC MANAGER:')
    LOG.always(url)

    use_ssl = os.environ.get('ELASTIC_SSL_ENABLED') != "false"
    protocol = "https" if use_ssl else "http"
    username = os.environ.get('ELASTIC_USER')
    password = os.environ.get('ELASTIC_PASSWORD')
    hostname = os.environ.get('ELASTIC_HOST')
    port = os.environ.get('ELASTIC_PORT')

    timeout = int(__get_os_environ_or_default__('ELASTIC_TIMEOUT', 30))
    max_retries = int(__get_os_environ_or_default__('ELASTIC_MAX_RETRY', 20))
    retry_on_timeout = bool(
        int(__get_os_environ_or_default__('ELASTIC_RETRY_ON_TIMEOUT', True)))
    LOG.info(" value of ELASTIC_TIMEOUT: %s", timeout)
    LOG.info(" value of ELASTIC_MAX_RETRY: %s", max_retries)
    LOG.info(" value of ELASTIC_RETRY_ON_TIMEOUT: %s", retry_on_timeout)

    if username and password:
        elastic_url = "{0}://{1}:{2}@{3}:{4}/".format(
            protocol, username, password, hostname, port)
        # Never write the password to the log; mask it instead.
        loggable_url = "{0}://{1}:***@{2}:{3}/".format(
            protocol, username, hostname, port)
    else:
        elastic_url = "{0}://{1}:{2}/".format(protocol, hostname, port)
        loggable_url = elastic_url
    LOG.always('SELF-ASSEMBLED ELASTIC URL IN DOC MANAGER:')
    LOG.always(loggable_url)

    # https://stackoverflow.com/questions/25908484/how-to-fix-read-timed-out-in-elasticsearch
    # https://elasticsearch-py.readthedocs.io/en/master/#sniffing
    # Sniffing caused an authentication issue - it appears it was using
    # username/password to retry. It is disabled for now and will be
    # revisited later if needed. SEAR-392
    self.elastic = Elasticsearch(
        hosts=[elastic_url],
        verify_certs=False,
        use_ssl=use_ssl,
        timeout=timeout,
        max_retries=max_retries,
        retry_on_timeout=retry_on_timeout
    )

    # Prometheus metrics for bulk ingestion.
    self.summary_title = 'dm_ingestion_time'
    self.counter_title = 'dm_ingest'
    self.REQUEST_TIME = Summary(self.summary_title,
                                'Bulk operations throughput')
    self.ingest_rate = Counter(
        self.counter_title,
        'Number of documents ingested per bulk operation',
        ['collectionName'])

    # Prometheus metrics for buffered single-document operations.
    self.doc_summary_title = 'new_doc_operation_time'
    self.doc_count_title = 'new_doc_operation'
    self.REQUEST_TIME_OP = Summary(
        self.doc_summary_title,
        'Operations on documents for Elasticsearch')
    self.doc_operation_count = Counter(self.doc_count_title,
                                       'Document operation',
                                       ['operation_type', 'index'])

    self._formatter = DefaultDocumentFormatter()
    self.BulkBuffer = BulkBuffer(self)

    # As bulk operation can be done in another thread
    # lock is needed to prevent access to BulkBuffer
    # while commiting documents to Elasticsearch
    # It is because BulkBuffer might get outdated
    # docs from Elasticsearch if bulk is still ongoing
    self.lock = threading.Lock()

    self.auto_commit_interval = auto_commit_interval
    self.auto_send_interval = kwargs.get("autoSendInterval",
                                         DEFAULT_SEND_INTERVAL)
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
    self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                      self.auto_commit_interval)
    self.auto_commiter.start()
class DocManager(DocManagerBase):
    """Neo4j implementation for the DocManager.

    Receives documents and communicates with Neo4j Server: documents are
    turned into Cypher statements by ``NodesAndRelationshipsBuilder`` and sent
    in transactions; created nodes are additionally registered with the
    spatial plugin over the REST API.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='uid', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Connect to the graph at ``url`` and store the configuration.

        The REST authorization token is the base64 encoding of the
        ``NEO4J_AUTH`` environment variable.
        """
        self.graph = Graph(url)
        self.url = url
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        self.kwargs = kwargs.get("clientOptions")
        # NOTE(review): this raises if NEO4J_AUTH is unset -- confirm the
        # variable is mandatory for every deployment.
        self.authorization_token = base64.b64encode(os.getenv('NEO4J_AUTH'))

    def apply_id_constraint(self, doc_types):
        """Create a uniqueness constraint on ``uid`` for each doc type label."""
        for doc_type in doc_types:
            doc_type = doc_type.upper()
            constraint = "CREATE CONSTRAINT ON (d:`{doc_type}`) ASSERT d.uid IS UNIQUE".format(doc_type=doc_type)
            self.graph.cypher.execute(constraint)

    def stop(self):
        """Stop the auto-commit thread."""
        self.auto_commit_interval = None

    def _queue_builder_statements(self, tx, builder):
        """Append every statement produced by ``builder`` to transaction ``tx``."""
        for statement, params in builder.query_nodes.items():
            tx.append(statement, params)
        # Adding cyphers from cypher list
        for query in builder.cypher_list:
            tx.append(query)
        for relationship, params in builder.relationships_query:
            tx.append(relationship, params)
        for statement in builder.statements_with_params:
            for key, params in statement.items():
                tx.append(key, params)

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Inserts a document into Neo4j.

        A failed commit is logged and swallowed; geospatial indices are only
        created when the commit returned a result.
        """
        index, doc_type = self._index_and_mapping(namespace)
        doc_id = u(doc.pop("uid"))
        metadata = {"_ts": timestamp}
        doc = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
        self.apply_id_constraint(builder.doc_types)

        tx = self.graph.cypher.begin()
        self._queue_builder_statements(tx, builder)

        commit_result = None
        try:
            commit_result = tx.commit()
            LOG.debug('Commit result: %r', commit_result)
        except Exception:
            # Best effort: keep going, but record the full traceback.
            LOG.exception('Neo4j transaction commit failed')

        if commit_result:
            nodeids_list = self._get_nodeids(commit_result)
            self.create_geospatial_indices(nodeids_list)

    def _get_nodeids(self, commit_result):
        """Collect the values of every record in ``commit_result``.

        Results without records are skipped. ``commit_result`` is not
        modified.
        """
        node_id_list = []
        for res in commit_result:
            records = res.records
            if not records:
                continue
            for record in records:
                node_id_list.extend(record.__values__)
        return node_id_list

    def create_geospatial_indices(self, node_ids_list):
        """Creates geo spatial indices on the node ids.

        :param node_ids_list: list of node ids
        """
        layer_name = 'geom'
        lat = 'lat'
        lon = 'lon'
        geometry_type = 'point'
        self._set_id_to_nodeid(node_ids_list)
        self._create_layer(layer_name, lat, lon)
        self._add_geometry(layer_name, geometry_type, lat, lon)
        result = self._add_node_to_layer(node_ids_list, layer_name)
        # logging uses %-style placeholders, not str.format ones.
        LOG.info('Geospatial index creation response %r', result)

    def _set_id_to_nodeid(self, node_ids_list):
        # TODO: We may want it to change to label name
        """Set the ``id`` property of each node to its internal node id.

        Statements are committed in batches: a new transaction is started
        every 1000 nodes.

        :param node_ids_list: list of node ids
        """
        tx = self.graph.cypher.begin()
        for count, nodeid in enumerate(node_ids_list, 1):
            if count % 1000 == 0:
                tx.commit()
                tx = self.graph.cypher.begin()
            query = 'MATCH (n) where id(n) = {nodeid} set n.id={nodeid}'.format(nodeid=nodeid)
            tx.append(query)
        if not tx.finished:
            tx.commit()

    def _add_node_to_layer(self, node_ids_list, layer_name):
        """Adds nodes to layer.

        :param node_ids_list: list of node ids
        :param layer_name: <string>
        :return: [(nodeid, res)]
        """
        url = self.url + '/ext/SpatialPlugin/graphdb/addNodeToLayer'
        result_list = []
        for nodeid in node_ids_list:
            node = self.url + '/node/{}'.format(nodeid)
            payload = {'layer': layer_name, 'node': node}
            res = self._post_request(url, payload=payload)
            result_list.append((nodeid, res))
        return result_list

    def _add_geometry(self, layer_name, geometry_type, lat, lon):
        """Creates a geometry.

        :return: (<bool>, res) -- True when the server answered 201
        """
        url = self.url + '/index/node'
        payload = {
            'name': layer_name,
            'config': {
                'provider': 'spatial',
                'geometry_type': geometry_type,
                'lat': lat,
                'lon': lon,
            },
        }
        res = self._post_request(url, payload=payload)
        if res.status_code == 201:
            LOG.info('Geometry {} created'.format(geometry_type))
            return True, res
        LOG.error('Gometry creation error: {}'.format(geometry_type))
        return False, res

    def _create_layer(self, layer_name, lat, lon):
        """Creates a layer.

        :param layer_name: <string>
        :return: (<bool>, res) -- True when the server answered 200
        """
        url = self.url + '/ext/SpatialPlugin/graphdb/addSimplePointLayer'
        payload = {'layer': layer_name, 'lat': lat, 'lon': lon}
        res = self._post_request(url, payload=payload)
        if res.status_code == 200:
            LOG.info('Layer \'{}\' created successfully'.format(layer_name))
            return True, res
        LOG.error('Layer creation error code: {} - {}'.format(res.status_code, res))
        return False, res

    def _post_request(self, url, payload):
        """POST ``payload`` as JSON to ``url`` with the authorization header."""
        payload = json.dumps(payload)
        headers = {'authorization': self.authorization_token,
                   'content-type': 'application/json'}
        return req.post(url, data=payload, headers=headers)

    def if_layer_exists(self, layer_name):
        """Return True when the spatial plugin answers 200 for ``layer_name``."""
        url = self.url + '/ext/SpatialPlugin/graphdb/getLayer'
        res = self._post_request(url, payload={'layer': layer_name})
        return res.status_code == 200

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Neo4j.

        All statements for all documents are queued on one transaction and
        committed once. NOTE(review): an earlier docstring stated a maximum
        of 1000 statements per transaction block; nothing in this method
        enforces that -- confirm whether chunking is required.

        A failed commit is logged and swallowed.
        """
        metadata = {"_ts": timestamp}
        index, doc_type = self._index_and_mapping(namespace)
        tx = self.graph.cypher.begin()
        for doc in docs:
            doc_id = u(doc.pop("uid"))
            doc = self._formatter.format_document(doc)
            builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
            self.apply_id_constraint(builder.doc_types)
            self._queue_builder_statements(tx, builder)
        try:
            tx.commit()
        except Exception:
            # Best effort: keep going, but record the full traceback.
            LOG.exception('Neo4j bulk transaction commit failed')

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply ``update_spec`` to the node identified by ``document_id``."""
        doc_id = u(document_id)
        tx = self.graph.cypher.begin()
        index, doc_type = self._index_and_mapping(namespace)
        updater = NodesAndRelationshipsUpdater()
        updater.run_update(update_spec, doc_id, doc_type)
        for statement in updater.statements_with_params:
            for key, params in statement.items():
                tx.append(key, params)
        tx.commit()

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes a document from Neo4j, together with its relationships."""
        doc_id = u(document_id)
        index, doc_type = self._index_and_mapping(namespace)
        params_dict = {"doc_id": doc_id}
        tx = self.graph.cypher.begin()
        statement = "MATCH (d:Document) WHERE d.uid={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
        tx.append(statement, params_dict)
        tx.commit()

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Return Document nodes whose ``_ts`` lies in ``[start_ts, end_ts]``."""
        statement = "MATCH (d:Document) WHERE d._ts>={start_ts} AND d._ts<={end_ts} RETURN d".format(start_ts=start_ts, end_ts=end_ts)
        results = self.graph.cypher.execute(statement)
        return results

    def commit(self):
        """No buffered state to flush; only logs that commit was requested."""
        LOG.error("Commit")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified node from Neo4j.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback.

        NOTE(review): currently not implemented -- it only logs and returns
        None.
        """
        LOG.error("Commit")

    def handle_command(self, doc, namespace, timestamp):
        """Commands are not acted upon; only the database name is extracted."""
        db = namespace.split('.', 1)[0]

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.

    Every source document is indexed under ``<db>.lower()`` / ``<collection>``
    and a companion metadata document (``ns`` and ``_ts``) is indexed under
    ``meta_index_name`` / ``meta_type`` with the same ID.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        """Create the Elasticsearch client and store the sync settings.

        Recognised keyword arguments:
          clientOptions -- dict passed through to ``Elasticsearch(...)``.
          routing -- dict keyed by document type; an entry may carry a
              ``parentField`` naming the field that holds the parent ID.
        """
        self.elastic = Elasticsearch(
            hosts=[url], **kwargs.get('clientOptions', {}))
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.routing = kwargs.get('routing', {})
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type

    def _get_parent_id(self, doc_type, doc):
        """Pop and return the parent ID of ``doc``, or None.

        Only document types listed in ``self.routing`` can have a parent. An
        explicit ``_parent`` key wins; otherwise the configured
        ``parentField`` is removed from ``doc`` and its value is passed
        through the formatter's ``transform_value``.
        """
        if doc_type not in self.routing:
            return None
        if '_parent' in doc:
            return doc.pop('_parent')
        parent_field = self.routing[doc_type].get('parentField')
        if not parent_field:
            return None
        parent_id = doc.pop(parent_field) if parent_field in doc else None
        return self._formatter.transform_value(parent_id)

    def _search_doc_by_id(self, index, doc_type, doc_id):
        """Search document in Elasticsearch by _id.

        Returns the single matching hit, or None when the search does not
        report exactly one hit.
        """
        result = self.elastic.search(index=index, doc_type=doc_type,
                                     body={
                                         'query': {
                                             'ids': {
                                                 'type': doc_type,
                                                 'values': [u(doc_id)]
                                             }
                                         }
                                     })
        if result['hits']['total'] == 1:
            return result['hits']['hits'][0]
        return None

    def stop(self):
        """Stop the auto-commit thread.

        Clearing the interval makes ``run_auto_commit`` stop rescheduling
        itself.
        """
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the resulting document.

        A spec without ``$set``/``$unset`` is a full replacement and is
        returned as-is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Mirror a MongoDB command document onto Elasticsearch.

        Handles ``dropDatabase``, ``create`` and ``drop``; raises
        ``errors.OperationFailed`` for ``renameCollection``.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            dbs = self.command_helper.map_db(db)
            for _db in dbs:
                self.elastic.indices.delete(index=_db.lower())

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "elastic_doc_manager does not support renaming a mapping.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                self.elastic.indices.put_mapping(
                    index=db.lower(), doc_type=coll,
                    body={
                        "_source": {"enabled": True}
                    })

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                # This will delete the items in coll, but not get rid of the
                # mapping.
                warnings.warn("Deleting all documents of type %s on index %s. "
                              "The mapping definition will persist and must be "
                              "removed manually." % (coll, db))
                responses = streaming_bulk(
                    self.elastic,
                    (dict(result, _op_type='delete') for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll)))
                for ok, resp in responses:
                    if not ok:
                        LOG.error(
                            "Error occurred while deleting ElasticSearch docum"
                            "ent during handling of 'drop' command: %r", resp)

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document, or None when a routed document cannot
        be found.
        """
        self.commit()
        index, doc_type = self._index_and_mapping(namespace)

        if doc_type in self.routing and 'parentField' in self.routing[doc_type]:
            # We can't use get() here and have to do a full search instead.
            # This is due to the fact that Elasticsearch needs the parent ID to
            # know where to route the get request. We might not have the parent
            # ID available in our update request though.
            document = self._search_doc_by_id(index, doc_type, document_id)
            if document is None:
                LOG.error('Could not find document with ID "%s" in Elasticsearch to apply update', u(document_id))
                return None
        else:
            document = self.elastic.get(index=index, doc_type=doc_type,
                                        id=u(document_id))

        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        if '_parent' in document:
            updated['_parent'] = document['_parent']
        self.upsert(updated, namespace, timestamp)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Insert a document into Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        # No need to duplicate '_id' in source document
        doc_id = u(doc.pop("_id"))
        metadata = {
            "ns": namespace,
            "_ts": timestamp
        }
        refresh = (self.auto_commit_interval == 0)

        # Only pass ``parent`` when there is one
        routing_kwargs = {}
        parent_id = self._get_parent_id(doc_type, doc)
        if parent_id is not None:
            routing_kwargs['parent'] = parent_id

        # Index the source document, using lowercase namespace as index name.
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id, refresh=refresh, **routing_kwargs)

        # Index document metadata with original namespace (mixed upper/lower).
        self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata), id=doc_id,
                           refresh=refresh)
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            index, doc_type = self._index_and_mapping(namespace)
            for doc in docs:
                # Remove metadata and redundant _id
                doc_id = u(doc.pop("_id"))
                # Extract the parent ID first so the routing field is gone
                # before the source is formatted (and it is formatted once).
                parent_id = self._get_parent_id(doc_type, doc)
                document_action = {
                    "_index": index,
                    "_type": doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                if parent_id is not None:
                    document_action["_parent"] = parent_id
                # Store the original namespace, exactly as upsert() does;
                # the lower-cased index name alone drops the collection.
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": namespace,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r", resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Index the file ``f`` as a base64-encoded attachment document."""
        doc = f.get_metadata()
        doc_id = str(doc.pop('_id'))
        index, doc_type = self._index_and_mapping(namespace)

        # make sure that elasticsearch treats it like a file
        if not self.has_attachment_mapping:
            body = {
                "properties": {
                    self.attachment_field: {"type": "attachment"}
                }
            }
            self.elastic.indices.put_mapping(index=index,
                                             doc_type=doc_type,
                                             body=body)
            self.has_attachment_mapping = True

        metadata = {
            'ns': namespace,
            '_ts': timestamp,
        }
        refresh = (self.auto_commit_interval == 0)

        doc = self._formatter.format_document(doc)
        doc[self.attachment_field] = base64.b64encode(f.read()).decode()

        # Only pass ``parent`` when there is one
        routing_kwargs = {}
        parent_id = self._get_parent_id(doc_type, doc)
        if parent_id is not None:
            routing_kwargs['parent'] = parent_id

        self.elastic.index(index=index, doc_type=doc_type,
                           body=doc, id=doc_id, refresh=refresh,
                           **routing_kwargs)
        self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata), id=doc_id,
                           refresh=refresh)

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Remove a document from Elasticsearch."""
        index, doc_type = self._index_and_mapping(namespace)
        refresh = (self.auto_commit_interval == 0)

        if doc_type in self.routing and 'parentField' in self.routing[doc_type]:
            # We can't use delete() directly here and have to do a full search first.
            # This is due to the fact that Elasticsearch needs the parent ID to
            # know where to route the delete request. We might not have the parent
            # ID available in our remove request though.
            document = self._search_doc_by_id(index, doc_type, document_id)
            if document is None:
                LOG.error('Could not find document with ID "%s" in Elasticsearch to apply remove', u(document_id))
                return
            parent_id = self._get_parent_id(doc_type, document)
            self.elastic.delete(index=index, doc_type=doc_type,
                                id=u(document_id), parent=parent_id,
                                refresh=refresh)
        else:
            self.elastic.delete(index=index, doc_type=doc_type,
                                id=u(document_id), refresh=refresh)

        self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                            id=u(document_id), refresh=refresh)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with ``_id`` (and ``_parent`` when
        present on the hit) copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            if '_parent' in hit:
                hit['_source']['_parent'] = hit['_parent']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Reschedule until stop() clears the interval
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Returns the metadata document with the highest ``_ts`` (with ``_id``
        added), or None when there is none.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        for r in result:
            r['_source']['_id'] = r['_id']
            return r['_source']
        return None
class DocManager(DocManagerBase):
    """
    Neo4j implementation for the DocManager. Receives documents and
    communicates with Neo4j Server.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Open a ``Graph`` on ``url`` and store the sync settings.

        ``chunk_size`` is the number of documents written per transaction by
        ``bulk_upsert``. The ``clientOptions`` keyword argument, if given, is
        kept on ``self.kwargs``.
        """
        self.graph = Graph(url)
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        self.kwargs = kwargs.get("clientOptions")

    def apply_id_constraint(self, doc_types):
        """Issue a uniqueness constraint on ``_id`` for each label given."""
        for doc_type in doc_types:
            # NOTE(review): the label is interpolated into the statement;
            # a backtick inside doc_type would break out of the quoting.
            constraint = "CREATE CONSTRAINT ON (d:`{doc_type}`) ASSERT d._id IS UNIQUE".format(doc_type=doc_type)
            self.graph.cypher.execute(constraint)

    def stop(self):
        """Stop the auto-commit thread (clears ``auto_commit_interval``)."""
        self.auto_commit_interval = None

    def _build_upsert(self, doc, doc_type, timestamp):
        """Pop ``_id`` from ``doc``, build its Cypher statements and make
        sure the ID constraints for the labels involved have been issued."""
        doc_id = u(doc.pop("_id"))
        metadata = {"_ts": timestamp}
        doc = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
        self.apply_id_constraint(builder.doc_types)
        return builder

    @staticmethod
    def _queue_statements(tx, builder):
        """Append the builder's node statements, then its relationship
        statements, to the transaction ``tx``."""
        for statement, params in builder.query_nodes.items():
            tx.append(statement, params)
        for relationship, params in builder.relationships_query.items():
            tx.append(relationship, params)

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Inserts a document into Neo4j."""
        _, doc_type = self._index_and_mapping(namespace)
        builder = self._build_upsert(doc, doc_type, timestamp)
        tx = self.graph.cypher.begin()
        self._queue_statements(tx, builder)
        tx.commit()

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert many documents, committing one transaction per chunk.

        ``docs`` may be any iterable (not only an iterator). At most
        ``self.chunk_size`` documents go into each transaction; a
        non-positive or missing chunk size puts everything into a single
        transaction instead of looping forever. No transaction is opened
        when ``docs`` is empty.
        """
        _, doc_type = self._index_and_mapping(namespace)
        limit = self.chunk_size if self.chunk_size and self.chunk_size > 0 else None

        tx = None
        pending = 0
        for doc in docs:
            builder = self._build_upsert(doc, doc_type, timestamp)
            if tx is None:
                tx = self.graph.cypher.begin()
            self._queue_statements(tx, builder)
            pending += 1
            if limit is not None and pending >= limit:
                tx.commit()
                tx = None
                pending = 0
        # Flush the final, partially filled chunk
        if tx is not None:
            tx.commit()

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Translate ``update_spec`` into Cypher statements for the node
        with the given ID and run them in one transaction."""
        doc_id = u(document_id)
        tx = self.graph.cypher.begin()
        _, doc_type = self._index_and_mapping(namespace)
        updater = NodesAndRelationshipsUpdater()
        updater.run_update(update_spec, doc_id, doc_type)
        for statement in updater.statements_with_params:
            for query, params in statement.items():
                tx.append(query, params)
        tx.commit()

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes a document from Neo4j."""
        params_dict = {"doc_id": u(document_id)}
        tx = self.graph.cypher.begin()
        statement = "MATCH (d:Document) WHERE d._id={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
        tx.append(statement, params_dict)
        tx.commit()

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Return the ``Document`` nodes whose ``_ts`` lies in
        ``[start_ts, end_ts]``.

        The bounds are sent as query parameters, the same way ``remove``
        passes its document ID, rather than being formatted into the
        statement text.
        """
        statement = "MATCH (d:Document) WHERE d._ts>={start_ts} AND d._ts<={end_ts} RETURN d"
        params = {"start_ts": start_ts, "end_ts": end_ts}
        return self.graph.cypher.execute(statement, params)

    def commit(self):
        """No-op: every write is committed in its own transaction."""
        # Not an error condition, so keep it out of the error log
        LOG.debug("Commit")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified node from Neo4j.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback.

        Not implemented: always returns None.
        """
        LOG.debug("get_last_doc is not implemented for Neo4j")
        return None

    def handle_command(self, doc, namespace, timestamp):
        """Ignore MongoDB command documents (nothing is mirrored)."""

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split('.', 1)
        return index.lower(), doc_type
class DocManager(DocManagerBase):
    """
    Neo4j implementation for the DocManager. Receives documents and
    communicates with Neo4j Server.
    """

    def __init__(
        self,
        url,
        auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
        unique_key="_id",
        chunk_size=DEFAULT_MAX_BULK,
        **kwargs
    ):
        """Open a ``Graph`` on ``url`` and store the sync settings.

        ``chunk_size`` is the number of documents written per transaction by
        ``bulk_upsert``. The ``clientOptions`` keyword argument, if given, is
        kept on ``self.kwargs``.
        """
        self.graph = Graph(url)
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        self.kwargs = kwargs.get("clientOptions")

    def apply_id_constraint(self, doc_types):
        """Issue a uniqueness constraint on ``_id`` for each label given."""
        for doc_type in doc_types:
            # NOTE(review): the label is interpolated into the statement;
            # a backtick inside doc_type would break out of the quoting.
            constraint = "CREATE CONSTRAINT ON (d:`{doc_type}`) ASSERT d._id IS UNIQUE".format(doc_type=doc_type)
            self.graph.cypher.execute(constraint)

    def stop(self):
        """Stop the auto-commit thread (clears ``auto_commit_interval``)."""
        self.auto_commit_interval = None

    def _prepare_builder(self, doc, doc_type, timestamp):
        """Pop ``_id`` from ``doc``, build its Cypher statements and make
        sure the ID constraints for the labels involved have been issued."""
        doc_id = u(doc.pop("_id"))
        metadata = {"_ts": timestamp}
        doc = self._formatter.format_document(doc)
        builder = NodesAndRelationshipsBuilder(doc, doc_type, doc_id, metadata)
        self.apply_id_constraint(builder.doc_types)
        return builder

    @staticmethod
    def _append_builder(tx, builder):
        """Append the builder's node statements, then its relationship
        statements, to the transaction ``tx``."""
        for statement, params in builder.query_nodes.items():
            tx.append(statement, params)
        for relationship, params in builder.relationships_query.items():
            tx.append(relationship, params)

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Inserts a document into Neo4j."""
        _, doc_type = self._index_and_mapping(namespace)
        builder = self._prepare_builder(doc, doc_type, timestamp)
        tx = self.graph.cypher.begin()
        self._append_builder(tx, builder)
        tx.commit()

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Insert many documents, committing one transaction per chunk.

        ``docs`` may be any iterable (not only an iterator). At most
        ``self.chunk_size`` documents go into each transaction; a
        non-positive or missing chunk size puts everything into a single
        transaction instead of looping forever. No transaction is opened
        when ``docs`` is empty.
        """
        _, doc_type = self._index_and_mapping(namespace)
        limit = self.chunk_size if self.chunk_size and self.chunk_size > 0 else None

        tx = None
        queued = 0
        for doc in docs:
            builder = self._prepare_builder(doc, doc_type, timestamp)
            if tx is None:
                tx = self.graph.cypher.begin()
            self._append_builder(tx, builder)
            queued += 1
            if limit is not None and queued >= limit:
                tx.commit()
                tx = None
                queued = 0
        # Flush the final, partially filled chunk
        if tx is not None:
            tx.commit()

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Translate ``update_spec`` into Cypher statements for the node
        with the given ID and run them in one transaction."""
        doc_id = u(document_id)
        tx = self.graph.cypher.begin()
        _, doc_type = self._index_and_mapping(namespace)
        updater = NodesAndRelationshipsUpdater()
        updater.run_update(update_spec, doc_id, doc_type)
        for statement in updater.statements_with_params:
            for query, params in statement.items():
                tx.append(query, params)
        tx.commit()

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes a document from Neo4j."""
        params_dict = {"doc_id": u(document_id)}
        tx = self.graph.cypher.begin()
        statement = "MATCH (d:Document) WHERE d._id={doc_id} OPTIONAL MATCH (d)-[r]-() DELETE d, r"
        tx.append(statement, params_dict)
        tx.commit()

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Return the ``Document`` nodes whose ``_ts`` lies in
        ``[start_ts, end_ts]``.

        The bounds are sent as query parameters, the same way ``remove``
        passes its document ID, rather than being formatted into the
        statement text.
        """
        statement = "MATCH (d:Document) WHERE d._ts>={start_ts} AND d._ts<={end_ts} RETURN d"
        params = {"start_ts": start_ts, "end_ts": end_ts}
        return self.graph.cypher.execute(statement, params)

    def commit(self):
        """No-op: every write is committed in its own transaction."""
        # Not an error condition, so keep it out of the error log
        LOG.debug("Commit")

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified node from Neo4j.

        This method is used to help define a time window within which
        documents may be in conflict after a MongoDB rollback.

        Not implemented: always returns None.
        """
        LOG.debug("get_last_doc is not implemented for Neo4j")
        return None

    def handle_command(self, doc, namespace, timestamp):
        """Ignore MongoDB command documents (nothing is mirrored)."""

    def _index_and_mapping(self, namespace):
        """Helper method for getting the index and type from a namespace."""
        index, doc_type = namespace.split(".", 1)
        return index.lower(), doc_type
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.

    In this variant each document carries its own namespace in ``doc['ns']``,
    which is used verbatim as the index name.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Create the Elasticsearch client and store the sync settings."""
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

    def stop(self):
        """Stop the auto-commit thread.

        Clearing the interval makes ``run_auto_commit`` stop rescheduling
        itself.
        """
        self.auto_commit_interval = None

    @wrap_exceptions
    def handle_command(self, doc, namespace_set):
        """Handle database and other command operations.

        Only ``drop`` is acted on: the index ``<db>.<collection>`` is deleted
        when it is listed in ``namespace_set``. Command documents without a
        ``drop`` key are ignored rather than raising ``KeyError``.
        """
        logging.debug("ES:handle_command")
        if not namespace_set:
            return
        coll = doc.get('drop')
        if coll in [None, ""]:
            return
        db = doc['ns'].split(".", 1)[0]
        index = db + "." + coll
        if index in namespace_set:
            logging.debug("ES: received drop for " + index)
            self.elastic.indices.delete(index)

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document.
        """
        document = self.elastic.get(index=doc['ns'],
                                    id=str(doc['_id']))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        self.upsert(updated)
        return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Insert a document into Elasticsearch."""
        doc_type = self.doc_type
        index = doc['ns']
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id,
                           refresh=(self.auto_commit_interval == 0))
        # Put _id back (as a string) so the caller's doc still has one
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                index = doc["ns"]
                doc_id = str(doc.pop("_id"))
                yield {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
            # ``doc`` is still None only if the loop never ran; testing
            # identity avoids mistaking a falsy document for "no documents".
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Remove a document from Elasticsearch."""
        self.elastic.delete(index=doc['ns'], doc_type=self.doc_type,
                            id=str(doc["_id"]),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with ``_id`` copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index="_all",
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Reschedule until stop() clears the interval
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Returns the document with the highest ``_ts`` (with ``_id`` added),
        or None when there is none.
        """
        try:
            result = self.elastic.search(
                index="_all",
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        for r in result:
            r['_source']['_id'] = r['_id']
            return r['_source']
        return None
class DocManager(DocManagerBase):
    """Elasticsearch implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions on
    Elasticsearch.

    In this variant each document carries its namespace and timestamp in
    ``doc['ns']`` / ``doc['_ts']``. They are stripped from the source document
    and stored in a companion metadata document under ``meta_index_name`` /
    ``meta_type`` with the same ID.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 **kwargs):
        """Create the Elasticsearch client and store the sync settings."""
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

    def stop(self):
        """Stop the auto-commit thread.

        Clearing the interval makes ``run_auto_commit`` stop rescheduling
        itself.
        """
        self.auto_commit_interval = None

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the resulting document.

        A spec without ``$set``/``$unset`` is a full replacement and is
        returned as-is.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document.
        """
        self.commit()
        document = self.elastic.get(index=doc['ns'],
                                    id=str(doc['_id']))
        updated = self.apply_update(document['_source'], update_spec)
        # _id is immutable in MongoDB, so won't have changed in update
        updated['_id'] = document['_id']
        # Add metadata fields back into updated, for the purposes of
        # calling upsert(). Need to do this until these become separate
        # arguments in 2.x
        updated['ns'] = doc['ns']
        updated['_ts'] = doc['_ts']
        self.upsert(updated)
        # upsert() strips metadata, so only _id + fields in _source still here
        return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Insert a document into Elasticsearch."""
        doc_type = self.doc_type
        index = doc.pop('ns')
        # No need to duplicate '_id' in source document
        doc_id = str(doc.pop("_id"))
        metadata = {
            "ns": index,
            "_ts": doc.pop("_ts")
        }
        refresh = (self.auto_commit_interval == 0)
        # Index the source document
        self.elastic.index(index=index, doc_type=doc_type,
                           body=self._formatter.format_document(doc),
                           id=doc_id, refresh=refresh)
        # Index document metadata
        self.elastic.index(index=self.meta_index_name, doc_type=self.meta_type,
                           body=bson.json_util.dumps(metadata), id=doc_id,
                           refresh=refresh)
        # Leave _id, since it's part of the original document
        doc['_id'] = doc_id

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Insert multiple documents into Elasticsearch."""
        def docs_to_upsert():
            doc = None
            for doc in docs:
                # Remove metadata and redundant _id
                index = doc.pop("ns")
                doc_id = str(doc.pop("_id"))
                timestamp = doc.pop("_ts")
                document_action = {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": self._formatter.format_document(doc)
                }
                document_meta = {
                    "_index": self.meta_index_name,
                    "_type": self.meta_type,
                    "_id": doc_id,
                    "_source": {
                        "ns": index,
                        "_ts": timestamp
                    }
                }
                yield document_action
                yield document_meta
            # ``doc`` is still None only if the loop never ran. A truthiness
            # test is wrong here: a document holding nothing but ns/_id/_ts
            # is an empty dict after the pops above and would abort the
            # stream before the final chunk is sent.
            if doc is None:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            kw = {}
            if self.chunk_size > 0:
                kw['chunk_size'] = self.chunk_size

            responses = streaming_bulk(client=self.elastic,
                                       actions=docs_to_upsert(),
                                       **kw)

            for ok, resp in responses:
                if not ok:
                    logging.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Remove a document (and its metadata document) from Elasticsearch."""
        doc_id = str(doc["_id"])
        refresh = (self.auto_commit_interval == 0)
        self.elastic.delete(index=doc['ns'], doc_type=self.doc_type,
                            id=doc_id, refresh=refresh)
        self.elastic.delete(index=self.meta_index_name, doc_type=self.meta_type,
                            id=doc_id, refresh=refresh)

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results.

        Yields each hit's ``_source`` with ``_id`` copied into it.
        """
        for hit in scan(self.elastic, query=kwargs.pop('body', None),
                        scroll='10m', **kwargs):
            hit['_source']['_id'] = hit['_id']
            yield hit['_source']

    def search(self, start_ts, end_ts):
        """Query Elasticsearch for documents in a time range.

        This method is used to find documents that may be in conflict during
        a rollback event in MongoDB.
        """
        return self._stream_search(
            index=self.meta_index_name,
            body={
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "_ts": {"gte": start_ts, "lte": end_ts}
                            }
                        }
                    }
                }
            })

    def commit(self):
        """Refresh all Elasticsearch indexes."""
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commit to the Elastic server."""
        self.elastic.indices.refresh()
        # Reschedule until stop() clears the interval
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Get the most recently modified document from Elasticsearch.

        This method is used to help define a time window within which documents
        may be in conflict after a MongoDB rollback.

        Returns the metadata document with the highest ``_ts`` (with ``_id``
        added), or None when there is none.
        """
        try:
            result = self.elastic.search(
                index=self.meta_index_name,
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts": "desc"}],
                },
                size=1
            )["hits"]["hits"]
        except es_exceptions.RequestError:
            # no documents so ES returns 400 because of undefined _ts mapping
            return None
        for r in result:
            r['_source']['_id'] = r['_id']
            return r['_source']
        return None
class DocManager(DocManagerBase):
    """Kafka implementation of the DocManager interface.

    Receives documents from an OplogThread and takes the appropriate actions
    to send them to Kafka. Every namespace is published to its own topic
    (see ``_topic_and_mapping``). Kafka is append-only here: removals,
    searches and "last document" lookups are no-ops.
    """

    def __init__(self, url, unique_id='_id', **kwargs):
        """Create the Kafka producer used to publish documents.

        :param url: Kafka bootstrap server; used as the single entry of
            ``bootstrap_servers``.
        :param unique_id: name of the field that carries the document id.
        :param kwargs: extra options. ``unique_key`` (the keyword the other
            doc managers use for the same setting) takes precedence over
            ``unique_id`` when present; everything else is ignored.
        """
        self.kafkaprod = KafkaProducer(
            client_id='mongotokafka-producer-mconnect',
            bootstrap_servers=[url])
        logging.debug("Kafka producer created with config: %s",
                      self.kafkaprod.config)
        self.unique_key = kwargs.get('unique_key', unique_id)
        self._formatter = DefaultDocumentFormatter()

    def _topic_and_mapping(self, namespace):
        """Return the topic name for ``namespace``.

        The namespace is split on its first '.', and the two parts are
        joined with '_'. Raises ValueError when ``namespace`` has no '.'.
        """
        topic_prefix, topic = namespace.split('.', 1)
        return topic_prefix + "_" + topic

    def _encode(self, doc):
        """Format ``doc`` and serialise it to the bytes payload for Kafka."""
        payload = dumps(self._formatter.format_document(doc))
        # No value_serializer is configured on the producer, so send() must
        # be given bytes; text payloads are encoded here.
        if not isinstance(payload, bytes):
            payload = payload.encode('utf-8')
        return payload

    def stop(self):
        """Close the Kafka producer."""
        logging.info("Closing Kafka Broker")
        self.kafkaprod.close()

    def apply_update(self, doc, update_spec):
        """Apply ``update_spec`` to ``doc`` and return the result.

        A spec without ``$set``/``$unset`` is returned as-is; anything else
        is delegated to the base class.
        """
        if "$set" not in update_spec and "$unset" not in update_spec:
            # Don't try to add ns and _ts fields back in from doc
            return update_spec
        return super(DocManager, self).apply_update(doc, update_spec)

    def handle_command(self, doc, namespace, timestamp):
        """Handle a database command document.

        Raises ``errors.OperationFailed`` for ``dropDatabase`` and
        ``renameCollection``. ``create`` and ``drop`` are resolved through
        the command helper but otherwise not acted upon yet.
        """
        db = namespace.split('.', 1)[0]
        if doc.get('dropDatabase'):
            raise errors.OperationFailed(
                "kafka_doc_manager does not currently support deleting a topic"
            )

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "kafka_doc_manager does not support renaming topics.")

        if doc.get('create'):
            db, coll = self.command_helper.map_collection(db, doc['create'])
            if db and coll:
                # TODO: create a Kafka topic for the new collection
                pass

        if doc.get('drop'):
            db, coll = self.command_helper.map_collection(db, doc['drop'])
            if db and coll:
                # TODO: delete the Kafka topic of the dropped collection
                pass

    def update(self, document_id, update_spec, namespace, timestamp):
        """Publish an update for the document identified by ``document_id``.

        The stored document cannot be read back from Kafka, so the update
        cannot be applied locally. Instead a copy of ``update_spec`` tagged
        with the document id (under ``self.unique_key``, unless the spec
        already has that key) is published to the namespace's topic.

        :returns: the message dict that was published.
        """
        # Copy so the caller's update spec is never mutated.
        updated = dict(update_spec)
        updated.setdefault(self.unique_key, document_id)
        self.upsert(updated, namespace, timestamp)
        return updated

    def upsert(self, doc, namespace, timestamp):
        """Send one document to the namespace's Kafka topic and flush."""
        topic = self._topic_and_mapping(namespace)
        self.kafkaprod.send(topic, self._encode(doc))
        # Flush right away so the producer buffer is empty on return.
        self.commit()

    def bulk_upsert(self, docs, namespace, timestamp):
        """Send every document in ``docs`` to the namespace's topic.

        Documents are buffered by the producer as they are sent and flushed
        once after the last one.
        """
        topic = self._topic_and_mapping(namespace)
        for doc in docs:
            self.kafkaprod.send(topic, self._encode(doc))
        self.commit()

    def insert_file(self, f, namespace, timestamp):
        """Do nothing: file insertion is not implemented for Kafka."""

    def remove(self, document_id, namespace, timestamp):
        """Do nothing: messages cannot be deleted from a Kafka topic."""

    def _stream_search(self, *args, **kwargs):
        """Return no results: Kafka topics cannot be searched by value."""
        return []

    def search(self, start_ts, end_ts):
        """Return no results: Kafka topics cannot be searched.

        An empty list is returned so the result can be iterated safely.
        """
        return []

    def commit(self):
        """Flush the producer buffer.

        Kafka does not normally require commits, but flush() can block
        temporarily while the buffer is emptied.
        """
        self.kafkaprod.flush()

    def run_auto_commit(self):
        """Flush the producer buffer (same effect as ``commit``)."""
        self.kafkaprod.flush()

    def get_last_doc(self):
        """Return None: no last document is tracked for Kafka.

        While the last document could be pulled from Kafka, rollbacks in
        Mongo will replay documents into the topic, and the _id should
        allow syncing for any consumers.
        """
        return None